Python TextParser.TextParser 예제들, TextParser.TextParser, dash-subtitle-extractor Python 예제들

예제 #1

0

파일 보기

파일: reqresp.py 프로젝트: thuylt83/darkc0de-old-stuff

	def sent_header_callback(self,type,data):
		"""Register the headers contained in an outgoing-header callback event.

		Events whose ``type`` is not ``pycurl.INFOTYPE_HEADER_OUT`` are ignored.
		For the others, every line of ``data`` matching ``Name: value`` is
		handed to ``self.addHeader(name, value)``.
		"""
		if type!=pycurl.INFOTYPE_HEADER_OUT:
			return

		parser=TextParser()
		parser.setSource("string",data)

		# Keep going for as long as readUntil() reports another header line.
		while True:
			if not parser.readUntil("^([^:]+): (.*)$"):
				break
			self.addHeader(parser[0][0],parser[0][1])

예제 #2

0

파일 보기

    def parseMultipart(self, cad, boundary):
        """Parse a multipart body into ``self.variables``.

        Args:
            cad: Raw multipart payload (string).
            boundary: Boundary token that separates the parts; also stored
                unchanged in ``self.boundary``.

        Each part whose headers contain ``name="..."`` becomes a
        ``Variable(name, value, headers)`` where ``headers`` holds the
        stripped header lines of the part and ``value`` the part body with
        one trailing CRLF removed.
        """
        import re

        self.boundary = boundary
        # The boundary is passed to tp.search(), which takes a regular
        # expression; RFC 2046 allows characters such as "+", "(" or "?" in
        # boundaries, so it must be escaped to be matched literally.
        boundary_re = re.escape(boundary)

        dicc = []
        tp = TextParser()
        tp.setSource("string", cad)

        while True:
            headers = []
            if not tp.readUntil("name=\"([^\"]+)\""):
                break
            var = tp[0][0]
            headers.append(tp.lastFull_line.strip())

            # Remaining "Name: value" header lines of this part.  readLine()
            # is treated as falsy at end of input.
            while tp.readLine() and tp.search("^([^:]+): (.*)$"):
                headers.append(tp.lastFull_line.strip())

            # Body lines up to the next boundary.  Stopping at end of input
            # as well prevents an endless loop when the closing boundary is
            # missing from a malformed payload.
            chunks = []
            while tp.readLine():
                if tp.search(boundary_re):
                    break
                chunks.append(tp.lastFull_line)
            value = "".join(chunks)

            if value[-2:] == "\r\n":
                value = value[:-2]

            dicc.append(Variable(var, value, headers))

        self.variables = dicc

예제 #3

0

파일 보기

파일: reqresp.py 프로젝트: thuylt83/darkc0de-old-stuff

    def parseResponse(self, rawResponse):
        """Parse a raw HTTP response into protocol, status code, headers and body.

        Args:
            rawResponse: Complete response text, status line first.

        Sets ``self.protocol`` and ``self.code`` (an ``int``), registers each
        ``Name: value`` header through ``self.addHeader`` and feeds the
        remaining lines to ``self.addContent``.  When the status line cannot
        be read, the protocol is ``"unknown"`` and the code ``0``.
        """
        tp = TextParser()
        tp.setSource("string", rawResponse)

        tp.readLine()
        tp.search(r"(HTTP\S*) ([0-9]+)")

        # If the match groups cannot be read, fall back to placeholders
        # rather than failing on a malformed status line.
        try:
            self.protocol = tp[0][0]
        except Exception:
            self.protocol = "unknown"

        try:
            code = tp[0][1]
        except Exception:
            code = "0"

        self.code = int(code)

        # Header section: ends at the first line that is not "Name: value".
        while True:
            tp.readLine()
            if tp.search("^([^:]+): ?(.*)$"):
                self.addHeader(tp[0][0], tp[0][1])
            else:
                break

        while tp.skip(1):
            self.addContent(tp.lastFull_line)

예제 #4

0

파일 보기

파일: dlgTextPreprocessingProgressOld.py 프로젝트: rambasnet/MAKE2

 def __init__(self, win, startTime):
     """Set up progress counters, the TextCat tables and the parsers.

     ``win`` and ``startTime`` are stored as ``self.win`` and
     ``self.StartTime``.  A ``PorterStemmer`` is created only when
     ``Globals.Stemmer`` is ``"Porter Stemmer"``; otherwise the parsers
     receive ``None`` as stemmer.
     """
     import HTMLParser

     # Values supplied by the caller.
     self.win = win
     self.StartTime = startTime

     # Every counter starts at zero.
     self.DocID = self.WordID = self.StemmedWordID = 0
     self.DirCount = self.FilesCount = 0
     self.WordCount = self.StemmedWordCount = 0

     # Status shown while indexing.
     self.ElapsedTime = ""
     self.ParseStatus = "Indexing in Progress..."
     self.KeyColumnNames = ""
     self.UseStemmer = False

     DBFunctions.SetupTextCatTables(Globals.TextCatFileName)

     # (A wx.Timer based status update used to be wired up here; it is
     # disabled.)
     self.EventStart = time.time()
     self.splitter = re.compile(r'\W*')

     use_porter = Globals.Stemmer == "Porter Stemmer"
     self.Stemmer = PorterStemmer() if use_porter else None

     self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)
     self.textParser = TextParser.TextParser(self.Stemmer)
     """

예제 #5

0

파일 보기

	def parseResponse (self,rawResponse,type="curl"):
		"""Parse a raw HTTP response, decoding chunked and gzip bodies.

		Args:
			rawResponse: Complete response text.
			type: When ``'curl'`` the ``Transfer-Encoding`` header is removed
				before the chunked check, so no chunk decoding takes place
				(presumably because curl already delivered a de-chunked body).

		Interim ``100`` status lines are skipped.  Sets ``self.protocol`` and
		``self.code`` (an ``int``; ``"unknown"`` / ``0`` when no status line
		is found), registers headers through ``self.addHeader`` and stores
		the decoded body.
		"""
		self.__content=""
		self.__headers=[]

		tp=TextParser()
		tp.setSource("string",rawResponse)

		while True:
			if not tp.readUntil(r"(HTTP\S*) ([0-9]+)"):
				# No (further) status line.  Without this check a response
				# made only of "100" status lines kept re-reading the
				# previous match and never left the loop.
				self.protocol="unknown"
				self.code="0"
				break

			try:
				self.protocol=tp[0][0]
			except Exception:
				self.protocol="unknown"

			try:
				self.code=tp[0][1]
			except Exception:
				self.code="0"

			if self.code!="100":
				break

		self.code=int(self.code)

		# Header section: ends at the first line that is not "Name: value".
		while True:
			tp.readLine()
			if tp.search("^([^:]+): ?(.*)$"):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		while tp.skip(1):
			self.addContent(tp.lastFull_line)

		if type=='curl':
			self.delHeader("Transfer-Encoding")

		if self.header_equal("Transfer-Encoding","chunked"):
			pieces=[]
			content=StringIO.StringIO(self.__content)
			nchunk=int(content.readline().strip(),16)

			# Each chunk is "<hex size>\r\n<data>\r\n"; a size of 0 ends the body.
			while nchunk:
				pieces.append(content.read(nchunk))
				content.readline()	# consume the line break that follows the chunk data
				nchunk=int(content.readline().strip(),16)

			self.__content="".join(pieces)

		if self.header_equal("Content-Encoding","gzip"):
			compressedstream = StringIO.StringIO(self.__content)
			gzipper = gzip.GzipFile(fileobj=compressedstream)
			self.__content=gzipper.read()
			self.delHeader("Content-Encoding")

예제 #6

0

파일 보기

	def __init__ (self,file,growing=False,exclude=""):
		"""Attach a TextParser to ``file`` and reset the request/response lists.

		``growing`` is copied onto the parser's ``growing`` attribute and
		``exclude`` is stored as ``self.exclude``.
		"""
		parser=TextParser()
		parser.setSource("file",file)
		parser.growing=growing
		self.tp=parser

		# Collected items and their excluded counterparts start out empty.
		self.reqs,self.resp=[],[]
		self.reqs_exclude,self.resp_exclude=[],[]

		self.exclude=exclude

예제 #7

0

파일 보기

파일: reqresp.py 프로젝트: thuylt83/darkc0de-old-stuff

	def parsePOSTDATA(self,pd):
		"""Decode POST data according to ``self.ContentType``.

		Args:
			pd: Raw request body (string).

		* ``application/x-www-form-urlencoded``: variables come from
		  ``self.readUrlEncodedVariables`` and are merged into the POST
		  variables.
		* ``multipart/form-data``: every part named via ``name="..."`` is
		  merged into the POST variables and its header lines are stored in
		  ``self.multiPOSThead[name]``.
		* anything else: the body is kept verbatim as unknown POST data.
		"""
		if self.ContentType=="application/x-www-form-urlencoded":
			dicc=self.readUrlEncodedVariables(pd)
			self.__variablesPOST.update(dicc)

		elif self.ContentType=="multipart/form-data":
			import re

			self.multiPOSThead={}
			dicc={}
			# tp.search() takes a regular expression; RFC 2046 allows
			# characters such as "+", "(" or "?" in boundaries, so escape it
			# to match it literally.
			boundary_re=re.escape(self.boundary)
			tp=TextParser()
			tp.setSource("string",pd)

			while True:
				headers=[]
				if not tp.readUntil("name=\"([^\"]+)\""):
					break
				var=tp[0][0]
				headers.append(tp.lastFull_line.strip())

				# Remaining "Name: value" header lines of this part.
				# readLine() is treated as falsy at end of input.
				while tp.readLine() and tp.search("^([^:]+): (.*)$"):
					headers.append(tp.lastFull_line.strip())

				# Body lines up to the next boundary.  Stopping at end of
				# input as well prevents an endless loop when the closing
				# boundary is missing.
				chunks=[]
				while tp.readLine():
					if tp.search(boundary_re):
						break
					chunks.append(tp.lastFull_line)
				value="".join(chunks)

				if value[-2:]=="\r\n":
					value=value[:-2]

				dicc[var]=value
				self.multiPOSThead[var]=headers

				# "<boundary>--" marks the end of the multipart body.
				if tp.search(boundary_re+"--"):
					break

			self.__variablesPOST.update(dicc)

		else:
			self.__uknPostData=pd

예제 #8

0

파일 보기

파일: ParsedInputSources.py 프로젝트: badams77-cpu/SuperBayesCat

    def get_sources(self, cat):
        """Return the word counts of every file registered under ``cat``.

        Args:
            cat: Category key looked up in ``self.files``.

        Returns:
            A list with one ``parser.get_word_counts()`` result per file of
            the category, or an empty list (after printing a notice) when
            the category is unknown.
        """
        if cat not in self.files:
            print("category not found: " + cat)
            return []

        # Imported lazily and only once per call, not once per file.
        from TextParser import TextParser

        word_counts = []
        for path in self.files[cat]:
            parser = TextParser()
            parser.parse(path)
            word_counts.append(parser.get_word_counts())
        return word_counts

예제 #9

0

파일 보기

파일: reqresp.py 프로젝트: thuylt83/darkc0de-old-stuff

	def parseRequest (self,rawRequest,prot="http"):
		"""Parse a raw HTTP request (marked by its author as beta and untested).

		Args:
			rawRequest: Complete request text, request line first.
			prot: URL scheme used to build the request URL.

		Sets ``self.method``, ``self.protocol`` and ``self.time``, registers
		headers through ``self.addHeader``, calls ``self.setUrl`` and, for
		POST requests, passes the body to ``self.parsePOSTDATA``.  When the
		request line cannot be parsed, the request is printed and the method
		returns early.

		Raises:
			KeyError: if the request has no ``Host`` header.
		"""
		tp=TextParser()
		tp.setSource("string",rawRequest)

		self.__variablesPOST={}
		self.__headers={}		# dictionary, e.g. headers["Cookie"]

		tp.readLine()
		try:
			tp.search(r"(\w+) (.*) (HTTP\S*)")
			self.method=tp[0][0]
			self.protocol=tp[0][2]
		except Exception:
			print("error en")
			print(rawRequest)
			return

		# Keep only path, params, query and fragment of the request target.
		pathTMP=tp[0][1]
		pathTMP=('','')+urlparse(pathTMP)[2:]
		pathTMP=urlunparse(pathTMP)
		pathTMP=pathTMP.replace("//","/")
		self.time=strftime("%H:%M:%S", gmtime())

		# Header section: ends at the first line that is not "Name: value".
		while True:
			tp.readLine()
			if tp.search("^([^:]+): (.*)$"):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		self.setUrl(prot+"://"+self.__headers["Host"]+pathTMP)

		if self.method.upper()=="POST":

			body_lines=[]
			while tp.readLine():
				body_lines.append(tp.lastFull_line)
			pd="".join(body_lines)

			if "Content-Type" in self.__headers:
				values=self.__headers["Content-Type"].split(";")
				# Media types are case-insensitive.  Store the normalised
				# form so the exact comparisons made when the body is
				# decoded also match e.g. "Multipart/Form-Data".
				media_type=values[0].strip().lower()
				if media_type=="application/x-www-form-urlencoded":
					self.ContentType=media_type
				elif media_type=="multipart/form-data":
					self.ContentType=media_type
					# Split once only: "=" is a legal boundary character.
					self.boundary=values[1].split("=",1)[1].strip()

			self.parsePOSTDATA(pd)

예제 #10

0

파일 보기

    def parseRequest(self, rawRequest, prot="http"):
        """Parse the request line of a raw HTTP request (beta, untested).

        Args:
            rawRequest: Complete request text, request line first.
            prot: URL scheme; not used in the visible part of this method.

        Resets the POST variable set and the header dictionary, then sets
        ``self.method`` and ``self.protocol``.

        Raises:
            Exception: whatever reading the request line raised; the raw
                request is printed before the error is re-raised.
        """
        tp = TextParser()
        tp.setSource("string", rawRequest)

        self.__variablesPOST = VariablesSet()
        self._headers = {}  # dictionary, e.g. headers["Cookie"]

        tp.readLine()
        try:
            tp.search(r"^(\w+) (.*) (HTTP\S*)$")
            self.method = tp[0][0]
            self.protocol = tp[0][2]
        except Exception:
            print(rawRequest)
            # A bare raise keeps the original traceback intact.
            raise

예제 #11

0

파일 보기

파일: OutlookTextParser.py 프로젝트: rambasnet/MAKE2

    def __init__(self,
                 db,
                 AttachmentsDict,
                 Stopwords=None,
                 Stemmer=None,
                 bloomFilter=None,
                 logFile=None):
        """Store the collaborators and compile the patterns used for emails.

        Args:
            db: Database object, also handed to the inner ``TextParser``.
            AttachmentsDict: Stored as ``self.AttachmentsDict``.
            Stopwords: Stop-word collection; a fresh empty list when omitted.
            Stemmer: Optional stemmer passed on to the ``TextParser``.
            bloomFilter: Optional bloom filter passed on to the ``TextParser``.
            logFile: Stored as ``self.logFile``.
        """
        # A fresh list per instance: a mutable default would be shared by
        # every instance created without explicit stop words.
        if Stopwords is None:
            Stopwords = []

        self.db = db
        self.Stemmer = Stemmer
        self.bloomFilter = bloomFilter
        self.AttachmentsDict = AttachmentsDict
        self.Stopwords = Stopwords
        self.logFile = logFile

        # Header-line patterns: group 1 is the label, group 2 the value.
        self.FromRE = re.compile(r"(From:\W*)(.*)", re.I)
        self.ToRE = re.compile(r'(To:\W*)(.*)', re.I)
        self.CcRE = re.compile(r"(Cc:\W*)(.*)", re.I)
        self.BccRE = re.compile(r"(Bcc:\W*)(.*)", re.I)
        self.DateRE = re.compile(r'(Date:\W*)(.*)')
        self.SubjectRE = re.compile(r'(Subject:\W*)(.*)')
        self.AttachmentsRE = re.compile(r'(Attachments:\W*)(.*)')

        self.query = "insert into " + Constants.EmailsTable + "(DocID, FromID, ToID,EmailDate,Subject,Attachments,FilePath,AttachmentsPath,TotalRecipients,Size,`Group`,Label) values (?,?,?,?,?,?,?,?,?,?,?,?)"

        # Patterns added for text categorisation on emails.
        self.FooterLineRE = re.compile(r'[_-]{2,}')
        self.ListSepRE = re.compile(r'[~`!#$^&*()+=|\\{}\'"?><\[\],;]')
        self.Splitter = re.compile(r'\W+', re.I)

        # NOTE(review): inside the character class, ")-/" is a range and
        # therefore also matches "*", "+" and ","; "[\d{3}]" is a character
        # class, not a three-digit group.  Left unchanged — confirm intent.
        self.PhoneRE = re.compile(
            r'([\d{3}]*)[\(\)-/\. ]*(\d{3})[\(\)-/\. ]*(\d{4})\D*')
        # Anchored form; an earlier unanchored assignment to EmailRE was
        # always overwritten by this one and has been removed.
        self.EmailRE = re.compile(r"\A[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]+",
                                  re.I)
        self.HTTPRE = re.compile(r"\A(http://)[a-z0-9_-]+\.[a-z]{2,4}\b", re.I)

        self.textParser = TextParser.TextParser(db,
                                                Stopwords,
                                                Stemmer=self.Stemmer,
                                                bloomFilter=self.bloomFilter)

예제 #12

0

파일 보기

def run_game():
    """Run the game: build the parser and player, then start a dungeon.

    The user chooses between the default dungeon (1) and a dungeon produced
    by a ``DungeonCreator`` (2); the prompt repeats until one of the two is
    entered.  The dungeon is only started when its ``json_dungeon`` was
    loaded (is not ``None``).
    """
    parser = TextParser()
    player = Player()

    choice = 0
    while choice not in (1, 2):
        answer = input("(1) Run with default Dungeon settings or (2) Generate Dungeon? (1/2): ")
        # str.isdigit() accepts characters such as superscript digits that
        # int() rejects, so guard the conversion instead of crashing.
        try:
            choice = int(answer) if answer.isdigit() else 0
        except ValueError:
            choice = 0

    if choice == 1:
        dungeon = Dungeon("")
    else:
        dungeon_creator = DungeonCreator()
        file_name = dungeon_creator.generate_dungeon(parser, player)
        dungeon = Dungeon(os.path.join(os.path.dirname(__file__), "UserDefinedFiles", file_name))

    if dungeon.json_dungeon is not None:
        dungeon.dungeon_control(parser, player)

예제 #13

0

파일 보기

    def Run(self):
        """Walk ``self.rootPath`` and index every eligible file.

        For each file a row is inserted into the TextCat documents table and
        the extracted text is passed to ``TextParser.parse``.  Word and PDF
        files go through ``MSOfficeToText.WordToText`` / ``PDFToText.GetText``
        and fall back to ``HTMLParser.getText`` when that fails; every other
        file uses ``HTMLParser.getText`` directly.  Files whose MIME type is
        not in a non-empty ``Globals.TextCatCategoryList`` are skipped.

        Returns early when the database cannot be opened or when
        ``self.keepGoing`` becomes false (``self.running`` is then cleared).
        Errors for a single file are printed and the walk continues.
        """
        db = SqliteDatabase(Globals.TextCatFileName)
        if not db.OpenConnection():
            return
        textParser = TextParser.TextParser(db, Globals.Stopwords, self.Stemmer)

        for dirPath, dirs, files in os.walk(self.rootPath):
            self.DirCount += 1
            for afile in files:
                self.FilesCount += 1
                if not self.keepGoing:
                    self.running = False
                    return

                filePath = os.path.join(dirPath, afile)
                try:
                    # MIME type stays None when the file has no extension or
                    # the extension is unknown to wx.
                    mimeType = None
                    dotIndex = filePath.rfind('.')
                    if dotIndex >= 0:
                        fileType = wx.TheMimeTypesManager.GetFileTypeFromExtension(
                            filePath[dotIndex:])
                        if fileType:
                            mimeType = fileType.GetMimeType() or "unknown"

                    filteredOut = (mimeType is not None
                                   and Globals.TextCatCategoryList
                                   and mimeType not in Globals.TextCatCategoryList)

                    if not filteredOut:
                        # Every parsed file gets its own document row.  The
                        # fallback path for files without a recognised type
                        # used to reuse the DocID of a previous file (or hit
                        # an undefined name on the first file).
                        query = "insert into %s (DocPath) values (?)" % (
                            Constants.TextCatDocumentsTable)
                        DocID = db.InsertAutoRow(query, [(filePath, )])

                        if mimeType == 'application/msword':
                            converter = MSOfficeToText.WordToText
                        elif mimeType == 'application/pdf':
                            converter = PDFToText.GetText
                        else:
                            converter = None

                        if converter is None:
                            textParser.parse(DocID,
                                             HTMLParser.getText(filePath),
                                             filePath)
                        else:
                            try:
                                textParser.parse(DocID, converter(filePath),
                                                 filePath)
                            except Exception:
                                # Dedicated converter (or parsing its output)
                                # failed: retry with the generic extractor.
                                textParser.parse(DocID,
                                                 HTMLParser.getText(filePath),
                                                 filePath)

                    self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                        time.time() - self.StartTime)

                    # Notify listeners only when more than 10 seconds have
                    # passed since self.EventStart.
                    if (time.time() - self.EventStart) > 10:
                        self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                            time.time() - self.StartTime)
                        self.SendEvent()
                except Exception as value:
                    try:
                        print("Error in Text Preprocessing:  %s %s" %
                              (filePath, value))
                    except Exception:
                        # Printing the path or the error itself can fail.
                        print("Error in Text Preprocessing...")

예제 #14

0

파일 보기

from TextParser import *


def analyze(string):
    """Classify the sentiment of ``string`` with TextBlob.

    Returns a ``(label, polarity)`` tuple where ``label`` is ``'pos'`` for a
    positive polarity, ``'neg'`` for a negative one and ``'neu'`` for exactly
    zero.  The label is the empty string when none of the comparisons holds.
    """
    polarity = TextBlob(string).sentiment.polarity

    if polarity > 0:
        return 'pos', polarity
    if polarity == 0:
        return 'neu', polarity
    if polarity < 0:
        return 'neg', polarity
    return '', polarity


if __name__ == '__main__':
    # Tag every document returned by the C2E2 view with its sentiment and
    # save it back to the 'twitter_rest' database.
    couch = couchdb.Server('http://115.146.95.53:5984')
    tweets = couch['twitter_rest']

    TextParser.getStopWords()
    cleaner = TextParser()
    for entry in tweets.view('C2E2View/C2E2'):
        record = tweets.get(entry.id)
        cleaned = cleaner.parsing(entry.value['what']['text'])
        label, polarity = analyze(cleaned)
        record['sentiment'] = {'sentiment': label, 'sentiScore': polarity}
        tweets.save(record)

예제 #15

0

파일 보기

파일: reqresp.py 프로젝트: thuylt83/darkc0de-old-stuff

    def parseRequest(self, rawRequest, prot="http"):
        """Parse a raw HTTP request (marked by its author as beta and untested).

        Args:
            rawRequest: Complete request text, request line first.
            prot: URL scheme used to build the request URL.

        Sets method, protocol, time and headers, calls ``self.setUrl``,
        collects the POST body (recomputing ``Content-Length`` from the bytes
        read), registers POST and GET variables and finally sets ``self.url``
        to ``scheme://host``.  When the request line cannot be parsed, the
        request is printed and the method returns early.

        Raises:
            KeyError: if the request has no ``Host`` header.
        """
        tp = TextParser()
        tp.setSource("string", rawRequest)

        self.__postdata = ""  # POST data, the whole string
        self.__variablesPOST = {}
        self.__headers = {}  # dictionary, e.g. headers["Cookie"]

        tp.readLine()
        try:
            tp.search(r"(\w+) (.*) (HTTP\S*)")
            self.method = tp[0][0]
            self.protocol = tp[0][2]
        except Exception:
            print("error en")
            print(rawRequest)
            return

        # Drop the scheme of the request target and collapse double slashes.
        pathTMP = tp[0][1]
        pathTMP = ('', ) + urlparse(pathTMP)[1:]
        pathTMP = urlunparse(pathTMP)
        pathTMP = pathTMP.replace("//", "/")
        self.time = strftime("%H:%M:%S", gmtime())

        # Header section: ends at the first line that is not "Name: value".
        while True:
            tp.readLine()
            if tp.search("^([^:]+): (.*)$"):
                self.addHeader(tp[0][0], tp[0][1])
            else:
                break

        self.setUrl(prot + "://" + self.__headers["Host"] + pathTMP)

        if self.method.upper() == "POST":

            body_lines = []
            totalBytesRead = 0
            lastBytesread = tp.readLine()
            while lastBytesread:
                totalBytesRead += lastBytesread
                body_lines.append(tp.lastFull_line)
                lastBytesread = tp.readLine()

            self.__headers["Content-Length"] = str(totalBytesRead)
            self.__postdata = "".join(body_lines)

            # Only a single-line body is decoded as url-encoded variables.
            if "\n" not in self.__postdata:
                for pair in self.__postdata.split("&"):
                    name, _, value = pair.partition("=")
                    self.addVariablePOST(name, value)

        self.pathWithoutVariables = self.path
        path_parts = self.path.split("?")
        if len(path_parts) > 1:
            self.pathWithoutVariables = path_parts[0]
            for pair in path_parts[1].split("&"):
                # Split at the first "=" only, as for POST variables;
                # values that themselves contain "=" used to be dropped.
                name, _, value = pair.partition("=")
                self.addVariableGET(name, value)

        self.url = "%s://%s" % (prot, self.__headers["Host"])

예제 #16

0

파일 보기

파일: dlgEmailPreprocessingProgress.py 프로젝트: rambasnet/MAKE2

    def Run(self):
        """Parse the address book, collect attachments and parse the emails.

        Steps, in order:
          1. Open the emails database (returns early if that fails), create
             the bloom filter and open the append-mode log ``self.fout``.
          2. If ``self.AddressBookPath`` is set, parse every ``.csv`` file
             found under it with ``OutlookAddressBook.AddressBookParser``.
          3. Insert every entry of ``Globals.AddressBookDict`` into the
             address-book table.
          4. If ``self.AttachmentsPath`` is set, group the attachment file
             paths in ``self.AttachmentsDict`` under a key derived from the
             file name.
          5. Walk ``self.EmailsPath`` and parse each ``text/plain`` file with
             ``OutlookTextParser``; errors are written to the log.

        The walk in step 5 stops early when ``self.keepGoing`` becomes false
        (``self.running`` is then cleared).
        """
        db = SqliteDatabase(Globals.EmailsFileName)
        if not db.OpenConnection():
            return

        self.bloomFilter = self.CreateBloomFilter()
        #self.bloomFilter = None

        # Log file: base name of the emails database plus '.log', placed in
        # the case directory.
        logFileName = PlatformMethods.Decode(
            os.path.join(
                Globals.CasePath,
                (Globals.EmailsFileName[Globals.EmailsFileName.rfind(os.sep) +
                                        1:] + '.log')))
        # NOTE(review): self.fout is not closed in this method — confirm it
        # is closed elsewhere.
        self.fout = open(logFileName, 'ab')
        #print self.CheckedMimeTypes
        self.fout.write(
            'Parsing/Indexing Emails Attachments Started at: %s\n' %
            (time.ctime()))

        if self.AddressBookPath:
            self.ParseStatus = "Parsing Address book..."
            self.SendEvent()
            AddressBookParser = OutlookAddressBook.AddressBookParser(
                Globals.AddressBookDict)
            for root, dirs, files in os.walk(self.AddressBookPath):
                for eachfile in files:
                    filePath = os.path.join(root, eachfile)
                    self.FilesCount += 1
                    # Files whose path contains no '.' are skipped.
                    if (filePath.rfind('.') == -1):
                        continue
                    #print filePath
                    extension = filePath[filePath.rfind('.'):]
                    #print 'extension ', extension
                    if extension.lower() == ".csv":
                        AddressBookParser.Parse(filePath)
                        #print 'add book parsed'
        else:
            self.fout.write('No Addressbook path found!\n')
        # Update the address book table from Globals.AddressBookDict.
        query1 = "insert into " + Constants.AddressBookTable + "(EmailID, FirstName, MiddleName, LastName, InBook) values (?,?,?,?,?)"
        ManyValues = []
        for key in Globals.AddressBookDict:
            #'EmailID': email, 'FirstName': firstName, 'MiddleName': middleName, 'LastName': lastName, 'InBook':1}
            ManyValues.append((Globals.AddressBookDict[key]['EmailID'],
                               Globals.AddressBookDict[key]['FirstName'],
                               Globals.AddressBookDict[key]['MiddleName'],
                               Globals.AddressBookDict[key]['LastName'],
                               Globals.AddressBookDict[key]['InBook']))

        #query = "delete from %s"%Constants.AddressBookTable
        #db.ExecuteNonQuery(query)
        #print ManyValues
        db.ExecuteMany(query1, ManyValues)

        #self.ParseStatus = "Done Preprocessing/Indexing Emails!"
        #return

        # NOTE(review): textParser, docxParser, docParser and docQuery are
        # created here but not used in the rest of this method.
        textParser = TextParser.TextParser(db,
                                           Globals.EmailsStopwords,
                                           self.Stemmer,
                                           bloomFilter=self.bloomFilter)
        docxParser = DocxParser.DocxParser(db,
                                           Globals.EmailsStopwords,
                                           self.Stemmer,
                                           bloomFilter=self.bloomFilter)
        docParser = DocParser.DocParser(db,
                                        Globals.EmailsStopwords,
                                        self.Stemmer,
                                        bloomFilter=self.bloomFilter)
        docQuery = "insert into %s (DocPath, DocType) values (?, ?)" % (
            Constants.TextCatDocumentsTable)

        if self.AttachmentsPath:
            for root, dirs, files in os.walk(self.AttachmentsPath):
                for eachfile in files:
                    filePath = os.path.join(root, eachfile)

                    # Key format: "<word 1> <word 2 with '.' replaced by ':'>
                    # - <text after the last '-'>".
                    # presumably word 1 is a date and word 2 a time — verify
                    # against the attachment naming scheme.
                    fileNameList = eachfile.split()
                    if len(fileNameList) >= 2:
                        dateTimeFileName = "%s %s - %s" % (
                            fileNameList[0],
                            (fileNameList[1].replace(".", ":")),
                            (eachfile[eachfile.rfind('-') + 1:]))

                        # Several files may share one key; keep all paths.
                        if self.AttachmentsDict.has_key(dateTimeFileName):
                            self.AttachmentsDict[dateTimeFileName].append(
                                filePath)
                        else:
                            self.AttachmentsDict[dateTimeFileName] = [filePath]
                            #print 'Intersting! more than 1 attach. file found with same date time: %s'%
                        #else:
                        #    self.AttachmentsDict[dateTimeFileName] = filePath
                    else:
                        self.fout.write(
                            'Attachment filename found without date time: %s\n'
                            % (PlatformMethods.Encode(filePath)))

        #AttachmentsDict, Stopwords=[], Stemmer=None
        self.outlookTextParser = OutlookTextParser.OutlookTextParser(
            db,
            self.AttachmentsDict,
            Globals.EmailsStopwords,
            self.Stemmer,
            bloomFilter=self.bloomFilter,
            logFile=self.fout)

        if self.IndexMessages:
            self.ParseStatus = "Parsing and Indexing Emails..."
        else:
            self.ParseStatus = "Parsing Email Headers..."

        self.SendEvent()
        for root, dirs, files in os.walk(self.EmailsPath):
            # Cancellation is checked once per directory and once per file.
            if not self.keepGoing:
                self.running = False
                return

            for eachfile in files:
                self.FilesCount += 1
                if not self.keepGoing:
                    self.running = False
                    return

                filePath = os.path.join(root, eachfile)
                #print filePath
                if (filePath.rfind('.') == -1):
                    continue

                try:
                    extension = filePath[filePath.rfind('.'):]
                    fileType = wx.TheMimeTypesManager.GetFileTypeFromExtension(
                        extension)
                    if fileType:
                        mimeType = fileType.GetMimeType() or "unknown"
                        # Only plain-text files are parsed as emails.
                        if mimeType == "text/plain":
                            try:
                                self.outlookTextParser.parse(
                                    filePath, self.IndexMessages)
                            except Exception, msg:
                                self.fout.write(
                                    'Error Parsing Message: %s Msg:: %s\n' %
                                    (PlatformMethods.Encode(filePath), msg))

                            self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                                time.time() - self.StartTime)

                    # Send a progress event only when more than 10 seconds
                    # have passed since self.EventStart.
                    if (time.time() - self.EventStart) > 10:
                        self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                            time.time() - self.StartTime)
                        self.SendEvent()

                except Exception, value:
                    #try:
                    self.fout.write(
                        "Error Parsing Message: %s Msg: %s\n" %
                        (PlatformMethods.Encode(filePath), str(value)))
                    self.fout.flush()

예제 #17

0

파일 보기

파일: iSearch.py 프로젝트: resslerruntime/proxystrike

    def getNewPage(self):
        """Fetch the next result page and collect the URLs found on it.

        Advances ``self.start`` (first call: ``self.startIndex``, afterwards
        by ``self.increment``), requests the page with up to five attempts,
        adds every match of ``self.urlRegexp`` through ``self.addResult`` and
        sets ``self.MoreResults`` depending on whether ``self.nextRegexp``
        matches.  A response without a text or script ``Content-Type`` ends
        the search (``self.MoreResults`` becomes ``False``).

        Raises:
            StopIteration: when ``self.MoreResults`` is already false, or
                when the request failed on all five attempts (``self.status``
                is then ``"Failed"``).
        """
        # Compared with "==" on purpose: an unset (None) flag must not stop.
        if self.MoreResults == False:
            raise StopIteration

        if self.start is None:
            self.start = self.startIndex
        else:
            self.start += self.increment

        req = Request()
        req.addHeader(
            "User-Agent",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
        )
        url = self.url.replace("{query}", str(self.query))
        url = url.replace("{startvar}", str(self.start))

        req.setUrl(url)
        req.setTotalTimeout(10)
        req.setConnTimeout(10)
        if self.cookie:
            req.addHeader("Cookie", self.cookie)

        req.setProxy(self.proxy)
        req.setFollowLocation(True)

        trys = 5
        while trys:
            try:
                req.perform()
                break
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed by the retry loop.
                trys -= 1
                if not trys:
                    self.status = "Failed"
                    raise StopIteration

        if not req.response.has_header('Content-Type') or (
                'text' not in req.response['Content-Type']
                and 'script' not in req.response['Content-Type']):
            self.MoreResults = False
            return

        rawResponse = self.preProcess(req.response.getContent())

        self.cookie = req.response.getCookie()

        tp = TextParser()
        tp.setSource("string", rawResponse)
        if req.response.code == 200:
            self.responseContent = req.response.getContent()

        while tp.readUntil(self.urlRegexp):
            for i in tp:
                self.addResult(i)

        # Rewind before looking for the "next page" marker.
        tp.seekinit()

        self.MoreResults = bool(tp.readUntil(self.nextRegexp))

        self.REQ = req

예제 #18

0

파일 보기

 def get_chapter_title(self):
     """Return the chapter title extracted from the page string.

     Jumps past ``keywords`` and then past ``content="``, reads up to the
     next double quote and converts the result with ``self.Utf8ToAnsi``.
     """
     cursor = TextParser(self.__pagestr)
     cursor = cursor.JumpStr('keywords')
     cursor = cursor.JumpStr('content="')
     raw_title = cursor.GetStr('"')
     return self.Utf8ToAnsi(raw_title)

예제 #19

0

파일 보기

 def get_content(self):
     """Return the page content with HTML line breaks turned into newlines.

     Reads the text between ``"cont"`` and the next ``<script``, converts it
     with ``self.Utf8ToAnsi`` and replaces ``<br>`` and ``<br/>`` by ``\\n``.
     """
     cursor = TextParser(self.__pagestr).JumpStr('"cont"')
     text = self.Utf8ToAnsi(cursor.GetStr('<script'))
     text = text.replace('<br>', '\n')
     return text.replace('<br/>', '\n')

예제 #20

0

파일 보기

 def __init__(self, db, Stopwords=None, Stemmer=None, bloomFilter=None):
     """Create the inner TextParser and the non-word character pattern.

     Args:
         db: Database object handed to the ``TextParser``.
         Stopwords: Stop-word collection; a fresh empty list when omitted.
         Stemmer: Optional stemmer passed on to the ``TextParser``.
         bloomFilter: Optional bloom filter passed on to the ``TextParser``.
     """
     # A fresh list per call: a mutable default would be shared by every
     # instance created without explicit stop words.
     if Stopwords is None:
         Stopwords = []
     self.textParser = TextParser.TextParser(db,
                                             Stopwords,
                                             Stemmer,
                                             bloomFilter=bloomFilter)
     # Raw string so "\W" reaches the regex engine as written.
     self.charNumRegExp = re.compile(r'\W', re.IGNORECASE)

예제 #21

0

파일 보기

    def Run(self):
        #print Globals.TextCatCategoryList

        db = SqliteDatabase(Globals.TextCatFileName)
        logFileName = PlatformMethods.Decode(
            os.path.join(Globals.CasePath, (
                Globals.TextCatFileName[Globals.TextCatFileName.rfind(os.sep) +
                                        1:] + '.log')))
        self.fout = open(logFileName, 'ab')
        if not db.OpenConnection():
            return

        #self.bloomFilter = self.CreateBloomFilter()
        self.bloomFilter = None
        #self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)
        textParser = TextParser.TextParser(db,
                                           Globals.Stopwords,
                                           Stemmer=self.Stemmer,
                                           bloomFilter=self.bloomFilter)
        #self.WordDict = {}
        #print Globals.TextCatDirList
        docxParser = DocxParser.DocxParser(db, Globals.Stopwords, self.Stemmer)
        docParser = DocParser.DocParser(db, Globals.Stopwords, self.Stemmer)
        query = "insert into %s (DocPath) values (?)" % (
            Constants.TextCatDocumentsTable)

        self.filePath = ""
        for dirPath, dirs, files in os.walk(self.rootPath):
            self.DirCount += 1
            for afile in files:
                self.FileScanStartTime = time.time()
                self.FilesCount += 1
                """
                if (self.FilesCount % Globals.TotalFilesToHold) == 0 and self.WordDict:
                    self.ParseStatus = "Writing to database..."
                    self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(time.time() - self.StartTime)
                    self.SendEvent()
                    self.HandleWords(self.WordDict)
                    self.ParseStatus = "Indexing in Progress..."
                    self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(time.time() - self.StartTime)
                    self.SendEvent()
                    self.WordDict = {}
                """
                if not self.keepGoing:
                    self.running = False
                    return

                self.filePath = os.path.join(dirPath, afile)
                try:
                    #print filePath
                    parsed = False
                    dotIndex = self.filePath.rfind('.')
                    extension = ""
                    if dotIndex >= 0:
                        extension = self.filePath[dotIndex:]

                        fileType = wx.TheMimeTypesManager.GetFileTypeFromExtension(
                            extension)
                        if fileType:
                            mimeType = fileType.GetMimeType() or "unknown"
                            if Globals.TextCatCategoryList:
                                if mimeType not in Globals.TextCatCategoryList:

                                    self.FileScanStartTime = time.time()
                                    #self.fout.write('%s :'%(self.filePath))
                                    #query = "insert into %s (DocPath) values (?)"%(Constants.TextCatDocumentsTable)

                                    #default list of all the mime types doesn't seem to produce the mime type for
                                    # MS docx document
                                    if mimeType == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' or extension == '.docx':
                                        try:
                                            DocID = db.InsertAutoRow(
                                                query,
                                                [(PlatformMethods.Encode(
                                                    self.filePath), )])
                                            #docID, filePath, startTime, logFile, extractMedia = False, MediaPath=""
                                            docxParser.Parse(
                                                DocID,
                                                self.filePath,
                                                self.FileScanStartTime,
                                                self.fout,
                                                extractMedia=False,
                                                MediaPath="")
                                            parsed = True
                                        except Exception, value:
                                            self.fout.write(
                                                "Error in docxParser : %s Value: %s\n"
                                                % (self.filePath, value))
                                            #gives junk so let's not parse it using binary
                                            #parsed = True
                                            #docxParser.Parse(DocID, self.filePath, extractMedia = False, MediaPath="")

                                    curTime = time.time()
                                    self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                                        curTime - self.StartTime)

                                    #self.fout.write('%s\n'%(CommonFunctions.ConvertSecondsToDayHourMinSec(curTime - self.FileScanStartTime)))
                                    #self.fout.flush()

                                    if (curTime - self.EventStart) > 10:
                                        #print time.time() - self.EventStart
                                        self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                                            curTime - self.StartTime)
                                        self.SendEvent()

                                    continue
                                else:
                                    pass

                            #print filePath
                            self.FileScanStartTime = time.time()
                            #self.fout.write('%s :'%(self.filePath))

                            DocID = db.InsertAutoRow(
                                query,
                                [(PlatformMethods.Encode(self.filePath), )])
                            #print 'mimeType ', mimeType
                            #print 'extension ', extension
                            if mimeType == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' or extension == '.docx':
                                try:
                                    #docID, filePath, startTime, logFile, extractMedia = False, MediaPath=""
                                    docxParser.Parse(DocID,
                                                     self.filePath,
                                                     self.FileScanStartTime,
                                                     self.fout,
                                                     extractMedia=False,
                                                     MediaPath="")
                                    parsed = True
                                except Exception, value:
                                    #gives junk so let's not parse it using binary
                                    parsed = True
                                    self.fout.write(
                                        "Error in docxParser : %s Value: %s\n"
                                        % (PlatformMethods.Encode(
                                            self.filePath), value))

                            elif mimeType == 'application/msword':
                                """
                                try:
                                    textParser.parse(DocID, MSOfficeToText.WordToText(self.filePath), self.filePath, self.FileScanStartTime, self.fout)
                                    parsed = True
                                except Exception, value:
                                    self.fout.write("Error in MSOfficeToText.WordToText : %s Value: %s\n"%(self.filePath, value))
                                """
                                try:
                                    #docID, filePath, startTime, logFile, extractMedia = False, MediaPath=""
                                    docParser.Parse(DocID,
                                                    self.filePath,
                                                    self.FileScanStartTime,
                                                    self.fout,
                                                    extractMedia=False,
                                                    MediaPath="")
                                    parsed = True
                                except Exception, value:
                                    #gives junk so let's not parse it using binary
                                    self.fout.write(
                                        "Error in DocParser : %s Value: %s\n" %
                                        (PlatformMethods.Encode(
                                            self.filePath), value))

                            elif mimeType == 'application/pdf':
                                try:
                                    textParser.parse(
                                        DocID,
                                        PDFToText.GetText(self.filePath),
                                        self.filePath, self.FileScanStartTime,
                                        self.fout)
                                    parsed = True
                                except Exception, value:
                                    self.fout.write(
                                        "Error in PDFToText: %s Value: %s\n" %
                                        (PlatformMethods.Encode(
                                            self.filePath), value))

                            elif mimeType == 'text/plain':
                                try:
                                    fin = open(self.filePath, 'rb')
                                    #data = fin.read(4096)
                                    #while data:
                                    textParser.parse(DocID, fin.read(),
                                                     self.filePath,
                                                     self.FileScanStartTime,
                                                     self.fout)
                                    parsed = True
                                    fin.close()

                                except Exception, value:
                                    self.fout.write(
                                        "Error in text/plain : %s Value: %s\n"
                                        % (PlatformMethods.Encode(
                                            self.filePath), value))

예제 #22

0

파일 보기

파일: DocxParser.py 프로젝트: rambasnet/MAKE2

 def __init__(self, db, Stopwords=[], Stemmer=None, bloomFilter=None):
     """Build the TextParser that this object keeps as ``self.textParser``.

     The four arguments (db, Stopwords, Stemmer, bloomFilter) are forwarded
     unchanged, in that order, to ``TextParser.TextParser``; their exact
     meaning is defined by that class.

     NOTE(review): ``Stopwords=[]`` is a mutable default, so every call that
     omits the argument shares one list object. That is only safe if nothing
     mutates it -- confirm against TextParser.
     """
     # Positional pass-through: the order must match TextParser's signature.
     self.textParser = TextParser.TextParser(db, Stopwords, Stemmer,
                                             bloomFilter)