Example #1
	def sent_header_callback(self,type,data):
		if type==pycurl.INFOTYPE_HEADER_OUT:
			tp=TextParser()
			tp.setSource("string",data)

			while (tp.readUntil("^([^:]+): (.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
Example #2
	def parseMultipart(self,cad,boundary):
		self.boundary=boundary
		dicc=[]
		tp=TextParser()
		tp.setSource("string",cad)

		while True:
			headers=[]
			if not tp.readUntil("name=\"([^\"]+)\""):
				break
			var=tp[0][0]
			headers.append(tp.lastFull_line.strip())
			while True:
				tp.readLine()
				if tp.search("^([^:]+): (.*)$"):
					headers.append(tp.lastFull_line.strip())
				else:
					break

			value=""
			while True:
				tp.readLine()
				if not tp.search(boundary):
					value+=tp.lastFull_line
				else:
					break

			if value[-2:]=="\r\n":
				value=value[:-2]


			dicc.append(Variable(var,value,headers))

		self.variables=dicc
Example #3
	def sent_header_callback(self,type,data):
		if type==pycurl.INFOTYPE_HEADER_OUT:
			tp=TextParser()
			tp.setSource("string",data)

			while (tp.readUntil("^([^:]+): (.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
Example #4
	def parseResponse (self,rawResponse,type="curl"):
		self.__content=""
		self.__headers=[]

		tp=TextParser()
		tp.setSource("string",rawResponse)

		while True:
			tp.readUntil("(HTTP\S*) ([0-9]+)")

			try:
				self.protocol=tp[0][0]
			except:
				self.protocol="unknown"

			try:
				self.code=tp[0][1]
			except:
				self.code="0"

			if self.code!="100":
				break


		self.code=int(self.code)

		while True:
			tp.readLine()
			if (tp.search("^([^:]+): ?(.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		while tp.skip(1):
			self.addContent(tp.lastFull_line)

		if type=='curl':
			self.delHeader("Transfer-Encoding")

		if self.header_equal("Transfer-Encoding","chunked"):
			result=""
			content=StringIO.StringIO(self.__content)
			hexa=content.readline()	
			nchunk=int(hexa.strip(),16)
			
			while nchunk:
				result+=content.read(nchunk)
				content.readline()
				hexa=content.readline()	
				nchunk=int(hexa.strip(),16)

			self.__content=result

		if self.header_equal("Content-Encoding","gzip"):
			compressedstream = StringIO.StringIO(self.__content)
			gzipper = gzip.GzipFile(fileobj=compressedstream)
			body=gzipper.read()
			self.__content=body
			self.delHeader("Content-Encoding")
Example #5
	def parseResponse (self,rawResponse,type="curl"):
		self.__content=""
		self.__headers=[]

		tp=TextParser()
		tp.setSource("string",rawResponse)

		while True:
			tp.readUntil("(HTTP\S*) ([0-9]+)")

			try:
				self.protocol=tp[0][0]
			except:
				self.protocol="unknown"

			try:
				self.code=tp[0][1]
			except:
				self.code="0"

			if self.code!="100":
				break


		self.code=int(self.code)

		while True:
			tp.readLine()
			if (tp.search("^([^:]+): ?(.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		while tp.skip(1):
			self.addContent(tp.lastFull_line)

		if type=='curl':
			self.delHeader("Transfer-Encoding")

		if self.header_equal("Transfer-Encoding","chunked"):
			result=""
			content=StringIO.StringIO(self.__content)
			hexa=content.readline()	
			nchunk=int(hexa.strip(),16)
			
			while nchunk:
				result+=content.read(nchunk)
				content.readline()
				hexa=content.readline()	
				nchunk=int(hexa.strip(),16)

			self.__content=result

		if self.header_equal("Content-Encoding","gzip"):
			compressedstream = StringIO.StringIO(self.__content)
			gzipper = gzip.GzipFile(fileobj=compressedstream)
			body=gzipper.read()
			self.__content=body
			self.delHeader("Content-Encoding")
예제 #6
0
def get_basefilename(dfi_filename):
    """
    Return the value of "/Header/BaseFileName" from the dfi file passed as an argument.
    """
    instance=TextParser.getInstanceSingleton()
    TextParser.read(instance, dfi_filename)

    rt = TextParser.getValue(instance,"/Header/BaseFileName")
    if rt[0] != 0:
        raise
    return rt[1]
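A minimal usage sketch for the helper above; the input file name is a hypothetical placeholder, and it assumes the same TextParser Python binding used in the example is importable.

base_filename = get_basefilename("sample.dfi")  # "sample.dfi" is a hypothetical input file
print(base_filename)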
예제 #7
0
 def __init__(self, parent, channel, data, *args, **kwargs):
     super(BackgroundManager, self).__init__(*args, **kwargs)
     self.parent = parent
     poll = parser.Poll(data)
     self.counter = parser.ChatHandler([poll])
     self.chat_bot = bot.ChatBot(UI.USERNAME,
                                 UI.PASSWORD)  #change to use event info
     self.chat_bot.connect_to_irc()
     self.chat_bot.connect_to_channel(channel)
     #self.daemon = True
     self.start()
예제 #8
0
	def __init__ (self,file,growing=False,exclude=""):
		self.tp=TextParser()
		self.tp.setSource("file",file)
		self.tp.growing=growing

		self.reqs=[]
		self.resp=[]

		self.reqs_exclude=[]
		self.resp_exclude=[]

		self.exclude=exclude
Example #9
    def get_sources(self, cat):
        word_counts = []
        import TextParser
        if not cat in self.files:
            print("category not found: " + cat)
            return word_counts
        for file in self.files[cat]:
            from TextParser import TextParser
            parser = TextParser()

            parser.parse(file)
            word_counts.append(parser.get_word_counts())
        return word_counts
Example #10
def print_feature(files_dir,parsed_files_dir,results_dir,links_file):
    """ Write the feature values of every links from prominents in a new file.
	Args:
	    files_dir : directory containing all non-parsed text files
		parsed_files_dir : directory containing all parsed text files
		results_dir : directory containing results
		links_file : file listing all links from prominents
	"""
    print("Computing similarity from \"LINKS\" file.")
    lost = 0
    
    # Reading the links file (all outgoing and incoming links from prominent articles)  
    with open(links_file,'r',encoding='utf-8') as links:
        with open(os.path.join(results_dir,"cosine_similarity_feature"),"w",encoding='utf-8') as out:
            
            # nb_lines = 0
            
            for line in links:  
                # get the file names (and encode in base64 if necessary)
                (title1,title2) = get_article_titles(line)
                (filename1,filename2) = get_article_names(files_dir,title1,title2)
                
                # create parsed files 
                parsed_file_1 = TextParser.parse_file(filename1,files_dir,parsed_files_dir)
                parsed_file_2 = TextParser.parse_file(filename2,files_dir,parsed_files_dir)
                
                # compute feature
                feature = -1
                if os.path.isfile(parsed_file_1) and os.path.isfile(parsed_file_2):                            
                    try :
                        feature = similarity(parsed_file_1,parsed_file_2)
                    except ValueError :
                        # compute similarity on unparsed files
                        src_file1 = os.path.join(files_dir,filename1)
                        src_file2 = os.path.join(files_dir,filename2)
                        if os.path.isfile(src_file1) and os.path.isfile(src_file2) :
                            feature = similarity(src_file1,src_file2)
                        else :
                            feature =-1
                            lost = lost+1
                else :
                    lost = lost+1  # count number of lost links
                    
                # write feature in output file
                link_title = title1+"@"+title2
                out.write(link_title+"\t%f\n"%(feature))
                # data.append(feature)
        out.close()
    links.close()
    print("Lost links :",lost)         
Example #11
 def __init__(self, win, startTime):
     import HTMLParser
     self.win = win
     self.StartTime = startTime
     self.DocID = 0
     self.WordID = 0
     self.StemmedWordID = 0
     self.DirCount = 0
     self.FilesCount = 0
     self.WordCount = 0
     self.StemmedWordCount = 0
     self.ElapsedTime = ""
     self.ParseStatus = "Indexing in Progress..."
     self.KeyColumnNames = ""
     self.UseStemmer = False
     self.Stemmer = None
     #self.SetupTextCatDB()
     DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
     
     """
     self.timerStatus = wx.Timer(id=wx.NewId(), owner=self)
     self.Bind(wx.EVT_TIMER, self.OnTimerStatusTimer,
           id=self.timerStatus.GetId())
     """
     self.EventStart = time.time()
     self.splitter = re.compile(r'\W*')
     #self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
     if Globals.Stemmer == "Porter Stemmer":
         self.Stemmer = PorterStemmer()
         #self.UseStemmer = True
     self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)   
     self.textParser = TextParser.TextParser(self.Stemmer)
     """
Example #12
def drug_peek():
    data = request.data

    if not data:
        data = request.form.keys()[0]

    p = TextParser(data)

    output = p.get_drug_data()
    print output

    # d = DrugAnalyzer(p.get_drug_data())
    # d.add_summary("Summary item 1")
    # d.check_data()

    #output = "Drug: Unknown \nSummary: To be analyzed \nCaution: No Applicable\n"

    return output
Example #13
def aggregator(path_to_dir):
    files = glob.glob(path_to_dir + '/*.html', recursive=True)
    bigList = []
    for file in files:
        s = TextParser.text_to_string(file)
        f = TextParser.text_filter(s)
        c = TextParser.cleanse(f)
        for item in c:
            bigList.append(item)

    listDF = pd.DataFrame(bigList, columns=['Name', 'Message', 'Date Time'])
    listDF['Date Time'] = pd.to_datetime(listDF['Date Time'])
    listDF = listDF.set_index('Date Time')
    listDF['Year'] = listDF.index.year
    listDF['Month'] = listDF.index.month
    listDF['Day'] = listDF.index.day
    listDF['Time'] = listDF.index.time
    listDF['Weekday'] = listDF.index.day_name()
    return listDF
Example #14
 def run_search(self):
     name = self.name_entry.get()
     arn = self.arn_entry.get()
     waf.make_doc(name, arn, 1)
     prof = es.fiche(name)
     for filename in enumerate(os.listdir(name + '/')):
         nom_fichier = name + '/' + filename[1]
         text = tp.open_fichier(nom_fichier)
         sen = tp.get_text_for_name(nom_fichier, name)
         evs = pf.find_evenement(sen, text, name)
         for e in range(len(evs)):
             ev = es.evenement(evs[e][0], evs[e][1], evs[e][2], evs[e][3])
             prof.add_evenement(ev)
             #prof.get_evenement(e)
     prof.sort_evenements_by_score()
     prof.remove_score_0()
     prof.remove_dup()
     t2p = prof.str_fiche()
     self.make_new_text_window(t2p)
Example #15
 def _list_to_dict(self, list_list):
     #TODO: this function should be elsewhere
     all_options = {}
     for section in list_list:
         #multiple keys will point to a single Counter if that choice has multiple variations
         counter = tp.VoteCounter(section[0])
         for item in section:
             all_options[item] = counter
         #and them append counter somewhere accessible
     return all_options
Example #16
def getPage(n):
	a=Request()
	
	a.setUrl("http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg="+str(n)+"&po=0&pro=9&paso=1")
	
	a.perform()
	
	HTML=a.response.getContent()
	
	tp=TextParser()
	tp.setSource("string",HTML)
	
	lista=[]
	
	while tp.readUntil("<tr class='contenido_tab' onClick='window.location=\"([a-zA-Z0-9\.-]+)\""):
		# Improved regular expression ("<tr class='contenido_tab' onClick='window.location=\"([^\"]+)\""):
		link=tp[0][0]
	
		tp.readUntil("<td class='contenido_tab' align='right'>([0-9]+)</td>")
		precio=int(tp[0][0])
	
		lista.append([link,precio])

	print "Descargada pagina",n

	return lista
Example #17
def main():
    try:
        fileText = FileManager.getText(sys.argv[1])

        information = TextParser.parse(fileText)

        OutputHelper.format(information)
    except IndexError:
        ErrorHandler.handleError("IndexError")
    except FileNotFoundError:
        ErrorHandler.handleError("FileNotFoundError")
    except:
        ErrorHandler.handleError("GenericError")
Example #18
def Main(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS):
    # # These are all the global dictionaries/objects in the game. Anywhere a loadgame happens you need all the global variables
    # global PLAYER #The main character. player is an object instance of class character.
    # global ITEMS #All the items. This is a dictionary of objects of class equipment keyed by their lowercase equipment name (item.name). Remember the lowercase, it may trip you up if you reference the uppercase version in the file.
    # global MAPS #All the locations. A tuple of objects of class Map indexed by their x,y,z coordinate (MAPS[x][y][z])
    # global INTERACT #All the interactables (stationary things that need something). This is a dictionary of objects of class Interact keyed by their lowercase name (interact.name). Remember the lowercase, it may trip you up if you reference the uppercase version in the file.
    # global QUESTS #Quest statuses. This is a dictionary of flags (1 or 0) for the status of the quest keyed by quest name.
    # global ENEMIES #All the npcs. This is a dictionary of objects of class Enemy keyed by their lowercase equipment name (item.name.lower()). Remember the lowercase, it may trip you up if you reference the uppercase version in the file.
    # global GAMEINFO #Miscellaneous game info. Dictionary of all sorts of variables
    # global GAMESETTINGS # The game settings that are saved in the game
    # # The global keyword makes the variables inside the function reference the correct global-scope variable when assigned in the function.
    # # Without it, assignment within the function may lead to changes only in the local scope



    # Main game loop section that runs while the player is alive (player is killed in story once done)
    # TODO don't have main loop based on player alive but on game being played, e.g. gameExit boolean variable instead
    while(PLAYER.alive):
        # if not(GAMESETTINGS['HardcoreMode']): MapDisplay.mini()  # Minimap display area in game


        if GAMEINFO['scriptdata']: # if there's a script loaded, carry out those commands instead of normal input
            command = GAMEINFO['scriptdata'].pop(0)  # pops the first element to go through script until finished
            printT(command)
        else:
            command = input('\nWhat do you want to do?\n')

        print(LINEBREAK)  # This linebreak helps split up each turn
        if GAMESETTINGS['HardcoreMode']: print(CLEARSCREEN)

        # Sends the command text to the text parser to be interpreted and action to be done
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = TextParser.Parser(command,MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)

        GAMEINFO['commandcount'] += 1  # increments the command count after every command but doesn't print
        #print LINEBREAK  # Got rid of this bottom linebreak to hopefully have the current situation more clear
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.story(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through the story quests checks and actions
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.sidequests(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through all the sidequest checks and actions
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.events(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through all the events checks and actions

        #TODO integrate this into game functions with a function, possibly separate quests from game functions and import all from there to keep things global
        if PLAYER.alive == False and GAMEINFO['layersdeep'] > 0:  # gets you out of the EPTA all the way down quest and back into the sublayer
            #End(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)
            print(LINEBREAK)
            printT(" (\S)You finish the game and put back the laptop ready to get back to reality.\nHow long did you spend on this game?")
            log = GAMEINFO['log'] #sets up a temporary variable to pass the log back up a layer
            MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = load_game(str(GAMEINFO['layersdeep']-1))
            GAMEINFO['log'] = log + ["--Back in layer: " + str(GAMEINFO['layersdeep']) +"---"] #overwrites it to keep a running tab and says what layer we're in 
            #Doesn't reset the GAMEINFO['timestart'] as the runtime will include the time in the nested function
            #TODO delete the save file you're coming out of
            
    MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = End(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS) #calls the end function in main so that the game can continue its loop structure
Example #19
 def plot_fit_gui(self):
     name = self.name_entry.get()
     fichier = self.file_entry.get()
     text = tp.open_fichier(fichier)
     [firstname, lastname] = tp.format_name(name)
     [wo, fo, lo] = tp.find_name(name, firstname, lastname, text)
     [x, y, p, fit] = tp.name_occ_fit_gauss(wo, fo, lo, text)
     tp.plot_fit(x, y, fit)
Example #20
    def parseRequest(self, rawRequest, prot="http"):
        ''' Still in BETA and not yet tested '''
        tp = TextParser()
        tp.setSource("string", rawRequest)

        self.__variablesPOST = VariablesSet()
        self._headers = {}  # dictionary, e.g. headers["Cookie"]

        tp.readLine()
        try:
            tp.search("^(\w+) (.*) (HTTP\S*)$")
            self.method = tp[0][0]
            self.protocol = tp[0][2]
        except Exception, a:
            print rawRequest
            raise a
Example #21
	def parseRequest (self,rawRequest,prot="http"):
		''' Still in BETA and not yet tested '''
		tp=TextParser()
		tp.setSource("string",rawRequest)

		self.__variablesPOST=VariablesSet()
		self.__headers={}		# dictionary, e.g. headers["Cookie"]

		tp.readLine()
		try:
			tp.search("^(\w+) (.*) (HTTP\S*)$")
			self.method=tp[0][0]
			self.protocol=tp[0][2]
		except Exception,a:
			print rawRequest
			raise a
Example #22
    def __init__(self,
                 db,
                 AttachmentsDict,
                 Stopwords=[],
                 Stemmer=None,
                 bloomFilter=None,
                 logFile=None):
        #self.filePath = filePath
        self.db = db
        self.Stemmer = Stemmer
        self.bloomFilter = bloomFilter
        self.AttachmentsDict = AttachmentsDict
        self.Stopwords = Stopwords
        self.logFile = logFile
        self.FromRE = re.compile(r"(From:\W*)(.*)", re.I)
        self.ToRE = re.compile(r'(To:\W*)(.*)', re.I)
        self.CcRE = re.compile(r"(Cc:\W*)(.*)", re.I)
        self.BccRE = re.compile(r"(Bcc:\W*)(.*)", re.I)
        self.DateRE = re.compile(r'(Date:\W*)(.*)')
        self.SubjectRE = re.compile(r'(Subject:\W*)(.*)')
        self.AttachmentsRE = re.compile(r'(Attachments:\W*)(.*)')
        self.EmailRE = re.compile(r"[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]+", re.I)
        #self.EmailsDict = EmailsDict

        self.query = "insert into " + Constants.EmailsTable + "(DocID, FromID, ToID,EmailDate,Subject,Attachments,FilePath,AttachmentsPath,TotalRecipients,Size,`Group`,Label) values (?,?,?,?,?,?,?,?,?,?,?,?)"
        #self.query1 = "insert into " + Constants.AddressBookTable + "(EmailID, FirstName, MiddleName, LastName, InBook) values (?,?,?,?,?)"

        #added for TC on Emails
        #self.WordCount = 0
        #self.StemmedWordCount = 0
        self.FooterLineRE = re.compile(r'[_-]{2,}')
        self.ListSepRE = re.compile(r'[~`!#$^&*()+=|\\{}\'"?><\[\],;]')
        self.Splitter = re.compile(r'\W+', re.I)

        self.PhoneRE = re.compile(
            r'([\d{3}]*)[\(\)-/\. ]*(\d{3})[\(\)-/\. ]*(\d{4})\D*')
        self.EmailRE = re.compile(r"\A[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]+",
                                  re.I)
        self.HTTPRE = re.compile(r"\A(http://)[a-z0-9_-]+\.[a-z]{2,4}\b", re.I)

        self.textParser = TextParser.TextParser(db,
                                                Stopwords,
                                                Stemmer=self.Stemmer,
                                                bloomFilter=self.bloomFilter)
Example #23
def normalize(
        s):  # place stress marks, normalize lines containing newlines
    l = list(s.replace("△", "").split("\n"))
    ans = []
    for z in l:
        text = ""
        prev = ""
        for el in z:
            if el not in alpha:
                text = text + TP.make_bigger(prev)
                prev = ""
            elif el == "ё":
                text = text + prev + "Ё"
                prev = ""
            else:
                text = text + prev
                prev = el
        ans.append(text + el)
    return ans
Example #24
def run_game():
    """This is the general controller function to run the game.
    First, it generates the TextParser and Player character. Second, it gives a choice to either run with default
        options or run a DungeonCreator.
    """
    parser = TextParser()
    player = Player()

    choice = ""
    while not choice.isdigit() or int(choice) < 1 or int(choice) > 2:
        choice = input("(1) Run with default Dungeon settings or (2) Generate Dungeon? (1/2): ")

    if int(choice) == 1:
        dungeon = Dungeon("")
        if dungeon.json_dungeon is not None:
            dungeon.dungeon_control(parser, player)
    else:
        dungeon_creator = DungeonCreator()
        file_name = dungeon_creator.generate_dungeon(parser, player)
        dungeon = Dungeon(os.path.join(os.path.dirname(__file__), "UserDefinedFiles", file_name))
        if dungeon.json_dungeon is not None:
            dungeon.dungeon_control(parser, player)
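A minimal sketch of launching the controller described in the docstring; it only assumes this module is executed directly.

if __name__ == "__main__":
    run_game()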
Example #25
def define_type(soup):  # noun, verb, etc.
    # Adjective
    type1 = soup.find_all("p")
    for el in type1:
        # print(el)
        try:
            text = ""
            for el in el.text:
                text = text + TP.make_lower(el)
            for z in types:
                if text.find(z) != -1:
                    return z
            # print(text)
        except:
            pass
    # print()

    # Nouns, verbs
    type1 = soup.find_all("p")
    for el in type1:
        q = el.find("a")
        if q != None:
            # print(1, el)
            try:
                b = True
                name1 = q.get("title")
                # print(q)
                for el in name1:
                    if el not in alpha:
                        b = False
                if b and name1 in types:
                    return name1

            except:
                pass

    return "другое"
Example #26
def get_info(word):  #get info about word
    w = word
    word = ""
    for el in w:
        word = word + TP.make_lower(el)
    text = get_article(word)
    if not text:
        #print("Word not found")
        return False
    soup = bs4.BeautifulSoup(text, "lxml")
    q = XMLparser.define_type(soup)
    #fix the differing gender names for nouns and adjectives
    #two files: fully known words, the others
    #

    if q == "существительное":
        z = [
            word, q,
            XMLparser.noun_prop(soup),
            XMLparser.noun_changing(soup)
        ]  # add a skip for proper nouns
    elif q == "прилагательное":
        z = [
            word, q,
            XMLparser.adjective_prop(soup),
            XMLparser.adjective_changing(soup)
        ]  #, XMLparser.noun_changing(soup)
    elif q == "глагол":
        z = [word, q, XMLparser.verb_prop(soup), XMLparser.verb_changing(soup)]
    elif q == "наречие":
        z = [word, q, XMLparser.adverb_prop(soup)]
    else:
        return False
    if "собств." in z[2]:
        return False
    return z
Example #27
from TextParser import *


def analyze(string):
    sentiment = ''
    blob = TextBlob(string)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        sentiment = 'pos'
    elif polarity == 0:
        sentiment = 'neu'
    elif polarity < 0:
        sentiment = 'neg'

    return sentiment, polarity


if __name__ == '__main__':

    server = couchdb.Server('http://115.146.95.53:5984')
    db = server['twitter_rest']

    TextParser.getStopWords()
    textParser = TextParser()
    for row in db.view('C2E2View/C2E2'):
        doc = db.get(row.id)
        (tag, score) = analyze(textParser.parsing(row.value['what']['text']))
        doc['sentiment'] = {'sentiment': tag, 'sentiScore': score}
        db.save(doc)
Example #28
File: Indexer.py Project: jbjjbjjbj/Lexic
#!/usr/bin/python3

import sys
import os.path
import sqlite3
import mimetypes
import TextParser


# Try and catch any wrong input in arguments
for argument in sys.argv[1:]:
    if not os.path.isfile(argument):
        raise Exception("File " + argument + " does not exist")
    if mimetypes.guess_type(argument)[0] != 'text/plain':
        raise Exception("File " + argument + " is not a plain text file")


for argument in sys.argv[1:]:
    TextParser.parse(argument)
Example #29
def process_text_feature(db, issue_id, query_dict, non_repeat,
                         issue_collection, comment_collection):
    # check to see if the issue is already processed
    if non_repeat:
        id = "%s/%s/%d" % (query_dict["repo"], query_dict["owner"], issue_id)
        if db[issue_collection].find_one({"_id": id}):
            logging.info("%s already processed" % id)
            return

    issue_start = time.time()

    query_dict["issue_id"] = issue_id  # set issue_id for querying

    comments = db.issue_comments.find(query_dict)  #get all comments

    # take the top2 for perspective score;
    perspective_scores = []

    # set a data struc to store everything
    total_comment_info = {
        "total_reference": 0,
        "total_url": 0,
        "total_emoji": 0,
        "total_mention": 0,
        "total_plus_one": 0,
        "total_text": "",
    }

    # Senti4SD document preparation
    input_senti4sd_filename = "input_%s_%s_%d.csv" % (
        query_dict["owner"], query_dict["repo"], issue_id)
    output_senti4sd_filename = "output_%s_%s_%d.csv" % (
        query_dict["owner"], query_dict["repo"], issue_id)
    feature_senti4sd_filename = "extractedFeatures_%s_%s_%d.csv" % (
        query_dict["owner"], query_dict["repo"], issue_id)
    f = open(senti4SD_address + input_senti4sd_filename, 'w')

    comment_info_l = []
    comment_sentence_l = []
    # process comments
    for comment in comments:

        logging.debug("Issue id: %d Comment: %s " % (issue_id, comment))

        comment_info = process_comment(comment)

        if not comment_info[
                "valid"]:  # if comment is not valid, go to next loop
            continue

        comment_info_l.append(
            comment_info)  # add valid comment_info into a list

        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sent_detector.tokenize(comment_info["text"].strip())
        comment_sentence_l.append(len(sentences))

        total_comment_info["total_reference"] += comment_info["num_reference"]
        total_comment_info["total_url"] += comment_info["num_url"]
        total_comment_info["total_emoji"] += comment_info["num_emoji"]
        total_comment_info["total_mention"] += comment_info["num_mention"]
        total_comment_info["total_plus_one"] += comment_info["num_plus_one"]

        if len(perspective_scores) > 2:
            if min(perspective_scores) < comment_info["perspective_score"]:
                _ = heapq.heappushpop(perspective_scores,
                                      comment_info["perspective_score"])
        else:
            perspective_scores.append(comment_info["perspective_score"])
        # write all comments to 1 part
        if total_comment_info["total_text"] == "":
            total_comment_info["total_text"] += comment_info["text"]
        else:
            total_comment_info["total_text"] += " " + comment_info["text"]
        f.write(comment_info["text"] + "\n")  # write to csv

    # close input senti4sd file
    f.close()

    # check total_text here, it is empty skip the rest!!!
    if total_comment_info[
            "total_text"] == "":  # too little text, no need to write into database
        return

    if len(perspective_scores):
        total_comment_info["perspective_score"] = sum(
            perspective_scores) / len(perspective_scores)
    else:
        total_comment_info["perspective_score"] = 0
    # aggregate some features on the issue level ###!!! may need to change this part !!!###
    total_comment_info["length"] = TextParser.get_length(
        total_comment_info["total_text"])
    total_comment_info["avg_word_length"] = TextParser.get_avg_length(
        total_comment_info["total_text"])
    total_comment_info["num_punct"] = TextParser.count_punct(
        total_comment_info["total_text"])
    total_comment_info["num_QEMark"] = TextParser.count_QEMark(
        total_comment_info["total_text"])
    total_comment_info["num_one_letter_word"] = TextParser.count_one_letter(
        total_comment_info["total_text"])
    total_comment_info["num_capital"] = TextParser.count_captial(
        total_comment_info["total_text"])
    total_comment_info[
        "num_non_alpha_in_middle"] = TextParser.count_non_alpha_in_middle(
            total_comment_info["total_text"])
    total_comment_info["num_modal_word"] = TextParser.count_modal_word(
        total_comment_info["total_text"])
    total_comment_info["num_unknown_word"] = TextParser.count_unknown_word(
        total_comment_info["total_text"])
    total_comment_info["num_insult_word"] = TextParser.count_insult_word(
        total_comment_info["total_text"])

    # sent to Senti4SD for score, we would want a result for each comment
    senti_start = time.time()
    senti_l = get_senti4SD(
        input_senti4sd_filename, output_senti4sd_filename,
        feature_senti4sd_filename)  # change this to returning a list

    # for each comment_info add a senti4sd classification
    for i in range(len(senti_l)):
        comment_info_l[i]["senti_4sd"] = senti_l[i]

    # calculate an aggregation
    total_comment_info["senti4sd_positive_percentage"] = senti_l.count(
        "positive") / len(senti_l)
    total_comment_info["senti4sd_neutral_percentage"] = senti_l.count(
        "neutral") / len(senti_l)
    total_comment_info["senti4sd_negative_percentage"] = senti_l.count(
        "negative") / len(senti_l)

    # logging.info("senti4sd took %d" % (time.time()-senti_start))

    # stanford politeness API
    politeness_start = time.time()
    # make a pickle file using coreNLP
    coreNLP_parse(
        input_senti4sd_filename, comment_sentence_l
    )  # comment_l is a list that stores number of sentences each comment has

    # pass this pickle to stanford politeness api
    calculate_stanford_politeness_score(input_senti4sd_filename,
                                        output_senti4sd_filename)

    # read from csv the score, the first one is for total, then for each comment
    score_df = pd.read_csv(stanford_politeness_score_address +
                           output_senti4sd_filename,
                           header=None)
    for row in score_df.itertuples():
        if row.Index == 0:
            total_comment_info["stanford_positive"] = row._1
            total_comment_info["stanford_negative"] = row._2
        else:
            comment_info_l[row.Index - 1]["stanford_positive"] = row._1
            comment_info_l[row.Index - 1]["stanford_negative"] = row._2

    # logging.info("stanford took %d" % (time.time()-politeness_start))

    # set_id
    total_comment_info["repo"] = query_dict["repo"]
    total_comment_info["owner"] = query_dict["owner"]
    total_comment_info["issue_id"] = query_dict["issue_id"]
    total_comment_info["_id"] = "%s/%s/%d" % (
        query_dict["repo"], query_dict["owner"], query_dict["issue_id"])

    # log for checking to somewhere, not multi-process safe
    # logging.debug(total_comment_info)

    # insert total_comment_info to database
    db[issue_collection].update_one(
        {"_id": total_comment_info["_id"]}, {"$set": total_comment_info},
        upsert=True)  #update document with newly created features

    # insert each comment_info to database
    for comment_info in comment_info_l:
        db[comment_collection].update_one({"_id": comment_info["_id"]},
                                          {"$set": comment_info},
                                          upsert=True)
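A hedged usage sketch for process_text_feature; the MongoDB connection string, database name, owner/repo values, and collection names below are hypothetical placeholders.

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")  # hypothetical local MongoDB
db = client["github_data"]                                 # hypothetical database name
query = {"owner": "octocat", "repo": "hello-world"}        # hypothetical repository
process_text_feature(db, 1, query, True, "issue_features", "comment_features")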
Example #30
def process_comment(doc):

    text = doc["body"]
    if TextParser.contain_non_english(
            text):  # if the text contains non-english, we terminate early
        return {
            "valid": False,
            "num_reference": 0,
            "num_url": 0,
            "num_emoji": 0,
            "num_mention": 0,
            "num_plus_one": 0,
            "perspective_score": 0,
            "text": ""
        }

    num_reference = TextParser.count_reference_line(text)
    # print("num_reference: %d" % num_reference)
    text = TextParser.remove_reference(text)
    # print("text 0: %s" % text)

    text = TextParser.transform_markdown(
        text)  # use mistune to transform markdown into html for removal later.
    # print("text 1: %s" % text)

    text = TextParser.remove_inline_code(text)  # used place-holder: InlineCode
    # print("text 2: %s" % text)

    text = TextParser.remove_html(text)
    # print("text 3: %s" % text)

    num_url = TextParser.count_url(text)
    # print("num_url: %d" % num_url)
    text = TextParser.remove_url(text)
    # print("text 4: %s" % text)

    num_emoji = TextParser.count_emoji(text)
    # print("num_emoji: %d" % num_emoji)
    text = TextParser.remove_emoji_marker(
        text)  # remove the two semi-colons on two sides of emoji
    # print("text 5: %s" % text)
    text = TextParser.remove_newline(text)
    # print("text 6: %s" % text)

    num_mention = TextParser.count_mention(text)
    # print("num_mention: %d" % num_mention)
    text = TextParser.replace_mention(text)
    # print("text 7: %s" % text)
    # sub all "+1" to "plus one"
    num_plus_one = TextParser.count_plus_one(text)
    # print("num_plus_one: %d" % num_plus_one)
    text = TextParser.sub_PlusOne(text)
    # print("text 8: %s" % text)

    perspective_score = get_perspective_score(text)

    return {
        "_id":
        "%s/%s/%d/%d" %
        (doc["repo"], doc["owner"], doc["issue_id"], doc["id"]),
        "repo":
        doc["repo"],
        "owner":
        doc["owner"],
        "issue_id":
        doc["issue_id"],
        "comment_id":
        doc["id"],
        "valid":
        True,
        "num_reference":
        num_reference,
        "num_url":
        num_url,
        "num_emoji":
        num_emoji,
        "num_mention":
        num_mention,
        "num_plus_one":
        num_plus_one,
        "perspective_score":
        perspective_score,
        "text":
        text
    }
Example #31
	def parseResponse (self,rawResponse):
		self.__content=""
		self.__headers=[]

		tp=TextParser()
		tp.setSource("string",rawResponse)

		tp.readLine()
		tp.search("(HTTP\S*) ([0-9]+)")

		try:
			self.protocol=tp[0][0]
		except:
			self.protocol="unknown"

		try:
			self.code=tp[0][1]
		except:
			self.code="0"

#		try:
#			self.message=tp[2]
#		except:
#			self.message="unknown"

		self.code=int(self.code)

		while True:
			tp.readLine()
			if (tp.search("^([^:]+): ?(.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		while tp.skip(1):
			self.addContent(tp.lastFull_line)
Example #32
	def parsePOSTDATA(self,pd):

		if self.ContentType=="application/x-www-form-urlencoded":
			dicc=self.readUrlEncodedVariables(pd)
			self.__variablesPOST.update(dicc)

		elif self.ContentType=="multipart/form-data":
			self.multiPOSThead={}
			dicc={}
			tp=TextParser()
			tp.setSource("string",pd)
		#	print self.boundary
		#	print tp.readUntil("%s$" % (self.boundary))

			while True:
				headers=[]
				if not tp.readUntil("name=\"([^\"]+)\""):
					break
				var=tp[0][0]
				headers.append(tp.lastFull_line.strip())
				while True:
					tp.readLine()
					if tp.search("^([^:]+): (.*)$"):
						headers.append(tp.lastFull_line.strip())
					else:
						break

				value=""
				while True:
					tp.readLine()
					if not tp.search(self.boundary):
						value+=tp.lastFull_line
					else:
						break

				if value[-2:]=="\r\n":
					value=value[:-2]


				dicc[var]=value
				self.multiPOSThead[var]=headers

				if tp.search(self.boundary+"--"):
					break

			
			self.__variablesPOST.update(dicc)
#			print pd
#			print dicc
#			print self.__variablesPOST

		else:
			self.__uknPostData=pd
Example #33
	def getNewPage(self):

		if self.MoreResults==False:
			raise StopIteration

		if self.start==None:
			self.start=self.startIndex
		else:
			self.start+=self.increment

		req=Request()
		req.addHeader("User-Agent","Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14")
		url=self.url.replace("{query}",str(self.query))
		url=url.replace("{startvar}",str(self.start))


		req.setUrl(url)
		req.setTotalTimeout(10)
		req.setConnTimeout(10)
		if self.cookie:
			req.addHeader("Cookie",self.cookie)

		req.setProxy(self.proxy)
		req.setFollowLocation(True)

		trys=5
		while trys:
			try:
				req.perform()
				break
			except:
				trys-=1
				if not trys:
					self.status="Failed"
					raise StopIteration
				pass

		if not req.response.has_header('Content-Type') or (not 'text' in req.response['Content-Type'] and not 'script' in req.response['Content-Type']) :
			self.MoreResults=False
			return

		rawResponse=self.preProcess(req.response.getContent())

		self.cookie=req.response.getCookie()

		tp=TextParser()
		tp.setSource("string",rawResponse)
		if req.response.code==200:
			self.responseContent=req.response.getContent()

		while tp.readUntil(self.urlRegexp):
			for i in tp:
				self.addResult(i)

		tp.seekinit()

		if tp.readUntil(self.nextRegexp):
			self.MoreResults=True
		else:
			self.MoreResults=False

		self.REQ=req
Example #34
def learnLemmasByOrderOfScore(getSentenceScore):
    # Scheme: Learn words as they become possible to learn, in terms of sentences, in order of score

    # Initialize: Load all texts in Texts folder:
    TextParser.addAllTextsFromDirectoryToDatabase("Texts")

    # Will only contain sentences with fewer than or equal to one missing word, marked in order of the missing word's frequency
    directlyUnlockableLemmasScore, sentencePairsBySentenceScore, directlyUnlockableLemmas = getPriorityQueueOfDirectlyLearnableSentencesByLemmaFrequency(getSentenceScore)
    lemmasByFrequency = getPriorityQueueOfLemmasByFrequency()

    # Find which words one is forced to learn, without being able to isolate it to one sentence:
    forcedToLearn = []
    notForcedToLearn = []
    orderedLearningList = []
    #First we remove all words that are not true "words", for example names, by learning the NotAWordLemma lemma:
    learnLemmaAndHandleSentencesWithLemmaFrequency(TextParser.NotAWordLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)

    i = 0
    numberOfLemmas = len(lemmasByFrequency)
    print("Start learning lemmas: " + str(len(lemmasByFrequency)))

    highestScoringDirectlyLearnableSentencePair = None
    highestScoringDirectlyLearnableSentencePairScore = None
    while not hasLearnedAllLemmas(lemmasByFrequency):
        (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)

        while hasDirectlyLearnableSentence(directlyUnlockableLemmas):
            currentSentencePair = getHighestScoringUnforcedSentencePair(sentencePairsBySentenceScore, highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore)
            
            assert i + len(lemmasByFrequency) == numberOfLemmas

            # No new word in the sentence:
            if hasNoNewLemmas(currentSentencePair):
                continue
            
            assert i + len(lemmasByFrequency) == numberOfLemmas

            # A new pair of words to learn: lets do it!
            kage = 1
            #TODO (*) A bug still needs to be removed in connection with updating sentences whose sentence score depends on other sentences.
            for sentence in currentSentencePair:
                if sentence == None:
                    continue
                if sentence.associatedLearnableSentence != None:
                    sentence.associatedLearnableSentence.scoreDependentSentences.remove(sentence)

                newLemma = sentence.getOnlyUncoveredLemma()
                orderedLearningList.append((newLemma, sentence))
                learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)            
                if i % 1 == 0 or i < 4000:
                    print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + sentence.rawSentence)
                i += 1  
                
                assert i + len(lemmasByFrequency) == numberOfLemmas
                
            (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)
            

        if hasLearnedAllLemmas(lemmasByFrequency):  # When all words have been learned in the loop above
            break

        # There are no more free words: time to learn a frequent word:
        newLemma = getHighestScoringLemma(lemmasByFrequency)
        orderedLearningList.append((newLemma, "NONE"))
        learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, forcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)            
        if i < 6000:
            print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + "NONE")
        i += 1
        assert i + len(lemmasByFrequency) == numberOfLemmas

    print("Learned directly " + str(len(orderedLearningList)) + " of " + str(numberOfLemmas) + " lemmas.")
    return orderedLearningList
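A hedged sketch of driving the learner above; it assumes getSentenceScore receives one of the project's sentence objects, and scoring by raw sentence length is only an illustration.

def score_by_length(sentence):
    # Illustrative score only: the length of the raw sentence text.
    return len(sentence.rawSentence)

learning_order = learnLemmasByOrderOfScore(score_by_length)
for lemma, sentence in learning_order[:10]:
    print(lemma, "->", sentence)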
Example #35
    def parseMultipart(self, cad, boundary):
        self.boundary = boundary
        dicc = []
        tp = TextParser()
        tp.setSource("string", cad)

        while True:
            headers = []
            if not tp.readUntil("name=\"([^\"]+)\""):
                break
            var = tp[0][0]
            headers.append(tp.lastFull_line.strip())
            while True:
                tp.readLine()
                if tp.search("^([^:]+): (.*)$"):
                    headers.append(tp.lastFull_line.strip())
                else:
                    break

            value = ""
            while True:
                tp.readLine()
                if not tp.search(boundary):
                    value += tp.lastFull_line
                else:
                    break

            if value[-2:] == "\r\n":
                value = value[:-2]

            dicc.append(Variable(var, value, headers))

        self.variables = dicc
Example #36
from reqresp import *
from TextParser import *

#
# http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg=0&po=0&pro=9&paso=1
#

a=Request()

a.setUrl("http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg=0&po=0&pro=9&paso=1")

a.perform()

HTML=a.response.getContent()

tp=TextParser()
tp.setSource("string",HTML)

lista=[]

while tp.readUntil("<tr class='contenido_tab' onClick='window.location=\"([a-zA-Z0-9\.-]+)\""):
	# Improved regular expression ("<tr class='contenido_tab' onClick='window.location=\"([^\"]+)\""):
	link=tp[0][0]

	tp.readUntil("<td class='contenido_tab' align='right'>([0-9]+)</td>")
	precio=int(tp[0][0])

	lista.append([link,precio])


for ln,euros in lista:
Example #37
File: test.py Project: jorji/TextParser
#
# Copyright (C) 2012-2015 Institute of Industrial Science, The University of Tokyo.
# All rights reserved.
#
# Copyright (c) 2014-2016 Advanced Institute for Computational Science, RIKEN.
# All rights reserved.
#
# Copyright (c) 2016-2018 Research Institute for Information Technology, Kyushu University.
# All rights reserved.
#
###################################################################################
#

import TextParser as tp

instance = tp.createInstance()

print "test for read()"
print tp.read(instance, "../Examples/test.tpp")

print "test for getVersionInfo()"
print tp.getVersionInfo()

print "test for getAllLabels()"
print tp.getAllLabels(instance)

print "test for getType()"
print tp.getType(instance, '/config/test_double_max')

print "test for getValue()"
print tp.getValue(instance, '/config/cell_start')
Example #38
from enum import Enum
import TextParser

class GameState(Enum):
    start = 0
    playing = 1
    gameOver = 2

gameState = GameState.start

while gameState == GameState.start: 
    print("Welcome to the Python Text adventure!")
    input("Press any key to continue...\n")
    gameState = GameState.playing

print("You are now playing the python text-based adventure game!")

while gameState == GameState.playing:
    userInput = input(">") 
    TextParser.parseInput(userInput)
    
    
Example #39
	def parseRequest (self,rawRequest,prot="http"):
		''' Still in BETA and not yet tested '''
		tp=TextParser()
		tp.setSource("string",rawRequest)

		self.__variablesPOST={}
		self.__headers={}		# dictionary, e.g. headers["Cookie"]


		tp.readLine()
		try:
			tp.search("(\w+) (.*) (HTTP\S*)")
			self.method=tp[0][0]
			self.protocol=tp[0][2]
		except:
			print "error en"
			print rawRequest
			return

		pathTMP=tp[0][1]
		pathTMP=('','')+urlparse(pathTMP)[2:]
		pathTMP=urlunparse(pathTMP)
		pathTMP=pathTMP.replace("//","/")
		self.time=strftime("%H:%M:%S", gmtime())

		while True:
			tp.readLine()
			if (tp.search("^([^:]+): (.*)$")):
				self.addHeader(tp[0][0],tp[0][1])
			else:
				break

		self.setUrl(prot+"://"+self.__headers["Host"]+pathTMP)

		if self.method.upper()=="POST":

			pd=""
			while tp.readLine(): 
				pd+=tp.lastFull_line


			if "Content-Type" in self.__headers:
				values=self.__headers["Content-Type"].split(";")
				if values[0].strip().lower()=="application/x-www-form-urlencoded":
					self.ContentType=values[0]
				elif values[0].strip().lower()=="multipart/form-data":
					self.ContentType=values[0]
					self.boundary=values[1].split("=")[1].strip()

			self.parsePOSTDATA(pd)
Example #40
# TextParser - Text Parsing Library
#
# Copyright (C) 2012-2015 Institute of Industrial Science, The University of Tokyo.
# All rights reserved.
#
# Copyright (c) 2014-2016 Advanced Institute for Computational Science, RIKEN.
# All rights reserved.
#
# Copyright (c) 2016-2018 Research Institute for Information Technology, Kyushu University.
# All rights reserved.
#
###################################################################################
#

import TextParser as tp
instance=tp.createInstance()

print "test for read()"
print tp.read(instance, "../Examples/test.tpp")

print "test for getVersionInfo()"
print tp.getVersionInfo()

print "test for getAllLabels()"
print tp.getAllLabels(instance)

print "test for getType()"
print tp.getType(instance, '/config/test_double_max')

print "test for getValue()"
print tp.getValue(instance, '/config/cell_start')
Example #41
elif argv[1] == "train":

	# Instantiate tfidf object to run tf-idf on documents
	topic_tfidf = tfidf()

	PreProcessor.preprocessTopicEngine(topic_tfidf,"NYT_articles.txt")
	topic_engine = TopicEngine(topic_tfidf.doc_word_list)

	topic_engine.GibbsSampling()

else:
	url = argv[1]
	text = argv[2]

	# Get processed document as a list of words
	doc = TextParser.return_word_list(text)

	# Run tf-idf on inputted document
	article_tfidf = new_tfidf()
	doc_dict = article_tfidf.return_tfidf_dict(doc)

	# get words associated with topic

	doc_word_list = eval(open("doc_word_list2.txt",'r').read())
	topics = eval(open("topics.txt",'r').read())
	topics_dict = {}

	doc_count = 0
	vocab = list(set(eval(open ('word_list_total.txt','r').read())))

	# Putting words in topic buckets
Example #42
	def parsePOSTDATA(self,pd):

		if self.ContentType=="application/x-www-form-urlencoded":
			dicc=self.readUrlEncodedVariables(pd)
			[self.addVariablePOST(i,j) for i,j in dicc]

		elif self.ContentType=="multipart/form-data":
			self.multiPOSThead={}
			dicc={}
			tp=TextParser()
			tp.setSource("string",pd)
		#	print self.boundary
		#	print tp.readUntil("%s$" % (self.boundary))

			while True:
				headers=[]
				if not tp.readUntil("name=\"([^\"]+)\""):
					break
				var=tp[0][0]
				headers.append(tp.lastFull_line.strip())
				while True:
					tp.readLine()
					if tp.search("^([^:]+): (.*)$"):
						headers.append(tp.lastFull_line.strip())
					else:
						break

				value=""
				while True:
					tp.readLine()
					if not tp.search(self.boundary):
						value+=tp.lastFull_line
					else:
						break

				if value[-2:]=="\r\n":
					value=value[:-2]


				dicc[var]=value
				self.multiPOSThead[var]=headers

				if tp.search(self.boundary+"--"):
					break

			
			self.__variablesPOST.update(dicc)
#			print pd
#			print dicc
#			print self.__variablesPOST

		else:
			self.__uknPostData=pd
Example #43
    def Run(self):
        db = SqliteDatabase(Globals.EmailsFileName)
        if not db.OpenConnection():
            return

        self.bloomFilter = self.CreateBloomFilter()
        #self.bloomFilter = None

        logFileName = PlatformMethods.Decode(
            os.path.join(
                Globals.CasePath,
                (Globals.EmailsFileName[Globals.EmailsFileName.rfind(os.sep) +
                                        1:] + '.log')))
        self.fout = open(logFileName, 'ab')
        #print self.CheckedMimeTypes
        self.fout.write(
            'Parsing/Indexing Emails Attachments Started at: %s\n' %
            (time.ctime()))

        if self.AddressBookPath:
            self.ParseStatus = "Parsing Address book..."
            self.SendEvent()
            AddressBookParser = OutlookAddressBook.AddressBookParser(
                Globals.AddressBookDict)
            for root, dirs, files in os.walk(self.AddressBookPath):
                for eachfile in files:
                    filePath = os.path.join(root, eachfile)
                    self.FilesCount += 1
                    if (filePath.rfind('.') == -1):
                        continue
                    #print filePath
                    extension = filePath[filePath.rfind('.'):]
                    #print 'extension ', extension
                    if extension.lower() == ".csv":
                        AddressBookParser.Parse(filePath)
                        #print 'add book parsed'
        else:
            self.fout.write('No Addressbook path found!\n')
        #Update Addressbook
        query1 = "insert into " + Constants.AddressBookTable + "(EmailID, FirstName, MiddleName, LastName, InBook) values (?,?,?,?,?)"
        ManyValues = []
        for key in Globals.AddressBookDict:
            #'EmailID': email, 'FirstName': firstName, 'MiddleName': middleName, 'LastName': lastName, 'InBook':1}
            ManyValues.append((Globals.AddressBookDict[key]['EmailID'],
                               Globals.AddressBookDict[key]['FirstName'],
                               Globals.AddressBookDict[key]['MiddleName'],
                               Globals.AddressBookDict[key]['LastName'],
                               Globals.AddressBookDict[key]['InBook']))

        #query = "delete from %s"%Constants.AddressBookTable
        #db.ExecuteNonQuery(query)
        #print ManyValues
        db.ExecuteMany(query1, ManyValues)

        #self.ParseStatus = "Done Preprocessing/Indexing Emails!"
        #return

        textParser = TextParser.TextParser(db,
                                           Globals.EmailsStopwords,
                                           self.Stemmer,
                                           bloomFilter=self.bloomFilter)
        docxParser = DocxParser.DocxParser(db,
                                           Globals.EmailsStopwords,
                                           self.Stemmer,
                                           bloomFilter=self.bloomFilter)
        docParser = DocParser.DocParser(db,
                                        Globals.EmailsStopwords,
                                        self.Stemmer,
                                        bloomFilter=self.bloomFilter)
        docQuery = "insert into %s (DocPath, DocType) values (?, ?)" % (
            Constants.TextCatDocumentsTable)

        if self.AttachmentsPath:
            for root, dirs, files in os.walk(self.AttachmentsPath):
                for eachfile in files:
                    filePath = os.path.join(root, eachfile)

                    fileNameList = eachfile.split()
                    if len(fileNameList) >= 2:
                        dateTimeFileName = "%s %s - %s" % (
                            fileNameList[0],
                            (fileNameList[1].replace(".", ":")),
                            (eachfile[eachfile.rfind('-') + 1:]))

                        if self.AttachmentsDict.has_key(dateTimeFileName):
                            self.AttachmentsDict[dateTimeFileName].append(
                                filePath)
                        else:
                            self.AttachmentsDict[dateTimeFileName] = [filePath]
                            #print 'Interesting! more than 1 attach. file found with same date time: %s'%
                        #else:
                        #    self.AttachmentsDict[dateTimeFileName] = filePath
                    else:
                        self.fout.write(
                            'Attachment filename found without date time: %s\n'
                            % (PlatformMethods.Encode(filePath)))

        #AttachmentsDict, Stopwords=[], Stemmer=None
        self.outlookTextParser = OutlookTextParser.OutlookTextParser(
            db,
            self.AttachmentsDict,
            Globals.EmailsStopwords,
            self.Stemmer,
            bloomFilter=self.bloomFilter,
            logFile=self.fout)

        if self.IndexMessages:
            self.ParseStatus = "Parsing and Indexing Emails..."
        else:
            self.ParseStatus = "Parsing Email Headers..."

        self.SendEvent()
        for root, dirs, files in os.walk(self.EmailsPath):
            if not self.keepGoing:
                self.running = False
                return

            for eachfile in files:
                self.FilesCount += 1
                if not self.keepGoing:
                    self.running = False
                    return

                filePath = os.path.join(root, eachfile)
                #print filePath
                if (filePath.rfind('.') == -1):
                    continue

                try:
                    extension = filePath[filePath.rfind('.'):]
                    fileType = wx.TheMimeTypesManager.GetFileTypeFromExtension(
                        extension)
                    if fileType:
                        mimeType = fileType.GetMimeType() or "unknown"
                        if mimeType == "text/plain":
                            try:
                                self.outlookTextParser.parse(
                                    filePath, self.IndexMessages)
                            except Exception, msg:
                                self.fout.write(
                                    'Error Parsing Message: %s Msg:: %s\n' %
                                    (PlatformMethods.Encode(filePath), msg))

                            self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                                time.time() - self.StartTime)

                    if (time.time() - self.EventStart) > 10:
                        self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(
                            time.time() - self.StartTime)
                        self.SendEvent()

                except Exception, value:
                    #try:
                    self.fout.write(
                        "Error Parsing Message: %s Msg: %s\n" %
                        (PlatformMethods.Encode(filePath), str(value)))
                    self.fout.flush()
Example #44
def getPage(n):
	a=Request()
	
	a.setUrl("http://www.infoempleo.com/trabajo/en_barcelona/area-de-empresa_informatica/pagina_"+str(n))
	
	a.perform()
	
	HTML=a.response.getContent()
	
	tp=TextParser()
	tp.setSource("string",HTML)
	
	lista=[]
	
	while tp.readUntil('<tr class="[AB]">'):
		tp.readUntil('<td class="col1"><span>(.*)</span></td>')
		fecha=tp[0][0].split("-")
		
		if 9 == int(fecha[0]) and datetime.today().month == int(fecha[1]):
			tp.readUntil('<td class="col2"><a.*?>(.*)</a></td>')
			oferta=tp[0][0]
			
			tp.readUntil('<td class="col3"><strong><a.*?>(.*)</a></strong></td>')
			empresa=tp[0][0]

			tp.readUntil('<td class="col4"><a.*?>(.*)</a></td>')
			lugar=tp[0][0]

			tp.readUntil('<td class="col5">(.*)</td>')
			inscritos=tp[0][0]

			lista.append([oferta,empresa,lugar,inscritos])

	print "Descargada pagina",n

	return lista
Example #45
"""
Author: Team 8
City: Chicago
Subject: COMP90024
"""

from collections import Counter

import couchdb
from textblob import TextBlob

from TextParser import *


# This file aims to find the most frequently mentioned words in a set of tweets from a given view.
# It is used in the scenario analysis part of the report.
if __name__ == '__main__':
    server = couchdb.Server('http://115.146.95.53:5984/')
    db = server['twitter_rest']

    TextParser.getStopWords()
    textParser = TextParser()

    counter = Counter()
    for row in db.view('C2E2View/C2E2'):
        blob = TextBlob(textParser.parsing(row.value['what']['text']))
        for word in blob.words:
            counter[word] += 1
    top_words = counter.most_common(20)
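A small follow-up sketch, continuing inside the same __main__ block above: print the 20 most common words that were collected.

    for word, count in top_words:
        print(word, count)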