def sent_header_callback(self, type, data):
    if type == pycurl.INFOTYPE_HEADER_OUT:
        tp = TextParser()
        tp.setSource("string", data)
        while tp.readUntil("^([^:]+): (.*)$"):
            self.addHeader(tp[0][0], tp[0][1])
def parseMultipart(self, cad, boundary):
    self.boundary = boundary
    dicc = []
    tp = TextParser()
    tp.setSource("string", cad)
    while True:
        headers = []
        if not tp.readUntil("name=\"([^\"]+)\""):
            break
        var = tp[0][0]
        headers.append(tp.lastFull_line.strip())
        while True:
            tp.readLine()
            if tp.search("^([^:]+): (.*)$"):
                headers.append(tp.lastFull_line.strip())
            else:
                break
        value = ""
        while True:
            tp.readLine()
            if not tp.search(boundary):
                value += tp.lastFull_line
            else:
                break
        if value[-2:] == "\r\n":
            value = value[:-2]
        dicc.append(Variable(var, value, headers))
    self.variables = dicc
def parseResponse(self, rawResponse, type="curl"):
    self.__content = ""
    self.__headers = []
    tp = TextParser()
    tp.setSource("string", rawResponse)
    while True:
        tp.readUntil("(HTTP\S*) ([0-9]+)")
        try:
            self.protocol = tp[0][0]
        except:
            self.protocol = "unknown"
        try:
            self.code = tp[0][1]
        except:
            self.code = "0"
        if self.code != "100":
            break
    self.code = int(self.code)
    while True:
        tp.readLine()
        if tp.search("^([^:]+): ?(.*)$"):
            self.addHeader(tp[0][0], tp[0][1])
        else:
            break
    while tp.skip(1):
        self.addContent(tp.lastFull_line)
    if type == 'curl':
        self.delHeader("Transfer-Encoding")
    if self.header_equal("Transfer-Encoding", "chunked"):
        result = ""
        content = StringIO.StringIO(self.__content)
        hexa = content.readline()
        nchunk = int(hexa.strip(), 16)
        while nchunk:
            result += content.read(nchunk)
            content.readline()
            hexa = content.readline()
            nchunk = int(hexa.strip(), 16)
        self.__content = result
    if self.header_equal("Content-Encoding", "gzip"):
        compressedstream = StringIO.StringIO(self.__content)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        body = gzipper.read()
        self.__content = body
        self.delHeader("Content-Encoding")
def get_basefilename(dfi_filename):
    """Return the value of "/Header/BaseFileName" from the dfi file passed as an argument."""
    instance = TextParser.getInstanceSingleton()
    TextParser.read(instance, dfi_filename)
    rt = TextParser.getValue(instance, "/Header/BaseFileName")
    if rt[0] != 0:
        raise RuntimeError("failed to read /Header/BaseFileName from " + dfi_filename)
    return rt[1]
def __init__(self, parent, channel, data, *args, **kwargs):
    super(BackgroundManager, self).__init__(*args, **kwargs)
    self.parent = parent
    poll = parser.Poll(data)
    self.counter = parser.ChatHandler([poll])
    self.chat_bot = bot.ChatBot(UI.USERNAME, UI.PASSWORD)  # change to use event info
    self.chat_bot.connect_to_irc()
    self.chat_bot.connect_to_channel(channel)
    # self.daemon = True
    self.start()
def __init__(self, file, growing=False, exclude=""):
    self.tp = TextParser()
    self.tp.setSource("file", file)
    self.tp.growing = growing
    self.reqs = []
    self.resp = []
    self.reqs_exclude = []
    self.resp_exclude = []
    self.exclude = exclude
def get_sources(self, cat):
    from TextParser import TextParser

    word_counts = []
    if cat not in self.files:
        print("category not found: " + cat)
        return word_counts
    for file in self.files[cat]:
        parser = TextParser()
        parser.parse(file)
        word_counts.append(parser.get_word_counts())
    return word_counts
def print_feature(files_dir, parsed_files_dir, results_dir, links_file):
    """
    Write the feature values of every link from prominents in a new file.

    Args:
        files_dir        : directory containing all non-parsed text files
        parsed_files_dir : directory containing all parsed text files
        results_dir      : directory containing results
        links_file       : file listing all links from prominents
    """
    print("Computing similarity from \"LINKS\" file.")
    lost = 0
    # Reading the links file (all outgoing and incoming links from prominent articles)
    with open(links_file, 'r', encoding='utf-8') as links:
        with open(os.path.join(results_dir, "cosine_similarity_feature"), "w", encoding='utf-8') as out:
            # nb_lines = 0
            for line in links:
                # get the file names (and encode in base64 if necessary)
                (title1, title2) = get_article_titles(line)
                (filename1, filename2) = get_article_names(files_dir, title1, title2)
                # create parsed files
                parsed_file_1 = TextParser.parse_file(filename1, files_dir, parsed_files_dir)
                parsed_file_2 = TextParser.parse_file(filename2, files_dir, parsed_files_dir)
                # compute feature
                feature = -1
                if os.path.isfile(parsed_file_1) and os.path.isfile(parsed_file_2):
                    try:
                        feature = similarity(parsed_file_1, parsed_file_2)
                    except ValueError:
                        # compute similarity on unparsed files
                        src_file1 = os.path.join(files_dir, filename1)
                        src_file2 = os.path.join(files_dir, filename2)
                        if os.path.isfile(src_file1) and os.path.isfile(src_file2):
                            feature = similarity(src_file1, src_file2)
                        else:
                            feature = -1
                            lost = lost + 1
                else:
                    # count number of lost links
                    lost = lost + 1
                # write feature in output file
                link_title = title1 + "@" + title2
                out.write(link_title + "\t%f\n" % (feature))
                # data.append(feature)
    print("Lost links :", lost)
def __init__(self, win, startTime):
    import HTMLParser
    self.win = win
    self.StartTime = startTime
    self.DocID = 0
    self.WordID = 0
    self.StemmedWordID = 0
    self.DirCount = 0
    self.FilesCount = 0
    self.WordCount = 0
    self.StemmedWordCount = 0
    self.ElapsedTime = ""
    self.ParseStatus = "Indexing in Progress..."
    self.KeyColumnNames = ""
    self.UseStemmer = False
    self.Stemmer = None
    # self.SetupTextCatDB()
    DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
    """
    self.timerStatus = wx.Timer(id=wx.NewId(), owner=self)
    self.Bind(wx.EVT_TIMER, self.OnTimerStatusTimer, id=self.timerStatus.GetId())
    """
    self.EventStart = time.time()
    self.splitter = re.compile(r'\W*')
    # self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
    if Globals.Stemmer == "Porter Stemmer":
        self.Stemmer = PorterStemmer()
        # self.UseStemmer = True
    self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)
    self.textParser = TextParser.TextParser(self.Stemmer)
def drug_peek():
    data = request.data
    if not data:
        data = request.form.keys()[0]
    p = TextParser(data)
    output = p.get_drug_data()
    print output
    # d = DrugAnalyzer(p.get_drug_data())
    # d.add_summary("Summary item 1")
    # d.check_data()
    # output = "Drug: Unknown \nSummary: To be analyzed \nCaution: No Applicable\n"
    return output
def aggregator(path_to_dir):
    files = glob.glob(path_to_dir + '/*.html', recursive=True)
    bigList = []
    for file in files:
        s = TextParser.text_to_string(file)
        f = TextParser.text_filter(s)
        c = TextParser.cleanse(f)
        for item in c:
            bigList.append(item)
    listDF = pd.DataFrame(bigList, columns=['Name', 'Message', 'Date Time'])
    listDF['Date Time'] = pd.to_datetime(listDF['Date Time'])
    listDF = listDF.set_index('Date Time')
    listDF['Year'] = listDF.index.year
    listDF['Month'] = listDF.index.month
    listDF['Day'] = listDF.index.day
    listDF['Time'] = listDF.index.time
    listDF['Weekday'] = listDF.index.day_name()
    return listDF
def run_search(self):
    name = self.name_entry.get()
    arn = self.arn_entry.get()
    waf.make_doc(name, arn, 1)
    prof = es.fiche(name)
    for filename in enumerate(os.listdir(name + '/')):
        nom_fichier = name + '/' + filename[1]
        text = tp.open_fichier(nom_fichier)
        sen = tp.get_text_for_name(nom_fichier, name)
        evs = pf.find_evenement(sen, text, name)
        for e in range(len(evs)):
            ev = es.evenement(evs[e][0], evs[e][1], evs[e][2], evs[e][3])
            prof.add_evenement(ev)
            # prof.get_evenement(e)
    prof.sort_evenements_by_score()
    prof.remove_score_0()
    prof.remove_dup()
    t2p = prof.str_fiche()
    self.make_new_text_window(t2p)
def _list_to_dict(self, list_list):
    # TODO: this function should be elsewhere
    all_options = {}
    for section in list_list:
        # multiple keys will point to a single Counter if that choice has multiple variations
        counter = tp.VoteCounter(section[0])
        for item in section:
            all_options[item] = counter
        # and then append counter somewhere accessible
    return all_options
def getPage(n):
    a = Request()
    a.setUrl("http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg=" + str(n) + "&po=0&pro=9&paso=1")
    a.perform()
    HTML = a.response.getContent()
    tp = TextParser()
    tp.setSource("string", HTML)
    lista = []
    while tp.readUntil("<tr class='contenido_tab' onClick='window.location=\"([a-zA-Z0-9\.-]+)\""):
        # Improved regular expression: "<tr class='contenido_tab' onClick='window.location=\"([^\"]+)\""
        link = tp[0][0]
        tp.readUntil("<td class='contenido_tab' align='right'>([0-9]+)</td>")
        precio = int(tp[0][0])
        lista.append([link, precio])
    print "Downloaded page", n
    return lista
def main():
    try:
        fileText = FileManager.getText(sys.argv[1])
        information = TextParser.parse(fileText)
        OutputHelper.format(information)
    except IndexError:
        ErrorHandler.handleError("IndexError")
    except FileNotFoundError:
        ErrorHandler.handleError("FileNotFoundError")
    except:
        ErrorHandler.handleError("GenericError")
def Main(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS):
    #
    # These are all the global dictionaries/objects in the game. Anywhere a loadgame happens you need all the global variables.
    # global PLAYER       # The main character. player is an object instance of class character.
    # global ITEMS        # All the items. This is a dictionary of objects of class equipment keyed by their lowercase equipment name (item.name). Remember the lowercase; it may trip you up if you reference the uppercase version in the file.
    # global MAPS         # All the locations. A tuple of objects of class Map indexed by their x,y,z coordinate (MAPS[x][y][z]).
    # global INTERACT     # All the interactables (stationary things that need something). This is a dictionary of objects of class Interact keyed by their lowercase name (interact.name).
    # global QUESTS       # Quest statuses. This is a dictionary of flags (1 or 0) for the status of each quest, keyed by quest name.
    # global ENEMIES      # All the npcs. This is a dictionary of objects of class Enemy keyed by their lowercase name (item.name.lower()).
    # global GAMEINFO     # Miscellaneous game info. Dictionary of all sorts of variables.
    # global GAMESETTINGS # The game settings that are saved in the game.
    #
    # The global keyword makes the variables inside the function reference the correct global-scope variable when assigned in the function.
    # If not, assignment within the function may lead to changes only in the local scope.

    # Main game loop section that runs while the player is alive (player is killed in story once done)
    # TODO don't base the main loop on the player being alive but on the game being played, e.g. a gameExit boolean variable instead
    while PLAYER.alive:
        # if not(GAMESETTINGS['HardcoreMode']):
        MapDisplay.mini()  # Minimap display area in game
        if GAMEINFO['scriptdata']:  # if there's a script loaded, carry out those commands instead of the normal input
            command = GAMEINFO['scriptdata'].pop(0)  # pops the first element to go through the script until finished
            printT(command)
        else:
            command = input('\nWhat do you want to do?\n')
        print(LINEBREAK)  # This linebreak helps split up each turn
        if GAMESETTINGS['HardcoreMode']:
            print(CLEARSCREEN)

        # Sends the command text to the text parser to be interpreted and the action to be done
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = TextParser.Parser(command, MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)
        GAMEINFO['commandcount'] += 1  # increments the command count after every command but doesn't print
        # print LINEBREAK  # Got rid of this bottom linebreak to hopefully make the current situation clearer

        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.story(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through the story quest checks and actions
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.sidequests(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through all the sidequest checks and actions
        MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = game_scripts.events(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # runs through all the event checks and actions
        # TODO integrate this into game functions with a function; possibly separate quests from game functions and import all from there to keep things global

        if PLAYER.alive == False and GAMEINFO['layersdeep'] > 0:
            # gets you out of the EPTA all-the-way-down quest and back into the sublayer
            # End(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)
            print(LINEBREAK)
            printT(" (\S)You finish the game and put back the laptop ready to get back to reality.\nHow long did you spend on this game?")
            log = GAMEINFO['log']  # sets up a temporary variable to pass the log back up a layer
            MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = load_game(str(GAMEINFO['layersdeep'] - 1))
            GAMEINFO['log'] = log + ["--Back in layer: " + str(GAMEINFO['layersdeep']) + "---"]  # overwrites it to keep a running tab and says what layer we're in
            # Doesn't reset GAMEINFO['timestart'] as the runtime will include the time in the nested function
            # TODO delete the save file you're coming out of

    MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS = End(MAPS, PLAYER, ITEMS, INTERACT, QUESTS, ENEMIES, GAMEINFO, GAMESETTINGS)  # calls the End function in Main so that the game can continue its loop structure
def plot_fit_gui(self):
    name = self.name_entry.get()
    fichier = self.file_entry.get()
    text = tp.open_fichier(fichier)
    [firstname, lastname] = tp.format_name(name)
    [wo, fo, lo] = tp.find_name(name, firstname, lastname, text)
    [x, y, p, fit] = tp.name_occ_fit_gauss(wo, fo, lo, text)
    tp.plot_fit(x, y, fit)
def parseRequest(self, rawRequest, prot="http"):
    '''Still in BETA phase and to be tested'''
    tp = TextParser()
    tp.setSource("string", rawRequest)
    self.__variablesPOST = VariablesSet()
    self._headers = {}  # dictionary, e.g. headers["Cookie"]
    tp.readLine()
    try:
        tp.search("^(\w+) (.*) (HTTP\S*)$")
        self.method = tp[0][0]
        self.protocol = tp[0][2]
    except Exception, a:
        print rawRequest
        raise a
def parseRequest(self, rawRequest, prot="http"):
    '''Still in BETA phase and to be tested'''
    tp = TextParser()
    tp.setSource("string", rawRequest)
    self.__variablesPOST = VariablesSet()
    self.__headers = {}  # dictionary, e.g. headers["Cookie"]
    tp.readLine()
    try:
        tp.search("^(\w+) (.*) (HTTP\S*)$")
        self.method = tp[0][0]
        self.protocol = tp[0][2]
    except Exception, a:
        print rawRequest
        raise a
def __init__(self, db, AttachmentsDict, Stopwords=[], Stemmer=None, bloomFilter=None, logFile=None):
    # self.filePath = filePath
    self.db = db
    self.Stemmer = Stemmer
    self.bloomFilter = bloomFilter
    self.AttachmentsDict = AttachmentsDict
    self.Stopwords = Stopwords
    self.logFile = logFile
    self.FromRE = re.compile(r"(From:\W*)(.*)", re.I)
    self.ToRE = re.compile(r'(To:\W*)(.*)', re.I)
    self.CcRE = re.compile(r"(Cc:\W*)(.*)", re.I)
    self.BccRE = re.compile(r"(Bcc:\W*)(.*)", re.I)
    self.DateRE = re.compile(r'(Date:\W*)(.*)')
    self.SubjectRE = re.compile(r'(Subject:\W*)(.*)')
    self.AttachmentsRE = re.compile(r'(Attachments:\W*)(.*)')
    self.EmailRE = re.compile(r"[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]+", re.I)
    # self.EmailsDict = EmailsDict
    self.query = "insert into " + Constants.EmailsTable + "(DocID, FromID, ToID,EmailDate,Subject,Attachments,FilePath,AttachmentsPath,TotalRecipients,Size,`Group`,Label) values (?,?,?,?,?,?,?,?,?,?,?,?)"
    # self.query1 = "insert into " + Constants.AddressBookTable + "(EmailID, FirstName, MiddleName, LastName, InBook) values (?,?,?,?,?)"
    # added for TC on Emails
    # self.WordCount = 0
    # self.StemmedWordCount = 0
    self.FooterLineRE = re.compile(r'[_-]{2,}')
    self.ListSepRE = re.compile(r'[~`!#$^&*()+=|\\{}\'"?><\[\],;]')
    self.Splitter = re.compile(r'\W+', re.I)
    self.PhoneRE = re.compile(r'([\d{3}]*)[\(\)-/\. ]*(\d{3})[\(\)-/\. ]*(\d{4})\D*')
    self.EmailRE = re.compile(r"\A[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]+", re.I)
    self.HTTPRE = re.compile(r"\A(http://)[a-z0-9_-]+\.[a-z]{2,4}\b", re.I)
    self.textParser = TextParser.TextParser(db, Stopwords, Stemmer=self.Stemmer, bloomFilter=self.bloomFilter)
def normalize(s):
    # place stress marks and normalize lines separated by newlines
    l = list(s.replace("△", "").split("\n"))
    ans = []
    for z in l:
        text = ""
        prev = ""
        for el in z:
            if el not in alpha:
                text = text + TP.make_bigger(prev)
                prev = ""
            elif el == "ё":
                text = text + prev + "Ё"
                prev = ""
            else:
                text = text + prev
                prev = el
        ans.append(text + el)
    return ans
def run_game():
    """This is the general controller function to run the game.

    First, it generates the TextParser and Player character.
    Second, it gives a choice to either run with default options or run a DungeonCreator.
    """
    parser = TextParser()
    player = Player()
    choice = ""
    while not choice.isdigit() or int(choice) < 1 or int(choice) > 2:
        choice = input("(1) Run with default Dungeon settings or (2) Generate Dungeon? (1/2): ")
    if int(choice) == 1:
        dungeon = Dungeon("")
        if dungeon.json_dungeon is not None:
            dungeon.dungeon_control(parser, player)
    else:
        dungeon_creator = DungeonCreator()
        file_name = dungeon_creator.generate_dungeon(parser, player)
        dungeon = Dungeon(os.path.join(os.path.dirname(__file__), "UserDefinedFiles", file_name))
        if dungeon.json_dungeon is not None:
            dungeon.dungeon_control(parser, player)
def define_type(soup):  # noun, verb, etc.
    # Adjectives
    type1 = soup.find_all("p")
    for el in type1:
        # print(el)
        try:
            text = ""
            for ch in el.text:
                text = text + TP.make_lower(ch)
            for z in types:
                if text.find(z) != -1:
                    return z
            # print(text)
        except:
            pass
    # print()
    # Nouns, verbs
    type1 = soup.find_all("p")
    for el in type1:
        q = el.find("a")
        if q != None:
            # print(1, el)
            try:
                b = True
                name1 = q.get("title")
                # print(q)
                for ch in name1:
                    if ch not in alpha:
                        b = False
                if b and name1 in types:
                    return name1
            except:
                pass
    return "другое"
def get_info(word):  # get info about a word
    w = word
    word = ""
    for el in w:
        word = word + TP.make_lower(el)
    text = get_article(word)
    if not text:
        # print("Word not found")
        return False
    soup = bs4.BeautifulSoup(text, "lxml")
    q = XMLparser.define_type(soup)
    # fix differing gender names for nouns and adjectives
    # two files: fully known words, and the rest
    if q == "существительное":
        z = [word, q, XMLparser.noun_prop(soup), XMLparser.noun_changing(soup)]
        # add cancellation for proper nouns
    elif q == "прилагательное":
        z = [word, q, XMLparser.adjective_prop(soup), XMLparser.adjective_changing(soup)]  # , XMLparser.noun_changing(soup)
    elif q == "глагол":
        z = [word, q, XMLparser.verb_prop(soup), XMLparser.verb_changing(soup)]
    elif q == "наречие":
        z = [word, q, XMLparser.adverb_prop(soup)]
    else:
        return False
    if "собств." in z[2]:
        return False
    return z
import couchdb
from textblob import TextBlob

from TextParser import *


def analyze(string):
    sentiment = ''
    blob = TextBlob(string)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        sentiment = 'pos'
    elif polarity == 0:
        sentiment = 'neu'
    elif polarity < 0:
        sentiment = 'neg'
    return sentiment, polarity


if __name__ == '__main__':
    server = couchdb.Server('http://115.146.95.53:5984')
    db = server['twitter_rest']
    TextParser.getStopWords()
    textParser = TextParser()
    for row in db.view('C2E2View/C2E2'):
        doc = db.get(row.id)
        (tag, score) = analyze(textParser.parsing(row.value['what']['text']))
        doc['sentiment'] = {'sentiment': tag, 'sentiScore': score}
        db.save(doc)
#!/usr/bin/python3
import sys
import os.path
import sqlite3
import mimetypes

import TextParser

# Try and catch any wrong input in arguments
for argument in sys.argv[1:]:
    if not os.path.isfile(argument):
        raise Exception("File " + argument + " does not exist")
    if mimetypes.guess_type(argument)[0] != 'text/plain':
        raise Exception("File " + argument + " is not a plain text file")

for argument in sys.argv[1:]:
    TextParser.parse(argument)
def process_text_feature(db, issue_id, query_dict, non_repeat, issue_collection, comment_collection):
    # check to see if the issue is already processed
    if non_repeat:
        id = "%s/%s/%d" % (query_dict["repo"], query_dict["owner"], issue_id)
        if db[issue_collection].find_one({"_id": id}):
            logging.info("%s already processed" % id)
            return

    issue_start = time.time()
    query_dict["issue_id"] = issue_id  # set issue_id for querying
    comments = db.issue_comments.find(query_dict)  # get all comments

    # take the top 2 for perspective score
    perspective_scores = []
    # set a data structure to store everything
    total_comment_info = {
        "total_reference": 0,
        "total_url": 0,
        "total_emoji": 0,
        "total_mention": 0,
        "total_plus_one": 0,
        "total_text": "",
    }

    # Senti4SD document preparation
    input_senti4sd_filename = "input_%s_%s_%d.csv" % (query_dict["owner"], query_dict["repo"], issue_id)
    output_senti4sd_filename = "output_%s_%s_%d.csv" % (query_dict["owner"], query_dict["repo"], issue_id)
    feature_senti4sd_filename = "extractedFeatures_%s_%s_%d.csv" % (query_dict["owner"], query_dict["repo"], issue_id)
    f = open(senti4SD_address + input_senti4sd_filename, 'w')

    comment_info_l = []
    comment_sentence_l = []

    # process comments
    for comment in comments:
        logging.debug("Issue id: %d Comment: %s " % (issue_id, comment))
        comment_info = process_comment(comment)
        if not comment_info["valid"]:  # if the comment is not valid, go to the next loop
            continue
        comment_info_l.append(comment_info)  # add valid comment_info into a list
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sent_detector.tokenize(comment_info["text"].strip())
        comment_sentence_l.append(len(sentences))
        total_comment_info["total_reference"] += comment_info["num_reference"]
        total_comment_info["total_url"] += comment_info["num_url"]
        total_comment_info["total_emoji"] += comment_info["num_emoji"]
        total_comment_info["total_mention"] += comment_info["num_mention"]
        total_comment_info["total_plus_one"] += comment_info["num_plus_one"]
        if len(perspective_scores) > 2:
            if min(perspective_scores) < comment_info["perspective_score"]:
                _ = heapq.heappushpop(perspective_scores, comment_info["perspective_score"])
        else:
            perspective_scores.append(comment_info["perspective_score"])
        # write all comments into one part
        if total_comment_info["total_text"] == "":
            total_comment_info["total_text"] += comment_info["text"]
        else:
            total_comment_info["total_text"] += " " + comment_info["text"]
        f.write(comment_info["text"] + "\n")  # write to csv

    # close input senti4sd file
    f.close()

    # check total_text here; if it is empty, skip the rest
    if total_comment_info["total_text"] == "":  # too little text, no need to write into the database
        return

    if len(perspective_scores):
        total_comment_info["perspective_score"] = sum(perspective_scores) / len(perspective_scores)
    else:
        total_comment_info["perspective_score"] = 0

    # aggregate some features on the issue level  ###!!! may need to change this part !!!###
    total_comment_info["length"] = TextParser.get_length(total_comment_info["total_text"])
    total_comment_info["avg_word_length"] = TextParser.get_avg_length(total_comment_info["total_text"])
    total_comment_info["num_punct"] = TextParser.count_punct(total_comment_info["total_text"])
    total_comment_info["num_QEMark"] = TextParser.count_QEMark(total_comment_info["total_text"])
    total_comment_info["num_one_letter_word"] = TextParser.count_one_letter(total_comment_info["total_text"])
    total_comment_info["num_capital"] = TextParser.count_captial(total_comment_info["total_text"])
    total_comment_info["num_non_alpha_in_middle"] = TextParser.count_non_alpha_in_middle(total_comment_info["total_text"])
    total_comment_info["num_modal_word"] = TextParser.count_modal_word(total_comment_info["total_text"])
    total_comment_info["num_unknown_word"] = TextParser.count_unknown_word(total_comment_info["total_text"])
    total_comment_info["num_insult_word"] = TextParser.count_insult_word(total_comment_info["total_text"])

    # send to Senti4SD for scores; we want a result for each comment
    senti_start = time.time()
    senti_l = get_senti4SD(input_senti4sd_filename, output_senti4sd_filename, feature_senti4sd_filename)  # change this to returning a list
    # for each comment_info add a senti4sd classification
    for i in range(len(senti_l)):
        comment_info_l[i]["senti_4sd"] = senti_l[i]
    # calculate an aggregation
    total_comment_info["senti4sd_positive_percentage"] = senti_l.count("positive") / len(senti_l)
    total_comment_info["senti4sd_neutral_percentage"] = senti_l.count("neutral") / len(senti_l)
    total_comment_info["senti4sd_negative_percentage"] = senti_l.count("negative") / len(senti_l)
    # logging.info("senti4sd took %d" % (time.time() - senti_start))

    # stanford politeness API
    politeness_start = time.time()
    # make a pickle file using coreNLP
    coreNLP_parse(input_senti4sd_filename, comment_sentence_l)  # comment_sentence_l stores the number of sentences in each comment
    # pass this pickle to the stanford politeness api
    calculate_stanford_politeness_score(input_senti4sd_filename, output_senti4sd_filename)
    # read the scores from csv; the first row is for the total, then one row per comment
    score_df = pd.read_csv(stanford_politeness_score_address + output_senti4sd_filename, header=None)
    for row in score_df.itertuples():
        if row.Index == 0:
            total_comment_info["stanford_positive"] = row._1
            total_comment_info["stanford_negative"] = row._2
        else:
            comment_info_l[row.Index - 1]["stanford_positive"] = row._1
            comment_info_l[row.Index - 1]["stanford_negative"] = row._2
    # logging.info("stanford took %d" % (time.time() - politeness_start))

    # set _id
    total_comment_info["repo"] = query_dict["repo"]
    total_comment_info["owner"] = query_dict["owner"]
    total_comment_info["issue_id"] = query_dict["issue_id"]
    total_comment_info["_id"] = "%s/%s/%d" % (query_dict["repo"], query_dict["owner"], query_dict["issue_id"])
    # log for checking somewhere; not multi-process safe
    # logging.debug(total_comment_info)

    # insert total_comment_info into the database
    db[issue_collection].update_one({"_id": total_comment_info["_id"]}, total_comment_info, upsert=True)  # update document with newly created features
    # insert each comment_info into the database
    for comment_info in comment_info_l:
        db[comment_collection].update_one({"_id": comment_info["_id"]}, comment_info, upsert=True)
def process_comment(doc):
    text = doc["body"]
    if TextParser.contain_non_english(text):  # if the text contains non-english, we terminate early
        return {
            "valid": False,
            "num_reference": 0,
            "num_url": 0,
            "num_emoji": 0,
            "num_mention": 0,
            "num_plus_one": 0,
            "perspective_score": 0,
            "text": ""
        }

    num_reference = TextParser.count_reference_line(text)
    text = TextParser.remove_reference(text)
    text = TextParser.transform_markdown(text)  # use mistune to transform markdown into html for removal later
    text = TextParser.remove_inline_code(text)  # used place-holder: InlineCode
    text = TextParser.remove_html(text)
    num_url = TextParser.count_url(text)
    text = TextParser.remove_url(text)
    num_emoji = TextParser.count_emoji(text)
    text = TextParser.remove_emoji_marker(text)  # remove the two semi-colons on the two sides of an emoji
    text = TextParser.remove_newline(text)
    num_mention = TextParser.count_mention(text)
    text = TextParser.replace_mention(text)
    # sub all "+1" to "plus one"
    num_plus_one = TextParser.count_plus_one(text)
    text = TextParser.sub_PlusOne(text)
    perspective_score = get_perspective_score(text)

    return {
        "_id": "%s/%s/%d/%d" % (doc["repo"], doc["owner"], doc["issue_id"], doc["id"]),
        "repo": doc["repo"],
        "owner": doc["owner"],
        "issue_id": doc["issue_id"],
        "comment_id": doc["id"],
        "valid": True,
        "num_reference": num_reference,
        "num_url": num_url,
        "num_emoji": num_emoji,
        "num_mention": num_mention,
        "num_plus_one": num_plus_one,
        "perspective_score": perspective_score,
        "text": text
    }
def parseResponse(self, rawResponse):
    self.__content = ""
    self.__headers = []
    tp = TextParser()
    tp.setSource("string", rawResponse)
    tp.readLine()
    tp.search("(HTTP\S*) ([0-9]+)")
    try:
        self.protocol = tp[0][0]
    except:
        self.protocol = "unknown"
    try:
        self.code = tp[0][1]
    except:
        self.code = "0"
    # try:
    #     self.message = tp[2]
    # except:
    #     self.message = "unknown"
    self.code = int(self.code)
    while True:
        tp.readLine()
        if tp.search("^([^:]+): ?(.*)$"):
            self.addHeader(tp[0][0], tp[0][1])
        else:
            break
    while tp.skip(1):
        self.addContent(tp.lastFull_line)
def parsePOSTDATA(self, pd):
    if self.ContentType == "application/x-www-form-urlencoded":
        dicc = self.readUrlEncodedVariables(pd)
        self.__variablesPOST.update(dicc)
    elif self.ContentType == "multipart/form-data":
        self.multiPOSThead = {}
        dicc = {}
        tp = TextParser()
        tp.setSource("string", pd)
        # print self.boundary
        # print tp.readUntil("%s$" % (self.boundary))
        while True:
            headers = []
            if not tp.readUntil("name=\"([^\"]+)\""):
                break
            var = tp[0][0]
            headers.append(tp.lastFull_line.strip())
            while True:
                tp.readLine()
                if tp.search("^([^:]+): (.*)$"):
                    headers.append(tp.lastFull_line.strip())
                else:
                    break
            value = ""
            while True:
                tp.readLine()
                if not tp.search(self.boundary):
                    value += tp.lastFull_line
                else:
                    break
            if value[-2:] == "\r\n":
                value = value[:-2]
            dicc[var] = value
            self.multiPOSThead[var] = headers
            if tp.search(self.boundary + "--"):
                break
        self.__variablesPOST.update(dicc)
        # print pd
        # print dicc
        # print self.__variablesPOST
    else:
        self.__uknPostData = pd
def getNewPage(self):
    if self.MoreResults == False:
        raise StopIteration
    if self.start == None:
        self.start = self.startIndex
    else:
        self.start += self.increment
    req = Request()
    req.addHeader("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14")
    url = self.url.replace("{query}", str(self.query))
    url = url.replace("{startvar}", str(self.start))
    req.setUrl(url)
    req.setTotalTimeout(10)
    req.setConnTimeout(10)
    if self.cookie:
        req.addHeader("Cookie", self.cookie)
    req.setProxy(self.proxy)
    req.setFollowLocation(True)
    trys = 5
    while trys:
        try:
            req.perform()
            break
        except:
            trys -= 1
            if not trys:
                self.status = "Failed"
                raise StopIteration
            pass
    if not req.response.has_header('Content-Type') or (not 'text' in req.response['Content-Type'] and not 'script' in req.response['Content-Type']):
        self.MoreResults = False
        return
    rawResponse = self.preProcess(req.response.getContent())
    self.cookie = req.response.getCookie()
    tp = TextParser()
    tp.setSource("string", rawResponse)
    if req.response.code == 200:
        self.responseContent = req.response.getContent()
        while tp.readUntil(self.urlRegexp):
            for i in tp:
                self.addResult(i)
        tp.seekinit()
        if tp.readUntil(self.nextRegexp):
            self.MoreResults = True
        else:
            self.MoreResults = False
    self.REQ = req
def learnLemmasByOrderOfScore(getSentenceScore):
    # Scheme: learn words as they become possible to learn, in terms of sentences, in order of score.
    # Initialize: load all texts in the Texts folder:
    TextParser.addAllTextsFromDirectoryToDatabase("Texts")
    # Will only contain sentences with at most one missing word, marked in order of the missing word's frequency
    directlyUnlockableLemmasScore, sentencePairsBySentenceScore, directlyUnlockableLemmas = getPriorityQueueOfDirectlyLearnableSentencesByLemmaFrequency(getSentenceScore)
    lemmasByFrequency = getPriorityQueueOfLemmasByFrequency()
    # Find which words one is forced to learn without being able to isolate them to one sentence:
    forcedToLearn = []
    notForcedToLearn = []
    orderedLearningList = []
    # First we remove all words that are not true "words", for example names, by learning the NotAWordLemma lemma:
    learnLemmaAndHandleSentencesWithLemmaFrequency(TextParser.NotAWordLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
    i = 0
    numberOfLemmas = len(lemmasByFrequency)
    print("Start learning lemmas: " + str(len(lemmasByFrequency)))
    highestScoringDirectlyLearnableSentencePair = None
    highestScoringDirectlyLearnableSentencePairScore = None
    while not hasLearnedAllLemmas(lemmasByFrequency):
        (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)
        while hasDirectlyLearnableSentence(directlyUnlockableLemmas):
            currentSentencePair = getHighestScoringUnforcedSentencePair(sentencePairsBySentenceScore, highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore)
            assert i + len(lemmasByFrequency) == numberOfLemmas
            # No new word in the sentence:
            if hasNoNewLemmas(currentSentencePair):
                continue
            assert i + len(lemmasByFrequency) == numberOfLemmas
            # A new pair of words to learn: let's do it!
            kage = 1
            # TODO (*) There is still a bug to remove in connection with updating sentences whose sentence score depends on other sentences.
            for sentence in currentSentencePair:
                if sentence == None:
                    continue
                if sentence.associatedLearnableSentence != None:
                    sentence.associatedLearnableSentence.scoreDependentSentences.remove(sentence)
                newLemma = sentence.getOnlyUncoveredLemma()
                orderedLearningList.append((newLemma, sentence))
                learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
                if i % 1 == 0 or i < 4000:
                    print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + sentence.rawSentence)
                i += 1
                assert i + len(lemmasByFrequency) == numberOfLemmas
            (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)
        if hasLearnedAllLemmas(lemmasByFrequency):
            # When all words have been learned in the loop above
            break
        # There are no more free words: time to learn a frequent word:
        newLemma = getHighestScoringLemma(lemmasByFrequency)
        orderedLearningList.append((newLemma, "NONE"))
        learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, forcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
        if i < 6000:
            print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + "NONE")
        i += 1
        assert i + len(lemmasByFrequency) == numberOfLemmas
    print("Learned directly " + str(len(orderedLearningList)) + " of " + str(numberOfLemmas) + " lemmas.")
    return orderedLearningList
def parseMultipart(self, cad, boundary):
    self.boundary = boundary
    dicc = []
    tp = TextParser()
    tp.setSource("string", cad)
    while True:
        headers = []
        if not tp.readUntil("name=\"([^\"]+)\""):
            break
        var = tp[0][0]
        headers.append(tp.lastFull_line.strip())
        while True:
            tp.readLine()
            if tp.search("^([^:]+): (.*)$"):
                headers.append(tp.lastFull_line.strip())
            else:
                break
        value = ""
        while True:
            tp.readLine()
            if not tp.search(boundary):
                value += tp.lastFull_line
            else:
                break
        if value[-2:] == "\r\n":
            value = value[:-2]
        dicc.append(Variable(var, value, headers))
    self.variables = dicc
from reqresp import *
from TextParser import *

#
# http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg=0&po=0&pro=9&paso=1
#
a = Request()
a.setUrl("http://www.pisosalaventa.com/listar_contactos_ofertas_compartir.php?pg=0&po=0&pro=9&paso=1")
a.perform()
HTML = a.response.getContent()

tp = TextParser()
tp.setSource("string", HTML)

lista = []
while tp.readUntil("<tr class='contenido_tab' onClick='window.location=\"([a-zA-Z0-9\.-]+)\""):
    # Improved regular expression: "<tr class='contenido_tab' onClick='window.location=\"([^\"]+)\""
    link = tp[0][0]
    tp.readUntil("<td class='contenido_tab' align='right'>([0-9]+)</td>")
    precio = int(tp[0][0])
    lista.append([link, precio])

for ln, euros in lista:
#
# Copyright (C) 2012-2015 Institute of Industrial Science, The University of Tokyo.
# All rights reserved.
#
# Copyright (c) 2014-2016 Advanced Institute for Computational Science, RIKEN.
# All rights reserved.
#
# Copyright (c) 2016-2018 Research Institute for Information Technology, Kyushu University.
# All rights reserved.
#
###################################################################################
#

import TextParser as tp

instance = tp.createInstance()

print "test for read()"
print tp.read(instance, "../Examples/test.tpp")

print "test for getVersionInfo()"
print tp.getVersionInfo()

print "test for getAllLabels()"
print tp.getAllLabels(instance)

print "test for getType()"
print tp.getType(instance, '/config/test_double_max')

print "test for getValue()"
print tp.getValue(instance, '/config/cell_start')
from enum import Enum

import TextParser


class GameState(Enum):
    start = 0
    playing = 1
    gameOver = 2


gameState = GameState.start

while gameState == GameState.start:
    print("Welcome to the Python Text adventure!")
    input("Press any key to continue...\n")
    gameState = GameState.playing
    print("You are now playing the python text-based adventure game!")

while gameState == GameState.playing:
    userInput = input(">")
    TextParser.parseInput(userInput)
def parseRequest(self, rawRequest, prot="http"):
    '''Still in BETA phase and to be tested'''
    tp = TextParser()
    tp.setSource("string", rawRequest)
    self.__variablesPOST = {}
    self.__headers = {}  # dictionary, e.g. headers["Cookie"]
    tp.readLine()
    try:
        tp.search("(\w+) (.*) (HTTP\S*)")
        self.method = tp[0][0]
        self.protocol = tp[0][2]
    except:
        print "error in"
        print rawRequest
        return
    pathTMP = tp[0][1]
    pathTMP = ('', '') + urlparse(pathTMP)[2:]
    pathTMP = urlunparse(pathTMP)
    pathTMP = pathTMP.replace("//", "/")
    self.time = strftime("%H:%M:%S", gmtime())
    while True:
        tp.readLine()
        if tp.search("^([^:]+): (.*)$"):
            self.addHeader(tp[0][0], tp[0][1])
        else:
            break
    self.setUrl(prot + "://" + self.__headers["Host"] + pathTMP)
    if self.method.upper() == "POST":
        pd = ""
        while tp.readLine():
            pd += tp.lastFull_line
        if "Content-Type" in self.__headers:
            values = self.__headers["Content-Type"].split(";")
            if values[0].strip().lower() == "application/x-www-form-urlencoded":
                self.ContentType = values[0]
            elif values[0].strip().lower() == "multipart/form-data":
                self.ContentType = values[0]
                self.boundary = values[1].split("=")[1].strip()
        self.parsePOSTDATA(pd)
# TextParser - Text Parsing Library
#
# Copyright (C) 2012-2015 Institute of Industrial Science, The University of Tokyo.
# All rights reserved.
#
# Copyright (c) 2014-2016 Advanced Institute for Computational Science, RIKEN.
# All rights reserved.
#
# Copyright (c) 2016-2018 Research Institute for Information Technology, Kyushu University.
# All rights reserved.
#
###################################################################################
#

import TextParser as tp

instance = tp.createInstance()

print "test for read()"
print tp.read(instance, "../Examples/test.tpp")

print "test for getVersionInfo()"
print tp.getVersionInfo()

print "test for getAllLabels()"
print tp.getAllLabels(instance)

print "test for getType()"
print tp.getType(instance, '/config/test_double_max')

print "test for getValue()"
print tp.getValue(instance, '/config/cell_start')
elif argv[1] == "train":
    # Instantiate tfidf object to run tf-idf on documents
    topic_tfidf = tfidf()
    PreProcessor.preprocessTopicEngine(topic_tfidf, "NYT_articles.txt")
    topic_engine = TopicEngine(topic_tfidf.doc_word_list)
    topic_engine.GibbsSampling()
else:
    url = argv[1]
    text = argv[2]
    # Get processed document as a list of words
    doc = TextParser.return_word_list(text)
    # Run tf-idf on inputted document
    article_tfidf = new_tfidf()
    doc_dict = article_tfidf.return_tfidf_dict(doc)
    # get words associated with topic
    doc_word_list = eval(open("doc_word_list2.txt", 'r').read())
    topics = eval(open("topics.txt", 'r').read())
    topics_dict = {}
    doc_count = 0
    vocab = list(set(eval(open('word_list_total.txt', 'r').read())))
    # Putting words in topic buckets
def parsePOSTDATA(self, pd):
    if self.ContentType == "application/x-www-form-urlencoded":
        dicc = self.readUrlEncodedVariables(pd)
        [self.addVariablePOST(i, j) for i, j in dicc]
    elif self.ContentType == "multipart/form-data":
        self.multiPOSThead = {}
        dicc = {}
        tp = TextParser()
        tp.setSource("string", pd)
        # print self.boundary
        # print tp.readUntil("%s$" % (self.boundary))
        while True:
            headers = []
            if not tp.readUntil("name=\"([^\"]+)\""):
                break
            var = tp[0][0]
            headers.append(tp.lastFull_line.strip())
            while True:
                tp.readLine()
                if tp.search("^([^:]+): (.*)$"):
                    headers.append(tp.lastFull_line.strip())
                else:
                    break
            value = ""
            while True:
                tp.readLine()
                if not tp.search(self.boundary):
                    value += tp.lastFull_line
                else:
                    break
            if value[-2:] == "\r\n":
                value = value[:-2]
            dicc[var] = value
            self.multiPOSThead[var] = headers
            if tp.search(self.boundary + "--"):
                break
        self.__variablesPOST.update(dicc)
        # print pd
        # print dicc
        # print self.__variablesPOST
    else:
        self.__uknPostData = pd
def Run(self):
    db = SqliteDatabase(Globals.EmailsFileName)
    if not db.OpenConnection():
        return
    self.bloomFilter = self.CreateBloomFilter()
    # self.bloomFilter = None
    logFileName = PlatformMethods.Decode(
        os.path.join(Globals.CasePath,
                     (Globals.EmailsFileName[Globals.EmailsFileName.rfind(os.sep) + 1:] + '.log')))
    self.fout = open(logFileName, 'ab')
    # print self.CheckedMimeTypes
    self.fout.write('Parsing/Indexing Emails Attachments Started at: %s\n' % (time.ctime()))

    if self.AddressBookPath:
        self.ParseStatus = "Parsing Address book..."
        self.SendEvent()
        AddressBookParser = OutlookAddressBook.AddressBookParser(Globals.AddressBookDict)
        for root, dirs, files in os.walk(self.AddressBookPath):
            for eachfile in files:
                filePath = os.path.join(root, eachfile)
                self.FilesCount += 1
                if (filePath.rfind('.') == -1):
                    continue
                # print filePath
                extension = filePath[filePath.rfind('.'):]
                # print 'extension ', extension
                if extension.lower() == ".csv":
                    AddressBookParser.Parse(filePath)
                    # print 'add book parsed'
    else:
        self.fout.write('No Addressbook path found!\n')

    # Update Addressbook
    query1 = "insert into " + Constants.AddressBookTable + "(EmailID, FirstName, MiddleName, LastName, InBook) values (?,?,?,?,?)"
    ManyValues = []
    for key in Globals.AddressBookDict:
        # {'EmailID': email, 'FirstName': firstName, 'MiddleName': middleName, 'LastName': lastName, 'InBook': 1}
        ManyValues.append((Globals.AddressBookDict[key]['EmailID'],
                           Globals.AddressBookDict[key]['FirstName'],
                           Globals.AddressBookDict[key]['MiddleName'],
                           Globals.AddressBookDict[key]['LastName'],
                           Globals.AddressBookDict[key]['InBook']))
    # query = "delete from %s" % Constants.AddressBookTable
    # db.ExecuteNonQuery(query)
    # print ManyValues
    db.ExecuteMany(query1, ManyValues)
    # self.ParseStatus = "Done Preprocessing/Indexing Emails!"
    # return

    textParser = TextParser.TextParser(db, Globals.EmailsStopwords, self.Stemmer, bloomFilter=self.bloomFilter)
    docxParser = DocxParser.DocxParser(db, Globals.EmailsStopwords, self.Stemmer, bloomFilter=self.bloomFilter)
    docParser = DocParser.DocParser(db, Globals.EmailsStopwords, self.Stemmer, bloomFilter=self.bloomFilter)
    docQuery = "insert into %s (DocPath, DocType) values (?, ?)" % (Constants.TextCatDocumentsTable)

    if self.AttachmentsPath:
        for root, dirs, files in os.walk(self.AttachmentsPath):
            for eachfile in files:
                filePath = os.path.join(root, eachfile)
                fileNameList = eachfile.split()
                if len(fileNameList) >= 2:
                    dateTimeFileName = "%s %s - %s" % (fileNameList[0],
                                                       (fileNameList[1].replace(".", ":")),
                                                       (eachfile[eachfile.rfind('-') + 1:]))
                    if self.AttachmentsDict.has_key(dateTimeFileName):
                        self.AttachmentsDict[dateTimeFileName].append(filePath)
                    else:
                        self.AttachmentsDict[dateTimeFileName] = [filePath]
                        # print 'Interesting! more than 1 attach. file found with same date time: %s'
                    # else:
                    #     self.AttachmentsDict[dateTimeFileName] = filePath
                else:
                    self.fout.write('Attachment filename found without date time: %s\n' % (PlatformMethods.Encode(filePath)))

    # AttachmentsDict, Stopwords=[], Stemmer=None
    self.outlookTextParser = OutlookTextParser.OutlookTextParser(db, self.AttachmentsDict, Globals.EmailsStopwords, self.Stemmer, bloomFilter=self.bloomFilter, logFile=self.fout)

    if self.IndexMessages:
        self.ParseStatus = "Parsing and Indexing Emails..."
    else:
        self.ParseStatus = "Parsing Email Headers..."
    self.SendEvent()

    for root, dirs, files in os.walk(self.EmailsPath):
        if not self.keepGoing:
            self.running = False
            return
        for eachfile in files:
            self.FilesCount += 1
            if not self.keepGoing:
                self.running = False
                return
            filePath = os.path.join(root, eachfile)
            # print filePath
            if (filePath.rfind('.') == -1):
                continue
            try:
                extension = filePath[filePath.rfind('.'):]
                fileType = wx.TheMimeTypesManager.GetFileTypeFromExtension(extension)
                if fileType:
                    mimeType = fileType.GetMimeType() or "unknown"
                    if mimeType == "text/plain":
                        try:
                            self.outlookTextParser.parse(filePath, self.IndexMessages)
                        except Exception, msg:
                            self.fout.write('Error Parsing Message: %s Msg:: %s\n' % (PlatformMethods.Encode(filePath), msg))
                self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(time.time() - self.StartTime)
                if (time.time() - self.EventStart) > 10:
                    self.ElapsedTime = CommonFunctions.ConvertSecondsToDayHourMinSec(time.time() - self.StartTime)
                    self.SendEvent()
            except Exception, value:
                # try:
                self.fout.write("Error Parsing Message: %s Msg: %s\n" % (PlatformMethods.Encode(filePath), str(value)))
                self.fout.flush()
def getPage(n):
    a = Request()
    a.setUrl("http://www.infoempleo.com/trabajo/en_barcelona/area-de-empresa_informatica/pagina_" + str(n))
    a.perform()
    HTML = a.response.getContent()
    tp = TextParser()
    tp.setSource("string", HTML)
    lista = []
    while tp.readUntil('<tr class="[AB]">'):
        tp.readUntil('<td class="col1"><span>(.*)</span></td>')
        fecha = tp[0][0].split("-")
        if 9 == int(fecha[0]) and datetime.today().month == int(fecha[1]):
            tp.readUntil('<td class="col2"><a.*?>(.*)</a></td>')
            oferta = tp[0][0]
            tp.readUntil('<td class="col3"><strong><a.*?>(.*)</a></strong></td>')
            empresa = tp[0][0]
            tp.readUntil('<td class="col4"><a.*?>(.*)</a></td>')
            lugar = tp[0][0]
            tp.readUntil('<td class="col5">(.*)</td>')
            inscritos = tp[0][0]
            lista.append([oferta, empresa, lugar, inscritos])
    print "Downloaded page", n
    return lista
""" Author: Team 8 City: Chicago Subject: COMP90024 """ from collections import Counter import couchdb from textblob import TextBlob from TextParser import * # This file aims at finding out the most frequently mentioned words in a branch of tweets in a given view. # It's used in the report of scenario analysis part. if __name__ == '__main__': server = couchdb.Server('http://115.146.95.53:5984/') db = server['twitter_rest'] TextParser.getStopWords() textParser = TextParser() counter = Counter() for row in db.view('C2E2View/C2E2'): blob = TextBlob(textParser.parsing(row.value['what']['text'])) for word in blob.words: counter[word] += 1 top_words = counter.most_common(20)