def main(mode: str):
    """
    Main program. Either fetches new articles from RSS feeds or analyzes them using Textrazor.

    :param mode: One of 'fetch' or 'analyze'.
    :return: Nothing, only prints messages regarding success or failure.
    """
    # time-related
    now = datetime.now()

    # configuration, database connection
    conf = Config()
    db = DatabaseClient(conf.database.host, conf.database.database, conf.database.user,
                        conf.database.password, conf.database.port)

    # fetch / analyze articles
    if mode == "fetch":
        fetch(conf, db, now.isoformat())
    elif mode == "analyze":
        analyze(conf, db, now)
    else:
        raise ValueError("--mode must be one of 'fetch', 'analyze'.")

    # commit any changes to database
    db.connection.commit()
def test_analyze(self):
    self.assertEqual(analyze([]), {"avg": 0.0, "n": 0, "variance": 0.0})
    self.assertEqual(analyze([3.0]), {"avg": 3.0, "n": 1, "variance": 0.0})
    self.assertEqual(analyze([1.0, 2.0, 3.0, 4.0]),
                     {"avg": 2.5, "n": 4, "variance": 5.0 / 3.0})
    self.assertEqual(analyze([1.0, 2.0, 3.0, 4.0], linear_weights),
                     {"avg": 2.0, "n": 4, "variance": 2.0})
def main():
    """
    Main executable: check times, put them into intervals.
    """
    # load parsed data
    data, driver_hash = getData()

    # analyze data
    analyze(data, driver_hash)
def processApps(args, gpapi, apps):
    """
    Download and analyze apps on the Google Play Store.

    Arguments:
    args  -- the command line arguments object
    gpapi -- the Google Play API object
    apps  -- dictionary of apps and their metadata
    """
    createAppFolder(args)

    # count how many apps actually need processing
    i = 0
    j = 0
    for app in apps:
        if shouldProcess(apps[app]):
            j += 1
    print "found {:,} apps to process".format(j)

    pos = getRestorePoint(args)
    for app, meta in apps.iteritems():
        # we only care about apps which require the INTERNET permission,
        # haven't been checked yet, and are free
        if not shouldProcess(meta):
            continue

        # skip until at the position where we should resume
        i += 1
        if i < pos:
            continue

        # create restore point
        if i % args.restore_freq == 0 and i > 0 and args.restore_freq > 0:
            createRestorePoint(args, apps, i)

        # print progress
        sys.stdout.write("\rprocessing apps... %6.2f%% %10s: %s\033[K " % (100.0 * i / j, "app", app))
        sys.stdout.flush()

        try:
            fname = args.app_dir + app + ".apk"
            if download(gpapi, fname, app, meta['version'], meta['offer']):
                analyze(apps, fname, app)
                os.remove(fname)
        except:
            pass

    sys.stdout.write("\rdone processing apps\033[K\n")
    sys.stdout.flush()

    # clean up
    print "saving to cache"
    clearRestorePoint(args, apps)
    deleteAppFolder(args)
def test_analyze(self):
    self.assertEqual(analyze([]), {"avg": 0.0, "n": 0, "variance": 0.0})
    self.assertEqual(analyze([3.0]), {"avg": 3.0, "n": 1, "variance": 0.0})
    self.assertEqual(analyze([1.0, 2.0, 3.0, 4.0]), {
        "avg": 2.5,
        "n": 4,
        "variance": 5.0 / 3.0
    })
    self.assertEqual(analyze([1.0, 2.0, 3.0, 4.0], linear_weights), {
        "avg": 2.0,
        "n": 4,
        "variance": 2.0
    })
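# A minimal sketch (not the original implementation) of an analyze() that would
# satisfy the assertions above. It assumes linear_weights is a sequence of
# linearly decreasing per-value weights such as [4.0, 3.0, 2.0, 1.0], and that
# the variance is the unweighted sample variance around the (possibly weighted) mean.
def analyze(values, weights=None):
    n = len(values)
    if n == 0:
        return {"avg": 0.0, "n": 0, "variance": 0.0}
    if weights is None:
        avg = sum(values) / float(n)
    else:
        avg = sum(w * v for w, v in zip(weights, values)) / float(sum(weights))
    variance = 0.0
    if n > 1:
        variance = sum((v - avg) ** 2 for v in values) / float(n - 1)
    return {"avg": avg, "n": n, "variance": variance}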
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Green Tweets</title>
    """
    self.write(header)
    finhdr = """
    </head>
    <body style="background:lightgray">
    """
    self.write(finhdr)
    for tweet in tweetstore:
        screen_name = ""
        text = ""
        if a.cleanstr(tweet['text']) == '':
            # print "Skipping: " + tweet['text']
            pass
        else:
            if tweet.has_key('text'):
                text = tweet['text'].encode('ascii', 'ignore')
                text = convertLinks(text)
            if tweet.has_key('screen_name'):
                screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
            elif tweet.has_key('user'):
                if tweet['user'].has_key('screen_name'):
                    screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
            if a.finduser(screen_name) or a.findword(text) or a.findterm(text):
                self.write(tweet)
                self.write("\n")
    self.write("</body></html>")
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Twitter Client</title>
    <meta http-equiv="refresh" content="60" />
    </head>
    <body style="background:lightgray">
    """
    self.write(header)
    for tweet in rtstore:
        screen_name = ""
        if tweet.has_key('text'):
            text = tweet['text'].encode('ascii', 'ignore')
            text = convertLinks(text)
            if tweet.has_key('screen_name'):
                screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
            elif tweet.has_key('user'):
                if tweet['user'].has_key('screen_name'):
                    screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
            self.write("<p>")
            self.write(screen_name)
            self.write(text)
            self.write("</p>")
    self.write("</body></html>")
def __init__(self):
    QMainWindow.__init__(self)
    self.setAttribute(QtCore.Qt.WA_DeleteOnClose)
    self.setWindowTitle("application main window")

    self.file_menu = QMenu('&File', self)
    self.file_menu.addAction('&Quit', self.fileQuit,
                             QtCore.Qt.CTRL + QtCore.Qt.Key_Q)
    self.menuBar().addMenu(self.file_menu)

    self.help_menu = QMenu('&Help', self)
    self.menuBar().addSeparator()
    self.menuBar().addMenu(self.help_menu)
    self.help_menu.addAction('&About', self.about)

    self.main_widget = QWidget(self)

    file = "D:/Users/User/Documents/GitHub/gg/gggg.pcap"
    df = analyze(file, self)
    ax = create_plot(df)

    l = QVBoxLayout(self.main_widget)
    sc = MyStaticMplCanvas(ax, self.main_widget, width=5, height=4, dpi=100)
    l.addWidget(sc)

    self.main_widget.setFocus()
    self.setCentralWidget(self.main_widget)
    self.statusBar().showMessage("All hail matplotlib!", 2000)
def process(packets):
    packets_to_analyze = []
    packets_to_analyze_index = []
    packet_index = 0
    for packet in packets:
        if Raw in packet:
            packets_to_analyze.append(packet[Raw].load)
            packets_to_analyze_index.append(packet_index)
        packet_index += 1

    # group payloads: a load whose first byte is a control character (< 20) and
    # that contains "json" starts a new string, later loads are appended to it
    strings_to_analyze = [''] * len(packets_to_analyze)
    index = -1
    for load in packets_to_analyze:
        first_char = ord(load[0])
        if first_char < 20 and load.find("json") != -1:
            index += 1
            strings_to_analyze[index] = load
        else:
            string = strings_to_analyze[index]
            string += load
            strings_to_analyze[index] = string

    token = -2
    for load in strings_to_analyze:
        tmp_token = analyze(load)
        if tmp_token > 0:
            token = tmp_token

    if token > 0:
        for packet in packets:
            if Raw in packet:
                str_token = hex(token)[2:len(hex(token)) - 1]
                token += 1
                str_token_plus = hex(token)[2:len(hex(token)) - 1]
                packet[Raw].load = packet[Raw].load.replace(str_token, str_token_plus)
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Green Tweets</title>
    </head>
    <body style="background:lightgray">
    """
    self.write(header)
    for tweet in reversed(greenstore):
        screen_name = ""
        text = ""
        if tweet.has_key('text'):
            text = tweet['text'].encode('ascii', 'ignore')
            text = convertLinks(text)
        if tweet.has_key('screen_name'):
            screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
        elif tweet.has_key('user'):
            if tweet['user'].has_key('screen_name'):
                screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
        img_url = ""
        if tweet['user'].has_key('profile_image_url'):
            img_url = tweet['user']['profile_image_url']
        self.write("<p style='background:white; color: green'>")
        if img_url != "":
            self.write("<img src='" + img_url + "' style='float:left'>")
        self.write("<a href='http://twitter.com/" + screen_name[1:-2] + "' target='_blank'>" + screen_name + "</a>")
        self.write("<br/>")
        self.write(text)
        if tweet.has_key('created_at'):
            self.write("<br/>" + tweet['created_at'].encode('ascii', 'ignore'))
        self.write("</p>")
    self.write("</body></html>")
def test_wordcount(get_text_mock, spark_context):
    get_text_mock.return_value = "foo bar foo"
    result = analyze(spark_context)
    assert result[0] == ('foo', 2)
    assert result[1] == ('bar', 1)
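# A rough sketch (an assumption, not the project's actual job code) of the kind of
# analyze() this test exercises: a plain Spark-style word count over the text
# returned by a hypothetical get_text() helper, sorted by descending count.
def analyze(sc):
    text = get_text()
    counts = (sc.parallelize(text.split())
                .map(lambda word: (word, 1))
                .reduceByKey(lambda a, b: a + b)
                .sortBy(lambda pair: pair[1], ascending=False))
    return counts.collect()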
def analyze_file(self):
    file = QtWidgets.QFileDialog.getOpenFileName(
        self.MainWindow, "Open a File",
        filter="Wireshark capture file (*.pcap;*.pcapng);;All Files (*.*)")
    self.df = analyze(file[0])
    if self.df is not None:
        self.ui.actionCreateGraph.setEnabled(True)
def handle_messages():
    print "Handling Messages"
    payload = request.get_data()
    print payload
    for sender, message in messaging_events(payload):
        print "Incoming from %s: %s" % (sender, message)
        send_message(PAT, sender, analyze(message))
    return "ok"
def test(filename, graph):
    total = []
    data, features, labels = parse_csv(filename)

    SGD_pred = runSGD(data)
    total.append(analyze(SGD_pred, labels))
    SVC_pred = runSVC(data)
    total.append(analyze(SVC_pred, labels))
    MNB_pred = runMNB(data)
    total.append(analyze(MNB_pred, labels))

    # each element in total looks like:
    # [tn, fp, fn, tp, precision, recall, acc, fscore]
    if graph:
        bargraph([i[0] for i in total], [i[1] for i in total],
                 [i[2] for i in total], [i[3] for i in total],
                 [i[4] for i in total], [i[5] for i in total],
                 [i[6] for i in total], [i[7] for i in total])
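# A hedged sketch of what analyze(pred, labels) might compute here, matching the
# [tn, fp, fn, tp, precision, recall, acc, fscore] layout noted in the comment
# above; it assumes binary 0/1 predictions and labels, and the real project may
# derive these metrics differently.
def analyze(pred, labels):
    tn = fp = fn = tp = 0
    for p, y in zip(pred, labels):
        if p == y:
            if p:
                tp += 1
            else:
                tn += 1
        else:
            if p:
                fp += 1
            else:
                fn += 1
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    acc = (tp + tn) / float(len(labels)) if labels else 0.0
    fscore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return [tn, fp, fn, tp, precision, recall, acc, fscore]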
def run(self):
    print "Starting consumer thread"
    a = analyze()
    global queue
    global tweetstore
    global rtstore
    block = ["insafediver", "MakeUseOf", "healthyworld24", "ISSAboveYou", "JoeGumby1", "Gumbletech"]
    blockterms = ["invest", "market", "nasa", "untuk"]
    prefuser = ["davewiner", "scobleizer"]
    while True:
        # lock.acquire()
        condition.acquire()
        if not queue:
            # print "Nothing in queue: printer will try again"
            condition.wait()
            pass
        else:
            num = queue.pop()
            screen_name = ""
            text = ""
            if num.has_key('user'):
                if num['user'].has_key('screen_name'):
                    screen_name = num['user']['screen_name'].encode('ascii', 'ignore')
            if num.has_key('text'):
                text = num['text'].encode('ascii', 'ignore')
            if screen_name in prefuser:
                # print screen_name, text
                for client in wsckts:
                    wsckts[client]['object'].send_msg(num)
                greenstore.append(num)
                # if self.client:
                #     self.db.tweets.insert_one(num)
            elif text.startswith('RT'):
                # rtstore.append(num)
                pass
            elif text.find('http') != -1:
                if screen_name not in block:
                    blck = True
                    for term in blockterms:
                        if text.lower().find(term) > 0:
                            blck = False
                    if blck and not a.cleanstr(text).isupper():
                        if isgreen(num):
                            for client in wsckts:
                                wsckts[client]['object'].send_msg(num)
                            greenstore.append(num)
                            # if self.client:
                            #     self.db.tweets.insert_one(num)
                        else:
                            for client in rwsckts:
                                rwsckts[client]['object'].send_msg(num)
                            tweetstore.append(num)
                            # if self.client:
                            #     self.db2.redtweets.insert_one(num)
        condition.release()
def add_entry():
    db = get_db()
    text = request.form['text']
    analysis = analyze(text)
    bytes = cPickle.dumps(analysis[1], 1)
    db.execute('insert into entries (text, time, tones) values (?, ?, ?)',
               [text, analysis[0], str(bytes)])
    db.commit()
    flash('New entry was successfully posted')
    return redirect(url_for('analyzeWeb'))
def save_and_analyze_file(self):
    file_name = QtWidgets.QFileDialog.getSaveFileName(
        self.MainWindow, "Save into a File",
        filter="Wireshark capture file (*.pcap;*.pcapng);;All Files (*.*)")
    if file_name[0]:
        self.sniffer.write_into_pcap(file_path_name=file_name[0])
        self.df = analyze(file_name[0])
        if self.df is not None:
            self.ui.actionCreateGraph.setEnabled(True)
def isgreen(tweet):
    text = ""
    screen_name = ""
    if tweet.has_key('text'):
        text = tweet['text'].encode('ascii', 'ignore')
        text = convertLinks(text)
    if tweet.has_key('screen_name'):
        screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
    elif tweet.has_key('user'):
        if tweet['user'].has_key('screen_name'):
            screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
    a = analyze()
    if a.finduser(screen_name) or a.findword(text) or a.findterm(text):
        return True
    else:
        return False
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Green Tweets</title>
    <meta http-equiv="refresh" content="30" />
    </head>
    <body style="background:lightgray">
    """
    self.write(header)
    for tweet in tweetstore:
        screen_name = ""
        text = ""
        if a.cleanstr(tweet['text']) == '':
            # print "Skipping: " + tweet['text']
            pass
        else:
            if tweet.has_key('text'):
                text = tweet['text'].encode('ascii', 'ignore')
                text = convertLinks(text)
            if tweet.has_key('screen_name'):
                screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
            elif tweet.has_key('user'):
                if tweet['user'].has_key('screen_name'):
                    screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
            if a.finduser(screen_name) or a.findword(text) or a.findterm(text):
                img_url = ""
                if tweet['user'].has_key('profile_image_url'):
                    img_url = tweet['user']['profile_image_url']
                self.write("<p style='background:white; color: green'>")
                if img_url != "":
                    self.write("<img src='" + img_url + "' style='float:left'>")
                self.write("<a href='http://twitter.com/" + screen_name[1:-2] + "' target='_blank'>" + screen_name + "</a>")
                self.write("<br/>")
                self.write(text)
                if tweet.has_key('created_at'):
                    self.write("<br/>" + tweet['created_at'].encode('ascii', 'ignore'))
                self.write("</p>")
    self.write("</body></html>")
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Twitter Client</title>
    <meta http-equiv="refresh" content="60" />
    </head>
    <body style="background:lightgray">
    """
    self.write(header)
    self.write("<a href='/green'>Green Tweets</a>")
    self.write("<a href='/red'>Red Tweets</a>")
    self.write("<a href='/stats'>Stats</a>")
    self.write("<a href='/rt'>RTs</a>")
    self.write("<a href='gjson'>Green Tweets JSON</a>")
    self.write("</body></html>")
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Green Tweets</title>
    """
    self.write(header)
    finhdr = """
    </head>
    <body style="background:lightgray">
    """
    self.write(finhdr)
    for tweet in greenstore:
        self.write(tweet)
        self.write("\n")
    self.write("</body></html>")
def get(self):
    a = analyze()
    header = """
    <html>
    <head>
    <title>Twitter Client</title>
    <meta http-equiv="refresh" content="60" />
    </head>
    <body style="background:lightgray">
    """
    self.write(header)
    for tweet in tweetstore:
        screen_name = ""
        text = ""
        if a.cleanstr(tweet['text']) == '':
            # print "Skipping: " + tweet['text']
            pass
        else:
            if tweet.has_key('text'):
                text = tweet['text'].encode('ascii', 'ignore')
                text = convertLinks(text)
            if tweet.has_key('screen_name'):
                screen_name = "@" + tweet['screen_name'].encode('ascii', 'ignore') + ": "
            elif tweet.has_key('user'):
                if tweet['user'].has_key('screen_name'):
                    screen_name = "@" + tweet['user']['screen_name'].encode('ascii', 'ignore') + ": "
            if a.finduser(screen_name):
                self.write("<p style='color: green'>")
            elif a.findword(text) or a.findterm(text):
                self.write("<p style='color: green'>")
                txt = a.appendword(text)
                if txt != "":
                    text = txt
            else:
                self.write("<p style='color: red'>")
            self.write(screen_name)
            self.write(text)
            self.write("</p>")
    self.write("</body></html>")
catDir = cat[0] + '_nT' + cat[1] + '_nW' + cat[2] + '_nB' + cat[3]
datahists = {}
bkghists = {}
sighists = {}
if len(sys.argv) > 1:
    outDir = sys.argv[1]
else:
    outDir = os.getcwd()
outDir += '/' + pfix
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + cutString
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + catDir
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)

category = {'isEM': cat[0], 'nttag': cat[1], 'nWtag': cat[2], 'nbtag': cat[3]}

for data in dataList:
    datahists.update(analyze(tTreeData, data, cutList, isotrig, False, doJetRwt, iPlot, plotList[iPlot], category, region))
    if catInd == nCats:
        del tFileData[data]

for bkg in bkgList:
    bkghists.update(analyze(tTreeBkg, bkg, cutList, isotrig, doAllSys, doJetRwt, iPlot, plotList[iPlot], category, region))
    if catInd == nCats:
        del tFileBkg[bkg]
    if doAllSys and catInd == nCats:
        for syst in shapesFiles:
            for ud in ['Up', 'Down']:
                del tFileBkg[bkg + syst + ud]

for sig in sigList:
    for decay in decays:
        sighists.update(analyze(tTreeSig, sig + decay, cutList, isotrig, doAllSys, doJetRwt, iPlot, plotList[iPlot], category, region))
        if catInd == nCats:
            del tFileSig[sig + decay]
        if doAllSys and catInd == nCats:
            for syst in shapesFiles:
                for ud in ['Up', 'Down']:
                    del tFileSig[sig + decay + syst + ud]

if doQ2sys:
def test_wordcount_analyze(_, __):
    result = analyze(Context())
    assert len(result) == 327
    assert result[:6] == [('ut', 17), ('eu', 16), ('vel', 14), ('nec', 14), ('quis', 12), ('vitae', 12)]
from analyze import *
import summary as sm

pcaps = all_pcaps()
local, remote, dir = next(pcaps)
results = analyze(local, remote, dir)
local_sender = results[0]
local_sender_steady = results[1]
remote_sender = results[2]
remote_sender_steady = results[3]
throughput_quantiles, rtt_quantiles, host, protocol, start_time, _ = results[0]
rtt_quantiles
from mock import patch
from pysparkling import Context

from jobs.{{cookiecutter.job}} import analyze


@patch('jobs.{{cookiecutter.job}}.{{ cookiecutter.project.replace(' ', '').replace('-', '') }}Context.initalize_counter')
@patch('jobs.{{cookiecutter.job}}.{{ cookiecutter.project.replace(' ', '').replace('-', '') }}Context.inc_counter')
def test_{{cookiecutter.job}}_analyze(_, __):
    result = analyze(Context())
    assert len(result) == 327
outDir += '/' + pfix
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
#outDir += '/' + cutString
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + catDir
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)

category = {
    'isEM': cat[0],
    'nttag': cat[1],
    'nWtag': cat[2],
    'nbtag': cat[3],
    'njets': cat[4]
}

for data in dataList:
    datahists.update(
        analyze(tTreeData, data, cutList, False, iPlot, plotList[iPlot], category))
    if catInd == nCats:
        del tFileData[data]

pickle.dump(datahists, open(outDir + '/datahists_' + iPlot + '.p', 'wb'))
catInd += 1

catInd = 1
for cat in catList:
    if not runBkgs:
        break
    catDir = cat[0] + '_nT' + cat[1] + '_nW' + cat[2] + '_nB' + cat[3] + '_nJ' + cat[4]
    catDir = catDir.replace('_nT0p', '').replace('_nW0p', '').replace('_nB0p', '').replace('_nJ0p', '')
    bkghists = {}
    if len(sys.argv) > 1:
        outDir = sys.argv[1]
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + catDir
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)

category = {
    'isEM': cat[0],
    'nttag': cat[1],
    'nWtag': cat[2],
    'nbtag': cat[3],
    'njets': cat[4]
}

for data in dataList:
    print "*****" * 20
    print "*****" * 20
    print "[data] : ", category, region, isCategorized
    datahists.update(
        analyze(tTreeData, data, data, cutList, False, doJetRwt, iPlot,
                plotList[iPlot], category, region, isCategorized))
    if catInd == nCats:
        del tFileData[data]

pickle.dump(datahists, open(outDir + '/datahists_' + iPlot + '.p', 'wb'))
catInd += 1

catInd = 1
for cat in catList:
    if not runBkgs:
        break
    if skip(cat[4], cat[3]) and isCategorized:
        continue  # DO YOU WANT TO HAVE THIS??
    catDir = cat[0] + '_nT' + cat[1] + '_nW' + cat[2] + '_nB' + cat[3] + '_nJ' + cat[4]
    bkghists = {}
    if len(sys.argv) > 1:
        outDir = sys.argv[1]
    else:
        outDir = os.getcwd()
catDir = cat[0] + '_nT' + cat[1] + '_nW' + cat[2] + '_nB' + cat[3]
datahists = {}
bkghists = {}
sighists = {}
if len(sys.argv) > 1:
    outDir = sys.argv[1]
else:
    outDir = os.getcwd() + '/'
outDir += pfix
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
if not os.path.exists(outDir + '/' + cutString):
    os.system('mkdir ' + outDir + '/' + cutString)
outDir += '/' + cutString
if not os.path.exists(outDir + '/' + catDir):
    os.system('mkdir ' + outDir + '/' + catDir)
outDir += '/' + catDir

category = {'isEM': cat[0], 'nttag': cat[1], 'nWtag': cat[2], 'nbtag': cat[3]}

for data in dataList:
    datahists.update(analyze(tTreeData, data, cutList, False, iPlot, plotList[iPlot], category))
    if catInd == nCats:
        del tFileData[data]

for bkg in bkgList:
    bkghists.update(analyze(tTreeBkg, bkg, cutList, doAllSys, iPlot, plotList[iPlot], category))
    if catInd == nCats:
        del tFileBkg[bkg]
    if doAllSys and catInd == nCats:
        for syst in shapesFiles:
            if 'DataDriven' in bkg:
                continue
            for ud in ['Up', 'Down']:
                del tFileBkg[bkg + syst + ud]

for sig in sigList:
    for decay in decays:
        sighists.update(analyze(tTreeSig, sig + decay, cutList, doAllSys, iPlot, plotList[iPlot], category))
        if catInd == nCats:
            del tFileSig[sig + decay]
        if doAllSys and catInd == nCats:
            for syst in shapesFiles:
                if 'DataDriven' in bkg:
                    continue
import json

from analyze import *
from load_data import load_data

if __name__ == '__main__':
    with open('keys.json') as f:
        key = json.loads(f.read())['usda-api-key']
    #load_data(key)
    analyze()
outDir = os.getcwd()
outDir += '/' + pfix
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + cutString
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)
outDir += '/' + catDir
if not os.path.exists(outDir):
    os.system('mkdir ' + outDir)

category = {
    'isEM': cat[0],
    'nttag': cat[1],
    'nWtag': cat[2],
    'nbtag': cat[3]
}

for data in dataList:
    datahists.update(
        analyze(tTreeData, data, cutList, isotrig, False, doJetRwt, iPlot,
                plotList[iPlot], category, region))
    if catInd == nCats:
        del tFileData[data]

for bkg in bkgList:
    bkghists.update(
        analyze(tTreeBkg, bkg, cutList, isotrig, doAllSys, doJetRwt, iPlot,
                plotList[iPlot], category, region))
    if catInd == nCats:
        del tFileBkg[bkg]
    if doAllSys and catInd == nCats:
        for syst in shapesFiles:
            for ud in ['Up', 'Down']:
                del tFileBkg[bkg + syst + ud]

for sig in sigList:
    for decay in decays:
        sighists.update(
            analyze(tTreeSig, sig + decay, cutList, isotrig, doAllSys, doJetRwt,
                    iPlot, plotList[iPlot], category, region))
if len(syslist) > 1:
    print('Systems:')
    for sysname in sorted(syslist):
        print("    " + sysname)
    print()

# analyze the systems:
if len(syslist) == 1 and syslist[0] == '':
    print('no system?')
    usage()
    sys.exit(1)

switches = (show_events, show_disk, show_details, show_filesystem, show_mem, show_ping)
for sysname in sorted(syslist):
    analyze(sysname, allSystems, switches)
    print()

##------------------------------------------------------------------------------
## pretty-print the resulting dictionary:
#print('allSystems:')
#print()
#pp.pprint(allSystems)
#print()

# ----------------------------------------------------------------------------
#
# ----------------------------------------------------------------------------
# EOF:
                  outfile=outfile)
    return datafiles


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run the benchmark tests")
    parser.add_argument('--engines', type=str, nargs='+', default=['ode', 'pqp', 'fcl'],
                        help="The collision checkers to test")
    args = parser.parse_args()

    # Self collision
    self_collision_data = run_self_collision(args.engines)
    out_basename = os.path.join(package_path, 'results', 'self_collision')
    analyze(self_collision_data, title='Self Collision', out_basename=out_basename)

    # Empty environment
    empty_collision_data = run_environment_collision(args.engines)
    out_basename = os.path.join(package_path, 'results', 'empty_env_collision')
    analyze(empty_collision_data, title='Empty Environment Collisions', out_basename=out_basename)

    # PR kitchen
    from catkin.find_in_workspaces import find_in_workspaces
    kitchen_env = find_in_workspaces(
        search_dirs=['share'],
        project='pr_ordata',
        path='data/kitchen/pr_kitchen.env.xml',
        first_match_only=True)[0]
    kitchen_collision_data = run_environment_collision(args.engines, kitchen_env, 'kitchen')
import time
from gensim.models.word2vec import Word2Vec
from Utils.string_utils import clean_str
from Utils.file_utils import find_files
from analysis_pipeline import analyze, debug_analyze
from analysis_pipeline import build_synonym_filter, fact_case_sensitive_stop_word_filter, fact_stop_word_filter
from analysis_pipeline import fact_is_synonym_filter, white_space_tokenize, remove_punct_at_end_filter, lower_case_filter, remove_empty_tokens_filter
from Config.train_word2vec_model_config import TrainWord2VecModelConfig
import sys

""" TRAIN Word 2 Vec Model """

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

config = TrainWord2VecModelConfig(sys.argv[1])

""" Load analysis chain """
syn_mapper = build_synonym_filter(config.keywords_files, config.case_sensitive)

if config.case_sensitive:
    stop_filter = fact_case_sensitive_stop_word_filter(config.stop_words_file)
else:
    stop_filter = fact_stop_word_filter(config.stop_words_file)

# Simon Hughes: This is quite inefficient, as each function is applied in turn
# resulting in multiple passes over the token stream. While not currently a
# big performance bottleneck, could be much faster.
# - TODO: use functional composition to speed up
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)
analysis_chain = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  stop_filter,
                  syn_mapper.map_synonyms,
                  remove_empty_tokens_filter]
                  # is_a_synonym_filter] - Un-comment to just train on keywords.

# Test
# rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management", analysis_chain)

""" Load Documents """
start = time.time()

sentences = []
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" % (len(sentences), str(end - start)))

""" Analyze - clean, tokenize, extract phrases """
print("%i sentences to process" % len(sentences))

tokenized = []
print("Tokenizing sentences")
for i, sent in enumerate(sentences):
    tokens = analyze(sent, analysis_chain)
    if len(tokens) >= config.min_sentence_length_words:
        tokenized.append(tokens)
    if i % 100000 == 0:
        print(i)

""" Train Model """
start = time.time()
print("Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee")
model = Word2Vec(tokenized, iter=config.training_iterations, size=config.vector_size,
                 window=config.window_size, min_count=config.min_word_count,
                 workers=config.workers, sample=1e-5, hs=0, negative=20)
model.save(config.model_file)
end = time.time()
print "Took %s seconds" % (end - start)
print()

# display list of systems we found:
syslist = list(allSystems)
if len(syslist) > 1:
    print('Systems:')
    for sysname in sorted(syslist):
        print("    " + sysname)
    print()

# analyze the systems:
if len(syslist) > 0:
    for sysname in sorted(syslist):
        analyze(sysname, allSystems)
        print()

# close the output file if we had one:
if lclvars.outfile != None:
    lclvars.outfile.close()

##------------------------------------------------------------------------------
## pretty-print the resulting dictionary:
#print('allSystems:')
#print()
#pp.pprint(allSystems)
#print()

# ----------------------------------------------------------------------------