def do_many_days_s3_APPS(argv): if len(argv) < 3: print "usage: start_day end_day" print "usage: 2011-10-7 2011-10-13" sys.exit() QUERY_FILES = [ 'euospcomp07.osp_query.log', 'euospcomp08.osp_query.log', 'euospsch01.osp_query.log', 'euospsch03.osp_query.log', 'euospsch01.2.osp_query.log', 'euospsch03.2.osp_query.log', ] CLICK_FILES = [ 'euospsch03.osp_click.log', 'euospcomp08.osp_click.log' ] #QUERY_FILES = QUERY_FILES[:2] daylist = handolUtil.get_day_list(argv[1],argv[2]) analyzer = LogAnalyzer("APPS", "%s-%s" %(daylist[0], daylist[-1])) stopwatch = handolUtil.StopWatch() for daystr in daylist: for q in QUERY_FILES: s3_file = "DEVELOPING/app/7nmc1m75ij/apps-log/query_log/%s/%s.%s" % (daystr, q, daystr) analyzer.querylog_from_s3("sch-emr", s3_file) print "Loading & ETL: %f sec" % (stopwatch.laptime()) analyzer.get_stats() print "Calc Stats: %f sec" % (stopwatch.laptime()) analyzer.write_info()
def get_stats_from_qcmatched(self, fname): """ calculate statistics from Query/Click matched log """ stopwatch = handolUtil.StopWatch() stopwatch.start() qcmatched_list = QcLogAnalyzer.loadfile_qc_matched(fname) print "Loading : %f sec" % (stopwatch.laptime()) stopwatch.start() for v in qcmatched_list: im = v[0] num_clicks = int(v[1]) click_pos = int(v[2]) tot = int(v[3]) devtype = v[4] dos = v[5] dm = v[6] shop = v[7] keyword = v[8] if tot == 0: nores = 1 else: nores = 0 if num_clicks > 0: anyclick = 1 else: anyclick = 0 if is_non_ascii(keyword): english = 0 else: english = 1 os_shop = "%s_%s" % (dos, shop) shop_keyw = (shop, keyword) self.langD.add(english, [1, anyclick, nores, click_pos, num_clicks]) self.imD.add(im, [1, anyclick, nores, click_pos, num_clicks, english ]) # query count, click self.keywordD.add( keyword, [1, anyclick, nores, click_pos, num_clicks, english]) self.shopkeywordD.add(shop_keyw, [1, anyclick, nores, click_pos, num_clicks]) self.shopD.add( shop, [1, anyclick, nores, click_pos, num_clicks, english]) self.osshopD.add( os_shop, [1, anyclick, nores, click_pos, num_clicks, english]) self.devtypeD.add( devtype, [1, anyclick, nores, click_pos, num_clicks, english]) self.dosD.add(dos, [1, anyclick, nores, click_pos, num_clicks, english]) self.dmD.add(dm, [1, anyclick, nores, click_pos, num_clicks, english]) #self.diffsecD.add(diffsec, [1, anyclick, nores, click_pos, len(clicklist)]) if tot == 0: self.zeroD.add(keyword, [1, english]) #self.zeroshopD.add(shop, [1]) #self.zeroshopkeywordD.add(shop_keyw, [1]) print "Calculating : %f sec" % (stopwatch.laptime())
def test_w_local_LHRHVH(): analyzer = LogAnalyzer("LHRHVH", "any") stopwatch = handolUtil.StopWatch() analyzer.querylog_from_file("osp_query.log.20130306") print "Loading & ETL: %f sec" % (stopwatch.laptime()) analyzer.get_stats() print "Calc Stats: %f sec" % (stopwatch.laptime()) analyzer.write_info()
def test_w_local_APPS(): analyzer = LogAnalyzer("APPS", "any") stopwatch = handolUtil.StopWatch() analyzer.querylog_from_file("euospsch03.2.osp_query.log.20130215") #analyzer.querylog_from_file("a.query.log") print "Loading & ETL: %f sec" % (stopwatch.laptime()) analyzer.get_stats() print "Calc Stats: %f sec" % (stopwatch.laptime()) analyzer.write_info()
def main(self, bucket, cfiles, qfiles, outfile): stopwatch = handolUtil.StopWatch() for cfile in cfiles: self.add_click_file(cfile, bucket) print "Loading Click - %s: %f sec" % (cfile, stopwatch.laptime()) qclistAll = [] for qfile in qfiles: qclist = self.add_query_file(qfile, bucket) qclistAll += qclist print "Load Query & Match - %s: %f sec" % (qfile, stopwatch.laptime()) QueryClickMatcher.save_qc_match(qclistAll, outfile)
def load_qcmatched(org_log_dir, daystr): os.chdir(org_log_dir) print "DIR:", os.getcwd() print "DAY:", daystr stopwatch = handolUtil.StopWatch() stopwatch.start() analyzer = QcLogAnalyzer(daystr) QCLOG = "qc_matched.all.%s.log" % daystr analyzer.get_stats_from_qcmatched(QCLOG) print "Loading & Statistics: %f sec" % (stopwatch.laptime()) stopwatch.start() analyzer.print_stats('%s.all.csv' % (daystr)) print "Saving output: %f sec" % (stopwatch.laptime())
def do_many_days_s3_LHRHVH(argv): if len(argv) < 3: print "usage: start_day end_day" print "usage: 2011-10-7 2011-10-13" sys.exit() daylist = handolUtil.get_day_list(argv[1],argv[2]) analyzer = LogAnalyzer("LHRHVH", "%s-%s" %(daylist[0], daylist[-1])) stopwatch = handolUtil.StopWatch() for daystr in daylist: s3_file = "DEVELOPING/app/7nmc1m75ij/hubs1-log/query_log/osp_query.log.%s" % (daystr) analyzer.querylog_from_s3("sch-emr", s3_file) print "Loading & ETL: %f sec" % (stopwatch.laptime()) s3_file = "DEVELOPING/app/7nmc1m75ij/hubs2-log/query_log/osp_query.log.%s" % (daystr) analyzer.querylog_from_s3("sch-emr", s3_file) print "Loading & ETL: %f sec" % (stopwatch.laptime()) analyzer.get_stats() print "Calc Stats: %f sec" % (stopwatch.laptime()) analyzer.write_info()
def load_keywords_stat(fname, resfname, wikif): """ """ watch = handolUtil.StopWatch() print "Loading ...", wikif wikiStat = {} fp = codecs.open(wikif, 'rb', encoding='utf-8') for line in fp: flds = line.split() w = flds[0].lower().replace('_', ' ') n = int(flds[1]) if not wikiStat.has_key(w): wikiStat[w] = n fp.close() print "Loaded: %f" % (watch.laptime()) """ == keywords.stat 3997563 270333 6410 facebook 2399787 186241 45449 whatsapp 1433213 71764 1458 whats app 1315718 130860 2040 temple run 1004372 163723 272 games 947351 87185 19846 skype 829801 316731 834 angry birds 529552 64385 1416 fruit ninja """ print "Loading ...", fname highKeywords = DictDict() noresKeywords = [] fp = codecs.open(fname, 'rb', encoding='utf-8') for line in fp: flds = line.split('\t') q = int(flds[0]) c = int(flds[1]) nores = int(flds[2]) keyword = flds[3].strip() kflds = keyword.split() if len(kflds) > 2: continue noresratio = (nores * 100) / q if q > 10 and noresratio > 50: if not wikiStat.has_key(keyword): noresKeywords.append([keyword, q, noresratio]) if q > 150 and noresratio < 4: highKeywords.insert(keyword, [q, noresratio]) fp.close() print "Loaded: %f" % (watch.laptime()) print "noresKeywords: %d" % (len(noresKeywords)) print "highKeywords: %d" % (len(highKeywords.D)) fp = codecs.open(resfname, 'wb', encoding='utf-8') for noresK in noresKeywords: nearest = highKeywords.searchNearest(noresK[0], noresK[1]) if nearest: prnformat = u'\t'.join( map(lambda (d, x, y, z): "%d\t%s\t%d\t%d" % (d, x, y, z), nearest)) fp.write("%s\t%d\t%d\t%s\n" % (noresK[0], noresK[1], noresK[2], prnformat)) fp.close() print "Processed: %f" % (watch.laptime())
# NOTE(review): the statements below duplicate the body of load_qcmatched()
# and reference `daystr` before any assignment visible in this chunk -- they
# look like an orphaned fragment of a function whose `def` line is missing
# here. Confirm against the full file before running at module level.
stopwatch = handolUtil.StopWatch()
stopwatch.start()
analyzer = QcLogAnalyzer(daystr)
QCLOG = "qc_matched.all.%s.log" % daystr
analyzer.get_stats_from_qcmatched(QCLOG)
print "Loading & Statistics: %f sec" % (stopwatch.laptime())
stopwatch.start()
analyzer.print_stats('%s.all.csv' % (daystr))
print "Saving output: %f sec" % (stopwatch.laptime())
#analyzer.print_info('%s.csv' % (daystr))
#analyzer.save_to_mongo()
#analyzer.save_keywords()

if __name__ == "__main__":
    import os
    import sys
    #os.chdir(sys.argv[1])
    #daystr = sys.argv[2]
    # Hard-coded local run parameters; the argv-driven variants above are
    # commented out -- presumably for ad-hoc debugging on one day's logs.
    org_log_dir = u"D:/0001/9 org logs/20130215"
    daystr = '20130215'
    stopwatch = handolUtil.StopWatch()
    stopwatch.start()
    #save_qcmatched(org_log_dir, daystr)
    load_qcmatched(org_log_dir, daystr)
    # Quick sanity checks of the non-ASCII detector (expects False then True
    # -- TODO confirm is_non_ascii's return convention).
    print is_non_ascii('avcd')
    print is_non_ascii('微信')