import sys
from collections import Counter

import yu  # assumed local helper module providing avro_reader, get_clickstream_fingerprint, read_judged_queries


def process_file(known_queries, file):
    # Stream sessions from an avro file and print one tab-separated row per
    # clickstream fingerprint, stopping after 100,000 sessions.
    count = 0
    out_count = 1
    out = "%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d"
    for e in yu.avro_reader(file):
        count += 1
        if count == 100001:
            break
        if count % 10000 == 0:
            print >>sys.stderr, "Count", count, "out", out_count
            sys.stderr.flush()
        for fp in yu.get_clickstream_fingerprint(known_queries, e):
            print out % (fp[0][0], fp[1][0], fp[2][0], fp[3][0],
                         fp[1][1], fp[2][1], fp[3][1],
                         fp[1][2], fp[2][2], fp[3][2])
            out_count += 1
    return count
def process_file(query_hashes, file):
    # Count (queryHash, regionId, url, position-bucket) occurrences for the
    # judged queries, over at most 100,000 sessions.
    counter = Counter()
    c = 0  # queries seen
    s = 0  # sessions seen
    o = 0  # queries kept (hash present in query_hashes)
    for e in yu.avro_reader(file):
        s += 1
        if s == 100000:
            break
        for q in e['queries']:
            c += 1
            if c % 10000 == 0:
                print >>sys.stderr, "Queries:", c, "Sessions:", s, "Out", o
            if q['queryHash'] not in query_hashes:
                continue
            o += 1
            for pos, url in enumerate(q['urls']):
                pos_marker = 0 if pos < 5 else 1  # top-5 result or below
                key = "%d\t%d\t%d\t%d" % (q['queryHash'], e['regionId'], url, pos_marker)
                counter[key] += 1
    print >>sys.stderr, "Queries:", c, "Sessions:", s, "Out", o
    dump_cache(counter)
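# dump_cache is called above but not defined in this excerpt; a minimal sketch
# (an assumption, not the original implementation) that writes each
# tab-separated key together with its count to stdout:
def dump_cache(counter):
    for key, cnt in counter.iteritems():
        print "%s\t%d" % (key, cnt)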
def main(args):
    # First argument: file of judged queries; remaining arguments: avro files.
    known_queries = yu.read_judged_queries(args[0])
    for f in args[1:]:
        process_file(known_queries, f)
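# Assumed entry point (not shown in the original excerpt): pass the judged-query
# file followed by one or more avro session files on the command line.
if __name__ == '__main__':
    main(sys.argv[1:])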