def load_run_and_attach_gain(runfile, updlens, nuggets, matches,
                             useAverageLengths, track, query_durns):
    """Read a run file and build one Update per line, with its nuggets attached.

    Every non-blank line of ``runfile`` must hold exactly seven
    whitespace-separated fields::

        qid teamid runid docid sentid updtime confidence

    The update id is ``docid + '-' + sentid``.  When ``track`` is ``'ts14'``
    the query id is prefixed with ``'TS14.'`` before any lookup is made.

    Args:
        runfile: Path of the plain-text run file.
        updlens: ``updlens[qid][updid]`` gives the length of an update;
            ``updlens[qid]["topic.avg.update.length"]`` is the fallback used
            when ``useAverageLengths`` is true.
        nuggets: ``nuggets[qid][ngtid]`` unpacks into ``(gain, time)``.
        matches: ``matches[qid][updid]`` is an iterable of nugget ids matched
            by that update.
        useAverageLengths: If true, updates without a known length get the
            per-topic average length; otherwise they get 30.
        track: Track name; only ``'ts14'`` changes behaviour (qid prefix).
        query_durns: ``query_durns[qid][0]`` is subtracted from each update
            time so that timestamps start from 0.

    Returns:
        dict mapping qid to the list of ``Update`` objects for that query, in
        file order.  Each object's ``nuggets`` attribute holds the ``Nugget``
        objects matched by the update.

    Raises:
        KeyError: If the qid is missing from ``query_durns``, ``updlens`` or
            ``matches`` (or from ``nuggets`` when the update has matches).
        ValueError: If a line does not have seven fields or a numeric field
            cannot be parsed.
    """
    run = {}
    with open(runfile) as rf:
        for raw_line in rf:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue

            # teamid and runid are present in the file but not used here.
            qid, _teamid, _runid, docid, sentid, updtime, confidence = fields
            if track == 'ts14':
                qid = 'TS14.' + qid
            updid = docid + '-' + sentid

            # Shift timestamps so that they start from 0.
            updtime = float(updtime) - query_durns[qid][0]
            confidence = float(confidence)

            # Fallback length first (30, or the topic average), then the
            # update's own length when one is recorded.
            topic_lengths = updlens[qid]
            if useAverageLengths:
                fallback_len = topic_lengths["topic.avg.update.length"]
            else:
                fallback_len = 30
            if updid in topic_lengths:
                updlen = topic_lengths[updid]
            else:
                updlen = fallback_len

            # Collect the nuggets this update matched; an update with no
            # entry in matches is simply not relevant.
            topic_matches = matches[qid]
            matched_ids = topic_matches[updid] if updid in topic_matches else ()
            matching_nuggets = []
            for nugget_id in matched_ids:
                topic_nuggets = nuggets[qid]
                if nugget_id in topic_nuggets:
                    # (there are 2 matched nuggets that are not in nuggets.tsv)
                    gain, timestamp = topic_nuggets[nugget_id]
                    matching_nuggets.append(
                        Nugget(nugget_id, gain, timestamp))

            updobj = Update(qid, updid, updtime, confidence, updlen,
                            len(matching_nuggets), "")
            updobj.nuggets = matching_nuggets
            run.setdefault(qid, []).append(updobj)
    return run
def microblog_load_run_and_attach_gain(runfile, nuggets, matches, track,
                                       query_durns):
    """Read a gzipped microblog run and build one Update per judged tweet.

    Every non-blank line of ``runfile`` must hold exactly four
    whitespace-separated fields::

        qid tweet epoch runtag

    When ``track`` is ``'mb15'`` the substring ``"MB"`` is removed from the
    query id before any lookup.  Lines whose qid is not a key of ``matches``
    are ignored.

    Args:
        runfile: Path of the gzip-compressed run file.
        nuggets: ``nuggets[qid][mc]`` unpacks into ``(gain, time)``.
        matches: ``matches[qid][tweet]`` is the single nugget/cluster id
            matched by that tweet.
        track: Track name; only ``'mb15'`` changes behaviour (qid rewrite).
        query_durns: ``query_durns[qid][0]`` is subtracted from each epoch.

    Returns:
        dict mapping qid to the list of ``Update`` objects for that query, in
        file order.  Each object's ``nuggets`` attribute is a list holding at
        most one ``Nugget``.

    Raises:
        KeyError: If a qid present in ``matches`` is missing from
            ``query_durns`` (an error is logged first).
        ValueError: If a non-blank line does not have four fields or the
            epoch cannot be parsed as a float.
    """
    run = {}
    # Text mode: gzip.open defaults to binary on Python 3, which would yield
    # bytes fields that never match the str keys of matches/query_durns and
    # make qid.replace("MB", "") raise TypeError.
    with gzip.open(runfile, "rt") as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                # Skip blank lines, as load_run_and_attach_gain does.
                continue
            qid, tweet, epoch, _runtag = fields
            if track == 'mb15':
                qid = qid.replace("MB", "")
            if qid not in matches:
                continue
            if qid not in query_durns:
                # Logged for diagnosis; the lookup below still raises KeyError.
                logger.error(
                    'qid {} not in query_durns: qid in matches {}'.format(
                        qid, qid in matches))
            epoch = float(epoch) - query_durns[qid][0]

            matching_nuggets = []
            if tweet in matches[qid]:
                mc = matches[qid][tweet]
                if qid in nuggets and mc in nuggets[qid]:
                    mcgain, mctime = nuggets[qid][mc]
                    matching_nuggets.append(Nugget(mc, mcgain, mctime))

            # Confidence is fixed at 1.0 and the length at 140 / 5.1
            # (presumably a 140-character tweet over an average word length
            # -- TODO confirm); the last argument is an empty nugget string.
            updobj = Update(qid, tweet, epoch, 1.0, 140 / 5.1,
                            len(matching_nuggets), "")
            updobj.nuggets = matching_nuggets
            run.setdefault(qid, []).append(updobj)
    return run