import numpy as np
import pandas as pd
from pymongo import MongoClient

import tools  # project-local helpers (tqdm wrapper, replace_num, ...)

# tokenize, ndcg_at_k, FLAGS, QDInfo, get_recall_test_scale and
# plot_precision_recall_straight are assumed to come from other
# project-local modules not shown in this section.


def letor_producer(X, ixs):
    # Assign a consecutive group id to every (qid, synid) query; rows are
    # assumed to be pre-sorted so that all rows of one query are contiguous.
    _id = 0
    qid_prev, synid_prev = None, None
    for ids, row in tools.tqdm(zip(ixs, X), total=len(X)):
        qid, synid, fid, target = ids
        if (qid_prev, synid_prev) != (qid, synid):
            _id += 1
            qid_prev, synid_prev = qid, synid
        yield target, _id, row, ids
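# Hypothetical usage sketch (not in the original source): consume
# letor_producer to collect per-query group sizes in the layout that
# xgboost's DMatrix.set_group() expects.
def letor_groups(X, ixs):
    counts = {}
    for target, _id, row, ids in letor_producer(X, ixs):
        counts[_id] = counts.get(_id, 0) + 1
    return [counts[k] for k in sorted(counts)]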
def iter_sents(ets, total):
    for et in tools.tqdm(ets, total=total):
        name = tokenize(et['name'])
        yield tools.replace_num(name.split())
        desc = et.get('description')
        if desc:
            desc = tokenize(desc)
            yield tools.replace_num(desc.split())
        for syn in et.get('synonyms', []) + et.get('mistypes', []):
            sname = tokenize(syn['name'])
            yield tools.replace_num(sname.split())
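# Hypothetical usage sketch (assumption, not in the original source):
# iter_sents yields token lists, the format gensim's Word2Vec consumes.
# `ets` could be a Mongo cursor over etalons; gensim >= 4 is assumed
# (older releases spell vector_size as size).
from gensim.models import Word2Vec

def train_w2v(ets, total):
    sents = list(iter_sents(ets, total))  # materialize: Word2Vec makes several passes
    return Word2Vec(sents, vector_size=100, window=5, min_count=2, workers=4)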
def ranker_predict(ranks, dmatrix, groups):
    # Evaluate ranking scores group by group: record the position at which
    # the true duplicate lands and nDCG@k for several cutoffs.
    y = dmatrix.get_label()
    positions = []
    ix_prev = 0
    scores = {i: [] for i in [1, 2, 5, 6, 7, 10]}
    for gcount in tools.tqdm(groups):
        y_cur = y[ix_prev:ix_prev + gcount]
        r = ranks[ix_prev:ix_prev + gcount]
        # labels reordered by descending score
        rsorted = y_cur[np.argsort(r)[::-1]]
        ix = np.where(rsorted == 1)[0][0]
        positions.append(ix)
        for k in scores.keys():
            val = ndcg_at_k(rsorted, k, method=1)
            scores[k].append(val)
        ix_prev += gcount
    for k in list(scores.keys()):
        scores['ndcg@%d' % k] = np.round(np.mean(scores.pop(k)), 4)
    positions = pd.Series(positions)
    return scores, positions
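# Hypothetical usage sketch (assumption, not in the original source): score a
# held-out LETOR set with a trained xgboost ranker (any rank:* objective) and
# hand the raw scores to ranker_predict. `groups` lists per-query row counts
# in the same order as the rows of X_test.
import xgboost as xgb

def evaluate_ranker(bst, X_test, y_test, groups):
    dtest = xgb.DMatrix(X_test, label=y_test)
    dtest.set_group(groups)
    ranks = bst.predict(dtest)  # one score per row
    return ranker_predict(ranks, dtest, groups)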
def gen_data(self, cur_samples):
    qid2text, fid2text, sid2text = self.qid2text, self.fid2text, self.sid2text
    for row in tools.tqdm(cur_samples.itertuples(), total=len(cur_samples)):
        # query side: prefer the synonym text when a synid is present
        if pd.notna(row.synid):
            q_terms = sid2text[row.synid].split()
        else:
            q_terms = qid2text[row.qid].split()
        d_terms = fid2text[row.fid].split()
        # identical texts labeled as non-duplicates are noise; drop them
        if row.target == 0 and q_terms == d_terms:
            continue
        # skip pairs where either side is empty
        if len(q_terms) * len(d_terms) == 0:
            continue
        qdi = QDInfo(q_terms=q_terms, d_terms=d_terms,
                     ixs=[getattr(row, c) for c in self.colnames])
        # TODO: add DNN features: brands ...
        yield qdi
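# Assumption: QDInfo is defined elsewhere in the repo; a minimal stand-in
# compatible with how gen_data constructs and yields it would be:
from collections import namedtuple

QDInfo = namedtuple('QDInfo', ['q_terms', 'd_terms', 'ixs'])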
def organization_info():
    client = MongoClient(FLAGS.mongo_host)
    db = client[FLAGS.feed_db]
    total = db.etalons.count_documents({})
    qs2org = []
    for et in tools.tqdm(db.etalons.find({}), total=total):
        # the main organization is encoded as '{org|...' in the comment field
        assert et['comment'].startswith('{')
        main_org = et['comment'].split('|')[0].lstrip('{')
        syns = et.get('synonyms', [])
        for s in syns:
            comment = s['comment']
            if comment == 'merged with master':
                org = main_org
            else:
                org = comment
            qs2org.append((et['_id'], s['id'], org))
        qs2org.append((et['_id'], None, main_org))
    qs2org = pd.DataFrame(qs2org)
    qs2org.columns = ('qid', 'synid', 'org')
    return qs2org
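# Hypothetical usage sketch (not in the original source): attach the org
# column to a samples frame keyed by (qid, synid). Relies on pandas merge
# matching NaN join keys, so rows with a missing synid join the main org.
def with_orgs(samples):
    qs2org = organization_info()
    return samples.merge(qs2org, on=['qid', 'synid'], how='left')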
def topn_precision_recall_curve(ftest, topns, n_thresholds=100, tag=''):
    thresholds = np.arange(0, 1 + 0.1**9, 1 / n_thresholds)
    scores = []
    ftest.fillna(-1, inplace=True)
    for (qid, sid), g in tools.tqdm(ftest.groupby(['qid', 'synid'], sort=False)):
        gsort = g.sort_values('prob', ascending=False)
        tmax_global = gsort['target'].max()
        pmax = gsort['prob'].iloc[0]
        tp_fp_tn_fn_topns = []
        for topn in topns:
            tmax = gsort['target'][:topn].max()
            tp_fp_tn_fn = []
            for thres in thresholds:
                if tmax_global:  # the query has a duplicate in the release
                    if pmax >= thres:  # non-empty output
                        if tmax:  # the true duplicate made it into the top-N
                            tp_fp_tn_fn.append([1, 0, 0, 0])
                        else:
                            tp_fp_tn_fn.append([0, 1, 0, 0])
                    else:  # empty output
                        tp_fp_tn_fn.append([0, 0, 0, 1])
                else:  # the query has no duplicate in the release
                    if pmax >= thres:  # non-empty output
                        tp_fp_tn_fn.append([0, 1, 0, 0])
                    else:  # empty output
                        tp_fp_tn_fn.append([0, 0, 1, 0])
            tp_fp_tn_fn_topns.append(tp_fp_tn_fn)
        scores.append(tp_fp_tn_fn_topns)
    scores = np.array(scores, dtype=np.int64)
    for i, topn in enumerate(topns):
        precision, recall = [], []
        for j in range(scores.shape[2]):
            tp, fp, tn, fn = scores[:, i, j, :].sum(axis=0)
            if tp + fp != 0:
                precision.append(tp / (tp + fp))
            else:
                precision.append(1)
            recall.append(tp / (tp + fn))
        recall = np.array(recall)
        recall_scale = get_recall_test_scale()
        plot_precision_recall_straight(precision, recall,
                                       tag='_top%d%s' % (topn, tag),
                                       recall_scale=recall_scale,
                                       average_precision=np.mean(precision),
                                       prefix='Top %d: ' % topn)
        df = pd.DataFrame([precision, list(recall * recall_scale)]).T
        df.columns = ('precision', 'recall')
        df.to_csv('../data/dedup/prec_recall_top%d%s.csv' % (topn, tag),
                  index=False, sep='\t')
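# Hypothetical smoke test (assumption, not in the original source): the
# expected input is one row per candidate with qid/synid/prob/target columns.
# Note the function has side effects: it plots and writes CSVs under
# ../data/dedup/.
def smoke_test():
    ftest = pd.DataFrame({
        'qid':    [1, 1, 1, 2, 2],
        'synid':  [10, 10, 10, None, None],
        'prob':   [0.9, 0.4, 0.1, 0.3, 0.2],
        'target': [1, 0, 0, 0, 0],
    })
    topn_precision_recall_curve(ftest, topns=[1, 5])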