Exemple #1
0
 def letor_producer(X, ixs):
     """Yield (target, group_id, row, ids) for each sample.

     A new group id is issued whenever the (qid, synid) pair differs from
     the previous row's, so consecutive rows of the same query/synonym
     share one id (LETOR-style grouping).
     """
     group_id = 0
     prev_key = (None, None)
     for ids, row in tools.tqdm(zip(ixs, X), total=len(X)):
         qid, synid, fid, target = ids
         key = (qid, synid)
         if key != prev_key:
             group_id += 1
         prev_key = key
         yield target, group_id, row, ids
Exemple #2
0
    def iter_sents(ets, total):
        """Yield tokenized, number-normalized word lists for each etalon:
        its name, its description (when present), and every synonym and
        mistype name."""
        for et in tools.tqdm(ets, total=total):
            yield tools.replace_num(tokenize(et['name']).split())

            raw_desc = et.get('description')
            if raw_desc:
                yield tools.replace_num(tokenize(raw_desc).split())

            for syn in et.get('synonyms', []) + et.get('mistypes', []):
                yield tools.replace_num(tokenize(syn['name']).split())

            desc = et.get('description')
            if desc:
                desc = tokenize(desc)
                yield tools.replace_num(desc.split())

            for syn in et.get('synonyms', []) + et.get('mistypes', []):
                sname = tokenize(syn['name'])
                yield tools.replace_num(sname.split())
Exemple #3
0
def ranker_predict(ranks, dmatrix, groups):
    """Evaluate ranking predictions per query group.

    For every group, labels are ordered by descending predicted rank; the
    position of the first relevant document (label == 1) is recorded, along
    with NDCG@k for several cutoffs.

    Returns:
        scores: dict mapping 'ndcg@k' to the rounded mean NDCG over groups.
        positions: pandas Series of first-relevant positions, one per group.
    """
    labels = dmatrix.get_label()
    first_hit_positions = []
    offset = 0
    ks = [1, 2, 5, 6, 7, 10]
    ndcgs = {k: [] for k in ks}

    for group_size in tools.tqdm(groups):
        group_labels = labels[offset:offset + group_size]
        group_ranks = ranks[offset:offset + group_size]
        # labels reordered by descending predicted score
        ordered = group_labels[np.argsort(group_ranks)[::-1]]
        # index of the first relevant document in the ranked order
        first_hit_positions.append(np.where(ordered == 1)[0][0])
        for k in ks:
            ndcgs[k].append(ndcg_at_k(ordered, k, method=1))
        offset += group_size

    scores = {'ndcg@%d' % k: np.round(np.mean(ndcgs[k]), 4) for k in ks}
    return scores, pd.Series(first_hit_positions)
Exemple #4
0
    def gen_data(self, cur_samples):
        """Yield a QDInfo (query terms, document terms, index columns) for
        each row of cur_samples.

        Rows are skipped when a negative pair has identical query/document
        text, or when either side tokenizes to nothing.
        """
        qid2text = self.qid2text
        fid2text = self.fid2text
        sid2text = self.sid2text
        rows = tools.tqdm(cur_samples.itertuples(), total=len(cur_samples))
        for row in rows:
            # synonym text takes precedence over the query text when present
            if pd.notna(row.synid):
                q_terms = sid2text[row.synid].split()
            else:
                q_terms = qid2text[row.qid].split()
            d_terms = fid2text[row.fid].split()

            # identical texts labeled non-duplicate are contradictory — drop
            if row.target == 0 and q_terms == d_terms:
                continue
            # nothing to compare if either side is empty
            if not q_terms or not d_terms:
                continue

            # TODO: add DNN features: brands ...
            yield QDInfo(q_terms=q_terms,
                         d_terms=d_terms,
                         ixs=[getattr(row, c) for c in self.colnames])
Exemple #5
0
def organization_info():
    """Build a (qid, synid, org) DataFrame from the Mongo etalons feed.

    The main organization is parsed from the etalon's comment field, which
    starts with '{' and is '|'-separated; each synonym either inherits it
    (comment == 'merged with master') or carries its own organization name
    in its comment. A row with synid=None holds the etalon's own org.
    """
    client = MongoClient(FLAGS.mongo_host)
    db = client[FLAGS.feed_db]
    n_docs = db.etalons.count_documents({})

    records = []
    for et in tools.tqdm(db.etalons.find({}), total=n_docs):
        assert et['comment'].startswith('{')
        main_org = et['comment'].split('|')[0].lstrip('{')
        for syn in et.get('synonyms', []):
            syn_comment = syn['comment']
            org = main_org if syn_comment == 'merged with master' else syn_comment
            records.append((et['_id'], syn['id'], org))

        records.append((et['_id'], None, main_org))

    frame = pd.DataFrame(records)
    frame.columns = ('qid', 'synid', 'org')
    return frame
Exemple #6
0
def topn_precision_recall_curve(ftest, topns, n_thresholds=100, tag=''):
    """Compute and plot precision/recall over probability thresholds for
    several top-n cutoffs of a duplicate-search ranking.

    Args:
        ftest: DataFrame with at least qid, synid, prob, target columns.
            NOTE: mutated in place (NaNs filled with -1).
        topns: iterable of top-n cutoffs to evaluate.
        n_thresholds: number of threshold steps over [0, 1].
        tag: suffix appended to plot and CSV file names.

    Side effects: saves one plot per top-n and writes
    ../data/dedup/prec_recall_top{n}{tag}.csv (tab-separated).
    """
    # the tiny 0.1**9 epsilon pushes the endpoint so 1.0 itself is included
    thresholds = np.arange(0, 1 + 0.1**9, 1 / n_thresholds)
    scores = []

    ftest.fillna(-1, inplace=True)

    # one group per (query, synonym) pair, in original row order
    for (qid, sid), g in tools.tqdm(ftest.groupby(['qid', 'synid'],
                                                  sort=False)):
        gsort = g.sort_values('prob', ascending=False)
        tmax_global = gsort['target'].max()  # truthy if a true duplicate exists at all
        pmax = gsort['prob'].iloc[0]         # highest predicted probability in the group

        tp_fp_tn_fn_topns = []
        for topn in topns:
            # truthy if the true duplicate is within the top-n predictions
            tmax = gsort['target'][:topn].max()
            tp_fp_tn_fn = []
            for thres in thresholds:
                if tmax_global:  # the query has a duplicate in the release
                    if pmax >= thres:  # the result list is non-empty
                        if tmax:  # the true duplicate made it into the top-n
                            tp_fp_tn_fn.append([1, 0, 0, 0])
                        else:
                            tp_fp_tn_fn.append([0, 1, 0, 0])
                    else:  # the result list is empty
                        tp_fp_tn_fn.append([0, 0, 0, 1])
                else:  # the query has no duplicate in the release
                    if pmax >= thres:  # the result list is non-empty
                        tp_fp_tn_fn.append([0, 1, 0, 0])
                    else:  # the result list is empty
                        tp_fp_tn_fn.append([0, 0, 1, 0])

            tp_fp_tn_fn_topns.append(tp_fp_tn_fn)

        scores.append(tp_fp_tn_fn_topns)

    # shape: (n_groups, len(topns), len(thresholds), 4)
    scores = np.array(scores, dtype=np.int64)

    for i, topn in enumerate(topns):
        precision, recall = [], []
        for j in range(scores.shape[2]):
            # aggregate confusion counts over all groups at this threshold
            tp, fp, tn, fn = scores[:, i, j, :].sum(axis=0)
            if tp + fp != 0:
                precision.append(tp / (tp + fp))
            else:
                precision.append(1)  # no positive predictions: precision defined as 1
            recall.append(tp / (tp + fn))

        recall = np.array(recall)
        recall_scale = get_recall_test_scale()
        plot_precision_recall_straight(precision,
                                       recall,
                                       tag='_top%d%s' % (topn, tag),
                                       recall_scale=recall_scale,
                                       average_precision=np.mean(precision),
                                       prefix='Top %d: ' % topn)

        df = pd.DataFrame([precision, list(recall * recall_scale)]).T
        df.columns = ('precision', 'recall')
        df.to_csv('../data/dedup/prec_recall_top%d%s.csv' % (topn, tag),
                  index=False,
                  sep='\t')