Example #1
    def sentences_long_to_matrix(self, title1, title2):

        if self.use_emb:
            twords1 = self.tokenizer.texts_to_sequences(
                [title1])[0][:self.matrix_size_1_long]
            twords2 = self.tokenizer.texts_to_sequences(
                [title2])[0][:self.matrix_size_1_long]
        else:
            twords1 = feature_utils.get_words(title1)[:self.matrix_size_1_long]
            twords2 = feature_utils.get_words(title2)[:self.matrix_size_1_long]
        # print("twords1", twords1)
        # print("twords2", twords2)

        matrix = -np.ones((self.matrix_size_1_long, self.matrix_size_1_long))
        for i, word1 in enumerate(twords1):
            for j, word2 in enumerate(twords2):
                v = -1
                if word1 == word2:
                    v = 1
                elif self.use_emb:
                    v = cosine_similarity(
                        self.pretrain_emb[word1].reshape(1, -1),
                        self.pretrain_emb[word2].reshape(1, -1))[0][0]
                    # print("cos", v)
                matrix[i][j] = v
        return matrix
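The method above builds a word-level similarity grid between two titles. A minimal standalone sketch of the same idea, assuming whitespace tokenization in place of feature_utils.get_words and a fixed size of 10 (both assumptions):

import numpy as np

def word_match_matrix(title1, title2, size=10):
    # Truncate both token lists to the fixed matrix size.
    words1 = title1.lower().split()[:size]
    words2 = title2.lower().split()[:size]
    # Cells default to -1; exact token matches become 1.
    matrix = -np.ones((size, size))
    for i, w1 in enumerate(words1):
        for j, w2 in enumerate(words2):
            if w1 == w2:
                matrix[i, j] = 1
    return matrix

print(word_match_matrix("deep learning for text matching",
                        "text matching with deep learning"))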
Example #2
def aff_keyword_method():
    pairs, labels = load_aff_data()
    out_dir = join(settings.OUT_DIR, "aff")
    with open(join(out_dir, "aff_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    vocab = vectorizer.vocabulary_
    idf = vectorizer.idf_

    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)

    # One score per pair: IDF-weighted Jaccard similarity of the word lists.
    features = []
    for i, p in enumerate(pairs):
        aff1 = p[0]["name"].lower()
        aff2 = p[1]["DisplayName"].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_idf = sum(
            intersec[w] * idf[vocab[w]] for w in intersec if w in vocab)
        aff1_idf = sum(idf[vocab[w]] for w in aff1_words if w in vocab)
        aff2_idf = sum(idf[vocab[w]] for w in aff2_words if w in vocab)
        denom = aff1_idf + aff2_idf - and_idf
        features.append(and_idf / denom if denom else 0.0)

    # Convert to arrays so the boolean masking below works.
    features = np.array(features)
    labels = np.array(labels)

    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    features_valid = features[n_train:(n_valid + n_train)]
    labels_valid = labels[n_train:(n_valid + n_train)]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]

    # Pick the threshold that maximizes F1 on the validation split.
    precs, recs, thrs = precision_recall_curve(labels_valid, features_valid)
    with np.errstate(divide="ignore", invalid="ignore"):
        f1s = 2 * precs * recs / (precs + recs)
    f1s = f1s[:-1]
    thrs = thrs[~np.isnan(f1s)]
    f1s = f1s[~np.isnan(f1s)]
    best_thr = thrs[np.argmax(f1s)]
    print("best thr", best_thr)

    y_pred = np.zeros_like(labels_test)
    y_pred[features_test > best_thr] = 1

    prec, rec, f1, _ = precision_recall_fscore_support(labels_test,
                                                       y_pred,
                                                       average="binary")
    auc = roc_auc_score(labels_test, features_test)
    print("AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f" % (auc, prec, rec, f1))
Example #3
    def sentences_short_to_matrix(self, title1, title2):
        twords1 = feature_utils.get_words(title1)[:self.matrix_size_2_short]
        twords2 = feature_utils.get_words(title2)[:self.matrix_size_2_short]

        # Binary match matrix: 1 where tokens are identical, -1 elsewhere.
        matrix = -np.ones((self.matrix_size_2_short, self.matrix_size_2_short))
        for i, word1 in enumerate(twords1):
            for j, word2 in enumerate(twords2):
                matrix[i, j] = 1 if word1 == word2 else -1
        return matrix
Example #4
def aff_svm():
    pairs, labels = load_aff_data()
    out_dir = join(settings.OUT_DIR, "aff")
    with open(join(out_dir, "aff_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    vocab = vectorizer.vocabulary_

    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)

    # Two features per pair: unweighted Jaccard similarity over in-vocabulary
    # words, and TF-IDF cosine similarity of the two affiliation strings.
    features = []
    for i, p in enumerate(pairs):
        aff1 = p[0]["name"].lower()
        aff2 = p[1]["DisplayName"].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_cnt = sum(intersec[w] for w in intersec if w in vocab)
        aff1_cnt = sum(1 for w in aff1_words if w in vocab)
        aff2_cnt = sum(1 for w in aff2_words if w in vocab)
        denom = aff1_cnt + aff2_cnt - and_cnt
        cur_jac = and_cnt / denom if denom else 0.0

        # transform (not fit_transform), so the pickled IDF weights are kept.
        aff_idf_vec = vectorizer.transform([aff1, aff2])
        cos = cosine_similarity(aff_idf_vec)
        features.append([cur_jac, cos[0, 1]])

    features = np.array(features)
    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    features_train = features[:n_train + n_valid]
    labels_train = labels[:n_train + n_valid]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    clf = svm.SVC()
    clf.fit(features_train, labels_train)
    y_pred = clf.predict(features_test)

    prec, rec, f1, _ = precision_recall_fscore_support(labels_test,
                                                       y_pred,
                                                       average="binary")
    print(prec, rec, f1)
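For reference, a compact sketch of the same two-feature SVM setup on synthetic data (features, labels, and split sizes are made up):

import numpy as np
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support

rng = np.random.RandomState(0)
# Two features per pair, e.g. [jaccard, cosine]; synthetic here.
features = rng.rand(100, 2)
labels = (features.sum(axis=1) > 1.0).astype(int)

clf = svm.SVC()
clf.fit(features[:80], labels[:80])
y_pred = clf.predict(features[80:])
prec, rec, f1, _ = precision_recall_fscore_support(
    labels[80:], y_pred, average="binary")
print(prec, rec, f1)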
Example #5
    def titles_to_matrix(self, title1, title2):
        if self.use_emb:
            twords1 = self.tokenizer.texts_to_sequences([title1])[0][: self.matrix_title_size]
            twords2 = self.tokenizer.texts_to_sequences([title2])[0][: self.matrix_title_size]
        else:
            twords1 = feature_utils.get_words(title1)[: self.matrix_title_size]
            twords2 = feature_utils.get_words(title2)[: self.matrix_title_size]

        # Pairwise similarity matrix: 1 for exact token matches, embedding
        # cosine similarity otherwise (or -1 when embeddings are disabled).
        matrix = -np.ones((self.matrix_title_size, self.matrix_title_size))
        for i, word1 in enumerate(twords1):
            for j, word2 in enumerate(twords2):
                v = -1
                if word1 == word2:
                    v = 1
                elif self.use_emb:
                    v = cosine_similarity(self.pretrain_emb[word1].reshape(1, -1),
                                          self.pretrain_emb[word2].reshape(1, -1))[0][0]
                matrix[i, j] = v
        return matrix
Example #6
    def org_to_matrix(self, title1, title2, max_size):
        twords1 = feature_utils.get_words(title1, remove_stopwords=True)[:max_size]
        twords2 = feature_utils.get_words(title2, remove_stopwords=True)[:max_size]

        # Binary match matrix plus nn1, the number of exact word matches.
        # Alternative: soft matching via sim_utils.sim_ngrams(word1, word2),
        # rescaled to [-1, 1].
        matrix = -np.ones((max_size, max_size))
        nn1 = 0
        for i, word1 in enumerate(twords1):
            for j, word2 in enumerate(twords2):
                v = 1 if word1 == word2 else -1
                if v == 1:
                    nn1 += 1
                matrix[i, j] = v

        return matrix, nn1
Example #7
    def get_candidates_by_inverted_index(self, npaper, word2ids):
        # Rank candidate papers by how many title words they share with npaper.
        title = npaper['title'].lower()
        words = feature_utils.get_words(title, window=self.build_index_window)
        cids_to_freq = dd(int)
        for word in words:
            if word in word2ids:
                for cid in word2ids[word]:
                    cids_to_freq[cid] += 1
        # Keep the 20 most frequently hit candidate ids.
        sorted_items = sorted(cids_to_freq.items(), key=lambda kv: kv[1], reverse=True)[:20]
        cand_cids = [item[0] for item in sorted_items]
        return cand_cids
Example #8
    def sentences_short_to_matrix(self, title1, title2):
        # print("short---", title1, "v.s.", title2)
        if self.use_emb:
            twords1 = self.tokenizer.texts_to_sequences([title1])[0][: self.matrix_size_2_short]
            twords2 = self.tokenizer.texts_to_sequences([title2])[0][: self.matrix_size_2_short]
        else:
            twords1 = feature_utils.get_words(title1)[: self.matrix_size_2_short]
            twords2 = feature_utils.get_words(title2)[: self.matrix_size_2_short]

        # Pairwise similarity matrix: 1 for exact token matches, embedding
        # cosine similarity otherwise (or -1 when embeddings are disabled).
        matrix = -np.ones((self.matrix_size_2_short, self.matrix_size_2_short))
        for i, word1 in enumerate(twords1):
            for j, word2 in enumerate(twords2):
                v = -1
                if word1 == word2:
                    v = 1
                elif self.use_emb:
                    v = cosine_similarity(self.pretrain_emb[word1].reshape(1, -1),
                                          self.pretrain_emb[word2].reshape(1, -1))[0][0]
                matrix[i, j] = v
        return matrix
Example #9
    def build_cpapers_inverted_index(self):
        logger.info('build inverted index for cpapers')
        cpapers_train = data_utils.load_json_lines(self.file_dir, 'clean-papers-train.dat')
        cpapers_test = data_utils.load_json_lines(self.file_dir, 'clean-papers-test.dat')
        papers = cpapers_train + cpapers_test
        # Map each title word to the deduplicated ids of papers containing it.
        word2ids = dd(list)
        for paper in papers:
            pid = str(paper['id'])
            title = paper['title']
            words = feature_utils.get_words(title.lower(), window=self.build_index_window)
            for word in words:
                word2ids[word].append(pid)
        for word in word2ids:
            word2ids[word] = list(set(word2ids[word]))
        # Optionally persist:
        # data_utils.dump_json(word2ids, self.file_dir, 'clean-papers-inverted-index.json')
        logger.info('building inverted index completed')
        return word2ids
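Examples #7 and #9 work together: #9 builds the word-to-paper-id index, and #7 queries it for candidates. A minimal standalone sketch of the same build-and-query loop, with whitespace tokenization standing in for feature_utils.get_words (an assumption):

from collections import defaultdict

def build_index(papers):
    # Map each title word to the set of ids of papers containing it.
    word2ids = defaultdict(set)
    for paper in papers:
        for word in paper["title"].lower().split():
            word2ids[word].add(str(paper["id"]))
    return word2ids

def get_candidates(title, word2ids, top_k=20):
    # Count, for each candidate paper, how many query words hit it,
    # then keep the top_k most frequently hit candidates.
    freq = defaultdict(int)
    for word in title.lower().split():
        for pid in word2ids.get(word, ()):
            freq[pid] += 1
    ranked = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
    return [pid for pid, _ in ranked[:top_k]]

index = build_index([{"id": 1, "title": "Deep Learning for Entity Matching"},
                     {"id": 2, "title": "Learning to Rank"}])
print(get_candidates("deep learning survey", index))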
Example #10
def venue_svm():
    pairs = load_venue_data()
    out_dir = join(settings.OUT_DIR, "venue")
    with open(join(out_dir, "venue_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    vocab = vectorizer.vocabulary_

    train_num = 800
    test_num = 200

    # Balance the classes: keep as many negative pairs as positive pairs.
    n_pos_set = int((train_num + 2 * test_num) / 2)

    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = pos_pairs + neg_pairs

    train_data = sklearn.utils.shuffle(train_data, random_state=37)

    labels = [x[0] for x in train_data]

    features = []
    for i, p in enumerate(train_data):
        aff1 = p[2].lower()
        aff2 = p[1].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        # Unweighted Jaccard similarity over in-vocabulary words.
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_cnt = sum(intersec[w] for w in intersec if w in vocab)
        aff1_cnt = sum(1 for w in aff1_words if w in vocab)
        aff2_cnt = sum(1 for w in aff2_words if w in vocab)
        denom = aff1_cnt + aff2_cnt - and_cnt
        cur_jac = and_cnt / denom if denom else 0.0

        # transform (not fit_transform), so the pickled IDF weights are kept.
        aff_idf_vec = vectorizer.transform([aff1, aff2])
        cos = cosine_similarity(aff_idf_vec)

        # Restrict both venue strings to their shared words and recompute
        # Jaccard and cosine on the overlap only.
        cur_v_mag = aff1_words
        cur_v_aminer = aff2_words
        overlap = set(cur_v_mag).intersection(cur_v_aminer)
        new_seq_mag = [w for w in cur_v_mag if w in overlap]
        new_seq_aminer = [w for w in cur_v_aminer if w in overlap]

        intersec = Counter(new_seq_aminer) & Counter(new_seq_mag)
        and_cnt_key = sum(intersec[w] for w in intersec if w in vocab)
        aminer_cnt_key = sum(1 for w in new_seq_aminer if w in vocab)
        mag_cnt_key = sum(1 for w in new_seq_mag if w in vocab)
        denom_key = aminer_cnt_key + mag_cnt_key - and_cnt_key
        if denom_key != 0:
            cur_jac_key = and_cnt_key / denom_key
            aff_idf_vec = vectorizer.transform(
                [" ".join(new_seq_aminer), " ".join(new_seq_mag)])
            cos_key = cosine_similarity(aff_idf_vec)[0, 1]
        else:
            cur_jac_key = 0
            cos_key = -1

        # The overlap-restricted features are computed but not used below:
        # cur_feat = [cur_jac, cos[0, 1], cur_jac_key, cos_key]
        cur_feat = [cur_jac, cos[0, 1]]
        features.append(cur_feat)

    features = np.array(features)
    n_train = train_num
    n_valid = test_num
    features_train = features[:n_train + n_valid]
    labels_train = labels[:n_train + n_valid]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    print("n_test", len(labels_test))
    clf = svm.SVC()
    clf.fit(features_train, labels_train)
    y_pred = clf.predict(features_test)

    prec, rec, f1, _ = precision_recall_fscore_support(labels_test,
                                                       y_pred,
                                                       average="binary")
    print(prec, rec, f1)
Example #11
def venue_keyword_method():
    pairs = load_venue_data()
    out_dir = join(settings.OUT_DIR, "venue")
    with open(join(out_dir, "venue_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    vocab = vectorizer.vocabulary_
    idf = vectorizer.idf_

    train_num = 800
    test_num = 200

    # Balance the classes: keep as many negative pairs as positive pairs.
    n_pos_set = int((train_num + 2 * test_num) / 2)

    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = pos_pairs + neg_pairs

    train_data = sklearn.utils.shuffle(train_data, random_state=37)

    labels = [x[0] for x in train_data]

    # One score per pair: IDF-weighted Jaccard similarity of the word lists.
    features = []
    for i, p in enumerate(train_data):
        aff1 = p[2].lower()
        aff2 = p[1].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_idf = sum(
            intersec[w] * idf[vocab[w]] for w in intersec if w in vocab)
        aff1_idf = sum(idf[vocab[w]] for w in aff1_words if w in vocab)
        aff2_idf = sum(idf[vocab[w]] for w in aff2_words if w in vocab)
        denom = aff1_idf + aff2_idf - and_idf
        features.append(and_idf / denom if denom else 0.0)

    # Convert to arrays so the boolean masking below works.
    features = np.array(features)
    labels = np.array(labels)

    n_train = train_num
    n_valid = test_num
    features_valid = features[n_train:(n_valid + n_train)]
    labels_valid = labels[n_train:(n_valid + n_train)]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]

    print("valid", len(labels_valid), "test", len(labels_test))

    # Pick the threshold that maximizes F1 on the validation split.
    precs, recs, thrs = precision_recall_curve(labels_valid, features_valid)
    with np.errstate(divide="ignore", invalid="ignore"):
        f1s = 2 * precs * recs / (precs + recs)
    f1s = f1s[:-1]
    thrs = thrs[~np.isnan(f1s)]
    f1s = f1s[~np.isnan(f1s)]
    best_thr = thrs[np.argmax(f1s)]
    print("best thr", best_thr)

    y_pred = np.zeros_like(labels_test)
    y_pred[features_test > best_thr] = 1

    prec, rec, f1, _ = precision_recall_fscore_support(labels_test,
                                                       y_pred,
                                                       average="binary")
    auc = roc_auc_score(labels_test, features_test)
    print("AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f" % (auc, prec, rec, f1))