def sentences_long_to_matrix(self, title1, title2):
    """Build a (matrix_size_1_long x matrix_size_1_long) word-match matrix.

    Cell (i, j) is 1 when token i of title1 equals token j of title2,
    the embedding cosine similarity when use_emb is set and the tokens
    differ, and -1 otherwise (including padding cells past either
    title's length).
    """
    limit = self.matrix_size_1_long
    if self.use_emb:
        seq1 = self.tokenizer.texts_to_sequences([title1])[0][:limit]
        seq2 = self.tokenizer.texts_to_sequences([title2])[0][:limit]
    else:
        seq1 = feature_utils.get_words(title1)[:limit]
        seq2 = feature_utils.get_words(title2)[:limit]
    matrix = -np.ones((limit, limit))
    for i, w1 in enumerate(seq1):
        for j, w2 in enumerate(seq2):
            if w1 == w2:
                matrix[i, j] = 1
            elif self.use_emb:
                matrix[i, j] = cosine_similarity(
                    self.pretrain_emb[w1].reshape(1, -1),
                    self.pretrain_emb[w2].reshape(1, -1))[0][0]
    return matrix
def aff_keyword_method():
    """Evaluate idf-weighted Jaccard similarity for affiliation matching.

    Loads labeled (AMiner, MAG) affiliation pairs and a pre-fitted tf-idf
    vectorizer, scores each pair with an idf-weighted Jaccard over word
    overlap, tunes a decision threshold on the validation split (60/20/20),
    and prints AUC/Prec/Rec/F1 on the test split.
    """
    pairs, labels = load_aff_data()
    out_dir = join(settings.OUT_DIR, "aff")
    with open(join(out_dir, "aff_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    print(vectorizer.vocabulary_)
    vocab = vectorizer.vocabulary_
    idf = vectorizer.idf_
    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)
    features = []
    for i, p in enumerate(pairs):
        aff1 = p[0]["name"].lower()
        aff2 = p[1]["DisplayName"].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        # idf-weighted intersection / union (Jaccard) over in-vocab words.
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_idf = sum(intersec[w] * idf[vocab[w]] for w in intersec if w in vocab)
        aff1_idf = sum(idf[vocab[w]] for w in aff1_words if w in vocab)
        aff2_idf = sum(idf[vocab[w]] for w in aff2_words if w in vocab)
        denom = aff1_idf + aff2_idf - and_idf
        # Guard: a pair with no in-vocabulary words would divide by zero.
        cur_jac = and_idf / denom if denom else 0.0
        features.append(cur_jac)
    # Convert to arrays: the boolean-mask assignment below is a TypeError
    # on plain Python lists (list > float is unsupported).
    features = np.array(features)
    labels = np.array(labels)
    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    features_valid = features[n_train:(n_valid + n_train)]
    labels_valid = labels[n_train:(n_valid + n_train)]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    precs, recs, thrs = precision_recall_curve(labels_valid, features_valid)
    f1s = 2 * precs * recs / (precs + recs)
    f1s = f1s[:-1]  # precision_recall_curve returns one more point than thresholds
    thrs = thrs[~np.isnan(f1s)]
    f1s = f1s[~np.isnan(f1s)]
    best_thr = thrs[np.argmax(f1s)]
    print("best thr", best_thr)
    y_pred = np.zeros_like(labels_test)
    y_pred[features_test > best_thr] = 1
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels_test, y_pred, average="binary")
    auc = roc_auc_score(labels_test, features_test)
    print("AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f" % (auc, prec, rec, f1))
def sentences_short_to_matrix(self, title1, title2):
    """Binary word-match matrix for two short sentences.

    Cell (i, j) is 1 iff token i of title1 equals token j of title2;
    all other cells (including padding) stay -1.
    """
    size = self.matrix_size_2_short
    words1 = feature_utils.get_words(title1)[:size]
    words2 = feature_utils.get_words(title2)[:size]
    matrix = -np.ones((size, size))
    for i, w1 in enumerate(words1):
        for j, w2 in enumerate(words2):
            if w1 == w2:
                matrix[i, j] = 1
    return matrix
def aff_svm():
    """Train and evaluate an SVM on affiliation-pair features.

    Features per pair: unweighted word Jaccard over in-vocab words, and
    cosine similarity of the pretrained tf-idf vectors. Trains on the
    first 80% of shuffled pairs and prints Prec/Rec/F1 on the last 20%.
    """
    pairs, labels = load_aff_data()
    out_dir = join(settings.OUT_DIR, "aff")
    with open(join(out_dir, "aff_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    print(vectorizer.vocabulary_)
    vocab = vectorizer.vocabulary_
    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)
    features = []
    for i, p in enumerate(pairs):
        aff1 = p[0]["name"].lower()
        aff2 = p[1]["DisplayName"].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        # Unweighted Jaccard over in-vocabulary words.
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_cnt = sum(intersec[w] for w in intersec if w in vocab)
        aff1_cnt = sum(1 for w in aff1_words if w in vocab)
        aff2_cnt = sum(1 for w in aff2_words if w in vocab)
        denom = aff1_cnt + aff2_cnt - and_cnt
        # Guard: a pair with no in-vocabulary words would divide by zero.
        cur_jac = and_cnt / denom if denom else 0.0
        # Use transform, NOT fit_transform: refitting on two strings would
        # discard the pretrained vocabulary/idf loaded from aff_tfidf.pkl.
        aff_idf_vec = vectorizer.transform([aff1, aff2])
        cos = cosine_similarity(aff_idf_vec)
        features.append([cur_jac, cos[0, 1]])
    features = np.array(features)
    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    # Train on train+valid (80%), evaluate on the held-out 20%.
    features_train = features[:n_train + n_valid]
    labels_train = labels[:n_train + n_valid]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    clf = svm.SVC()
    clf.fit(features_train, labels_train)
    y_pred = clf.predict(features_test)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels_test, y_pred, average="binary")
    print(prec, rec, f1)
def titles_to_matrix(self, title1, title2):
    """Word-similarity matrix for two paper titles.

    Size is matrix_title_size x matrix_title_size; exact token matches
    score 1, differing tokens score embedding cosine similarity when
    use_emb is set, and every other cell (including padding) stays -1.
    """
    size = self.matrix_title_size
    if self.use_emb:
        seq1 = self.tokenizer.texts_to_sequences([title1])[0][:size]
        seq2 = self.tokenizer.texts_to_sequences([title2])[0][:size]
    else:
        seq1 = feature_utils.get_words(title1)[:size]
        seq2 = feature_utils.get_words(title2)[:size]
    matrix = -np.ones((size, size))
    for i, w1 in enumerate(seq1):
        for j, w2 in enumerate(seq2):
            if w1 == w2:
                matrix[i, j] = 1
            elif self.use_emb:
                matrix[i, j] = cosine_similarity(
                    self.pretrain_emb[w1].reshape(1, -1),
                    self.pretrain_emb[w2].reshape(1, -1))[0][0]
    return matrix
def org_to_matrix(self, title1, title2, max_size):
    """Binary match matrix for two organization names, stopwords removed.

    Returns (matrix, nn1): a max_size x max_size matrix with 1 for exact
    word matches and -1 elsewhere, plus nn1, the count of matching cells.
    """
    words1 = feature_utils.get_words(title1, remove_stopwords=True)[:max_size]
    words2 = feature_utils.get_words(title2, remove_stopwords=True)[:max_size]
    matrix = -np.ones((max_size, max_size))
    nn1 = 0
    for i, w1 in enumerate(words1):
        for j, w2 in enumerate(words2):
            if w1 == w2:
                matrix[i, j] = 1
                nn1 += 1
    return matrix, nn1
def get_candidates_by_inverted_index(self, npaper, word2ids, top_n=20):
    """Return candidate paper ids from the inverted index, ranked by hits.

    Tokenizes the noisy paper's title (with the configured index window),
    counts how many title words point at each candidate id, and returns
    the ids with the highest hit counts.

    Args:
        npaper: noisy paper dict with a 'title' field.
        word2ids: inverted index mapping word -> list of candidate ids.
        top_n: number of top-frequency candidates to return (default 20,
            matching the previous hard-coded cutoff).
    """
    title = npaper['title'].lower()
    words = feature_utils.get_words(title, window=self.build_index_window)
    cids_to_freq = dd(int)
    for word in words:
        # .get avoids inserting missing keys if word2ids is a defaultdict.
        for cid in word2ids.get(word, ()):
            cids_to_freq[cid] += 1
    ranked = sorted(cids_to_freq.items(), key=lambda kv: kv[1],
                    reverse=True)[:top_n]
    return [cid for cid, _ in ranked]
def sentences_short_to_matrix(self, title1, title2):
    """Word-similarity matrix for two short sentences.

    Size is matrix_size_2_short squared; exact token matches score 1,
    differing tokens score embedding cosine similarity when use_emb is
    set, and all other cells (including padding) stay -1.
    """
    size = self.matrix_size_2_short
    if self.use_emb:
        seq1 = self.tokenizer.texts_to_sequences([title1])[0][:size]
        seq2 = self.tokenizer.texts_to_sequences([title2])[0][:size]
    else:
        seq1 = feature_utils.get_words(title1)[:size]
        seq2 = feature_utils.get_words(title2)[:size]
    matrix = -np.ones((size, size))
    for i, w1 in enumerate(seq1):
        for j, w2 in enumerate(seq2):
            if w1 == w2:
                matrix[i, j] = 1
            elif self.use_emb:
                matrix[i, j] = cosine_similarity(
                    self.pretrain_emb[w1].reshape(1, -1),
                    self.pretrain_emb[w2].reshape(1, -1))[0][0]
    return matrix
def build_cpapers_inverted_index(self):
    """Build a word -> paper-id inverted index over all clean papers.

    Loads the clean train and test paper files, tokenizes every title
    (lowercased, with the configured window), and maps each word to the
    de-duplicated list of paper ids whose title produced it.
    """
    logger.info('build inverted index for cpapers')
    train = data_utils.load_json_lines(self.file_dir, 'clean-papers-train.dat')
    test = data_utils.load_json_lines(self.file_dir, 'clean-papers-test.dat')
    word2ids = dd(list)
    for paper in train + test:
        pid = str(paper['id'])
        words = feature_utils.get_words(
            paper['title'].lower(), window=self.build_index_window)
        for word in words:
            word2ids[word].append(pid)
    # A paper's title can yield the same word twice; de-duplicate ids.
    for word in word2ids:
        word2ids[word] = list(set(word2ids[word]))
    logger.info('building inverted index completed')
    return word2ids
def venue_svm():
    """Train and evaluate an SVM for venue matching.

    Builds a balanced set of positive/negative venue pairs, extracts two
    features per pair (word Jaccard over in-vocab words and cosine of the
    pretrained tf-idf vectors), trains an SVC on the first 1000 pairs and
    prints Prec/Rec/F1 on the remainder.
    """
    pairs = load_venue_data()
    out_dir = join(settings.OUT_DIR, "venue")
    with open(join(out_dir, "venue_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    print(vectorizer.vocabulary_)
    vocab = vectorizer.vocabulary_
    train_num = 800
    test_num = 200
    # Balance the data: equal numbers of positive and negative pairs.
    n_pos_set = int((train_num + 2 * test_num) / 2)
    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = sklearn.utils.shuffle(pos_pairs + neg_pairs, random_state=37)
    labels = [x[0] for x in train_data]
    features = []
    for i, p in enumerate(train_data):
        aff1 = p[2].lower()
        aff2 = p[1].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        # Unweighted Jaccard over in-vocabulary words.
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_cnt = sum(intersec[w] for w in intersec if w in vocab)
        aff1_cnt = sum(1 for w in aff1_words if w in vocab)
        aff2_cnt = sum(1 for w in aff2_words if w in vocab)
        denom = aff1_cnt + aff2_cnt - and_cnt
        # Guard: a pair with no in-vocabulary words would divide by zero.
        cur_jac = and_cnt / denom if denom else 0.0
        # Use transform, NOT fit_transform: refitting on two strings would
        # discard the pretrained vocabulary/idf loaded from venue_tfidf.pkl.
        aff_idf_vec = vectorizer.transform([aff1, aff2])
        cos = cosine_similarity(aff_idf_vec)
        # NOTE(review): the original also computed a "keyword" Jaccard and
        # cosine by iterating the raw strings character-by-character, but
        # never fed those values to the classifier (cur_feat kept only the
        # two features below). That dead computation is removed here.
        features.append([cur_jac, cos[0, 1]])
    features = np.array(features)
    n_train = train_num
    n_valid = test_num
    features_train = features[:n_train + n_valid]
    labels_train = labels[:n_train + n_valid]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    print("n_test", len(labels_test))
    clf = svm.SVC()
    clf.fit(features_train, labels_train)
    y_pred = clf.predict(features_test)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels_test, y_pred, average="binary")
    print(prec, rec, f1)
def venue_keyword_method():
    """Threshold an idf-weighted Jaccard score for venue matching.

    Builds a balanced set of positive/negative venue pairs, scores each
    with an idf-weighted word Jaccard, tunes the decision threshold on the
    validation split, and prints AUC/Prec/Rec/F1 on the test split.
    """
    pairs = load_venue_data()
    out_dir = join(settings.OUT_DIR, "venue")
    with open(join(out_dir, "venue_tfidf.pkl"), "rb") as rf:
        vectorizer = pickle.load(rf)
    print(vectorizer.vocabulary_)
    vocab = vectorizer.vocabulary_
    idf = vectorizer.idf_
    train_num = 800
    test_num = 200
    # Balance the data: equal numbers of positive and negative pairs.
    n_pos_set = int((train_num + 2 * test_num) / 2)
    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = sklearn.utils.shuffle(pos_pairs + neg_pairs, random_state=37)
    labels = [x[0] for x in train_data]
    features = []
    for i, p in enumerate(train_data):
        aff1 = p[2].lower()
        aff2 = p[1].lower()
        aff1_words = feature_utils.get_words(aff1)
        aff2_words = feature_utils.get_words(aff2)
        # idf-weighted intersection / union (Jaccard) over in-vocab words.
        intersec = Counter(aff1_words) & Counter(aff2_words)
        and_idf = sum(intersec[w] * idf[vocab[w]] for w in intersec if w in vocab)
        aff1_idf = sum(idf[vocab[w]] for w in aff1_words if w in vocab)
        aff2_idf = sum(idf[vocab[w]] for w in aff2_words if w in vocab)
        denom = aff1_idf + aff2_idf - and_idf
        # Guard: a pair with no in-vocabulary words would divide by zero.
        cur_jac = and_idf / denom if denom else 0.0
        features.append(cur_jac)
    # Convert to arrays: the boolean-mask assignment below is a TypeError
    # on plain Python lists (list > float is unsupported).
    features = np.array(features)
    labels = np.array(labels)
    n_train = train_num
    n_valid = test_num
    features_valid = features[n_train:(n_valid + n_train)]
    labels_valid = labels[n_train:(n_valid + n_train)]
    features_test = features[(n_valid + n_train):]
    labels_test = labels[(n_valid + n_train):]
    print("valid", len(labels_valid), "test", len(labels_test))
    precs, recs, thrs = precision_recall_curve(labels_valid, features_valid)
    f1s = 2 * precs * recs / (precs + recs)
    f1s = f1s[:-1]  # precision_recall_curve returns one more point than thresholds
    thrs = thrs[~np.isnan(f1s)]
    f1s = f1s[~np.isnan(f1s)]
    best_thr = thrs[np.argmax(f1s)]
    print("best thr", best_thr)
    y_pred = np.zeros_like(labels_test)
    y_pred[features_test > best_thr] = 1
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels_test, y_pred, average="binary")
    auc = roc_auc_score(labels_test, features_test)
    print("AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f" % (auc, prec, rec, f1))