def apply_d2v_sectionwise():
    """Evaluate doc2vec features for comparison of patent sections (claims).

    Loads a pre-trained doc2vec model, infers feature vectors for the
    claims corpus and plots the linear similarity score distributions
    for cited / duplicate / random patent pairs together with the AUC.
    """
    # embedding dimensionality of the pre-trained model (only used for the
    # histogram directory name below)
    size = 50
    # load the pre-trained doc2vec model; the context manager closes the
    # file handle that the original `pkl.load(open(...))` leaked, and 'rb'
    # is the correct mode for pickled data
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model", "rb") as f:
        model = pkl.load(f)
    # load the claims corpus and the id lists defining the pair labels
    corpus = np.load('../corpus/corpus_claims.npy').item()
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # infer doc2vec feature vectors for every patent in the corpus
    patfeats_d2v = infer_patfeats(corpus, model)
    # similarity score distributions per label and AUC (cited vs random)
    scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
                                id_dict, 'linear')
    auc = calc_auc(scores['cited'], scores['random'])[2]
    plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
                     {'cited': scores['cited'],
                      'random': scores['random'],
                      'duplicate': scores['duplicate']},
                     auc, ['cited'],
                     histdir='doc2vec_full%i_claims' % size, bins=50)
def apply_kpca_rel_corpus(n_components=250):
    """Evaluate linear KernelPCA (LSA-style) features on the small corpus.

    Trains a linear KernelPCA on all patents except the target patents,
    projects the target patents with the fitted model, and plots the
    linear similarity score distributions for the cited/random (binary)
    and relevant/irrelevant (human) labellings with their AUC values.

    :param n_components: number of KPCA components to keep (default 250,
        matching the previously hard-coded value)
    """
    # load combis for the small corpus; the first entry of each pair is a
    # target patent id
    combis = np.load('human_eval/corpus_info/combis.npy')
    target_ids = list(set([comb[0] for comb in combis]))
    single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
    ft = FeatureTransform(renorm='max')
    docfeats = ft.texts2features(single_pat_corpus)
    # split into training features (all but targets) and target features
    train_feats = {pid: pat for pid, pat in docfeats.items()
                   if pid not in target_ids}
    target_feats = {pid: docfeats[pid] for pid in target_ids}
    # make feature matrices; the target matrix reuses the training
    # featurenames so both live in the same feature space
    X_train, featurenames = features2mat(train_feats, train_feats.keys())
    X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
    # train on full patent corpus (excluding target patents)
    kpca = KernelPCA(n_components=n_components, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    # project the target patents into the fitted KPCA space
    X_target_kpca = kpca.transform(X_target)
    # turn the dense matrices back into length-normalized feature dicts
    patfeats_lsa = {pid: norm_dict(dict(zip(range(n_components), X_train_kpca[i, :])), 'length')
                    for i, pid in enumerate(train_feats.keys())}
    for i, pid in enumerate(target_feats.keys()):
        patfeats_lsa[pid] = norm_dict(
            dict(zip(range(n_components), X_target_kpca[i, :])), 'length')
    binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_lsa, ['random', 'cited'],
                                           {'cited': binary_sim_combis,
                                            'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_lsa, ['irrelevant', 'relevant'],
                                          {'relevant': human_sim_combis,
                                           'irrelevant': human_diff_combis},
                                          simcoef)
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
        # NOTE(review): histdir says "1000" but n_components defaults to 250;
        # kept byte-identical -- confirm which is intended
        plot_score_distr('human_eval', simcoef, ['random', 'cited'],
                         {'cited': binary_scores['cited'],
                          'random': binary_scores['random']},
                         binary_auc, ['cited'],
                         histdir='kpca_1000_rel_corp', bins=20)
        plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
                         {'relevant': human_scores['relevant'],
                          'irrelevant': human_scores['irrelevant']},
                         human_auc, ['relevant'],
                         histdir='kpca_1000_rel_corp', bins=20)
def apply_d2v_full_corpus():
    """Evaluate doc2vec features on the full corpus (targets held out).

    Builds the doc2vec corpus with the target patents separated out,
    infers feature vectors with a pre-trained model and plots the linear
    similarity score distributions for cited / duplicate / random
    patent pairs together with the AUC.
    """
    # id lists defining the cited / duplicate / random pair labels
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    for size in [50]:
        # build the doc2vec corpus; the corpus.npy load the original did
        # before this loop was dead code (immediately shadowed here)
        pat_corpus, target_pat_corpus = make_d2v_corpus(target_ids)
        # load the pre-trained model; the context manager closes the file
        # handle the original `pkl.load(open(...))` leaked
        with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model", "rb") as f:
            model = pkl.load(f)
        patfeats_d2v = make_doc2vec_corpus(model, target_pat_corpus)
        # similarity score distributions per label and AUC (cited vs random)
        scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
                                    id_dict, 'linear')
        auc = calc_auc(scores['cited'], scores['random'])[2]
        plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
                         {'cited': scores['cited'],
                          'random': scores['random'],
                          'duplicate': scores['duplicate']},
                         auc, ['cited'],
                         histdir='doc2vec_full%i_no_target' % size, bins=50)
def apply_d2v_rel_corpus():
    """Evaluate the doc2vec feature vectors on the smaller corpus for
    cited/random and relevant/irrelevant labellings.

    Infers doc2vec features for all patents in the small human-eval
    corpus with a model pre-trained on the full patent corpus, then
    plots the similarity score distributions and AUC values for both
    labellings.
    """
    # load the model trained on the entire patent corpus; the context
    # manager closes the file handle the original `pkl.load(open(...))`
    # leaked, and 'rb' is the correct mode for pickled data
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model", "rb") as f:
        model = pkl.load(f)
    # get doc2vec feature vectors for the small corpus
    single_pat_corpus = np.load(
        'human_eval/corpus_info/single_pat_corpus.npy').item()
    patfeats_d2v = infer_patfeats(single_pat_corpus, model)
    # pair labellings: binary (cited vs random) and human (relevant vs not)
    binary_label_pairs = np.load(
        'human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load(
        'human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_d2v, ['random', 'cited'],
                                           {'cited': binary_sim_combis,
                                            'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_d2v, ['irrelevant', 'relevant'],
                                          {'relevant': human_sim_combis,
                                           'irrelevant': human_diff_combis},
                                          simcoef)
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
        plot_score_distr('human_eval', simcoef, ['random', 'cited'],
                         {'cited': binary_scores['cited'],
                          'random': binary_scores['random']},
                         binary_auc, ['cited'],
                         histdir='doc2vec_full50_rel_corp', bins=20)
        plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
                         {'relevant': human_scores['relevant'],
                          'irrelevant': human_scores['irrelevant']},
                         human_auc, ['relevant'],
                         histdir='doc2vec_full50_rel_corp', bins=20)
# NOTE(review): fragment of a larger function whose `def` is outside this
# chunk. It stores pointwise-multiplied idf/binary term weights, then (after
# an unbalanced ''' whose matching delimiter is not visible here) evaluates
# lasso-learned term weights and plots cited-vs-random score distributions.
# Left byte-identical: without the matching ''' it is impossible to tell
# which statements are inside the commented-out region, so any reformatting
# would be a guess.
Dw_all['binary_idf_neg'] = pointwise_dict_multiply(Dw_all['idf'], Dw_all['binary_neg']) np.save('full_patent_scores/corpus_info_for_regression/Dw_all.npy', Dw_all) ''' for method in ['lasso']: ## use learned term weights for feature extraction ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length') ft.Dw = Dw_all['lasso'] patfeats = ft.texts2features(pat_corpus) # plot the results for simcoef in ['linear']: binary_scores = calc_simcoef_distr( patfeats, ['cited', 'random'], { 'cited': [sim_combi.split('_') for sim_combi in sim_combis], 'random': [diff_combi.split('_') for diff_combi in diff_combis] }, simcoef) binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2] plot_score_distr('full_patent_scores', simcoef, ['cited', 'random'], { 'cited': binary_scores['cited'], 'random': binary_scores['random'] }, binary_auc, ['cited'], histdir=method, bins=50)
# Sweep over the number of KPCA components and evaluate each setting.
# NOTE(review): fragment of a larger function -- X_train, X_target,
# train_feats, target_feats and id_dict are defined before this chunk.
for n_comp in [100, 250, 500, 1000]:
    print n_comp
    # fit LSA: linear-kernel PCA on the training feature matrix
    kpca = KernelPCA(n_components=n_comp, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    #pkl.dump(kpca, open('human_eval/models/kpca_%i.model' %n_comp, 'wb'), -1)
    # project the target patents into the fitted KPCA space
    X_target_kpca = kpca.transform(X_target)
    # turn the dense matrices back into length-normalized feature dicts
    kpca_feats = {
        pid: norm_dict(dict(zip(range(n_comp), X_train_kpca[i, :])), 'length')
        for i, pid in enumerate(train_feats.keys())
    }
    for i, pid in enumerate(target_feats.keys()):
        kpca_feats[pid] = norm_dict(
            dict(zip(range(n_comp), X_target_kpca[i, :])), 'length')
    np.save('human_eval/corpus_info/kpca_feats.npy', kpca_feats)
    # similarity score distributions, AUC and average precision
    scores = calc_simcoef_distr(kpca_feats, ['cited', 'duplicate', 'random'],
                                id_dict, 'linear')
    auc, aps = calc_auc(scores['cited'], scores['random'])[2::]
    print(auc, aps)
    plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
                     {'cited': scores['cited'],
                      'random': scores['random'],
                      'duplicate': scores['duplicate']},
                     auc, ['cited'],
                     histdir='kpca_%i' % n_comp, bins=50)
#corpus = np.load('../corpus/corpus.npy').item() for embed_dim in [200]: corpus = np.load('../corpus/corpus.npy').item() #pat_corpus = make_w2v_corpus() #model = train_word2vec(pat_corpus, 'full_patent_corpus', seed=1, embed_dim=200) model = pkl.load(open('human_eval/models/full_patent_corpus_sg_200_hs0_neg13_seed1.model')) pat_ids = corpus.keys() ft = FeatureTransform(identify_bigrams=False, norm=None, weight='tfidf', renorm='length') patfeats = ft.texts2features(corpus) featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids) patfeats_w2v = {} for i, pid in enumerate(pat_ids): patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i,:])) #np.save('../corpus/patfeats_w2v.npy', patfeats_w2v) #patfeats = np.load('../corpus/patfeats_w2v.npy').item() patfeats = patfeats_w2v target_ids = np.load('../corpus/target_ids.npy') random_ids = np.load('../corpus/random_ids.npy') dupl_ids = np.load('../corpus/dupl_ids.npy').item() cited_ids = np.load('../corpus/cited_ids.npy').item() id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids) scores = calc_simcoef_distr(patfeats, ['random', 'cited', 'duplicate'], id_dict, 'linear') auc = calc_auc(scores['cited'], scores['random'])[2] plot_score_distr('human_eval', 'linear', ['random', 'cited'], {'cited': scores['cited'], 'random': scores['random']}, auc, ['cited'], histdir='word2vec_full%i_nonorm_length' %embed_dim, bins=50)
# NOTE(review): fragment of a larger function whose `def` is outside this
# chunk, and cut off mid-call at the end (the final plot_score_distr dict
# literal is unclosed -- its continuation is in a later chunk). It weights
# features with cited-based and human-based idf term weights and plots the
# resulting score distributions. Left byte-identical because reformatting a
# span that ends inside an open bracket risks breaking the continuation.
ft.Dw = Dw_all['binary_idf'] patfeats_cited = ft.texts2features(single_pat_corpus) # multiply patfeats with human idf weights ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length') ft.Dw = Dw_all['human_idf'] patfeats_human = ft.texts2features(single_pat_corpus) # plot the distributions binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs) human_sim_combis, human_diff_combis = group_combis(human_label_pairs) for simcoef in ['linear', 'jaccard']: binary_scores = calc_simcoef_distr(patfeats_cited, ['cited', 'random'], { 'cited': binary_sim_combis, 'random': binary_diff_combis }, simcoef) human_scores = calc_simcoef_distr(patfeats_human, ['relevant', 'not relevant'], { 'relevant': human_sim_combis, 'not relevant': human_diff_combis }, simcoef) binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2] human_auc = calc_auc(human_scores['relevant'], human_scores['not relevant'])[2] plot_score_distr('human_eval', simcoef, ['cited', 'random'], { 'cited': binary_scores['cited'], 'random': binary_scores['random']
# NOTE(review): fragment cut at BOTH ends -- the first token continues an
# np.load( call opened in a previous chunk, and the final plot_score_distr
# dict literal is unclosed (continues in a later chunk). It embeds features
# with word2vec and plots binary/human score distributions for the small
# corpus. Left byte-identical because both edges sit inside open brackets
# owned by neighbouring chunks.
'human_eval/corpus_info/binary_label_pairs.npy').item() human_label_pairs = np.load( 'human_eval/corpus_info/human_label_pairs.npy').item() # make word2vec embedded feature matrix featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids) #transform feature matrix into dict patfeats_w2v = {} for i, pid in enumerate(pat_ids): patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i, :])) ## Plot AUC binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs) human_sim_combis, human_diff_combis = group_combis(human_label_pairs) for simcoef in ['linear', 'jaccard']: binary_scores = calc_simcoef_distr(patfeats_w2v, ['random', 'cited'], { 'cited': binary_sim_combis, 'random': binary_diff_combis }, simcoef) human_scores = calc_simcoef_distr(patfeats_w2v, ['irrelevant', 'relevant'], { 'relevant': human_sim_combis, 'irrelevant': human_diff_combis }, simcoef) binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2] human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2] plot_score_distr('human_eval', simcoef, ['random', 'cited'], { 'cited': binary_scores['cited'], 'random': binary_scores['random']