def apply_d2v_sectionwise():
    '''
    Evaluate doc2vec for comparison of patent sections.

    Loads a pickled model and the ``corpus_claims.npy`` corpus, hands both to
    ``infer_patfeats``, scores the cited / duplicate / random id
    combinations with the 'linear' similarity coefficient and plots the
    score distributions together with the cited-vs-random AUC.

    All inputs are read from hard-coded paths relative to the working
    directory; the function takes no arguments and returns nothing.
    '''
    # embedding size; only used to name the histogram directory
    size = 50
    # Load the pickled model. The file is opened in binary mode (required for
    # binary pickle protocols) and inside a context manager so that the
    # handle is closed deterministically.
    # NOTE(review): unpickling executes arbitrary code - only load trusted
    # model files.
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model",
              'rb') as model_file:
        model = pkl.load(model_file)
    # load corpus and id collections
    corpus = np.load('../corpus/corpus_claims.npy').item()
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # compute features and similarity scores per label
    patfeats_d2v = infer_patfeats(corpus, model)
    scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
                                id_dict, 'linear')
    # index 2 of the calc_auc result is used as the AUC value
    auc = calc_auc(scores['cited'], scores['random'])[2]
    # scores are plotted as returned (no rescaling to the range [0, 1])
    plot_score_distr('human_eval',
                     'linear', ['random', 'cited', 'duplicate'], {
                         'cited': scores['cited'],
                         'random': scores['random'],
                         'duplicate': scores['duplicate']
                     },
                     auc, ['cited'],
                     histdir='doc2vec_full%i_claims' % size,
                     bins=50)
Beispiel #2
0
def apply_kpca_rel_corpus():
    """
    Evaluate linear-kernel KernelPCA features on the small human_eval corpus.

    Text features are computed for every entry of ``single_pat_corpus``.
    The KernelPCA is fitted on all entries that are not a target id and is
    then applied to the target entries. The projected vectors are scored
    for the cited/random and relevant/irrelevant pair groupings and the
    score distributions are plotted together with the AUC.

    Inputs are read from hard-coded paths below ``human_eval/corpus_info``;
    the function takes no arguments and returns nothing.
    """
    # number of KernelPCA components (also the length of each feature dict)
    n_components = 250
    # load combis for small corpus; the first entry of each combi is used as
    # target id. A set keeps the membership test below O(1).
    combis = np.load('human_eval/corpus_info/combis.npy')
    target_ids = {comb[0] for comb in combis}
    single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
    ft = FeatureTransform(renorm='max')
    docfeats = ft.texts2features(single_pat_corpus)
    train_feats = {pid: pat for pid, pat in docfeats.items() if pid not in target_ids}
    target_feats = {pid: docfeats[pid] for pid in target_ids}
    # fix the row order once so that matrix rows and ids stay aligned
    train_ids = list(train_feats.keys())
    target_row_ids = list(target_feats.keys())
    # make feature matrices (the target matrix reuses the training feature names)
    X_train, featurenames = features2mat(train_feats, train_ids)
    X_target, _ = features2mat(target_feats, target_row_ids, featurenames)
    # train on full patent corpus (excluding target patents)
    kpca = KernelPCA(n_components=n_components, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    # make feat mat for small corpus
    X_target_kpca = kpca.transform(X_target)
    # one {component index: value} dict per id, passed through norm_dict(..., 'length')
    patfeats_kpca = {}
    for row_ids, feat_mat in ((train_ids, X_train_kpca),
                              (target_row_ids, X_target_kpca)):
        for i, pid in enumerate(row_ids):
            patfeats_kpca[pid] = norm_dict(
                dict(zip(range(n_components), feat_mat[i, :])), 'length')
    binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    # NOTE(review): the histogram directory name says "1000" although 250
    # components are used here - confirm which one is intended.
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_kpca, ['random', 'cited'],
                                           {'cited': binary_sim_combis, 'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_kpca, ['irrelevant', 'relevant'],
                                          {'relevant': human_sim_combis, 'irrelevant': human_diff_combis},
                                          simcoef)
        # index 2 of the calc_auc result is used as the AUC value
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
        plot_score_distr('human_eval', simcoef, ['random', 'cited'],
                         {'cited': binary_scores['cited'], 'random': binary_scores['random']},
                         binary_auc, ['cited'], histdir='kpca_1000_rel_corp', bins=20)
        plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
                         {'relevant': human_scores['relevant'], 'irrelevant': human_scores['irrelevant']},
                         human_auc, ['relevant'], histdir='kpca_1000_rel_corp', bins=20)
Beispiel #3
0
def evaluate_coefs(corpus,
                   target_ids,
                   cited_ids,
                   random_ids,
                   dupl_ids,
                   dir_,
                   weights=[True],
                   norms=[None],
                   renorms=['length'],
                   simcoefs=['linear', 'jaccard']):
    """
    Score every weight / norm / renorm / simcoef combination and save results.

    For each combination of ``weights``, ``norms`` and ``renorms`` a
    ``FeatureTransform`` is built and applied to ``corpus``. For every entry
    of ``simcoefs`` the sim / diff / dupl scores are computed with
    ``calc_simcoef_distr_dict`` and the AUC with ``plot_utils.calc_auc``.

    After each simcoef the scores, the ``[fpr, tpr]`` pair and the AUC
    dictionary collected so far are written below ``dir_`` with ``np.save``.
    The AUC dictionary is nested as weighting -> norm -> renorm -> simcoef,
    where weighting is 'tfidf' for a truthy weight and 'None' otherwise.

    The list defaults are only iterated, never modified. Returns nothing.
    """
    auc_dict = collections.defaultdict(dict)
    for weight in weights:
        weighting = 'tfidf' if weight else 'None'
        auc_dict[weighting] = {}
        for norm in norms:
            norm_key = str(norm)
            auc_dict[weighting][norm_key] = {}
            for renorm in renorms:
                renorm_key = str(renorm)
                # inner dict that receives one AUC per similarity coefficient
                auc_by_simcoef = {}
                auc_dict[weighting][norm_key][renorm_key] = auc_by_simcoef
                # make features
                transform = FeatureTransform(identify_bigrams=False,
                                             norm=norm,
                                             weight=weight,
                                             renorm=renorm)
                pat_feats = transform.texts2features(corpus)
                # compute scores and calculate AUC
                for simcoef in simcoefs:
                    score_triple = calc_simcoef_distr_dict(
                        pat_feats, target_ids, cited_ids, random_ids, dupl_ids,
                        simcoef)
                    sim_scores, diff_scores, dupl_scores = score_triple
                    fpr, tpr, auc_score = plot_utils.calc_auc(
                        sim_scores.values(), diff_scores.values())
                    auc_by_simcoef[simcoef] = auc_score
                    # all four output files share the same name suffix
                    name_parts = (simcoef, norm_key, renorm_key, weighting)
                    outputs = (
                        ('/sim_scores/sim_scores_%s_%s_%s_%s.npy', sim_scores),
                        ('/diff_scores/diff_scores_%s_%s_%s_%s.npy', diff_scores),
                        ('/dupl_scores/dupl_scores_%s_%s_%s_%s.npy', dupl_scores),
                        ('/fpr_tpr_rates/fpr_%s_%s_%s_%s.npy', [fpr, tpr]),
                    )
                    for name_fmt, payload in outputs:
                        np.save(dir_ + name_fmt % name_parts, payload)
                    # re-written after every simcoef with the AUCs so far
                    np.save(dir_ + '/auc_dict.npy', auc_dict)
def apply_d2v_full_corpus():
    """
    Evaluate doc2vec features of the target corpus on the full id sets.

    Builds the id combinations from the stored target / random / duplicate /
    cited ids, loads a pickled model, turns the target corpus returned by
    ``make_d2v_corpus`` into features with ``make_doc2vec_corpus``, scores
    the combinations with the 'linear' similarity coefficient and plots the
    score distributions together with the cited-vs-random AUC.

    All inputs are read from hard-coded paths relative to the working
    directory; the function takes no arguments and returns nothing.
    """
    # embedding size; only used to name the histogram directory
    size = 50
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # make_d2v_corpus returns two values; only the second one is used below
    _, target_pat_corpus = make_d2v_corpus(target_ids)
    # Load the pickled model (nothing is trained here). Binary mode and a
    # context manager make sure the pickle is read correctly and the handle
    # is closed.
    # NOTE(review): unpickling executes arbitrary code - only load trusted
    # model files.
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model",
              'rb') as model_file:
        model = pkl.load(model_file)
    patfeats_d2v = make_doc2vec_corpus(model, target_pat_corpus)

    scores = calc_simcoef_distr(patfeats_d2v,
                                ['cited', 'duplicate', 'random'], id_dict,
                                'linear')
    # index 2 of the calc_auc result is used as the AUC value
    auc = calc_auc(scores['cited'], scores['random'])[2]
    # scores are plotted as returned (no rescaling to the range [0, 1])
    plot_score_distr('human_eval',
                     'linear', ['random', 'cited', 'duplicate'], {
                         'cited': scores['cited'],
                         'random': scores['random'],
                         'duplicate': scores['duplicate']
                     },
                     auc, ['cited'],
                     histdir='doc2vec_full%i_no_target' % size,
                     bins=50)
def apply_d2v_rel_corpus():
    """
    Evaluate the doc2vec feature vectors on the smaller corpus for
    cited/random and relevant/irrelevant labellings.

    Loads a pickled model and ``single_pat_corpus``, hands both to
    ``infer_patfeats`` and scores the pair groupings derived from the
    binary and the human label pairs. For each grouping the score
    distribution is plotted together with its AUC.

    Inputs are read from hard-coded paths; the function takes no arguments
    and returns nothing.
    """
    # Load the pickled model. Binary mode and a context manager make sure
    # the pickle is read correctly and the handle is closed.
    # NOTE(review): unpickling executes arbitrary code - only load trusted
    # model files.
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model",
              'rb') as model_file:
        model = pkl.load(model_file)
    # get doc2vec feature vectors
    single_pat_corpus = np.load(
        'human_eval/corpus_info/single_pat_corpus.npy').item()
    patfeats_d2v = infer_patfeats(single_pat_corpus, model)
    # split the labelled pairs into a "similar" and a "different" group
    binary_label_pairs = np.load(
        'human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load(
        'human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_d2v, ['random', 'cited'], {
            'cited': binary_sim_combis,
            'random': binary_diff_combis
        }, simcoef)
        human_scores = calc_simcoef_distr(patfeats_d2v,
                                          ['irrelevant', 'relevant'], {
                                              'relevant': human_sim_combis,
                                              'irrelevant': human_diff_combis
                                          }, simcoef)
        # index 2 of the calc_auc result is used as the AUC value
        binary_auc = calc_auc(binary_scores['cited'],
                              binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'],
                             human_scores['irrelevant'])[2]
        plot_score_distr('human_eval',
                         simcoef, ['random', 'cited'], {
                             'cited': binary_scores['cited'],
                             'random': binary_scores['random']
                         },
                         binary_auc, ['cited'],
                         histdir='doc2vec_full50_rel_corp',
                         bins=20)
        plot_score_distr('human_eval',
                         simcoef, ['irrelevant', 'relevant'], {
                             'relevant': human_scores['relevant'],
                             'irrelevant': human_scores['irrelevant']
                         },
                         human_auc, ['relevant'],
                         histdir='doc2vec_full50_rel_corp',
                         bins=20)
    Dw_all['binary_idf_neg'] = pointwise_dict_multiply(Dw_all['idf'], Dw_all['binary_neg'])
    np.save('full_patent_scores/corpus_info_for_regression/Dw_all.npy', Dw_all)
    '''
    for method in ['lasso']:
        ## use learned term weights for feature extraction
        ft = FeatureTransform(identify_bigrams=False,
                              norm=None,
                              weight=True,
                              renorm='length')
        ft.Dw = Dw_all['lasso']
        patfeats = ft.texts2features(pat_corpus)
        # plot the results
        for simcoef in ['linear']:
            binary_scores = calc_simcoef_distr(
                patfeats, ['cited', 'random'], {
                    'cited':
                    [sim_combi.split('_') for sim_combi in sim_combis],
                    'random':
                    [diff_combi.split('_') for diff_combi in diff_combis]
                }, simcoef)
            binary_auc = calc_auc(binary_scores['cited'],
                                  binary_scores['random'])[2]
            plot_score_distr('full_patent_scores',
                             simcoef, ['cited', 'random'], {
                                 'cited': binary_scores['cited'],
                                 'random': binary_scores['random']
                             },
                             binary_auc, ['cited'],
                             histdir=method,
                             bins=50)
Beispiel #7
0
    # Sweep over the number of KernelPCA components: for each setting fit on
    # X_train, project X_target, score the id combinations and plot the
    # resulting score distributions.
    # NOTE(review): X_train, X_target, train_feats, target_feats and id_dict
    # are defined before this loop, outside the visible part of the snippet.
    for n_comp in [100, 250, 500, 1000]:
        print n_comp
        # fit KernelPCA with a linear kernel on the training matrix
        kpca = KernelPCA(n_components=n_comp, kernel='linear')
        X_train_kpca = kpca.fit_transform(X_train)
        #pkl.dump(kpca, open('human_eval/models/kpca_%i.model' %n_comp, 'wb'), -1)
        # project the target matrix with the fitted transform
        X_target_kpca = kpca.transform(X_target)
        # one {component index: value} dict per id, passed through
        # norm_dict(..., 'length'); training rows first, target rows after
        kpca_feats = {
            pid: norm_dict(dict(zip(range(n_comp), X_train_kpca[i, :])),
                           'length')
            for i, pid in enumerate(train_feats.keys())
        }
        for i, pid in enumerate(target_feats.keys()):
            kpca_feats[pid] = norm_dict(
                dict(zip(range(n_comp), X_target_kpca[i, :])), 'length')
        # NOTE: the file name does not depend on n_comp, so every iteration
        # overwrites the features saved by the previous one
        np.save('human_eval/corpus_info/kpca_feats.npy', kpca_feats)
        scores = calc_simcoef_distr(kpca_feats,
                                    ['cited', 'duplicate', 'random'], id_dict,
                                    'linear')
        # everything from index 2 of the calc_auc result is unpacked into
        # exactly two values here
        auc, aps = calc_auc(scores['cited'], scores['random'])[2::]
        print(auc, aps)
        # one histogram directory per component count
        plot_score_distr('human_eval',
                         'linear', ['random', 'cited', 'duplicate'], {
                             'cited': scores['cited'],
                             'random': scores['random'],
                             'duplicate': scores['duplicate']
                         },
                         auc, ['cited'],
                         histdir='kpca_%i' % n_comp,
                         bins=50)
#corpus = np.load('../corpus/corpus.npy').item()
# Evaluate word2vec-embedded features on the full corpus: embed the text
# features with a pickled model, score the id combinations with the 'linear'
# similarity coefficient and plot the cited / random score distributions.
# embed_dim is only used to name the histogram directory.
for embed_dim in [200]:
    corpus = np.load('../corpus/corpus.npy').item()
    # Load the pickled model. Binary mode and a context manager make sure
    # the pickle is read correctly and the handle is closed.
    # NOTE(review): unpickling executes arbitrary code - only load trusted
    # model files.
    with open('human_eval/models/full_patent_corpus_sg_200_hs0_neg13_seed1.model',
              'rb') as model_file:
        model = pkl.load(model_file)
    pat_ids = corpus.keys()
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight='tfidf', renorm='length')
    patfeats = ft.texts2features(corpus)
    featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids)
    # one {feature name: value} dict per id, taken row-wise from featmat_w2v
    patfeats_w2v = {}
    for i, pid in enumerate(pat_ids):
        patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i, :]))

    # from here on the embedded features replace the text features
    patfeats = patfeats_w2v
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()

    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    scores = calc_simcoef_distr(patfeats, ['random', 'cited', 'duplicate'],
                                id_dict, 'linear')
    # index 2 of the calc_auc result is used as the AUC value
    auc = calc_auc(scores['cited'], scores['random'])[2]
    # 'duplicate' scores are computed above but not plotted
    plot_score_distr('human_eval', 'linear', ['random', 'cited'],
                     {'cited': scores['cited'], 'random': scores['random']},
                     auc, ['cited'],
                     histdir='word2vec_full%i_nonorm_length' % embed_dim,
                     bins=50)
Beispiel #9
0
import numpy as np
from plot_utils import plot_score_distr, group_combis, calc_auc

# Baseline: use the binary label of each pair as its score and check how
# well that separates the pairs grouped as similar / different according to
# the human labels.
binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
combis = np.load('human_eval/corpus_info/combis.npy')

# group the pairs by their human label
human_sim_combis, human_diff_combis = group_combis(human_label_pairs)

# look up the binary label for every pair of each group
sim_vals = []
for combi in human_sim_combis:
    sim_vals.append(binary_label_pairs[combi])
diff_vals = []
for combi in human_diff_combis:
    diff_vals.append(binary_label_pairs[combi])

fpr, tpr, auc_val = calc_auc(sim_vals, diff_vals)

scores_by_label = {'relevant': sim_vals, 'not relevant': diff_vals}
plot_score_distr('human_eval', 'cited', ['relevant', 'not relevant'],
                 scores_by_label, auc_val, ['relevant'],
                 histdir='baseline', bins=10)
Beispiel #10
0
 patfeats_human = ft.texts2features(single_pat_corpus)
 # plot the distributions
 binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
 human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
 for simcoef in ['linear', 'jaccard']:
     binary_scores = calc_simcoef_distr(patfeats_cited, ['cited', 'random'],
                                        {
                                            'cited': binary_sim_combis,
                                            'random': binary_diff_combis
                                        }, simcoef)
     human_scores = calc_simcoef_distr(patfeats_human,
                                       ['relevant', 'not relevant'], {
                                           'relevant': human_sim_combis,
                                           'not relevant': human_diff_combis
                                       }, simcoef)
     binary_auc = calc_auc(binary_scores['cited'],
                           binary_scores['random'])[2]
     human_auc = calc_auc(human_scores['relevant'],
                          human_scores['not relevant'])[2]
     plot_score_distr('human_eval',
                      simcoef, ['cited', 'random'], {
                          'cited': binary_scores['cited'],
                          'random': binary_scores['random']
                      },
                      binary_auc, ['cited'],
                      histdir='reg_idf',
                      bins=50)
     plot_score_distr('human_eval',
                      simcoef, ['relevant', 'not relevant'], {
                          'relevant': human_scores['relevant'],
                          'not relevant': human_scores['not relevant']
                      },