Example #1
0
def evaluate_coefs(corpus,
                   target_ids,
                   cited_ids,
                   random_ids,
                   dupl_ids,
                   dir_,
                   weights=[True],
                   norms=[None],
                   renorms=['length'],
                   simcoefs=['linear', 'jaccard']):
    """Grid-evaluate similarity coefficients and store scores and AUC values.

    For every combination of ``weights`` x ``norms`` x ``renorms`` a
    ``FeatureTransform`` is built and applied to ``corpus``.  For every entry
    of ``simcoefs`` the score dictionaries returned by
    ``calc_simcoef_distr_dict`` are used to compute an AUC with
    ``plot_utils.calc_auc`` (similar vs. different scores).

    Side effects (all through ``np.save`` below ``dir_``; the sub-directories
    ``sim_scores``, ``diff_scores``, ``dupl_scores`` and ``fpr_tpr_rates``
    are not created here):
      * one file per score type and parameter combination,
      * one file holding ``[fpr, tpr]`` per parameter combination,
      * ``auc_dict.npy``, rewritten after every evaluated coefficient.

    The nested result is keyed as
    ``auc_dict[weighting][str(norm)][str(renorm)][simcoef]`` where
    ``weighting`` is ``'tfidf'`` for a truthy weight and ``'None'`` otherwise.

    Returns nothing.
    """
    auc_dict = collections.defaultdict(dict)
    for weight in weights:
        weighting = 'tfidf' if weight else 'None'
        # Keep a handle on each nesting level instead of re-indexing the
        # whole chain on every access.
        per_norm = auc_dict[weighting] = {}
        for norm in norms:
            norm_key = str(norm)
            per_renorm = per_norm[norm_key] = {}
            for renorm in renorms:
                renorm_key = str(renorm)
                per_simcoef = per_renorm[renorm_key] = {}

                # Features only depend on (norm, weight, renorm), so they are
                # built once and shared by all similarity coefficients.
                transform = FeatureTransform(identify_bigrams=False,
                                             norm=norm,
                                             weight=weight,
                                             renorm=renorm)
                pat_feats = transform.texts2features(corpus)

                for simcoef in simcoefs:
                    sim_scores, diff_scores, dupl_scores = calc_simcoef_distr_dict(
                        pat_feats, target_ids, cited_ids, random_ids, dupl_ids,
                        simcoef)
                    fpr, tpr, auc_score = plot_utils.calc_auc(
                        sim_scores.values(), diff_scores.values())
                    per_simcoef[simcoef] = auc_score

                    # Every output file name carries the same parameter tag.
                    tag = (simcoef, norm_key, renorm_key, weighting)
                    outputs = (
                        ('/sim_scores/sim_scores_%s_%s_%s_%s.npy', sim_scores),
                        ('/diff_scores/diff_scores_%s_%s_%s_%s.npy', diff_scores),
                        ('/dupl_scores/dupl_scores_%s_%s_%s_%s.npy', dupl_scores),
                        ('/fpr_tpr_rates/fpr_%s_%s_%s_%s.npy', [fpr, tpr]),
                    )
                    for template, payload in outputs:
                        np.save(dir_ + template % tag, payload)
                    # Persist the running AUC table after each coefficient.
                    np.save(dir_ + '/auc_dict.npy', auc_dict)
def make_section_patfeats():
    """Rebuild the per-patent corpus and its feature vectors and store both.

    Sanity check: the values are calculated again from a fresh
    ``PatentCorpus`` in ``'regression'`` mode.  The ``single_pat_corpus``
    attribute of that corpus is transformed with a weighted,
    length-renormalised ``FeatureTransform`` (no bigrams, no norm).

    Writes ``../corpus/patcorpus_claims.npy`` (the texts) and
    ``../corpus/patfeats_claims.npy`` (the features).  Returns nothing.
    """
    corpus_reader = PatentCorpus()
    corpus_reader.mode = 'regression'
    # Hack: materialise the corpus once purely to invoke its __iter__();
    # presumably that is what fills in the attributes read below - confirm
    # against PatentCorpus.
    list(corpus_reader)
    # Other attributes (pat_ids, combis, binary_label_pairs,
    # human_label_pairs) are not needed here.
    texts_by_patent = corpus_reader.single_pat_corpus

    # Build the dict of feature vectors.
    transform_settings = dict(identify_bigrams=False,
                              norm=None,
                              weight=True,
                              renorm='length')
    features_by_patent = FeatureTransform(
        **transform_settings).texts2features(texts_by_patent)

    np.save('../corpus/patcorpus_claims.npy', texts_by_patent)
    np.save('../corpus/patfeats_claims.npy', features_by_patent)
 '''
 ## baseline: cosine similarity calculation with idf weights 
 print "baseline: idf weights"
 # make features
 ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
 patfeats_org = ft.texts2features(pat_corpus)
 
 # save the idf weights
 Dw_all['idf'] = deepcopy(ft.Dw)
 np.save('full_patent_scores/corpus_info_for_regression/Dw_all.npy', Dw_all)
 '''
 ## our case: weights learned by regression
 # transform into very basic features, i.e. w/o idf weights
 print "making patent pair features"
 ft = FeatureTransform(identify_bigrams=False,
                       norm=None,
                       weight=False,
                       renorm=None)
 # transform into pair features + baseline cosine labels
 patfeats = ft.texts2features(pat_corpus)
 # make pairwise feature matrix
 print "making feature matrix"
 patfeats_pairs = {}
 for combi in combis:
     target_id, pid = combi.split('_')
     patfeats_pairs[target_id + '_' + pid] = norm_dict(
         pointwise_dict_multiply(patfeats[target_id], patfeats[pid]),
         'length')
 featmat, featurenames = features2mat(patfeats_pairs, combis)
 '''
 print "performing regression"
 # perform logistic regression
import numpy as np
import cPickle as pkl
from word2vec_app import train_word2vec, embed_features
from corpus_utils import make_w2v_corpus
from plot_utils import calc_simcoef_distr, calc_auc, plot_score_distr, make_combis
from nlputils.preprocessing import FeatureTransform


#corpus = np.load('../corpus/corpus.npy').item()
# Build word2vec-embedded patent features and load the id sets used for the
# evaluation.
for embed_dim in [200]:
    # NOTE(review): embed_dim is not referenced in the loop body; the model
    # path below hard-codes 200 - confirm before adding further dimensions.
    corpus = np.load('../corpus/corpus.npy').item()
    # Alternative to loading the stored model (kept for reference):
    #pat_corpus = make_w2v_corpus()
    #model = train_word2vec(pat_corpus, 'full_patent_corpus', seed=1, embed_dim=200)
    # Open the pickle in binary mode and close the handle deterministically.
    # SECURITY: unpickling can execute arbitrary code - only load model files
    # from a trusted source.
    with open('human_eval/models/full_patent_corpus_sg_200_hs0_neg13_seed1.model',
              'rb') as model_file:
        model = pkl.load(model_file)
    pat_ids = corpus.keys()
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight='tfidf', renorm='length')
    patfeats = ft.texts2features(corpus)
    featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids)
    # Row i of the embedded matrix is paired with the i-th id of pat_ids;
    # turn each row back into a {feature name: value} dict.
    patfeats_w2v = {}
    for i, pid in enumerate(pat_ids):
        patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i, :]))
    #np.save('../corpus/patfeats_w2v.npy', patfeats_w2v)


    #patfeats = np.load('../corpus/patfeats_w2v.npy').item()
    # From here on the embedded features replace the tf-idf features.
    patfeats = patfeats_w2v
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
Example #5
0
# Weight post-processing variants, in the order they are evaluated.
_WEIGHT_TYPES = ('idf_weights', 'idf_weights_sqrt', 'idf_weights_zeroed',
                 'idf_weights_zeroed_sqrt')


def _make_weight_variants(featurenames, coefs):
    """Turn fitted regression coefficients into the four word-weight dicts.

    The normalised coefficients are stored as ``'idf_weights'``; the other
    entries are produced from them by ``postprocess_weights``.
    """
    base = norm_dict(dict(zip(featurenames, coefs)))
    variants = {'idf_weights': base}
    variants['idf_weights_zeroed'] = postprocess_weights(
        base, zero=True, sqrt=False)
    # The sqrt variant has to request sqrt=True; it used to be built with
    # sqrt=False, i.e. without the square root its key promises.
    variants['idf_weights_sqrt'] = postprocess_weights(
        base, zero=False, sqrt=True)
    variants['idf_weights_zeroed_sqrt'] = postprocess_weights(
        base, zero=True, sqrt=True)
    return variants


def _record_test_aucs(weight_variants, single_pat_corpus, testids, y_true,
                      label, alpha, auc_lists):
    """Score the test pairs with every weight variant and append the AUCs.

    ``auc_lists`` maps each weight type to the list that collects the AUC
    values of one (label, alpha) setting; it is extended in place.
    """
    for wtype in _WEIGHT_TYPES:
        # Build features whose word weights are the learned weights.
        ft = FeatureTransform(identify_bigrams=False,
                              norm=None,
                              weight=True,
                              renorm='length')
        ft.Dw = weight_variants[wtype]
        patfeats_idf = ft.texts2features(single_pat_corpus)

        for simcoef in ['linear']:
            y_pred = [
                compute_sim(patfeats_idf[combi[0]], patfeats_idf[combi[1]],
                            simcoef) for combi in testids
            ]
            fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label=1)
            auc_val = auc(fpr, tpr)
            print("%s, alpha: %.5f, AUC: %.4f" % (label, alpha, auc_val))
            auc_lists[wtype].append(auc_val)


def model_selection(combis, patfeats_pairs, single_pat_corpus,
                    binary_label_pairs, human_label_pairs):
    """Select the Lasso ``alpha`` and weight post-processing by test-set AUC.

    Five random 70/30 train/test splits of ``combis`` are drawn.  On each
    split and for each alpha in ``np.arange(10) / 100000.`` a Lasso
    regression is fitted on the pairwise training features, once against the
    binary (cited / not cited) labels and once against the human labels.  The
    fitted coefficients are used as word weights (in four post-processed
    variants), the test pairs are scored with the ``'linear'`` similarity and
    the resulting AUC is recorded.

    Args:
        combis: sequence of ``(target_id, pid)`` pairs.
        patfeats_pairs: dict mapping ``(target_id, pid)`` to the pair's
            feature dict.
        single_pat_corpus: corpus handed to ``FeatureTransform.texts2features``;
            the result is indexed by the individual patent ids.
        binary_label_pairs: dict mapping ``(target_id, pid)`` to the binary
            label.
        human_label_pairs: dict mapping ``(target_id, pid)`` to the human
            score; scores ``>= 0.5`` count as positive for the AUC.

    Side effects:
        Prints progress and saves the result dict
        ``param_auc_dict[label]['%.5f' % alpha][wtype] -> list of AUCs``
        (``label`` in ``'cited'``/``'human'``) to
        ``human_eval/regression/param_auc_dict.npy``.  Returns nothing.
    """
    # NOTE: the first alpha is 0.0, i.e. an unregularised fit.
    alphas = np.arange(10) / 100000.
    param_auc_dict = {}
    for label in ('cited', 'human'):
        param_auc_dict[label] = {}
        for alpha in alphas:
            param_auc_dict[label]['%.5f' % alpha] = dict(
                (wtype, []) for wtype in _WEIGHT_TYPES)

    n_train = int(np.ceil(len(combis) * 0.7))

    ## model selection
    for n in range(5):
        print("testing for the %ith time" % n)
        # train/test split
        combis_perm = np.random.permutation(combis)
        trainids = combis_perm[:n_train]
        testids = combis_perm[n_train:]

        # transform into feature matrix (number of pairs) x (bow-dim)
        patfeats_pairs_train = {}
        for target_id, pid in trainids:
            patfeats_pairs_train[(target_id, pid)] = patfeats_pairs[(target_id,
                                                                     pid)]
        train_pair_ids = list(patfeats_pairs_train)
        print("make feature matrix train")
        featmat_train, featurenames = features2mat(patfeats_pairs_train,
                                                   train_pair_ids)

        # same for test set
        patfeats_pairs_test = {}
        for target_id, pid in testids:
            patfeats_pairs_test[(target_id, pid)] = patfeats_pairs[(target_id,
                                                                    pid)]
        test_pair_ids = list(patfeats_pairs_test)
        print("make feature matrix test")
        # NOTE(review): the test matrix itself is never used below; the call
        # is kept because it rebinds featurenames - confirm it can be dropped.
        _, featurenames = features2mat(patfeats_pairs_test, test_pair_ids,
                                       featurenames)

        # training targets and test ground truth; both depend only on the
        # split, so they are computed once per split
        y_binary_train = [binary_label_pairs[tid] for tid in train_pair_ids]
        y_human_train = [human_label_pairs[tid] for tid in train_pair_ids]
        y_true_cited = [
            binary_label_pairs[(combi[0], combi[1])] for combi in testids
        ]
        y_true_human = [
            int(human_label_pairs[(combi[0], combi[1])] >= 0.5)
            for combi in testids
        ]

        # (result key, name used in the log line, training targets, truth)
        settings = (
            ('cited', 'binary', y_binary_train, y_true_cited),
            ('human', 'human', y_human_train, y_true_human),
        )

        for alpha in alphas:
            for label, scoring, y_train, y_true in settings:
                print("perform regression for %s scoring" % scoring)
                clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
                clf.fit(featmat_train, y_train)
                # the fitted coefficients are now our word weights
                weights = _make_weight_variants(featurenames, clf.coef_)
                # calculate AUC values on the test set for all variants
                _record_test_aucs(weights, single_pat_corpus, testids, y_true,
                                  label, alpha,
                                  param_auc_dict[label]['%.5f' % alpha])
    np.save('human_eval/regression/param_auc_dict.npy', param_auc_dict)
Example #6
0
 combis = np.load('human_eval/corpus_info/combis.npy')
 single_pat_corpus = np.load(
     'human_eval/corpus_info/single_pat_corpus.npy').item()
 binary_label_pairs = np.load(
     'human_eval/corpus_info/binary_label_pairs.npy').item()
 human_label_pairs = np.load(
     'human_eval/corpus_info/human_label_pairs.npy').item()
 for weight in [True, False]:
     weighting = 'None'
     if weight:
         weighting = 'tfidf'
     for norm in ['binary', None]:
         for renorm in ['max', 'length']:
             # make features
             ft = FeatureTransform(identify_bigrams=False,
                                   norm=norm,
                                   weight=weight,
                                   renorm=renorm)
             pat_feats = ft.texts2features(single_pat_corpus)
             # compute scores and calculate AUC for all pairs in combis
             for simcoef in [
                     'linear', 'polynomial', 'sigmoidal', 'histint',
                     'gaussian', 'simpson', 'braun', 'kulczynski',
                     'jaccard', 'dice', 'otsuka', 'sokal', 'manhattan',
                     'sqeucl', 'minkowski', 'canberra', 'chisq',
                     'chebyshev', 'hellinger', 'jenshan'
             ]:
                 sim_scores = {}
                 for combi in combis:
                     target, pid = combi
                     sim_scores[(target, pid)] = compute_sim(
                         pat_feats[target], pat_feats[pid], simcoef)