# imports needed by this section (the scripts' full module headers are not shown here)
import collections
import numpy as np
import plot_utils
from nlputils.preprocessing import FeatureTransform


def evaluate_coefs(corpus, target_ids, cited_ids, random_ids, dupl_ids, dir_,
                   weights=[True], norms=[None], renorms=['length'],
                   simcoefs=['linear', 'jaccard']):
    auc_dict = collections.defaultdict(dict)
    for weight in weights:
        weighting = 'None'
        if weight:
            weighting = 'tfidf'
        auc_dict[weighting] = {}
        for norm in norms:
            auc_dict[weighting][str(norm)] = {}
            for renorm in renorms:
                auc_dict[weighting][str(norm)][str(renorm)] = {}
                # make features
                ft = FeatureTransform(identify_bigrams=False, norm=norm,
                                      weight=weight, renorm=renorm)
                pat_feats = ft.texts2features(corpus)
                # compute scores and calculate AUC
                for simcoef in simcoefs:
                    sim_scores, diff_scores, dupl_scores = calc_simcoef_distr_dict(
                        pat_feats, target_ids, cited_ids, random_ids, dupl_ids, simcoef)
                    fpr, tpr, auc_score = plot_utils.calc_auc(
                        sim_scores.values(), diff_scores.values())
                    auc_dict[weighting][str(norm)][str(renorm)][simcoef] = auc_score
                    np.save(dir_ + '/sim_scores/sim_scores_%s_%s_%s_%s.npy'
                            % (simcoef, str(norm), str(renorm), weighting), sim_scores)
                    np.save(dir_ + '/diff_scores/diff_scores_%s_%s_%s_%s.npy'
                            % (simcoef, str(norm), str(renorm), weighting), diff_scores)
                    np.save(dir_ + '/dupl_scores/dupl_scores_%s_%s_%s_%s.npy'
                            % (simcoef, str(norm), str(renorm), weighting), dupl_scores)
                    np.save(dir_ + '/fpr_tpr_rates/fpr_%s_%s_%s_%s.npy'
                            % (simcoef, str(norm), str(renorm), weighting), [fpr, tpr])
    np.save(dir_ + '/auc_dict.npy', auc_dict)
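
# NOTE: calc_simcoef_distr_dict is not defined in this section. The following is a
# minimal sketch inferred from the call site above (an assumption, not the repo's
# actual implementation): it returns three dicts of pairwise similarity scores for
# target-cited, target-random and target-duplicate patent pairs, keyed by id pair.
from nlputils.simcoefs import compute_sim  # assumed import path for nlputils' compute_sim

def calc_simcoef_distr_dict(pat_feats, target_ids, cited_ids, random_ids,
                            dupl_ids, simcoef):
    sim_scores, diff_scores, dupl_scores = {}, {}, {}
    for tid in target_ids:
        # cited_ids and dupl_ids map each target id to a list of related ids
        for cid in cited_ids[tid]:
            sim_scores[(tid, cid)] = compute_sim(pat_feats[tid], pat_feats[cid], simcoef)
        for rid in random_ids:
            diff_scores[(tid, rid)] = compute_sim(pat_feats[tid], pat_feats[rid], simcoef)
        for did in dupl_ids.get(tid, []):
            dupl_scores[(tid, did)] = compute_sim(pat_feats[tid], pat_feats[did], simcoef)
    return sim_scores, diff_scores, dupl_scores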
def make_section_patfeats():
    ## Sanity Check: make the plots by calculating the values again
    # get patent corpus
    pat_corpus = PatentCorpus()
    pat_corpus.mode = 'regression'
    # ugly hack to invoke the __iter__() function :-( :
    list(pat_corpus)
    # get relevant information
    #pat_ids = pat_corpus.pat_ids
    #combis = pat_corpus.combis
    #binary_label_pairs = pat_corpus.binary_label_pairs
    #human_label_pairs = pat_corpus.human_label_pairs
    single_pat_corpus = pat_corpus.single_pat_corpus
    # make dict of feature vectors
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True,
                          renorm='length')
    patfeats = ft.texts2features(single_pat_corpus)
    np.save('../corpus/patcorpus_claims.npy', single_pat_corpus)
    np.save('../corpus/patfeats_claims.npy', patfeats)
'''
## baseline: cosine similarity calculation with idf weights
print "baseline: idf weights"
# make features
ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
patfeats_org = ft.texts2features(pat_corpus)
# save the idf weights
Dw_all['idf'] = deepcopy(ft.Dw)
np.save('full_patent_scores/corpus_info_for_regression/Dw_all.npy', Dw_all)
'''
## our case: weights learned by regression
# transform into very basic features, i.e. w/o idf weights
print "making patent pair features"
ft = FeatureTransform(identify_bigrams=False, norm=None, weight=False, renorm=None)
# transform into pair features + baseline cosine labels
patfeats = ft.texts2features(pat_corpus)
# make pairwise feature matrix
print "making feature matrix"
patfeats_pairs = {}
for combi in combis:
    target_id, pid = combi.split('_')
    patfeats_pairs[target_id + '_' + pid] = norm_dict(
        pointwise_dict_multiply(patfeats[target_id], patfeats[pid]), 'length')
featmat, featurenames = features2mat(patfeats_pairs, combis)
'''
print "performing regression"
# perform logistic regression
'''
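
# NOTE: pointwise_dict_multiply and norm_dict come from nlputils; these are
# hypothetical minimal versions matching how they are called above, assuming
# bag-of-words feature dicts of the form {term: value}.
import math

def pointwise_dict_multiply(d1, d2):
    # elementwise product on the shared vocabulary of both feature dicts
    return {term: d1[term] * d2[term] for term in set(d1) & set(d2)}

def norm_dict(d, norm='length'):
    # 'length': divide by the Euclidean (L2) norm so dot products become cosine sims
    if norm == 'length':
        z = math.sqrt(sum(v ** 2 for v in d.values()))
        return {term: v / z for term, v in d.items()} if z else d
    return d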
import numpy as np
import cPickle as pkl
from word2vec_app import train_word2vec, embed_features
from corpus_utils import make_w2v_corpus
from plot_utils import calc_simcoef_distr, calc_auc, plot_score_distr, make_combis
from nlputils.preprocessing import FeatureTransform

#corpus = np.load('../corpus/corpus.npy').item()
for embed_dim in [200]:
    corpus = np.load('../corpus/corpus.npy').item()
    #pat_corpus = make_w2v_corpus()
    #model = train_word2vec(pat_corpus, 'full_patent_corpus', seed=1, embed_dim=200)
    model = pkl.load(open('human_eval/models/full_patent_corpus_sg_200_hs0_neg13_seed1.model'))
    pat_ids = corpus.keys()
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight='tfidf',
                          renorm='length')
    patfeats = ft.texts2features(corpus)
    featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids)
    patfeats_w2v = {}
    for i, pid in enumerate(pat_ids):
        patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i, :]))
    #np.save('../corpus/patfeats_w2v.npy', patfeats_w2v)
    #patfeats = np.load('../corpus/patfeats_w2v.npy').item()
    patfeats = patfeats_w2v
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
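
# NOTE (assumption): embed_features comes from word2vec_app and is not shown in
# this section. Conceptually it embeds each patent as the weighted average of the
# word2vec vectors of its terms, weighted by the tf-idf feature values; the sketch
# below is a rough illustration under that assumption (using gensim-style model
# attributes), not the repo's actual implementation.
def embed_features_sketch(model, patfeats, pat_ids):
    embed_dim = model.vector_size  # gensim Word2Vec attribute
    featmat = np.zeros((len(pat_ids), embed_dim))
    for i, pid in enumerate(pat_ids):
        total = 0.
        for word, w in patfeats[pid].items():
            if word in model:  # skip out-of-vocabulary terms
                featmat[i] += w * model[word]
                total += abs(w)
        if total:
            featmat[i] /= total
    featurenames = range(embed_dim)  # embedding dimensions serve as feature names
    return featmat, featurenames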
def model_selection(combis, patfeats_pairs, single_pat_corpus,
                    binary_label_pairs, human_label_pairs):
    alphas = np.arange(10) / 100000.
    param_auc_dict = {}
    param_auc_dict['cited'] = {}
    param_auc_dict['human'] = {}
    for alpha in alphas:
        param_auc_dict['cited']['%.5f' % alpha] = {}
        param_auc_dict['human']['%.5f' % alpha] = {}
        for wtype in ['idf_weights', 'idf_weights_sqrt',
                      'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
            param_auc_dict['cited']['%.5f' % alpha][wtype] = []
            param_auc_dict['human']['%.5f' % alpha][wtype] = []
    ## model selection
    for n in range(5):
        print "testing for the %ith time" % n
        # train/test split: 70% of the pairs for training, 30% for testing
        combis_perm = np.random.permutation(combis)
        trainids = combis_perm[:int(np.ceil(len(combis) * 0.7))]
        testids = combis_perm[int(np.ceil(len(combis) * 0.7)):]
        patfeats_pairs_train = {}
        for combi in trainids:
            target_id, pid = combi
            patfeats_pairs_train[(target_id, pid)] = patfeats_pairs[(target_id, pid)]
        train_pair_ids = patfeats_pairs_train.keys()
        # transform into feature matrix: (number of pairs) x (bow-dim)
        print "make feature matrix train"
        featmat_train, featurenames = features2mat(patfeats_pairs_train,
                                                   train_pair_ids)
        # same for the test set (reusing the training feature names)
        patfeats_pairs_test = {}
        for combi in testids:
            target_id, pid = combi
            patfeats_pairs_test[(target_id, pid)] = patfeats_pairs[(target_id, pid)]
        test_pair_ids = patfeats_pairs_test.keys()
        print "make feature matrix test"
        featmat_test, featurenames = features2mat(patfeats_pairs_test,
                                                  test_pair_ids, featurenames)
        # get the corresponding label vectors
        y_human_train = [human_label_pairs[tid] for tid in train_pair_ids]
        y_human_test = [human_label_pairs[tid] for tid in test_pair_ids]
        y_binary_train = [binary_label_pairs[tid] for tid in train_pair_ids]
        y_binary_test = [binary_label_pairs[tid] for tid in test_pair_ids]
        for alpha in alphas:
            # perform the linear regression for binary (cited/not cited) labels
            print "perform regression for binary scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_binary_train)
            ## calculate AUC values: the fitted coefficients are now our word
            ## weights; evaluate all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            # fix: the sqrt variant must be computed with sqrt=True (was sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=True)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)
            # multiply patfeats with the learned weights
            for wtype in ['idf_weights', 'idf_weights_sqrt',
                          'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
                ft = FeatureTransform(identify_bigrams=False, norm=None,
                                      weight=True, renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)
                # calculate AUC for cited/not cited on the test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(binary_label_pairs[(combi[0], combi[1])])
                        y_pred.append(compute_sim(patfeats_idf[combi[0]],
                                                  patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "cited, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['cited']['%.5f' % alpha][wtype].append(auc_val)
            # perform the linear regression for human similarity labels
            print "perform regression for human scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_human_train)
            ## calculate AUC values: the fitted coefficients are now our word
            ## weights; evaluate all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            # fix: the sqrt variant must be computed with sqrt=True (was sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=True)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)
            # multiply patfeats with the learned weights
            for wtype in ['idf_weights', 'idf_weights_sqrt',
                          'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
                ft = FeatureTransform(identify_bigrams=False, norm=None,
                                      weight=True, renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)
                # calculate AUC for relevant/not relevant (human score >= 0.5)
                # on the test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(int(human_label_pairs[(combi[0], combi[1])] >= 0.5))
                        y_pred.append(compute_sim(patfeats_idf[combi[0]],
                                                  patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "human, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['human']['%.5f' % alpha][wtype].append(auc_val)
    np.save('human_eval/regression/param_auc_dict.npy', param_auc_dict)
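
# NOTE: postprocess_weights is not defined in this section; this is a minimal
# sketch matching the call signature above (an assumption about its behavior):
# zero=True clips negative Lasso coefficients to zero, sqrt=True dampens the
# weight magnitudes via a signed square root.
import math

def postprocess_weights(weights, zero=True, sqrt=False):
    w = {}
    for term, v in weights.items():
        if zero and v < 0.:
            v = 0.
        if sqrt:
            v = math.copysign(math.sqrt(abs(v)), v)
        w[term] = v
    return w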
if __name__ == "__main__":
    pat_corpus = PatentCorpus()
    pat_corpus.mode = 'regression'
    # ugly hack to invoke the __iter__() function :-( :
    list(pat_corpus)
    pat_ids = pat_corpus.pat_ids
    combis = pat_corpus.combis
    binary_label_pairs = pat_corpus.binary_label_pairs
    human_label_pairs = pat_corpus.human_label_pairs
    single_pat_corpus = pat_corpus.single_pat_corpus
    ## baseline: cosine similarity calculation with idf weights
    print "baseline: idf weights"
    # make features
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True,
                          renorm='length')
    patfeats_org = ft.texts2features(single_pat_corpus)
    # save corpus and ids for training word2vec
    np.save('human_eval/corpus_info/pat_ids.npy', pat_ids)
    np.save('human_eval/corpus_info/combis.npy', combis)
    np.save('human_eval/corpus_info/binary_label_pairs.npy', binary_label_pairs)
    np.save('human_eval/corpus_info/human_label_pairs.npy', human_label_pairs)
    np.save('human_eval/corpus_info/patfeats_human_eval.npy', patfeats_org)
    np.save('human_eval/corpus_info/single_pat_corpus.npy', single_pat_corpus)
    # save the idf weights
    Dw_all = {}
    Dw_all['idf'] = deepcopy(ft.Dw)
    ## our case: weights learned by regression
combis = np.load('human_eval/corpus_info/combis.npy')
single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
for weight in [True, False]:
    weighting = 'None'
    if weight:
        weighting = 'tfidf'
    for norm in ['binary', None]:
        for renorm in ['max', 'length']:
            # make features
            ft = FeatureTransform(identify_bigrams=False, norm=norm,
                                  weight=weight, renorm=renorm)
            pat_feats = ft.texts2features(single_pat_corpus)
            # compute scores and calculate AUC for all pairs in combis
            for simcoef in ['linear', 'polynomial', 'sigmoidal', 'histint',
                            'gaussian', 'simpson', 'braun', 'kulczynski',
                            'jaccard', 'dice', 'otsuka', 'sokal', 'manhattan',
                            'sqeucl', 'minkowski', 'canberra', 'chisq',
                            'chebyshev', 'hellinger', 'jenshan']:
                sim_scores = {}
                for combi in combis:
                    target, pid = combi
                    sim_scores[(target, pid)] = compute_sim(
                        pat_feats[target], pat_feats[pid], simcoef)
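
# NOTE: compute_sim is provided by nlputils; as a reference, here is a minimal
# sketch of two of the coefficients used above ('linear' is the dot product,
# which equals the cosine similarity on length-normalized features; 'jaccard'
# is the set overlap of the two vocabularies). The remaining coefficients are
# not reproduced here.
def compute_sim_sketch(d1, d2, simcoef='linear'):
    if simcoef == 'linear':
        return sum(d1[t] * d2[t] for t in set(d1) & set(d2))
    if simcoef == 'jaccard':
        union = len(set(d1) | set(d2))
        return len(set(d1) & set(d2)) / float(union) if union else 0.
    raise NotImplementedError('only linear and jaccard are sketched here')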