def read_in_csv_data_2(one_2, one_met, one_stemmed, two_2, two_met, two_stemmed):
    source_one = (get_file_base() + 'extracted_features/' + one_2 + '_'
                  + one_met + '_' + one_stemmed + '.csv')
    source_two = (get_file_base() + 'extracted_features/' + two_2 + '_'
                  + two_met + '_' + two_stemmed + '.csv')
    data_one = pandas.read_csv(source_one, sep=',')
    data_two = pandas.read_csv(source_two, sep=',')
    # keep only one copy of the class column
    data_two.drop(['class'], axis=1, inplace=True)
    # prefix the first feature set so column names cannot collide after concat
    columns = data_one.columns
    for col in range(1, len(columns)):
        columns.values[col] = 'one_' + str(columns[col])
    data = pandas.concat([data_one, data_two], axis=1, sort=False)
    # shuffle rows deterministically
    data = data.sample(frac=1, random_state=random_state)
    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)
    return data, labels
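# A minimal, hypothetical invocation of the loader above; it assumes the
# two feature files (here lda_cos_stemmed.csv and tfidf_cos_stemmed.csv)
# were generated by make_features beforehand:
# data, labels = read_in_csv_data_2('lda', 'cos', 'stemmed',
#                                   'tfidf', 'cos', 'stemmed')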
def make_year():
    # read in
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    executor = ThreadPoolExecutor(max_workers=64)
    # submit every job before resolving any result; calling result()
    # directly after submit() would serialize the work
    futures = {}
    for ds_ct in range(len(data_set)):
        for p in range(len(data_set[ds_ct][0])):
            futures[data_set[ds_ct][3] + str(p)] = executor.submit(
                do_difference, data_set, ds_ct, p)
    completed_vecs = {key: fut.result() for key, fut in futures.items()}

    # year features do not depend on stemming, so the same values are
    # written for both variants
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_unstemmed.csv',
        completed_vecs)
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_stemmed.csv',
        completed_vecs)
def make_tfidf(use_stemming):
    # read in
    if use_stemming:
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
    else:
        with open(get_survey_u(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_u(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_u(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp,
                                 use_stemming)

    out_name = ('tfidf_data/tfidf_' + ('un' if not use_stemming else '')
                + 'stemmed.sav')
    with open(get_file_base() + out_name, 'wb') as output:
        joblib.dump(data_set, output)
    print_words(words, use_stemming)
def read_in_csv_data_5(one_stemmed, two_stemmed, one_met, two_met, three_met, four_met):
    source_one = (get_file_base() + 'extracted_features/tfidf_' + one_met
                  + '_' + one_stemmed + '.csv')
    source_two = (get_file_base() + 'extracted_features/lda_' + two_met
                  + '_' + two_stemmed + '.csv')
    source_three = (get_file_base() + 'extracted_features/d2v_' + three_met
                    + '_unstemmed.csv')
    source_four = (get_file_base() + 'extracted_features/bert_' + four_met
                   + '_unstemmed.csv')
    source_five = get_file_base() + 'extracted_features/years_dist_unstemmed.csv'

    data_one = pandas.read_csv(source_one, sep=',')
    data_two = pandas.read_csv(source_two, sep=',')
    data_three = pandas.read_csv(source_three, sep=',')
    data_four = pandas.read_csv(source_four, sep=',')
    data_five = pandas.read_csv(source_five, sep=',')

    # keep only one copy of the class column
    for frame in (data_two, data_three, data_four, data_five):
        frame.drop(['class'], axis=1, inplace=True)

    # prefix columns per source so names cannot collide after concat;
    # the year features (data_five) keep their original names
    for frame, prefix in ((data_one, 'one_'), (data_two, 'two_'),
                          (data_three, 'three_'), (data_four, 'four_')):
        columns = frame.columns
        for col in range(1, len(columns)):
            columns.values[col] = prefix + str(columns[col])

    data = pandas.concat([data_one, data_two, data_three, data_four, data_five],
                         axis=1, sort=False)
    data = data.sample(frac=1, random_state=random_state)
    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)
    return data, labels
def print_words(words, use_stemming):
    out_name = ('tfidf_data/words_' + ('un' if not use_stemming else '')
                + 'stemmed.txt')
    with open(get_file_base() + out_name, 'w', encoding='utf8') as f:
        for word, w_id in words.items():
            f.write(str(w_id) + ' ' + word + '\n')
def main():
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    # calculate features on which the classification is going to be performed
    executor = ThreadPoolExecutor(max_workers=64)
    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    # submit every job before resolving any result; calling result()
    # directly after submit() would serialize the work
    futures = {}
    for ds_ct in range(len(data_set)):
        for p in range(len(data_set[ds_ct][0])):
            futures[data_set[ds_ct][3] + str(p)] = executor.submit(
                do_cosine, data_set, ds_ct, p)
    completed_vecs = {key: fut.result() for key, fut in futures.items()}

    write_to_file(
        get_file_base() + 'extracted_features/EVAL/d2v_cos_YEAR_'
        + less_than_or_more + '_' + str(year) + '_unstemmed.csv',
        completed_vecs)
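# The EVAL run above depends on two module-level settings; the values below
# are hypothetical, for illustration only. 'l' keeps publications up to and
# including `year`, 'm' keeps publications from `year` onward (compare
# task_year further below).
# less_than_or_more = 'l'
# year = 2005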
def sem_all_class(vec, dist, stem, do_her_sig, restrict_to_publication_time):
    # read in normal distance features for all publications
    data, labels = read_in_csv_data(
        get_file_base() + 'extracted_features/' + vec + '_' + dist + '_'
        + ('un' if not stem else '') + 'stemmed.csv')

    # drop insignificant features
    her_sig = None
    if do_her_sig:
        her_sig = data.drop(
            list(set(data.columns) - set(sig_feat_herrmannova)), axis=1)

    c = single_feature_classify_data(data, labels)
    print('SINGLE FEATURES')
    print(c)
    print('__________________________________')
    print('ALL FEATURES')
    if restrict_to_publication_time:
        data = restrict_features_to_publication_time(data)
    all_single_feature_classify_data(data, labels)

    if do_her_sig:
        print('__________________________________')
        print('ALL SIGNIFICANT FEATURES IN HER')
        # features that were significant in Herrmannova et al.
        all_single_feature_classify_data(her_sig, labels)

    # report feature importance for the estimators used in CV
    if access_importance:
        assess_importance(data, labels, "RF")
def classify(X, y, clf):
    # note: clf is currently unused; an ExtraTreesClassifier is always fitted
    y = y.astype('int')

    # rank features with an ExtraTrees importance estimate
    model = ExtraTreesClassifier()
    model.fit(X, y)
    print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.show()

    # get correlations of each feature in the dataset and plot a heat map
    corrmat = X.corr()
    top_corr_features = corrmat.index
    plt.figure(figsize=(20, 20))
    g = sns.heatmap(X[top_corr_features].corr(), annot=False, cmap="coolwarm")
    figure = g.get_figure()
    figure.savefig(get_file_base() + 'plots/importance_heatmap.png')

    # univariate feature selection via ANOVA F-scores
    bestfeatures = SelectKBest(score_func=f_classif, k=10)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    # concat the two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    print(featureScores.nlargest(10, 'Score'))  # print 10 best features
def main():
    # 0 = complete citation network, 1 = only references, 2 = only citations,
    # 3 = references and p, 4 = only p
    only_part = 4
    use_stemming = get_stem()
    wo_RS = False

    # read in normal distance features for all publications
    data, labels = read_in_csv_data(
        get_file_base() + 'extracted_features/OVR/years_unstemmed_OVR.csv')
    print('ALL FEATURES')

    third = -1
    if only_part == 1:
        data, third = restrict_data(data, 1)
    if only_part == 2:
        data, third = restrict_data(data, 2)
    if only_part == 3:
        data, third = restrict_data(data, 3)
        if wo_RS:
            data = drop_rs(data, third, use_stemming)
    if only_part == 4:
        data, third = restrict_data(data, 4)
    if only_part == 0 and wo_RS:
        data = drop_rs(data, third, use_stemming)

    all_single_feature_classify_data(data, labels)
def task_lda(use_stemming):
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    stem_tag = ('un' if not use_stemming else '') + 'stemmed'
    with open(get_file_base() + 'lda_data/sem_lda_' + stem_tag + '.json', 'r') as sem_file:
        sem = json.load(sem_file)
    with open(get_file_base() + 'lda_data/sur_lda_' + stem_tag + '.json', 'r') as sur_file:
        sur = json.load(sur_file)
    with open(get_file_base() + 'lda_data/uni_lda_' + stem_tag + '.json', 'r') as uni_file:
        uni = json.load(uni_file)

    # seminal
    unordered_seminal_p, unordered_seminal_x, unordered_seminal_y = \
        read_in_json_lda_data('seminal', sem)
    # survey
    unordered_survey_p, unordered_survey_x, unordered_survey_y = \
        read_in_json_lda_data('survey', sur)
    # uninfluential
    unordered_uninfluential_p, unordered_uninfluential_x, unordered_uninfluential_y = \
        read_in_json_lda_data('uninfluential', uni)

    seminal_hlp = seminal_hlp['seminal']
    survey_hlp = survey_hlp['survey']
    uninfluential_hlp = uninfluential_hlp['uninfluential']

    # match the publication ordering of the sur/sem/uni (un)stemmed data
    seminal_p, seminal_x, seminal_y = order_publications(
        unordered_seminal_p, unordered_seminal_x, unordered_seminal_y,
        seminal_hlp)
    survey_p, survey_x, survey_y = order_publications(
        unordered_survey_p, unordered_survey_x, unordered_survey_y,
        survey_hlp)
    uninfluential_p, uninfluential_x, uninfluential_y = order_publications(
        unordered_uninfluential_p, unordered_uninfluential_x,
        unordered_uninfluential_y, uninfluential_hlp)

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
def main():
    # read in normal distance features for all publications
    data = read_in_csv_data(get_file_base()
                            + 'extracted_features/d2v_cos_unstemmed.csv')
    s_sem, s_sur, s_uni = find_equal()
    data, labels = prepare_data(data, s_sem, s_sur, s_uni)
    for model_id in model_config:
        all_single_feature_classify_data(data, labels, model_id)
def main():
    with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'rb') as f:
        data_set = pickle.load(f)
    robust_data_set = generate_robust_ds(data_set)

    executor = ThreadPoolExecutor(max_workers=64)
    # submit every job before resolving any result; calling result()
    # directly after submit() would serialize the work
    futures = {}
    for ds_ct in range(len(data_set)):
        for p in range(len(data_set[ds_ct][0])):
            futures[data_set[ds_ct][3] + str(p)] = executor.submit(
                do_cosine, robust_data_set, ds_ct, p)
    completed_robust_vecs = {key: fut.result() for key, fut in futures.items()}

    write_to_file(get_file_base() + 'extracted_features/robustness.csv',
                  completed_robust_vecs)
def make_d2v():
    # read in
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp)
    with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'wb') as output:
        pickle.dump(data_set, output)
def build_lda_model(stem):
    corpus = []
    ps = PorterStemmer()
    number_of_topics = 100

    # read in data from publications
    with open(get_lda_base(), 'r') as f:
        for line in f:
            if stem:
                stemmed = []
                for w in line.split():
                    s = ps.stem(w)
                    if len(s) > 1:
                        stemmed.append(s)
                corpus.append(stemmed)
            else:
                corpus.append(line.split())

    # build vocabulary and transform texts into bag-of-words format
    dictionary = Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # do lda
    lda = ldamodel.LdaModel(corpus=corpus, num_topics=number_of_topics,
                            passes=20, id2word=dictionary,
                            minimum_probability=0)

    # note: datapath() resolves inside gensim's test-data directory, while the
    # dictionary is saved under lda_data/; loading code must use matching paths
    if stem:
        temp_file = datapath('lda_model_stemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_stemmed')
    else:
        temp_file = datapath('lda_model_unstemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_unstemmed')
    lda.save(temp_file)
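# A minimal sketch of loading the artifacts saved above for inference. Since
# the model is saved via datapath(), this sketch loads it through the same
# helper; the example abstract is an assumption for illustration.
# from gensim.test.utils import datapath
# from gensim.corpora import Dictionary
# from gensim.models import ldamodel
#
# lda = ldamodel.LdaModel.load(datapath('lda_model_unstemmed'))
# dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')
# topics = lda[dictionary.doc2bow('an example abstract'.split())]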
def main():
    data = None
    labels = None
    if comb == 2:
        data, labels = read_in_csv_data_2(
            get_file_base() + 'extracted_features/OVR/lda_stemmed_OVR.csv',
            get_file_base() + 'extracted_features/OVR/tfidf_stemmed_OVR.csv')
    if comb == 3:
        data, labels = read_in_csv_data_3(
            get_file_base() + 'extracted_features/tfidf_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/bert_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/lda_was_unstemmed.csv')
    if comb == 4:
        data, labels = read_in_csv_data_4(
            get_file_base() + 'extracted_features/tfidf_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/bert_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/lda_was_unstemmed.csv',
            get_file_base() + 'extracted_features/years.csv')
    print('ALL FEATURES')
    all_single_feature_classify_data(data, labels)
def sem_one_class(vec, dist, classifier, stem, single, searched_feat,
                  restrict_to_publication_time):
    # read in normal distance features for all publications
    # data, labels = read_in_csv_data(get_file_base() + 'extracted_features/OVR/lda_unstemmed_OVR.csv')
    data, labels = read_in_csv_data(
        get_file_base() + 'extracted_features/' + vec + '_' + dist + '_'
        + ('un' if not stem else '') + 'stemmed.csv')
    if single:
        print('SINGLE FEATURE')
        single_feature_classify_data(data, labels, classifier, searched_feat)
    else:
        if restrict_to_publication_time:
            data = restrict_features_to_publication_time(data)
        print('ALL FEATURES')
        all_single_feature_classify_data(data, labels, classifier)
data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True,
                   autosize=False,
                   width=800,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Years', showgrid=False),
                   yaxis=dict(title='Number of publications', showgrid=True,
                              gridcolor='#E2E2E2'),
                   legend=dict(x=0.01, y=1,
                               font=dict(family='sans-serif', size=12,
                                         color='#000'),
                               bgcolor='#E2E2E2',
                               bordercolor='#FFFFFF',
                               borderwidth=2),
                   paper_bgcolor='#FFFFFF',
                   plot_bgcolor='#FFFFFF')
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/sem', image='jpeg')
print('mean year sem : ' + str(np.mean(sem_p)))
print('mean year sur : ' + str(np.mean(sur_p)))
print('mean year uni : ' + str(np.mean(uni_p)))
                   marker=dict(color='green'),
                   name='uninfluential citations')
data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True,
                   autosize=False,
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Years', showgrid=False),
                   yaxis=dict(title='Number of citations', showgrid=True,
                              gridcolor='#E2E2E2'),
                   legend=dict(x=0.01, y=1,
                               font=dict(family='sans-serif', size=12,
                                         color='#000'),
                               bgcolor='#E2E2E2',
                               bordercolor='#FFFFFF',
                               borderwidth=2),
                   paper_bgcolor='#FFFFFF',
                   plot_bgcolor='#FFFFFF')
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/citations', image='jpeg')
print(np.mean(sem_cit))
print(np.mean(sur_cit))
print(np.mean(uni_cit))
def make_bert():
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    def embed_group(publications):
        # embed the publication itself (P), its references (X) and its
        # citations (Y)
        group_p, group_x, group_y = {}, {}, {}
        for ct, p in enumerate(publications):
            group_p[ct] = do_bert(p['abs'], tokenizer)
            group_x[ct] = {i: do_bert(ref['abs'], tokenizer)
                           for i, ref in enumerate(p['ref'])}
            group_y[ct] = {i: do_bert(cit['abs'], tokenizer)
                           for i, cit in enumerate(p['cit'])}
        return group_p, group_x, group_y

    print('sem')
    seminal_p, seminal_x, seminal_y = embed_group(seminal_hlp['seminal'])
    write_to_file(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  seminal_p, seminal_x, seminal_y, 'seminal')

    print('sur')
    survey_p, survey_x, survey_y = embed_group(survey_hlp['survey'])
    write_to_file(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  survey_p, survey_x, survey_y, 'survey')

    print('uni')
    uninfluential_p, uninfluential_x, uninfluential_y = embed_group(
        uninfluential_hlp['uninfluential'])
    write_to_file(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  uninfluential_p, uninfluential_x, uninfluential_y,
                  'uninfluential')
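# do_bert is defined elsewhere in the repository. For reference, a minimal
# sketch of a compatible helper (an assumption, not the project's actual
# implementation): it assumes the Hugging Face `transformers` package and
# mean-pools the last hidden layer into a plain list.
# import torch
# from transformers import BertModel
#
# bert_model = BertModel.from_pretrained('bert-base-uncased')
#
# def do_bert_sketch(abstract, tokenizer):
#     # tokenize with truncation to BERT's 512-token limit
#     inputs = tokenizer(abstract, return_tensors='pt', truncation=True,
#                        max_length=512)
#     with torch.no_grad():
#         out = bert_model(**inputs)
#     # mean-pool token embeddings into one document vector
#     return out.last_hidden_state.mean(dim=1).squeeze().tolist()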
trace3 = go.Scatter(mode='markers',
                    x=cit,
                    y=ref,
                    marker=dict(color='green',
                                size=8,
                                opacity=0.3,
                                symbol='cross',
                                line=dict(color='green', width=2)),
                    showlegend=False,
                    name='Uninfluential')
data = [trace2, trace1, trace3]
layout = {'yaxis': dict(title='Number of references', type='log',
                        autorange=True, gridcolor='#E2E2E2'),
          'xaxis': dict(title='Number of citations', type='log',
                        autorange=True, gridcolor='#E2E2E2'),
          'width': 1200,
          'height': 500,
          'paper_bgcolor': '#FFFFFF',
          'plot_bgcolor': '#FFFFFF'}
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/' + title + '.pdf')
def task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp):
    # model 0 : dbow; model 1: dm
    doc2vec_model = Doc2Vec.load(get_file_base() + 'models/doc2vec_model_1')

    token_pattern = re.compile('[^a-zA-Z0-9]+')

    def tokenize(abstract, lower):
        # seminal abstracts only need whitespace splitting; survey and
        # uninfluential abstracts are split on non-alphanumeric characters
        # and lowercased
        if lower:
            return [w.lower() for w in token_pattern.split(abstract)]
        return abstract.split()

    def infer(tokens):
        # re-seed before every inference so vectors are deterministic
        doc2vec_model.random.seed(0)
        return doc2vec_model.infer_vector(tokens, alpha=0.025,
                                          steps=20).reshape(1, -1)

    def embed_group(publications, lower):
        group_p, group_x, group_y = {}, {}, {}
        for ct, p in enumerate(publications):
            group_p[ct] = infer(tokenize(p['abs'], lower))
            group_x[ct] = {i: infer(tokenize(ref['abs'], lower))
                           for i, ref in enumerate(p['ref'])}
            group_y[ct] = {i: infer(tokenize(cit['abs'], lower))
                           for i, cit in enumerate(p['cit'])}
        return group_p, group_x, group_y

    seminal_p, seminal_x, seminal_y = embed_group(seminal_hlp['seminal'],
                                                  lower=False)
    survey_p, survey_x, survey_y = embed_group(survey_hlp['survey'],
                                               lower=True)
    uninfluential_p, uninfluential_x, uninfluential_y = embed_group(
        uninfluential_hlp['uninfluential'], lower=True)

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
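# A small, self-contained check (hypothetical) of the determinism trick used
# above: infer_vector is stochastic, so re-seeding the model's RNG before
# each call makes the inferred vectors reproducible.
def _check_deterministic_inference():
    model = Doc2Vec.load(get_file_base() + 'models/doc2vec_model_1')
    tokens = 'an example abstract'.split()
    model.random.seed(0)
    v1 = model.infer_vector(tokens, alpha=0.025, steps=20)
    model.random.seed(0)
    v2 = model.infer_vector(tokens, alpha=0.025, steps=20)
    assert (v1 == v2).all()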
def task_year(survey_hlp, seminal_hlp, uninfluential_hlp):
    # model 0 : dbow; model 1: dm
    doc2vec_model = Doc2Vec.load(get_file_base() + 'models/doc2vec_model_1')

    def infer(abstract):
        # re-seed before every inference so vectors are deterministic
        doc2vec_model.random.seed(0)
        return doc2vec_model.infer_vector(abstract.split(), alpha=0.025,
                                          steps=20).reshape(1, -1)

    def embed_group(publications):
        # keep only publications on the selected side of the year threshold:
        # 'l' keeps years <= year, 'm' keeps years >= year
        group_p, group_x, group_y = {}, {}, {}
        curr_ct = 0
        for p in publications:
            if less_than_or_more == 'l' and p['year'] <= year:
                # for 'l', references and citations are filtered by year too
                refs = [ref for ref in p['ref'] if ref['year'] <= year]
                cits = [cit for cit in p['cit'] if cit['year'] <= year]
            elif less_than_or_more == 'm' and p['year'] >= year:
                # for 'm', all references and citations are kept regardless
                # of their year
                refs = p['ref']
                cits = p['cit']
            else:
                continue
            group_p[curr_ct] = infer(p['abs'])
            group_x[curr_ct] = {i: infer(ref['abs'])
                                for i, ref in enumerate(refs)}
            group_y[curr_ct] = {i: infer(cit['abs'])
                                for i, cit in enumerate(cits)}
            curr_ct += 1
        return group_p, group_x, group_y

    seminal_p, seminal_x, seminal_y = embed_group(seminal_hlp['seminal'])
    survey_p, survey_x, survey_y = embed_group(survey_hlp['survey'])
    uninfluential_p, uninfluential_x, uninfluential_y = embed_group(
        uninfluential_hlp['uninfluential'])

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
def make_features(metric, task, this_use_stemming):
    data_set = None
    if metric not in ['cos', 'jac', 'emd', 'ipd', 'dist']:
        print('Metric ' + metric + ' unknown.')
        return
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'year']:
        print('Task ' + task + ' unknown.')
        return
    print('Using ' + metric + ' on '
          + ('un' if not this_use_stemming else '') + 'stemmed '
          + task + ' vectors.')

    # the year distance is a special case with its own pipeline
    if metric == 'dist' and task == 'year':
        make_year()
        return

    if task == 'tfidf':
        with open(get_file_base() + 'tfidf_data/tfidf_'
                  + ('un' if not this_use_stemming else '') + 'stemmed.sav',
                  'rb') as f:
            data_set = joblib.load(f)
        # for p in range(0, len(data_set[2][0])):
        #     data_set[2][0][p] = [data_set[2][0][p]]
        # for p in range(0, len(data_set[2][1])):
        #     for x in range(0, len(data_set[2][1][p])):
        #         data_set[2][1][p][x] = [data_set[2][1][p][x]]
        # for p in range(0, len(data_set[2][2])):
        #     for x in range(0, len(data_set[2][2][p])):
        #         data_set[2][2][p][x] = [data_set[2][2][p][x]]
    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'rb') as f:
            data_set = pickle.load(f)
    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)
    if task == 'lda':
        data_set = task_lda(this_use_stemming)

    # map metric name to worker function
    workers = {'emd': do_wasserstein, 'cos': do_cosine, 'jac': do_jaccard,
               'ipd': do_component_wise_multiplication}

    executor = ThreadPoolExecutor(max_workers=64)
    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    # submit every job before resolving any result; calling result()
    # directly after submit() would serialize the work
    futures = {}
    for ds_ct in range(len(data_set)):
        for p in range(len(data_set[ds_ct][0])):
            futures[data_set[ds_ct][3] + str(p)] = executor.submit(
                workers[metric], data_set, ds_ct, p)
    completed_vecs = {key: fut.result() for key, fut in futures.items()}

    write_to_file(
        get_file_base() + 'extracted_features/' + task + '_' + metric + '_'
        + ('un' if not this_use_stemming else '') + 'stemmed.csv',
        completed_vecs)
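# A hypothetical driver for the function above, sweeping every supported
# metric/representation combination; in the repository itself, which
# combinations run is controlled via the config helpers imported from
# general.baseFileExtractor.
def make_all_features(this_use_stemming):
    for sweep_task in ['tfidf', 'd2v', 'bert', 'lda']:
        for sweep_metric in ['cos', 'jac', 'emd', 'ipd']:
            make_features(sweep_metric, sweep_task, this_use_stemming)
    # the year features use their dedicated metric/task pair
    make_features('dist', 'year', this_use_stemming)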
    for ref in p['ref']:
        uni_ref.append(ref['year'])
    for cit in p['cit']:
        uni_cit.append(cit['year'])

if seminal == 0:
    trace1 = go.Histogram(x=sem_p, opacity=1)
    trace2 = go.Histogram(x=sem_cit, opacity=0.5)
    trace3 = go.Histogram(x=sem_ref, opacity=0.5)
if seminal == 1:
    trace1 = go.Histogram(x=sur_p, opacity=1)
    trace2 = go.Histogram(x=sur_cit, opacity=0.5)
    trace3 = go.Histogram(x=sur_ref, opacity=0.5)
if seminal == 2:
    trace1 = go.Histogram(x=uni_p, opacity=1)
    trace2 = go.Histogram(x=uni_cit, opacity=0.5)
    trace3 = go.Histogram(x=uni_ref, opacity=0.5)

data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=False,
                   barmode='overlay',
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   yaxis=dict(range=[0, 6999]))
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/' + str(seminal), image='jpeg')
import seaborn as sns

from classify.ClassificationSEM import read_in_csv_data
from general.baseFileExtractor import get_file_base

# plot the feature correlation matrix as a heat map
data, labels = read_in_csv_data(get_file_base()
                                + 'extracted_features/tfidf_cos_unstemmed.csv')
ax = sns.heatmap(data.corr())
figure = ax.get_figure()
figure.savefig(get_file_base() + 'plots/heatmap.png')
from general.baseFileExtractor import get_file_base, get_seminal_u, get_survey_u, get_uninfluential_u

# read in
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
survey_hlp = survey_hlp['survey']
with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
seminal_hlp = seminal_hlp['seminal']
with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)
uninfluential_hlp = uninfluential_hlp['uninfluential']

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')

# topic distributions per abstract
sem = []
sur = []
uni = []
for p in seminal_hlp:
    sem.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in survey_hlp:
    sur.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in uninfluential_hlp:
    uni.append(lda[dictionary.doc2bow(p['abs'].split())])

fin_sem = []
fin_sur = []
from general.baseFileExtractor import get_seminal_s, get_survey_s, get_uninfluential_s, get_seminal_u, get_survey_u, \
    get_uninfluential_u, get_stem, get_file_base, get_what_to_do, get_which_vectors, get_which_distance, get_classifier
from generateData.TFIDFEmbedding import make_tfidf
from generateData.BuildD2VModel import build_d2v_model
from generateData.D2VEmbedding import make_d2v
from generateData.BERTEmbedding import make_bert
from generateData.BuildLDAModel import build_lda_model
from generateData.LDAEmbedding import make_lda
from computeFeatures.FeaturesFromEmbedding import make_features
from classify.ClassificationSEMallC import sem_all_class
from classify.ClassificationSEM import sem_one_class
import os

# build folder structure
for sub_dir in ('tfidf_data/', 'd2v_data/', 'bert_data/', 'lda_data/',
                'extracted_features/', 'extracted_features/OVR/', 'plots/'):
    if not os.path.exists(os.path.dirname(get_file_base() + sub_dir)):
        os.makedirs(os.path.dirname(get_file_base() + sub_dir))
from plotly.offline import plot
import plotly.graph_objs as go

from classify.Classification import read_in_csv_data_sem_sur_uni
from general.baseFileExtractor import get_file_base

vec = 'tfidf'
measure = 'cos'
stem = 'stemmed'
full_data, labels, sem, sur, uni = read_in_csv_data_sem_sur_uni(
    get_file_base() + 'extracted_features/tfidf_cos_stemmed.csv')

feature = 'sum'
group = 'A'
title = feature + group + ' ' + vec + ' ' + measure + ' ' + stem[:1]
sem = sem[feature + group]
sur = sur[feature + group]
uni = uni[feature + group]

trace1 = go.Box(x=sem, opacity=1, name='seminal', marker=dict(color='blue'))
trace2 = go.Box(x=sur, opacity=1, name='survey', marker=dict(color='orange'))
trace3 = go.Box(x=uni, opacity=1, name='uninfluential',
                marker=dict(color='green'))
layout = go.Layout(showlegend=False,
                   autosize=False,
                   width=800,
                   height=250,
                   xaxis_type='log',
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Value for ' + feature + group,
                              showgrid=True,
                              gridcolor='#E2E2E2'),
                   yaxis=dict(showgrid=False))
# the tail below is an assumption: it mirrors the sibling plotting scripts,
# which build the figure and write it under plots/
data = [trace1, trace2, trace3]
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/' + title, image='jpeg')
                   marker=dict(color='green'),
                   name='uninfluential references')
data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True,
                   autosize=False,
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Years', showgrid=False),
                   yaxis=dict(title='Number of references', showgrid=True,
                              gridcolor='#E2E2E2'),
                   legend=dict(x=0.01, y=1,
                               font=dict(family='sans-serif', size=12,
                                         color='#000'),
                               bgcolor='#E2E2E2',
                               bordercolor='#FFFFFF',
                               borderwidth=2),
                   paper_bgcolor='#FFFFFF',
                   plot_bgcolor='#FFFFFF')
fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/references', image='jpeg')
print(np.mean(sem_ref))
print(np.mean(sur_ref))
print(np.mean(uni_ref))
def main():
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'years']:
        print('Task ' + task + ' unknown.')
        return

    if task == 'tfidf':
        with open(get_file_base() + 'tfidf_data/tfidf_'
                  + ('un' if not use_stemming else '') + 'stemmed.sav',
                  'rb') as f:
            data_set = joblib.load(f)
        # todo: delete
        for p in range(0, len(data_set[2][0])):
            data_set[2][0][p] = [data_set[2][0][p]]
        for p in range(0, len(data_set[2][1])):
            for x in range(0, len(data_set[2][1][p])):
                data_set[2][1][p][x] = [data_set[2][1][p][x]]
        for p in range(0, len(data_set[2][2])):
            for x in range(0, len(data_set[2][2][p])):
                data_set[2][2][p][x] = [data_set[2][2][p][x]]
    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'rb') as f:
            data_set = pickle.load(f)
    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)
    if task == 'lda':
        data_set = task_lda(use_stemming)
    if task == 'years':
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
        data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)
        # wrap the raw values as 1x1 matrices so that do_one_doc_rep can
        # treat every representation uniformly
        for ds in range(0, 3):
            for p in range(0, len(data_set[ds][0])):
                data_set[ds][0][p] = [[data_set[ds][0][p]]]
            for p in range(0, len(data_set[ds][1])):
                for x in range(0, len(data_set[ds][1][p])):
                    data_set[ds][1][p][x] = [[data_set[ds][1][p][x]]]
            for p in range(0, len(data_set[ds][2])):
                for x in range(0, len(data_set[ds][2][p])):
                    data_set[ds][2][p][x] = [[data_set[ds][2][p][x]]]

    executor = ThreadPoolExecutor(max_workers=64)
    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    # submit every job before resolving any result; calling result()
    # directly after submit() would serialize the work
    futures = {}
    for ds_ct in range(len(data_set)):
        for p in range(len(data_set[ds_ct][0])):
            futures[data_set[ds_ct][3] + str(p)] = executor.submit(
                do_one_doc_rep, data_set, ds_ct, p)
    completed_vecs = {key: fut.result() for key, fut in futures.items()}

    write_to_file(
        get_file_base() + 'extracted_features/OVR/' + task + '_'
        + ('un' if not use_stemming else '') + 'stemmed_OVR.csv',
        completed_vecs)