def make_tfidf(use_stemming):
    # read in
    if use_stemming:
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
    else:
        with open(get_survey_u(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_u(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_u(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp, use_stemming)

    # persist the TF-IDF data set
    suffix = 'stemmed' if use_stemming else 'unstemmed'
    with open(get_file_base() + 'tfidf_data/tfidf_' + suffix + '.sav', 'wb') as output:
        joblib.dump(data_set, output)

    print_words(words, use_stemming)
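# A minimal sketch of how the dump above could be read back. load_tfidf is a
# hypothetical helper, not part of the repository; it assumes only that joblib
# is available and that make_tfidf(use_stemming) has already been run.
def load_tfidf(use_stemming):
    suffix = 'stemmed' if use_stemming else 'unstemmed'
    with open(get_file_base() + 'tfidf_data/tfidf_' + suffix + '.sav', 'rb') as f:
        return joblib.load(f)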
def main():
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    # calculate features on which the classification is going to be performed
    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    futures = {}
    with ThreadPoolExecutor(max_workers=64) as executor:
        # submit all cosine jobs before collecting any results; calling
        # result() immediately after submit() would serialize the pool
        for ds_ct in range(len(data_set)):
            for p in range(len(data_set[ds_ct][0])):
                futures[data_set[ds_ct][3] + str(p)] = executor.submit(do_cosine, data_set, ds_ct, p)
        completed_vecs = {key: future.result() for key, future in futures.items()}

    # less_than_or_more and year are expected to be defined at module level
    write_to_file(
        get_file_base() + 'extracted_features/EVAL/d2v_cos_YEAR_' + less_than_or_more + '_' + str(year)
        + '_unstemmed.csv',
        completed_vecs)
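# do_cosine is defined elsewhere in the repository (its real signature takes the
# data set plus indices). A minimal numpy sketch of the cosine similarity it
# presumably computes between two doc2vec vectors, purely illustrative:
import numpy as np

def cosine_similarity(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|); returns 0.0 for a zero vector
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0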
def find_equal():
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)['seminal']
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)['survey']
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)['uninfluential']

    # map abstract length (in tokens) -> list of entry ids, per class
    lengths_sem = {}
    for entry_id in range(len(seminal_hlp)):
        abs_length = len(seminal_hlp[entry_id]['abs'].split())
        lengths_sem.setdefault(abs_length, []).append(entry_id)

    lengths_sur = {}
    for entry_id in range(len(survey_hlp)):
        abs_length = len(survey_hlp[entry_id]['abs'].split())
        lengths_sur.setdefault(abs_length, []).append(entry_id)

    lengths_uni = {}
    for entry_id in range(len(uninfluential_hlp)):
        abs_length = len(uninfluential_hlp[entry_id]['abs'].split())
        lengths_uni.setdefault(abs_length, []).append(entry_id)

    # keep only abstract lengths that occur in all three classes
    found = [entry for entry in lengths_sem if entry in lengths_sur and entry in lengths_uni]

    # for each shared length, pair up as many entries as the smallest class provides
    sem_ids = []
    sur_ids = []
    uni_ids = []
    for entry in found:
        num = min(len(lengths_sem[entry]), len(lengths_sur[entry]), len(lengths_uni[entry]))
        for i in range(num):
            sem_ids.append(lengths_sem[entry][i])
            sur_ids.append(lengths_sur[entry][i])
            uni_ids.append(lengths_uni[entry][i])
    return sem_ids, sur_ids, uni_ids
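# Illustrative use of find_equal(): the three returned lists are parallel, so
# position i pairs one seminal, one survey, and one uninfluential publication
# whose abstracts have the same token count (variable names are hypothetical):
sem_ids, sur_ids, uni_ids = find_equal()
for s_id, v_id, u_id in zip(sem_ids, sur_ids, uni_ids):
    pass  # e.g. build a length-balanced sample from the three classes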
def make_lda(stem):
    ps = PorterStemmer()
    suffix = 'stemmed' if stem else 'unstemmed'
    if stem:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_stemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_stemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_s(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_s(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_s(), 'uninfluential')
    else:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_u(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_u(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_u(), 'uninfluential')

    # write lda information to file
    write_to_file(
        get_file_base() + 'lda_data/sem_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/sem_lda_' + suffix + '_one_doc_rep.json',
        sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
    write_to_file(
        get_file_base() + 'lda_data/sur_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/sur_lda_' + suffix + '_one_doc_rep.json',
        sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
    write_to_file(
        get_file_base() + 'lda_data/uni_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/uni_lda_' + suffix + '_one_doc_rep.json',
        uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
def make_d2v():
    # read in
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp)

    with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'wb') as output:
        pickle.dump(data_set, output)
def task_lda(use_stemming):
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    suffix = 'stemmed' if use_stemming else 'unstemmed'
    with open(get_file_base() + 'lda_data/sem_lda_' + suffix + '.json', 'r') as sem_file:
        sem = json.load(sem_file)
    with open(get_file_base() + 'lda_data/sur_lda_' + suffix + '.json', 'r') as sur_file:
        sur = json.load(sur_file)
    with open(get_file_base() + 'lda_data/uni_lda_' + suffix + '.json', 'r') as uni_file:
        uni = json.load(uni_file)

    # seminal
    unordered_seminal_p, unordered_seminal_x, unordered_seminal_y = read_in_json_lda_data(
        'seminal', sem)
    # survey
    unordered_survey_p, unordered_survey_x, unordered_survey_y = read_in_json_lda_data(
        'survey', sur)
    # uninfluential
    unordered_uninfluential_p, unordered_uninfluential_x, unordered_uninfluential_y = \
        read_in_json_lda_data('uninfluential', uni)

    seminal_hlp = seminal_hlp['seminal']
    survey_hlp = survey_hlp['survey']
    uninfluential_hlp = uninfluential_hlp['uninfluential']

    # match the ordering of publications with the sur/sem/uni stemmed/unstemmed data
    seminal_p, seminal_x, seminal_y = order_publications(
        unordered_seminal_p, unordered_seminal_x, unordered_seminal_y, seminal_hlp)
    survey_p, survey_x, survey_y = order_publications(
        unordered_survey_p, unordered_survey_x, unordered_survey_y, survey_hlp)
    uninfluential_p, uninfluential_x, uninfluential_y = order_publications(
        unordered_uninfluential_p, unordered_uninfluential_x, unordered_uninfluential_y, uninfluential_hlp)

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
def make_bert():
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    # the three classes only differ in their input key and output file name
    tasks = [
        ('sem', seminal_hlp['seminal'], 'seminal', 'bert_data/sem_bert_unstemmed.json'),
        ('sur', survey_hlp['survey'], 'survey', 'bert_data/sur_bert_unstemmed.json'),
        ('uni', uninfluential_hlp['uninfluential'], 'uninfluential', 'bert_data/uni_bert_unstemmed.json'),
    ]
    for tag, publications, label, out_name in tasks:
        print(tag)
        p_vecs = {}
        x_vecs = {}
        y_vecs = {}
        for ct, p in enumerate(publications):
            # BERT representation of the publication's own abstract (P), of the
            # abstracts it references (X), and of the abstracts citing it (Y)
            p_vecs[ct] = do_bert(p['abs'], tokenizer)
            x_vecs[ct] = {ct_ref: do_bert(ref['abs'], tokenizer) for ct_ref, ref in enumerate(p['ref'])}
            y_vecs[ct] = {ct_cit: do_bert(cit['abs'], tokenizer) for ct_cit, cit in enumerate(p['cit'])}
        write_to_file(get_file_base() + out_name, p_vecs, x_vecs, y_vecs, label)
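# do_bert is defined elsewhere in the repository; a minimal sketch of the
# tokenizer calls it presumably wraps. The truncation to 512 tokens reflects
# BERT's input limit and is an assumption, not necessarily the repository's
# exact handling:
def bert_token_ids(text, tokenizer, max_len=512):
    # split the abstract into WordPiece tokens, cap at BERT's maximum length,
    # then map tokens to vocabulary ids
    tokens = tokenizer.tokenize(text)[:max_len]
    return tokenizer.convert_tokens_to_ids(tokens)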
import json

from gensim.models import ldamodel
from gensim.corpora.dictionary import Dictionary

from general.baseFileExtractor import get_file_base, get_seminal_u, get_survey_u, get_uninfluential_u

# read in
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)['survey']
with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)['seminal']
with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)['uninfluential']

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')

# infer a topic distribution for every abstract in each class
sem = [lda[dictionary.doc2bow(p['abs'].split())] for p in seminal_hlp]
sur = [lda[dictionary.doc2bow(p['abs'].split())] for p in survey_hlp]
uni = [lda[dictionary.doc2bow(p['abs'].split())] for p in uninfluential_hlp]
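# Each entry of sem/sur/uni is a sparse gensim topic distribution: a list of
# (topic_id, probability) pairs. A small hypothetical helper to densify one,
# with the vector length taken from the loaded model:
def to_dense(topic_dist, num_topics):
    vec = [0.0] * num_topics
    for topic_id, prob in topic_dist:
        vec[topic_id] = prob
    return vec

# e.g. dense_sem = [to_dense(d, lda.num_topics) for d in sem]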
import json

from general.baseFileExtractor import get_file_base, get_seminal_s, get_survey_s, get_uninfluential_s, \
    get_seminal_u, get_survey_u, get_uninfluential_u, get_stem

use_stemming = get_stem()
if use_stemming:
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)['uninfluential']
else:
    with open(get_seminal_u(), 'r', encoding='utf8') as f:
        sem = json.load(f)['seminal']
    with open(get_survey_u(), 'r', encoding='utf8') as f:
        sur = json.load(f)['survey']
    with open(get_uninfluential_u(), 'r', encoding='utf8') as f:
        uni = json.load(f)['uninfluential']

avg_length_abs_sem = 0
ref_sem = []
cit_sem = []
for p in sem:
    avg_length_abs_sem += len(p['abs'].split())
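# The snippet breaks off here; presumably the running sum is then divided by
# the class size, along the lines of (an assumption, not the original code):
# avg_length_abs_sem = avg_length_abs_sem / len(sem)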