def make_tfidf(use_stemming):
    # read in
    if use_stemming:
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
    else:
        with open(get_survey_u(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_u(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_u(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp, use_stemming)

    if use_stemming:
        with open(get_file_base() + 'tfidf_data/tfidf_stemmed.sav', 'wb') as output:
            joblib.dump(data_set, output)
    else:
        with open(get_file_base() + 'tfidf_data/tfidf_unstemmed.sav', 'wb') as output:
            joblib.dump(data_set, output)

    print_words(words, use_stemming)
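
# Hypothetical driver for the function above; the actual entry point is not
# part of this snippet, so the __main__ guard and call order are assumptions:
#
# if __name__ == '__main__':
#     make_tfidf(use_stemming=True)   # writes tfidf_data/tfidf_stemmed.sav
#     make_tfidf(use_stemming=False)  # writes tfidf_data/tfidf_unstemmed.sav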
def make_year():
    # read in
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    completed_vecs = {}
    with ThreadPoolExecutor(max_workers=64) as executor:
        for ds_ct in range(len(data_set)):
            # submit every job first, then collect the results, so the pool
            # actually runs the distance computations concurrently instead of
            # blocking on each result right after submitting it
            futures = {}
            for p in range(len(data_set[ds_ct][0])):
                futures[data_set[ds_ct][3] + str(p)] = executor.submit(do_difference, data_set, ds_ct, p)
            for key, future in futures.items():
                completed_vecs[key] = future.result()

    # year distances do not depend on stemming, so the same result set is
    # written for both variants
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_unstemmed.csv',
        completed_vecs)
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_stemmed.csv',
        completed_vecs)
def main():
    # read in
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    count(survey_hlp, seminal_hlp, uninfluential_hlp)
def make_lda(stem):
    ps = PorterStemmer()  # note: created but not used in this function
    suffix = 'stemmed' if stem else 'unstemmed'

    # load the LDA model and dictionary matching the stemming setting
    lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_' + suffix)
    dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_' + suffix)
    if stem:
        sem_raw, sem_in, sem_out = read_in(get_seminal_s(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_s(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_s(), 'uninfluential')
    else:
        sem_raw, sem_in, sem_out = read_in(get_seminal_u(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_u(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_u(), 'uninfluential')

    # write lda information to file
    write_to_file(
        get_file_base() + 'lda_data/sem_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/sem_lda_' + suffix + '_one_doc_rep.json',
        sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
    write_to_file(
        get_file_base() + 'lda_data/sur_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/sur_lda_' + suffix + '_one_doc_rep.json',
        sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
    write_to_file(
        get_file_base() + 'lda_data/uni_lda_' + suffix + '.json',
        get_file_base() + 'lda_data/uni_lda_' + suffix + '_one_doc_rep.json',
        uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
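
# Sketch of how the loaded model/dictionary pair can score one tokenised
# document. This helper is not part of the original module, but both calls
# (Dictionary.doc2bow and LdaModel.get_document_topics) are standard gensim API.
def topic_distribution(lda, dictionary, tokens):
    # map raw tokens to the bag-of-words ids the model was trained on
    bow = dictionary.doc2bow(tokens)
    # return the full topic distribution, including near-zero topics
    return lda.get_document_topics(bow, minimum_probability=0.0)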
def main():
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']

    # collect reference/citation counts per publication; class labels:
    # 0 = seminal, 1 = survey, 2 = uninfluential
    references = []
    citations = []
    labels = []
    for p in sem:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(0)
    for p in sur:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(1)
    for p in uni:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(2)

    data = pd.DataFrame(data={
        'class': labels,
        'r': references,
        'c': citations
    })
    data = data.sample(frac=1, random_state=random_state)
    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)

    all_single_feature_classify_data(data, labels, 'GB')
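
# Note: 'random_state' is referenced in main() but never defined in this
# snippet; it is presumably a module-level constant shared across experiments.
# A minimal guard, with an assumed value for illustration only:
#
# random_state = 42
# if __name__ == '__main__':
#     main()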
# compute the average year over all references per seminal/survey publication
# histograms of citation/reference years for all seminal/survey publications from a given year
from plotly.offline import plot
import plotly.graph_objs as go
import json
import numpy as np
from general.baseFileExtractor import get_seminal_s, get_survey_s, get_uninfluential_s, get_file_base

# read in
with open(get_survey_s(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
with open(get_seminal_s(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
with open(get_uninfluential_s(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)

# collect reference years per corpus
sem_ref = []
sur_ref = []
uni_ref = []
for p in seminal_hlp['seminal']:
    for ref in p['ref']:
        sem_ref.append(ref['year'])
for p in survey_hlp['survey']:
    for ref in p['ref']:
        sur_ref.append(ref['year'])
for p in uninfluential_hlp['uninfluential']:
    for ref in p['ref']:
        uni_ref.append(ref['year'])
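
# The plotly imports above suggest the collected year lists are rendered as
# histograms further down; a minimal sketch with standard plotly calls (the
# overlay layout and output file name are assumptions, not the original code):
#
# fig = go.Figure(data=[
#     go.Histogram(x=sem_ref, name='seminal'),
#     go.Histogram(x=sur_ref, name='survey'),
#     go.Histogram(x=uni_ref, name='uninfluential'),
# ])
# fig.update_layout(barmode='overlay')
# plot(fig, filename='ref_years_hist.html')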
import json
import numpy as np
from general.baseFileExtractor import get_seminal_s, get_survey_s, get_uninfluential_s, get_seminal_u,\
    get_survey_u, get_uninfluential_u, get_stem

use_stemming = get_stem()

if use_stemming:
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']
else:
    with open(get_seminal_u(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_u(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_u(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']

avg_length_abs_sem = 0
ref_sem = []
cit_sem = []
def main():
    # 'task' and 'use_stemming' are module-level settings defined outside this snippet
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'years']:
        print('Task ' + task + ' unknown.')
        return
    if task == 'tfidf':
        with open(get_file_base() + 'tfidf_data/tfidf_'
                  + ('un' if not use_stemming else '') + 'stemmed.sav', 'rb') as f:
            data_set = joblib.load(f)
        # todo: delete
        for p in range(0, len(data_set[2][0])):
            data_set[2][0][p] = [data_set[2][0][p]]
        for p in range(0, len(data_set[2][1])):
            for x in range(0, len(data_set[2][1][p])):
                data_set[2][1][p][x] = [data_set[2][1][p][x]]
        for p in range(0, len(data_set[2][2])):
            for x in range(0, len(data_set[2][2][p])):
                data_set[2][2][p][x] = [data_set[2][2][p][x]]
    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'rb') as f:
            data_set = pickle.load(f)
    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json', encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json', encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json', encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)
    if task == 'lda':
        data_set = task_lda(use_stemming)
    if task == 'years':
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
        data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)
        for ds in range(0, 3):
            for p in range(0, len(data_set[ds][0])):
                data_set[ds][0][p] = [[data_set[ds][0][p]]]
            for p in range(0, len(data_set[ds][1])):
                for x in range(0, len(data_set[ds][1][p])):
                    data_set[ds][1][p][x] = [[data_set[ds][1][p][x]]]
            for p in range(0, len(data_set[ds][2])):
                for x in range(0, len(data_set[ds][2][p])):
                    data_set[ds][2][p][x] = [[data_set[ds][2][p][x]]]

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    completed_vecs = {}
    with ThreadPoolExecutor(max_workers=64) as executor:
        for ds_ct in range(len(data_set)):
            # submit every job first, then collect the results, so the pool
            # actually runs concurrently instead of blocking on each result
            futures = {}
            for p in range(len(data_set[ds_ct][0])):
                futures[data_set[ds_ct][3] + str(p)] = executor.submit(do_one_doc_rep, data_set, ds_ct, p)
            for key, future in futures.items():
                completed_vecs[key] = future.result()

    write_to_file(
        get_file_base() + 'extracted_features/OVR/' + task + '_'
        + ('un' if not use_stemming else '') + 'stemmed_OVR.csv', completed_vecs)
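
# For reference, the indexing convention used in main() (taken from the inline
# comments) implies each data_set entry is shaped roughly like this; the
# placeholder values are illustrative, not real data:
#
# data_set[ds] == (
#     [p_vec, ...],        # [0] -> P: one vector per publication
#     [[x_vec, ...], ...], # [1] -> X: reference vectors per publication
#     [[y_vec, ...], ...], # [2] -> Y: citation vectors per publication
#     'sem',               # [3]: key prefix for the output rows (assumed value)
# )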