Example 1

import json

import joblib

from general.baseFileExtractor import (get_survey_s, get_seminal_s,
                                       get_uninfluential_s, get_survey_u,
                                       get_seminal_u, get_uninfluential_u,
                                       get_file_base)


# task_tfidf and print_words are project-internal helpers
def make_tfidf(use_stemming):
    # read in the stemmed or unstemmed corpus JSONs
    if use_stemming:
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
    else:
        with open(get_survey_u(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_u(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_u(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp,
                                 use_stemming)

    if use_stemming:
        with open(get_file_base() + 'tfidf_data/tfidf_stemmed.sav',
                  'wb') as output:
            joblib.dump(data_set, output)
    else:
        with open(get_file_base() + 'tfidf_data/tfidf_unstemmed.sav',
                  'wb') as output:
            joblib.dump(data_set, output)

    print_words(words, use_stemming)
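

# The stemmed/unstemmed branches in make_tfidf differ only in which path
# helpers they call and in the output filename. A minimal sketch of how the
# duplication could be collapsed, using the imports above
# (make_tfidf_compact is a hypothetical name, not part of the project):
def make_tfidf_compact(use_stemming):
    helpers = ((get_survey_s, get_seminal_s, get_uninfluential_s)
               if use_stemming else
               (get_survey_u, get_seminal_u, get_uninfluential_u))
    suffix = 'stemmed' if use_stemming else 'unstemmed'

    loaded = []
    for helper in helpers:
        with open(helper(), encoding='latin-1') as s:
            loaded.append(json.load(s))
    survey_hlp, seminal_hlp, uninfluential_hlp = loaded

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp,
                                 use_stemming)

    with open(get_file_base() + 'tfidf_data/tfidf_' + suffix + '.sav',
              'wb') as output:
        joblib.dump(data_set, output)

    print_words(words, use_stemming)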


from concurrent.futures import ThreadPoolExecutor


# task_year, do_difference and write_to_file are project-internal helpers
def make_year():
    # read in (only the stemmed files are used; the year metadata is the
    # same either way)
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    # NOTE: submit() immediately followed by result() blocks on each task,
    # so this loop runs sequentially despite the 64-worker pool.
    while ds_ct < len(data_set):
        p = 0
        while p < len(data_set[ds_ct][0]):
            future = executor.submit(do_difference, data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = future.result()
            p += 1
        ds_ct += 1

    # the year-difference features do not depend on stemming, so the same
    # vectors are written under both filenames
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_unstemmed.csv',
        completed_vecs)
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_stemmed.csv',
        completed_vecs)
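
# To actually exploit the 64-worker pool, the futures can be submitted first
# and collected afterwards. A minimal sketch under the same assumptions
# (do_difference and the data_set layout as above; compute_vecs_parallel is
# a hypothetical name):
def compute_vecs_parallel(data_set, max_workers=64):
    completed_vecs = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # submit every task first so they can overlap ...
        futures = {}
        for ds_ct in range(len(data_set)):
            for p in range(len(data_set[ds_ct][0])):
                key = data_set[ds_ct][3] + str(p)
                futures[key] = executor.submit(do_difference, data_set,
                                               ds_ct, p)
        # ... then block once per future while collecting the results
        for key, future in futures.items():
            completed_vecs[key] = future.result()
    return completed_vecs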

Example 3

import json

from general.baseFileExtractor import (get_survey_s, get_seminal_s,
                                       get_uninfluential_s)


# count is a project-internal helper
def main():
    # read in
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    count(survey_hlp, seminal_hlp, uninfluential_hlp)


from gensim.corpora import Dictionary
from gensim.models import ldamodel
from nltk.stem import PorterStemmer


# read_in and write_to_file are project-internal helpers
def make_lda(stem):
    ps = PorterStemmer()  # stemmer instance (unused in this excerpt)

    if stem:
        lda = ldamodel.LdaModel.load(get_file_base() +
                                     'lda_data/lda_model_stemmed')
        dictionary = Dictionary.load_from_text(get_file_base() +
                                               'lda_data/dict_stemmed')

        sem_raw, sem_in, sem_out = read_in(get_seminal_s(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_s(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_s(),
                                           'uninfluential')
    else:
        lda = ldamodel.LdaModel.load(get_file_base() +
                                     'lda_data/lda_model_unstemmed')
        dictionary = Dictionary.load_from_text(get_file_base() +
                                               'lda_data/dict_unstemmed')

        sem_raw, sem_in, sem_out = read_in(get_seminal_u(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_u(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_u(),
                                           'uninfluential')

    # write lda information to file
    if stem:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_stemmed.json',
            get_file_base() + 'lda_data/sem_lda_stemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)

        write_to_file(
            get_file_base() + 'lda_data/sur_lda_stemmed.json',
            get_file_base() + 'lda_data/sur_lda_stemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)

        write_to_file(
            get_file_base() + 'lda_data/uni_lda_stemmed.json',
            get_file_base() + 'lda_data/uni_lda_stemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
    else:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_unstemmed.json',
            get_file_base() + 'lda_data/sem_lda_unstemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)

        write_to_file(
            get_file_base() + 'lda_data/sur_lda_unstemmed.json',
            get_file_base() + 'lda_data/sur_lda_unstemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)

        write_to_file(
            get_file_base() + 'lda_data/uni_lda_unstemmed.json',
            get_file_base() + 'lda_data/uni_lda_unstemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
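
# For context, a minimal sketch of how a loaded model/dictionary pair like
# the one above is typically used to infer a topic distribution for a single
# tokenized document (standard gensim calls; the token list is illustrative):
from gensim.corpora import Dictionary
from gensim.models import ldamodel

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_stemmed')
dictionary = Dictionary.load_from_text(get_file_base() +
                                       'lda_data/dict_stemmed')

tokens = ['topic', 'model', 'corpus']   # illustrative tokenized document
bow = dictionary.doc2bow(tokens)        # map tokens to (id, count) pairs
topics = lda.get_document_topics(bow)   # [(topic_id, probability), ...]
print(topics)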


import pandas as pd


# all_single_feature_classify_data is a project-internal helper;
# random_state is a module-level constant in the original project
def main():
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']

    references = []
    citations = []
    labels = []

    for p in sem:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(0)

    for p in sur:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(1)

    for p in uni:
        references.append(len(p['ref']))
        citations.append(len(p['cit']))
        labels.append(2)

    data = pd.DataFrame(data={
        'class': labels,
        'r': references,
        'c': citations
    })
    data = data.sample(frac=1, random_state=random_state)

    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)

    all_single_feature_classify_data(data, labels, 'GB')
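
# all_single_feature_classify_data is project-internal; as an illustration of
# what a 'GB' (gradient boosting) run over one of these count features might
# look like, a minimal scikit-learn sketch (a hypothetical stand-in, not the
# project's helper):
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


def classify_single_feature(data, labels, feature):
    # evaluate a single feature column ('r' or 'c') with 5-fold CV
    clf = GradientBoostingClassifier(random_state=0)
    scores = cross_val_score(clf, data[[feature]], labels.values.ravel(),
                             cv=5)
    return scores.mean()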
# compute the average year of all references per seminal/survey publication
# histograms of the citation/reference years of all seminal/survey
# publications from one year

from plotly.offline import plot
import plotly.graph_objs as go
import json
import numpy as np
from general.baseFileExtractor import get_seminal_s, get_survey_s, get_uninfluential_s, get_file_base

# read in
with open(get_survey_s(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
with open(get_seminal_s(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
with open(get_uninfluential_s(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)

sem_ref = []
sur_ref = []
uni_ref = []

for p in seminal_hlp['seminal']:
    for ref in p['ref']:
        sem_ref.append(ref['year'])

for p in survey_hlp['survey']:
    for ref in p['ref']:
        sur_ref.append(ref['year'])

for p in uninfluential_hlp['uninfluential']:
    for ref in p['ref']:
        uni_ref.append(ref['year'])
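
# The snippet is cut off here; the plotly imports at the top suggest the
# collected year lists feed histograms. A minimal sketch of that plotting
# step, assuming the lists built above (trace names and the output filename
# are illustrative):
fig = go.Figure(data=[
    go.Histogram(x=sem_ref, name='seminal'),
    go.Histogram(x=sur_ref, name='survey'),
    go.Histogram(x=uni_ref, name='uninfluential'),
])
plot(fig, filename='reference_years.html')
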
import json
import numpy as np
from general.baseFileExtractor import (get_seminal_s, get_survey_s,
                                       get_uninfluential_s, get_seminal_u,
                                       get_survey_u, get_uninfluential_u,
                                       get_stem)

use_stemming = get_stem()

if use_stemming:
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']
else:
    with open(get_seminal_u(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_u(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_u(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']

avg_length_abs_sem = 0
ref_sem = []
cit_sem = []
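
# This script is also truncated; judging by the accumulator names it averages
# abstract lengths and collects reference/citation counts per class. A sketch
# of that continuation - the 'abstract' key is an assumption, only 'ref' and
# 'cit' are confirmed by the other snippets:
for p in sem:
    avg_length_abs_sem += len(p['abstract'])  # 'abstract' key is an assumption
    ref_sem.append(len(p['ref']))
    cit_sem.append(len(p['cit']))
avg_length_abs_sem /= len(sem)
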


import joblib
import pickle

from concurrent.futures import ThreadPoolExecutor


# task and use_stemming are module-level settings in the original project;
# task_bert, task_lda, task_year, do_one_doc_rep and write_to_file are
# project-internal helpers
def main():
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'years']:
        print('Task ' + task + ' unknown.')
        return

    if task == 'tfidf':
        path = (get_file_base() + 'tfidf_data/tfidf_' +
                ('' if use_stemming else 'un') + 'stemmed.sav')
        with open(path, 'rb') as f:
            data_set = joblib.load(f)

        # todo: delete
        # wrap each scalar so downstream code sees one-element vectors
        for p in range(0, len(data_set[2][0])):
            data_set[2][0][p] = [data_set[2][0][p]]

        for p in range(0, len(data_set[2][1])):
            for x in range(0, len(data_set[2][1][p])):
                data_set[2][1][p][x] = [data_set[2][1][p][x]]

        for p in range(0, len(data_set[2][2])):
            for x in range(0, len(data_set[2][2][p])):
                data_set[2][2][p][x] = [data_set[2][2][p][x]]

    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle',
                  'rb') as f:
            data_set = pickle.load(f)

    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)

    if task == 'lda':
        data_set = task_lda(use_stemming)

    if task == 'years':
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

        data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

        # wrap each scalar twice so the year features match the nested
        # vector shape expected downstream
        for ds in range(0, 3):
            for p in range(0, len(data_set[ds][0])):
                data_set[ds][0][p] = [[data_set[ds][0][p]]]

            for p in range(0, len(data_set[ds][1])):
                for x in range(0, len(data_set[ds][1][p])):
                    data_set[ds][1][p][x] = [[data_set[ds][1][p][x]]]

            for p in range(0, len(data_set[ds][2])):
                for x in range(0, len(data_set[ds][2][p])):
                    data_set[ds][2][p][x] = [[data_set[ds][2][p][x]]]

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    # NOTE: as in make_year, submit() immediately followed by result()
    # blocks on each task, so this loop runs sequentially despite the pool.
    while ds_ct < len(data_set):
        p = 0

        while p < len(data_set[ds_ct][0]):
            future = executor.submit(do_one_doc_rep, data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = future.result()
            p += 1
        ds_ct += 1

    write_to_file(
        get_file_base() + 'extracted_features/OVR/' + task + '_' +
        ('un' if not use_stemming else '') + 'stemmed_OVR.csv', completed_vecs)
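
# write_to_file is project-internal (and takes different arguments in the
# LDA snippet above); for the feature-CSV calls here, a hypothetical minimal
# equivalent that writes one row per document key might look like this:
import csv


def write_vecs_to_csv(path, completed_vecs):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for key, vec in completed_vecs.items():
            writer.writerow([key] + list(vec))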