# Example no. 1
# 0
def make_tfidf(use_stemming):
    """Build the TF-IDF data set from the three corpora and persist it.

    Loads the survey, seminal and uninfluential JSON files (the ``*_s``
    path getters when ``use_stemming`` is truthy, the ``*_u`` getters
    otherwise), passes them to ``task_tfidf``, dumps the returned data set
    with joblib below ``tfidf_data/`` and finally forwards the returned
    words to ``print_words``.

    :param use_stemming: selects the input files and the output file name;
        it is also forwarded to ``task_tfidf`` and ``print_words``.
    """
    # Choose the path getters and the output file for the requested variant.
    if use_stemming:
        path_getters = (get_survey_s, get_seminal_s, get_uninfluential_s)
        target_name = 'tfidf_data/tfidf_stemmed.sav'
    else:
        path_getters = (get_survey_u, get_seminal_u, get_uninfluential_u)
        target_name = 'tfidf_data/tfidf_unstemmed.sav'

    # Read the corpora in the order survey, seminal, uninfluential.
    corpora = []
    for path_getter in path_getters:
        with open(path_getter(), encoding='latin-1') as handle:
            corpora.append(json.load(handle))
    survey_hlp, seminal_hlp, uninfluential_hlp = corpora

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp,
                                 use_stemming)

    with open(get_file_base() + target_name, 'wb') as sink:
        joblib.dump(data_set, sink)

    print_words(words, use_stemming)
# Example no. 2
# 0
def main():
    """Compute one ``do_cosine`` result per publication and write them out.

    Loads the three unstemmed corpora, builds the data set with
    ``task_year``, runs ``do_cosine(data_set, ds_ct, p)`` for every
    publication index ``p`` of every data-set entry ``ds_ct`` on a thread
    pool, and passes the collected results to ``write_to_file``.

    The results are stored under the key ``data_set[ds_ct][3] + str(p)`` and
    inserted in submission order, regardless of which task finishes first.

    NOTE(review): ``less_than_or_more`` and ``year`` are not defined in this
    function; they are presumably module-level names -- confirm.
    """
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    # calculate features on which the classification is going to be performed

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    # data_set[][3] -> prefix of the result key
    #
    # Submit every task before waiting on any result: waiting right after
    # each submit would run the tasks strictly one after another and leave
    # the pool idle.  The ``with`` block also shuts the executor down.
    pending = []
    with ThreadPoolExecutor(max_workers=64) as executor:
        for ds_ct in range(len(data_set)):
            for p in range(len(data_set[ds_ct][0])):
                key = data_set[ds_ct][3] + str(p)
                pending.append(
                    (key, executor.submit(do_cosine, data_set, ds_ct, p)))

        # Collect in submission order so the key order of the result
        # dictionary does not depend on thread scheduling.
        completed_vecs = {key: future.result() for key, future in pending}

    write_to_file(
        get_file_base() + 'extracted_features/EVAL/d2v_cos_YEAR_' +
        less_than_or_more + "_" + str(year) + '_unstemmed.csv', completed_vecs)
# Example no. 3
# 0
def find_equal():
    """Pick publications whose abstract lengths occur in all three classes.

    The abstracts (``'abs'``) of the seminal, survey and uninfluential
    publications are grouped by their whitespace-separated word count.  For
    every word count that occurs in all three classes, the same number of
    publications is taken from each class: the smallest of the three group
    sizes, taking the earliest positions first.

    :return: tuple ``(sem_ids, sur_ids, uni_ids)`` of position lists into
        the seminal, survey and uninfluential publication lists; the three
        lists have equal length.
    """
    def _load(path, key):
        # Read one corpus file and return the list stored under ``key``.
        with open(path, encoding='latin-1') as handle:
            return json.load(handle)[key]

    def _bucket_by_length(publications):
        # Map abstract word count -> positions of publications of that length.
        buckets = {}
        for position, publication in enumerate(publications):
            word_count = len(publication['abs'].split())
            buckets.setdefault(word_count, []).append(position)
        return buckets

    seminal = _load(get_seminal_u(), 'seminal')
    survey = _load(get_survey_u(), 'survey')
    uninfluential = _load(get_uninfluential_u(), 'uninfluential')

    sem_buckets = _bucket_by_length(seminal)
    sur_buckets = _bucket_by_length(survey)
    uni_buckets = _bucket_by_length(uninfluential)

    sem_ids, sur_ids, uni_ids = [], [], []
    # Walk the lengths in the order they first appear in the seminal corpus.
    for length, sem_bucket in sem_buckets.items():
        if length not in sur_buckets or length not in uni_buckets:
            continue
        sur_bucket = sur_buckets[length]
        uni_bucket = uni_buckets[length]

        take = min(len(sem_bucket), len(sur_bucket), len(uni_bucket))
        sem_ids.extend(sem_bucket[:take])
        sur_ids.extend(sur_bucket[:take])
        uni_ids.extend(uni_bucket[:take])

    return sem_ids, sur_ids, uni_ids
def make_lda(stem):
    """Load the stored LDA model and write the LDA files for all classes.

    Loads the LDA model and dictionary from ``lda_data/`` (the ``stemmed``
    variant when ``stem`` is truthy, the ``unstemmed`` one otherwise), reads
    the seminal, survey and uninfluential corpora with ``read_in`` and then
    calls ``write_to_file`` once per class with the two output paths
    ``<class>_lda_<variant>.json`` and
    ``<class>_lda_<variant>_one_doc_rep.json``.

    :param stem: selects the stemmed (truthy) or unstemmed (falsy) model,
        dictionary, input files and output file names.
    """
    # The previously created PorterStemmer instance was never used and has
    # been dropped; the two former branches differed only in the suffix and
    # the path getters chosen here.
    if stem:
        variant = 'stemmed'
        seminal_path, survey_path, uninfluential_path = (
            get_seminal_s, get_survey_s, get_uninfluential_s)
    else:
        variant = 'unstemmed'
        seminal_path, survey_path, uninfluential_path = (
            get_seminal_u, get_survey_u, get_uninfluential_u)

    lda_dir = get_file_base() + 'lda_data/'
    lda = ldamodel.LdaModel.load(lda_dir + 'lda_model_' + variant)
    dictionary = Dictionary.load_from_text(lda_dir + 'dict_' + variant)

    # (file name prefix, corpus key, class id, path getter)
    classes = (
        ('sem', 'seminal', '0', seminal_path),
        ('sur', 'survey', '1', survey_path),
        ('uni', 'uninfluential', '2', uninfluential_path),
    )

    # Read all corpora first, then write, as before.  ``read_in`` returns a
    # 3-tuple that is forwarded unchanged to ``write_to_file``.
    corpora = [read_in(path_getter(), key)
               for _, key, _, path_getter in classes]

    # write lda information to file
    for (prefix, key, class_id, _), (raw, inbound, outbound) in zip(classes,
                                                                    corpora):
        stem_path = lda_dir + prefix + '_lda_' + variant
        write_to_file(
            stem_path + '.json',
            stem_path + '_one_doc_rep.json',
            raw, inbound, outbound, key, class_id, lda, dictionary)
def make_d2v():
    """Build the doc2vec data set from the unstemmed corpora and pickle it.

    Loads the survey, seminal and uninfluential JSON files, passes them to
    ``task_d2v`` and pickles the returned data set to
    ``d2v_data/d2v_unstemmed.pickle`` below the file base directory.
    """
    def _read_json(path):
        # Parse one corpus file (the files are opened as latin-1).
        with open(path, encoding='latin-1') as source:
            return json.load(source)

    # read in
    survey_hlp = _read_json(get_survey_u())
    seminal_hlp = _read_json(get_seminal_u())
    uninfluential_hlp = _read_json(get_uninfluential_u())

    vectors = task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp)

    target = get_file_base() + 'd2v_data/d2v_unstemmed.pickle'
    with open(target, 'wb') as sink:
        pickle.dump(vectors, sink)
def task_lda(use_stemming):
    """Assemble the LDA data set for the three publication classes.

    Reads the stored LDA JSON files from ``lda_data/`` (``*_lda_stemmed.json``
    when ``use_stemming`` is truthy, ``*_lda_unstemmed.json`` otherwise),
    converts each with ``read_in_json_lda_data`` and brings the result into
    the order of the corresponding publication list via
    ``order_publications``.

    NOTE(review): the publication lists used for the ordering are always
    read from the unstemmed corpus files, independent of ``use_stemming`` --
    confirm that this is intended.

    :param use_stemming: selects which LDA JSON files are read.
    :return: list of three ``[p, x, y, label]`` lists, in the order seminal
        (label ``'sem '``), survey (``'surv '``), uninfluential (``'uni '``).
    """
    def _load_json(path, *open_args, **open_kwargs):
        # Open ``path`` with the given ``open`` arguments and parse it.
        with open(path, *open_args, **open_kwargs) as handle:
            return json.load(handle)

    survey_doc = _load_json(get_survey_u(), encoding='latin-1')
    seminal_doc = _load_json(get_seminal_u(), encoding='latin-1')
    uninfluential_doc = _load_json(get_uninfluential_u(), encoding='latin-1')

    # '' selects '..._stemmed.json', 'un' selects '..._unstemmed.json'.
    prefix = '' if use_stemming else 'un'
    sem = _load_json(
        get_file_base() + 'lda_data/sem_lda_' + prefix + 'stemmed.json', 'r')
    sur = _load_json(
        get_file_base() + 'lda_data/sur_lda_' + prefix + 'stemmed.json', 'r')
    uni = _load_json(
        get_file_base() + 'lda_data/uni_lda_' + prefix + 'stemmed.json', 'r')

    # Convert the stored data, class by class (still in file order).
    raw_sem_p, raw_sem_x, raw_sem_y = read_in_json_lda_data('seminal', sem)
    raw_sur_p, raw_sur_x, raw_sur_y = read_in_json_lda_data('survey', sur)
    raw_uni_p, raw_uni_x, raw_uni_y = read_in_json_lda_data(
        'uninfluential', uni)

    seminal_pubs = seminal_doc['seminal']
    survey_pubs = survey_doc['survey']
    uninfluential_pubs = uninfluential_doc['uninfluential']

    # matching of ordering of publication with sur/sem/uni_stemmed/unstemmed-data
    seminal_p, seminal_x, seminal_y = order_publications(
        raw_sem_p, raw_sem_x, raw_sem_y, seminal_pubs)
    survey_p, survey_x, survey_y = order_publications(
        raw_sur_p, raw_sur_x, raw_sur_y, survey_pubs)
    uninfluential_p, uninfluential_x, uninfluential_y = order_publications(
        raw_uni_p, raw_uni_x, raw_uni_y, uninfluential_pubs)

    seminal_entry = [seminal_p, seminal_x, seminal_y, 'sem ']
    survey_entry = [survey_p, survey_x, survey_y, 'surv ']
    uninfluential_entry = [uninfluential_p, uninfluential_x, uninfluential_y,
                           'uni ']
    return [seminal_entry, survey_entry, uninfluential_entry]
# Example no. 7
# 0
def make_bert():
    """Run ``do_bert`` on every abstract and write one JSON file per class.

    For each publication of the seminal, survey and uninfluential corpora
    the publication's own abstract, the abstracts of its references
    (``'ref'``) and the abstracts of its citations (``'cit'``) are passed to
    ``do_bert`` together with the pre-trained ``bert-base-uncased``
    tokenizer.  The results are collected in dictionaries keyed by running
    indices and handed to ``write_to_file`` under ``bert_data/``.  A short
    progress marker is printed before each class is processed.
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _read_json(path):
        # Parse one corpus file (the files are opened as latin-1).
        with open(path, encoding='latin-1') as source:
            return json.load(source)

    def _embed_all(publications):
        # Return the (p, x, y) dictionaries for one class:
        #   p[i]    -> result for the abstract of publication i
        #   x[i][j] -> result for the abstract of its j-th reference
        #   y[i][j] -> result for the abstract of its j-th citation
        own, refs, cits = {}, {}, {}
        for idx, publication in enumerate(publications):
            own[idx] = do_bert(publication['abs'], tokenizer)
            refs[idx] = {
                ref_idx: do_bert(ref['abs'], tokenizer)
                for ref_idx, ref in enumerate(publication['ref'])
            }
            cits[idx] = {
                cit_idx: do_bert(cit['abs'], tokenizer)
                for cit_idx, cit in enumerate(publication['cit'])
            }
        return own, refs, cits

    seminal_hlp = _read_json(get_seminal_u())
    survey_hlp = _read_json(get_survey_u())
    uninfluential_hlp = _read_json(get_uninfluential_u())

    print('sem')
    seminal_p, seminal_x, seminal_y = _embed_all(seminal_hlp['seminal'])
    write_to_file(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  seminal_p, seminal_x, seminal_y, 'seminal')

    print('sur')
    survey_p, survey_x, survey_y = _embed_all(survey_hlp['survey'])
    write_to_file(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  survey_p, survey_x, survey_y, 'survey')

    print('uni')
    uninfluential_p, uninfluential_x, uninfluential_y = _embed_all(
        uninfluential_hlp['uninfluential'])
    write_to_file(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  uninfluential_p, uninfluential_x, uninfluential_y,
                  'uninfluential')
import json
from gensim.models import ldamodel
from gensim.corpora.dictionary import Dictionary
from general.baseFileExtractor import get_file_base, get_seminal_u, get_survey_u, get_uninfluential_u

# Read the publication lists of the three unstemmed corpora.
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)['survey']

with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)['seminal']

with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)['uninfluential']

# Stored LDA model and the dictionary used to build bag-of-words vectors.
lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(
    get_file_base() + 'lda_data/dict_unstemmed')

# Apply the model to the bag-of-words of every abstract, one list per class
# (split on whitespace, same order as the publication lists).
sem = [lda[dictionary.doc2bow(p['abs'].split())] for p in seminal_hlp]
sur = [lda[dictionary.doc2bow(p['abs'].split())] for p in survey_hlp]
uni = [lda[dictionary.doc2bow(p['abs'].split())] for p in uninfluential_hlp]
# Load the publication lists of the three corpora: the stemmed files when
# use_stemming is truthy, the unstemmed files otherwise.
# NOTE(review): use_stemming is not defined in the visible code; presumably
# it is supplied by surrounding code that is not shown -- confirm.
# NOTE(review): the files are opened as UTF-8 here, while other visible code
# opens the same path getters with encoding='latin-1' -- confirm which
# encoding is intended.
if use_stemming:
    with open(get_seminal_s(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_s(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_s(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']
else:
    with open(get_seminal_u(), 'r', encoding='utf8') as f:
        sem = json.load(f)
        sem = sem['seminal']
    with open(get_survey_u(), 'r', encoding='utf8') as f:
        sur = json.load(f)
        sur = sur['survey']
    with open(get_uninfluential_u(), 'r', encoding='utf8') as f:
        uni = json.load(f)
        uni = uni['uninfluential']

# Statistics for the seminal publications.
# avg_length_abs_sem accumulates the summed word count of all abstracts; no
# division by the number of publications happens in the visible code.
avg_length_abs_sem = 0
# Per publication: number of entries in 'ref' and in 'cit'.
ref_sem = []
cit_sem = []

for p in sem:
    # Word count = number of whitespace-separated tokens of the abstract.
    avg_length_abs_sem += len(p['abs'].split())
    ref_sem.append(len(p['ref']))
    cit_sem.append(len(p['cit']))