def read_in_csv_data_2(one_2, one_met, one_stemmed, two_2, two_met,
                       two_stemmed):
    source_one = get_file_base(
    ) + 'extracted_features/' + one_2 + '_' + one_met + '_' + one_stemmed + '.csv'
    source_two = get_file_base(
    ) + 'extracted_features/' + two_2 + '_' + two_met + '_' + two_stemmed + '.csv'

    data_one = pandas.read_csv(source_one, sep=',')
    data_two = pandas.read_csv(source_two, sep=',')
    data_two.drop(['class'], axis=1, inplace=True)

    # prefix the feature columns of the first frame so names stay unique after
    # the concat below; the first column (the class label) keeps its name
    data_one.columns = [data_one.columns[0]] + [
        'one_' + str(c) for c in data_one.columns[1:]
    ]

    data = pandas.concat([
        data_one,
        data_two,
    ], axis=1, sort=False)

    data = data.sample(frac=1, random_state=random_state)

    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)

    return data, labels
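A usage sketch (hypothetical argument values; assumes the corresponding feature CSVs exist under extracted_features/):

data, labels = read_in_csv_data_2('tfidf', 'cos', 'unstemmed',
                                  'lda', 'was', 'unstemmed')
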
def make_year():
    # read in
    with open(get_survey_s(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_s(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_s(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    while ds_ct < len(data_set):
        p = 0
        while p < len(data_set[ds_ct][0]):
            futures = executor.submit(do_difference, data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = futures.result()
            p += 1
        ds_ct += 1

    write_to_file(
        get_file_base() + 'extracted_features/years_dist_unstemmed.csv',
        completed_vecs)
    write_to_file(
        get_file_base() + 'extracted_features/years_dist_stemmed.csv',
        completed_vecs)
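
Two notes on make_year. It writes the same feature set under both the stemmed and unstemmed names, since the year distances do not depend on stemming. Also, futures.result() is called right after each submit, so the pool above runs one task at a time; a minimal sketch of a genuinely parallel variant of the loop (same executor, data_set and do_difference as above; this submit-then-result pattern recurs in the other extraction loops in this collection):

pending = {}
for ds_ct in range(len(data_set)):
    for p in range(len(data_set[ds_ct][0])):
        # queue everything first ...
        pending[data_set[ds_ct][3] + str(p)] = executor.submit(
            do_difference, data_set, ds_ct, p)
# ... then block on the results once all tasks have been submitted
completed_vecs = {key: fut.result() for key, fut in pending.items()}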
Example #3
def make_tfidf(use_stemming):
    # read in
    if use_stemming:
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)
    else:
        with open(get_survey_u(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_u(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_u(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

    data_set, words = task_tfidf(survey_hlp, seminal_hlp, uninfluential_hlp,
                                 use_stemming)

    if use_stemming:
        with open(get_file_base() + 'tfidf_data/tfidf_stemmed.sav',
                  'wb') as output:
            joblib.dump(data_set, output)
    else:
        with open(get_file_base() + 'tfidf_data/tfidf_unstemmed.sav',
                  'wb') as output:
            joblib.dump(data_set, output)

    print_words(words, use_stemming)
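
Since joblib.dump also accepts a filename directly, the two branches above could share a single path (a sketch, same flag and data_set):

path = get_file_base() + 'tfidf_data/tfidf_' + ('un' if not use_stemming else '') + 'stemmed.sav'
joblib.dump(data_set, path)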
def read_in_csv_data_5(one_stemmed, two_stemmed, one_met, two_met, three_met,
                       four_met):
    source_one = get_file_base(
    ) + 'extracted_features/tfidf_' + one_met + '_' + one_stemmed + '.csv'
    source_two = get_file_base(
    ) + 'extracted_features/lda_' + two_met + '_' + two_stemmed + '.csv'
    source_three = get_file_base(
    ) + 'extracted_features/d2v_' + three_met + '_unstemmed.csv'
    source_four = get_file_base(
    ) + 'extracted_features/bert_' + four_met + '_unstemmed.csv'
    source_five = get_file_base(
    ) + 'extracted_features/years_dist_unstemmed.csv'

    data_one = pandas.read_csv(source_one, sep=',')
    data_two = pandas.read_csv(source_two, sep=',')
    data_three = pandas.read_csv(source_three, sep=',')
    data_four = pandas.read_csv(source_four, sep=',')
    data_five = pandas.read_csv(source_five, sep=',')

    data_two.drop(['class'], axis=1, inplace=True)
    data_three.drop(['class'], axis=1, inplace=True)
    data_four.drop(['class'], axis=1, inplace=True)
    data_five.drop(['class'], axis=1, inplace=True)

    # prefix the feature columns of each frame so names stay unique after the
    # concat below; the first column of each frame keeps its name
    for frame, prefix in [(data_one, 'one_'), (data_two, 'two_'),
                          (data_three, 'three_'), (data_four, 'four_')]:
        frame.columns = [frame.columns[0]] + [
            prefix + str(c) for c in frame.columns[1:]
        ]

    data = pandas.concat(
        [data_one, data_two, data_three, data_four, data_five],
        axis=1,
        sort=False)

    data = data.sample(frac=1, random_state=random_state)

    labels = data[['class']]
    data.drop(['class'], axis=1, inplace=True)

    return data, labels
Example #5
def print_words(words, use_stemming):
    # write the vocabulary, one "id word" pair per line
    if use_stemming:
        path = get_file_base() + 'tfidf_data/words_stemmed.txt'
    else:
        path = get_file_base() + 'tfidf_data/words_unstemmed.txt'

    with open(path, 'w', encoding='utf8') as f:
        for word, w_id in words.items():
            f.write(str(w_id) + ' ' + word + '\n')
Example #6
def main():
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

    # calculate features on which the classification is going to be performed

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    while ds_ct < len(data_set):
        p = 0
        while p < len(data_set[ds_ct][0]):
            futures = executor.submit(do_cosine, data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = futures.result()
            p += 1
        ds_ct += 1

    write_to_file(
        get_file_base() + 'extracted_features/EVAL/d2v_cos_YEAR_' +
        less_than_or_more + "_" + str(year) + '_unstemmed.csv', completed_vecs)
def sem_all_class(vec, dist, stem, do_her_sig, restrict_to_publication_time):
    # read in normal distance features for all publications
    data, labels = read_in_csv_data(get_file_base() + 'extracted_features/' +
                                    vec + '_' + dist + '_' +
                                    ('un' if not stem else '') + 'stemmed.csv')

    # drop insignificant features
    her_sig = None
    if do_her_sig:
        her_sig = data.drop(
            list(set(data.columns) - set(sig_feat_herrmannova)), axis=1)

    c = single_feature_classify_data(data, labels)

    print('SINGLE FEATURES')
    print(c)

    print('__________________________________')
    print('ALL FEATURES')

    if restrict_to_publication_time:
        data = restrict_features_to_publication_time(data)

    all_single_feature_classify_data(data, labels)

    if do_her_sig:
        print('__________________________________')
        print('ALL SIGNIFICANT FEATURES IN HER')

        # values for features, which were significant in Herrmannova et al
        all_single_feature_classify_data(her_sig, labels)

    # report feature importance for the chosen estimator in CV
    if access_importance:
        assess_importance(data, labels, "RF")
def classify(X, y, clf):
    # note: the clf argument is currently unused; an ExtraTreesClassifier is
    # always fitted for the importance ranking below
    y = y.astype('int')
    model = ExtraTreesClassifier()
    model.fit(X, y)
    print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.show()

    # plot a heat map of the correlations between all features
    corrmat = X.corr()
    plt.figure(figsize=(20, 20))
    g = sns.heatmap(corrmat, annot=False, cmap="coolwarm")
    figure = g.get_figure()
    figure.savefig(get_file_base() + 'plots/importance_heatmap.png')

    bestfeatures = SelectKBest(score_func=f_classif, k=10)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    print(featureScores.nlargest(10, 'Score'))  # print 10 best features
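
For reference, f_classif scores each feature by the ANOVA F-statistic between the feature and the class label; when only the reduced matrix is needed, SelectKBest can be applied in a single step (same X and y as above):

X_top = SelectKBest(score_func=f_classif, k=10).fit_transform(X, y)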
def main():
    # 0 = complete citation network, 1 = only references, 2 = only citations, 3 = references and p, 4 = only p
    only_part = 4
    use_stemming = get_stem()
    wo_RS = False

    # read in normal distance features for all publications
    data, labels = read_in_csv_data(
        get_file_base() + 'extracted_features/OVR/years_unstemmed_OVR.csv')
    print('ALL FEATURES')

    third = -1
    if only_part in (1, 2, 3, 4):
        data, third = restrict_data(data, only_part)

        if only_part == 3 and wo_RS:
            data = drop_rs(data, third, use_stemming)

    if only_part == 0 and wo_RS:
        data = drop_rs(data, third, use_stemming)

    all_single_feature_classify_data(data, labels)
def task_lda(use_stemming):
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    with open(get_file_base() + 'lda_data/sem_lda_' + ('un' if not use_stemming else '') + 'stemmed.json', 'r') as \
            sem_file:
        sem = json.load(sem_file)
    with open(get_file_base() + 'lda_data/sur_lda_' + ('un' if not use_stemming else '') + 'stemmed.json', 'r') as \
            sur_file:
        sur = json.load(sur_file)
    with open(get_file_base() + 'lda_data/uni_lda_' + ('un' if not use_stemming else '') + 'stemmed.json', 'r') as \
            uni_file:
        uni = json.load(uni_file)

    # seminal
    unordered_seminal_p, unordered_seminal_x, unordered_seminal_y = read_in_json_lda_data(
        'seminal', sem)
    # survey
    unordered_survey_p, unordered_survey_x, unordered_survey_y = read_in_json_lda_data(
        'survey', sur)
    # uninfluential
    unordered_uninfluential_p, unordered_uninfluential_x, unordered_uninfluential_y = \
        read_in_json_lda_data('uninfluential', uni)

    seminal_hlp = seminal_hlp['seminal']
    survey_hlp = survey_hlp['survey']
    uninfluential_hlp = uninfluential_hlp['uninfluential']

    # matching of ordering of publication with sur/sem/uni_stemmed/unstemmed-data
    seminal_p, seminal_x, seminal_y = order_publications(
        unordered_seminal_p, unordered_seminal_x, unordered_seminal_y,
        seminal_hlp)
    survey_p, survey_x, survey_y = order_publications(unordered_survey_p,
                                                      unordered_survey_x,
                                                      unordered_survey_y,
                                                      survey_hlp)
    uninfluential_p, uninfluential_x, uninfluential_y = order_publications(
        unordered_uninfluential_p, unordered_uninfluential_x,
        unordered_uninfluential_y, uninfluential_hlp)

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
Example #11
def main():
    # read in normal distance features for all publications
    data = read_in_csv_data(get_file_base() + 'extracted_features/d2v_cos_unstemmed.csv')

    s_sem, s_sur, s_uni = find_equal()

    data, labels = prepare_data(data, s_sem, s_sur, s_uni)

    for model_id in model_config:
        all_single_feature_classify_data(data, labels, model_id)
Example #12
def main():
    with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle', 'rb') as f:
        data_set = pickle.load(f)

    robust_data_set = generate_robust_ds(data_set)

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_robust_vecs = {}

    while ds_ct < len(data_set):
        p = 0
        while p < len(data_set[ds_ct][0]):
            futures = executor.submit(do_cosine, robust_data_set, ds_ct, p)

            completed_robust_vecs[data_set[ds_ct][3] + str(p)] = futures.result()
            p += 1
        ds_ct += 1

    write_to_file(get_file_base() + 'extracted_features/robustness.csv', completed_robust_vecs)
def make_d2v():
    # read in
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    data_set = task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp)

    with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle',
              'wb') as output:
        pickle.dump(data_set, output)
Example #14
def build_lda_model(stem):
    corpus = []
    ps = PorterStemmer()
    number_of_topics = 100

    # read in data from publications
    with open(get_lda_base(), 'r') as f:
        for line in f:
            if stem:
                stemmed = []

                for w in line.split():
                    s = ps.stem(w)
                    if len(s) > 1:
                        stemmed.append(s)

                corpus.append(stemmed)
            else:
                corpus.append(line.split())

    # build vocabulary and transform texts in vocab format
    dictionary = Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # do lda
    lda = ldamodel.LdaModel(corpus=corpus, num_topics=number_of_topics, passes=20, id2word=dictionary,
                            minimum_probability=0)

    if stem:
        temp_file = datapath('lda_model_stemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_stemmed')
    else:
        temp_file = datapath('lda_model_unstemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_unstemmed')

    lda.save(temp_file)
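
Note that datapath() resolves inside gensim's test-data directory, so the model and the dictionary end up in different places. A sketch of reloading both from where this function saved them (unstemmed case):

lda = ldamodel.LdaModel.load(datapath('lda_model_unstemmed'))
dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')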
Example #15
def main():
    data = None
    labels = None
    if comb == 2:
        data, labels = read_in_csv_data_2(
            get_file_base() + 'extracted_features/OVR/lda_stemmed_OVR.csv',
            get_file_base() + 'extracted_features/OVR/tfidf_stemmed_OVR.csv')
    if comb == 3:
        data, labels = read_in_csv_data_3(
            get_file_base() + 'extracted_features/tfidf_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/bert_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/lda_was_unstemmed.csv')
    if comb == 4:
        data, labels = read_in_csv_data_4(
            get_file_base() + 'extracted_features/tfidf_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/bert_cos_unstemmed.csv',
            get_file_base() + 'extracted_features/lda_was_unstemmed.csv',
            get_file_base() + 'extracted_features/years.csv')

    print('ALL FEATURES')
    all_single_feature_classify_data(data, labels)
Example #16
def sem_one_class(vec, dist, classifier, stem, single, searched_feat, restrict_to_publication_time):
    # read in normal distance features for all publications
    # data, labels = read_in_csv_data(get_file_base() + 'extracted_features/OVR/lda_unstemmed_OVR.csv')

    data, labels = read_in_csv_data(get_file_base() + 'extracted_features/' + vec + '_' + dist + '_' +
                                    ('un' if not stem else '') + 'stemmed.csv')

    if single:
        print('SINGLE FEATURE')
        single_feature_classify_data(data, labels, classifier, searched_feat)

    else:
        if restrict_to_publication_time:
            data = restrict_features_to_publication_time(data)

        print('ALL FEATURES')

        all_single_feature_classify_data(data, labels, classifier)
data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True, autosize=False, width=800, height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(
                       title='Years', showgrid=False
                   ),
                   yaxis=dict(
                       title='Number of publications', showgrid=True, gridcolor='#E2E2E2'
                   ),
                   legend=dict(
                       x=0.01,
                       y=1,
                       font=dict(
                           family='sans-serif',
                           size=12,
                           color='#000'
                       ),
                       bgcolor='#E2E2E2',
                       bordercolor='#FFFFFF',
                       borderwidth=2
                   ), paper_bgcolor='#FFFFFF', plot_bgcolor='#FFFFFF'
                   )
fig = go.Figure(data=data, layout=layout)

plot(fig, get_file_base() + 'plots/sem', image='jpeg')

print('mean year sem : ' + str(np.mean(sem_p)))
print('mean year sur : ' + str(np.mean(sur_p)))
print('mean year uni : ' + str(np.mean(uni_p)))
Example #18
                      marker=dict(color='green'),
                      name='uninfluential citations')

data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True,
                   autosize=False,
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Years', showgrid=False),
                   yaxis=dict(title='Number of citations',
                              showgrid=True,
                              gridcolor='#E2E2E2'),
                   legend=dict(x=0.01,
                               y=1,
                               font=dict(family='sans-serif',
                                         size=12,
                                         color='#000'),
                               bgcolor='#E2E2E2',
                               bordercolor='#FFFFFF',
                               borderwidth=2),
                   paper_bgcolor='#FFFFFF',
                   plot_bgcolor='#FFFFFF')
fig = go.Figure(data=data, layout=layout)

plot(fig, get_file_base() + 'plots/citations', image='jpeg')

print(np.mean(sem_cit))
print(np.mean(sur_cit))
print(np.mean(uni_cit))
Example #19
def make_bert():
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    with open(get_seminal_u(), encoding='latin-1') as s:
        seminal_hlp = json.load(s)
    with open(get_survey_u(), encoding='latin-1') as s:
        survey_hlp = json.load(s)
    with open(get_uninfluential_u(), encoding='latin-1') as s:
        uninfluential_hlp = json.load(s)

    print('sem')

    seminal_p = {}
    seminal_x = {}
    seminal_y = {}

    ct = 0
    for p in seminal_hlp['seminal']:
        seminal_p[ct] = do_bert(p['abs'], tokenizer)

        seminal_x[ct] = {}
        seminal_y[ct] = {}

        ct_ref = 0
        for ref in p['ref']:
            seminal_x[ct][ct_ref] = do_bert(ref['abs'], tokenizer)
            ct_ref += 1

        ct_cit = 0
        for cit in p['cit']:
            seminal_y[ct][ct_cit] = do_bert(cit['abs'], tokenizer)
            ct_cit += 1

        ct += 1

    write_to_file(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  seminal_p, seminal_x, seminal_y, 'seminal')

    survey_p = {}
    survey_x = {}
    survey_y = {}

    print('sur')

    ct = 0
    for p in survey_hlp['survey']:
        survey_p[ct] = do_bert(p['abs'], tokenizer)

        survey_x[ct] = {}
        survey_y[ct] = {}

        ct_ref = 0
        for ref in p['ref']:
            survey_x[ct][ct_ref] = do_bert(ref['abs'], tokenizer)
            ct_ref += 1

        ct_cit = 0
        for cit in p['cit']:
            survey_y[ct][ct_cit] = do_bert(cit['abs'], tokenizer)
            ct_cit += 1

        ct += 1

    write_to_file(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  survey_p, survey_x, survey_y, 'survey')

    print('uni')

    uninfluential_p = {}
    uninfluential_x = {}
    uninfluential_y = {}

    ct = 0
    for p in uninfluential_hlp['uninfluential']:
        uninfluential_p[ct] = do_bert(p['abs'], tokenizer)

        uninfluential_x[ct] = {}
        uninfluential_y[ct] = {}

        ct_ref = 0
        for ref in p['ref']:
            uninfluential_x[ct][ct_ref] = do_bert(ref['abs'], tokenizer)
            ct_ref += 1

        ct_cit = 0
        for cit in p['cit']:
            uninfluential_y[ct][ct_cit] = do_bert(cit['abs'], tokenizer)
            ct_cit += 1

        ct += 1

    write_to_file(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  uninfluential_p, uninfluential_x, uninfluential_y,
                  'uninfluential')
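
The three per-group loops above are identical up to the JSON key and output file; a condensed refactoring sketch (hypothetical helper name embed_group; do_bert and tokenizer as above):

def embed_group(group, tokenizer):
    # BERT vectors for each publication (p), its references (x) and citations (y)
    p_vecs, x_vecs, y_vecs = {}, {}, {}
    for ct, p in enumerate(group):
        p_vecs[ct] = do_bert(p['abs'], tokenizer)
        x_vecs[ct] = {i: do_bert(r['abs'], tokenizer) for i, r in enumerate(p['ref'])}
        y_vecs[ct] = {i: do_bert(c['abs'], tokenizer) for i, c in enumerate(p['cit'])}
    return p_vecs, x_vecs, y_vecs

seminal_p, seminal_x, seminal_y = embed_group(seminal_hlp['seminal'], tokenizer)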

trace3 = go.Scatter(
        mode='markers',
        x=cit,
        y=ref,
        marker=dict(
          color='green',
          size=8,
          opacity=0.3, symbol='cross', line=dict(color='green', width=2)
        ),
        showlegend=False,
        name='Uninfluential'
)

data = [trace2, trace1, trace3]
layout = {'yaxis': dict(
              title='Number of references',
              type='log',
              autorange=True, gridcolor='#E2E2E2'
          ),
          'xaxis': dict(
              title='Number of citations',
              type='log',
              autorange=True, gridcolor='#E2E2E2'
          ), 'width': 1200, 'height': 500, 'paper_bgcolor': '#FFFFFF', 'plot_bgcolor': '#FFFFFF'
          }

fig = go.Figure(data=data, layout=layout)
plot(fig, get_file_base() + 'plots/' + title + '.pdf')
Example #21
def task_d2v(survey_hlp, seminal_hlp, uninfluential_hlp):
    # model 0 : dbow; model 1: dm
    doc2vec_model = Doc2Vec.load(get_file_base() + 'models/doc2vec_model_1')

    seminal_p = {}
    seminal_x = {}
    seminal_y = {}

    ct = 0
    for p in seminal_hlp['seminal']:
        doc2vec_model.random.seed(0)
        hlp = doc2vec_model.infer_vector(p['abs'].split(), alpha=0.025, steps=20)
        seminal_p[ct] = hlp.reshape(1, -1)

        seminal_x[ct] = {}
        seminal_y[ct] = {}

        ct_x = 0
        for ref in p['ref']:
            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(ref['abs'].split(), alpha=0.025, steps=20)
            seminal_x[ct][ct_x] = hlp.reshape(1, -1)
            ct_x += 1

        ct_y = 0
        for cit in p['cit']:
            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(cit['abs'].split(), alpha=0.025, steps=20)
            seminal_y[ct][ct_y] = hlp.reshape(1, -1)
            ct_y += 1

        ct += 1

    survey_p = {}
    survey_x = {}
    survey_y = {}

    ct = 0
    for p in survey_hlp['survey']:
        surv_abs = []
        for w in re.compile('[^a-zA-Z0-9]+').split(p['abs']):
            surv_abs.append(w.lower())

        doc2vec_model.random.seed(0)
        hlp = doc2vec_model.infer_vector(surv_abs, alpha=0.025, steps=20)
        survey_p[ct] = hlp.reshape(1, -1)

        survey_x[ct] = {}
        survey_y[ct] = {}

        ct_x = 0
        for ref in p['ref']:
            abs_o = []
            for w in re.compile('[^a-zA-Z0-9]+').split(ref['abs']):
                abs_o.append(w.lower())

            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(abs_o, alpha=0.025, steps=20)
            survey_x[ct][ct_x] = hlp.reshape(1, -1)
            ct_x += 1

        ct_y = 0
        for cit in p['cit']:
            abs_i = []
            for w in re.compile('[^a-zA-Z0-9]+').split(cit['abs']):
                abs_i.append(w.lower())

            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(abs_i, alpha=0.025, steps=20)
            survey_y[ct][ct_y] = hlp.reshape(1, -1)
            ct_y += 1

        ct += 1

    uninfluential_p = {}
    uninfluential_x = {}
    uninfluential_y = {}

    ct = 0
    for p in uninfluential_hlp['uninfluential']:
        surv_abs = []
        for w in re.compile('[^a-zA-Z0-9]+').split(p['abs']):
            surv_abs.append(w.lower())

        doc2vec_model.random.seed(0)
        hlp = doc2vec_model.infer_vector(surv_abs, alpha=0.025, steps=20)
        uninfluential_p[ct] = hlp.reshape(1, -1)

        uninfluential_x[ct] = {}
        uninfluential_y[ct] = {}

        ct_x = 0
        for ref in p['ref']:
            abs_o = []
            for w in re.compile('[^a-zA-Z0-9]+').split(ref['abs']):
                abs_o.append(w.lower())

            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(abs_o, alpha=0.025, steps=20)
            uninfluential_x[ct][ct_x] = hlp.reshape(1, -1)
            ct_x += 1

        ct_y = 0
        for cit in p['cit']:
            abs_i = []
            for w in re.compile('[^a-zA-Z0-9]+').split(cit['abs']):
                abs_i.append(w.lower())

            doc2vec_model.random.seed(0)
            hlp = doc2vec_model.infer_vector(abs_i, alpha=0.025, steps=20)
            uninfluential_y[ct][ct_y] = hlp.reshape(1, -1)
            ct_y += 1

        ct += 1

    return [[seminal_p, seminal_x, seminal_y, 'sem '], [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
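The tokenise-and-lower step is repeated for every abstract above (note that the seminal branch uses a bare split() instead of the regex); a sketch of shared helpers (hypothetical names):

import re

TOKEN_SPLIT = re.compile('[^a-zA-Z0-9]+')

def tokenize(text):
    # lower-cased alphanumeric tokens, as in the survey/uninfluential branches
    return [w.lower() for w in TOKEN_SPLIT.split(text)]

def infer(model, tokens):
    model.random.seed(0)  # fixed seed so repeated inference is deterministic
    return model.infer_vector(tokens, alpha=0.025, steps=20).reshape(1, -1)
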
Example #22
def task_year(survey_hlp, seminal_hlp, uninfluential_hlp):
    # model 0 : dbow; model 1: dm
    doc2vec_model = Doc2Vec.load(get_file_base() + 'models/doc2vec_model_1')

    seminal_p = {}
    seminal_x = {}
    seminal_y = {}

    ct = 0
    curr_ct = 0
    for p in seminal_hlp['seminal']:
        if less_than_or_more == 'l':
            if p['year'] <= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                seminal_p[curr_ct] = hlp.reshape(1, -1)

                seminal_x[curr_ct] = {}
                seminal_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    if ref['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        seminal_x[curr_ct][curr_ct_x] = hlp.reshape(1, -1)
                        curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    if cit['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        seminal_y[curr_ct][curr_ct_y] = hlp.reshape(1, -1)
                        curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1
        if less_than_or_more == "m":
            if p['year'] >= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                seminal_p[curr_ct] = hlp.reshape(1, -1)

                seminal_x[curr_ct] = {}
                seminal_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    seminal_x[curr_ct][curr_ct_x] = hlp.reshape(1, -1)
                    curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    seminal_y[curr_ct][curr_ct_y] = hlp.reshape(1, -1)
                    curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1

        ct += 1

    survey_p = {}
    survey_x = {}
    survey_y = {}

    ct = 0
    curr_ct = 0
    for p in survey_hlp['survey']:
        if less_than_or_more == 'l':
            if p['year'] <= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                survey_p[curr_ct] = hlp.reshape(1, -1)

                survey_x[curr_ct] = {}
                survey_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    if ref['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        survey_x[curr_ct][curr_ct_x] = hlp.reshape(1, -1)
                        curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    if cit['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        survey_y[curr_ct][curr_ct_y] = hlp.reshape(1, -1)
                        curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1
        if less_than_or_more == 'm':
            if p['year'] >= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                survey_p[curr_ct] = hlp.reshape(1, -1)

                survey_x[curr_ct] = {}
                survey_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    survey_x[curr_ct][curr_ct_x] = hlp.reshape(1, -1)
                    curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    survey_y[curr_ct][curr_ct_y] = hlp.reshape(1, -1)
                    curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1
        ct += 1

    uninfluential_p = {}
    uninfluential_x = {}
    uninfluential_y = {}

    ct = 0
    curr_ct = 0
    for p in uninfluential_hlp['uninfluential']:
        if less_than_or_more == 'l':
            if p['year'] <= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                uninfluential_p[curr_ct] = hlp.reshape(1, -1)

                uninfluential_x[curr_ct] = {}
                uninfluential_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    if ref['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        uninfluential_x[curr_ct][curr_ct_x] = hlp.reshape(
                            1, -1)
                        curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    if cit['year'] <= year:
                        doc2vec_model.random.seed(0)
                        hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                         alpha=0.025,
                                                         steps=20)
                        uninfluential_y[curr_ct][curr_ct_y] = hlp.reshape(
                            1, -1)
                        curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1
        if less_than_or_more == 'm':
            if p['year'] >= year:
                doc2vec_model.random.seed(0)
                hlp = doc2vec_model.infer_vector(p['abs'].split(),
                                                 alpha=0.025,
                                                 steps=20)
                uninfluential_p[curr_ct] = hlp.reshape(1, -1)

                uninfluential_x[curr_ct] = {}
                uninfluential_y[curr_ct] = {}

                ct_x = 0
                curr_ct_x = 0
                for ref in p['ref']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(ref['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    uninfluential_x[curr_ct][curr_ct_x] = hlp.reshape(1, -1)
                    curr_ct_x += 1
                    ct_x += 1

                ct_y = 0
                curr_ct_y = 0
                for cit in p['cit']:
                    doc2vec_model.random.seed(0)
                    hlp = doc2vec_model.infer_vector(cit['abs'].split(),
                                                     alpha=0.025,
                                                     steps=20)
                    uninfluential_y[curr_ct][curr_ct_y] = hlp.reshape(1, -1)
                    curr_ct_y += 1
                    ct_y += 1

                curr_ct += 1
        ct += 1

    return [[seminal_p, seminal_x, seminal_y, 'sem '],
            [survey_p, survey_x, survey_y, 'surv '],
            [uninfluential_p, uninfluential_x, uninfluential_y, 'uni ']]
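
task_year repeats the same embed-and-filter logic six times (three groups times two year directions). A condensed sketch of the shared core, assuming less_than_or_more is either 'l' or 'm' as above (hypothetical helper names):

def _infer(tokens):
    doc2vec_model.random.seed(0)  # deterministic inference, as above
    return doc2vec_model.infer_vector(tokens, alpha=0.025, steps=20).reshape(1, -1)

def _keep(rec):
    return rec['year'] <= year if less_than_or_more == 'l' else rec['year'] >= year

def _process_group(group):
    p_vecs, x_vecs, y_vecs, curr_ct = {}, {}, {}, 0
    for p in group:
        if not _keep(p):
            continue
        p_vecs[curr_ct] = _infer(p['abs'].split())
        # the 'l' branch also filters references/citations by year;
        # the 'm' branch keeps all of them, exactly as in the original
        refs = [r for r in p['ref'] if less_than_or_more == 'm' or _keep(r)]
        cits = [c for c in p['cit'] if less_than_or_more == 'm' or _keep(c)]
        x_vecs[curr_ct] = {i: _infer(r['abs'].split()) for i, r in enumerate(refs)}
        y_vecs[curr_ct] = {i: _infer(c['abs'].split()) for i, c in enumerate(cits)}
        curr_ct += 1
    return p_vecs, x_vecs, y_vecs
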
def make_features(metric, task, this_use_stemming):
    data_set = None

    if metric not in ['cos', 'jac', 'emd', 'ipd', 'dist']:
        print('Metric ' + metric + ' unknown.')
        return
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'year']:
        print('Task ' + task + ' unknown.')
        return

    print('Using ' + metric + ' on ' +
          ('un' if not this_use_stemming else '') + 'stemmed ' + task +
          ' vectors.')

    if metric == 'dist' and task == 'year':
        make_year()
        return

    if task == 'tfidf':
        path = get_file_base() + 'tfidf_data/tfidf_' + \
            ('un' if not this_use_stemming else '') + 'stemmed.sav'
        with open(path, 'rb') as f:
            data_set = joblib.load(f)

        # for p in range(0, len(data_set[2][0])):
        #    data_set[2][0][p] = [data_set[2][0][p]]

        # for p in range(0, len(data_set[2][1])):
        #    for x in range(0, len(data_set[2][1][p])):
        #       data_set[2][1][p][x] = [data_set[2][1][p][x]]

        # for p in range(0, len(data_set[2][2])):
        #    for x in range(0, len(data_set[2][2][p])):
        #       data_set[2][2][p][x] = [data_set[2][2][p][x]]

    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle',
                  'rb') as f:
            data_set = pickle.load(f)

    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)

    if task == 'lda':
        data_set = task_lda(this_use_stemming)

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    while ds_ct < len(data_set):
        p = 0
        while p < len(data_set[ds_ct][0]):
            futures = None
            if metric == 'emd':
                futures = executor.submit(do_wasserstein, data_set, ds_ct, p)
            if metric == 'cos':
                futures = executor.submit(do_cosine, data_set, ds_ct, p)
            if metric == 'jac':
                futures = executor.submit(do_jaccard, data_set, ds_ct, p)
            if metric == 'ipd':
                futures = executor.submit(do_component_wise_multiplication,
                                          data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = futures.result()
            p += 1
        ds_ct += 1

    write_to_file(
        get_file_base() + 'extracted_features/' + task + '_' + metric + '_' +
        ('un' if not this_use_stemming else '') + 'stemmed.csv',
        completed_vecs)
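
Usage sketch (metric and task values drawn from the whitelists validated at the top of make_features):

make_features('cos', 'd2v', False)    # -> extracted_features/d2v_cos_unstemmed.csv
make_features('dist', 'year', False)  # delegates to make_year()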
Example #24
    for ref in p['ref']:
        uni_ref.append(ref['year'])

    for cit in p['cit']:
        uni_cit.append(cit['year'])

if seminal == 0:
    trace1 = go.Histogram(x=sem_p, opacity=1)
    trace2 = go.Histogram(x=sem_cit, opacity=0.5)
    trace3 = go.Histogram(x=sem_ref, opacity=0.5)
if seminal == 1:
    trace1 = go.Histogram(x=sur_p, opacity=1)
    trace2 = go.Histogram(x=sur_cit, opacity=0.5)
    trace3 = go.Histogram(x=sur_ref, opacity=0.5)
if seminal == 2:
    trace1 = go.Histogram(x=uni_p, opacity=1)
    trace2 = go.Histogram(x=uni_cit, opacity=0.5)
    trace3 = go.Histogram(x=uni_ref, opacity=0.5)

data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=False,
                   barmode='overlay',
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   yaxis=dict(range=[0, 6999]))
fig = go.Figure(data=data, layout=layout)

plot(fig, get_file_base() + 'plots/' + str(seminal), image='jpeg')
import seaborn as sns
from classify.ClassificationSEM import read_in_csv_data
from general.baseFileExtractor import get_file_base


data, labels = read_in_csv_data(get_file_base() + 'extracted_features/tfidf_cos_unstemmed.csv')

ax = sns.heatmap(data.corr())
figure = ax.get_figure()
figure.savefig(get_file_base() + 'plots/heatmap.png')
import json

from gensim.corpora import Dictionary
from gensim.models import ldamodel

from general.baseFileExtractor import get_file_base, get_seminal_u, get_survey_u, get_uninfluential_u

# read in
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
    survey_hlp = survey_hlp['survey']

with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
    seminal_hlp = seminal_hlp['seminal']

with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)
    uninfluential_hlp = uninfluential_hlp['uninfluential']

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(get_file_base() +
                                       'lda_data/dict_unstemmed')

sem = []
sur = []
uni = []
for p in seminal_hlp:
    sem.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in survey_hlp:
    sur.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in uninfluential_hlp:
    uni.append(lda[dictionary.doc2bow(p['abs'].split())])

fin_sem = []
fin_sur = []
Example #27
from general.baseFileExtractor import get_seminal_s, get_survey_s, get_uninfluential_s, get_seminal_u, get_survey_u, \
    get_uninfluential_u, get_stem, get_file_base, get_what_to_do, get_which_vectors, get_which_distance, get_classifier
from generateData.TFIDFEmbedding import make_tfidf
from generateData.BuildD2VModel import build_d2v_model
from generateData.D2VEmbedding import make_d2v
from generateData.BERTEmbedding import make_bert
from generateData.BuildLDAModel import build_lda_model
from generateData.LDAEmbedding import make_lda
from computeFeatures.FeaturesFromEmbedding import make_features
from classify.ClassificationSEMallC import sem_all_class
from classify.ClassificationSEM import sem_one_class

import os

# build folder structure
if not os.path.exists(os.path.dirname(get_file_base() + 'tfidf_data/')):
    os.makedirs(os.path.dirname(get_file_base() + 'tfidf_data/'))
if not os.path.exists(os.path.dirname(get_file_base() + 'd2v_data/')):
    os.makedirs(os.path.dirname(get_file_base() + 'd2v_data/'))
if not os.path.exists(os.path.dirname(get_file_base() + 'bert_data/')):
    os.makedirs(os.path.dirname(get_file_base() + 'bert_data/'))
if not os.path.exists(os.path.dirname(get_file_base() + 'lda_data/')):
    os.makedirs(os.path.dirname(get_file_base() + 'lda_data/'))

if not os.path.exists(
        os.path.dirname(get_file_base() + 'extracted_features/')):
    os.makedirs(os.path.dirname(get_file_base() + 'extracted_features/'))
if not os.path.exists(
        os.path.dirname(get_file_base() + 'extracted_features/OVR/')):
    os.makedirs(os.path.dirname(get_file_base() + 'extracted_features/OVR/'))
if not os.path.exists(os.path.dirname(get_file_base() + 'plots/')):
    os.makedirs(os.path.dirname(get_file_base() + 'plots/'))

from plotly.offline import plot
import plotly.graph_objs as go
from classify.Classification import read_in_csv_data_sem_sur_uni
from general.baseFileExtractor import get_file_base

vec = 'tfidf'
measure = 'cos'
stem = 'stemmed'

full_data, labels, sem, sur, uni = read_in_csv_data_sem_sur_uni(get_file_base() +
                                                                'extracted_features/tfidf_cos_stemmed.csv')
feature = 'sum'
group = 'A'
title = feature + group + ' ' + vec + ' ' + measure + ' ' + stem[:1]

sem = sem[feature + group]
sur = sur[feature + group]

uni = uni[feature + group]

trace1 = go.Box(x=sem, opacity=1, name='seminal', marker=dict(color='blue'))
trace2 = go.Box(x=sur, opacity=1, name='survey', marker=dict(color='orange'))
trace3 = go.Box(x=uni, opacity=1, name='uninfluential', marker=dict(color='green'))

layout = go.Layout(showlegend=False, autosize=False, width=800, height=250, xaxis_type='log',
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(
                       title='Value for ' + feature + group, showgrid=True, gridcolor='#E2E2E2'
                   ),
                   yaxis=dict(
                       showgrid=False
                   ))

                      marker=dict(color='green'),
                      name='uninfluential references')

data = [trace1, trace2, trace3]
layout = go.Layout(showlegend=True,
                   autosize=False,
                   width=600,
                   height=300,
                   margin=go.layout.Margin(l=50, r=15, b=40, t=10, pad=4),
                   xaxis=dict(title='Years', showgrid=False),
                   yaxis=dict(title='Number of references',
                              showgrid=True,
                              gridcolor='#E2E2E2'),
                   legend=dict(x=0.01,
                               y=1,
                               font=dict(family='sans-serif',
                                         size=12,
                                         color='#000'),
                               bgcolor='#E2E2E2',
                               bordercolor='#FFFFFF',
                               borderwidth=2),
                   paper_bgcolor='#FFFFFF',
                   plot_bgcolor='#FFFFFF')
fig = go.Figure(data=data, layout=layout)

plot(fig, get_file_base() + 'plots/references', image='jpeg')

print(np.mean(sem_ref))
print(np.mean(sur_ref))
print(np.mean(uni_ref))
def main():
    if task not in ['tfidf', 'd2v', 'bert', 'lda', 'years']:
        print('Task ' + task + ' unknown.')
        return

    if task == 'tfidf':
        path = get_file_base() + 'tfidf_data/tfidf_' + \
            ('un' if not use_stemming else '') + 'stemmed.sav'
        with open(path, 'rb') as f:
            data_set = joblib.load(f)

        # todo: delete
        for p in range(0, len(data_set[2][0])):
            data_set[2][0][p] = [data_set[2][0][p]]

        for p in range(0, len(data_set[2][1])):
            for x in range(0, len(data_set[2][1][p])):
                data_set[2][1][p][x] = [data_set[2][1][p][x]]

        for p in range(0, len(data_set[2][2])):
            for x in range(0, len(data_set[2][2][p])):
                data_set[2][2][p][x] = [data_set[2][2][p][x]]

    if task == 'd2v':
        with open(get_file_base() + 'd2v_data/d2v_unstemmed.pickle',
                  'rb') as f:
            data_set = pickle.load(f)

    if task == 'bert':
        with open(get_file_base() + 'bert_data/sur_bert_unstemmed.json',
                  encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/sem_bert_unstemmed.json',
                  encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_file_base() + 'bert_data/uni_bert_unstemmed.json',
                  encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

        data_set = task_bert(survey_hlp, seminal_hlp, uninfluential_hlp)

    if task == 'lda':
        data_set = task_lda(use_stemming)

    if task == 'years':
        with open(get_survey_s(), encoding='latin-1') as s:
            survey_hlp = json.load(s)
        with open(get_seminal_s(), encoding='latin-1') as s:
            seminal_hlp = json.load(s)
        with open(get_uninfluential_s(), encoding='latin-1') as s:
            uninfluential_hlp = json.load(s)

        data_set = task_year(survey_hlp, seminal_hlp, uninfluential_hlp)

        for ds in range(0, 3):
            for p in range(0, len(data_set[ds][0])):
                data_set[ds][0][p] = [[data_set[ds][0][p]]]

            for p in range(0, len(data_set[ds][1])):
                for x in range(0, len(data_set[ds][1][p])):
                    data_set[ds][1][p][x] = [[data_set[ds][1][p][x]]]

            for p in range(0, len(data_set[ds][2])):
                for x in range(0, len(data_set[ds][2][p])):
                    data_set[ds][2][p][x] = [[data_set[ds][2][p][x]]]

    ds_ct = 0
    executor = ThreadPoolExecutor(max_workers=64)
    completed_vecs = {}

    # data_set[][0] -> P
    # data_set[][1] -> X, references
    # data_set[][2] -> Y, citations
    while ds_ct < len(data_set):
        p = 0

        while p < len(data_set[ds_ct][0]):
            futures = executor.submit(do_one_doc_rep, data_set, ds_ct, p)

            completed_vecs[data_set[ds_ct][3] + str(p)] = futures.result()
            p += 1
        ds_ct += 1

    write_to_file(
        get_file_base() + 'extracted_features/OVR/' + task + '_' +
        ('un' if not use_stemming else '') + 'stemmed_OVR.csv', completed_vecs)