import joblib
import text_processing as tp
import dataset_utils as dsu
import pandas as pd
import os

out_dir = "preprocessed_data"

vectorizer = joblib.load("models/vectorizer.pkl")
analyzer = vectorizer.build_analyzer()

data_zip = [dsu.read_dataset_UK_ethos(True), dsu.read_dataset_UK_ethos(False)]
out_files_zip = ["ethos_abs_preprocessed.csv", "ethos_no_abs_preprocessed.csv"]

for data, out_file in zip(data_zip, out_files_zip):
    print("data size:", len(data))
    data_text = tp.lemmatize_data(data['titolo'])
    data_text = tp.preprocess_text_data(data_text, analyzer)
    print("preprocessed data size: {}".format(len(data_text)))
    data.loc[:, "preprocessed_data"] = data_text

    data.to_csv(os.path.join(out_dir, out_file),
                index=None,
                columns=[
                    "id", "titolo", "autore", "univ", "publisher", "anno",
                    "abs", "tipo", "argomento", "preprocessed_data"
                ])
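tp.lemmatize_data and tp.preprocess_text_data are defined elsewhere in the project. As a rough orientation, here is a minimal sketch of what the preprocessing step might look like, assuming it only applies the vectorizer's analyzer to each document and rejoins the tokens; the body below is an illustration, not the project's actual code.

# Hypothetical sketch of text_processing.preprocess_text_data, assuming it
# runs the vectorizer's analyzer (tokenization, lowercasing, stop-word removal)
# over each document and returns the rejoined tokens as strings.
def preprocess_text_data_sketch(texts, analyzer):
    processed = []
    for doc in texts:
        tokens = analyzer(str(doc))  # analyzer comes from vectorizer.build_analyzer()
        processed.append(" ".join(tokens))
    return processed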
Example #2
        #if hit_score > score_threshold:
        #update_global_scores(hit_score, max_score, min_score)
        query_hits[hit['_id']] = hit_score
    return query_hits
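Only the tail of the scoring function is shown above. A possible sketch of the full helper, assuming it runs a match query against Elasticsearch and maps each hit's document id to its relevance score; the function, index, and field names are assumptions.

# Hypothetical sketch of the query-scoring helper whose tail is shown above.
# Index and field names are assumptions, not taken from the original code.
def get_query_hits(es_client, index_name, field, query_text, size=10):
    response = es_client.search(index=index_name,
                                body={"query": {"match": {field: query_text}}},
                                size=size)
    query_hits = {}
    for hit in response['hits']['hits']:
        query_hits[hit['_id']] = hit['_score']
    return query_hits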


####################################################################

es = Elasticsearch('localhost', port=9205)

# choose which dataset to read
# - US
# - UK without abstracts (pass False)
# - UK with abstracts (pass True)
for data in [
        dsu.read_dataset_UK_ethos(True),
        dsu.read_dataset_UK_ethos(False)
]:
    #for data in [dsu.read_dataset_US()]:
    #data = dsu.read_dataset_UK_ethos(True)
    print("data read")

    for index, row in data.iterrows():
        print(index)
        title = row['titolo'].replace("\"", "'")
        doc_id = row['id']
        if 'abstract' in data.columns:
            abstract = row['abstract'].replace("\"", "'")

        print(title)
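The example is cut off before anything is written to Elasticsearch. A minimal, hypothetical sketch of how each row could be indexed; the index name "theses" and the document fields are assumptions, not part of the original script.

# Hypothetical sketch: index a single thesis record into Elasticsearch.
# The index name "theses" is an assumption; the original indexing call is not shown.
def index_thesis(es_client, doc_id, title, abstract=None):
    doc_body = {"title": title}
    if abstract is not None:
        doc_body["abstract"] = abstract
    es_client.index(index="theses", id=doc_id, body=doc_body)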
Example #3
import dataset_utils as dsu
import pandas as pd
import re

phils = {"title": [], "subject": [], "id": []}
no_phils = {"title": [], "subject": [], "id": []}


def scan_philosophy(data):
    for index, row in data.iterrows():
        print(index, end="\r")

        if re.search(r'[Pp]hilosop', str(row['argomento'])) is not None:
            append_data(phils, row)
        else:
            append_data(no_phils, row)


def append_data(dictionary, row):
    dictionary['title'].append(row['titolo'].strip())
    dictionary['subject'].append(str(row['argomento']).strip())
    dictionary['id'].append(row['id'])


print("Scanning file with abstracts")
scan_philosophy(dsu.read_dataset_UK_ethos(True))
print("Scanning file without abstracts")
scan_philosophy(dsu.read_dataset_UK_ethos(False))

pd.DataFrame(phils).to_csv("data/philosophy.csv", index=None)
pd.DataFrame(no_phils).to_csv("data/no_philosophy.csv", index=None)
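The same check can also be written with a case-insensitive flag instead of a character class; a small illustration, purely a stylistic alternative and not part of the original script.

# Equivalent check using re.IGNORECASE instead of the [Pp] character class.
import re
assert re.search(r'philosop', "History of Philosophy", re.IGNORECASE) is not None
assert re.search(r'philosop', "Molecular Biology", re.IGNORECASE) is None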
Example #4
if use_preprocessed:
    data_abs = pd.read_csv("preprocessed_data/ethos_abs_preprocessed.csv")
    data_no_abs = pd.read_csv(
        "preprocessed_data/ethos_no_abs_preprocessed.csv")
    print("Number of samples:", data_abs.count()[0], data_no_abs.count()[0])

    for data, label in zip([data_abs, data_no_abs], ["abs", "no_abs"]):
        TDmatrix = vectorizer.transform(feed_preprocessed_data(data))
        res = clf.predict(TDmatrix)
        probs = clf.predict_proba(TDmatrix)
        res_df = add_columns_to_df(data, res, probs)
        res_df.to_csv("results/classification_" + label + ".csv", index=None)
        print(label, "saved")
else:
    #abstracts
    data = dsu.read_dataset_UK_ethos(True)
    data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"),
                               columns=['titolo'])
    data_titles = tp.preprocess_dataframe(data_titles, analyzer)
    data_text = data_titles
    print("samples:", data_text.count()[0])

    TDmatrix = vectorizer.transform(feed_data(data_text, True))
    res = clf.predict(TDmatrix)
    probs = clf.predict_proba(TDmatrix)
    res_df = add_columns_to_df(data, res, probs)
    res_df.to_csv("results/classification_abs.csv", index=None)
    print("abs saved")

    #no abstracts
    data_no_abs = dsu.read_dataset_UK_id(False)
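feed_preprocessed_data, feed_data, and add_columns_to_df are defined elsewhere in the project. A rough sketch of what the first and last might do, judging only from how they are called here; the probability column names are assumptions.

# Hypothetical sketches of the helpers used above; both are inferred from the
# call sites, not taken from the project's actual implementation.
def feed_preprocessed_data(df):
    # Yield the already-preprocessed text column for the vectorizer.
    return df['preprocessed_data'].astype(str).tolist()

def add_columns_to_df(df, predictions, probabilities):
    # Attach the predicted label and per-class probabilities to a copy of df.
    out = df.copy()
    out['classification'] = predictions
    for i in range(probabilities.shape[1]):
        out['prob_class_{}'.format(i)] = probabilities[:, i]
    return out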
Example #5
num_results_title = 10
num_results_abstract = 20
score_threshold = 30.0
out_file = "../data/tmf_entities_{}.csv".format(theses_dataset)
max_score = 0
min_score = 100

####################################################################

es = Elasticsearch('localhost', port=9200)

data_files = []
if theses_dataset == "US":
    data_files.append(dsu.read_dataset_US())
else:
    data_files.append(dsu.read_dataset_UK_ethos(True))
    data_files.append(dsu.read_dataset_UK_ethos(False))
print("data read")

tmf_entities = {}
for input_file in data_files:
    for index, row in input_file.iterrows():
        print(index, end="\r")

        title = row['title'].strip().replace("\"", "'")
        doc_id = row['id']

        if 'abstract' in input_file.columns:
            abstract = row['abstract'].strip().replace("\"", "'")

        if '***NO TITLE PROVIDED***' not in title:
            tmf_entities[doc_id] = {
Example #6
import text_processing as tp
import dataset_utils as dsu
import pandas as pd
import numpy as np
import joblib

use_preprocessed = False

vectorizer = joblib.load("models/vectorizer.pkl")
analyzer = vectorizer.build_analyzer()
clf = joblib.load("models/classifier.pkl")

data_abs = dsu.read_dataset_UK_ethos(True)
data_nabs = dsu.read_dataset_UK_ethos(False)
data = pd.concat([data_abs[['id', 'titolo']], data_nabs[['id', 'titolo']]],
                 ignore_index=True)
print("data read")

if use_preprocessed:
    data_titles = pd.read_csv("data/preprocessed_titles.csv")['titolo']
else:
    data_titles = tp.lemmatize_data(data['titolo'])
    data_titles = tp.preprocess_text_data(data_titles, analyzer)
    pd.DataFrame({
        'titolo': data_titles
    }).to_csv("data/preprocessed_titles.csv", index=None)

print("samples:", len(data_titles))
TDmatrix = vectorizer.transform(data_titles)
y_pred = clf.predict(TDmatrix)
y_pred_probs = clf.predict_proba(TDmatrix)
data.loc[:, 'classification'] = y_pred
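The example ends right after the predicted labels are attached. A possible continuation, assuming the class probabilities should be saved alongside them; the output path and probability column names are assumptions.

# Hypothetical continuation: attach per-class probabilities and save the result.
# The output path and probability column names are assumptions.
for i, class_label in enumerate(clf.classes_):
    data.loc[:, 'prob_{}'.format(class_label)] = y_pred_probs[:, i]
data.to_csv("results/titles_classification.csv", index=None)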
Example #7
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import dataset_utils as dsu

stat_year = True  # referenced below; flag value assumed
stat_corpus = False
stat_train = False
stat_test = False
stat_feature_score = False

vectorizer = CountVectorizer(stop_words="english", analyzer="word")
analyzer = vectorizer.build_analyzer()

if stat_year:
    #anno pubblicazione
    title = "Years Distribution"
    xx = "number of documents"
    yy = "year"
    plt.clf()

    file_abs = dsu.read_dataset_UK_ethos(True)
    file_nabs = dsu.read_dataset_UK_ethos(False)

    for file, color in zip([file_abs, file_nabs], ['r', 'b']):

        years = set()
        plot_data = {i: 0 for i in range(1850, 2030)}
        for index, row in file.iterrows():
            year = row['anno']
            if str(year).strip() == 'dcterms_issued:: @@MISSING-DATA':
                continue
            years.add(int(year))
            try:
                plot_data[int(year)] += 1
            except KeyError:
                continue
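        # Hypothetical continuation of the truncated plotting code: draw the
        # per-year counts for this file. The line style, axis orientation
        # (years on the x axis), and output file name are assumptions, since
        # the original plotting calls are not shown.
        sorted_years = sorted(plot_data)
        plt.plot(sorted_years, [plot_data[y] for y in sorted_years], color)

    plt.title(title)
    plt.xlabel(yy)   # "year"
    plt.ylabel(xx)   # "number of documents"
    plt.savefig("years_distribution.png")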