# NOTE(review): this physical line is a collapsed multi-line fragment (original
# newlines/indentation were lost in extraction; the inline "#print_sample..."
# fragment swallows the tokens after it). Do not execute as-is. What it contains:
#   1. The tail of a generator that yields the 'preprocessed_data' column from
#      two concatenated DataFrames (df2 indexed with an offset of size1, else df1)
#      — the enclosing `def` is outside this chunk.
#   2. Construction of a TF-IDF vectorizer via tp.build_vectorizer("tfidf",
#      min_df, max_df, n_features) and its analyzer (min_df/max_df/n_features
#      are defined outside this view — TODO confirm).
#   3. Reading the 1000-row test set TSV (tab-delimited, explicit column names),
#      filling NaNs with "", then lemmatizing + preprocessing the 'title' and
#      'abstract' columns separately and merging them on the index.
#   4. Concatenating title + " " + abstract per row into a new
#      'preprocessed_data' column on test_df.
#   5. Loading already-preprocessed ORCID and DOIBoost abstract datasets and
#      concatenating them into data_abs; the orcid_no_abs load at the end is the
#      start of the no-abstract counterpart (continued past this line).
yield df2.iloc[index-size1]['preprocessed_data'] else: #print_sample(df1.iloc[index]['preprocessed_data'], True) yield df1.iloc[index]['preprocessed_data'] cv = tp.build_vectorizer("tfidf", min_df, max_df, n_features) analyzer = cv.build_analyzer() vectorizer = cv # read and preprocess text data test_df = pd.read_csv("data/test_set_1000.tsv", delimiter="\t", names=['title','creator','university','publisher', 'year','abstract','type','subject','id','philosophy']) print("test size:",test_df.count()[0]) test_df = test_df.fillna("") test_titles = pd.DataFrame(tp.lemmatize_data(test_df[['title']],"title"), columns=['title']) test_titles = tp.preprocess_dataframe(test_titles, analyzer) test_abs = pd.DataFrame(tp.lemmatize_data(test_df[['abstract']],"abstract"), columns=['abstract']) test_abs = tp.preprocess_dataframe(test_abs, analyzer) test_text = test_titles.merge(test_abs,left_index=True,right_index=True) preprocessed_data = [] for index,row in test_text.iterrows(): preprocessed_data.append(row['title'] + " " + row['abstract']) print("preprocessed data size:",len(preprocessed_data)) test_df.loc[:,"preprocessed_data"] = preprocessed_data orcid_abs = pd.read_csv("preprocessed_data/orcid_abs_preprocessed.csv")[['preprocessed_data']] doiboost_abs = pd.read_csv("preprocessed_data/doiboost_abs_preprocessed.csv")[['preprocessed_data']] data_abs = pd.concat([orcid_abs, doiboost_abs]) orcid_no_abs = pd.read_csv("preprocessed_data/orcid_no_abs_preprocessed.csv")[['preprocessed_data']]
# NOTE(review): collapsed multi-line fragment; the leading string literal is the
# dangling argument of a pd.read_csv(...) call whose opening was cut off at the
# chunk boundary (presumably building data_no_abs from the EThOS no-abstract
# file — verify against the preceding section). Contents:
#   1. For each (dataset, label) in [(data_abs, "abs"), (data_no_abs, "no_abs")]:
#      vectorize via feed_preprocessed_data, predict class + probabilities with
#      clf, attach results via add_columns_to_df, and save to
#      results/classification_<label>.csv.
#   2. An `else:` branch whose matching `if` is outside this view: classifies the
#      UK EThOS dataset using titles only ('titolo' column), saving
#      results/classification_abs.csv.
#   3. Start of the no-abstract EThOS pass — NOTE(review): it loads into
#      data_no_abs (dsu.read_dataset_UK_id(False)) but then lemmatizes
#      data[['titolo']], i.e. the *previous* DataFrame, not data_no_abs; looks
#      like a copy-paste bug — confirm against the full file before fixing.
"preprocessed_data/ethos_no_abs_preprocessed.csv") print("Number of samples:", data_abs.count()[0], data_no_abs.count()[0]) for data, label in zip([data_abs, data_no_abs], ["abs", "no_abs"]): TDmatrix = vectorizer.transform(feed_preprocessed_data(data)) res = clf.predict(TDmatrix) probs = clf.predict_proba(TDmatrix) res_df = add_columns_to_df(data, res, probs) res_df.to_csv("results/classification_" + label + ".csv", index=None) print(label, "saved") else: #abstracts data = dsu.read_dataset_UK_ethos(True) data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"), columns=['titolo']) data_titles = tp.preprocess_dataframe(data_titles, analyzer) data_text = data_titles print("samples:", data_text.count()[0]) TDmatrix = vectorizer.transform(feed_data(data_text, True)) res = clf.predict(TDmatrix) probs = clf.predict_proba(TDmatrix) res_df = add_columns_to_df(data, res, probs) res_df.to_csv("results/classification_abs.csv", index=None) print("abs saved") #no abstracts data_no_abs = dsu.read_dataset_UK_id(False) data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"), columns=['titolo']) data_titles = tp.preprocess_dataframe(data_titles, analyzer)
# NOTE(review): collapsed multi-line fragment — training-data preprocessing.
# Contents:
#   1. Builds a TF-IDF vectorizer (default parameters here, unlike the
#      parameterized build elsewhere in the file — confirm this divergence is
#      intentional).
#   2. If not use_preprocessed: reads positive (philosophy) and negative
#      samples, then undersamples negatives to len(positives) * negative_ratio
#      by drawing random row indexes. NOTE(review): randint-in-a-loop samples
#      WITH replacement, so the same negative row can appear multiple times —
#      if distinct rows are intended, DataFrame.sample(n, replace=False) would
#      be the fix; confirm intent before changing.
#   3. For both classes, lemmatizes and preprocesses the 'subject' (via
#      preprocess_subjects), 'title', and 'abstract' columns, merges the three
#      per class on the index, and caches the result to
#      preprocessed_data/{phil,nphil}_text.csv.
#   4. Else: reloads the cached lemmatized data instead of recomputing.
vectorizer = tp.build_vectorizer("tfidf") # preprocess text data if not use_preprocessed: [positive_samples, negative_samples] = tp.read_samples("data/philosophy.csv", "data/no_philosophy.csv") num = negative_samples.count()[0] - 1 rand_indexes = [randint(0,num) for _ in range(len(positive_samples)*negative_ratio)] negative_samples = negative_samples.iloc[rand_indexes] print("Positive samples:",positive_samples.count()[0]) print("Negative samples:",negative_samples.count()[0]) phil_subj = pd.DataFrame(tp.lemmatize_data(positive_samples[['subject']],"subject"), columns=['subject']) phil_subj = tp.preprocess_subjects(phil_subj) phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples[['title']],"title"), columns=['title']) phil_titles = tp.preprocess_dataframe(phil_titles, analyzer) phil_abs = pd.DataFrame(tp.lemmatize_data(positive_samples[['abstract']],"abstract"), columns=['abstract']) phil_abs = tp.preprocess_dataframe(phil_abs, analyzer) nphil_subj = pd.DataFrame(tp.lemmatize_data(negative_samples[['subject']],"subject"), columns=['subject']) nphil_subj = tp.preprocess_subjects(nphil_subj) nphil_titles = pd.DataFrame(tp.lemmatize_data(negative_samples[['title']],"title"), columns=['title']) nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer) nphil_abs = pd.DataFrame(tp.lemmatize_data(negative_samples[['abstract']],"abstract"), columns=['abstract']) nphil_abs = tp.preprocess_dataframe(nphil_abs, analyzer) phil_text = phil_subj.merge(phil_titles,left_index=True,right_index=True).merge(phil_abs,left_index=True,right_index=True) nphil_text = nphil_subj.merge(nphil_titles,left_index=True,right_index=True).merge(nphil_abs,left_index=True,right_index=True) tp.save_lemmatized_data(phil_text, nphil_text, "preprocessed_data/phil_text.csv", "preprocessed_data/nphil_text.csv") else: [phil_text, nphil_text] = tp.read_lemmatized_data("preprocessed_data/phil_text.csv", "preprocessed_data/nphil_text.csv")
# NOTE(review): collapsed multi-line fragment — an interactive/demo loop over
# the negative ("no philosophy") dataset using a persisted model. Contents:
#   1. Loads rows with abstracts present (the commented alternative selects the
#      missing-abstract subset instead), a pickled vectorizer, and a pickled
#      LinearSVC classifier (the predict_proba call is commented out, which is
#      consistent: LinearSVC exposes decision_function, not predict_proba).
#   2. Infinite loop: picks a random row, lemmatizes + preprocesses its
#      abstract, title, and subject (subject cleaned of punctuation via regex
#      instead of the analyzer), joins them into one text, vectorizes, and
#      prints when the classifier predicts class 1 (the print body continues
#      past this chunk).
# NOTE(review): randint(0, min(data.count())) is inclusive of the upper bound,
# and data.count() counts non-null values per column — if every value is
# non-null this can produce index == len(data), making iloc raise IndexError;
# randint(0, len(data) - 1) (or DataFrame.sample(1)) would be safer. Confirm
# before changing — not fixed here because the fragment is incomplete.
data = tp.remove_missing_abstract(pd.read_csv("no_philosophy.csv")) #data = tp.select_missing_abstract(pd.read_csv("no_philosophy.csv")) vectorizer = joblib.load("vectorizer.pkl") analyzer = vectorizer.build_analyzer() #clf = joblib.load("randomforestCLF.pkl") clf = joblib.load("LinearSVC_CLF.pkl") while True: index = randint(0, min(data.count())) row = data.iloc[[index]] abstract_df = pd.DataFrame(tp.lemmatize_data(row, "abstract", False), columns=['abstract']) abstract = tp.preprocess_dataframe(abstract_df, analyzer).iloc[0]["abstract"] title_df = pd.DataFrame(tp.lemmatize_data(row, "title", False), columns=['title']) title = tp.preprocess_dataframe(title_df, analyzer).iloc[0]["title"] subject_df = pd.DataFrame(tp.lemmatize_data(row, "subject", False), columns=['subject']) subject = re.sub(r'[\.\,\(\)\[\]\;\']', '', subject_df.iloc[0]["subject"]) text = ' '.join([subject, title, abstract]) vec = vectorizer.transform([text]) res = clf.predict(vec) #probs = clf.predict_proba(vec) if res[0] == 1: print("\n")
# NOTE(review): collapsed multi-line fragment — title-only variant of the
# training pipeline (contrast with the subject+title+abstract variant elsewhere
# in the file). Contents:
#   1. Reads the 1000-row test set TSV with explicit column names;
#      positive_samples_train / negative_samples_train are defined outside this
#      view.
#   2. Lemmatizes + preprocesses ONLY the 'title' column for the positive and
#      negative training sets and for the test set; phil_text / nphil_text /
#      test_text are aliased to the title frames directly (no merge needed with
#      a single feature column).
#   3. Fits the vectorizer on feed_data() (defined outside this view —
#      presumably yields the training texts; confirm) and persists it to
#      models/vectorizer.pkl via joblib.
test_samples = pd.read_csv("data/test_set_1000.tsv", delimiter="\t", names=[ 'title', 'creator', 'university', 'publisher', 'year', 'abstract', 'type', 'subject', 'id', 'philosophy' ]) print("Positive samples_train:", positive_samples_train.count()[0]) print("Negative samples_train:", negative_samples_train.count()[0]) print("Test samples:", test_samples.count()[0]) phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples_train[['title']], "title"), columns=['title']) phil_titles = tp.preprocess_dataframe(phil_titles, analyzer) nphil_titles = pd.DataFrame(tp.lemmatize_data( negative_samples_train[['title']], "title"), columns=['title']) nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer) phil_text = phil_titles nphil_text = nphil_titles test_titles = pd.DataFrame(tp.lemmatize_data(test_samples[['title']], "title"), columns=['title']) test_titles = tp.preprocess_dataframe(test_titles, analyzer) test_text = test_titles # transform text data into vector space vectorizer.fit(feed_data()) joblib.dump(vectorizer, "models/vectorizer.pkl")