import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from src.utils import io
from src.utils import clean_text as ct  # assumed import path for the `ct` cleaning helper


def run():
    # 1. Read dataset (tmp.json)
    articles = io.read_json_file('/home/elenaruiz/Documents/TFG/FNC/src/data/tmp.json')
    df = pd.DataFrame(data=articles['articles'])

    # 2. Clean data: build a token corpus from title, subtitle and body
    df['corpus'] = df['title']
    for i, row in df.iterrows():
        x = ct.clean_text_by_word(row['title'], True)
        y = ct.clean_text_by_word(row['subtitle'], True)
        z = []
        for sent in row['text']:
            z += ct.clean_text_by_word(sent, True)
        df.at[i, 'corpus'] = x + y + z  # str.replace() on a row copy had no effect; assign instead

    # 3. Split data
    X_train, X_test = train_test_split(df, random_state=0)
    print(X_train)
    print(X_test)

    # 4. Tag each doc with its 'fake' label
    tagged_data = []
    for i in df.index:
        # tags must be a list; a bare string is iterated character by character
        tagged_data.append(TaggedDocument(words=df['corpus'][i], tags=[str(df['fake'][i])]))

    # 5. Doc2Vec model + build vocab
    max_epochs = 100
    vec_size = 20
    alpha = 0.025
    model = Doc2Vec(vector_size=vec_size,  # `size` in gensim < 4; was unused in the original
                    alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
    model.build_vocab(tagged_data)

    # 6. Train
    for epoch in range(max_epochs):
        model.train(tagged_data, total_examples=model.corpus_count,
                    epochs=model.epochs)  # model.iter in gensim < 4
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    # 7. Test: this lookup was missing in the original; docs nearest the 'fake' tag
    similar_doc = model.dv.most_similar('1')  # model.docvecs in gensim < 4
    print(similar_doc)
    i = similar_doc[0][0]
    print(i)
    row = df.loc[[1]]
    print(row['title'], row['text'])
    row = df.loc[[8]]
    print(row['title'], row['text'])
    return
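# A minimal usage sketch, not in the original source: score one unseen article
# with the model trained in run(). `model` is the trained Doc2Vec object and
# `title` a hypothetical input string; `ct` is the cleaner imported above.
def rank_new_article(model, title):
    tokens = ct.clean_text_by_word(title, True)     # same cleaning as training
    vector = model.infer_vector(tokens)             # embed the unseen document
    return model.dv.most_similar([vector], topn=2)  # nearest label tags: '0' / '1'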
from src.utils import io
from src.fake_news_detector.translation import request_title_subtitle, request_body  # assumed import path


def run(nmin, nmax):
    for x in range(nmin, nmax):
        translate = {}
        path = 'src/data/articles/Article_' + str(x) + '.json'
        print('Reading file ...', path)
        content = io.read_json_file(path)
        print('Done!')

        print('Requesting title and subtitle translation...')
        # Title and subtitle request
        resp_one = request_title_subtitle(content)
        translate['title'] = resp_one[0]
        translate['subtitle'] = resp_one[1]

        print('Requesting text body translation...')
        # Text body request
        resp_two = request_body(content)
        translate['text'] = resp_two

        # URL & FAKE label are copied through unchanged
        translate['url'] = content['url']
        translate['fake'] = content['fake']
        print('Done!')

        print('Writing article in English ...')
        path_dest = 'src/data/articles_en/Article_' + str(x) + '.json'
        io.write_json_file(path_dest, translate)
        print('Done!')
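# Usage sketch, not in the original file: translate the same article range
# (Article_1 .. Article_115) that the dataset builders below read back.
if __name__ == "__main__":
    run(1, 116)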
from src.utils import io
from src.fake_news_detector.data import get_dataframe_from_json  # assumed import path


def modelate_dataset():
    articles = {'articles': []}
    n_ini = 1
    n_fi = 116
    for x in range(n_ini, n_fi):
        path = 'src/data/articles_en/Article_' + str(x) + '.json'
        # Read file; skip articles that are missing or unreadable
        content = io.read_json_file(path)
        if content is not None:
            articles['articles'].append(content)
    # Create dataframe
    return get_dataframe_from_json(articles)
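# Usage sketch, not in the original file: build the working dataframe from the
# translated articles and check its dimensions.
if __name__ == "__main__":
    df = modelate_dataset()
    print(df.shape)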
from src.utils import io
from src.fake_news_detector.scraping import get_variable, make_request  # assumed import path


def get_all_news(path, is_fake, n):
    url_list = io.read_json_file(path)
    for key in url_list.keys():
        css_attr = get_variable(key)  # per-site CSS selectors; -1 means unsupported site
        if css_attr != -1:
            for item in url_list[key]:
                n += 1
                try:
                    print('Making request of ...', item)
                    result = make_request(item, css_attr)
                    result['fake'] = is_fake
                    out_path = 'src/data/articles/Article_' + str(n) + '.json'
                    io.write_json_file(out_path, result)
                    print('File saved!')
                except ValueError as err:
                    print(err)
                    print('Problem with: ', item)
    return n
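# Usage sketch, not in the original file: `n` is threaded through the calls so
# article numbering keeps increasing across sources. Both JSON paths are
# hypothetical.
if __name__ == "__main__":
    n = 0
    n = get_all_news('src/data/urls_fake.json', True, n)    # fake outlets
    n = get_all_news('src/data/urls_real.json', False, n)   # reliable outlets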
from src.utils import io

# Collect all translated articles into a single temporary dataset (tmp.json).
articles = {'articles': []}
n_ini = 1
n_fi = 116
for x in range(n_ini, n_fi):
    path = 'src/data/articles_en/Article_' + str(x) + '.json'
    # Read file; skip missing articles
    content = io.read_json_file(path)
    if content is not None:
        articles['articles'].append(content)
io.write_json_file('src/data/tmp.json', articles)
import pandas as pd

from src.utils import io
from src.fake_news_detector.classification.sub_classifications import get_similarity_prediction

if __name__ == "__main__":
    articles = io.read_json_file(
        '/home/elenaruiz/Documents/TFG/FNC/src/data/dataset_similarity.json')
    df = pd.DataFrame(data=articles['articles'])
    # Run the similarity classifier once per model type
    for model_type in ('LR', 'DTC', 'KNC', 'LDA', 'GNB', 'SVC'):
        pred = get_similarity_prediction(df, model_type, True)
import pandas as pd

from src.fake_news_detector.core.classification import doc2vec_classification as dc
from src.utils import io

if __name__ == "__main__":
    # 1. Read
    articles = io.read_json_file(
        '/home/elenaruiz/Documents/TFG/FNC/src/data/dataset_content.json')
    df = pd.DataFrame(data=articles['articles'])

    # 2. Create dataframe
    corpus = pd.DataFrame()
    corpus['corpus'] = df['title_subject']
    corpus['fake'] = df['fake']  # the original read df['id'] here, which looks like a slip
    corpus['id'] = list(range(0, len(df)))  # DataFrame has no .rows attribute

    # 3. DOC2VEC Test
    data = corpus  # `data` was undefined in the original; the helpers annotate this frame
    models = dc.generate_doc2vec_model(data)
    models.get_similarty_doc2vec(data)

    # 4. Check results
    error_1_1 = 0
    error_1_2 = 0
    error_2_1 = 0
    error_2_2 = 0
    for i, row in data.iterrows():
        label = row['label'] * 1  # cast to int
        # Model 1 : MAX
        if row['result_m1_max'] != label:
            # Counter semantics below are an assumption: split errors by class.
            if label == 1:
                error_1_1 += 1
            else:
                error_1_2 += 1
        # (The original file is truncated here; model 2 would update
        # error_2_1 / error_2_2 the same way.)
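    # Reporting sketch, not in the original (which is truncated above):
    # summarise model 1 from the counters already defined.
    total = len(data)
    print('Model 1 (MAX): %d errors on fake, %d on real, accuracy %.2f'
          % (error_1_1, error_1_2, 1 - (error_1_1 + error_1_2) / total))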