Code example #1
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

from src.utils import io
# Assumed import path for the `ct` text-cleaning helpers used below
from src.fake_news_detector.preprocessing import clean_text as ct


def run():
    # 1. Read dataset (tmp.json)
    articles = io.read_json_file('/home/elenaruiz/Documents/TFG/FNC/src/data/tmp.json')
    df = pd.DataFrame(data=articles['articles'])


    # 2. Clean data: build one list of cleaned tokens per article
    df['corpus'] = df['title']
    for index, row in df.iterrows():
        x = ct.clean_text_by_word(row['title'], True)
        y = ct.clean_text_by_word(row['subtitle'], True)
        z = []
        for sent in row['text']:
            z += ct.clean_text_by_word(sent, True)
        # Write the cleaned tokens back into the dataframe
        df.at[index, 'corpus'] = x + y + z


    # 3. Split data
    X_train, X_test = train_test_split(df, random_state=0)
    print(X_train)
    print(X_test)

    # 4. Tag each document with its 'fake' label so Doc2Vec can train on it
    tagged_data = []
    for index, row in df.iterrows():
        tagged_data.append(TaggedDocument(words=row['corpus'], tags=[str(row['fake'])]))

    # 5. Doc2Vec model + build vocab
    max_epochs = 100
    vec_size = 20
    alpha = 0.025
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    
    model.build_vocab(tagged_data)
    # 6. Train the model (learning rate decayed manually each epoch)
    for epoch in range(max_epochs):
        #print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    # 7. Test: infer a vector for the first test document and query the most similar tags
    # (assumption: `similar_doc` comes from inferring the first test document's vector)
    inferred = model.infer_vector(X_test.iloc[0]['corpus'])
    similar_doc = model.docvecs.most_similar([inferred])
    print(similar_doc)
    i = similar_doc[0][0]
    print(i)
    # Spot-check two articles by index
    row = df.loc[[1]]
    print(row['title'], row['text'])
    row = df.loc[[8]]
    print(row['title'], row['text'])
    return
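The training loop above calls model.train() once per epoch and decays the learning rate by hand. With gensim 3.5 or later the same effect can come from a single train() call, which interpolates alpha down to min_alpha internally; a minimal sketch reusing the tagged_data built above (not the project's actual code):

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(vector_size=20, alpha=0.025, min_alpha=0.00025, min_count=1, dm=1)
model.build_vocab(tagged_data)
# A single call runs all epochs and handles the alpha decay from `alpha` down to `min_alpha`
model.train(tagged_data, total_examples=model.corpus_count, epochs=100)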
Code example #2
def run(nmin, nmax):

    for x in range(nmin, nmax):
        translate = {}
        path = 'src/data/articles/Article_' + str(x) + '.json'
        print('Reading file ...', path)
        content = io.read_json_file(path)
        print('Done!')

        print('Requesting title and subtitle translation...')
        # Title and subtitle request
        resp_one = request_title_subtitle(content)
        translate['title'] = resp_one[0]
        translate['subtitle'] = resp_one[1]
        print('Requesting text body translation...')
        # Text body request
        resp_two = request_body(content)
        translate['text'] = resp_two

        # URL  & FAKE
        translate['url'] = content['url']
        translate['fake'] = content['fake']
        print('Done!')

        print('Writing article in English ...')
        path_dest = 'src/data/articles_en/Article_' + str(x) + '.json'
        io.write_json_file(path_dest, translate)
        print('Done!')
Code example #3
def modelate_dataset():
    articles = {'articles': []}
    n_ini = 1
    n_fi = 116
    for x in range(n_ini, n_fi):
        path = 'src/data/articles_en/Article_' + str(x) + '.json'
        # Read file
        content = io.read_json_file(path)
        if content is not None:
            articles['articles'].append(content)
    # Create dataframe
    return get_dataframe_from_json(articles)
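get_dataframe_from_json() is not shown in this snippet. Judging from how the other examples load the same {'articles': [...]} structure, a minimal sketch of such a helper (an assumption, not the project's actual implementation) could be:

import pandas as pd

def get_dataframe_from_json(articles):
    # One dataframe row per article in the 'articles' list (hypothetical helper)
    return pd.DataFrame(data=articles['articles'])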
Code example #4
File: scrapper.py  Project: FNClassificator/FNC
def get_all_news(path, is_fake, n):
    url_list = io.read_json_file(path)
    for key in url_list.keys():
        css_attr = get_variable(key)
        if css_attr != -1:
            for item in url_list[key]:
                n += 1
                try:
                    print('Making request of ...', item)
                    result = make_request(item, css_attr)
                    result['fake'] = is_fake
                    out_path = 'src/data/articles/Article_' + str(n) + '.json'
                    io.write_json_file(out_path, result)
                    print('File saved!')
                except ValueError as err:
                    print(err)
                    print('Problem with: ', item)
    return n
Code example #5
import json
from src.utils import io

articles = {'articles': []}
n_ini = 1
n_fi = 116
for x in range(n_ini, n_fi):
    path = 'src/data/articles_en/Article_' + str(x) + '.json'
    # Read file
    content = io.read_json_file(path)
    if content is not None:
        articles['articles'].append(content)
io.write_json_file('src/data/tmp.json', articles)
Code example #6
import numpy as np
import pandas as pd

from src.utils import io
from src.fake_news_detector.classification.sub_classifications import get_similarity_prediction

if __name__ == "__main__":

    articles = io.read_json_file(
        '/home/elenaruiz/Documents/TFG/FNC/src/data/dataset_similarity.json')
    df = pd.DataFrame(data=articles['articles'])

    # Run every classifier on the similarity features
    for model_type in ('LR', 'DTC', 'KNC', 'LDA', 'GNB', 'SVC'):
        pred = get_similarity_prediction(df, model_type, True)
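The model_type codes passed to get_similarity_prediction() are not expanded in this snippet; a plausible mapping to scikit-learn estimators (an assumption about what the abbreviations stand for, not code from the project) would be:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Hypothetical lookup from the abbreviations used above to estimators
MODELS = {
    'LR': LogisticRegression(),
    'DTC': DecisionTreeClassifier(),
    'KNC': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'GNB': GaussianNB(),
    'SVC': SVC(),
}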
Code example #7
from src.fake_news_detector.core.classification import doc2vec_classification as dc
import pandas as pd
import numpy as np

from src.utils import io

if __name__ == "__main__":

    # 1. Read
    articles = io.read_json_file(
        '/home/elenaruiz/Documents/TFG/FNC/src/data/dataset_content.json')
    df = pd.DataFrame(data=articles['articles'])

    # 2. Create dataframe
    corpus = pd.DataFrame()
    corpus['corpus'] = df['title_subject']
    corpus['fake'] = df['id']
    corpus['id'] = list(range(0, len(df)))

    # 3. DOC2VEC Test
    models = dc.generate_doc2vec_model(corpus)
    models.get_similarty_doc2vec(corpus)
    # 4. Check results
    error_1_1 = 0
    error_1_2 = 0
    error_2_1 = 0
    error_2_2 = 0
    for i, row in corpus.iterrows():
        label = row['label'] * 1
        # Model 1 : MAX
        if row['result_m1_max'] != label: