Example no. 1
def process(data, output_folder, source):
    mappings = {
        'fake': 'fake',
        'bias': 'fake',
        'conspiracy': 'fake',
        'junksci': 'fake',
        'hate': 'fake',
        'clickbait': 'fake',
        #'unreliable': 'fake',
        'reliable': 'true'
    }
    properties = ['type', '2nd type', '3rd type']
    results = []
    # find the properties belonging to the mappings in the samples, and assign a single label
    for domain, props in data.items():
        looking_at = [
            prop_value for prop_name, prop_value in props.items()
            if prop_name in properties and prop_value
        ]
        #print(looking_at)
        classes = set(mappings[el] for el in looking_at if el in mappings)
        if len(classes) != 1:
            print(domain, classes)
            continue
        label = classes.pop()
        results.append({'domain': domain, 'label': label, 'source': source})

    utils.write_json_with_path(results, output_folder, 'domains.json')
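
# A minimal usage sketch for process() above. The shape of `data` is inferred
# from the function body (a dict keyed by domain whose values carry the 'type',
# '2nd type' and '3rd type' properties); the sample domains, values and source
# name below are hypothetical.
#
#     sample = {
#         'some-fake-site.example': {'type': 'conspiracy', '2nd type': 'clickbait', '3rd type': ''},
#         'some-news-site.example': {'type': 'reliable', '2nd type': '', '3rd type': ''},
#     }
#     process(sample, utils.data_location / 'some_source', 'some_source')
#     # writes domains.json with one {'domain', 'label', 'source'} record per
#     # domain whose mapped types agree on a single label
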
def get_claimreviews_from_factcheckers(original_claimReviews):
    result = {}

    # retrieve the full claimReview from the fact checking website
    for c in tqdm(original_claimReviews):
        # get the correct URL (some of them are wrong in the original dataset)
        fixed_url = claimreview.get_corrected_url(c['url'])

        # the id and the per-claim file on disk make the operation resumable after
        # a failure: each single claim is saved to disk one by one
        claim_id = utils.string_to_md5(fixed_url)
        partial_file_name = '{}.json'.format(claim_id)
        partial_file_path = subfolder_path / 'intermediate' / 'single_claims' / partial_file_name
        if os.path.isfile(partial_file_path):
            # if it's been already saved, read it
            partial = utils.read_json(partial_file_path)
        else:
            # otherwise download the original claimReview from the fact checker
            url, partial = claimreview.retrieve_claimreview(c['url'])
            # and save it to disk
            utils.write_json_with_path(
                partial, subfolder_path / 'intermediate' / 'single_claims',
                partial_file_name)
        if not partial:
            # in this case there is no claimReview metadata on the fact checker website
            continue
        # there can be multiple claimReviews in a single fact checking page
        for j, claimReview in enumerate(partial):
            # save this in the result
            result['{}::{}'.format(fixed_url, j)] = claimReview

    return result
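
# The keys of the dict returned above are '<fixed_url>::<index>', one entry per
# claimReview found on the page. A hedged sketch of how a caller could split a
# key back apart (rpartition splits at the last '::', so URLs containing ':' stay intact):
#
#     page_url, _, review_index = key.rpartition('::')
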
def load_jsonld():
    # read the file
    with open(source_file_path) as f:
        content = f.read()

    # extract the embedded metadata https://github.com/scrapinghub/extruct
    data = extruct.extract(content)

    claimReviews = data['json-ld']

    # some analysis of the labels to see how they are annotated
    labels = set([el['reviewRating']['alternateName'] for el in claimReviews])
    lambda_source = lambda el: el['author']['name']

    # group the labels by the author of the review, to see how each one of them uses the alternateName
    labels_by_sources = {
        k: set([el['reviewRating']['alternateName'] for el in v])
        for k, v in itertools.groupby(sorted(claimReviews, key=lambda_source),
                                      key=lambda_source)
    }

    print('#claimReviews', len(claimReviews))
    print('#labels', len(labels))
    #print('labels', labels)
    print('#label for each source',
          {k: len(v)
           for k, v in labels_by_sources.items()})

    # save the original claimReviews
    utils.write_json_with_path(claimReviews, intermediate_path,
                               'datacommons_claimReviews.json')

    return claimReviews
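
# Note on the grouping in load_jsonld(): itertools.groupby only groups
# *consecutive* items, which is why claimReviews is sorted by the same key
# first. A toy illustration with hypothetical values:
#
#     reviews = [{'author': {'name': 'B'}}, {'author': {'name': 'A'}}, {'author': {'name': 'B'}}]
#     by_author = lambda el: el['author']['name']
#     {k: len(list(v)) for k, v in itertools.groupby(sorted(reviews, key=by_author), key=by_author)}
#     # --> {'A': 1, 'B': 2}
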
#!/usr/bin/env python

import utils

location = utils.data_location / 'wikipedia'

data = utils.read_tsv(location / 'source' / 'wikipedia.tsv')

domains = [{
    'domain': el['url'],
    'label': el['label'],
    'source': 'wikipedia'
} for el in data]

utils.write_json_with_path(domains, location, 'domains.json')
print('types', cnt_by_type)

by_site_fn = lambda el: el['site_url']
types_by_domain = {
    k: set([el['type'] for el in v])
    for k, v in itertools.groupby(sorted(data, key=by_site_fn), key=by_site_fn)
}

mappings = {
    'fake': 'fake',
    'junksci': 'fake',
    'hate': 'fake',
    'bs': 'fake',
    'bias': 'fake',
    'conspiracy': 'fake'
}

result = []
for k, v in types_by_domain.items():
    assert len(v) == 1
    label = v.pop()
    if label in mappings:
        result.append({
            'domain': k,
            'label': mappings[label],
            'source': 'mrisdal_fakenews'
        })

utils.write_json_with_path(result, subfolder_path, 'domains.json')
#!/usr/bin/env python

import json
import csv
import utils

folder = utils.data_location / 'several27_fakenews'


# read the file with a plain streaming csv reader instead of utils: at 27.3 GB
# it is too large to load in memory on my pc
# TODO limit RAM usage!!!
input_path = folder / 'source' / 'news_cleaned_2018_02_13.csv'
output_path = folder / 'intermediate'
output_file = output_path / 'unfiltered.json'

results = []
chunk_n = 0
with open(input_path) as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        results.append({'url': row['url'], 'label': row['type'], 'source': 'several27_fakenews'})
        if not len(results) % 1000000:
            print(len(results) * (chunk_n + 1) / 216212648)
            utils.write_json_with_path(results, output_path, 'unfiltered_{}.json'.format(chunk_n), indent=None)
            results = []
            chunk_n += 1

# write out whatever is left after the last full chunk
if results:
    utils.write_json_with_path(results, output_path, 'unfiltered_{}.json'.format(chunk_n), indent=None)
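
# A hedged sketch of how the unfiltered_<n>.json chunks written above could be
# merged back later (utils.read_json is the same helper used by the other
# scripts in this collection; the glob usage is an assumption):
#
#     import glob
#     merged = []
#     for chunk_file in sorted(glob.glob(str(output_path / 'unfiltered_*.json'))):
#         merged.extend(utils.read_json(chunk_file))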

#!/usr/bin/env python

import glob
import xml.etree.ElementTree as ET

import utils

subfolder = utils.data_location / 'hyperpartisan'

results = []
for input_file in glob.glob(str(subfolder /
                                'intermediate/ground-truth-*.xml')):
    with open(input_file) as f:
        tree = ET.parse(f)
    articles = tree.getroot().findall('article')
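    # each ground-truth file is expected to contain entries roughly like
    # (hypothetical values; only the 'url' and 'hyperpartisan' attributes are used below):
    #   <articles>
    #     <article url="https://example.com/story" hyperpartisan="true" ... />
    #   </articles>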
    results.extend([{
        'url': el.attrib['url'],
        'label': 'fake' if el.attrib['hyperpartisan'] == 'true' else 'true',
        'source': 'hyperpartisan'
    } for el in articles])

utils.write_json_with_path(results, subfolder, 'urls.json')
utils.print_stats(results)
by_domain = utils.compute_by_domain(results)

utils.write_json_with_path(by_domain, subfolder, 'domains.json')
from collections import defaultdict
from tqdm import tqdm

import utils
import unshortener

location = utils.data_location / 'rbutr'

data = utils.read_tsv(location / 'source' / 'link_data.tab.txt')

results = [{
    'url': el['sourcepage'],
    'label': 'fake',
    'source': 'rbutr'
} for el in data]

utils.write_json_with_path(results, location, 'urls.json')

domains = utils.compute_by_domain(results)

utils.write_json_with_path(domains, location, 'domains.json')

rebuttals = defaultdict(lambda: defaultdict(list))
for row in data:
    rebuttals[row['sourcepage']][row['rebuttalpage']].append('rbutr')

utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
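# rebuttals.json maps each source page to the pages rebutting it, each annotated
# with the dataset it comes from, e.g. (hypothetical URLs):
#   {"http://example.com/claim": {"http://example.com/rebuttal": ["rbutr"]}}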

# check which urls still exist

rbutr_mapping_location = location / 'intermediate'
rbutr_mapping_path = rbutr_mapping_location / 'mappings.json'
Example no. 9
#!/usr/bin/env python

import utils

directory = utils.data_location / 'golbeck_fakenews'

# this input file has been exported to TSV from `Fake News Stories.xlsx`
input_file = directory / 'intermediate' / 'data.tsv'

data = utils.read_tsv(input_file)

result = [{
    'url': row['URL of article'],
    'label': 'fake',
    'source': 'golbeck_fakenews'
} for row in data if row['Fake or Satire?'].strip() == 'Fake']

utils.write_json_with_path(result, directory, 'urls.json')

by_domain = utils.compute_by_domain(result)

utils.write_json_with_path(by_domain, directory, 'domains.json')

rebuttals = {
    el['URL of article']: {
        u.strip(): ['golbeck_fakenews']
        for u in el['URL of rebutting article'].split('; ')
    }
    for el in data
}

utils.write_json_with_path(rebuttals, directory, 'rebuttals.json')

if __name__ == '__main__':
    claimReviews = load_jsonld()

    # the fact checking articles themselves are considered reliable, so their URLs are labelled 'true'
    urls = [{
        'url': c['url'],
        'label': 'true',
        'source': 'datacommons_factcheck'
    } for c in claimReviews]

    # retrieve the claimReviews with more properties
    claimReviews_full = get_claimreviews_from_factcheckers(claimReviews)
    # save to file
    utils.write_json_with_path(claimReviews_full, subfolder_path,
                               'claimReviews.json')

    # rebuttals is a dict that maps each URL to related URLs; in this case it suggests the fact checking article that reviews the claim
    rebuttals = defaultdict(lambda: defaultdict(list))
    for key, claimReview in claimReviews_full.items():
        # retrieve the URL of the source of the claim (not always there)
        claim_urls = claimreview.get_claim_urls(claimReview)
        if claim_urls:
            print('claim', claim_urls)
            if 'properties' in claimReview:
                fixed_url = claimreview.get_corrected_url(
                    claimReview['properties']['url'])
            else:
                fixed_url = claimreview.get_corrected_url(claimReview['url'])

            # save the found mapping between the claim URL and the factchecking URL
Example no. 11
#!/usr/bin/env python

import utils

location = utils.data_location / 'factcheckni_list'

data = utils.read_tsv(location / 'source' /
                      'FactCheckNI Articles - OU Research - Sheet1.tsv')

label_map = {
    'Accurate': 'true',
    # 'Unsubstantiated': neither true nor false, no proof either way --> discard
    'Inaccurate': 'fake'
}

labeled_urls = [{
    'url': row['Claim URL'],
    'label': label_map[row['Label']],
    'source': 'factcheckni_list'
} for row in data if row['Label'] in label_map]

rebuttals = {
    row['Claim URL']: {
        row['Article URL']: ['factcheckni_list']
    }
    for row in data
}

utils.write_json_with_path(labeled_urls, location, 'urls.json')
utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
Example no. 12
        rebuttals = utils.read_json(utils.data_location / subfolder / 'rebuttals.json')
        for source_url, rebuttal_l in rebuttals.items():
            for rebuttal_url, source_list in rebuttal_l.items():
                all_rebuttals[source_url][rebuttal_url].extend(source_list)

urls_cnt = len(all_urls)
domains_cnt = len(all_domains)
fake_urls_cnt = len([el for el in all_urls if el['label'] == 'fake'])
fake_domains_cnt = len([el for el in all_domains if el['label'] == 'fake'])
print('#urls', urls_cnt, ': fake', fake_urls_cnt, 'true', urls_cnt - fake_urls_cnt)
print('#domains', domains_cnt, ': fake', fake_domains_cnt, 'true', domains_cnt - fake_domains_cnt)

aggregated_urls = utils.aggregate(all_urls)
aggregated_domains = utils.aggregate(all_domains, 'domain')

utils.write_json_with_path(aggregated_urls, utils.data_location, 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, utils.data_location, 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, utils.data_location, 'aggregated_rebuttals.json')

# copy to backend
utils.write_json_with_path(aggregated_urls, Path('../backend'), 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, Path('../backend'), 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, Path('../backend'), 'aggregated_rebuttals.json')

utils.print_stats(aggregated_urls)
utils.print_stats(aggregated_domains)

print('updating mappings, it may take a while')
mappings_file = utils.data_location / 'mappings.json'

mappings = {}
        #soup = BeautifulSoup(f, 'html.parser')
        #tree = etree.parse(f, etree.HTMLParser())
        content = f.read()

    #root = tree.getroot()
    #matches = root.findall('a[@tabindex="-1" and target="_blank"]')
    #matches = soup.find_all('a', attrs={'tabindex': '-1', 'target': '_blank'})
    #matches = tree.xpath('a')
    # look for the <a> with tabindex="-1" target="_blank"
    fb_urls = re.findall(r'<a\shref="([^>]*)" tabindex="-1" target="_blank"', content)
    real_urls = [urlparse.parse_qs(urlparse.urlparse(u).query)['u'] for u in fb_urls]
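    # e.g. a link wrapped by the Facebook redirector (hypothetical URL):
    #   https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2Fstory&h=...
    # parse_qs on its query string returns {'u': ['https://example.com/story'], ...},
    # so each element of real_urls is the list of decoded 'u' values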
    unique = {u for sublist in real_urls for u in sublist}
    #print(unique)
    if len(unique) != 1:
        print(file_location, unique)
        continue
    id = file_location.split('/')[-1].split('.')[0]
    url = unique.pop()
    label = data[id]['label']
    label_binary = {'mostly true': 'true', 'mostly false': 'fake'}.get(label, None)
    unfiltered.append({'url': url, 'label': label, 'source': 'buzzface'})
    if label_binary:
        results.append({'url': url, 'label': label_binary, 'source': 'buzzface'})

utils.write_json_with_path(unfiltered, folder / 'intermediate', 'unfiltered.json')
utils.write_json_with_path(results, folder, 'urls.json')

by_domain = utils.compute_by_domain(results)
utils.write_json_with_path(by_domain, folder, 'domains.json')