import csv

from ural import get_domain_name


def domain_action(namespace):
    headers, position, reader = custom_reader(namespace.file, namespace.column)

    headers.append(namespace.column + "_domain")
    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(get_domain_name(url))
        writer.writerow(line)

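# Hypothetical usage sketch (not from the original project): custom_reader is
# an external helper assumed to return the CSV headers, the index of the
# target column and a row iterator, while the namespace mirrors an argparse
# result. File names are illustrative.
# from types import SimpleNamespace
# with open('urls.csv') as f, open('enriched.csv', 'w') as out:
#     domain_action(SimpleNamespace(file=f, column='url', output=out))
# The output CSV gains a trailing 'url_domain' column, e.g. 'lemonde.fr'.
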
import os

import pandas as pd
import ural


def clean_data(CLEAN_DATA_DIRECTORY, SCIENTIFIC_TOPIC, DATE):
    """Import and prepare the dataframe to be used to build the graphs"""

    posts_path = os.path.join(".", CLEAN_DATA_DIRECTORY,
                              "fake_posts_" + SCIENTIFIC_TOPIC + "_" + DATE + ".csv")
    posts_df = pd.read_csv(posts_path)

    if DATE == "28_04_2020":
        # Remove the URLs with parameters from the analysis
        # because CT returns wrong results for them:
        posts_df['parameter_in_url'] = posts_df['url'].apply(lambda x: '?' in x)
        posts_df = posts_df[posts_df['parameter_in_url'] == False]

    posts_df = posts_df[posts_df["platform"] == "Facebook"]
    posts_df = posts_df.dropna(subset=['account_id', 'url'])
    posts_df['account_id'] = posts_df['account_id'].apply(lambda x: int(x))

    # The same Facebook group can share the same URL multiple times,
    # creating multiple lines in the input CSV. We remove the duplicates here:
    posts_df = posts_df[['url', 'account_name', 'account_id',
                         'account_subscriber_count', 'actual_like_count']]
    posts_df = posts_df.drop_duplicates(subset=['url', 'account_id'], keep='last')

    posts_df['domain_name'] = posts_df['url'].apply(lambda x: ural.get_domain_name(x))

    if DATE == "28_04_2020":
        # Remove the platforms from the analysis:
        platforms = ["facebook.com", "youtube.com", "twitter.com",
                     "wordpress.com", "instagram.com"]
        posts_df = posts_df[~posts_df['domain_name'].isin(platforms)]

    # We remove the Facebook groups that have shared only one fake URL:
    vc = posts_df['account_id'].value_counts()
    posts_df = posts_df[posts_df['account_id'].isin(vc[vc > 1].index)]

    # We prepare a dataframe to import the Facebook group nodes with specific attributes:
    # - the number of followers
    # - the account name -> label
    # - the fake news URLs shared by this group -> node size
    fb_group_df = posts_df[['account_id', 'account_name', 'account_subscriber_count']]\
        .sort_values(by="account_subscriber_count", ascending=True)\
        .drop_duplicates(subset=['account_id'], keep='last')

    temp = posts_df.groupby('account_id')['url'].apply(list)\
        .to_frame().reset_index()
    fb_group_df = fb_group_df.merge(temp, left_on='account_id',
                                    right_on='account_id', how='left')
    fb_group_df['nb_fake_news_shared'] = fb_group_df['url'].apply(lambda x: len(x))

    # We prepare a dataframe to import the domain nodes with specific attributes:
    # - the fake news URLs shared by this domain -> node size
    domain_df = posts_df[['url', 'domain_name']].drop_duplicates()\
        .groupby('domain_name')['url'].apply(list)\
        .to_frame().reset_index()
    domain_df['nb_fake_news_shared'] = domain_df['url'].apply(lambda x: len(x))

    return posts_df, fb_group_df, domain_df

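# A minimal call sketch; the directory and topic values below are illustrative
# assumptions (only the date appears in the original code):
# posts_df, fb_group_df, domain_df = clean_data(
#     CLEAN_DATA_DIRECTORY='clean_data',
#     SCIENTIFIC_TOPIC='health',
#     DATE='28_04_2020'
# )
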
from ural import (
    get_domain_name,
    get_hostname,
    get_normalized_hostname,
    infer_redirection,
    is_shortened_url,
    normalize_url
)


def extract_standard_addendum(cli_args, url):
    inferred_redirection = infer_redirection(url)

    return [
        normalize_url(url, strip_protocol=cli_args.strip_protocol,
                      strip_trailing_slash=True),
        inferred_redirection if inferred_redirection != url else '',
        get_domain_name(url),
        get_hostname(url),
        get_normalized_hostname(url),
        'yes' if is_shortened_url(url) else ''
    ]

import pandas as pd
import ural


def clean_data(url_df, fact_check_df, SCIENTIFIC_TOPIC):
    """Clean and merge the appearance data"""

    # Remove the spaces added by error around the URLs:
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    # Filter the URLs to keep only the ones flagged as False or equivalent:
    url_df = url_df[(url_df['Flag as'].isin(
        ['False', 'Partly false', 'Misleading', 'False headline']))]

    # Use a regex to get the article field from the fact-check URL:
    # if the fact-check URL starts with 'https://climatefeedback.org' -> 'climate' article
    # if the fact-check URL starts with 'https://healthfeedback.org' -> 'health' article
    fact_check_df['field'] = fact_check_df['Review url'].str.extract(
        'https://([^/]+)feedback.org')

    # Merge the two dataframes to get the 'field' for each URL:
    url_df = url_df.dropna(subset=['Item reviewed'])
    fact_check_df = fact_check_df.dropna(subset=['Items reviewed'])
    url_df = url_df.merge(fact_check_df[['Items reviewed', 'field', 'topic']],
                          left_on='Item reviewed', right_on='Items reviewed',
                          how='left')

    # Keep only the URLs about the scientific topic of interest:
    url_df.loc[url_df['topic'] == 'COVID-19', 'field'] = 'COVID-19'
    url_df = url_df.dropna(subset=['field'])
    url_df = url_df[url_df['field'] == SCIENTIFIC_TOPIC]

    # Clean the URLs and extract their domain names:
    url_df['url'] = url_df['url'].apply(lambda x: ural.normalize_url(
        x, strip_protocol=False, strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove the duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence:
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    # # Remove the platforms from the analysis:
    # platforms = ["facebook.com", "youtube.com", "twitter.com", "wordpress.com", "instagram.com"]
    # url_df = url_df[~url_df['domain_name'].isin(platforms)]

    # # Remove the URLs with parameters from the analysis because CT returns wrong results for them:
    # url_df['parameter_in_url'] = url_df['url'].apply(lambda x: '?' in x)
    # url_df = url_df[url_df['parameter_in_url'] == False]

    url_df = url_df[['url', 'Item reviewed', 'field', 'domain_name']]

    return url_df

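# To illustrate the field-extraction regex used above, a self-contained
# sketch; the review URLs are invented for demonstration:
import pandas as pd

demo = pd.DataFrame({'Review url': [
    'https://climatefeedback.org/claimreview/some-claim',
    'https://healthfeedback.org/claimreview/some-claim'
]})
demo['field'] = demo['Review url'].str.extract('https://([^/]+)feedback.org')
print(demo['field'].tolist())  # ['climate', 'health']
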
import ural


def clean_url_format(url_df):
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    url_df['url_cleaned'] = url_df['url']\
        .apply(lambda x: ural.normalize_url(x, strip_protocol=False,
                                            strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url_cleaned'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove the duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence:
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    return url_df

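# Minimal sketch of clean_url_format on a toy dataframe (URLs invented).
# Note that the deduplication happens on the raw 'url' column, so the two
# rows below both survive even though they normalize to the same URL:
import pandas as pd

toy_df = pd.DataFrame({'url': [' https://example.com/article/ ',
                               'https://example.com/article']})
print(clean_url_format(toy_df)[['url_cleaned', 'domain_name']])
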
from ural import ensure_protocol, get_domain_name


def payloads_iter(iterator, key=None):
    for item in iterator:
        url = item if key is None else key(item)

        if not url:
            yield FetchWorkerPayload(
                item=item,
                domain=None,
                url=None
            )
            continue

        # Url cleanup
        url = ensure_protocol(url.strip())

        yield FetchWorkerPayload(
            item=item,
            domain=get_domain_name(url),
            url=url
        )

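# Hypothetical usage sketch: FetchWorkerPayload is defined elsewhere in the
# project; a namedtuple with the same fields stands in for it here, and the
# rows are invented:
from collections import namedtuple

FetchWorkerPayload = namedtuple('FetchWorkerPayload', ['item', 'domain', 'url'])

rows = [{'url': 'lemonde.fr/article'}, {'url': ''}]
for payload in payloads_iter(rows, key=lambda row: row['url']):
    print(payload.domain, payload.url)
# The first row gets a protocol ensured and its domain parsed
# ('lemonde.fr http://lemonde.fr/article'); the empty URL yields a
# payload with domain=None and url=None.
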
import casanova
from tqdm import tqdm
from ural import (
    get_domain_name,
    get_hostname,
    get_normalized_hostname,
    is_url,
    normalize_url
)


def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file, output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing', dynamic_ncols=True,
                       unit=' rows', total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()
        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()

def grouper(payload):
    if payload.url is None:
        return

    return get_domain_name(payload.url)

def grouper(job):
    return get_domain_name(job.url)

def test_basics(self):
    for url, domain in TESTS:
        assert get_domain_name(url) == domain

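# TESTS is a fixture defined elsewhere in the test module; a plausible shape,
# with illustrative (url, expected domain) pairs that are not the library's
# actual fixture list:
# TESTS = [
#     ('http://www.lemonde.fr/article.html', 'lemonde.fr'),
#     ('https://www.bbc.co.uk/news', 'bbc.co.uk'),
# ]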