def gen_keywords():
    if os.path.exists(KEYWORDS_PATH):
        keywords = load_keywords()
    else:
        # defaultdict so a new canonical link can be populated field by
        # field below without raising a KeyError.
        keywords = defaultdict(lambda: {})
    # We keep a cache of already-parsed links, each mapped to its matched
    # canonical link, so that we can still add a user to the user set of a
    # link that was parsed on an earlier run.
    if os.path.exists(PARSED_LINKS_PATH):
        parsed_links = loadj(PARSED_LINKS_PATH)
    else:
        parsed_links = {}
    graph_names = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
    user_ids = [USER_IDS[name] for name in graph_names]
    user_links = get_user_links(user_ids)
    all_links = []
    for user_name, links in user_links.items():
        for l in links:
            # Don't re-process a link we have already seen.
            if l not in parsed_links:
                all_links.append((user_name, l))
                parsed_links[l] = ''
            elif parsed_links[l] != '' and parsed_links[l] in keywords:
                keywords[parsed_links[l]]['users'].add(user_name)
    print(f'{len(all_links)} to parse...')
    p = Pool(NUM_WORKERS)
    kw_tuples = p.starmap(get_keywords_pmap, all_links)
    for user_name, c_link, l, kws, p_time, title in kw_tuples:
        if c_link is not None:
            parsed_links[l] = c_link
            keywords[c_link]['kws'] = kws
            keywords[c_link]['time'] = p_time
            if 'users' not in keywords[c_link]:
                keywords[c_link]['users'] = set()
            keywords[c_link]['users'].add(user_name)
            keywords[c_link]['title'] = title
        else:
            # An empty string marks a link we tried but couldn't parse.
            parsed_links[l] = ''
    # Make the keywords dict serializable.
    for c_link in keywords:
        keywords[c_link]['users'] = list(keywords[c_link]['users'])
        keywords[c_link]['kws'] = list(keywords[c_link]['kws'])
    writej(keywords, KEYWORDS_PATH)
    writej(parsed_links, PARSED_LINKS_PATH)
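
# gen_keywords hands each (user_name, link) pair to a get_keywords_pmap
# worker that is not shown in this section. A minimal sketch of such a
# worker, assuming newspaper3k's Article API, newspaper.nlp.keywords, and
# the timeoutcontext package (all imported in the module header further
# down); the 30-second cap and the exact return fields are assumptions:
def get_keywords_pmap(user_name, link):
    try:
        with timeout(30):  # Guard against articles that hang on download.
            article = Article(link)
            article.download()
            article.parse()
            # Collapse share/tracking URLs onto one canonical article URL.
            c_link = article.canonical_link or article.url
            kws = frozenset(nlp.keywords(article.text).keys())
            return (user_name, c_link, link, kws,
                    str(article.publish_date), article.title)
    except Exception as e:
        print(link, e)
        # A None canonical link tells the caller the parse failed.
        return (user_name, None, link, None, None, None)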
def get_followers():
    followers = {}
    files = [f for f in os.listdir(PREFIX) if f.find('_followers') > 0]
    for f_name in tqdm(files):
        user_id = f_name.split('_')[0]
        followers[user_id] = frozenset(loadj(PREFIX + f_name))
    return followers
def load_keywords():
    kws = loadj(KEYWORDS_PATH)
    keywords = defaultdict(lambda: {})
    for l, v in kws.items():
        keywords[l]['users'] = set(v['users'])
        keywords[l]['kws'] = frozenset(v['kws'])
        keywords[l]['title'] = v['title']
    return keywords
def setup_data(app):
    app['db'] = None
    app['link_cache'] = loadj(PREFIX + 'link_cache.json')
    if app['link_cache'] is None:
        app['link_cache'] = {}
    app['db_refresh'] = app.loop.create_task(
        schedule(app, refresh_db, 10 * 60))
    app['flush_cache'] = app.loop.create_task(
        schedule(app, flush_cache, 15 * 60))
    app.on_cleanup.append(cancel_sched)
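
# setup_data references schedule and cancel_sched, which are not shown in
# this section. A minimal sketch of what they plausibly look like, inferred
# from the call sites above (and assuming `import asyncio` at module top):
# schedule re-runs a coroutine every `interval` seconds, and cancel_sched
# tears the background tasks down when the aiohttp app shuts down.
async def schedule(app, coro, interval):
    while True:
        await asyncio.sleep(interval)
        await coro(app)


async def cancel_sched(app):
    app['db_refresh'].cancel()
    app['flush_cache'].cancel()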
async def get_user_by_id(user_id, client):
    file_path = PREFIX + user_id + '_user'
    obj = loadj(file_path)
    if obj is not None:
        return obj
    res = await client.api.users.lookup.get(user_id=user_id)
    writej(res[0], file_path)
    return res[0]
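
# Example usage (a sketch: the credentials index and the user id are purely
# illustrative, and this must run inside an event loop):
#
#   client = PeonyClient(**get_api_creds()[0])
#   user = await get_user_by_id('12345', client)
#   print(user['screen_name'])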
def get_friend_whitelist(user_names):
    friend_whitelist = set()
    files = [f for f in os.listdir(PREFIX) if f.find('_friends') > 0]
    for f_name in files:
        user_id, screen_name, _ = f_name.split('_')
        if screen_name not in FILTERED:
            friend_whitelist.add(user_id)
            friends = loadj(PREFIX + f_name)
            for friend_id in friends:
                # We might have updated the friends list but not
                # re-crawled yet.
                if friend_id in user_names:
                    friend_whitelist.add(friend_id)
    return friend_whitelist
def write_user_names():
    user_names = {}
    files = [f for f in os.listdir(PREFIX) if f.find('_user') > 0]
    for f_name in files:
        try:
            obj = loadj(PREFIX + f_name)
            user_names[obj['id_str']] = obj['screen_name']
        except Exception as e:
            print(f_name, e)
    writej(user_names, PREFIX + 'user_names')
    # Write the reverse dict as well.
    user_ids = {}
    for user_id, name in user_names.items():
        user_ids[name] = user_id
    writej(user_ids, PREFIX + 'user_ids')
def get_provenance():
    media_prov = defaultdict(lambda: '')
    is_conservative = defaultdict(lambda: False)
    files = [f for f in os.listdir(PREFIX) if f.find('_user') > 0]
    for f_name in files:
        user = loadj(PREFIX + f_name)
        user_desc = user['description'].lower()
        screen_name = user['screen_name']
        for media, matches in MEDIA.items():
            for match in matches:
                # NYT-bestseller authors would otherwise throw the
                # matching off.
                if (match in user_desc and screen_name not in media_prov
                        and 'bestsell' not in user_desc):
                    media_prov[screen_name] = media
        if 'conservative' in user_desc:
            is_conservative[screen_name] = True
    return media_prov, is_conservative
import math

from helpers import loadj

POSITION = loadj('node_positions.json')

# Axis of the left-right spectrum in the graph layout, normalized to a
# unit vector.
U = (1, -0.3601689858)
UU = math.sqrt(U[0] * U[0] + U[1] * U[1])
U = (U[0] / UU, U[1] / UU)
# Offset subtracted from node positions below (the layout's center).
M = (461.3168343280106, 445.9243631476138)
RES = 0.25
RANGE = [
    i / 100
    for i in range(int(-1 * 100), int((1 + RES) * 100), int(RES * 100))
]
MIN_ISEC_ARTICLE = 4
MIN_USERS = 3
MAX_RETURNED_ARTICLES = 5


# We're capping the political spectrum to [-350, 350] and snapping the
# result to the RES grid.
def clamp(scalar):
    scalar = scalar / 350
    return int(scalar / RES) * RES


def get_scalar(users):
    avg_pos = (sum([POSITION[u]['x'] for u in users]) / len(users),
               sum([POSITION[u]['y'] for u in users]) / len(users))
    avg_pos = (avg_pos[0] - M[0], avg_pos[1] - M[1])
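    # The function's return is not shown in this section. Given that U is
    # normalized to a unit vector above, the natural completion (an
    # assumption, not the original code) is the signed projection of the
    # centered average position onto the spectrum axis:
    return avg_pos[0] * U[0] + avg_pos[1] * U[1]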
def get_api_creds():
    return loadj('api_creds.json')
import os.path
import asyncio

from peony import PeonyClient

from helpers import loadj, writej, append_listj

SEED_NAMES = [
    'dailykos', 'thinkprogress', 'HuffingtonPost', 'voxdotcom', 'nytimes',
    'washingtonpost', 'politico', 'USATODAY', 'StephensWSJ', 'WSJ',
    'arthurbrooks', 'EWErickson', 'nypost', 'BreitbartNews', 'RealAlexJones'
]
MAX_SEED_FOLLOWER_COUNT = 300000
MIN_SEED_FOLLOWER_COUNT = 20000
PREFIX = '/root/timelines/'
USER_IDS = loadj(PREFIX + 'user_ids')
GRAPH_NAMES = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
GRAPH_IDS = [USER_IDS[name] for name in GRAPH_NAMES]


class ClientPool:
    def __init__(self, api_creds):
        self._index = 0
        self._clients = [
            PeonyClient(consumer_key=creds['consumer_key'],
                        consumer_secret=creds['consumer_secret'],
                        access_token=creds['access_token'],
                        access_token_secret=creds['access_token_secret'])
            for creds in api_creds
        ]
        self.client_num = len(self._clients)
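
    # The rest of the class is not shown in this section. A plausible
    # accessor (the method name is an assumption, not the original code):
    # rotate through the pool so Twitter rate limits are spread across all
    # credential sets.
    def get(self):
        client = self._clients[self._index % self.client_num]
        self._index += 1
        return client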
from datetime import datetime
from collections import defaultdict

import pytz
from tqdm import tqdm

from crawler import PREFIX
from helpers import loadj

MIN_DATE = '2017-04-01'
USER_NAMES = loadj(PREFIX + 'user_names')


def get_user_links(ids):
    user_links = {}
    print('Extracting links from user timelines...')
    for user_id in tqdm(ids):
        f_path = PREFIX + user_id + '_timeline'
        try:
            links = extract_links_from_timeline(loadj(f_path),
                                                include_retweets=False)
            user_links[USER_NAMES[user_id]] = links
        except Exception as e:
            print(e)
    return user_links


def extract_links_from_timeline(tweets, include_retweets=True):
    if tweets is None:
        return []
    links = []
    for t in tweets:
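        # The body of this loop is not shown in this section. A minimal
        # sketch of the likely logic, assuming standard Twitter REST API
        # tweet objects: optionally skip retweets, drop tweets older than
        # MIN_DATE, and collect every expanded URL in the tweet entities.
        if not include_retweets and 'retweeted_status' in t:
            continue
        created = datetime.strptime(t['created_at'],
                                    '%a %b %d %H:%M:%S %z %Y')
        min_date = pytz.utc.localize(datetime.strptime(MIN_DATE, '%Y-%m-%d'))
        if created < min_date:
            continue
        for url in t.get('entities', {}).get('urls', []):
            links.append(url['expanded_url'])
    return links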
import os
from collections import defaultdict

from networkx import Graph

from crawler import PREFIX
from helpers import writej, loadj

MIN_DIST = 0.05
MIN_DEGREE = 15
MEDIA = loadj('media.json')
NEOCONSERVATIVES = frozenset([
    p.lower() for p in [
        'arthurbrooks', 'dpletka', 'FredBarnes', 'davidfrum', 'JonahNRO',
        'brithume', 'krauthammer', 'BillKristol', 'JoshuaMuravchik',
        'DanielPipes', 'jpodhoretz', 'JRubinBlogger', 'mrubin1971',
        'jonathans_tobin'
    ]
])


def get_user_names():
    return loadj(PREFIX + 'user_names')
import os
from collections import defaultdict
from multiprocessing import Pool

from newspaper import Article
import newspaper.nlp as nlp
from timeoutcontext import timeout

from helpers import loadj, writej
from links import get_user_links
from crawler import PREFIX

NUM_WORKERS = 1
KEYWORDS_PATH = PREFIX + 'links_keywords.json'
PARSED_LINKS_PATH = PREFIX + 'parsed_links.json'
USER_IDS = loadj(PREFIX + 'user_ids')
def get_trimmable_nodes():
    return frozenset(loadj('to_trim.json'))