Example #1
def gen_keywords():
    if os.path.exists(KEYWORDS_PATH):
        keywords = load_keywords()
    else:
        # Use a defaultdict, matching what load_keywords() returns, so entries
        # for newly parsed canonical links can be created below.
        keywords = defaultdict(lambda: {})

    # We keep a cache of already-parsed links and store each link's matched
    # canonical link, so that a user can still be added to the entry of a
    # link that was already parsed.
    if os.path.exists(PARSED_LINKS_PATH):
        parsed_links = loadj(PARSED_LINKS_PATH)
    else:
        parsed_links = {}

    graph_names = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
    user_ids = [USER_IDS[name] for name in graph_names]
    user_links = get_user_links(user_ids)
    all_links = []
    for user_name, links in user_links.items():
        for l in links:
            # Don't process the link if we already did.
            if l not in parsed_links:
                all_links.append((user_name, l))
                parsed_links[l] = ''
            elif parsed_links[l] != '' and parsed_links[l] in keywords:
                keywords[parsed_links[l]]['users'].add(user_name)

    print(f'{len(all_links)} to parse...')
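    # Parse the remaining links in parallel; get_keywords_pmap returns one
    # (user_name, canonical_link, original_link, keywords, p_time, title)
    # tuple per link.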
    with Pool(NUM_WORKERS) as p:
        kw_tuples = p.starmap(get_keywords_pmap, all_links)
    for user_name, c_link, l, kws, p_time, title in kw_tuples:
        if c_link is not None:
            parsed_links[l] = c_link
            keywords[c_link]['kws'] = kws
            keywords[c_link]['time'] = p_time
            if 'users' not in keywords[c_link]:
                keywords[c_link]['users'] = set()
            keywords[c_link]['users'].add(user_name)
            keywords[c_link]['title'] = title
        else:
            parsed_links[l] = ''
    # Make the keywords dict serializable.
    for c_link in keywords:
        keywords[c_link]['users'] = list(keywords[c_link]['users'])
        keywords[c_link]['kws'] = list(keywords[c_link]['kws'])

    writej(keywords, KEYWORDS_PATH)
    writej(parsed_links, PARSED_LINKS_PATH)
Example #2
def get_followers():
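    # Map each user id to a frozenset of its follower ids, read from the
    # cached '<user_id>_followers' files.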
    followers = {}
    files = [f for f in os.listdir(PREFIX) if f.find('_followers') > 0]
    for f_name in tqdm(files):
        user_id = f_name.split('_')[0]
        followers[user_id] = frozenset(loadj(PREFIX+f_name))
    return followers
Example #3
def load_keywords():
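    # Rebuild the keyword cache from disk, turning the serialized lists back
    # into sets (the reverse of the serialization step in gen_keywords).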
    kws = loadj(KEYWORDS_PATH)
    keywords = defaultdict(lambda:{})
    for l, v in kws.items():
        keywords[l]['users'] = set(v['users'])
        keywords[l]['kws'] = frozenset(v['kws'])
        keywords[l]['title'] = v['title']
    return keywords
Example #4
def setup_data(app):
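    # Load the persisted link cache and schedule the recurring DB refresh
    # (every 10 minutes) and cache flush (every 15 minutes) tasks.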
    app['db'] = None
    app['link_cache'] = loadj(PREFIX + 'link_cache.json')
    if app['link_cache'] is None:
        app['link_cache'] = {}
    app['db_refresh'] = app.loop.create_task(schedule(app, refresh_db, 10*60))
    app['flush_cache'] = app.loop.create_task(schedule(app, flush_cache, 15*60))
    app.on_cleanup.append(cancel_sched)
Example #5
async def get_user_by_id(user_id, client):
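    # Serve the user object from the on-disk cache if present; otherwise fetch
    # it from the Twitter users/lookup endpoint and cache it.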
    file_path = PREFIX + user_id + '_user'
    obj = loadj(file_path)
    if obj is not None:
        return obj

    res = await client.api.users.lookup.get(user_id=user_id)
    writej(res[0], file_path)
    return res[0]
Example #6
def get_user_links(ids):
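    # Map each user's screen name to the links extracted from that user's
    # cached '<user_id>_timeline' file.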
    user_links = {}
    print('Extracting links from user timelines...')
    for user_id in tqdm(ids):
        f_path = PREFIX + user_id + '_timeline'
        try:
            links = extract_links_from_timeline(loadj(f_path),
                                                include_retweets=False)
            user_links[USER_NAMES[user_id]] = links
        except Exception as e:
            print(e)
    return user_links
Example #7
def get_friend_whitelist(user_names):
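    # Friend files are named '<user_id>_<screen_name>_friends'. Whitelist every
    # non-filtered user plus any of their friends that appear in user_names.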
    friend_whitelist = set()
    files = [f for f in os.listdir(PREFIX) if f.find('_friends') > 0]
    for f_name in files:
        user_id, screen_name, _ = f_name.split('_')
        if screen_name not in FILTERED:
            friend_whitelist.add(user_id)
            friends = loadj(PREFIX + f_name)
            for friend_id in friends:
                # We might have updated the friends list but not
                # re-crawled yet.
                if friend_id in user_names:
                    friend_whitelist.add(friend_id)
    return friend_whitelist
Example #8
def write_user_names():
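    # Build a user_id -> screen_name mapping from the cached '<user_id>_user'
    # files and persist it alongside the reverse mapping.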
    user_names = {}
    files = [f for f in os.listdir(PREFIX) if f.find('_user') > 0]
    for f_name in files:
        try:
            obj = loadj(PREFIX+f_name)
            user_names[obj['id_str']] = obj['screen_name']
        except Exception as e:
            print(f_name, e)
    writej(user_names, PREFIX + 'user_names')

    # Write the reverse dict as well.
    user_ids = {}
    for user_id, name in user_names.items():
        user_ids[name] = user_id
    writej(user_ids, PREFIX + 'user_ids')
Example #9
def get_provenance():
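    # Assign each screen name a media outlet when its profile description
    # matches one of the MEDIA patterns, and flag self-described conservatives.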
    media_prov = defaultdict(lambda: '')
    is_conservative = defaultdict(lambda: False)
    files = [f for f in os.listdir(PREFIX) if f.find('_user') > 0]
    for f_name in files:
        user = loadj(PREFIX + f_name)
        user_desc = user['description'].lower()
        screen_name = user['screen_name']
        if 'conservative' in user_desc:
            is_conservative[screen_name] = True
        for media, matches in MEDIA.items():
            for match in matches:
                # NYT bestseller authors throw things off.
                if (match in user_desc and screen_name not in media_prov
                        and 'bestsell' not in user_desc):
                    media_prov[screen_name] = media
    return media_prov, is_conservative
Example #10
import math

from helpers import loadj

POSITION = loadj('node_positions.json')

# Axis direction in the layout's coordinate space, normalized to a unit
# vector below.
U = (1, -0.3601689858)
UU = math.sqrt(U[0] * U[0] + U[1] * U[1])
U = (U[0] / UU, U[1] / UU)
# Reference point subtracted from the averaged node positions in get_scalar().
M = (461.3168343280106, 445.9243631476138)

RES = 0.25
RANGE = [
    i / 100 for i in range(int(-1 * 100), int((1 + RES) * 100), int(RES * 100))
]

MIN_ISEC_ARTICLE = 4
MIN_USERS = 3
MAX_RETURNED_ARTICLES = 5


# We're capping the political spectrum to -350, 350.
def clamp(scalar):
    scalar = scalar / 350
    return int(scalar / RES) * RES


def get_scalar(users):
    avg_pos = (sum([POSITION[u]['x'] for u in users]) / len(users),
               sum([POSITION[u]['y'] for u in users]) / len(users))
    avg_pos = (avg_pos[0] - M[0], avg_pos[1] - M[1])
Example #11
def get_api_creds():
    return loadj('api_creds.json')
Example #12
import os.path
import asyncio
from peony import PeonyClient

from helpers import loadj, writej, append_listj

SEED_NAMES = [
    'dailykos', 'thinkprogress', 'HuffingtonPost', 'voxdotcom', 'nytimes',
    'washingtonpost', 'politico', 'USATODAY', 'StephensWSJ', 'WSJ',
    'arthurbrooks', 'EWErickson', 'nypost', 'BreitbartNews', 'RealAlexJones'
]
MAX_SEED_FOLLOWER_COUNT = 300000
MIN_SEED_FOLLOWER_COUNT = 20000
PREFIX = '/root/timelines/'

USER_IDS = loadj(PREFIX + 'user_ids')
GRAPH_NAMES = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
GRAPH_IDS = [USER_IDS[name] for name in GRAPH_NAMES]


class ClientPool:
    def __init__(self, api_creds):
        self._index = 0
        # One PeonyClient per set of API credentials.
        self._clients = [
            PeonyClient(consumer_key=creds['consumer_key'],
                        consumer_secret=creds['consumer_secret'],
                        access_token=creds['access_token'],
                        access_token_secret=creds['access_token_secret'])
            for creds in api_creds
        ]
        self.client_num = len(self._clients)
Example #13
from datetime import datetime
from collections import defaultdict
import pytz
from tqdm import tqdm

from crawler import PREFIX
from helpers import loadj

MIN_DATE = '2017-04-01'
USER_NAMES = loadj(PREFIX + 'user_names')


def get_user_links(ids):
    user_links = {}
    print('Extracting links from user timelines...')
    for user_id in tqdm(ids):
        f_path = PREFIX + user_id + '_timeline'
        try:
            links = extract_links_from_timeline(loadj(f_path),
                                                include_retweets=False)
            user_links[USER_NAMES[user_id]] = links
        except Exception as e:
            print(e)
    return user_links


def extract_links_from_timeline(tweets, include_retweets=True):
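    # Collect the links attached to each tweet in the timeline, optionally
    # skipping retweets.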
    if tweets is None:
        return []
    links = []
    for t in tweets:
Example #14
import os
from collections import defaultdict
from networkx import Graph

from crawler import PREFIX
from helpers import writej, loadj

MIN_DIST = 0.05
MIN_DEGREE = 15

MEDIA = loadj('media.json')
NEOCONSERVATIVES = frozenset([
    p.lower() for p in [
        'arthurbrooks', 'dpletka', 'FredBarnes', 'davidfrum', 'JonahNRO',
        'brithume', 'krauthammer', 'BillKristol', 'JoshuaMuravchik',
        'DanielPipes', 'jpodhoretz', 'JRubinBlogger', 'mrubin1971',
        'jonathans_tobin'
    ]
])


def get_user_names():
    return loadj(PREFIX + 'user_names')


def get_provenance():
    media_prov = defaultdict(lambda: '')
    is_conservative = defaultdict(lambda: False)
    files = [f for f in os.listdir(PREFIX) if f.find('_user') > 0]
    for f_name in files:
        user = loadj(PREFIX + f_name)
Example #15
def get_user_names():
    return loadj(PREFIX + 'user_names')
Example #16
import os
from collections import defaultdict
from multiprocessing import Pool
from newspaper import Article
import newspaper.nlp as nlp
from timeoutcontext import timeout

from helpers import loadj, writej
from links import get_user_links
from crawler import PREFIX

NUM_WORKERS = 1
KEYWORDS_PATH = PREFIX + 'links_keywords.json'
PARSED_LINKS_PATH = PREFIX + 'parsed_links.json'
USER_IDS = loadj(PREFIX + 'user_ids')

def gen_keywords():
    if os.path.exists(KEYWORDS_PATH):
        keywords = load_keywords()
    else:
        # Use a defaultdict, matching what load_keywords() returns, so entries
        # for newly parsed canonical links can be created below.
        keywords = defaultdict(lambda: {})

    # We keep a cache of already-parsed links and store each link's matched
    # canonical link, so that a user can still be added to the entry of a
    # link that was already parsed.
    if os.path.exists(PARSED_LINKS_PATH):
        parsed_links = loadj(PARSED_LINKS_PATH)
    else:
        parsed_links = {}

    graph_names = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
Example #17
def get_trimmable_nodes():
    return frozenset(loadj('to_trim.json'))