# Shared imports assumed by the example jobs below; each snippet also relies on
# project-level helpers defined elsewhere in the repository (utils, ListParam,
# entities_to_topics, entity_to_categories, subreddits_to_categories,
# get_feature_vectors, top_level, split).
import csv
import json
import pickle
import time

import numpy as np
from sklearn.feature_extraction import DictVectorizer


def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    local_output_root = kwargs.get('local-output-root', None)
    page_topics_pickle = kwargs['page-topic-pickle']

    page_topics = pickle.load(open(page_topics_pickle, 'rb'))
    _pt_dict = {p[0]: p[1] for p in page_topics}
    pt_dict = sc.broadcast(_pt_dict)
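    # Broadcasting the page->topic lookup gives each executor a single read-only copy
    # for entities_to_topics() below, instead of shipping the dict with every task.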

    pages_other = sc.accumulator([], ListParam())

    author_entity = sc.pickleFile(input_path)

    author_topic = author_entity.map(
        lambda x: entities_to_topics(x, pt_dict, pages_other))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic.saveAsPickleFile(output_path)

    # The accumulator is only populated once an action has run, so report it after
    # the save.
    print('pages_other:')
    pp.pprint(pages_other.value)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)
    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle',
             'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    _subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)
    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    author_entity = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')

    flatten = author_entity.flatMap(lambda x: [(x[0], ) + e for e in x[1]])
    categories = flatten.map(lambda x: entity_to_categories(
        x, subreddit_category, subreddit_topcategory))
    # ('blackngold14', 'Personal foul (basketball)', 'Q15073191', 1100642, 'personal foul', -0.1779, 'Saints', 't3_2nupey') + (cat, topcat)
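    # Tuple indices in the sample above (assumed from the example record): 0=author,
    # 1=entity title, 2=Wikidata id, 3=page id, 4=surface form, 5=sentiment score,
    # 6=subreddit, 7=thread id, 8=category, 9=top-level category.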

    ### key each record as ((cat/topcat, author, entity title, page id), sent_score),
    ### then collapse each key's score list to its median
    medians_cat = categories.map(
        lambda x: ((x[8], x[0], x[1], x[3]), x[5])).groupByKey().mapValues(
            list).map(lambda x: x[0] + (float(np.median(x[1])), ))
    medians_topcat = categories.map(
        lambda x: ((x[9], x[0], x[1], x[3]), x[5])).groupByKey().mapValues(
            list).map(lambda x: x[0] + (float(np.median(x[1])), ))

    grouped_cat = medians_cat.groupBy(lambda x: x[1])
    grouped_topcat = medians_topcat.groupBy(lambda x: x[1])

    ### category_entities features vectors
    feature_vectors_cat = grouped_cat.map(
        lambda x: get_feature_vectors(x, entity_df, vectorizers_cats))
    feature_vectors_topcat = grouped_topcat.map(
        lambda x: get_feature_vectors(x, entity_df, vectorizers_topcats))

    feature_vectors_cat.saveAsPickleFile(
        '/user/username/data/output/_jobs/sentity_category_vec')
    feature_vectors_topcat.saveAsPickleFile(
        '/user/username/data/output/_jobs/sentity_topcategory_vec')
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    # /user/username/data/output/_jobs/author_subreddit/latest
    subreddit_category_pickle = kwargs['subreddit-category-pickle']
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))
    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    # create vectorizer for each category out of subreddit_df dataset
    subreddits_grouped_by_categories = subreddits_to_categories(
        _subreddit_df, subreddit_category)
    _vectorizers = {}
    for k, v in subreddits_grouped_by_categories.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
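        # Only the fitted vocabulary is needed; the matrix returned by fit_transform()
        # is discarded.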
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category_vec/latest

    authors_total = data.count()
    author_category = data.map(lambda x: get_feature_vectors(
        x, subreddit_df, authors_total, vectorizers))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
# Example #4
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)
    import nltk

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = [row[0] for row in reader]
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(
        input_path)  # /user/username/data/output/sub_com_threads
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.map(lambda x: top_level(x, botlist)).filter(lambda x: x)
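    # top_level() presumably returns a falsy value for threads without a usable
    # submission/comment pair; filter(lambda x: x) drops those.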

    pickle.dump(
        pairs.collect(),
        open(
            '/home/username/data/output/_jobs/pairs_winargs_malleability.pickle',
            'wb'))

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = [row[0] for row in reader]
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(input_path)
    threads = file.map(lambda l: json.loads(l))
    pairs = threads.flatMap(lambda x: top_level(x))
    pairs = pairs.filter(lambda x: x[0]['author'] not in botlist.value
                         and x[1]['author'] not in botlist.value)

    ### - filter out pairs where sub['selftext'] or com['body'] empty
    ### - OR author == '[removed]/[deleted]'
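    # A minimal sketch of the TODO filter above (assuming each pair is a
    # (submission, comment) tuple of dicts with 'selftext' / 'body' / 'author' keys),
    # left commented out here:
    # removed = ('[removed]', '[deleted]')
    # pairs = pairs.filter(lambda x: x[0].get('selftext') and x[1].get('body')
    #                      and x[0]['author'] not in removed
    #                      and x[1]['author'] not in removed)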

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    file = sc.textFile(
        input_path)  # /user/username/data/output/author_subreddits/latest
    data = file.map(lambda l: json.loads(l))

    flattened = data.flatMap(lambda x: x['subreddits'])
    reduced = flattened.map(lambda x: (x['subreddit'], 1)).reduceByKey(
        lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"subreddit": x[0], "df": x[1]})

    subreddit_df_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    subreddit_df_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root +
                                    '/subreddit_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['subreddit']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/subreddit_df.pickle',
                               'wb'))
# Example #7
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category/latest

    flattened = data.flatMap(lambda x: [k for k, v in x['categories'].items()])
    print(flattened, flattened.count())
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    print(reduced, reduced.count())
    mapped = reduced.map(lambda x: {"category": x[0], "df": x[1]})
    print(mapped, mapped.count())
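    # Note: each count() here triggers a separate Spark job over the full lineage;
    # these prints are debug output.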

    data_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    print(data_json, data_json.count())
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    data_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/category_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['category']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/category_df.pickle',
                               'wb'))
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))

    file = sc.textFile(input_path)
    pairs = file.map(lambda l: json.loads(l))

    # Bind the result to a new name: assigning back to `split` would shadow the
    # split() helper referenced inside the lambda.
    split_pairs = pairs.flatMap(lambda x: split(x))

    with open('/home/username/data/output/_jobs/pairs_winargs_split.pickle', 'wb') as f:
        pickle.dump(split_pairs.collect(), f)
# Example #9
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    data = sc.pickleFile('/user/username/data/output/_jobs/author_entity_freq')

    flattened = data.flatMap(lambda x: [e[0][2] for e in x[1]])
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"entity_id": x[0], "df": x[1]})

    print('Saving to {0}...'.format(local_output_root + '/entity_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['entity_id']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/entity_df.pickle', 'wb'))
# Example #10
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    # /user/username/data/output/_jobs/author_subreddit/latest
    local_output_root = kwargs.get('local-output-root', None)
    subreddit_category_pickle = kwargs['subreddit-category-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))

    author_subreddit = sc.pickleFile(
        input_path)  # /user/username/data/output/author_subreddit_vec/latest

    author_category = author_subreddit.map(
        lambda x: subreddits_to_categories(x, subreddit_category))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    category_df_pickle = kwargs['category-df-pickle']

    _category_df = pickle.load(open(category_df_pickle, 'rb'))
    category_df = sc.broadcast(_category_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_category_df)
    dv = sc.broadcast(_dv)
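    # Fitting on the category->df dict fixes the feature column ordering; the fitted
    # vectorizer is broadcast so get_feature_vectors() can reuse it on the executors.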

    data = sc.pickleFile(input_path) # /user/username/data/output/author_category/latest

    authors_total = data.count()
    author_category_features = data.map(lambda x: get_feature_vectors(x, authors_total, category_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category_features.saveAsPickleFile(output_path)
# Example #12
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_subreddit_df)
    dv = sc.broadcast(_dv)

    file = sc.textFile(input_path)
    data = file.map(lambda l: json.loads(l))

    authors_total = data.count()
    author_subreddit_features = data.map(lambda x: get_feature_vectors(x, authors_total, subreddit_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name) # output_folder='/tf_squared'
    author_subreddit_features.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    
    page_topics = pickle.load(open(local_output_root+'/page_topics.pickle', 'rb'))

    _entity_df = pickle.load(open(local_output_root+'/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

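    # Hard-coded per-topic document frequencies (presumably precomputed offline).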
    topics_df = {
        'Academic disciplines': 169126,
        'Arts': 165790,
        'Business': 165670,
        'Concepts': 159671,
        'Culture': 169696,
        'Education': 162128,
        'Entertainment': 166557,
        'Events': 157631,
        'Geography': 164197,
        'Health': 168352,
        'History': 166707,
        'Humanities': 169517,
        'Language': 168451,
        'Law': 163853,
        'Life': 167678,
        'Mathematics': 157341,
        'Nature': 167276,
        'Other': 129536,
        'People': 144695,
        'Philosophy': 163002,
        'Politics': 167504,
        'Reference': 157377,
        'Religion': 161830,
        'Science': 167156,
        'Society': 170080,
        'Sports': 158917,
        'Technology': 167069,
        'Universe': 160159,
        'World': 164604}

    topic_pages = pickle.load(open(local_output_root+'/topic_pages.pickle', 'rb'))
    # vectorizer for each topic
    _vectorizers = {}
    for k,v in topic_pages.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(topics_df)
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(input_path)

    authors_total = data.count()
    author_topic_entity_vec = data.filter(lambda x: len(x['topics']) > 0 and len(x['topics_freq']) > 0)\
        .map(lambda x: get_feature_vectors(x, authors_total, entity_df, topics_df, vectorizers, dv) )

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic_entity_vec.saveAsPickleFile(output_path)
# Example #14
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    # TODO:
    #   - load category_entities and topcategory_entities dicts
    #   - author_entity: group list of entities into categories / topcategories

    _subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)
    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    category_entities = pickle.load(
        open('/home/username/data/output/_jobs/category_entities.pickle',
             'rb'))
    topcategory_entities = pickle.load(
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'rb'))
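    # category_entities / topcategory_entities are only needed if the vectorizers are
    # rebuilt from scratch (see the commented-out blocks below); the prebuilt
    # vectorizers are loaded from pickle files instead.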

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    # create vectorizer for each category out of entity_df dataset
    # _vectorizers_cats = {}
    # for k,v in category_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_cats[k] = dv
    # vectorizers_cats = sc.broadcast(_vectorizers_cats)
    # pickle.dump(_vectorizers_cats, open('/home/username/data/output/_jobs/vectorizers_cats.pickle','wb'))
    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)

    # _vectorizers_topcats = {}
    # for k,v in topcategory_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_topcats[k] = dv
    # vectorizers_topcats = sc.broadcast(_vectorizers_topcats)
    # pickle.dump(_vectorizers_topcats, open('/home/username/data/output/_jobs/vectorizers_topcats.pickle','wb'))
    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle',
             'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    # author_entities_categories = sc.pickleFile('/user/username/data/output/_jobs/author_entity_categories')
    data = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')
    authors_total = data.count()

    ### calculating category_entities features vectors
    author_entity_features = data.map(lambda x: get_feature_vectors(
        x, authors_total, entity_df, vectorizers_cats, vectorizers_topcats,
        subreddit_category, subreddit_topcategory))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_entity_features.saveAsPickleFile(output_path)