def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    local_output_root = kwargs.get('local-output-root', None)
    page_topics_pickle = kwargs['page-topic-pickle']

    page_topics = pickle.load(open(page_topics_pickle, 'rb'))
    _pt_dict = {p[0]: p[1] for p in page_topics}
    pt_dict = sc.broadcast(_pt_dict)

    pages_other = sc.accumulator([], ListParam())

    author_entity = sc.pickleFile(input_path)

    author_topic = author_entity.map(
        lambda x: entities_to_topics(x, pt_dict, pages_other))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic.saveAsPickleFile(output_path)

    # Accumulators are only populated once an action runs, so inspect
    # pages_other after the save has materialised the RDD.
    print('pages_other:')
    pp.pprint(pages_other.value)
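
# ListParam is used above but not defined in this snippet. A minimal sketch of
# such a list-accumulating AccumulatorParam (hypothetical; the real
# implementation may differ):
from pyspark.accumulators import AccumulatorParam


class ListParam(AccumulatorParam):
    def zero(self, initial_value):
        # every task starts from an empty list
        return []

    def addInPlace(self, list1, list2):
        # merge the partial lists produced by different tasks
        list1.extend(list2)
        return list1
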
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    # /user/username/data/output/_jobs/author_subreddit/latest
    subreddit_category_pickle = kwargs['subreddit-category-pickle']
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))
    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    # create vectorizer for each category out of subreddit_df dataset
    subreddits_grouped_by_categories = subreddits_to_categories(
        _subreddit_df, subreddit_category)
    _vectorizers = {}
    for k, v in subreddits_grouped_by_categories.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category_vec/latest

    authors_total = data.count()
    author_category = data.map(lambda x: get_feature_vectors(
        x, subreddit_df, authors_total, vectorizers))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
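
# For reference: the per-category DictVectorizer objects fitted above map a
# sparse {subreddit: df} dict onto a fixed-width feature vector. Plain
# scikit-learn usage, independent of the job above (feature names are made up):
#
#   from sklearn.feature_extraction import DictVectorizer
#   dv = DictVectorizer()
#   dv.fit_transform([{'python': 3, 'askreddit': 1}, {'science': 2}])
#   dv.transform({'python': 1}).toarray()  # one row, one column per feature
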
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)
    import nltk

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(
        input_path)  # /user/username/data/output/sub_com_threads
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.map(lambda x: top_level(x, botlist)).filter(lambda x: x)

    pickle.dump(
        pairs.collect(),
        open(
            '/home/username/data/output/_jobs/pairs_winargs_malleability.pickle',
            'wb'))

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(input_path)
    threads = file.map(lambda l: json.loads(l))
    pairs = threads.flatMap(lambda x: top_level(x))
    pairs = pairs.filter(
        lambda x: x[0]['author'] not in botlist.value
        and x[1]['author'] not in botlist.value)

    ### - filter out pairs where sub['selftext'] or com['body'] empty
    ### - OR author == '[removed]/[deleted]'
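    # A possible implementation of the TODO above (hypothetical sketch, left
    # commented out; field names follow the Reddit schema used in these jobs):
    # pairs = pairs.filter(
    #     lambda x: x[0].get('selftext') and x[1].get('body')
    #     and x[0]['author'] not in ('[removed]', '[deleted]')
    #     and x[1]['author'] not in ('[removed]', '[deleted]'))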

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    file = sc.textFile(
        input_path)  # /user/username/data/output/author_subreddits/latest
    data = file.map(lambda l: json.loads(l))

    flattened = data.flatMap(lambda x: x['subreddits'])
    reduced = flattened.map(lambda x: (x['subreddit'], 1)).reduceByKey(
        lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"subreddit": x[0], "df": x[1]})

    subreddit_df_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    subreddit_df_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root +
                                    '/subreddit_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['subreddit']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/subreddit_df.pickle',
                               'wb'))
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category/latest

    flattened = data.flatMap(lambda x: [k for k, v in x['categories'].items()])
    print(flattened, flattened.count())
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    print(reduced, reduced.count())
    mapped = reduced.map(lambda x: {"category": x[0], "df": x[1]})
    print(mapped, mapped.count())

    data_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    print(data_json, data_json.count())
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    data_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/category_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['category']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/category_df.pickle',
                               'wb'))
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    subs_file = sc.textFile(subs_path)
    # jsonloads() (not shown) presumably wraps json.loads and returns "" on
    # parse errors, hence the emptiness check below.
    subs_data = subs_file.map(lambda l: jsonloads(l)).filter(
        lambda l: l != "" and 'author' in l and 'selftext' in l)

    coms_file = sc.textFile(coms_path)
    coms_data = coms_file.map(lambda l: jsonloads(l)).filter(
        lambda l: l != "" and 'author' in l and 'body' in l)

    print("\n#Filtering sub/com...")
    subs_data = subs_data.filter(
        lambda x: x['author'] not in botlist.value
        and len(x["selftext"]) > 0 and x["selftext"] != "[removed]")
    coms_data = coms_data.filter(
        lambda x: x['author'] not in botlist.value
        and len(x["body"]) > 0 and x["body"] != "[removed]")

    cmv_authors = pickle.load(
        open('/home/username/data/output/cmv_authors.pickle', 'rb'))
    cmv_authors_b = sc.broadcast(cmv_authors)
    subs_data = subs_data.filter(lambda x: x['author'] in cmv_authors_b.value)
    coms_data = coms_data.filter(lambda x: x['author'] in cmv_authors_b.value)

    print("\n#Running nltk...")
    subs_sentences = subs_data.map(lambda x: get_sentences(x, 'selftext')).map(
        lambda x: (x['author'], x['sentences']))
    coms_sentences = coms_data.map(lambda x: get_sentences(x, 'body')).map(
        lambda x: (x['author'], x['sentences']))
    print(subs_sentences.take(1))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_sentences, coms_sentences])
    print(united.take(1))

    sentences_reduced = united.reduceByKey(lambda a, b: a + b)
    print(sentences_reduced.take(1))

    print("\n#Saving results...")
    output_json = sentences_reduced.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    output_json.saveAsTextFile(output_path)
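
# get_sentences() is not defined in this snippet. Given that nltk_data is
# shipped to the executors via sc.addFile() above, it is presumably a thin
# wrapper around nltk.sent_tokenize, roughly (hypothetical sketch; the real
# helper may carry extra fields through):
def get_sentences(record, field):
    import nltk
    from pyspark import SparkFiles
    nltk.data.path.append(SparkFiles.get('nltk_data'))
    return {'author': record['author'],
            'sentences': nltk.sent_tokenize(record[field])}
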
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    # /user/username/data/output/_jobs/author_subreddit/latest
    local_output_root = kwargs.get('local-output-root', None)
    subreddit_category_pickle = kwargs['subreddit-category-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))

    author_subreddit = sc.pickleFile(
        input_path)  # /user/username/data/output/author_subreddit_vec/latest

    author_category = author_subreddit.map(
        lambda x: subreddits_to_categories(x, subreddit_category))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    category_df_pickle = kwargs['category-df-pickle']

    _category_df = pickle.load(open(category_df_pickle, 'rb'))
    category_df = sc.broadcast(_category_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_category_df)
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(input_path) # /user/username/data/output/author_category/latest

    authors_total = data.count()
    author_category_features = data.map(lambda x: get_feature_vectors(x, authors_total, category_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category_features.saveAsPickleFile(output_path)
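
# get_feature_vectors() is not shown here, and its signature differs from job
# to job in this collection. A hypothetical sketch matching the call above,
# assuming each record carries a 'categories' {name: count} mapping and that
# the weighting is TF-IDF-like (both assumptions, not confirmed by the source):
import math


def get_feature_vectors(record, authors_total, df_broadcast, dv_broadcast):
    weights = {
        name: count * math.log(authors_total / df_broadcast.value.get(name, 1))
        for name, count in record['categories'].items()
    }
    # project onto the fixed feature space learned by the DictVectorizer
    record['features'] = dv_broadcast.value.transform(weights)
    return record
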
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_subreddit_df)
    dv = sc.broadcast(_dv)

    file = sc.textFile(input_path)
    data = file.map(lambda l: json.loads(l) )

    authors_total = data.count()
    author_subreddit_features = data.map(lambda x: get_feature_vectors(x, authors_total, subreddit_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name) # output_folder='/tf_squared'
    author_subreddit_features.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']
    stopwords_csv = kwargs['stopwords-csv']  # ranksnl_stopwords.csv
    sem_model_path = kwargs['sem-model-path']
    sem_path = kwargs['sem-path']

    sc.addFile(sem_model_path)
    sc.addPyFile(sem_path)

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    print("\n#sematicize():")
    sys.path.insert(0, SparkFiles.get(sem_path.split('/')[-1]))
    from semanticizest import Semanticizer
    print("\n#loading model...")
    _sem = Semanticizer(SparkFiles.get(sem_model_path.split('/')[-1]))
    print("\n#model loaded.")
    sem = sc.broadcast(_sem)

    with open(stopwords_csv, 'r') as f:
        reader = csv.reader(f)
        _stopwords = list(map(lambda x: x[0], list(reader)))
    stopwords = sc.broadcast(_stopwords)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    subs_file = sc.textFile(subs_path)
    subs_data = subs_file.map(lambda l: json.loads(l))

    coms_file = sc.textFile(coms_path)
    coms_data = coms_file.map(lambda l: json.loads(l))

    print("\n#Filtering sub/com...")
    subs_data = subs_data.filter(
        lambda x: x['author'] not in botlist.value
        and len(x["selftext"]) > 0 and x["selftext"] != "[removed]")
    coms_data = coms_data.filter(
        lambda x: x['author'] not in botlist.value
        and len(x["body"]) > 0 and x["body"] != "[removed]")

    print("\n#Running nltk...")
    subs_sentences = subs_data.map(lambda x: get_sentences(x, 'selftext'))
    coms_sentences = coms_data.map(lambda x: get_sentences(x, 'body'))

    print("\n#Running wikifier...")
    subs_entities = subs_sentences.map(lambda x: semanticize(x, sem))
    coms_entities = coms_sentences.map(lambda x: semanticize(x, sem))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_entities, coms_entities])

    entities_flat = united.flatMap(
        lambda x: [((x['author'], e), 1) for e in x['entities']])

    print(entities_flat.take(1))
    # [(('Toby-OrNotToby', ('HMV', 0.2189578713968958, 'Q10854572', 1144829, 'HMV')), 1)]
    entities_reduced = entities_flat.reduceByKey(lambda a, b: a + b)
    print(entities_reduced.take(1))
    # [(('disinterestedMarmot', ('Solutions of the Einstein field equations', 5.930494603249911e-05, 'Q4394061', 2001621, 'solution')), 9)]
    entities_grouped_by_author = entities_reduced.map(
        lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey().mapValues(list)
    print(entities_grouped_by_author.take(1))

    print("\n#Saving results...")
    output_json = entities_grouped_by_author.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    output_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None), kwargs.get('input-path', None))
    
    page_topics = pickle.load(open(local_output_root+'/page_topics.pickle', 'rb'))

    _entity_df = pickle.load(open(local_output_root+'/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    topics_df = {'Academic disciplines': 169126,
        'Arts': 165790,
        'Business': 165670,
        'Concepts': 159671,
        'Culture': 169696,
        'Education': 162128,
        'Entertainment': 166557,
        'Events': 157631,
        'Geography': 164197,
        'Health': 168352,
        'History': 166707,
        'Humanities': 169517,
        'Language': 168451,
        'Law': 163853,
        'Life': 167678,
        'Mathematics': 157341,
        'Nature': 167276,
        'Other': 129536,
        'People': 144695,
        'Philosophy': 163002,
        'Politics': 167504,
        'Reference': 157377,
        'Religion': 161830,
        'Science': 167156,
        'Society': 170080,
        'Sports': 158917,
        'Technology': 167069,
        'Universe': 160159,
        'World': 164604}

    topic_pages = pickle.load(open(local_output_root+'/topic_pages.pickle', 'rb'))
    # vectorizer for each topic
    _vectorizers = {}
    for k,v in topic_pages.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(topics_df)
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(input_path)

    authors_total = data.count()
    author_topic_entity_vec = data.filter(
        lambda x: len(x['topics']) > 0 and len(x['topics_freq']) > 0).map(
            lambda x: get_feature_vectors(x, authors_total, entity_df,
                                          topics_df, vectorizers, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic_entity_vec.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    botlist_csv = kwargs['botlist-csv']  # '/home/username/data/botlist.csv'
    stopwords_csv = kwargs[
        'stopwords-csv']  # /home/username/data/ranksnl_stopwords.csv
    sem_model_path = kwargs[
        'sem-model-path']  # 'hdfs://hadoop:8020/user/username/data/enwiki_pages_n3_1.model'
    sem_path = kwargs[
        'sem-path']  # '/home/username/tools/semanticizest/semanticizest3.zip'
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']

    sc.addFile(sem_model_path)
    sc.addPyFile(sem_path)

    sys.path.insert(0, SparkFiles.get('libs.zip'))
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    print("\n#sematicize():")
    sys.path.insert(0, SparkFiles.get(sem_path.split('/')[-1]))
    from semanticizest import Semanticizer
    print("\n#loading model...")
    _sem = Semanticizer(SparkFiles.get(sem_model_path.split('/')[-1]))
    print("\n#model loaded.")
    sem = sc.broadcast(_sem)

    with open(stopwords_csv, 'r') as f:
        reader = csv.reader(f)
        _stopwords = list(map(lambda x: x[0], list(reader)))
    stopwords = sc.broadcast(_stopwords)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    import nltk
    nltk.data.path.append(SparkFiles.get("nltk_data"))

    from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize
    from nltk.corpus import stopwords as nltk_stopwords  # avoid shadowing the 'stopwords' broadcast above
    from nltk.corpus import wordnet

    from nltk.stem import WordNetLemmatizer
    lemmatizer = sc.broadcast(WordNetLemmatizer())

    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    analyser = sc.broadcast(SentimentIntensityAnalyzer())

    subs_data = sc.pickleFile(subs_path)
    coms_data = sc.pickleFile(coms_path)

    print("\n#Running nltk, semanticizer...")
    subs_entities = subs_data.map(
        lambda x: process_sentences(x, 'selftext', sem, lemmatizer, analyser))
    coms_entities = coms_data.map(
        lambda x: process_sentences(x, 'body', sem, lemmatizer, analyser))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_entities, coms_entities])
    entities_reduced = united.reduceByKey(lambda a, b: (a + b))

    ### remove duplicates
    author_entity_cleaned = entities_reduced.map(lambda x:
                                                 (x[0], list(set(x[1]))))

    ### calculate entity frequencies
    ### TODO: used for freq-entity vector jobs, run separately from this job
    entities_flat = author_entity_cleaned.flatMap(
        lambda x: [((x[0], e[2]), (e, 1)) for e in x[1]])
    entities_reduced = entities_flat.reduceByKey(lambda a, b:
                                                 (a[0], a[1] + b[1]))
    entities_grouped_by_author = entities_reduced.map(
        lambda x: (x[0][0], x[1])).groupByKey().mapValues(list)
    entities_grouped_by_author.saveAsPickleFile(
        '/user/username/data/output/_jobs/author_entity_freq')

    ### add category and top-category to entities
    ### TODO: run as a separate job
    subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    entities_flat = author_entity_cleaned.flatMap(
        lambda x: [(e[0], e[1], e[2], e[5]) for e in x[1]]).distinct()
    entities_categories = entities_flat.map(
        lambda x: x + (subreddit_category.get(x[3], 'Other'),
                       subreddit_topcategory.get(x[3], 'Other')))
    entities_categories.saveAsPickleFile(
        '/user/username/data/output/_jobs/author_entity_categories')

    ### create two dictionaries (top)category->list of entities
    entities_categories = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity_categories')
    _entities_categories = entities_categories.map(
        lambda x: (x[2], x[4], x[5])).collect()
    category_entities = {}
    topcategory_entities = {}
    for x in _entities_categories:
        e = x[0]
        cat = x[1]
        topcat = x[2]
        if cat not in category_entities:
            category_entities[cat] = {}
        category_entities[cat][e] = 0
        if topcat not in topcategory_entities:
            topcategory_entities[topcat] = {}
        topcategory_entities[topcat][e] = 0
    pickle.dump(
        category_entities,
        open('/home/username/data/output/_jobs/category_entities.pickle',
             'wb'))
    pickle.dump(
        topcategory_entities,
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'wb'))

    ### create dictionary entity-categories
    ### TODO: run as a separate job
    entity_categories_dict = {}
    for x in _entities_categories:
        e = x[0]
        cat = x[1]
        topcat = x[2]
        if e not in entity_categories_dict:
            entity_categories_dict[e] = {}
        entity_categories_dict[e]['cat'] = cat
        entity_categories_dict[e]['topcat'] = topcat
    pickle.dump(
        entity_categories_dict,
        open('/home/username/data/output/_jobs/entity_categories_dict.pickle',
             'wb'))

    print("\n#Saving results...")
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_entity_cleaned.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    ### author-submissions frequencies
    subs_data = sc.pickleFile(subs_path)  # corpora-reddit/corpus-submissions
    subs_data = subs_data.filter(
        lambda x: all(k in x for k in ('author', 'subreddit')))
    # leave only authors from pairs
    authors_from_pairs = set(
        pickle.load(
            open('/home/username/data/output/_jobs/authors_from_pairs.pickle',
                 'rb')))
    subs_data = subs_data.filter(lambda x: x['author'] in authors_from_pairs)

    author_subreddit_submission = subs_data.map(lambda x: (
        (x['author'], x['subreddit']), 1)).reduceByKey(lambda a, b: a + b)
    author_subreddit_submission = author_subreddit_submission.map(
        lambda x: {
            'author': x[0][0],
            'subreddit': x[0][1],
            'submissions': x[1]
        })

    ### author-comment frequencies
    coms_data = sc.pickleFile(coms_path)  # corpora-reddit/corpus-comments
    coms_data = coms_data.filter(
        lambda x: all(k in x for k in ('author', 'subreddit')))
    coms_data = coms_data.filter(lambda x: x['author'] in authors_from_pairs)

    author_subreddit_comment = coms_data.map(lambda x: (
        (x['author'], x['subreddit']), 1)).reduceByKey(lambda a, b: a + b)
    author_subreddit_comment = author_subreddit_comment.map(
        lambda x: {
            'author': x[0][0],
            'subreddit': x[0][1],
            'comments': x[1]
        })

    data1 = author_subreddit_submission.map(extend).map(
        lambda x: ((x['author'], x['subreddit']), x))
    data2 = author_subreddit_comment.map(extend).map(
        lambda x: ((x['author'], x['subreddit']), x))

    united = data1.union(data2).reduceByKey(reduce_subreddit)
    print('united:', united.count())

    author_subreddit_union = united.map(
        lambda x: {
            'author': x[1]['author'],
            'submissions': x[1]['submissions'],
            'subreddit': x[1]['subreddit'],
            'comments': x[1]['comments']
        })

    ### Filter user profile subreddits
    print('author_subreddit_union:', author_subreddit_union.count())
    author_subreddit_union = author_subreddit_union.filter(
        lambda x: not x['subreddit'].startswith('u_') and x['subreddit'] != '')
    print('author_subreddit_union (user profiles filtered):',
          author_subreddit_union.count())

    ### Filter bots and group by author
    author_subreddit_botlist = author_subreddit_union.filter(
        lambda x: x['author'] not in botlist.value)
    author_subreddit_grouped = author_subreddit_botlist.map(
        lambda x: (x['author'], x)).groupByKey().mapValues(list)
    print('author_subreddit_botlist:', author_subreddit_botlist.count())
    print('author_subreddit_grouped:', author_subreddit_grouped.count())

    author_subreddit = author_subreddit_grouped.map(lambda x: {
        "author": x[0],
        "subreddits": x[1]
    })

    ### only authors from 'changemyview'
    author_subreddit_cmv = author_subreddit.filter(lambda x: filter_cmv(x))

    author_subreddit_json = author_subreddit_cmv.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_subreddit_json.saveAsTextFile(output_path)
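
# extend() and reduce_subreddit() are not defined in this snippet. From the way
# they are used above, they presumably default the missing counter and merge
# the per-(author, subreddit) submission/comment records, roughly
# (hypothetical sketch):
def extend(record):
    record.setdefault('submissions', 0)
    record.setdefault('comments', 0)
    return record


def reduce_subreddit(a, b):
    return {
        'author': a['author'],
        'subreddit': a['subreddit'],
        'submissions': a['submissions'] + b['submissions'],
        'comments': a['comments'] + b['comments'],
    }
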
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    '''TODO:
        - load category_entities and topcategory_entities dicts
        - author_entity: group list of entities into categories / topcategories
    '''

    _subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)
    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    category_entities = pickle.load(
        open('/home/username/data/output/_jobs/category_entities.pickle',
             'rb'))
    topcategory_entities = pickle.load(
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'rb'))

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    # create vectorizer for each category out of entity_df dataset
    # _vectorizers_cats = {}
    # for k,v in category_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_cats[k] = dv
    # vectorizers_cats = sc.broadcast(_vectorizers_cats)
    # pickle.dump(_vectorizers_cats, open('/home/username/data/output/_jobs/vectorizers_cats.pickle','wb'))
    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)

    # _vectorizers_topcats = {}
    # for k,v in topcategory_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_topcats[k] = dv
    # vectorizers_topcats = sc.broadcast(_vectorizers_topcats)
    # pickle.dump(_vectorizers_topcats, open('/home/username/data/output/_jobs/vectorizers_topcats.pickle','wb'))
    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle',
             'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    # author_entities_categories = sc.pickleFile('/user/username/data/output/_jobs/author_entity_categories')
    data = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')
    authors_total = data.count()

    ### calculating category_entities features vectors
    author_entity_features = data.map(lambda x: get_feature_vectors(
        x, authors_total, entity_df, vectorizers_cats, vectorizers_topcats,
        subreddit_category, subreddit_topcategory))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_entity_features.saveAsPickleFile(output_path)