def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    local_output_root = kwargs.get('local-output-root', None)

    page_topics_pickle = kwargs['page-topic-pickle']
    page_topics = pickle.load(open(page_topics_pickle, 'rb'))
    _pt_dict = {p[0]: p[1] for p in page_topics}
    pt_dict = sc.broadcast(_pt_dict)

    # accumulator for pages that entities_to_topics could not map to a topic
    pages_other = sc.accumulator([], ListParam())

    author_entity = sc.pickleFile(input_path)
    author_topic = author_entity.map(
        lambda x: entities_to_topics(x, pt_dict, pages_other))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic.saveAsPickleFile(output_path)

    # the map() above is lazy, so the accumulator is only populated once the
    # save action has run; print it afterwards rather than before
    print('pages_other:')
    pp.pprint(pages_other.value)
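# The ListParam accumulator class used above is not shown in this job. A minimal
# sketch, assuming it is the usual list-concatenating AccumulatorParam subclass
# (the project's actual implementation may differ):
from pyspark.accumulators import AccumulatorParam

class ListParam(AccumulatorParam):
    """Accumulator that merges per-task lists into one list on the driver."""

    def zero(self, value):
        # each task starts from an empty list
        return []

    def addInPlace(self, value1, value2):
        # value2 may be a single element or a list contributed by a task
        value1.extend(value2 if isinstance(value2, list) else [value2])
        return value1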
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)

    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle', 'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    _subreddit_category = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_category_index.pickle',
             'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)

    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    author_entity = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')
    flatten = author_entity.flatMap(lambda x: [(x[0], ) + e for e in x[1]])

    # sample record after entity_to_categories:
    # ('blackngold14', 'Personal foul (basketball)', 'Q15073191', 1100642,
    #  'personal foul', -0.1779, 'Saints', 't3_2nupey') + (cat, topcat)
    categories = flatten.map(lambda x: entity_to_categories(
        x, subreddit_category, subreddit_topcategory))

    ### map as ((cat/topcat, author, entity, e_id), sent_score), then take the
    ### median sentiment score per key
    medians_cat = categories.map(
        lambda x: ((x[8], x[0], x[1], x[3]), x[5])).groupByKey().mapValues(
            list).map(lambda x: x[0] + (float(np.median(x[1])), ))
    medians_topcat = categories.map(
        lambda x: ((x[9], x[0], x[1], x[3]), x[5])).groupByKey().mapValues(
            list).map(lambda x: x[0] + (float(np.median(x[1])), ))

    # group the per-entity medians by author
    grouped_cat = medians_cat.groupBy(lambda x: x[1])
    grouped_topcat = medians_topcat.groupBy(lambda x: x[1])

    ### category_entities feature vectors
    feature_vectors_cat = grouped_cat.map(
        lambda x: get_feature_vectors(x, entity_df, vectorizers_cats))
    feature_vectors_topcat = grouped_topcat.map(
        lambda x: get_feature_vectors(x, entity_df, vectorizers_topcats))

    feature_vectors_cat.saveAsPickleFile(
        '/user/username/data/output/_jobs/sentity_category_vec')
    feature_vectors_topcat.saveAsPickleFile(
        '/user/username/data/output/_jobs/sentity_topcategory_vec')
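# Illustration only (not part of the job): how the per-(category, author, entity)
# median sentiment above falls out of keyed (key, score) pairs. Values and the
# helper name are made up for the example.
def _demo_median_by_category(sc):
    import numpy as np
    rows = sc.parallelize([
        (('Sports', 'blackngold14', 'Personal foul (basketball)', 1100642), -0.1779),
        (('Sports', 'blackngold14', 'Personal foul (basketball)', 1100642), 0.4215),
    ])
    medians = rows.groupByKey().mapValues(list).map(
        lambda x: x[0] + (float(np.median(x[1])), ))
    # -> [('Sports', 'blackngold14', 'Personal foul (basketball)', 1100642, 0.1218)]
    # (median of the two toy scores is approximately 0.1218)
    return medians.collect()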
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    # /user/username/data/output/_jobs/author_subreddit/latest
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    subreddit_category_pickle = kwargs['subreddit-category-pickle']
    subreddit_df_pickle = kwargs['subreddit-df-pickle']
    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))
    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    # create vectorizer for each category out of subreddit_df dataset
    subreddits_grouped_by_categories = subreddits_to_categories(
        _subreddit_df, subreddit_category)
    _vectorizers = {}
    for k, v in subreddits_grouped_by_categories.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    # /user/username/data/output/author_category_vec/latest
    data = sc.pickleFile(input_path)
    authors_total = data.count()
    author_category = data.map(lambda x: get_feature_vectors(
        x, subreddit_df, authors_total, vectorizers))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)
    import nltk

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    # /user/username/data/output/sub_com_threads
    file = sc.textFile(input_path)
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.map(lambda x: top_level(x, botlist)).filter(lambda x: x)
    pickle.dump(
        pairs.collect(),
        open('/home/username/data/output/_jobs/pairs_winargs_malleability.pickle',
             'wb'))

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
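# sc.addFile(..., recursive=True) above ships the nltk_data directory to the
# executors, but NLTK on each worker still has to be pointed at that local copy.
# A minimal sketch of how a worker-side function might do this (assumption: the
# original job handles this inside top_level(), which is not shown here):
def _tokenize_on_worker(text):
    from pyspark import SparkFiles
    import nltk
    nltk_path = SparkFiles.get('nltk_data')  # local path of the distributed copy
    if nltk_path not in nltk.data.path:
        nltk.data.path.append(nltk_path)
    return nltk.word_tokenize(text)  # requires the 'punkt' model in nltk_data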
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(input_path)
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.flatMap(lambda x: top_level(x))
    pairs = pairs.filter(
        lambda x: x[0]['author'] not in botlist.value
        and x[1]['author'] not in botlist.value)
    ### TODO:
    ### - filter out pairs where sub['selftext'] or com['body'] is empty
    ### - or where author == '[removed]' / '[deleted]'

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    # /user/username/data/output/author_subreddits/latest
    file = sc.textFile(input_path)
    data = file.map(lambda l: json.loads(l))

    flattened = data.flatMap(lambda x: [y for y in x['subreddits']])
    reduced = flattened.map(lambda x: (x['subreddit'], 1)).reduceByKey(
        lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"subreddit": x[0], "df": x[1]})

    subreddit_df_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    subreddit_df_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/subreddit_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['subreddit']: s['df'] for s in collected}
    pickle.dump(_df_dict,
                open(local_output_root + '/subreddit_df.pickle', 'wb'))
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    # /user/username/data/output/author_category/latest
    data = sc.pickleFile(input_path)

    flattened = data.flatMap(lambda x: [k for k, v in x['categories'].items()])
    print(flattened, flattened.count())
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    print(reduced, reduced.count())
    mapped = reduced.map(lambda x: {"category": x[0], "df": x[1]})
    print(mapped, mapped.count())

    data_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    print(data_json, data_json.count())
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    data_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/category_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['category']: s['df'] for s in collected}
    pickle.dump(_df_dict,
                open(local_output_root + '/category_df.pickle', 'wb'))
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    file = sc.textFile(input_path)
    pairs = file.map(lambda l: json.loads(l))

    # keep the result under a different name so the lambda still resolves the
    # split() helper instead of the local RDD shadowing it
    split_pairs = pairs.flatMap(lambda x: split(x))

    pickle.dump(
        split_pairs.collect(),
        open('/home/username/data/output/_jobs/pairs_winargs_split.pickle',
             'wb'))
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    data = sc.pickleFile('/user/username/data/output/_jobs/author_entity_freq')

    flattened = data.flatMap(lambda x: [e[0][2] for e in x[1]])
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"entity_id": x[0], "df": x[1]})

    print('Saving to {0}...'.format(local_output_root + '/entity_df.pickle'))
    collected = mapped.collect()
    _df_dict = {s['entity_id']: s['df'] for s in collected}
    pickle.dump(_df_dict, open(local_output_root + '/entity_df.pickle', 'wb'))
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    # /user/username/data/output/_jobs/author_subreddit/latest
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    local_output_root = kwargs.get('local-output-root', None)

    subreddit_category_pickle = kwargs['subreddit-category-pickle']
    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))

    # /user/username/data/output/author_subreddit_vec/latest
    author_subreddit = sc.pickleFile(input_path)
    author_category = author_subreddit.map(
        lambda x: subreddits_to_categories(x, subreddit_category))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    category_df_pickle = kwargs['category-df-pickle']

    _category_df = pickle.load(
        open(local_output_root + '/category_df.pickle', 'rb'))
    category_df = sc.broadcast(_category_df)

    # fit the vectorizer on a single sample containing every category so its
    # vocabulary spans the full feature space
    _dv = DictVectorizer()
    features_vec = _dv.fit_transform([_category_df])
    dv = sc.broadcast(_dv)

    # /user/username/data/output/author_category/latest
    data = sc.pickleFile(input_path)
    authors_total = data.count()
    author_category_features = data.map(
        lambda x: get_feature_vectors(x, authors_total, category_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category_features.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    # fit the vectorizer on a single sample containing every subreddit so its
    # vocabulary spans the full feature space
    _dv = DictVectorizer()
    features_vec = _dv.fit_transform([_subreddit_df])
    dv = sc.broadcast(_dv)

    file = sc.textFile(input_path)
    data = file.map(lambda l: json.loads(l))
    authors_total = data.count()
    author_subreddit_features = data.map(
        lambda x: get_feature_vectors(x, authors_total, subreddit_df, dv))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_subreddit_features.saveAsPickleFile(output_path)
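# Illustration only (not part of the job): fitting the DictVectorizer on one
# mapping of {subreddit: df} fixes the feature space, so any author's
# {subreddit: count} dict transforms into a sparse row aligned to the same
# columns. Toy values; the real dicts come from subreddit_df.pickle.
def _demo_dictvectorizer():
    from sklearn.feature_extraction import DictVectorizer
    subreddit_df = {'askscience': 120, 'nba': 340, 'python': 75}
    dv = DictVectorizer()
    dv.fit([subreddit_df])                  # vocabulary_ now covers all subreddits
    author_counts = {'nba': 12, 'python': 3}
    vec = dv.transform([author_counts])     # 1 x 3 sparse row, zero where absent
    return dv.vocabulary_, vec.toarray()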
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    page_topics = pickle.load(
        open(local_output_root + '/page_topics.pickle', 'rb'))
    _entity_df = pickle.load(
        open(local_output_root + '/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    # precomputed document frequency per top-level topic
    topics_df = {
        'Academic disciplines': 169126, 'Arts': 165790, 'Business': 165670,
        'Concepts': 159671, 'Culture': 169696, 'Education': 162128,
        'Entertainment': 166557, 'Events': 157631, 'Geography': 164197,
        'Health': 168352, 'History': 166707, 'Humanities': 169517,
        'Language': 168451, 'Law': 163853, 'Life': 167678,
        'Mathematics': 157341, 'Nature': 167276, 'Other': 129536,
        'People': 144695, 'Philosophy': 163002, 'Politics': 167504,
        'Reference': 157377, 'Religion': 161830, 'Science': 167156,
        'Society': 170080, 'Sports': 158917, 'Technology': 167069,
        'Universe': 160159, 'World': 164604
    }

    topic_pages = pickle.load(
        open(local_output_root + '/topic_pages.pickle', 'rb'))

    # vectorizer for each topic
    _vectorizers = {}
    for k, v in topic_pages.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    # fit the topic-level vectorizer on a single sample covering every topic
    _dv = DictVectorizer()
    features_vec = _dv.fit_transform([topics_df])
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(input_path)
    authors_total = data.count()
    author_topic_entity_vec = data.filter(
        lambda x: len(x['topics']) > 0 and len(x['topics_freq']) > 0
    ).map(lambda x: get_feature_vectors(
        x, authors_total, entity_df, topics_df, vectorizers, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic_entity_vec.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    '''TODO:
    - load category_entities and topcategory_entities dicts
    - author_entity: group list of entities into categories / topcategories
    '''

    _subreddit_category = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_category_index.pickle',
             'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)

    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    category_entities = pickle.load(
        open('/home/username/data/output/_jobs/category_entities.pickle', 'rb'))
    topcategory_entities = pickle.load(
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'rb'))

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    # create vectorizer for each category out of entity_df dataset
    # _vectorizers_cats = {}
    # for k, v in category_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_cats[k] = dv
    # vectorizers_cats = sc.broadcast(_vectorizers_cats)
    # pickle.dump(_vectorizers_cats,
    #             open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'wb'))
    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)

    # _vectorizers_topcats = {}
    # for k, v in topcategory_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_topcats[k] = dv
    # vectorizers_topcats = sc.broadcast(_vectorizers_topcats)
    # pickle.dump(_vectorizers_topcats,
    #             open('/home/username/data/output/_jobs/vectorizers_topcats.pickle', 'wb'))
    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle', 'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    # author_entities_categories = sc.pickleFile('/user/username/data/output/_jobs/author_entity_categories')
    data = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')
    authors_total = data.count()

    ### calculating category_entities feature vectors
    author_entity_features = data.map(lambda x: get_feature_vectors(
        x, authors_total, entity_df, vectorizers_cats, vectorizers_topcats,
        subreddit_category, subreddit_topcategory))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_entity_features.saveAsPickleFile(output_path)