def test_merge_messages(self):
    new_df = merge_messages(self.df2,
                            max_time_diff=td(weeks=4),
                            string_similar_threshold=50)
    self._check_mid_year(new_df)
    assert_true(len(new_df[new_df['message_id'] == 71808]) > 0)
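# The tests in this section exercise merge_messages, which is defined
# elsewhere in this repo. Below is a minimal sketch of the assumed behavior,
# for reference only: greedily fold a message into the previously kept one
# when (a) the time gap is within max_time_diff and (b) the bodies are
# similar enough. The name merge_messages_sketch is hypothetical, and the
# use of difflib.SequenceMatcher is an assumption; the project may use a
# different similarity measure (the threshold appears to be on a 0-100 scale).
from datetime import timedelta as td
from difflib import SequenceMatcher

import pandas as pd


def merge_messages_sketch(df, max_time_diff, string_similar_threshold,
                          time_field='datetime'):
    df = df.sort_values(time_field)
    kept_rows = []
    for _, row in df.iterrows():
        if kept_rows:
            prev = kept_rows[-1]
            close_in_time = (row[time_field] - prev[time_field]) <= max_time_diff
            similarity = 100 * SequenceMatcher(
                None, prev['body'], row['body']).ratio()
            if close_in_time and similarity >= string_similar_threshold:
                # treat as a near-duplicate of the previous message; drop it
                continue
        kept_rows.append(row)
    return pd.DataFrame(kept_rows)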
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', required=True)
    parser.add_argument('--hashtag_ban')
    args = parser.parse_args()

    try:
        df = pd.read_json('data/{}/interactions.json'.format(args.dataset))
    except (ValueError, IOError):
        df = pd.read_pickle('data/{}/interactions.pkl'.format(args.dataset))

    df = df.drop_duplicates(subset=['message_id'])

    # normalize hashtags to lower case and deduplicate within each message
    df['hashtags'] = df['hashtags'].apply(
        lambda hs: list(set(h.lower() for h in hs))
    )
    if args.hashtag_ban:
        # list comprehension instead of filter(): under Python 3, filter()
        # would store a lazy iterator in the column rather than a list
        df['hashtags'] = df['hashtags'].apply(
            lambda hs: [h for h in hs if h != args.hashtag_ban]
        )

    df = remove_mentions_and_urls(df)
    df = df[df['body'].map(len) > 10]  # filter out messages with short bodies
    # df = df[df['body'].map(detect_lan) == 'en']  # non-English

    df = merge_messages(df, timedelta(minutes=30), 50, 'datetime')

    # df.to_json('data/{}/interactions_new.json'.format(args.dataset),
    #            orient='records')
    df.to_pickle('data/{}/interactions.pkl'.format(args.dataset))
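# main() relies on remove_mentions_and_urls, also defined elsewhere in this
# repo. A plausible sketch, assuming it strips @mentions and URLs from the
# 'body' column and returns a new frame; the regexes and the _sketch name
# are illustrative, not the project's actual implementation.
import re

import pandas as pd

MENTION_RE = re.compile(r'@\w+')
URL_RE = re.compile(r'https?://\S+')


def remove_mentions_and_urls_sketch(df):
    df = df.copy()
    df['body'] = df['body'].apply(
        lambda s: MENTION_RE.sub('', URL_RE.sub('', s)).strip()
    )
    return df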
def test_twitter_case(self):
    df = merge_messages(self.df3,
                        max_time_diff=td(days=1),
                        string_similar_threshold=50,
                        time_field='datetime')
    assert_equal(592820666782580736, df.iloc[0]['message_id'])
    assert_equal(1, len(df))
def filter_and_save(df):
    # df = process_message_body(df)
    # df = df[df['body'].map(len) > 10]  # filter short body
    # potential: 14557

    # drop all messages from black-listed sender ids
    black_list = [256, 1175]
    for s in black_list:
        df = df[df['sender_id'] != s]
    df.to_json('data/enron/interactions.json', orient='records')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset')
    args = parser.parse_args()

    df = pd.read_json('data/{}/interactions.json'.format(args.dataset))
    df = merge_messages(df, timedelta(weeks=4), 70, 'datetime')
    df.to_json('data/{}/interactions.json'.format(args.dataset))

    # frequent_senders = df['sender_id'].value_counts().index[20:40]
    # for s in frequent_senders:
    #     print(s)
    #     print(df[df['sender_id'] == s]['subject'][:10])
    #     print('*' * 20)
    # filter_and_save(df)
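# Example invocation, assuming the preprocessed JSON already exists at
# data/enron/interactions.json; the script name is illustrative, since it is
# not given in this file:
#
#     python merge_interactions.py -d enron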