def test_merge_messages(self):
    new_df = merge_messages(self.df2,
                            max_time_diff=td(weeks=4),
                            string_similar_threshold=50)
    self._check_mid_year(new_df)
    assert_true(len(new_df[new_df['message_id'] == 71808]) > 0)
def main():
    import argparse

    import pandas as pd
    from datetime import timedelta

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', required=True)
    parser.add_argument('--hashtag_ban')

    args = parser.parse_args()

    try:
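        # prefer the JSON dump; fall back to the pickle below if the JSON
        # file is missing or unreadable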
        df = pd.read_json('data/{}/interactions.json'.format(args.dataset))
    except (ValueError, IOError):
        df = pd.read_pickle('data/{}/interactions.pkl'.format(args.dataset))
        
    # keep a single row per message_id
    df = df.drop_duplicates(subset=['message_id'])
    # normalize hashtags: lowercase, then de-duplicate within each message
    df['hashtags'] = df['hashtags'].apply(
        lambda hs: list(set(h.lower() for h in hs))
    )

    if args.hashtag_ban:
        # use a list comprehension rather than filter(): in Python 3,
        # filter() is lazy, so the column would hold unpicklable filter
        # objects instead of lists
        df['hashtags'] = df['hashtags'].apply(
            lambda hs: [h for h in hs if h != args.hashtag_ban]
        )

    df = remove_mentions_and_urls(df)
    df = df[df['body'].map(len) > 10]  # filter short body
    # df = df[df['body'].map(detect_lan) == 'en']  # non english

    # collapse near-duplicate messages sent within 30 minutes of each other
    # (keyword names as in the test calls)
    df = merge_messages(df,
                        max_time_diff=timedelta(minutes=30),
                        string_similar_threshold=50,
                        time_field='datetime')

    # df.to_json('data/{}/interactions_new.json'.format(args.dataset),
    #            orient='records')
    df.to_pickle('data/{}/interactions.pkl'.format(args.dataset))
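
# Example invocation (the script name here is illustrative):
#   python preprocess.py -d enron --hashtag_ban jobs

# merge_messages itself is defined elsewhere in this repo. The sketch below is
# only a rough illustration of the contract the calls above rely on: the
# signature is taken from the keyword arguments used in the tests, while the
# similarity metric and merge policy are assumptions, not the project's
# implementation.
def _merge_messages_sketch(df, max_time_diff, string_similar_threshold,
                           time_field='datetime'):
    from difflib import SequenceMatcher

    import pandas as pd

    def _similar(a, b):
        # percent similarity via difflib's ratio (an assumed stand-in metric)
        return SequenceMatcher(None, a, b).ratio() * 100

    # assumes the time_field column holds parsed datetimes
    kept = []  # earliest representative of each near-duplicate group
    for _, row in df.sort_values(time_field).iterrows():
        dup = any(abs(row[time_field] - prev[time_field]) <= max_time_diff
                  and _similar(row['body'], prev['body'])
                      >= string_similar_threshold
                  for prev in kept)
        if not dup:
            kept.append(row)
    return pd.DataFrame(kept)
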
def test_twitter_case(self):
    df = merge_messages(self.df3,
                        max_time_diff=td(days=1),
                        string_similar_threshold=50,
                        time_field='datetime')
    assert_equal(592820666782580736, df.iloc[0]['message_id'])
    assert_equal(1, len(df))

def filter_and_save(df):
    # df = process_message_body(df)
    # df = df[df['body'].map(len) > 10]  # filter short body

    # potential: 14557
    # drop messages from blacklisted senders
    black_list = [256, 1175]
    df = df[~df['sender_id'].isin(black_list)]

    df.to_json('data/enron/interactions.json', orient="records")
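
# filter_and_save is run manually when needed; see the commented-out call at
# the bottom of this file.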


if __name__ == '__main__':
    import argparse

    import pandas as pd
    from datetime import timedelta

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', required=True)
    args = parser.parse_args()

    df = pd.read_json('data/{}/interactions.json'.format(args.dataset))
    # re-merge with a looser window (4 weeks) and a stricter similarity
    # threshold (70), then overwrite the file in place
    df = merge_messages(df, timedelta(weeks=4), 70, 'datetime')
    df.to_json('data/{}/interactions.json'.format(args.dataset),
               orient='records')
    # frequent_senders = df['sender_id'].value_counts().index[20:40]
    # for s in frequent_senders:
    #     print(s)
    #     print(df[df['sender_id'] == s]['subject'][:10])
    #     print('*' * 20)

    # filter_and_save(df)