def main():
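    """For every bullying-positive survey response, compare message counts in the old and
    new datasets over the response week and its neighbouring weeks, and write the
    per-participant comparison to bullying_res.csv."""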
    parser = __define_process_parser()
    old_dataset_file, new_dataset_mapped, missing_data, \
    survey_file, location_to_store = __define_process_parser(True, parser)

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_mapped, delimiter_sym=',', remove_first=True)
    old_data_missing = hlp.readcsv(missing_data, delimiter_sym=',', remove_first=True)
    old_missing = __dictify(0, old_data_missing)
    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    week_list = week_info.keys()
    bullying_positives = __find_positive_survey(survey_file, week_info)
    if bullying_positives is None:
        print 'Exiting...'
        exit()

    ff = filterfields()
    old_data_weekly = hlp.divideintoweekly(old_dataset, week_info, ff, date_field=pr.m_time_sent)
    new_data_weekly = hlp.divideintoweekly(new_dataset, week_info, ff, date_field=nd.m_timecreated)
    bullying_res = [['pid_hash', 'survey_id', 'time_of_survey', 'n_old', 'n_new', 'raw', 'semi', 'ordered', 'other']]
    for datum in bullying_positives:
        bullying_week = datum[-1]
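        # look at the week before and after the positive response, clamped to the study's first/last week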
        prev_week = bullying_week - 1 if bullying_week > min(week_list) else min(week_list)
        next_week = bullying_week + 1 if bullying_week < max(week_list) else max(week_list)
        old_data_pos = old_data_weekly[prev_week] + old_data_weekly[bullying_week] + old_data_weekly[next_week]
        new_data_pos = new_data_weekly[prev_week] + new_data_weekly[bullying_week] + new_data_weekly[next_week]
        pid_hash = datum[s_i.s_participant]
        n_old, n_new, nfr_dict = compare_old_new(old_data_pos, new_data_pos, old_missing, pid_hash, ff)
        temp = [pid_hash, datum[s_i.s_id], datum[s_i.s_time], n_old, n_new, nfr_dict['raw'], nfr_dict['semi'],
                nfr_dict['ordered'], nfr_dict['other']]
        bullying_res.append(temp)
    hlp.writecsv(bullying_res, location_to_store+'bullying_res.csv', delimiter_sym=',')
def main():
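    """Run per-participant reciprocity and polarity analyses on labelled message data,
    either over the whole study or, when a weekly survey file is given, over bimonthly
    (roughly 8-week) chunks."""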
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]
    if weekly_surveys is None:
        reciprocity_info, polarity_info = individual_reciprocity_analysis(labelled_data, pid_dict['participants'],
                                                                          location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(bi_month_data, pid_dict['participants'],
                                                                              location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_bimonthly_'+str(idx)+'.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_bimonthly_'+str(idx)+'.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1

    print 'tadaa!'
def __get_weekly_counts(dataset, field_to_search, to_equate, weekly_info, ff_obj, sorted_week_list, pid_hash,
                        is_old=False):
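    """Filter the dataset to rows matching pid_hash on field_to_search, optionally
    de-duplicate old-format rows, and return the weekly message counts, the filtered
    rows, and their per-week breakdown."""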
    out_in = ff_obj.filterbyequality(field_to_search, pid_hash, data=dataset)
    if is_old:
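        # drop duplicate old-format rows, keyed on everything except the first column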
        n_old_data = len(out_in)
        no_dup_old_data = []
        no_dup_dict = {}
        for datum in out_in:
            if tuple(datum[1:]) not in no_dup_dict:
                no_dup_dict[tuple(datum[1:])] = datum
        for unique_msg in no_dup_dict:
            no_dup_old_data.append(no_dup_dict[unique_msg])
        n_no_dup_old_data = len(no_dup_old_data)
        print 'With duplicates: '+str(n_old_data)+', without: '+str(n_no_dup_old_data)
        out_in = no_dup_old_data
    per_week = hlp.divideintoweekly(out_in, weekly_info, ff_obj)
    weekly_counts = [len(per_week[x]) for x in sorted_week_list]
    return weekly_counts, out_in, per_week
def get_pid_level_features(data_to_use, weekly_info, ff, bullying_overlay, pid_dict, current_pid, fe):
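    """Build per-participant features: weekly data merged with the bullying overlay,
    global in/out degree, global counts of incoming and outgoing messages, and weekly
    incoming/outgoing sentiment scores."""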
    pid_weekly_data = hlp.divideintoweekly(data_to_use, weekly_info, ff)
    pid_weekly_w_bullying = merge_bullying_data(bullying_overlay, pid_weekly_data, pid_dict[current_pid])

    # get the total degree for the pid, in and out, global
    global_in_degree, global_out_degree = fe.get_week_degree(data_to_use, pid_dict[current_pid])

    # get the total incoming, and outgoing messages, global
    pid_total_incoming, pid_total_outgoing = fe.get_in_out_data(data_to_use, pid_dict[current_pid])
    global_in_ew = len(pid_total_incoming)
    global_out_ew = len(pid_total_outgoing)

    # weekly sentiment score
    weekly_sentiment_score = fe.get_sentiment_score(pid_weekly_data, pid_dict[current_pid],
                                                    separate_in_out=True)
    incoming_ss = {}
    outgoing_ss = {}
    for week_no in weekly_sentiment_score:
        incoming_ss[week_no] = weekly_sentiment_score[week_no]['In']
        outgoing_ss[week_no] = weekly_sentiment_score[week_no]['Out']
    return pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, incoming_ss, outgoing_ss
def main():
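    """Compute reciprocity counts and message pairs from labelled message data, either
    over the whole study or, when a weekly survey file is given, over bimonthly
    (roughly 8-week) chunks."""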
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs], 'reciprocity_counts_msgPairs_overall', location_to_store)
    else:
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1
def main():
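    """Label sms and fb_message data with sentiment (VADER or AFINN), then run triad and
    word-count analyses; with a weekly survey file the analysis is done per bimonthly
    chunk (plus per-week sentiment heatmaps), otherwise overall (plus polarity and
    degree summaries)."""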
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Sentiment Message file')
    parser.add_argument('-t', '-T', action='store_true',
                        help='Sentiment type flag, if used then vader, else afinn')
    parser.add_argument('-f', '-F', required=True,
                        help='Folder to store checkpoints, and final result')
    parser.add_argument('-w', '-W', required=False,
                        help='Per week/month analysis')

    args = parser.parse_args()
    message_file = args.m
    sentiment_type = args.t
    location_to_store = args.f
    survey_file = args.w

    # get message data, only sms and fb_message
    ff = filterfields(message_file)
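    # drop the first (header) row before filtering by message type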
    ff.setdata(ff.getdata()[1:])
    sms_data = ff.filterbyequality(pr.m_type, 'sms')
    pid_dict_sms = hlp.getuniqueparticipants2(sms_data)
    fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message')
    pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data)
    message_data = sms_data + fb_message_data

    # put the labels on
    labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \
        hlp.processafinnsentiment(message_data, label_only=False)

    if survey_file is not None:
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(survey_file)
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

    #__temp_testing_for_discrepancy(labelled_data, weekly_data)

    # get the pid_dict for easier handling
    pid_dict = hlp.getuniqueparticipants2(labelled_data)
    if survey_file is not None:
        over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data, pid_dict['participants'])
        __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_over.pdf')
        __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_in.pdf')
        __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_out.pdf')

    print '***SMS***'
    print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values())

    print '***FB***'
    print 'P: ', len(pid_dict_fb['participants'].values()), 'NP: ', len(pid_dict_fb['nonparticipants'].values())

    print '***OVERALL***'
    print 'P: ', len(pid_dict['participants'].values()), 'NP: ', len(pid_dict['nonparticipants'].values())

    summary_src_trg = summarize_message_by_src_trg(labelled_data)
    print '***Message Distribution***'
    for m_type_1 in summary_src_trg:
        print m_type_1, summary_src_trg[m_type_1]

    if survey_file is not None:
        week_list = weekly_data.keys()
        week_list.sort()
        # this is not good, as there aren't enough triads
        months = [[1, 2, 3, 4],
                  [5, 6, 7, 8],
                  [9, 10, 11, 12],
                  [13, 14, 15, 16],
                  [17, 18, 19, 20],
                  [21, 22, 23, 24, 25]]
        # this has at least 8 triads, always, use this
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        month_idx = 1
        for month in months2:
            labelled_data = []
            for week in month:
                labelled_data.extend(weekly_data[week])
            general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
            frac_triad = general_graph[3]
            summary_triad = general_graph[2]
            frac_triad_rand = random_graph[3]
            summary_triad_rand = random_graph[2]
            print '*** Months ', 2*month_idx-1, 2*month_idx, ': ', month, ' ***'
            print 'len(LD): ', len(labelled_data)
            for summary in frac_triad:
                print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
            words_list, short_list = word_count(labelled_data)
            toWrite_wl_csv = create_word_count_csv(words_list)
            hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list_'+str(2*month_idx-1)+'-'+str(2*month_idx)+'.csv',
                         delimiter_sym=',')
            for mtype in words_list:
                counted_words = Counter(words_list[mtype])
                counted_short = Counter(short_list[mtype])
                print '***For '+mtype+' ***'
                print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
                print 'Top 20 short: ', counted_short.most_common(20)
                print '\n\n'
            hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'month_'+str(month_idx)+'.list', location_to_store)
            month_idx += 1
    else:
        print 'len(LD): ', len(labelled_data)
        words_list, short_list = word_count(labelled_data)
        toWrite_wl_csv = create_word_count_csv(words_list)
        hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list.csv', delimiter_sym=',')
        for mtype in words_list:
            counted_words = Counter(words_list[mtype])
            counted_short = Counter(short_list[mtype])
            print '***For '+mtype+' ***'
            print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
            print 'Top 20 short: ', counted_short.most_common(20)
            print '\n\n'
        general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
        frac_triad = general_graph[3]
        summary_triad = general_graph[2]
        frac_triad_rand = random_graph[3]
        summary_triad_rand = random_graph[2]
        for summary in frac_triad:
            print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
        hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'Overall.list', location_to_store)
        # plot_degree_dist(general_graph[4], 'Degree(d)', '# of Participants with Degree d')
        pos, neg, neu = get_polarity_directionality(labelled_data)
        print '***Polarity Distribution***'
        print 'Positive: \n', pos
        print 'Negative: \n', neg
        print 'Neutral: \n', neu

        in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants'])
        print '***Incoming Messages***'
        print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m)
        print '***Outgoing Messages***'
        print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m)
        print '***In Degree***'
        print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d)
        print '***Out Degree***'
        print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d)
        print '***COUNTS***'
        plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.',
                      location_to_store+'in_out_messages.pdf')
        # plot_messages_degree(out_m, '# of Outgoing Messages', 'Cumulative Participant Prob.',
        #               location_to_store+'out_messages.pdf')
        plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.',
                      location_to_store+'in_out_degree.pdf', True)
        # plot_messages_degree(out_d, 'Out Degree', 'Cumulative Participant Prob.',
        #               location_to_store+'out_degree.pdf', True)
    print 'TADAA!!'
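
# Entry-point sketch (an assumption; no guard appears in the code above): each main()
# would typically be invoked as
#
#   if __name__ == '__main__':
#       main()
#
# so the sentiment script above could then be run as, e.g. (hypothetical file names):
#   python sentiment_triads.py -m messages.csv -t -f /path/to/output/ -w survey.csv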