def individual_reciprocity_analysis(labelled_data, pid_dict, location_to_store):
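    # per participant, count (sent polarity -> reply polarity) transitions, keyed by target
    # type ('P' participant / 'NP' non-participant) and target id; 'X' marks unanswered
    # messages; also collects each participant's overall polarity composition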
    reciprocity_info = {}
    ff = filterfields()
    ff.setdata(labelled_data)
    polarity_data = {}
    for pid in pid_dict:
        print 'Working with PID: ', pid, '(', pid_dict[pid], ')'
        messages_by_participant = ff.filterbyequality(pr.m_source, pid)
        messages_to_participant = ff.filterbyequality(pr.m_target, pid)
        polarity_data[pid] = __get_polarity_composition(messages_by_participant+messages_to_participant, pid)
        reciprocity_info[pid] = {}
        n = len(messages_by_participant)
        idx = 0
        for message in messages_by_participant:
            print 'idx=' + str(idx) + '/' + str(n)
            idx += 1
            closest_message = find_closest_message(message, messages_to_participant, ff)
            target_type = 'P' if message[pr.m_target_type] == 'participant' else 'NP'
            target = message[pr.m_target]
            if target_type not in reciprocity_info[pid]:
                reciprocity_info[pid][target_type] = {}
            if target not in reciprocity_info[pid][target_type]:
                reciprocity_info[pid][target_type][target] = __basic_reciprocity_dict()
            sent_message_type = message[-1]
            reply_message_type = 'X' if closest_message is None else closest_message[-1]
            reciprocity_info[pid][target_type][target][sent_message_type][reply_message_type] += 1
        print 'saving checkpoint...'
        hlp.dumpvariable([reciprocity_info, pid, pid_dict], 'checkpoint.chp', location_to_store)
        print 'saved!'
    return reciprocity_info, polarity_data
def find_reciprocity(labelled_data, location_to_store):
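    # count, over all participant-sent messages, how often a message of each polarity
    # (P/N/U) is answered by a reply of each polarity; 'X' means no reply was found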
    ff = filterfields()
    ff.setdata(labelled_data)
    messages_sent_by_participants = ff.filterbyequality(pr.m_source_type, 'participant')
    reciprocity_dict = {'P': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'N': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'U': {'P': 0, 'U': 0, 'N': 0, 'X': 0}}
    n = len(messages_sent_by_participants)
    idx = 1
    message_pairs = []
    for message in messages_sent_by_participants:
        print 'at message ', idx, ' of ', n
        idx += 1
        reply_message = find_closest_message(message, ff)
        sent_message_type = message[-1]
        if reply_message is None:
            reply_message_type = 'X'
        else:
            reply_message_type = reply_message[-1]
        reciprocity_dict[sent_message_type][reply_message_type] += 1
        message_pairs.append((message, reply_message))
        if 0 == idx%500:
            print 'saving...'
            hlp.dumpvariable([idx, reciprocity_dict, message_pairs, messages_sent_by_participants],
                             'checkpoint.chp', location_to_store)
    print 'done... out of the loop'
    to_use = {'P': '+', 'N': '-', 'U': 'u', 'X': 'null'}
    for sent_type in reciprocity_dict:
        recvd_types = reciprocity_dict[sent_type]
        for recvd_type in recvd_types:
            print 'N('+to_use[recvd_type]+'|'+to_use[sent_type]+')=', recvd_types[recvd_type]

    return reciprocity_dict, message_pairs
def getstats(filepath, participant_dict, p_type, message_type='sms'):
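    # for each participant and each of their surveys, gather graph stats for the week
    # leading up to the survey and for the period from the study start up to that week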
    ff = filterfields(filepath)
    ff.setdata(ff.filterbyequality(pr.m_type, message_type))
    participant_stats = {}
    for participant_id in participant_dict:
        survey_no_list = participant_dict[participant_id]
        p_data = ff.filterbyequality(pr.m_source, participant_id) + \
                 ff.filterbyequality(pr.m_target, participant_id)
        if [] == p_data:
            print 'no data exists for pid: ' + participant_id
            continue
        pid_dict = hlp.getuniqueparticipants(p_data)
        participant_stats[participant_id] = {}
        for survey_no in survey_no_list:
            print 'Participant no.', participant_id, ' S.no.: ', survey_no
            idx = survey_no
            survey_datum = survey_no_list[survey_no][0]
            end_date = ff.converttodate(survey_datum[sr.s_time])
            start_date = end_date - dt.timedelta(days=7)
            data_between_dates = ff.filterbetweendates(start_date, end_date, data_to_work=p_data)
            original_start_date = ff.converttodate(pr.start_datetime)
            data_start_to_date = ff.filterbetweendates(original_start_date, start_date, data_to_work=p_data)
            between_stats, before_stats = graphstats(data_start_to_date, data_between_dates, participant_id, p_type,
                                                     original_start_date, start_date, ff, pid_dict)
            temp_dict = {'between': between_stats, 'before': before_stats, 'pid_dict': pid_dict}
            # keep the results for every survey instead of overwriting with the last one
            participant_stats[participant_id][idx] = temp_dict
    return participant_stats
def main():
    parser = __define_process_parser()
    old_dataset_file, new_dataset_mapped, missing_data, \
    survey_file, location_to_store = __define_process_parser(True, parser)

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_mapped, delimiter_sym=',', remove_first=True)
    old_data_missing = hlp.readcsv(missing_data, delimiter_sym=',', remove_first=True)
    old_missing = __dictify(0, old_data_missing)
    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    week_list = week_info.keys()
    bullying_positives = __find_positive_survey(survey_file, week_info)
    if bullying_positives is None:
        print 'Exiting...'
        exit()

    ff = filterfields()
    old_data_weekly = hlp.divideintoweekly(old_dataset, week_info, ff, date_field=pr.m_time_sent)
    new_data_weekly = hlp.divideintoweekly(new_dataset, week_info, ff, date_field=nd.m_timecreated)
    bullying_res = [['pid_hash', 'survey_id', 'time_of_survey', 'n_old', 'n_new', 'raw', 'semi', 'ordered', 'other']]
    for datum in bullying_positives:
        bullying_week = datum[-1]
        prev_week = bullying_week - 1 if bullying_week > min(week_list) else min(week_list)
        next_week = bullying_week + 1 if bullying_week < max(week_list) else max(week_list)
        old_data_pos = old_data_weekly[prev_week] + old_data_weekly[bullying_week] + old_data_weekly[next_week]
        new_data_pos = new_data_weekly[prev_week] + new_data_weekly[bullying_week] + new_data_weekly[next_week]
        pid_hash = datum[s_i.s_participant]
        n_old, n_new, nfr_dict = compare_old_new(old_data_pos, new_data_pos, old_missing, pid_hash, ff)
        temp = [pid_hash, datum[s_i.s_id], datum[s_i.s_time], n_old, n_new, nfr_dict['raw'], nfr_dict['semi'],
                nfr_dict['ordered'], nfr_dict['other']]
        bullying_res.append(temp)
    hlp.writecsv(bullying_res, location_to_store+'bullying_res.csv', delimiter_sym=',')
def __find_positive_survey(survey_file, week_info):
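    # pull the question-4 (bullying) responses from the survey file and tag each with the
    # study week it falls in; returns None if a response lies outside every known week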
    week_no = week_info.keys()
    week_no.sort()

    ff = filterfields()
    s_obj = surveys()

    survey_data = hlp.readcsv(survey_file, delimiter_sym=',')
    n_data = s_obj.interpretanswers(survey_data, True)
    bullying_positives = ff.filterbyequality(s_i.s_qno, '4', data=n_data[1:])

    new_bullying_positives = []
    for datum in bullying_positives:
        datetime_of_survey = ff.converttodate(datum[s_i.s_time])
        found_match = False
        for week in week_no:
            (start_date, end_date) = week_info[week]
            if start_date <= datetime_of_survey <= end_date:
                datum.append(week)
                new_bullying_positives.append(datum)
                found_match = True
                break
        if not found_match:
            print 'Something funky happened...', datum
            return None
    return new_bullying_positives
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '--messageFile', type=str, required=True)
    parser.add_argument('-mt', '--messageTypes', type=str, nargs='+')
    parser.add_argument('-o', '--outputFolder', type=str, required=True)
    parser.add_argument('-of', '--outputFile', type=str, required=True)
    parser.add_argument('-pd', '--participantDictionary', type=str)
    parser.add_argument('-i', '--ignoreParticipants', type=str)
    parser.add_argument('-mc', '--messageTypeConvert', type=str, nargs='*')

    args = parser.parse_args()

    message_file = args.messageFile
    message_types = args.messageTypes
    output_folder = args.outputFolder
    output_file = args.outputFile
    pid_dict = args.participantDictionary
    ignore_pids = args.ignoreParticipants
    message_type_conversions = args.messageTypeConvert

    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])

    to_set_data = []

    # extract the relevant data
    for message_type in message_types:
        to_set_data.extend(ff.filterbyequality(pr.m_type, message_type))

    ff.setdata(to_set_data)

    if ignore_pids is not None:
        ignore_pids = hlp.recovervariable(ignore_pids)
        for pid in ignore_pids:
            ff.removebyequality(pr.m_source, pid)
            ff.removebyequality(pr.m_target, pid)


    # set the pid to normal id dictionary
    if pid_dict is None:
        pid_dict = hlp.getuniqueparticipants(ff.getdata(), mtype='all', separate_pid_npid=True)
    else:
        # assumption: the -pd argument points at a dictionary previously saved with hlp.dumpvariable
        pid_dict = hlp.recovervariable(pid_dict)

    # replace the message type names with the ones provided
    if message_type_conversions is not None:
        for idx in range(0, len(message_type_conversions), 2):
            message_to_convert = message_type_conversions[idx]
            to_convert_to = message_type_conversions[idx+1]
            ff.replacebyequality(pr.m_type, message_to_convert, to_convert_to)

    message_types = ff.getuniqueelements(pr.m_type)
    coded_participant_list = pid_dict[pr.participant['all']].values()
    storage_dict = initiatestorage(coded_participant_list, message_types)
    storage_dict = getperparticipantinout(ff.getdata(), storage_dict, pid_dict)
    plotperparticipantbar(storage_dict, 'Participant ID', '# of Messages', message_types, 'Per Participant Messages',
                          output_folder+output_file)
    hlp.dumpvariable(pid_dict, 'pid_dict.dict', output_folder)
    hlp.dumpvariable(ff.getdata(), 'messageData.list', output_folder)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '-O', required=True, help='Old dataset csv')
    parser.add_argument('-n', '-N', required=True, help='New dataset csv')
    parser.add_argument('-s', '-S', required=True, help='Survey file')
    parser.add_argument('-p', '-P', required=True, help='folder to store figures in, should end with /')
    parser.add_argument('-m', '-M', required=True, help='Master hash mapping csv')
    parser.add_argument('-mt', '-MT', required=True, nargs='+', help='Types of messages to look for')
    parser.add_argument('-d', '-D', action='store_true', help='Flag to debug')

    args = parser.parse_args()

    old_dataset_file = args.o
    new_dataset_file = args.n
    survey_file = args.s
    location_to_store = args.p
    master_hash_csv = args.m
    message_types = args.mt
    do_debug = args.d

    print 'Reading data...'
    master_csv = hlp.readcsv(master_hash_csv, delimiter_sym=',', remove_first=True)
    master_dict = {datum[1]: datum[0] for datum in master_csv}

    ff = filterfields()

    filtered_old = []
    filtered_new = []

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',', remove_first=True)

    print 'Filtering message types'
    for message_type in message_types:
        filtered_old.extend(ff.filterbyequality(pr.m_type, message_type, data=old_dataset))
        filtered_new.extend(ff.filterbyequality(pr.m_type, message_type, data=new_dataset))

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(survey_file)
    week_list = weekly_info.keys()
    week_list.sort()

    print 'Creating in out dictionary'
    in_out_message_dict = get_message_counts(filtered_old, filtered_new, week_list, weekly_info, master_dict, ff,
                                             location_to_store, do_debug)

    print 'Plotting...'
    for pid in in_out_message_dict:
        print pid
        plot_distribution(in_out_message_dict[pid][0][0], in_out_message_dict[pid][0][1],
                          in_out_message_dict[pid][1][0], in_out_message_dict[pid][1][1], week_list, pid,
                          location_to_store)
    print 'TADAA!!'
 def filterweeklydata(self, pid_dict, message_list, week_info, message_type='sms'):
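     # bucket the labelled messages into per-participant weekly distributions using the
     # week boundaries supplied in week_info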
     participant_dict = pid_dict[pr.participant[message_type]]
     ff_obj = filterfields('')
     ff_obj.setdata(message_list)
     min_week = min(week_info.keys())
     max_week = max(week_info.keys())
     min_date = week_info[min_week][0]
     max_date = week_info[max_week][1]
     weekly_dist = {}
     for pid in participant_dict.keys():
         weekly_dist[pid] = self.__perparticipantprocessing(pid, ff_obj, curr_min=min_date, curr_max=max_date,
                                                            send_week_info=False, week_info=week_info)
     return weekly_dist
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filepath for message file')
    parser.add_argument('-o', '-O', type=str, required=True,
                        help='path to store figure in')
    parser.add_argument('-mt', '-MT', type=str, nargs='*')
    parser.add_argument('-c', '-C', type=str, nargs='*')

    args = parser.parse_args()
    message_filename = args.f
    output_location = args.o
    message_types = args.mt
    combine_change = args.c
    to_combine = None if combine_change is None else []
    to_change_to = None if combine_change is None else []
    if combine_change is not None:
        for idx in range(0, len(combine_change), 2):
            to_combine.append(combine_change[idx].lower())
            to_change_to.append(combine_change[idx+1])

    ff = filterfields(message_filename)
    ff.setdata(ff.getdata()[1:])
    message_types = ff.getuniqueelements(pr.m_type) if message_types is None else message_types
    numbers = []
    for message_type in message_types:
        numbers.append(len(ff.filterbyequality(pr.m_type, message_type)))
    if to_combine is not None:
        numbers, message_types = combineandconvert(numbers, message_types, to_combine, to_change_to)
    for idx in range(len(numbers)):
        print message_types[idx], numbers[idx]
    fig = plt.figure(figsize=[12, 12])
    ax = fig.add_subplot(111)
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
    for i in range(len(tableau20)):
        r, g, b = tableau20[i]
        tableau20[i] = (r / 255., g / 255., b / 255.)
    patches, texts, autotexts = ax.pie(numbers, labeldistance=1.05,
                                       colors=tableau20, autopct='%1.1f%%', startangle=90)
    for idx in range(len(texts)):
        texts[idx].set_fontsize(20)
    for idx in range(len(autotexts)):
        autotexts[idx].set_fontsize(20)
    plt.legend(labels=message_types, loc='upper right', fontsize=20, bbox_to_anchor=(1.05, 1))
    plt.savefig(output_location, bbox_inches='tight')
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]
    if weekly_surveys is None:
        reciprocity_info, polarity_info = individual_reciprocity_analysis(labelled_data, pid_dict['participants'],
                                                                          location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(bi_month_data, pid_dict['participants'],
                                                                              location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_bimonthly_'+str(idx)+'.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_bimonthly_'+str(idx)+'.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1

    print 'tadaa!'
def main():
    parser = argparse.ArgumentParser('Script to generate distribution '
                                     'of edge weights/degrees for all '
                                     'participants')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='all',
                        help='types of messages to plot, currently supports '
                             'one of the following: sms, fb, twitter, or all')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store data in, leading / required')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to generate plots')

    args = parser.parse_args()
    survey_file = args.r
    message_file = args.m
    m_type = args.mt
    folder_to_store = args.s
    generate_plots = args.p

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)

    ff = filterfields(message_file)
    filtered_data = []
    if m_type == 'all':
        for message_type in ['sms', 'fb_message']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, message_type))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, m_type)
    _, links_tuple, _, pid_dict = hlp.creategraph(filtered_data, filterType=args.mt)
    gh = ghelper()
    plt = plots()
    weekly_deg_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                   is_degree=True, week_info=week_info)
    hlp.dumpvariable(weekly_deg_dist, 'weekly_deg_dist.dict', folder_to_store)
    weekly_ew_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                  is_degree=False, week_info=week_info)
    hlp.dumpvariable(weekly_ew_dist, 'weekly_ew_dist.dict', folder_to_store)
    if generate_plots:
        plt.plotweeklyprogression(weekly_deg_dist, folder_to_store + 'deg_', 'No. of friends',
                                  'Week No.', 'Friends')
        plt.plotweeklyprogression(weekly_ew_dist, folder_to_store + 'ew_', 'No. of messages exchanged',
                                  'Week No.', 'Messages')

    print 'done...'
def get_degree_message_count(dataset, pid_dict):
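    # per participant: counts of incoming/outgoing messages (in_m/out_m) and of distinct
    # senders/recipients (in_d/out_d) in the new dataset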
    ff = filterfields()
    ff.setdata(dataset)
    in_d = {}
    out_d = {}
    in_m = {}
    out_m = {}

    for pid in pid_dict:
        incoming_messages = ff.filterbyequality(nd.m_target, pid)
        outgoing_messages = ff.filterbyequality(nd.m_source, pid)
        people_sending_me_messages = ff.getuniqueelements(nd.m_source, data=incoming_messages)
        people_i_am_sending_messages = ff.getuniqueelements(nd.m_target, data=outgoing_messages)
        in_m[pid] = len(incoming_messages)
        out_m[pid] = len(outgoing_messages)
        in_d[pid] = len(people_sending_me_messages)
        out_d[pid] = len(people_i_am_sending_messages)

    return in_m, out_m, in_d, out_d
def generate_new_dataset_dictionary(new_dataset, use_m_id=False):
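    # index the new dataset either by message id (use_m_id=True) or as
    # (source, target) -> message type -> creation datetime -> [messages]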
    ff = filterfields()
    new_dataset_dictionary = {}
    for datum in new_dataset:
        if not use_m_id:
            src = datum[nd.m_source]
            trg = datum[nd.m_target]
            if (src, trg) not in new_dataset_dictionary:
                new_dataset_dictionary[(src, trg)] = {}
            message_type = datum[nd.m_type]
            if message_type not in new_dataset_dictionary[(src, trg)]:
                new_dataset_dictionary[(src, trg)][message_type] = {}
            create_time = datum[nd.m_timecreated]
            create_time_dt = ff.converttodate(create_time)
            if create_time_dt not in new_dataset_dictionary[(src, trg)][message_type]:
                new_dataset_dictionary[(src, trg)][message_type][create_time_dt] = []
            new_dataset_dictionary[(src, trg)][message_type][create_time_dt].append(datum)
        else:
            new_dataset_dictionary[datum[nd.msg_id]] = datum
    return new_dataset_dictionary
 def getweeklydistributions(self, pid_dict, message_list, message_type='sms',
                            is_degree=True, week_info=None):
     # is_degree = F --> edge weight distribution
     # remember whether week boundaries were supplied; after the None -> {} swap the
     # original `week_info is None` checks below could never fire
     week_info_provided = week_info is not None
     week_info = {} if week_info is None else week_info
     participant_dict = pid_dict[pr.participant[message_type]]
     ff_obj = filterfields('')
     ff_obj.setdata(message_list)
     if week_info_provided:
         min_week = min(week_info.keys())
         max_week = max(week_info.keys())
         min_date = week_info[min_week][0]
         max_date = week_info[max_week][1]
     else:
         min_date, max_date = self.getminmaxdates(message_list, ff_obj)
     weekly_dist = {}
     for pid in participant_dict.keys():
         weekly_dist[pid] = {}
         if not week_info_provided:
             weekly_dict, temp_week_info = self.__perparticipantprocessing(pid, ff_obj,
                                                                           curr_min=min_date, curr_max=max_date,
                                                                           send_week_info=True)
             week_info[pid] = temp_week_info
         else:
             weekly_dict = self.__perparticipantprocessing(pid, ff_obj, curr_min=min_date, curr_max=max_date,
                                                           send_week_info=False, week_info=week_info)
         weekly_graphs = self.__weeklygraphs(weekly_dict, pid_dict, message_type=message_type)
         for weekno in weekly_graphs.keys():
             go = weekly_graphs[weekno]
             if is_degree:
                 g_info = go.getdegrees(participant_dict[pid])
             else:
                 g_info_ew = go.getedgeweights(participant_dict[pid])
                 in_w = 0
                 out_w = 0
                 for e_tuple in g_info_ew[0]:
                     in_w += e_tuple[2]['weight']
                 for e_tuple in g_info_ew[1]:
                     out_w += e_tuple[2]['weight']
                 g_info = [in_w, out_w]
             weekly_dist[pid][weekno] = g_info
     return weekly_dist, week_info
def get_count_degrees_messages_directed(labelled_data, pid_dict):
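    # returns parallel lists (one entry per pid): incoming/outgoing message counts and the
    # number of distinct senders/recipients; pid_order tracks the iteration order of pid_dict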
    fobj = filterfields()
    fobj.setdata(labelled_data)

    in_m = []
    out_m = []
    in_d = []
    out_d = []
    pid_order = []
    for pid in pid_dict:
        in_data = fobj.filterbyequality(pr.m_target, pid)
        out_data = fobj.filterbyequality(pr.m_source, pid)
        in_m.append(len(in_data))
        out_m.append(len(out_data))
        people_sending_me_messages = fobj.getuniqueelements(pr.m_source, data=in_data)
        people_i_am_sending_messages_to = fobj.getuniqueelements(pr.m_target, data=out_data)
        in_d.append(len(people_sending_me_messages))
        out_d.append(len(people_i_am_sending_messages_to))
        pid_order.append(pid)

    return in_m, out_m, in_d, out_d
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')


    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs], 'reciprocity_counts_msgPairs_overall', location_to_store)
    else:
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_'+str(idx)+'.data', location_to_store)
            # advance the index so each bi-monthly result gets its own output file
            idx += 1
def main():
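    # expected argv: [1] message csv, [2] filtered-csv output or '-', [3] static graph output or '-',
    # [4] prefix for dynamic edge/node lists or '-', [5] folder for dumped variables, [6] message type to keep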
    ff = filterfields(sys.argv[1])
    print 'filtering...'
    filtered_data = ff.filterbyequality(pr.m_type, sys.argv[6])
    hlp.dumpvariable(filtered_data, 'filtered_'+sys.argv[6], sys.argv[5])
    print 'done'
    if '-' != sys.argv[2]:
        writecsv(sys.argv[2], filtered_data)
    if '-' != sys.argv[3]:
        links, link_tuple, graph_obj, pid_dict = hlp.creategraph(filtered_data)
        hlp.dumpvariable(links, 'static_links', sys.argv[5])
        hlp.dumpvariable(link_tuple, 'static_links_tuple', sys.argv[5])
        hlp.dumpvariable(graph_obj, 'static_graph_obj', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        graph_obj.writegraph(sys.argv[3])
    if '-' != sys.argv[4]:
        to_write_edge, to_write_nodes, week_dict, pid_dict, week_content = hlp.creategraph(filtered_data, False)
        writetofile(sys.argv[4]+'_el.csv', to_write_edge)
        writetofile(sys.argv[4]+'_nl.csv', to_write_nodes)
        hlp.dumpvariable(week_dict, 'dynamic_week_dict', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        hlp.dumpvariable(week_content, 'week_content', sys.argv[5])
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Sentiment Message file')
    parser.add_argument('-t', '-T', action='store_true',
                        help='Sentiment type flag, if used then vader, else afinn')
    parser.add_argument('-f', '-F', required=True,
                        help='Folder to store checkpoints, and final result')
    parser.add_argument('-w', '-W', required=False,
                        help='Per week/month analysis')

    args = parser.parse_args()
    message_file = args.m
    sentiment_type = args.t
    location_to_store = args.f
    survey_file = args.w

    # get message data, only sms and fb_message
    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])
    sms_data = ff.filterbyequality(pr.m_type, 'sms')
    pid_dict_sms = hlp.getuniqueparticipants2(sms_data)
    fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message')
    pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data)
    message_data = sms_data + fb_message_data

    # put the labels on
    labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \
        hlp.processafinnsentiment(message_data, label_only=False)

    if survey_file is not None:
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(survey_file)
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

    #__temp_testing_for_discrepancy(labelled_data, weekly_data)

    # get the pid_dict for easier handling
    pid_dict = hlp.getuniqueparticipants2(labelled_data)
    if survey_file is not None:
        over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data, pid_dict['participants'])
        __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_over.pdf')
        __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_in.pdf')
        __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_out.pdf')

    print '***SMS***'
    print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values())

    print '***FB***'
    print 'P: ', len(pid_dict_fb['participants'].values()), 'NP: ', len(pid_dict_fb['nonparticipants'].values())

    print '***OVERALL***'
    print 'P: ', len(pid_dict['participants'].values()), 'NP: ', len(pid_dict['nonparticipants'].values())

    summary_src_trg = summarize_message_by_src_trg(labelled_data)
    print '***Message Distribution***'
    for m_type_1 in summary_src_trg:
        print m_type_1, summary_src_trg[m_type_1]

    if survey_file is not None:
        week_list = weekly_data.keys()
        week_list.sort()
        # this is not good, as there aren't enough triads
        months = [[1, 2, 3, 4],
                  [5, 6, 7, 8],
                  [9, 10, 11, 12],
                  [13, 14, 15, 16],
                  [17, 18, 19, 20],
                  [21, 22, 23, 24, 25]]
        # this has at least 8 triads, always, use this
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        month_idx = 1
        for month in months2:
            labelled_data = []
            for week in month:
                labelled_data.extend(weekly_data[week])
            general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
            frac_triad = general_graph[3]
            summary_triad = general_graph[2]
            frac_triad_rand = random_graph[3]
            summary_triad_rand = random_graph[2]
            print '** Months ', 2*month_idx-1, 2*month_idx, ': ', month,' ***'
            print 'len(LD): ', len(labelled_data)
            for summary in frac_triad:
                print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
            words_list, short_list = word_count(labelled_data)
            toWrite_wl_csv = create_word_count_csv(words_list)
            hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list_'+str(2*month_idx-1)+'-'+str(2*month_idx)+'.csv',
                         delimiter_sym=',')
            for mtype in words_list:
                counted_words = Counter(words_list[mtype])
                counted_short = Counter(short_list[mtype])
                print '***For '+mtype+' ***'
                print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
                print 'Top 20 short: ', counted_short.most_common(20)
                print '\n\n'
            hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'month_'+str(month_idx)+'.list', location_to_store)
            month_idx += 1
    else:
        print 'len(LD): ', len(labelled_data)
        words_list, short_list = word_count(labelled_data)
        toWrite_wl_csv = create_word_count_csv(words_list)
        hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list.csv', delimiter_sym=',')
        for mtype in words_list:
            counted_words = Counter(words_list[mtype])
            counted_short = Counter(short_list[mtype])
            print '***For '+mtype+' ***'
            print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
            print 'Top 20 short: ', counted_short.most_common(20)
            print '\n\n'
        general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
        frac_triad = general_graph[3]
        summary_triad = general_graph[2]
        frac_triad_rand = random_graph[3]
        summary_triad_rand = random_graph[2]
        for summary in frac_triad:
            print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
        hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'Overall.list', location_to_store)
        # plot_degree_dist(general_graph[4], 'Degree(d)', '# of Participants with Degree d')
        pos, neg, neu = get_polarity_directionality(labelled_data)
        print '***Polarity Distribution***'
        print 'Positive: \n', pos
        print 'Negative: \n', neg
        print 'Neutral: \n', neu

        in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants'])
        print '***Incoming Messages***'
        print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m)
        print '***Outgoing Messages***'
        print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m)
        print '***In Degree***'
        print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d)
        print '***Out Degree***'
        print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d)
        print '***COUNTS***'
        plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.',
                             location_to_store+'in_out_messages.pdf')
        # plot_messages_degree(out_m, '# of Outgoing Messages', 'Cumulative Participant Prob.',
        #                      location_to_store+'out_messages.pdf')
        plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.',
                             location_to_store+'in_out_degree.pdf', True)
        # plot_messages_degree(out_d, 'Out Degree', 'Cumulative Participant Prob.',
        #                      location_to_store+'out_degree.pdf', True)
    print 'TADAA!!'
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Message list file')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='PID dict inverted')
    parser.add_argument('-b', '-B', type=str, required=True,
                        help='bullying dictionary')
    parser.add_argument('-o', '-O', type=str, required=True,
                        help='Output folder')
    parser.add_argument('-l', '-L', type=str, nargs='+',
                        help='Filters chosen')
    parser.add_argument('-f', '-f', type=str, nargs='+',
                        help='Filter files')

    args = parser.parse_args()

    output_folder = args.o

    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)

    filters_chosen = args.l
    filter_files = args.f
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(args.r)

    ff = filterfields()

    gh = ghelper()
    bullying_overlay = gh.createbullyingoverlay(catch_all_data, weekly_info, ff)
    bullying_overlay = flip_bullying_overlay(bullying_overlay, weekly_info.keys())

    pid_list = pid_dict.keys()
    pid_list.sort()
    for pid in pid_list:
        training_set_final = []
        testing_set_final = []
        pid_list_training = deepcopy(pid_list)
        pid_list_training.remove(pid)
        ff.setdata(message_data)
        testing_raw_data = ff.filterbyequality(pr.m_source, pid_dict[pid]) + \
                           ff.filterbyequality(pr.m_target, pid_dict[pid])
        ff.removebyequality(pr.m_source, pid_dict[pid])
        ff.removebyequality(pr.m_target, pid_dict[pid])
        training_raw_data = ff.getdata()
        fe = raw_features(data=None)
        _, _ = fe.get_scoring_factors(training_raw_data)

        training_weekly_data = {}

        for training_pid in pid_list_training:
            training_weekly_data[training_pid] = {}
            data_to_use = ff.filterbyequality(pr.m_source, pid_dict[training_pid]) + \
                          ff.filterbyequality(pr.m_target, pid_dict[training_pid])
            if 0 == len(data_to_use):
                print 'no data found, probably filtered into the testing set, Training PID: '+\
                      training_pid+', Testing PID: '+pid
                continue
            pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, incoming_ss, \
            outgoing_ss = get_pid_level_features(data_to_use, weekly_info, ff,
                                                 bullying_overlay, pid_dict,
                                                 training_pid, fe)
            for week_no in pid_weekly_w_bullying:
                fr_in_degree, fr_out_degree, fr_in_ew, \
                fr_out_ew, fr_in_senti, fr_out_senti, \
                current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                  global_in_degree, global_out_degree,
                                                                  global_in_ew, global_out_ew,
                                                                  incoming_ss, outgoing_ss,
                                                                  pid_dict[training_pid])
                training_set_final.append(
                        [training_pid, week_no,
                         fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                         fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                         fr_in_degree, fr_out_degree,
                         fr_in_ew, fr_out_ew,
                         current_in_ss, current_out_ss,
                         pid_weekly_w_bullying[week_no]['label']])

        # testing pid
        pid_weekly_w_bullying, global_in_degree, global_out_degree, \
        global_in_ew, global_out_ew, incoming_ss, outgoing_ss = get_pid_level_features(testing_raw_data, weekly_info,
                                                                                       ff, bullying_overlay, pid_dict,
                                                                                       pid, fe)
        for week_no in pid_weekly_w_bullying:
            fr_in_degree, fr_out_degree, fr_in_ew, \
            fr_out_ew, fr_in_senti, fr_out_senti, \
            current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                              global_in_degree, global_out_degree,
                                                              global_in_ew, global_out_ew,
                                                              incoming_ss, outgoing_ss,
                                                              pid_dict[pid])
            testing_set_final.append(
                    [pid, week_no,
                     fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                     fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                     fr_in_degree, fr_out_degree,
                     fr_in_ew, fr_out_ew,
                     current_in_ss, current_out_ss,
                     pid_weekly_w_bullying[week_no]['label']])
        header = ['pid', 'wkno',
                  'frWInSenPos', 'frWInSenNeu', 'frWInSenNeg',
                  'frWOutSenPos', 'frWOutSenNeu', 'frWOutSenNeg',
                  'frInDegO', 'frOutDegO',
                  'frInEdgeO', 'frOutEdgeO',
                  'inSenSc', 'outSenSc',
                  'label']
        training_set_final = [header] + training_set_final
        testing_set_final = [header] + testing_set_final

        hlp.writecsv(training_set_final, output_folder+pid+'_tr.csv')
        hlp.writecsv(testing_set_final, output_folder+pid+'_ts.csv')
from sentimentanalysis import sentiment
from filterByField import filterfields
from basicInfo import twitterdataset as td
from basicInfo import privateInfo as pr
import helper as hlp
import random

data = hlp.readcsv('../ignore_data/Sentiment_Twitter.csv')
data = data[1:]
ff = filterfields('../ignore_data/messages.csv')
smsdata = ff.filterbyequality(pr.m_type, 'sms')

k = len(data)
l = len(smsdata)
seed = 254
random.seed(seed)
tr_n = 1000000
ts_n = 30
idx = 0
tr_before = []
ts_before = []

while idx < tr_n:
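    # draw a random labelled tweet; the training pool is sampled with replacement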
    i = random.randint(0, k - 1)  # randint is inclusive at both ends, so cap at k - 1
    datum = data[i]
    tweet_type = td.sentiment_dict[datum[td.sentiment]]
    tweet_content = datum[td.sentiment_text]
    tr_before.append((tweet_content, tweet_type))
    idx += 1

random.seed(seed)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '-O', help='Old Dataset', required=True)
    parser.add_argument('-n', '-N', help='New Dataset', required=True)
    parser.add_argument('-f', '-F', help='Folder to store results in, ending with /', required=True)
    parser.add_argument('-p', '-P', help='text file with list of people who were ordered to be removed', required=True)
    parser.add_argument('-s', '-S', help='text file with list of people who were semi-consented', required=True)

    args = parser.parse_args()

    old_dataset_file = args.o
    new_dataset_file = args.n
    location_to_store = args.f
    ordered_removed_file = args.p
    semi_consented_file = args.s

    print '***Reading data from arguments...'
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',')
    new_dataset_dictionary = generate_new_dataset_dictionary(new_dataset[1:])
    new_dataset_msg_id_dictionary = generate_new_dataset_dictionary(new_dataset[1:], use_m_id=True)
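    # the ordered-removed and semi-consented files are expected to contain Python list
    # literals, which is why they are read back with eval()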
    with open(ordered_removed_file, 'r') as f:
        ordered_removed = eval(f.read())
    with open(semi_consented_file, 'r') as f:
        semi_consented = eval(f.read())

    print '***Filtering old data within dates of study...'
    ff = filterfields()
    old_dataset_within_dates = ff.filterbetweendates(ff.converttodate(pr.start_datetime),
                                                     ff.converttodate(pr.end_datetime), data_to_work=old_dataset,
                                                     right_equality=True, date_field=pr.m_time_sent)
    old_dataset = old_dataset_within_dates
    old_dataset_counts = {}
    for datum in old_dataset:
        m_type = datum[pr.m_type]
        if m_type not in old_dataset_counts:
            old_dataset_counts[m_type] = 0
        old_dataset_counts[m_type] += 1
    print '*** OLD DATASET COUNTS***', old_dataset_counts
    print '***Finding mapping...'
    mapping_dict = {}
    inverted_mapping_dict = {}
    missed_dict = {}
    no_reason = []
    counts_no_match = {'ord': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0,
                               'fb_activity': 0, 'fb_like': 0, 'fb_comment': 0},
                       'semi': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                                'fb_like': 0, 'fb_comment': 0},
                       'no': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                              'fb_like': 0, 'fb_comment': 0}}
    counts_match = {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                    'fb_like': 0, 'fb_comment': 0}
    no_reason_counts = {}
    for datum in old_dataset:
        m_result, msg_val = message_exists(datum, new_dataset_dictionary, ff)
        if m_result:
            mapping_dict[datum[pr.msg_id]] = msg_val
            if msg_val[1] not in inverted_mapping_dict:
                inverted_mapping_dict[msg_val[1]] = []
            inverted_mapping_dict[msg_val[1]].append(datum[pr.msg_id])
            m_type = datum[pr.m_type]
            if m_type in counts_match:
                counts_match[m_type] += 1
        else:
            src = datum[pr.m_source]
            trg = datum[pr.m_target]
            m_type = datum[pr.m_type]
            if src in ordered_removed or trg in ordered_removed:
                reason = 'ordered removed'
                if m_type in counts_no_match['ord']:
                    counts_no_match['ord'][m_type] += 1
            elif src in semi_consented or trg in semi_consented:
                reason = 'semi consented'
                if m_type in counts_no_match['semi']:
                    counts_no_match['semi'][m_type] += 1
            else:
                reason = ''
                temp = datum
                temp.append(msg_val)
                no_reason.append(temp)
                if m_type in counts_no_match['no']:
                    counts_no_match['no'][m_type] += 1
                if m_type not in no_reason_counts.keys():
                    no_reason_counts[m_type] = {}
                if msg_val not in no_reason_counts[m_type].keys():
                    no_reason_counts[m_type][msg_val] = 0
                no_reason_counts[m_type][msg_val] += 1
            missed_dict[datum[pr.msg_id]] = [msg_val, datum[pr.m_type], reason]
    print '\n\n**NOT FOUND**'
    for key_v in counts_no_match.keys():
        print key_v
        print counts_no_match[key_v]
    print '\n\n**NO REASON**'
    for key_v in no_reason_counts.keys():
        print key_v
        print no_reason_counts[key_v]
    print '\n\n**FOUND**', counts_match
    print '***Creating new dataset with mappings...'
    new_dataset_header = new_dataset[0]
    new_dataset_header.extend(['Old Message IDs'])
    final_dataset = [new_dataset_header]
    for new_msg_id in new_dataset_msg_id_dictionary.keys():
        datum = new_dataset_msg_id_dictionary[new_msg_id]
        old_msg_id = [''] if new_msg_id not in inverted_mapping_dict else inverted_mapping_dict[new_msg_id]
        datum.extend(old_msg_id)
        final_dataset.append(datum)

    print '***Writing data...'
    hlp.writecsv(final_dataset, location_to_store + 'new_old_mapped_hashed_dataset.csv', delimiter_sym=',')
    mapping_dict_list = [[x, mapping_dict[x][0], mapping_dict[x][1]] for x in mapping_dict]
    mapping_header = [['old_id', 'cosine_val', 'new_id']]
    mapping_header.extend(mapping_dict_list)
    hlp.writecsv(mapping_header, location_to_store + 'old_to_new_mapping.csv', delimiter_sym=',')
    missed_dict_list = [[x, missed_dict[x][0], missed_dict[x][1], missed_dict[x][2]] for x in missed_dict]
    missed_header = [['old_id', 'Reason', 'm_type', 'Explanation']]
    missed_header.extend(missed_dict_list)
    hlp.writecsv(missed_header, location_to_store + 'old_not_found.csv', delimiter_sym=',')
    hlp.writecsv(no_reason, location_to_store + 'old_not_found_no_reason.csv', delimiter_sym=',')
    print 'TADAA!!!'
def main():
    parser = argparse.ArgumentParser('Script to perform sentiment analysis using VADER')

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Location of the message file')
    parser.add_argument('-mt', '-MT', type=str, required=True, nargs='+',
                        help='types of messages to filter')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename where data is stored, no extension needed')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='location of folder to store the file, ends with a /')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to store polarities separately')
    parser.add_argument('-w', '-W', type=str, required=False,
                        help='conduct weekly analysis, path to the survey data for '
                             'creating week information')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-lf', '-LF', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, in same order as -l/L flag')

    args = parser.parse_args()
    message_file = args.m
    message_types = args.mt
    filename_to_store = args.f
    location_to_store = args.s
    separate_polarity_score = args.p
    survey_file = args.w
    filters_chosen = args.l
    filter_files = args.lf

    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    if separate_polarity_score and survey_file is not None:
        print 'Cannot have separate polarity scores and weekly analysis together, ' \
              'please remove the -p/-P flag'
        return

    if survey_file is not None:
        wi = weeklyinfo()
        week_dates = wi.getweeklyfo(survey_file)
        gh = ghelper()
    ff = filterfields(message_file)
    data = []
    for message_type in message_types:
        data.extend(ff.filterbyequality(pr.m_type, message_type))
    pid_dict = hlp.getuniqueparticipants(data, 'all' if len(message_types) > 1 else message_types[0])
    sentiment_analyzer = vadersenti(data[1:])
    returned_data = sentiment_analyzer.compilesentiment(pr.m_content, separate_sentiment_list=separate_polarity_score)
    if separate_polarity_score:
        hlp.dumpvariable(returned_data, filename_to_store + '.data', location_to_store)
    else:
        header = pr.message_header + ['pos', 'neg', 'neu', 'compound']
        final_data = [header] + returned_data
        hlp.writecsv(final_data, location_to_store + filename_to_store + '.csv')
        if survey_file is not None:
            # the weekly breakdown relies on week_dates and gh, which are only created
            # when a survey file is supplied via -w/-W
            weekly_data = gh.filterweeklydata(pid_dict, returned_data, week_dates,
                                              'all' if len(message_types) > 1 else message_types[0])
            hlp.dumpvariable(weekly_data, 'weekly_data.dict', location_to_store)
            summarized_sentiment = {}
            for pid in weekly_data:
                summarized_sentiment[pid] = {}
                participant_data = weekly_data[pid]
                for week_no in participant_data:
                    summarized_sentiment[pid][week_no] = sentiment_analyzer.summarizesentiment(participant_data[week_no],
                                                                                               separate_in_out=True,
                                                                                               message_type=message_type)
            hlp.dumpvariable(summarized_sentiment, 'weekly_summarized_sentiment.dict', location_to_store)
            plt = plots()
            overlay_data = gh.createbullyingoverlay(catch_all_data, week_dates, ff)
            plt.plotweeklyprogression(summarized_sentiment, location_to_store, 'Sentiment Progress', 'Week',
                                      'Sentiment Value', sentiment_legend=['Positive', 'Negative', 'Neutral'],
                                      overlay_data=overlay_data)

    print 'done'
def main():
    parser = argparse.ArgumentParser('Script to generate a CDF comparing the degrees of our participants')

    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-f', '-F', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, in the same order as -l/L flag')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='sms',
                        help='type of message we are filtering, default: sms')
    parser.add_argument('-n', '-N', action='store_true',
                        help='flag indicates that processing should include participants which did not witness '
                             'anything mentioned in the values passed for flags -l/L')
    parser.add_argument('-a', '-A', action='store_true',
                        help='flag indicates that processing should include a plot of all participants')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store in, leading /')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')

    args = parser.parse_args()
    filters_chosen = args.l
    for filter_v in filters_chosen:
        if filter_v not in ['seenB', 'didB', 'wasB']:
            raise Exception('filter value was not from the ones specified')
    filter_files = args.f
    assert len(filter_files) == len(filters_chosen), e.len_filter_file_ne_len_filters_chosen
    include_other_participants = args.n
    include_all_participants = args.a
    location_to_store = args.s
    if not os.path.exists(location_to_store):
        os.mkdir(location_to_store)
    message_file = args.m
    message_type = args.mt
    survey_file = args.r

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    gh = ghelper()
    plt = plots()


    # get the filtered messages
    ff = filterfields(message_file)
    filtered_data = []
    if message_type == 'all':
        for message_type in ['sms', 'fb', 'twitter']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, message_type))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, message_type)

    # generate the links and the graph for the filtered data
    links, links_tuple, graph_obj, pid_dict = hlp.creategraph(filtered_data, filterType=message_type)

    # get the pids from the chosen filters
    bullying_pid_dict = hlp.getfilterdata(filters_chosen, filter_files)
    cumulative_bully_pid = hlp.getfilterdata(filters_chosen, filter_files, cumulative_list=True)

    # get all the information from the filters
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    # generate the distributions for in degree and plot them
    in_distributions = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                include_other_participants, pid_dict, message_type,
                                                cumulative_bully_pid, in_dist=True)
    in_distributions_ew = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                   include_other_participants, pid_dict, message_type,
                                                   cumulative_bully_pid, in_dist=True, is_degree=False)
    plt.generatetablehist(in_distributions, location_to_store + 'in_degree_table.csv', generate_totals=True)
    plt.generatetablehist(in_distributions_ew, location_to_store + 'in_edge_weight.csv', generate_totals=True)

    # generate the distributions for out degree and plot them
    out_distributions = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                 include_other_participants, pid_dict, message_type,
                                                 cumulative_bully_pid, in_dist=False)
    # edge-weight distribution for outgoing messages (is_degree=False, mirroring the incoming case above)
    out_distributions_ew = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                    include_other_participants, pid_dict, message_type,
                                                    cumulative_bully_pid, in_dist=False, is_degree=False)
    plt.generatetablehist(out_distributions, location_to_store + 'out_degree_table.csv', generate_totals=True)
    plt.generatetablehist(out_distributions_ew, location_to_store + 'out_edge_weight.csv', generate_totals=True)


    # line plot of degrees
    weekly_dist_degrees, _ = gh.getweeklydistributions(pid_dict, filtered_data,
                                                    message_type=message_type,
                                                    is_degree=True, week_info=week_info)
    overlay_info = gh.createbullyingoverlay(catch_all_data, week_info, ff)
    plt.plotweeklyprogression(weekly_dist_degrees, location_to_store +'deg_', 'No of friends',
                              'Week No', 'Friends', overlay_data=overlay_info)
    # line plot of weights
    weekly_dist_ew, _ = gh.getweeklydistributions(pid_dict, filtered_data,
                                                    message_type=message_type,
                                                    is_degree=False, week_info=week_info)
    overlay_info = gh.createbullyingoverlay(catch_all_data, week_info, ff)
    plt.plotweeklyprogression(weekly_dist_ew, location_to_store +'ew_', 'No. of messages exchanged',
                              'Week No', 'Messages', overlay_data=overlay_info)
    print 'TADAAA!'