def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='message data file (loaded with hlp.recovervariable)')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='pid_dict file (loaded with hlp.recovervariable)')
    parser.add_argument('-o', '-O', type=str, required=True,
                        help='output folder, ending with /')
    parser.add_argument('-of', '-OF', type=str, required=True,
                        help='output file name')

    args = parser.parse_args()

    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)

    output_folder = args.o
    output_file = args.of

    edge_dict = getedgecount(message_data, pid_dict)
    edge_list = converttolist(edge_dict)

    hlp.writecsv([['source', 'target', 'pos', 'neu', 'neg']]+edge_list, output_folder+output_file)
    hlp.writecsv([['PID', 'Coded ID']]+convertpiddicttolist(pid_dict), output_folder+'pid_dict_list.csv')
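# A minimal sketch of the converttolist helper used above, assuming edge_dict
# maps (source, target) pairs to [pos, neu, neg] counts (an assumption based on
# the CSV header written above); the project's real helper may differ.
def converttolist(edge_dict):
    edge_list = []
    for (source, target), sentiment_counts in edge_dict.items():
        edge_list.append([source, target] + list(sentiment_counts))
    return edge_list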
def main():
    messages = hlp.recovervariable(sys.argv[1])
    pid_dict = hlp.recovervariable(sys.argv[2])
    week_dict = hlp.recovervariable(sys.argv[3])
    m_type = sys.argv[4]
    participants = pid_dict[pr.participant[m_type]]
    non_participants = pid_dict[pr.nparticipant[m_type]]
    graph_objs = weekgraphs(week_dict, participants, non_participants)
    hlp.dumpvariable(graph_objs, 'week_graph_objs')
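# Hypothetical sketch of weekgraphs, assuming week_dict maps week numbers to
# lists of (source, target) message pairs; the project's actual helper may
# differ in both signature and graph attributes.
import networkx as nx

def weekgraphs(week_dict, participants, non_participants):
    graph_objs = {}
    for week_no, messages in week_dict.items():
        g = nx.DiGraph()
        g.add_nodes_from(participants, participant=True)
        g.add_nodes_from(non_participants, participant=False)
        for source, target in messages:
            g.add_edge(source, target)
        graph_objs[week_no] = g
    return graph_objs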
def main():
    parser = argparse.ArgumentParser(description='Script to generate statistics about message types')

    # add arguments
    parser.add_argument('-d', '-D', type=str, required=True,
                        help='location of file to work with')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store the results, ending with /')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename to store data in')
    parser.add_argument('-w', '-W', type=int, default=0,
                        help='threshold below which a week is counted as missing (integer, default 0)')

    # get arguments
    args = parser.parse_args()
    data_file = args.d
    threshold_missing = args.w
    location_to_store = args.s
    store_filename = args.f

    data = hlp.recovervariable(data_file)

    missing_week_dict, per_week_msgs = hlp.missingweeks(data, threshold_value=threshold_missing)
    flipped_dict = flipdict(missing_week_dict)
    printsummary(missing_week_dict, 'No. of participants with less than ' +
                 str(threshold_missing) + ' data points in ', len(data.keys()), per_week_msgs)
    hlp.dumpvariable(missing_week_dict, store_filename, location_to_store)
    hlp.dumpvariable(flipped_dict, 'flipped_' + store_filename, location_to_store)
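# A minimal sketch of flipdict, assuming missing_week_dict maps week numbers to
# lists of participant IDs below the threshold; flipping yields pid -> weeks.
# The project's real helper may produce a richer structure.
def flipdict(week_to_pids):
    pid_to_weeks = {}
    for week_no, pids in week_to_pids.items():
        for pid in pids:
            pid_to_weeks.setdefault(pid, []).append(week_no)
    return pid_to_weeks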
def main():
    parser = argparse.ArgumentParser(description='Filter out participants whose number of '
                                                 'zero-communication weeks meets or exceeds the thresholds')

    parser.add_argument('-f', '-F', type=str, required=True,
                        help='flipped dictionary (pid -> [incoming, outgoing] zero-week counts)')
    parser.add_argument('-ti', '-TI', type=int, required=True,
                        help='Incoming threshold')
    parser.add_argument('-to', '-TO', type=int, required=True,
                        help='Outgoing threshold')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='storage folder, ending with /')
    parser.add_argument('-sf', '-SF', type=str, required=True,
                        help='file name for storage')

    args = parser.parse_args()

    flipped_dict = hlp.recovervariable(args.f)
    incoming_th = args.ti
    outgoing_th = args.to
    location_to_store = args.s
    filename = args.sf

    to_remove = []
    for pid in flipped_dict:
        if flipped_dict[pid][0] >= incoming_th and flipped_dict[pid][1] >= outgoing_th:
            to_remove.append(pid)
            print 'REMOVED:', pid, flipped_dict[pid]
        else:
            print 'NOT REMOVED:', pid, flipped_dict[pid]

    print 'Removed', len(to_remove), 'out of a total of', len(flipped_dict), 'participants'

    hlp.dumpvariable(to_remove, filename, location_to_store)
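# Example invocation (script and file names are placeholders):
#   python filter_zero_weeks.py -f flipped.dict -ti 10 -to 10 -s results/ -sf removed_pids.list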
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='message data file (loaded with hlp.recovervariable)')
    parser.add_argument('-l', '-L', type=str, required=True,
                        help='lexicon file, e.g. the VADER lexicon')
    parser.add_argument('-o', '-O', type=str, required=False,
                        help='optional output file for words not covered by the lexicon')

    args = parser.parse_args()

    message_data = hlp.recovervariable(args.m)
    lexicon_file = args.l
    output_file = args.o

    with open(lexicon_file, 'r') as f:
        lexicon_data = f.readlines()

    pct_words_covered, words_not_present, common_words = get_effective_coverage(lexicon_data, message_data)

    print 'pct words covered by vader:', pct_words_covered
    print 'words not present:', words_not_present

    if output_file is not None:
        with open(output_file, 'w') as f:
            for word in words_not_present:
                f.write(word + '\n')
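# Hypothetical sketch of get_effective_coverage, assuming message_data is an
# iterable of message strings and each lexicon line starts with the word
# (tab-separated, as in the VADER lexicon); the real helper may differ.
def get_effective_coverage(lexicon_data, message_data):
    lexicon_words = set(line.split('\t')[0] for line in lexicon_data)
    message_words = set()
    for message in message_data:
        message_words.update(message.lower().split())
    common_words = message_words & lexicon_words
    words_not_present = message_words - lexicon_words
    pct_words_covered = 100.0 * len(common_words) / len(message_words)
    return pct_words_covered, words_not_present, common_words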
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '--messageFile', type=str, required=True)
    parser.add_argument('-mt', '--messageTypes', type=str, nargs='+', required=True)
    parser.add_argument('-o', '--outputFolder', type=str, required=True)
    parser.add_argument('-of', '--outputFile', type=str, required=True)
    parser.add_argument('-pd', '--participantDictionary', type=str)
    parser.add_argument('-i', '--ignoreParticipants', type=str)
    parser.add_argument('-mc', '--messageTypeConvert', type=str, nargs='*')

    args = parser.parse_args()

    message_file = args.messageFile
    message_types = args.messageTypes
    output_folder = args.outputFolder
    output_file = args.outputFile
    pid_dict = args.participantDictionary
    ignore_pids = args.ignoreParticipants
    message_type_conversions = args.messageTypeConvert

    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])  # drop the header row

    to_set_data = []

    # extract the relevant data
    for message_type in message_types:
        to_set_data.extend(ff.filterbyequality(pr.m_type, message_type))

    ff.setdata(to_set_data)

    if ignore_pids is not None:
        ignore_pids = hlp.recovervariable(ignore_pids)
        for pid in ignore_pids:
            ff.removebyequality(pr.m_source, pid)
            ff.removebyequality(pr.m_target, pid)

    # set the pid to normal id dictionary
    if pid_dict is None:
        pid_dict = hlp.getuniqueparticipants(ff.getdata(), mtype='all', separate_pid_npid=True)

    # replace the message type names with the ones provided
    if message_type_conversions is not None:
        for idx in range(0, len(message_type_conversions), 2):
            message_to_convert = message_type_conversions[idx]
            to_convert_to = message_type_conversions[idx+1]
            ff.replacebyequality(pr.m_type, message_to_convert, to_convert_to)

    message_types = ff.getuniqueelements(pr.m_type)
    coded_participant_list = pid_dict[pr.participant['all']].values()
    storage_dict = initiatestorage(coded_participant_list, message_types)
    storage_dict = getperparticipantinout(ff.getdata(), storage_dict, pid_dict)
    plotperparticipantbar(storage_dict, 'Participant ID', '# of Messages', message_types, 'Per Participant Messages',
                          output_folder+output_file)
    hlp.dumpvariable(pid_dict, 'pid_dict.dict', output_folder)
    hlp.dumpvariable(ff.getdata(), 'messageData.list', output_folder)
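# A minimal sketch of initiatestorage, assuming it seeds a zeroed per-message-type
# counter for every coded participant; getperparticipantinout would then
# increment these counts while walking the filtered messages.
def initiatestorage(coded_participant_list, message_types):
    storage_dict = {}
    for participant in coded_participant_list:
        storage_dict[participant] = dict((m_type, 0) for m_type in message_types)
    return storage_dict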
def filtersurvey(dict_path, qno, answers, is_data=False):
    data = dict_path if is_data else hlp.recovervariable(dict_path)
    survey_obj = surveystats(data)
    if answers is None:
        res = survey_obj.processdict(sInfo.surveyQType[qno])
    else:
        res = {}
        for ans in answers:
            res[ans] = survey_obj.processdict(sInfo.surveyQType[qno], ans)
    return res
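# Example usage (question number and answer values are placeholders):
#   stats_all = filtersurvey('survey.dict', 'q4', None)
#   stats_by_answer = filtersurvey('survey.dict', 'q4', ['yes', 'no'])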
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]
    if weekly_surveys is None:
        reciprocity_info, polarity_info = individual_reciprocity_analysis(labelled_data, pid_dict['participants'],
                                                                          location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks:', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(bi_month_data, pid_dict['participants'],
                                                                              location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_bimonthly_'+str(idx)+'.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_bimonthly_'+str(idx)+'.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1

    print 'tadaa!'
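# Self-contained illustration of the bimonthly pooling above: weeks are grouped
# into blocks and each block's messages are concatenated before analysis
# (pool_weeks is a hypothetical name, not part of the project).
def pool_weeks(weekly_data, week_blocks):
    pooled = []
    for block in week_blocks:
        block_data = []
        for week_no in block:
            block_data.extend(weekly_data.get(week_no, []))
        pooled.append(block_data)
    return pooled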
def main():
    parse = argparse.ArgumentParser('Script to create plots of graph statistics')
    parse.add_argument('-i', '-I', type=str, required=True,
                       help='path to graph statistics data')
    parse.add_argument('-o', '-O', type=str, required=True,
                       help='directory to store the generated graphs without leading /')
    parse.add_argument('-f', '-F', type=str, default='mean',
                       help='function to use, currently supports all present in the statistics package')
    args = parse.parse_args()
    ip_file = args.i
    op_dir = args.o
    func = getattr(statistics, args.f)  # look the function up by name; avoids eval
    data = hlp.recovervariable(ip_file)
    for ans in data.keys():
        print 'DK:', ans
        dpath = op_dir + '_' + args.f + '/'
        if not os.path.exists(dpath):
            os.mkdir(dpath)
        plotindividual(data[ans], func, dpath)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs], 'reciprocity_counts_msgPairs_overall', location_to_store)
    else:
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks:', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1  # was never incremented, so every block overwrote the _1 file
def main():
    parse = argparse.ArgumentParser(description='Script to generate statistics on bullying data')
    parse.add_argument('-i', '-I', type=str, required=True,
                       help='Path to the input dictionary containing bullying information')
    parse.add_argument('-m', '-M', type=str, required=True,
                       help='Path to the messages file, should be a csv')
    parse.add_argument('-s', '-S', type=str, required=True,
                       help='Directory where results are stored, ending with /')
    parse.add_argument('-f', '-F', type=str, required=True,
                       help='File name')
    parse.add_argument('-p', '-P', type=str, required=True,
                       help='Participant type')
    args = parse.parse_args()
    bullying_data = hlp.recovervariable(args.i)
    message_path = args.m
    save_dir = args.s
    save_f = args.f
    p_type = args.p
    res = {}
    for key in bullying_data.keys():
        res[key] = getstats(message_path, bullying_data[key], p_type)
    hlp.dumpvariable(res, save_f, save_dir)
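# Example invocation (paths are placeholders):
#   python bullying_stats.py -i bullying.dict -m messages.csv -s results/ -f stats.dict -p participant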
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '-F', type=str, required=True,
                        help='weekly data dictionary (loaded with hlp.recovervariable)')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='participant ID')
    parser.add_argument('-w', '-W', type=int, nargs='+',
                        help='list of week numbers')
    parser.add_argument('-o', '-O', type=str,
                        help='folder to store the output')
    parser.add_argument('-s', '-S', action='store_true',
                        help='separate out the incoming and outgoing messages')
    parser.add_argument('-io', type=str,
                        help='which of the incoming/outgoing messages to print')

    args = parser.parse_args()
    week_dict_file = args.f
    pid = args.p
    weeks = args.w
    location_to_store = args.o
    separate_in_out = args.s
    show_in_out = args.io

    week_data_dict = hlp.recovervariable(week_dict_file)
    participant_data = {pid: {}}
    for week_no in weeks:
        reduced_data = getspecificdata(week_data_dict, pid, week_no, separate_in_out)
        if reduced_data is None:
            print 'No data found, or some error occurred...'
            continue
        participant_data[pid][week_no] = reduced_data  # keyed by week, instead of overwriting each iteration
        print '\n\n\n\n\nData summary for PID:', pid, 'week_no:', week_no
        printmessages(reduced_data, separate_in_out, show_in_out)
    if location_to_store is not None:
        hlp.dumpvariable(participant_data, pid+'.data', location_to_store)
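# Example invocation (values are placeholders):
#   python week_summary.py -f week.dict -p P01 -w 1 2 3 -o results/ -s -io in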
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Message list file')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='PID dict inverted')
    parser.add_argument('-b', '-B', type=str, required=True,
                        help='bullying dictionary')
    parser.add_argument('-o', '-O', type=str, required=True,
                        help='Output folder')
    parser.add_argument('-l', '-L', type=str, nargs='+',
                        help='Filters chosen')
    parser.add_argument('-f', '-F', type=str, nargs='+',
                        help='Filter files')

    args = parser.parse_args()

    output_folder = args.o

    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)

    filters_chosen = args.l
    filter_files = args.f
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(args.r)

    ff = filterfields()

    gh = ghelper()
    bullying_overlay = gh.createbullyingoverlay(catch_all_data, weekly_info, ff)
    bullying_overlay = flip_bullying_overlay(bullying_overlay, weekly_info.keys())

    pid_list = sorted(pid_dict.keys())
    for pid in pid_list:
        training_set_final = []
        testing_set_final = []
        pid_list_training = deepcopy(pid_list)
        pid_list_training.remove(pid)
        ff.setdata(message_data)
        testing_raw_data = ff.filterbyequality(pr.m_source, pid_dict[pid]) + \
                           ff.filterbyequality(pr.m_target, pid_dict[pid])
        ff.removebyequality(pr.m_source, pid_dict[pid])
        ff.removebyequality(pr.m_target, pid_dict[pid])
        training_raw_data = ff.getdata()
        fe = raw_features(data=None)
        _, _ = fe.get_scoring_factors(training_raw_data)

        training_weekly_data = {}

        for training_pid in pid_list_training:
            training_weekly_data[training_pid] = {}
            data_to_use = ff.filterbyequality(pr.m_source, pid_dict[training_pid]) + \
                          ff.filterbyequality(pr.m_target, pid_dict[training_pid])
            if not data_to_use:
                print 'no data found, probably filtered into the testing set, Training PID: '+\
                      training_pid+', Testing PID: '+pid
                continue
            pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, incoming_ss, \
            outgoing_ss = get_pid_level_features(data_to_use, weekly_info, ff,
                                                 bullying_overlay, pid_dict,
                                                 training_pid, fe)
            for week_no in pid_weekly_w_bullying:
                fr_in_degree, fr_out_degree, fr_in_ew, \
                fr_out_ew, fr_in_senti, fr_out_senti, \
                current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                  global_in_degree, global_out_degree,
                                                                  global_in_ew, global_out_ew,
                                                                  incoming_ss, outgoing_ss,
                                                                  pid_dict[training_pid])
                training_set_final.append(
                        [training_pid, week_no,
                         fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                         fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                         fr_in_degree, fr_out_degree,
                         fr_in_ew, fr_out_ew,
                         current_in_ss, current_out_ss,
                         pid_weekly_w_bullying[week_no]['label']])

        # testing pid
        pid_weekly_w_bullying, global_in_degree, global_out_degree, \
        global_in_ew, global_out_ew, incoming_ss, outgoing_ss = get_pid_level_features(testing_raw_data, weekly_info,
                                                                                       ff, bullying_overlay, pid_dict,
                                                                                       pid, fe)
        for week_no in pid_weekly_w_bullying:
            fr_in_degree, fr_out_degree, fr_in_ew, \
            fr_out_ew, fr_in_senti, fr_out_senti, \
            current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                              global_in_degree, global_out_degree,
                                                              global_in_ew, global_out_ew,
                                                              incoming_ss, outgoing_ss,
                                                              pid_dict[pid])
            testing_set_final.append(
                    [pid, week_no,
                     fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                     fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                     fr_in_degree, fr_out_degree,
                     fr_in_ew, fr_out_ew,
                     current_in_ss, current_out_ss,
                     pid_weekly_w_bullying[week_no]['label']])
        header = ['pid', 'wkno',
                  'frWInSenPos', 'frWInSenNeu', 'frWInSenNeg',
                  'frWOutSenPos', 'frWOutSenNeu', 'frWOutSenNeg',
                  'frInDegO', 'frOutDegO',
                  'frInEdgeO', 'frOutEdgeO',
                  'inSenSc', 'outSenSc',
                  'label']
        training_set_final = [header] + training_set_final
        testing_set_final = [header] + testing_set_final

        hlp.writecsv(training_set_final, output_folder+pid+'_tr.csv')
        hlp.writecsv(testing_set_final, output_folder+pid+'_ts.csv')
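# Example invocation (paths and filter names are placeholders):
#   python build_per_pid_sets.py -m messages.list -r survey.csv -p pid_inverted.dict \
#       -b bullying.dict -o features/ -l catch_all -f catch_all.data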
def __init__(self, folder_to_look='./'):
    self.word_corpus = hlp.recovervariable(folder_to_look + 'all_words.list')
    self.total_len = len(self.word_corpus) + 0.0  # float, so later ratios use true division
    self.word_freq = self.word_frequency(self.word_corpus)
    self.kb_neighbor = self.__keyboard_neighborhood()
    self.len_dict = self.__create_word_len_dict()
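# A minimal sketch of the word_frequency method referenced above, assuming it
# maps each word to its relative frequency in the corpus; the real method may
# differ.
from collections import Counter

def word_frequency(self, word_corpus):
    counts = Counter(word_corpus)
    return dict((word, count / self.total_len) for word, count in counts.items())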