def main():
    parser = __define_process_parser()
    old_dataset_file, new_dataset_mapped, missing_data, \
        survey_file, location_to_store = __define_process_parser(True, parser)
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_mapped, delimiter_sym=',', remove_first=True)
    old_data_missing = hlp.readcsv(missing_data, delimiter_sym=',', remove_first=True)
    old_missing = __dictify(0, old_data_missing)
    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    week_list = week_info.keys()
    bullying_positives = __find_positive_survey(survey_file, week_info)
    if bullying_positives is None:
        print 'Exiting...'
        exit()
    ff = filterfields()
    old_data_weekly = hlp.divideintoweekly(old_dataset, week_info, ff, date_field=pr.m_time_sent)
    new_data_weekly = hlp.divideintoweekly(new_dataset, week_info, ff, date_field=nd.m_timecreated)
    bullying_res = [['pid_hash', 'survey_id', 'time_of_survey', 'n_old', 'n_new',
                     'raw', 'semi', 'ordered', 'other']]
    for datum in bullying_positives:
        bullying_week = datum[-1]
        prev_week = bullying_week - 1 if bullying_week > min(week_list) else min(week_list)
        next_week = bullying_week + 1 if bullying_week < max(week_list) else max(week_list)
        old_data_pos = old_data_weekly[prev_week] + old_data_weekly[bullying_week] + \
            old_data_weekly[next_week]
        new_data_pos = new_data_weekly[prev_week] + new_data_weekly[bullying_week] + \
            new_data_weekly[next_week]
        pid_hash = datum[s_i.s_participant]
        n_old, n_new, nfr_dict = compare_old_new(old_data_pos, new_data_pos, old_missing,
                                                 pid_hash, ff)
        temp = [pid_hash, datum[s_i.s_id], datum[s_i.s_time], n_old, n_new,
                nfr_dict['raw'], nfr_dict['semi'], nfr_dict['ordered'], nfr_dict['other']]
        bullying_res.append(temp)
    hlp.writecsv(bullying_res, location_to_store + 'bullying_res.csv', delimiter_sym=',')
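# A minimal sketch of the __dictify helper used above, inferred from its call
# site __dictify(0, old_data_missing): key each csv row by the value in the
# given column. This is an assumption, not the original implementation, which
# may instead group several rows under one key.
def __dictify(key_idx, data):
    keyed = {}
    for row in data:
        keyed[row[key_idx]] = row
    return keyed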
def main(): parser = argparse.ArgumentParser() parser.add_argument("-d", "-D", required=True, help="the dataset") parser.add_argument("-m", "-M", required=True, help="mapping of the hashes") parser.add_argument("-f", "-F", required=True, help="folder to store the output in") parser.add_argument("-o", "-O", action="store_true", help="flag to indicate that we have the old dataset") args = parser.parse_args() dataset_file = args.d mapping_hash_file = args.m location_to_store = args.f # TODO: integrate the old dataset processing is_old = args.o new_dataset = hlp.readcsv(dataset_file, delimiter_sym=",", remove_first=True) mapping_hash = hlp.readcsv(mapping_hash_file, delimiter_sym=",", remove_first=True) pid_dict = {datum[1]: datum[0] for datum in mapping_hash} in_m, out_m, in_d, out_d = get_degree_message_count(new_dataset, pid_dict) vbt.plot_messages_degree( [in_m.values(), out_m.values()], "# of Messages", "Cumulative Participant Prob.", location_to_store + "in_out_messages.pdf", ) vbt.plot_messages_degree( [in_d.values(), out_d.values()], "Degree", "Cumulative Participant Prob.", location_to_store + "in_out_degree.pdf", True, )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '-O', required=True, help='Old dataset csv')
    parser.add_argument('-n', '-N', required=True, help='New dataset csv')
    parser.add_argument('-s', '-S', required=True, help='Survey file')
    parser.add_argument('-p', '-P', required=True,
                        help='folder to store figures in, should end with /')
    parser.add_argument('-m', '-M', required=True, help='Master hash mapping csv')
    parser.add_argument('-mt', '-MT', required=True, nargs='+',
                        help='Types of messages to look for')
    parser.add_argument('-d', '-D', action='store_true', help='Flag to debug')
    args = parser.parse_args()
    old_dataset_file = args.o
    new_dataset_file = args.n
    survey_file = args.s
    location_to_store = args.p
    master_hash_csv = args.m
    message_types = args.mt
    do_debug = args.d
    print 'Reading data...'
    master_csv = hlp.readcsv(master_hash_csv, delimiter_sym=',', remove_first=True)
    master_dict = {datum[1]: datum[0] for datum in master_csv}
    ff = filterfields()
    filtered_old = []
    filtered_new = []
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',', remove_first=True)
    print 'Filtering message types'
    for message_type in message_types:
        filtered_old.extend(ff.filterbyequality(pr.m_type, message_type, data=old_dataset))
        filtered_new.extend(ff.filterbyequality(pr.m_type, message_type, data=new_dataset))
    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(survey_file)
    week_list = weekly_info.keys()
    week_list.sort()
    print 'Creating in out dictionary'
    in_out_message_dict = get_message_counts(filtered_old, filtered_new, week_list,
                                             weekly_info, master_dict, ff,
                                             location_to_store, do_debug)
    print 'Plotting...'
    for pid in in_out_message_dict:
        print pid
        plot_distribution(in_out_message_dict[pid][0][0], in_out_message_dict[pid][0][1],
                          in_out_message_dict[pid][1][0], in_out_message_dict[pid][1][1],
                          week_list, pid, location_to_store)
    print 'TADAA!!'
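# Inferred from the plot_distribution call above (not from get_message_counts
# itself): in_out_message_dict appears to have the shape
#   {pid: ((in_x, in_y), (out_x, out_y))}
# i.e. a pair of x/y series per participant, plausibly incoming and outgoing
# weekly message counts.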
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')
    parser.add_argument('-n', '-N', required=False, nargs=2, type=int, default=[0, 2],
                        help='the neutral threshold, first value is min, second is max')
    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    neutral_limit = args.n
    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]
    afinn = afinnsenti(data=message_data, neutral_threshold=neutral_limit)
    data = afinn.compilesentiment(separate_sentiment_list=separate_sentiment,
                                  field_no=nd.m_content)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.append('score')
        message_header.append('label')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv',
                     delimiter_sym=',')
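# Example invocation (script name and paths are hypothetical):
#   python afinn_sentiment.py -m messages.csv -s afinn_polarity -f output/ -n 0 2
# Presumably scores inside the [min, max] threshold are labelled neutral; with
# -p the per-message sentiment lists are written via hlp.dumpvariable instead
# of being appended to the csv.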
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')
    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]
    vader = vadersenti(data=message_data)
    data = vader.compilesentiment(separate_sentiment_list=separate_sentiment)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.append('pos')
        message_header.append('neg')
        message_header.append('neu')
        message_header.append('compound')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv',
                     delimiter_sym=',')
def __find_positive_survey(survey_file, week_info):
    week_no = week_info.keys()
    week_no.sort()
    ff = filterfields()
    s_obj = surveys()
    survey_data = hlp.readcsv(survey_file, delimiter_sym=',')
    n_data = s_obj.interpretanswers(survey_data, True)
    bullying_positives = ff.filterbyequality(s_i.s_qno, '4', data=n_data[1:])
    new_bullying_positives = []
    for datum in bullying_positives:
        datetime_of_survey = ff.converttodate(datum[s_i.s_time])
        found_match = False
        for week in week_no:
            (start_date, end_date) = week_info[week]
            if start_date <= datetime_of_survey <= end_date:
                datum.append(week)
                new_bullying_positives.append(datum)
                found_match = True
                break
        if not found_match:
            print 'Something funky happened...', datum
            return None
    return new_bullying_positives
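# The week_info consumed above maps a week number to its date range, as the
# (start_date, end_date) destructuring shows. A hypothetical example with
# made-up dates, assuming ff.converttodate yields comparable datetime objects:
#   week_info = {1: (datetime(2014, 1, 6), datetime(2014, 1, 12)),
#                2: (datetime(2014, 1, 13), datetime(2014, 1, 19))}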
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', help='Dataset', required=True)
    parser.add_argument('-n', '-N', help='Flag to indicate new dataset', action='store_true')
    args = parser.parse_args()
    dataset_file = args.d
    new_dataset = args.n
    dataset = hlp.readcsv(dataset_file, delimiter_sym=',', remove_first=True)
    type_dict = {}
    for datum in dataset:
        m_type = datum[nd.m_type] if new_dataset else datum[pr.m_type]
        if m_type not in type_dict:
            type_dict[m_type] = 0
        type_dict[m_type] += 1
    sorted_types = type_dict.keys()
    sorted_types.sort()
    total = sum(type_dict.values())
    for keyn in sorted_types:
        print keyn + ': ' + str(type_dict[keyn])
    print 'total: ' + str(total)
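# Example invocation (script name is hypothetical):
#   python message_type_counts.py -d new_dataset.csv -n
# Prints one "<m_type>: <count>" line per message type plus a grand total.
# Omit -n for the old dataset, so rows are indexed with pr.m_type rather
# than nd.m_type.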
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled csv')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    all_afinn_data = hlp.readcsv(data_file)
    labelled_data = hlp.processafinnsentiment(all_afinn_data)
    csv_header = ['pid', 'm_type', 'in_pos', 'in_neg', 'in_neu',
                  'out_pos', 'out_neg', 'out_neu',
                  'in_deg_part', 'in_deg_nonpart', 'out_deg_part', 'out_deg_nonpart']
    pol_dist, complete_in_out = distribution_polarity(labelled_data)
    print '***For Complete Dataset***'
    print 'Incoming(P, N, U): ', complete_in_out['in']
    print 'Outgoing(P, N, U): ', complete_in_out['out']
    hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store)
    to_store_csv = [csv_header]
    for pid in pol_dist:
        pid_data = pol_dist[pid]
        for m_type in pid_data:
            m_data = pid_data[m_type]
            csv_line = __summarize_data(m_data)
            final_csv_line = [pid, m_type]
            final_csv_line.extend(csv_line)
            to_store_csv.append(final_csv_line)
    hlp.writecsv(to_store_csv, location_to_store + 'polarity_in_out.csv')
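# Inferred from its use above (not from the helper's definition):
# distribution_polarity returns a per-participant nested dict
#   pol_dist = {pid: {m_type: m_data}}
# plus dataset-wide totals complete_in_out = {'in': (P, N, U), 'out': (P, N, U)},
# where P/N/U are positive/negative/neutral counts. __summarize_data then
# flattens each m_data into the ten count columns of csv_header.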
from sentimentanalysis import sentiment
from filterByField import filterfields
from basicInfo import twitterdataset as td
from basicInfo import privateInfo as pr
import helper as hlp
import random

data = hlp.readcsv('../ignore_data/Sentiment_Twitter.csv')
data = data[1:]
ff = filterfields('../ignore_data/messages.csv')
smsdata = ff.filterbyequality(pr.m_type, 'sms')
k = len(data)
l = len(smsdata)
seed = 254
random.seed(seed)
tr_n = 1000000
ts_n = 30
idx = 0
tr_before = []
ts_before = []
while idx < tr_n:
    # random.randint is inclusive on both ends, so sample up to k - 1
    # to avoid an IndexError on data[k]
    i = random.randint(0, k - 1)
    datum = data[i]
    tweet_type = td.sentiment_dict[datum[td.sentiment]]
    tweet_content = datum[td.sentiment_text]
    tr_before.append((tweet_content, tweet_type))
    idx += 1
random.seed(seed)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '-O', help='Old Dataset', required=True)
    parser.add_argument('-n', '-N', help='New Dataset', required=True)
    parser.add_argument('-f', '-F', help='Folder to store results in, ending with /',
                        required=True)
    parser.add_argument('-p', '-P', required=True,
                        help='text file with list of people who were ordered to be removed')
    parser.add_argument('-s', '-S', required=True,
                        help='text file with list of people who were semi-consented')
    args = parser.parse_args()
    old_dataset_file = args.o
    new_dataset_file = args.n
    location_to_store = args.f
    ordered_removed_file = args.p
    semi_consented_file = args.s
    print '***Reading data from arguments...'
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',')
    new_dataset_dictionary = generate_new_dataset_dictionary(new_dataset[1:])
    new_dataset_msg_id_dictionary = generate_new_dataset_dictionary(new_dataset[1:],
                                                                    use_m_id=True)
    # each file contains a Python list literal of participants
    with open(ordered_removed_file, 'r') as f:
        ordered_removed = eval(f.read())
    with open(semi_consented_file, 'r') as f:
        semi_consented = eval(f.read())
    print '***Filtering old data within dates of study...'
    ff = filterfields()
    old_dataset_within_dates = ff.filterbetweendates(ff.converttodate(pr.start_datetime),
                                                     ff.converttodate(pr.end_datetime),
                                                     data_to_work=old_dataset,
                                                     right_equality=True,
                                                     date_field=pr.m_time_sent)
    old_dataset = old_dataset_within_dates
    old_dataset_counts = {}
    for datum in old_dataset:
        m_type = datum[pr.m_type]
        if m_type not in old_dataset_counts:
            old_dataset_counts[m_type] = 0
        old_dataset_counts[m_type] += 1
    print '*** OLD DATASET COUNTS***', old_dataset_counts
    print '***Finding mapping...'
    mapping_dict = {}
    inverted_mapping_dict = {}
    missed_dict = {}
    no_reason = []
    m_type_names = ['sms', 'fb_message', 'twitter_status', 'twitter_message',
                    'fb_activity', 'fb_like', 'fb_comment']
    counts_no_match = {'ord': {name: 0 for name in m_type_names},
                       'semi': {name: 0 for name in m_type_names},
                       'no': {name: 0 for name in m_type_names}}
    counts_match = {name: 0 for name in m_type_names}
    no_reason_counts = {}
    for datum in old_dataset:
        m_result, msg_val = message_exists(datum, new_dataset_dictionary, ff)
        if m_result:
            mapping_dict[datum[pr.msg_id]] = msg_val
            if msg_val[1] not in inverted_mapping_dict:
                inverted_mapping_dict[msg_val[1]] = []
            inverted_mapping_dict[msg_val[1]].append(datum[pr.msg_id])
            m_type = datum[pr.m_type]
            if m_type in counts_match:
                counts_match[m_type] += 1
        else:
            src = datum[pr.m_source]
            trg = datum[pr.m_target]
            m_type = datum[pr.m_type]
            if src in ordered_removed or trg in ordered_removed:
                reason = 'ordered removed'
                if m_type in counts_no_match['ord']:
                    counts_no_match['ord'][m_type] += 1
            elif src in semi_consented or trg in semi_consented:
                reason = 'semi consented'
                if m_type in counts_no_match['semi']:
                    counts_no_match['semi'][m_type] += 1
            else:
                reason = ''
                temp = datum
                temp.append(msg_val)
                no_reason.append(temp)
                if m_type in counts_no_match['no']:
                    counts_no_match['no'][m_type] += 1
                if m_type not in no_reason_counts:
                    no_reason_counts[m_type] = {}
                if msg_val not in no_reason_counts[m_type]:
                    no_reason_counts[m_type][msg_val] = 0
                no_reason_counts[m_type][msg_val] += 1
            missed_dict[datum[pr.msg_id]] = [msg_val, datum[pr.m_type], reason]
    print '\n\n**NOT FOUND**'
    for key_v in counts_no_match:
        print key_v
        print counts_no_match[key_v]
    print '\n\n**NO REASON**'
    for key_v in no_reason_counts:
        print key_v
        print no_reason_counts[key_v]
    print '\n\n**FOUND**', counts_match
    print '***Creating new dataset with mappings...'
    new_dataset_header = new_dataset[0]
    new_dataset_header.extend(['Old Message IDs'])
    final_dataset = [new_dataset_header]
    for new_msg_id in new_dataset_msg_id_dictionary:
        datum = new_dataset_msg_id_dictionary[new_msg_id]
        old_msg_id = [''] if new_msg_id not in inverted_mapping_dict else \
            inverted_mapping_dict[new_msg_id]
        datum.extend(old_msg_id)
        final_dataset.append(datum)
    print '***Writing data...'
    hlp.writecsv(final_dataset, location_to_store + 'new_old_mapped_hashed_dataset.csv',
                 delimiter_sym=',')
    mapping_dict_list = [[x, mapping_dict[x][0], mapping_dict[x][1]] for x in mapping_dict]
    mapping_header = [['old_id', 'cosine_val', 'new_id']]
    mapping_header.extend(mapping_dict_list)
    hlp.writecsv(mapping_header, location_to_store + 'old_to_new_mapping.csv',
                 delimiter_sym=',')
    missed_dict_list = [[x, missed_dict[x][0], missed_dict[x][1], missed_dict[x][2]]
                        for x in missed_dict]
    # note: the 'Reason' column carries msg_val; the reason string lands in 'Explanation'
    missed_header = [['old_id', 'Reason', 'm_type', 'Explanation']]
    missed_header.extend(missed_dict_list)
    hlp.writecsv(missed_header, location_to_store + 'old_not_found.csv', delimiter_sym=',')
    hlp.writecsv(no_reason, location_to_store + 'old_not_found_no_reason.csv',
                 delimiter_sym=',')
    print 'TADAA!!!'
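# Inferred from the old_to_new_mapping.csv written above (not from
# message_exists itself): on a match, msg_val is (cosine_val, new_id), i.e. a
# text-similarity score and the id of the matched new-dataset message; on a
# miss, msg_val appears to carry a diagnostic value that no_reason_counts
# tallies per message type.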