def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True, help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True, help='folder to store the files in, ending with /')
    parser.add_argument('-n', '-N', required=False, nargs=2, type=int, default=[0, 2],
                        help='the neutral threshold, first value is min, second is max')
    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    neutral_limit = args.n

    # first row of the csv is the header
    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    # score and label every message with AFINN
    afinn = afinnsenti(data=message_data, neutral_threshold=neutral_limit)
    data = afinn.compilesentiment(separate_sentiment_list=separate_sentiment, field_no=nd.m_content)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.append('score')
        message_header.append('label')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
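# The afinnsenti class above is project-specific. As a point of reference, a
# minimal sketch of its neutral-threshold labeling rule, written against the
# public `afinn` package; label_message and the 'pos'/'neg'/'neu' labels are
# assumptions here, not the project's actual API.
from afinn import Afinn

def label_message(text, neutral_min=0, neutral_max=2):
    # Afinn.score sums per-word valence ratings over the text
    score = Afinn().score(text)
    if neutral_min <= score < neutral_max:
        return score, 'neu'
    return score, 'pos' if score >= neutral_max else 'neg'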
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True, help='Message file')
    parser.add_argument('-p', '-P', type=str, required=True, help='pid_dict')
    parser.add_argument('-o', '-O', type=str, required=True, help='output folder')
    parser.add_argument('-of', '-OF', type=str, required=True, help='output file')
    args = parser.parse_args()

    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)
    output_folder = args.o
    output_file = args.of

    edge_dict = getedgecount(message_data, pid_dict)
    edge_list = converttolist(edge_dict)
    hlp.writecsv([['source', 'target', 'pos', 'neu', 'neg']] + edge_list, output_folder + output_file)
    hlp.writecsv([['PID', 'Coded ID']] + convertpiddicttolist(pid_dict), output_folder + 'pid_dict_list.csv')
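# getedgecount/converttolist are project helpers. A plausible sketch of
# converttolist, assuming getedgecount returns
# {(source, target): {'pos': n, 'neu': n, 'neg': n}} to match the csv header
# written above; the real helper may differ.
def converttolist(edge_dict):
    edge_list = []
    for (source, target), counts in edge_dict.items():
        edge_list.append([source, target, counts['pos'], counts['neu'], counts['neg']])
    return edge_list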
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True, help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True, help='folder to store the files in, ending with /')
    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    vader = vadersenti(data=message_data)
    data = vader.compilesentiment(separate_sentiment_list=separate_sentiment)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.append('pos')
        message_header.append('neg')
        message_header.append('neu')
        message_header.append('compound')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
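# vadersenti is a project wrapper. A minimal sketch of the per-message scoring
# it implies, using the public vaderSentiment package; score_messages and
# content_field are illustrative names, not the project's API.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def score_messages(rows, content_field):
    analyzer = SentimentIntensityAnalyzer()
    scored = []
    for row in rows:
        # polarity_scores returns {'pos': .., 'neg': .., 'neu': .., 'compound': ..}
        s = analyzer.polarity_scores(row[content_field])
        scored.append(row + [s['pos'], s['neg'], s['neu'], s['compound']])
    return scored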
def main():
    parser = __define_process_parser()
    old_dataset_file, new_dataset_mapped, missing_data, \
        survey_file, location_to_store = __define_process_parser(True, parser)

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_mapped, delimiter_sym=',', remove_first=True)
    old_data_missing = hlp.readcsv(missing_data, delimiter_sym=',', remove_first=True)
    old_missing = __dictify(0, old_data_missing)

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    week_list = week_info.keys()

    bullying_positives = __find_positive_survey(survey_file, week_info)
    if bullying_positives is None:
        print 'Exiting...'
        exit()

    ff = filterfields()
    old_data_weekly = hlp.divideintoweekly(old_dataset, week_info, ff, date_field=pr.m_time_sent)
    new_data_weekly = hlp.divideintoweekly(new_dataset, week_info, ff, date_field=nd.m_timecreated)

    bullying_res = [['pid_hash', 'survey_id', 'time_of_survey', 'n_old', 'n_new',
                     'raw', 'semi', 'ordered', 'other']]
    for datum in bullying_positives:
        # compare the week of the positive survey together with the weeks on
        # either side, clamped to the bounds of the study period
        bullying_week = datum[-1]
        prev_week = bullying_week - 1 if bullying_week > min(week_list) else min(week_list)
        next_week = bullying_week + 1 if bullying_week < max(week_list) else max(week_list)
        old_data_pos = old_data_weekly[prev_week] + old_data_weekly[bullying_week] + old_data_weekly[next_week]
        new_data_pos = new_data_weekly[prev_week] + new_data_weekly[bullying_week] + new_data_weekly[next_week]
        pid_hash = datum[s_i.s_participant]
        n_old, n_new, nfr_dict = compare_old_new(old_data_pos, new_data_pos, old_missing, pid_hash, ff)
        temp = [pid_hash, datum[s_i.s_id], datum[s_i.s_time], n_old, n_new,
                nfr_dict['raw'], nfr_dict['semi'], nfr_dict['ordered'], nfr_dict['other']]
        bullying_res.append(temp)

    hlp.writecsv(bullying_res, location_to_store + 'bullying_res.csv', delimiter_sym=',')
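# hlp.divideintoweekly (used here and in later scripts) buckets rows by week.
# A plausible sketch, assuming week_info maps a week number to the collection
# of dates that week covers; the real helper may differ.
def divideintoweekly(dataset, week_info, ff, date_field=0):
    weekly = dict((week_no, []) for week_no in week_info)
    for row in dataset:
        row_date = ff.converttodate(row[date_field])
        for week_no in week_info:
            if row_date in week_info[week_no]:
                weekly[week_no].append(row)
                break
    return weekly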
def analyze_polarity(polarity_dict, pid_dict, location_to_store, filename):
    final_csv_data = [['source', 'target', 'Pos', 'Neg', 'Neu']]
    print '++++ POLARITY INFORMATION ++++'
    type_abr = {'participants': 'P', 'nonparticipants': 'NP'}
    for pid in polarity_dict:
        src_new = hlp.getpid(pid_dict, pid)
        print '**** For PID: P' + str(src_new) + ' ****'
        for (src, trg) in polarity_dict[pid]:
            trg_new, trg_type = hlp.getpid(pid_dict, trg, return_p_type=True)
            print '(P' + str(src_new) + ',' + type_abr[trg_type] + str(trg_new) + ')'
            temp = ['P' + str(src_new), type_abr[trg_type] + str(trg_new)]
            for polarity in polarity_dict[pid][(src, trg)]:
                temp.append(polarity)
            final_csv_data.append(temp)
    hlp.writecsv(final_csv_data, location_to_store + filename, delimiter_sym=',')
def analyze_info(reciprocity_dict, pid_dict, location_to_store, filename):
    final_csv_data = []
    for pid in reciprocity_dict:
        pid_new = hlp.getpid(pid_dict, pid)
        print '**** For PID: P' + str(pid_new) + ' ****'
        pid_data = reciprocity_dict[pid]
        for target_type in pid_data:
            target_data = pid_data[target_type]
            for target_pid in target_data:
                target_pid_new = hlp.getpid(pid_dict, target_pid)
                print 'P' + str(pid_new) + '-' + target_type + str(target_pid_new)
                temp = get_reci_probability(target_data[target_pid], True)
                starting_data = ['P' + str(pid_new), target_type + str(target_pid_new)]
                starting_data.extend(temp)
                final_csv_data.append(starting_data)
    header = ['source', 'target',
              'Pr(+|+)', 'Pr(-|+)', 'Pr(U|+)', 'Pr(X|+)',
              'Pr(+|U)', 'Pr(-|U)', 'Pr(U|U)', 'Pr(X|U)',
              'Pr(+|-)', 'Pr(-|-)', 'Pr(U|-)', 'Pr(X|-)']
    toWrite = [header] + final_csv_data
    hlp.writecsv(toWrite, location_to_store + filename, delimiter_sym=',')
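# The Pr(reply|sent) header above implies a conditional distribution over
# reply polarities. A hedged sketch of get_reci_probability, assuming each
# exchange is a (sent_polarity, reply_polarity) pair with polarities drawn
# from {'+', '-', 'U'} and 'X' meaning no reply; the real implementation may
# differ.
def get_reci_probability(exchanges, as_list=True):
    counts = dict((s, {'+': 0, '-': 0, 'U': 0, 'X': 0}) for s in ('+', 'U', '-'))
    for sent, reply in exchanges:
        counts[sent][reply] += 1
    probs = []
    for sent in ('+', 'U', '-'):  # column order matches the header
        total = float(sum(counts[sent].values()))
        for reply in ('+', '-', 'U', 'X'):
            probs.append(counts[sent][reply] / total if total else 0.0)
    return probs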
def generatetablehist(self, input_dictionary, file_path, generate_totals=False, bin_dist=-1):
    max_value = float('-inf')
    bin_dist = 5 if -1 == bin_dist else bin_dist
    for key in input_dictionary.keys():
        max_value = max(input_dictionary[key]) if max(input_dictionary[key]) > max_value else max_value
    # round the largest observed value up to the nearest bin boundary;
    # float(bin_dist) avoids Python 2 integer division truncating the ceiling
    max_value = int(ceil(max_value / float(bin_dist)) * bin_dist)
    bins = range(0, max_value + 1, bin_dist)
    print 'bins: ', bins, 'max:', max_value

    first_line = ['category']
    for idx in range(1, len(bins)):
        first_line.append(str(bins[idx - 1]) + ' to ' + str(bins[idx]))
    if generate_totals:
        first_line.append('no. of people')

    final_to_write = [first_line]
    for key in input_dictionary.keys():
        h, be = np.histogram(input_dictionary[key], bins, range=(0, max_value))
        # cast to float: np.histogram returns integer counts, and under
        # Python 2 integer division would zero out the fractions
        h = h / float(sum(h))
        if generate_totals:
            final_to_write.append([key] + h.tolist() + [len(input_dictionary[key])])
        else:
            final_to_write.append([key] + h.tolist())
    hlp.writecsv(final_to_write, file_path)
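# Quick sanity check of the normalization above: np.histogram returns integer
# counts, so without the float cast Python 2 integer division would zero out
# every fraction.
import numpy as np

counts, _ = np.histogram([1, 3, 7, 8, 12], bins=range(0, 16, 5))  # -> [2, 2, 1]
fractions = counts / float(sum(counts))                           # -> [0.4, 0.4, 0.2]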
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled csv')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f

    all_afinn_data = hlp.readcsv(data_file)
    labelled_data = hlp.processafinnsentiment(all_afinn_data)
    csv_header = ['pid', 'm_type', 'in_pos', 'in_neg', 'in_neu', 'out_pos', 'out_neg', 'out_neu',
                  'in_deg_part', 'in_deg_nonpart', 'out_deg_part', 'out_deg_nonpart']
    pol_dist, complete_in_out = distribution_polarity(labelled_data)
    print '***For Complete Dataset***'
    print 'Incoming(P, N, U): ', complete_in_out['in']
    print 'Outgoing(P, N, U): ', complete_in_out['out']
    hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store)

    to_store_csv = [csv_header]
    for pid in pol_dist:
        pid_data = pol_dist[pid]
        for m_type in pid_data:
            m_data = pid_data[m_type]
            csv_line = __summarize_data(m_data)
            final_csv_line = [pid, m_type]
            final_csv_line.extend(csv_line)
            to_store_csv.append(final_csv_line)
    hlp.writecsv(to_store_csv, location_to_store + 'polarity_in_out.csv')
def main(sql_path, variable_path):
    s_obj = surveys()
    data = s_obj.importsqlascsv(sql_path, 'survey')
    hlp.dumpvariable(data, 'survey_list.list', variable_path)
    hlp.writecsv(data, variable_path + 'survey_list.csv')

    ndata = s_obj.interpretanswers(data)
    hlp.dumpvariable(ndata, 'survey_list_interpret.list', variable_path)
    hlp.writecsv(ndata, variable_path + 'survey_list_interpret.csv')

    ndata_wR = s_obj.interpretanswers(data, True)
    hlp.dumpvariable(ndata_wR, 'survey_list_with_response_interpret.list', variable_path)
    hlp.writecsv(ndata_wR, variable_path + 'survey_list_with_response_interpret.csv')

    data_dict = s_obj.datatodict(ndata)
    hlp.dumpvariable(data_dict, 'survey_dict_interpret.dict', variable_path)
    data_wR_dict = s_obj.datatodict(ndata_wR)
    hlp.dumpvariable(data_wR_dict, 'survey_dict_with_response_interpret.dict', variable_path)
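# Every script here routes I/O through the shared hlp module. A minimal
# sketch of the four helpers they rely on, assuming plain csv and pickle
# semantics; the real module may differ.
import csv
import pickle

def readcsv(path, delimiter_sym=',', remove_first=False):
    with open(path, 'rb') as f:  # 'rb' for the Python 2 csv module
        rows = [row for row in csv.reader(f, delimiter=delimiter_sym)]
    return rows[1:] if remove_first else rows

def writecsv(rows, path, delimiter_sym=','):
    with open(path, 'wb') as f:
        csv.writer(f, delimiter=delimiter_sym).writerows(rows)

def dumpvariable(variable, filename, folder):
    with open(folder + filename, 'wb') as f:
        pickle.dump(variable, f)

def recovervariable(path):
    with open(path, 'rb') as f:
        return pickle.load(f)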
def main():
    parser = argparse.ArgumentParser('Script to perform sentiment analysis using VADER')
    parser.add_argument('-m', '-M', type=str, required=True, help='Location of the message file')
    parser.add_argument('-mt', '-MT', type=str, required=True, nargs='+',
                        help='types of messages to filter')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename where data is stored, no extension needed')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='location of folder to store the file, ends with a /')
    parser.add_argument('-p', '-P', action='store_true', help='flag to store polarities separately')
    parser.add_argument('-w', '-W', type=str, required=False,
                        help='conduct weekly analysis, path to the survey data for '
                             'creating week information')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-lf', '-LF', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, '
                             'in same order as -l/L flag')
    args = parser.parse_args()
    message_file = args.m
    message_types = args.mt
    filename_to_store = args.f
    location_to_store = args.s
    separate_polarity_score = args.p
    survey_file = args.w
    filters_chosen = args.l
    filter_files = args.lf

    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)
    if separate_polarity_score and survey_file is not None:
        print 'Cannot have separate polarity scores and weekly analysis together, ' \
              'please remove the -p/-P flag'
        return
    if survey_file is not None:
        wi = weeklyinfo()
        week_dates = wi.getweeklyfo(survey_file)
        gh = ghelper()

    # keep only the requested message types
    ff = filterfields(message_file)
    data = []
    for message_type in message_types:
        data.extend(ff.filterbyequality(pr.m_type, message_type))
    pid_dict = hlp.getuniqueparticipants(data, 'all' if len(message_types) > 1 else message_types[0])

    sentiment_analyzer = vadersenti(data[1:])
    returned_data = sentiment_analyzer.compilesentiment(pr.m_content,
                                                        separate_sentiment_list=separate_polarity_score)
    if separate_polarity_score:
        hlp.dumpvariable(returned_data, filename_to_store + '.data', location_to_store)
    else:
        header = pr.message_header + ['pos', 'neg', 'neu', 'compound']
        final_data = [header] + returned_data
        hlp.writecsv(final_data, location_to_store + filename_to_store + '.csv')
        # weekly analysis only applies when a survey file was supplied
        if survey_file is not None:
            weekly_data = gh.filterweeklydata(pid_dict, returned_data, week_dates,
                                              'all' if len(message_types) > 1 else message_types[0])
            hlp.dumpvariable(weekly_data, 'weekly_data.dict', location_to_store)
            # summarize sentiment per participant, per week
            summarized_sentiment = {}
            for pid in weekly_data:
                summarized_sentiment[pid] = {}
                participant_data = weekly_data[pid]
                for week_no in participant_data:
                    summarized_sentiment[pid][week_no] = \
                        sentiment_analyzer.summarizesentiment(participant_data[week_no],
                                                              separate_in_out=True,
                                                              message_type=message_type)
            hlp.dumpvariable(summarized_sentiment, 'weekly_summarized_sentiment.dict', location_to_store)
            plt = plots()
            overlay_data = gh.createbullyingoverlay(catch_all_data, week_dates, ff)
            plt.plotweeklyprogression(summarized_sentiment, location_to_store, 'Sentiment Progress',
                                      'Week', 'Sentiment Value',
                                      sentiment_legend=['Positive', 'Negative', 'Neutral'],
                                      overlay_data=overlay_data)
    print 'done'
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '-O', help='Old Dataset', required=True)
    parser.add_argument('-n', '-N', help='New Dataset', required=True)
    parser.add_argument('-f', '-F', help='Folder to store results in, ending with /', required=True)
    parser.add_argument('-p', '-P', help='text file with list of people who were ordered to be removed',
                        required=True)
    parser.add_argument('-s', '-S', help='text file with list of people who were semi-consented',
                        required=True)
    args = parser.parse_args()
    old_dataset_file = args.o
    new_dataset_file = args.n
    location_to_store = args.f
    ordered_removed_file = args.p
    semi_consented_file = args.s

    print '***Reading data from arguments...'
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',')
    new_dataset_dictionary = generate_new_dataset_dictionary(new_dataset[1:])
    new_dataset_msg_id_dictionary = generate_new_dataset_dictionary(new_dataset[1:], use_m_id=True)
    # the two files hold Python literals; eval assumes they are trusted input
    with open(ordered_removed_file, 'r') as f:
        ordered_removed = eval(f.read())
    with open(semi_consented_file, 'r') as f:
        semi_consented = eval(f.read())

    print '***Filtering old data within dates of study...'
    ff = filterfields()
    old_dataset_within_dates = ff.filterbetweendates(ff.converttodate(pr.start_datetime),
                                                     ff.converttodate(pr.end_datetime),
                                                     data_to_work=old_dataset, right_equality=True,
                                                     date_field=pr.m_time_sent)
    old_dataset = old_dataset_within_dates

    old_dataset_counts = {}
    for datum in old_dataset:
        m_type = datum[pr.m_type]
        if m_type not in old_dataset_counts:
            old_dataset_counts[m_type] = 0
        old_dataset_counts[m_type] += 1
    print '*** OLD DATASET COUNTS***', old_dataset_counts

    print '***Finding mapping...'
    mapping_dict = {}
    inverted_mapping_dict = {}
    missed_dict = {}
    no_reason = []
    m_types = ['sms', 'fb_message', 'twitter_status', 'twitter_message',
               'fb_activity', 'fb_like', 'fb_comment']
    counts_no_match = {'ord': {m: 0 for m in m_types},
                       'semi': {m: 0 for m in m_types},
                       'no': {m: 0 for m in m_types}}
    counts_match = {m: 0 for m in m_types}
    no_reason_counts = {}

    for datum in old_dataset:
        m_result, msg_val = message_exists(datum, new_dataset_dictionary, ff)
        if m_result:
            # matched: record the old -> new mapping and its inverse
            mapping_dict[datum[pr.msg_id]] = msg_val
            if msg_val[1] not in inverted_mapping_dict:
                inverted_mapping_dict[msg_val[1]] = []
            inverted_mapping_dict[msg_val[1]].append(datum[pr.msg_id])
            m_type = datum[pr.m_type]
            if m_type in counts_match:
                counts_match[m_type] += 1
        else:
            # unmatched: attribute a reason where one exists
            src = datum[pr.m_source]
            trg = datum[pr.m_target]
            m_type = datum[pr.m_type]
            if src in ordered_removed or trg in ordered_removed:
                reason = 'ordered removed'
                if m_type in counts_no_match['ord']:
                    counts_no_match['ord'][m_type] += 1
            elif src in semi_consented or trg in semi_consented:
                reason = 'semi consented'
                if m_type in counts_no_match['semi']:
                    counts_no_match['semi'][m_type] += 1
            else:
                reason = ''
                temp = datum
                temp.append(msg_val)
                no_reason.append(temp)
                if m_type in counts_no_match['no']:
                    counts_no_match['no'][m_type] += 1
                if m_type not in no_reason_counts.keys():
                    no_reason_counts[m_type] = {}
                if msg_val not in no_reason_counts[m_type].keys():
                    no_reason_counts[m_type][msg_val] = 0
                no_reason_counts[m_type][msg_val] += 1
            missed_dict[datum[pr.msg_id]] = [msg_val, datum[pr.m_type], reason]

    print '\n\n**NOT FOUND**'
    for key_v in counts_no_match.keys():
        print key_v
        print counts_no_match[key_v]
    print '\n\n**NO REASON**'
    for key_v in no_reason_counts.keys():
        print key_v
        print no_reason_counts[key_v]
    print '\n\n**FOUND**', counts_match

    print '***Creating new dataset with mappings...'
    new_dataset_header = new_dataset[0]
    new_dataset_header.extend(['Old Message IDs'])
    final_dataset = [new_dataset_header]
    for new_msg_id in new_dataset_msg_id_dictionary.keys():
        datum = new_dataset_msg_id_dictionary[new_msg_id]
        old_msg_id = [''] if new_msg_id not in inverted_mapping_dict else inverted_mapping_dict[new_msg_id]
        datum.extend(old_msg_id)
        final_dataset.append(datum)

    print '***Writing data...'
    hlp.writecsv(final_dataset, location_to_store + 'new_old_mapped_hashed_dataset.csv', delimiter_sym=',')
    mapping_dict_list = [[x, mapping_dict[x][0], mapping_dict[x][1]] for x in mapping_dict]
    mapping_header = [['old_id', 'cosine_val', 'new_id']]
    mapping_header.extend(mapping_dict_list)
    hlp.writecsv(mapping_header, location_to_store + 'old_to_new_mapping.csv', delimiter_sym=',')
    missed_dict_list = [[x, missed_dict[x][0], missed_dict[x][1], missed_dict[x][2]] for x in missed_dict]
    missed_header = [['old_id', 'Reason', 'm_type', 'Explanation']]
    missed_header.extend(missed_dict_list)
    hlp.writecsv(missed_header, location_to_store + 'old_not_found.csv', delimiter_sym=',')
    hlp.writecsv(no_reason, location_to_store + 'old_not_found_no_reason.csv', delimiter_sym=',')
    print 'TADAA!!!'
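# message_exists returns a match flag plus (cosine_val, new_id), per the
# old_to_new_mapping.csv header above. A hedged sketch of the underlying
# similarity, assuming a simple bag-of-words cosine; the real matcher may use
# different features.
import math
from collections import Counter

def cosine_similarity(text_a, text_b):
    vec_a, vec_b = Counter(text_a.split()), Counter(text_b.split())
    dot = sum(vec_a[w] * vec_b[w] for w in set(vec_a) & set(vec_b))
    norm = math.sqrt(sum(c * c for c in vec_a.values())) * \
           math.sqrt(sum(c * c for c in vec_b.values()))
    return dot / norm if norm else 0.0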
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True, help='Message list file')
    parser.add_argument('-r', '-R', type=str, required=True, help='survey file')
    parser.add_argument('-p', '-P', type=str, required=True, help='PID dict inverted')
    parser.add_argument('-b', '-B', type=str, required=True, help='bullying dictionary')
    parser.add_argument('-o', '-O', type=str, required=True, help='Output folder')
    parser.add_argument('-l', '-L', type=str, nargs='+', help='Filters chosen')
    parser.add_argument('-f', '-F', type=str, nargs='+', help='Filter files')
    args = parser.parse_args()

    output_folder = args.o
    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)
    filters_chosen = args.l
    filter_files = args.f
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(args.r)
    ff = filterfields()
    gh = ghelper()
    bullying_overlay = gh.createbullyingoverlay(catch_all_data, weekly_info, ff)
    bullying_overlay = flip_bullying_overlay(bullying_overlay, weekly_info.keys())

    pid_list = pid_dict.keys()
    pid_list.sort()
    # leave-one-participant-out: each pid in turn becomes the test subject
    for pid in pid_list:
        training_set_final = []
        testing_set_final = []
        pid_list_training = deepcopy(pid_list)
        pid_list_training.remove(pid)

        # everything involving the held-out pid goes to the test set
        ff.setdata(message_data)
        testing_raw_data = ff.filterbyequality(pr.m_source, pid_dict[pid]) + \
            ff.filterbyequality(pr.m_target, pid_dict[pid])
        ff.removebyequality(pr.m_source, pid_dict[pid])
        ff.removebyequality(pr.m_target, pid_dict[pid])
        training_raw_data = ff.getdata()

        fe = raw_features(data=None)
        _, _ = fe.get_scoring_factors(training_raw_data)

        training_weekly_data = {}
        for training_pid in pid_list_training:
            training_weekly_data[training_pid] = {}
            data_to_use = ff.filterbyequality(pr.m_source, pid_dict[training_pid]) + \
                ff.filterbyequality(pr.m_target, pid_dict[training_pid])
            if 0 == len(data_to_use):
                print 'no data found, probably filtered into the testing set, Training PID: ' + \
                      training_pid + ', Testing PID: ' + pid
                continue
            pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, \
                incoming_ss, outgoing_ss = get_pid_level_features(data_to_use, weekly_info, ff,
                                                                  bullying_overlay, pid_dict,
                                                                  training_pid, fe)
            for week_no in pid_weekly_w_bullying:
                fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew, fr_in_senti, fr_out_senti, \
                    current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                      global_in_degree, global_out_degree,
                                                                      global_in_ew, global_out_ew,
                                                                      incoming_ss, outgoing_ss,
                                                                      pid_dict[training_pid])
                training_set_final.append(
                    [training_pid, week_no,
                     fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                     fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                     fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                     current_in_ss, current_out_ss,
                     pid_weekly_w_bullying[week_no]['label']])

        # testing pid
        pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, \
            incoming_ss, outgoing_ss = get_pid_level_features(testing_raw_data, weekly_info, ff,
                                                              bullying_overlay, pid_dict, pid, fe)
        for week_no in pid_weekly_w_bullying:
            fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew, fr_in_senti, fr_out_senti, \
                current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                  global_in_degree, global_out_degree,
                                                                  global_in_ew, global_out_ew,
                                                                  incoming_ss, outgoing_ss, pid_dict[pid])
            testing_set_final.append(
                [pid, week_no,
                 fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                 fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                 fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                 current_in_ss, current_out_ss,
                 pid_weekly_w_bullying[week_no]['label']])

        header = ['pid', 'wkno', 'frWInSenPos', 'frWInSenNeu', 'frWInSenNeg',
                  'frWOutSenPos', 'frWOutSenNeu', 'frWOutSenNeg',
                  'frInDegO', 'frOutDegO', 'frInEdgeO', 'frOutEdgeO',
                  'inSenSc', 'outSenSc', 'label']
        training_set_final = [header] + training_set_final
        testing_set_final = [header] + testing_set_final
        hlp.writecsv(training_set_final, output_folder + pid + '_tr.csv')
        hlp.writecsv(testing_set_final, output_folder + pid + '_ts.csv')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Sentiment Message file')
    parser.add_argument('-t', '-T', action='store_true',
                        help='Sentiment type flag, if used then vader, else afinn')
    parser.add_argument('-f', '-F', required=True, help='Folder to store checkpoints, and final result')
    parser.add_argument('-w', '-W', required=False, help='Per week/month analysis')
    args = parser.parse_args()
    message_file = args.m
    sentiment_type = args.t
    location_to_store = args.f
    survey_file = args.w

    # get message data, only sms and fb_message
    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])
    sms_data = ff.filterbyequality(pr.m_type, 'sms')
    pid_dict_sms = hlp.getuniqueparticipants2(sms_data)
    fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message')
    pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data)
    message_data = sms_data + fb_message_data

    # put the labels on
    labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \
        hlp.processafinnsentiment(message_data, label_only=False)
    if survey_file is not None:
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(survey_file)
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        # __temp_testing_for_discrepancy(labelled_data, weekly_data)

    # get the pid_dict for easier handling
    pid_dict = hlp.getuniqueparticipants2(labelled_data)
    if survey_file is not None:
        over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data,
                                                                               pid_dict['participants'])
        __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_over.pdf')
        __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_in.pdf')
        __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_out.pdf')

    print '***SMS***'
    print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values())
    print '***FB***'
    print 'P: ', len(pid_dict_fb['participants'].values()), 'NP: ', len(pid_dict_fb['nonparticipants'].values())
    print '***OVERALL***'
    print 'P: ', len(pid_dict['participants'].values()), 'NP: ', len(pid_dict['nonparticipants'].values())

    summary_src_trg = summarize_message_by_src_trg(labelled_data)
    print '***Message Distribution***'
    for m_type_1 in summary_src_trg:
        print m_type_1, summary_src_trg[m_type_1]

    if survey_file is not None:
        week_list = weekly_data.keys()
        week_list.sort()
        # this is not good, as there aren't enough triads
        months = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16],
                  [17, 18, 19, 20], [21, 22, 23, 24, 25]]
        # this has at least 8 triads, always, use this
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        month_idx = 1
        for month in months2:
            labelled_data = []
            for week in month:
                labelled_data.extend(weekly_data[week])
            general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
            frac_triad = general_graph[3]
            summary_triad = general_graph[2]
            frac_triad_rand = random_graph[3]
            summary_triad_rand = random_graph[2]
            print '** Months ', 2 * month_idx - 1, 2 * month_idx, ': ', month, ' ***'
            print 'len(LD): ', len(labelled_data)
            for summary in frac_triad:
                print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', \
                    ' Random: ', frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
            words_list, short_list = word_count(labelled_data)
            toWrite_wl_csv = create_word_count_csv(words_list)
            hlp.writecsv(toWrite_wl_csv,
                         location_to_store + 'word_list_' + str(2 * month_idx - 1) + '-' +
                         str(2 * month_idx) + '.csv',
                         delimiter_sym=',')
            for mtype in words_list:
                counted_words = Counter(words_list[mtype])
                counted_short = Counter(short_list[mtype])
                print '***For ' + mtype + ' ***'
                print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
                print 'Top 20 short: ', counted_short.most_common(20)
                print '\n\n'
            hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict],
                             'month_' + str(month_idx) + '.list', location_to_store)
            month_idx += 1
    else:
        print 'len(LD): ', len(labelled_data)
        words_list, short_list = word_count(labelled_data)
        toWrite_wl_csv = create_word_count_csv(words_list)
        hlp.writecsv(toWrite_wl_csv, location_to_store + 'word_list.csv', delimiter_sym=',')
        for mtype in words_list:
            counted_words = Counter(words_list[mtype])
            counted_short = Counter(short_list[mtype])
            print '***For ' + mtype + ' ***'
            print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
            print 'Top 20 short: ', counted_short.most_common(20)
            print '\n\n'
        general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
        frac_triad = general_graph[3]
        summary_triad = general_graph[2]
        frac_triad_rand = random_graph[3]
        summary_triad_rand = random_graph[2]
        for summary in frac_triad:
            print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', \
                ' Random: ', frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
        hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict],
                         'Overall.list', location_to_store)

    # plot_degree_dist(general_graph[4], 'Degree(d)', '# of Participants with Degree d')
    pos, neg, neu = get_polarity_directionality(labelled_data)
    print '***Polarity Distribution***'
    print 'Positive: \n', pos
    print 'Negative: \n', neg
    print 'Neutral: \n', neu

    in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants'])
    print '***Incoming Messages***'
    print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m)
    print '***Outgoing Messages***'
    print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m)
    print '***In Degree***'
    print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d)
    print '***Out Degree***'
    print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d)
    print '***COUNTS***'

    plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.',
                         location_to_store + 'in_out_messages.pdf')
    # plot_messages_degree(out_m, '# of Outgoing Messages', 'Cumulative Participant Prob.',
    #                      location_to_store+'out_messages.pdf')
    plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.',
                         location_to_store + 'in_out_degree.pdf', True)
    # plot_messages_degree(out_d, 'Out Degree', 'Cumulative Participant Prob.',
    #                      location_to_store+'out_degree.pdf', True)
    print 'TADAA!!'
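# conduct_triad_analysis compares triad fractions in the study graph against a
# random baseline. A hedged sketch of that comparison using networkx's
# triadic_census; the project's own census and random-graph construction may
# differ.
import networkx as nx

def triad_fractions(edges):
    g = nx.DiGraph()
    g.add_edges_from(edges)
    census = nx.triadic_census(g)  # 16 triad types -> counts
    total = float(sum(census.values()))
    return dict((t, c / total) for t, c in census.items()) if total else census

observed = triad_fractions([('P1', 'P2'), ('P2', 'P1'), ('P2', 'P3')])
baseline = triad_fractions(nx.gnp_random_graph(3, 0.5, directed=True).edges())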