def main(): parser = argparse.ArgumentParser() parser.add_argument('-d', '-D', required=True, help='labelled csv') parser.add_argument('-f', '-F', required=True, help='folder to save the data in') args = parser.parse_args() data_file = args.d location_to_store = args.f all_afinn_data = hlp.readcsv(data_file) labelled_data = hlp.processafinnsentiment(all_afinn_data) csv_header = ['pid', 'm_type', 'in_pos', 'in_neg', 'in_neu', 'out_pos', 'out_neg', 'out_neu', 'in_deg_part', 'in_deg_nonpart', 'out_deg_part', 'out_deg_nonpart'] pol_dist, complete_in_out = distribution_polarity(labelled_data) print '***For Complete Dataset***' print 'Incoming(P, N, U): ', complete_in_out['in'] print 'Outgoing(P, N, U): ', complete_in_out['out'] hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store) to_store_csv = [csv_header] for pid in pol_dist: pid_data = pol_dist[pid] for m_type in pid_data: m_data = pid_data[m_type] csv_line = __summarize_data(m_data) final_csv_line = [pid, m_type] final_csv_line.extend(csv_line) to_store_csv.append(final_csv_line) hlp.writecsv(to_store_csv, location_to_store+'polarity_in_out.csv')
def main(): parser = argparse.ArgumentParser() parser.add_argument('-m', '-M', required=True, help='Sentiment Message file') parser.add_argument('-t', '-T', action='store_true', help='Sentiment type flag, if used then vader, else afinn') parser.add_argument('-f', '-F', required=True, help='Folder to store checkpoints, and final result') parser.add_argument('-w', '-W', required=False, help='Per week/month analysis') args = parser.parse_args() message_file = args.m sentiment_type = args.t location_to_store = args.f survey_file = args.w # get message data, only sms and fb_message ff = filterfields(message_file) ff.setdata(ff.getdata()[1:]) sms_data = ff.filterbyequality(pr.m_type, 'sms') pid_dict_sms = hlp.getuniqueparticipants2(sms_data) fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message') pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data) message_data = sms_data + fb_message_data # put the labels on labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \ hlp.processafinnsentiment(message_data, label_only=False) if survey_file is not None: wi = weeklyinfo() weekly_info = wi.getweeklyfo(survey_file) weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff) #__temp_testing_for_discrepancy(labelled_data, weekly_data) # get the pid_dict for easier handling pid_dict = hlp.getuniqueparticipants2(labelled_data) if survey_file is not None: over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data, pid_dict['participants']) __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_over.pdf') __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_in.pdf') __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_out.pdf') print '***SMS***' print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values()) print '***FB***' print 'P: ', len(pid_dict_fb['participants'].values()), 'NP: ', len(pid_dict_fb['nonparticipants'].values()) print '***OVERALL***' print 'P: ', len(pid_dict['participants'].values()), 'NP: ', len(pid_dict['nonparticipants'].values()) summary_src_trg = summarize_message_by_src_trg(labelled_data) print '***Message Distribution***' for m_type_1 in summary_src_trg: print m_type_1, summary_src_trg[m_type_1] if survey_file is not None: week_list = weekly_data.keys() week_list.sort() # this is not good, as there aren't enough triads months = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24, 25]] # this has at least 8 triads, always, use this months2 = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16], [17, 18, 19, 20, 21, 22, 23, 24, 25]] month_idx = 1 for month in months2: labelled_data = [] for week in month: labelled_data.extend(weekly_data[week]) general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict) frac_triad = general_graph[3] summary_triad = general_graph[2] frac_triad_rand = random_graph[3] summary_triad_rand = random_graph[2] print '** Months ', 2*month_idx-1, 2*month_idx, ': ', month,' ***' print 'len(LD): ', len(labelled_data) for summary in frac_triad: print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \ frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')' words_list, short_list = word_count(labelled_data) toWrite_wl_csv = create_word_count_csv(words_list) hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list_'+str(2*month_idx-1)+'-'+str(2*month_idx)+'.csv', delimiter_sym=',') for mtype in words_list: counted_words = Counter(words_list[mtype]) counted_short = Counter(short_list[mtype]) print '***For '+mtype+' ***' print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20)) print 'Top 20 short: ', counted_short.most_common(20) print '\n\n' hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'month_'+str(month_idx)+'.list', location_to_store) month_idx += 1 else: print 'len(LD): ', len(labelled_data) words_list, short_list = word_count(labelled_data) toWrite_wl_csv = create_word_count_csv(words_list) hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list.csv', delimiter_sym=',') for mtype in words_list: counted_words = Counter(words_list[mtype]) counted_short = Counter(short_list[mtype]) print '***For '+mtype+' ***' print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20)) print 'Top 20 short: ', counted_short.most_common(20) print '\n\n' general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict) frac_triad = general_graph[3] summary_triad = general_graph[2] frac_triad_rand = random_graph[3] summary_triad_rand = random_graph[2] for summary in frac_triad: print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \ frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')' hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'Overall.list', location_to_store) # plot_degree_dist(general_graph[4], 'Degree(d)', '# of Participants with Degree d') pos, neg, neu = get_polarity_directionality(labelled_data) print '***Polarity Distribution***' print 'Positive: \n', pos print 'Negative: \n', neg print 'Neutral: \n', neu in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants']) print '***Incoming Messages***' print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m) print '***Outgoing Messages***' print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m) print '***In Degree***' print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d) print '***Out Degree***' print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d) print '***COUNTS***' plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.', location_to_store+'in_out_messages.pdf') # plot_messages_degree(out_m, '# of Outgoing Messages', 'Cumulative Participant Prob.', # location_to_store+'out_messages.pdf') plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.', location_to_store+'in_out_degree.pdf', True) # plot_messages_degree(out_d, 'Out Degree', 'Cumulative Participant Prob.', # location_to_store+'out_degree.pdf', True) print 'TADAA!!'