def _keyword_freq(times, users):
    """Build a FreqDist of keywords over the timeline tweets of ``users``.

    ``times`` is the collection queried with ``find({'user.id': ...})`` and
    ``users`` is an iterable of user ids convertible with ``int``.  Each
    tweet text is stripped, lower-cased, and cleaned of retweet prefixes,
    mentions and URLs before being passed to ``keywords``.
    """
    # Removes "rt @name" / "rt@name", "@name" and "http(s)://..." tokens.
    # The pattern is lower-case because the text is lower-cased first.
    noise = re.compile(r"(?:(rt\ ?@)|@|https?://)\S+")
    fdist = FreqDist()
    for user in users:
        for tweet in times.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8').strip().lower()
            text = noise.sub("", text)
            for word in keywords(text):
                fdist[word] += 1
    return fdist


def keywords_recovery_preed():
    """Print the 50 most common keywords of pro-recovery and pro-ED users.

    The two user groups come from ``edrelatedcom.rec_proed()`` and their
    tweets from the 'timeline' collection of the 'fed' database.  The
    pro-recovery list is printed first, then the pro-ED list.
    """
    prorec, proed = edrelatedcom.rec_proed()
    times = dbt.db_connect_col('fed', 'timeline')
    fdist_rec = _keyword_freq(times, prorec)
    fdist_ped = _keyword_freq(times, proed)
    print(fdist_rec.most_common(50))
    print(fdist_ped.most_common(50))
def compare_opinion():
    """Plot sentiment distributions of pro-recovery versus pro-ED users.

    For every user, the cleaned texts of their tweets in the 'recover'
    collection (retweets and quote tweets excluded) are concatenated and
    scored with ``sentiment``; the first element of its result is kept.
    User ids whose score exceeds 50 are printed.
    """
    prorec, proed = edrelatedcom.rec_proed()  # groups are based on profiles
    rec_times = dbt.db_connect_col('fed', 'recover')
    rec_sen, ed_sen = [], []
    for group, scores in zip([prorec, proed], [rec_sen, ed_sen]):
        for uid in group:
            pieces = []
            for tweet in rec_times.find({'user.id': int(uid)}):
                if 'retweeted_status' in tweet or 'quoted_status' in tweet:
                    continue
                cleaned = tweet['text'].encode('utf8').strip().lower()
                # Drop "rt @name", "@name" and "http(s)://..." tokens.
                cleaned = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", cleaned)
                pieces.append(" " + cleaned)
            textmass = ''.join(pieces)
            sent = sentiment(textmass)[0]
            if sent > 50:
                print(uid)
            scores.append(sent)
    sns.distplot(rec_sen, hist=False, label='Pro-recovery')
    sns.distplot(ed_sen, hist=False, label='Pro-ED')
    plt.show()
def split_treatment():
    """Split pro-recovery timelines at the first treatment-related tweet.

    Each pro-recovery user's tweets are read from 'timeline' in ascending
    ``id`` order.  Tweets seen before the first original tweet containing
    'treatment', 'therap' or 'doctor' are inserted into 'prior_treat'; that
    tweet and all later ones are inserted into 'post_treat'.  Both target
    collections get a compound ('user.id', 'id') index and a unique index
    on 'id'.
    """
    rec, _proed = edrelatedcom.rec_proed()  # groups are based on profiles
    times = dbt.db_connect_col('fed', 'timeline')

    targets = []
    for name in ('prior_treat', 'post_treat'):
        col = dbt.db_connect_col('fed', name)
        col.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
        col.create_index([('id', pymongo.ASCENDING)], unique=True)
        targets.append(col)
    prior, post = targets

    terms = ('treatment', 'therap', 'doctor')
    for user in rec:
        mentioned = False
        # sort direction: 1 = ascending, -1 = descending
        cursor = times.find({'user.id': int(user)}).sort([('id', 1)])
        for tweet in cursor:
            is_original = not ('retweeted_status' in tweet
                               or 'quoted_status' in tweet)
            if is_original:
                text = tweet['text'].encode('utf8')
                # Drop "RT @name", "@name" and "http(s)://..." tokens.
                text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)
                text = text.strip().lower()
                if any(term in text for term in terms):
                    mentioned = True
            # NOTE(review): assumes every tweet, including retweets and
            # quotes, is routed to one of the two collections -- confirm
            # that the insert was not meant to apply to original tweets only.
            target = post if mentioned else prior
            target.insert(tweet)
def compare_weights():
    """Plot CW and GW distributions for pro-recovery and pro-ED users.

    For each group in turn, the values of 'text_anal.cw.value' and
    'text_anal.gw.value' are fetched from the 'scom' collection for users
    that have the field, and both are drawn on one figure.
    """
    prorec, proed = edrelatedcom.rec_proed()  # groups are based on profiles
    fields = (('text_anal.cw.value', 'CW'), ('text_anal.gw.value', 'GW'))
    # NOTE(review): assumes one figure is shown per group -- confirm.
    for users in (prorec, proed):
        series = []
        for field, label in fields:
            query = {'id_str': {'$in': users}, field: {'$exists': True}}
            values = iot.get_values_one_field('fed', 'scom', field, query)
            series.append((values, label))
        for values, label in series:
            sns.distplot(values, hist=False, label=label)
        plt.show()
def test():
    """Print screen names of pro-recovery users found in 'recover'.

    Prints the number of 'user.id' values read from the 'recover'
    collection and the size of the pro-recovery group.  Then, for each id
    present in both, prints the ``screen_name`` stored in 'scom'.
    """
    raw_ids = iot.get_values_one_field('fed', 'recover', 'user.id')
    prerec, _proed = edrelatedcom.rec_proed()
    # Ids are compared as strings, so stringify the 'recover' ids first.
    pusers = [str(value) for value in raw_ids]
    print(len(pusers))
    print(len(prerec))
    shared = set(pusers) & set(prerec)
    com = dbt.db_connect_col('fed', 'scom')
    for uid in shared:
        profile = com.find_one({'id': int(uid)})
        print(profile['screen_name'])
def recovery_user_treatment_tweet():
    """Print pro-recovery tweets that mention treatment, and count users.

    Scans each pro-recovery user's original tweets (retweets and quote
    tweets skipped) in 'timeline'.  Every tweet whose cleaned text contains
    'treatment', 'therap' or 'doctor' is printed with whitespace collapsed.
    Users with at least one such tweet are printed and counted; the last
    line printed is the group size followed by that count.
    """
    rec, _proed = edrelatedcom.rec_proed()  # groups are based on profiles
    times = dbt.db_connect_col('fed', 'timeline')
    terms = ('treatment', 'therap', 'doctor')
    count = 0
    for user in rec:
        matched = False
        for tweet in times.find({'user.id': int(user)}):
            if 'retweeted_status' in tweet or 'quoted_status' in tweet:
                continue
            text = tweet['text'].encode('utf8')
            # Drop "RT @name", "@name" and "http(s)://..." tokens.
            text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)
            text = text.strip().lower()
            if any(term in text for term in terms):
                print(' '.join(tweet['text'].split()))
                matched = True
        if matched:
            count += 1
            # NOTE(review): assumes only matching users are printed -- confirm.
            print(user)
    print('%d %d' % (len(rec), count))
def _neighbour_shares(neighbours, total, prorec, proed):
    """Return the share of ``neighbours`` falling in each user group.

    The result maps 'Rec' (in ``prorec``), 'Ped' (in ``proed``), 'ED' (any
    other neighbour) and 'Oth' (one minus the other three) to floats.
    Counts are divided by ``total``; when ``total`` is 0 they are left as
    raw counts, so 'Oth' can then be negative.
    """
    rec = ped = ed = 0.0
    for name in neighbours:
        if name in prorec:
            rec += 1
        elif name in proed:
            ped += 1
        else:
            ed += 1
    if total != 0:
        rec /= total
        ped /= total
        ed /= total
    return {'Rec': rec, 'Ped': ped, 'ED': ed, 'Oth': 1 - rec - ped - ed}


def recover_proed_inter():
    """Compare the network neighbourhoods of pro-recovery and pro-ED users.

    For every node of the 'snet' network, the followees (successors) and
    followers (predecessors) are classified as pro-recovery, pro-ED or
    other, and normalised by the ``friends_count`` / ``followers_count``
    stored in 'scom'.  Rows for nodes belonging to either group are written
    to 'inter.csv' and shown as a grouped box plot.
    """
    prorec, proed = edrelatedcom.rec_proed()  # groups are based on profiles
    # Sets make the per-neighbour membership tests constant time.
    prorec_set, proed_set = set(prorec), set(proed)
    com = dbt.db_connect_col('fed', 'scom')
    g = gt.load_network('fed', 'snet')
    data = []
    for node in g.vs:
        uid = node['name']
        user = com.find_one({'id': int(uid)})
        followeecount = user['friends_count']
        followercount = user['followers_count']
        followees = {g.vs[v]['name'] for v in g.successors(uid)}
        followers = {g.vs[v]['name'] for v in g.predecessors(uid)}

        followee_shares = _neighbour_shares(
            followees, followeecount, prorec_set, proed_set)
        if followeecount == 0:
            print('%s %s' % ('Followee number is zero', uid))
        follower_shares = _neighbour_shares(
            followers, followercount, prorec_set, proed_set)
        if followercount == 0:
            print('%s %s' % ('Follower number is zero', uid))

        # Each group lists its own category first.  The row order is kept
        # because it determines the CSV layout and is presumably what fixes
        # the category order on the plot.
        if uid in prorec_set:
            group, order = 'Rec', ('Rec', 'Ped', 'ED', 'Oth')
        elif uid in proed_set:
            group, order = 'Ped', ('Ped', 'Rec', 'ED', 'Oth')
        else:
            continue
        for side, shares in (('Followees', followee_shares),
                             ('Followers', follower_shares)):
            for key in order:
                data.append([group, shares[key], '%s-%s' % (key, side)])

    df = pd.DataFrame(data, columns=['Group', 'Proportion', 'Feature'])
    df.to_csv('inter.csv')
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    sns.boxplot(x="Feature", y="Proportion", hue="Group", data=df,
                palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()