Example #1
def label_dropout_network(g_file, db1n, com1n, db2n, com2n):
    g = gt.Graph.Read_GraphML(g_file)

    allg = gt.Graph.Read_GraphML('fed-net.graphml')
    allg.vs['hub'] = allg.eigenvector_centrality()

    com1 = dbt.db_connect_col(db1n, com1n)
    com2 = dbt.db_connect_col(db2n, com2n)

    labels, hubs = [], []
    for v in g.vs:
        uid = int(v['name'])
        hub = allg.vs.find(name=v['name'])['hub']
        print hub

        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None:  # protected or deleted
            drop = 1
        else:
            if 'status' not in u1 and 'status' not in u2:  # no tweeting in either snapshot
                drop = 1
            elif 'status' not in u1 and 'status' in u2:  # started to post
                drop = 0
            elif 'status' in u1 and 'status' not in u2:  # deleted tweets
                drop = 0
            elif u2['status']['id'] == u1['status']['id']: # no new post
                drop = 1
            elif u2['status']['id'] != u1['status']['id']: # new post
                drop = 0
        labels.append(drop)
        hubs.append(hub)
    g.vs['drop'] = labels
    g.vs['cen'] = hubs
    g.write_graphml('drop-'+g_file)
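
The drop/keep decision above reduces to a small rule table over the two crawl snapshots. A minimal standalone sketch of the same rules (the u1/u2 dicts are illustrative stand-ins for the stored user documents):

def label_drop(u1, u2):
    # 1 = dropped out between snapshots, 0 = still active
    if u2 is None:  # protected or deleted account
        return 1
    if 'status' not in u1 and 'status' not in u2:  # never tweeted
        return 1
    if ('status' in u1) != ('status' in u2):  # started posting, or deleted tweets
        return 0
    # both snapshots carry a latest status: dropped only if it never changed
    return 1 if u1['status']['id'] == u2['status']['id'] else 0

print(label_drop({'id': 1}, None))                                # 1
print(label_drop({'status': {'id': 9}}, {'status': {'id': 9}}))   # 1
print(label_drop({'status': {'id': 9}}, {'status': {'id': 10}}))  # 0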
Example #2
def count_longest_tweeting_period(dbname, timename, comname):
    # Get each user's latest 10 tweets and compute the largest posting interval, in days.
    com = dbt.db_connect_col(dbname, comname)
    time = dbt.db_connect_col(dbname, timename)
    for user in com.find({'liwc_anal.result.WC': {
            '$exists': True
    }},
                         no_cursor_timeout=True):
        user_id = user['id']
        datas = []
        for tweet in time.find({
                'user.id': user_id
        }, {
                'id': 1,
                'created_at': 1
        }).sort([('id', -1)
                 ]).limit(10):  # sort: 1 = ascending, -1 = descending
            created_at = datetime.strptime(tweet['created_at'],
                                           '%a %b %d %H:%M:%S +0000 %Y')
            datas.append(created_at)
        # print user['id']
        # print datas
        diff = [(datas[i] - datas[i + 1]).days
                for i in xrange(len(datas) - 1)]
        if not diff:  # fewer than two tweets: no interval to measure
            continue
        max_period = max(diff)
        # print max_period
        com.update({'id': user_id},
                   {'$set': {
                       'longest_tweet_interval': max_period
                   }},
                   upsert=False)
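
The core of the computation is the list of day gaps between consecutive timestamps sorted newest-first. A quick self-contained check with synthetic dates:

from datetime import datetime

stamps = [datetime(2016, 3, 20), datetime(2016, 3, 14), datetime(2016, 3, 1)]  # newest first
gaps = [(stamps[i] - stamps[i + 1]).days for i in range(len(stamps) - 1)]
print(max(gaps))  # 13 -- the longest silence between posts, in days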
Example #3
def recovery_users_tweet():
    # Gather recovery/treatment-related tweets.
    # When constructing the control group, users who have retweeted treatment content are removed.
    com = dbt.db_connect_col('fed', 'scom')
    times = dbt.db_connect_col('fed', 'timeline')
    newtime = dbt.db_connect_col('fed', 'recover')
    newtime.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)

    for user in com.find(no_cursor_timeout=True):
        uid = user['id']
        for tweet in times.find({'user.id': uid}):
            # if 'retweeted_status' in tweet:
            #     continue
            # elif 'quoted_status' in tweet:
            #     continue
            # else:
            text = tweet['text'].encode('utf8')
            text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)  # strip RT markers, mentions, and URLs
            # if ('I' in text or ' me ' in text):
            text = text.strip().lower()
            if 'recover' in text or 'treatment' in text or 'therap' in text \
                   or 'doctor' in text:
                    # or 'healing' in text or 'therapy' in text or 'doctor' in text or 'hospital' in text:
                # print ' '.join(tweet['text'].split())
                try:
                    newtime.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
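
The cleanup regex strips retweet markers, mentions, and URLs before the keyword test, and the unique index on id plus the DuplicateKeyError catch makes re-runs idempotent. What the regex removes, in isolation:

import re

raw = "RT @someone: I started treatment today http://t.co/abc123"
print(re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", raw).strip().lower())
# -> 'i started treatment today'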
Example #4
def bio_information(dbname='TwitterProAna', colname='users'):
    com = dbt.db_connect_col(dbname, colname)
    bio_hist = dbt.db_connect_col(dbname, 'bio')
    bio_hist.create_index([('id', pymongo.ASCENDING)])


    for row in com.find({'screen_name': {'$exists': True}}, no_cursor_timeout=True):
        name, text = row['name'], row['description']
        date = row['lastPolledFull']
        stats = None  # 'stats' must exist even when there is no usable text
        if text and name:
            stats = dm.process_text(text, name)
        elif text:
            stats = dm.process_text(text)
        if stats:
            stats['date'] = date
            stats['id'] = row['id']
            try:
                bio_hist.insert(stats)
            except pymongo.errors.DuplicateKeyError:
                pass
        for hist in reversed(row['history']):
            if 'name' in hist:
                name = hist['name']
            if 'description' in hist:
                text = hist['description']
                if text:
                    stats = dm.process_text(text, name)
                    if stats:
                        stats['date'] = hist['lastPolledFull']
                        stats['id'] = row['id']
                        try:
                            bio_hist.insert(stats)
                        except pymongo.errors.DuplicateKeyError:
                            pass
Example #5
def split_treatment():
    rec, proed = edrelatedcom.rec_proed() ## based on profiles
    times = dbt.db_connect_col('fed', 'timeline')
    prior = dbt.db_connect_col('fed', 'prior_treat')
    prior.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    prior.create_index([('id', pymongo.ASCENDING)], unique=True)

    post = dbt.db_connect_col('fed', 'post_treat')
    post.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    post.create_index([('id', pymongo.ASCENDING)], unique=True)

    for user in rec:
        found = False
        for tweet in times.find({'user.id': int(user)}).sort([('id', 1)]):  # sort: 1 = ascending, -1 = descending
            if ('retweeted_status' not in tweet) and ('quoted_status' not in tweet):
                text = tweet['text'].encode('utf8')
                text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)  # strip RT markers, mentions, and URLs
                text = text.strip().lower()
                if 'treatment' in text or 'therap' in text \
                       or 'doctor' in text:
                    found = True
            try:
                if found:
                    post.insert(tweet)
                else:
                    prior.insert(tweet)
            except pymongo.errors.DuplicateKeyError:
                pass
Example #6
def user_active():
    # Obtain the active duration of users across the two observation windows.
    groups = [
         ('ED', 'fed', 'com', 'fed', 'com_survival', {
                                                        'liwc_anal.result.WC': {'$exists': True},
                                                        'level': 1,
                                                        'senti.result.whole.N': {'$gt': 10}}),
         ('RD', 'random', 'scom', 'random', 'com_survival', {
                                                        'liwc_anal.result.WC': {'$exists': True},
                                                        'senti.result.whole.N': {'$gt': 10}}),
         ('YG', 'younger', 'scom', 'younger', 'com_survival', {
                                                            'liwc_anal.result.WC': {'$exists': True},
                                                            'senti.result.whole.N': {'$gt': 10}})
    ]
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)

        network1 = gt.Graph.Read_GraphML(tag.lower()+'-net-all.graphml')
        network1.vs['alive'] = 0
        network1.vs['duration'] = 0
        for v in network1.vs:
            uid = int(v['name'])
            u1 = com.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            if u1:
                f1_time = u1['_id'].generation_time.replace(tzinfo=None)
                if u2:
                    f2_time = u2['_id'].generation_time.replace(tzinfo=None)
                    if 'status' in u2:
                        fsecond_last_post = datetime.strptime(u2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                        if f1_time < fsecond_last_post < f2_time:
                            v['alive'] = 1
                            v['duration'] = friends_active_days(u2, f1_time)[0]
        network1.write_graphml(tag.lower()+'-net-all-active.graphml')
Example #7
def split_control():
    ## the mean split point of the treatment group is 0.330912888352
    times = dbt.db_connect_col('fed', 'timeline')
    control = dbt.db_connect_col('fed', 'control_com')
    prior = dbt.db_connect_col('fed', 'prior_control')
    prior.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    prior.create_index([('id', pymongo.ASCENDING)], unique=True)

    post = dbt.db_connect_col('fed', 'post_control')
    post.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    post.create_index([('id', pymongo.ASCENDING)], unique=True)

    for user in control.find(no_cursor_timeout=True):
        timeline_count = user['timeline_count']
        cut = int(timeline_count * 0.33)
        count = 0
        for tweet in times.find({'user.id': user['id']}).sort([('id', 1)]):  # sort: 1 = ascending, -1 = descending
            if count < cut:
                try:
                    prior.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
            else:
                try:
                    post.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
            count += 1
Example #8
def refine_recovery_tweets(hash_com, tagcol, refine_tagcol,
                           idx=[4, 58]):  # without non-recovery: 18, 102, 4, 58, 88
    # Select tweets that have ED-related hashtags
    times = dbt.db_connect_col('fed', tagcol)
    rec_refine = dbt.db_connect_col('fed', refine_tagcol)
    rec_refine.create_index([('user.id', pymongo.ASCENDING),
                             ('id', pymongo.DESCENDING)])
    rec_refine.create_index([('id', pymongo.ASCENDING)], unique=True)
    for tweet in times.find():
        hashtags = tweet['entities']['hashtags']
        for hash in hashtags:
            # need no .encode('utf-8')
            tag = hash['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            com_id = hash_com.get(tag, -1)
            if com_id > -1:
                if com_id in idx:
                    try:
                        rec_refine.insert(tweet)
                    except pymongo.errors.DuplicateKeyError:
                        pass
Example #9
def label_ed_recovery(hash_com, com_size, idx=[18, 102]):
    # Select users in prorec whose share of ED-related hashtags exceeds the overall share
    times = dbt.db_connect_col('fed', 'prorec_tag')
    com = dbt.db_connect_col('fed', 'tag_com')
    threshold = float(sum([com_size[i] for i in idx])) / sum(com_size.values())
    print 'threshold: ', threshold
    users = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    for uid in users:
        target_count, all_count = 0.0, 0.0
        for tweet in times.find({'user.id': uid}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            for hash in hashtags:
                # need no .encode('utf-8')
                hash_set.add(hash['text'].encode('utf-8').lower().replace(
                    '_', '').replace('-', ''))
            for tag in hash_set:
                com_id = hash_com.get(tag, -1)
                if com_id > -1:
                    all_count += 1
                    if com_id in idx:
                        target_count += 1

        if all_count and target_count / all_count > threshold:
            com.update({'id': uid}, {'$set': {
                'rec_targeted': True
            }},
                       upsert=False)
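
The threshold is the overall share of hashtag occurrences that the target communities account for; a user is flagged when their personal share exceeds it. A small numeric sketch with hypothetical community sizes:

com_size = {4: 50, 18: 120, 102: 80, 7: 750}  # community id -> number of hashtags
idx = [18, 102]
threshold = float(sum(com_size[i] for i in idx)) / sum(com_size.values())
print(threshold)  # 200 / 1000 = 0.2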
Example #10
def extract_user(dbname='ed', stream='restream', user='******'):
    # Extract users from the tweet stream: the tweet author and the author of any retweeted status.
    stream = dbt.db_connect_col(dbname, stream)
    com = dbt.db_connect_col(dbname, user)
    com.create_index("id", unique=True)
    for tweet in stream.find({
            'userextract': {
                '$exists': False
            },
    },
                             no_cursor_timeout=True):
        author = tweet['user']
        author['level'] = 1
        try:
            com.insert(author)
        except pymongo.errors.DuplicateKeyError:
            pass
        if 'retweeted_status' in tweet:
            retweetee = tweet['retweeted_status']['user']
            retweetee['level'] = 1
            try:
                com.insert(retweetee)
            except pymongo.errors.DuplicateKeyError:
                pass
        stream.update_one({'id': tweet['id']}, {'$set': {
            'userextract': True
        }},
                          upsert=False)
Example #11
def recollect_ed(dbname='ed', colname='stream', newcol='restream'):
    # Recollect the stream data, to get the favorite and retweet counts
    stream = dbt.db_connect_col(dbname, colname)
    newstream = dbt.db_connect_col(dbname, newcol)
    newstream.create_index("id", unique=True)
    ids = []
    for tweet in stream.find({
            'recollected': {
                '$exists': False
            },
    },
                             no_cursor_timeout=True):
        stream.update_one({'id': tweet['id']},
                          {'$set': {
                              'recollected': True
                          }},
                          upsert=False)
        ids.append(tweet['id'])
        if len(ids) == 100:  # statuses/lookup accepts at most 100 ids per call
            print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + str(len(ids))
            tweets = tlup.get_tweets_info(ids)
            for t in tweets:
                try:
                    newstream.insert(t)
                except pymongo.errors.DuplicateKeyError:
                    pass
            ids = []
    if ids:  # flush the final partial batch
        tweets = tlup.get_tweets_info(ids)
        for t in tweets:
            try:
                newstream.insert(t)
            except pymongo.errors.DuplicateKeyError:
                pass
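
The 100-id batches match the limit of Twitter's statuses/lookup endpoint. The chunking itself can be factored into a small helper; chunks below is a hypothetical name, not part of the codebase:

def chunks(seq, size=100):
    # Yield consecutive slices of at most `size` items.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

for batch in chunks(list(range(250)), 100):
    print(len(batch))  # 100, 100, 50 -- the final partial batch is kept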
Example #12
def filter_recovery_sentiment():
    user_count, user_pol = {}, {}
    times = dbt.db_connect_col('fed', 'recovery')
    for tweet in times.find():
        uid = tweet['user']['id']
        pol = tweet['polarity']
        count = user_count.get(uid, 0.0)
        polv = user_pol.get(uid, 0.0)
        user_count[uid] = count + 1
        if pol > 0:
            print ' '.join(tweet['text'].split())
            user_pol[uid] = polv + 1
        elif pol < 0:
            user_pol[uid] = polv - 1
        else:
            user_pol[uid] = polv  # zero polarity leaves the score unchanged
    user_list = [k for k, v in user_count.items() if v >= 3]
    print sum(user_pol[uid] > 0 for uid in user_list)
    print sum(user_pol[uid] < 0 for uid in user_list)
    rec, nonrec = [], []
    com = dbt.db_connect_col('fed', 'scom')
    for uid in user_list:
        if user_pol[uid] > 0:
            rec.append(str(uid))
            user = com.find_one({'id':uid})
            print 'Positive', user['id_str'], user['screen_name'], ' '.join(user['description'].split()).encode('utf-8')
        elif user_pol[uid] < 0:
            nonrec.append(str(uid))
            user = com.find_one({'id':uid})
            print 'Negative', user['id_str'], user['screen_name'], ' '.join(user['description'].split()).encode('utf-8')

    return rec, nonrec
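
Each tweet contributes a +1/-1/0 vote to its author's polarity score, and users with at least three recovery tweets are then split by the sign of the sum. The tallying rule in isolation:

votes = [0.5, -0.2, 0.9]  # per-tweet polarity values for one user
score = sum(1 if p > 0 else (-1 if p < 0 else 0) for p in votes)
print(score)  # 1: two positive tweets outweigh one negative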
Example #13
def ed_hashtag():
    # Filter ED-related tweets
    dbname = 'fed'
    # select recovery users based on hashtags
    # Store in the databases
    # com = dbt.db_connect_col('fed', 'com')
    ed_tags = set(iot.read_ed_hashtags())
    print ed_tags

    times = dbt.db_connect_col(dbname, 'timeline')
    taged = dbt.db_connect_col(dbname, 'ed_tag')
    taged.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    taged.create_index([('id', pymongo.ASCENDING)], unique=True)


    # for user in com.find():
    # for tweet in times.find({'user.id': user['id'], '$where': 'this.entities.hashtags.length>0'}):
    for tweet in times.find({'$where': 'this.entities.hashtags.length>0'}, no_cursor_timeout=True):
        hashtags = tweet['entities']['hashtags']
        for hash in hashtags:
            value = hash['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            if value in ed_tags:
                try:
                    taged.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
                break  # one copy per tweet is enough
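
A side note on the $where clause: it evaluates JavaScript per document and cannot use an index. Assuming the standard tweet schema, the same "has at least one hashtag" condition can be written as a plain query on the first array element:

# Index-friendly equivalent of the $where filter above:
query = {'entities.hashtags.0': {'$exists': True}}
# e.g. times.find(query, no_cursor_timeout=True)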
Example #14
def trandb(dbname, colnam1, colnam2):
    # Copy all documents from colnam2 into colnam1, skipping duplicates.
    time1 = dbt.db_connect_col(dbname, colnam1)
    time2 = dbt.db_connect_col(dbname, colnam2)
    for t in time2.find():
        try:
            time1.insert(t)
        except pymongo.errors.DuplicateKeyError:
            pass
Example #15
def control_users():
    com = dbt.db_connect_col('fed', 'scom')
    recovery_user = set(iot.get_values_one_field('fed', 'recover', 'user.id'))
    control_com = dbt.db_connect_col('fed', 'control_com')
    control_com.create_index("id", unique=True)
    for user in com.find():
        if user['id'] not in recovery_user:
            try:
                control_com.insert(user)
            except pymongo.errors.DuplicateKeyError:
                pass
Example #16
def filter_ed_tweets():
    # Filter ED-related tweets based on word2vec similarity
    from ohsn.edrelated import edrelatedcom
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')

    # com = dbt.db_connect_col('fed', 'scom')
    times = dbt.db_connect_col('fed', 'timeline')
    ed_times = dbt.db_connect_col('fed', 'edtimeline')
    ed_times.create_index([('user.id', pymongo.ASCENDING),
                           ('id', pymongo.DESCENDING)])
    ed_times.create_index([('id', pymongo.ASCENDING)], unique=True)
    ed_list = set([
        'bmi', 'cw', 'ugw', 'gw', 'lbs', 'hw', 'lw', 'kg', 'ed',
        'eatingdisorder', 'anorexia', 'bulimia', 'anorexic', 'ana', 'bulimic',
        'anorexia', 'mia', 'thinspo', 'bulemia', 'purge', 'bulimia', 'binge',
        'selfharm', 'ednos', 'edprobs', 'edprob', 'proana', 'anamia', 'promia',
        'askanamia', 'bonespo', 'legspo'
    ])
    model = models.word2vec.Word2Vec.load('data/word2vec')
    # Rake = RAKE.Rake('/home/wt/Code/ohsn/ohsn/networkminer/stoplist/SmartStoplist.txt')
    import ohsn.api.profiles_check as pc
    print len(prorec + proed)
    for user in prorec + proed:
        for tweet in times.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # replace RT, @, # and Http://
            # text = text.strip().lower()
            # text = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", text) # replace RT @, @ and http://
            # keywords = Rake.run(text)
            keywords = pc.tokenizer_stoprm(text)
            sumsim = 0.0
            count = 0
            # for word in keywords:
            #     tokens = word[0].split()
            #     sima, ca = 0.0, 0.0
            #     for token in tokens:
            #         if token in model:
            #             for ed in ed_list:
            #                 sim = model.similarity(token, ed)
            #                 # if sim > maxsim:
            #                 sima += sim
            #                 ca += 1
            #     if ca != 0:
            #         sumsim += sima/ca
            #         count += 1
            for word in keywords:
                if word in model:
                    for ed in ed_list:
                        sim = model.similarity(word, ed)
                        sumsim += sim
                        count += 1
            if count != 0 and (sumsim / count) > 0.26:  # the average similarity of ed words
                try:
                    ed_times.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
Example #17
def tweet_difference(dbname='fed', comname='scom', timename='timeline'):
    # Calculate the LIWC features of tweets that are retweeted or favorited
    com = dbt.db_connect_col(dbname, comname)
    times = dbt.db_connect_col(dbname, timename)
    '''Process the timelines of users in POI'''
    liwc = Liwc()

    for user in com.find():
        print user['id']
        textmass_retweet = ''
        textmass_like = ''
        # textmass_equal = ''
        for tweet in times.find({
                'user.id': user['id'],
                'retweeted_status': {
                    '$exists': False
                }
        }):
            retc = tweet['retweet_count']
            favc = tweet["favorite_count"]
            text = tweet['text'].encode('utf8')
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            if not (text.endswith('.') or text.endswith('?')
                    or text.endswith('!')):
                text += '.'
            if retc > favc:
                textmass_retweet += " " + text.lower()
            if favc > retc:
                textmass_like += " " + text.lower()
            # else:
            # textmass_equal += " " + text.lower()
        textmass_retweet_words = textmass_retweet.split()
        textmass_like_words = textmass_like.split()
        # textmass_equal_words = textmass_equal.split()
        if len(textmass_retweet_words) > 50:
            liwc_result = liwc.summarize_document(
                ' '.join(textmass_retweet_words))
            com.update_one({'id': user['id']}, {
                '$set': {
                    'retweet_liwc.mined': True,
                    'retweet_liwc.result': liwc_result
                }
            },
                           upsert=False)
        if len(textmass_like_words) > 50:
            liwc_result = liwc.summarize_document(
                ' '.join(textmass_like_words))
            com.update_one({'id': user['id']}, {
                '$set': {
                    'like_liwc.mined': True,
                    'like_liwc.result': liwc_result
                }
            },
                           upsert=False)
Example #18
def check_duplicate(dbname, timename, timename2):
    # add the tweets of time1 into time2, skipping duplicates
    time1 = dbt.db_connect_col(dbname, timename)
    times = dbt.db_connect_col(dbname, timename2)
    for tweet in time1.find({}):
        try:
            times.insert(tweet)
        except pymongo.errors.DuplicateKeyError:
            pass
Example #19
def remove_random_users(dbname, comname, netname):
    com = dbt.db_connect_col(dbname, comname)
    users = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 3}))
    net = dbt.db_connect_col(dbname, netname)
    for row in net.find(no_cursor_timeout=True):
        uid = row['user']
        fid = row['follower']
        if uid in users or fid in users:
            net.delete_one({'_id': row['_id']})
    com.delete_many({'level': 3})
Example #20
def ed_tweet_normal_tweet_count():
    user_ids = set(iot.get_values_one_field('fed', 'ed_tag', 'user.id'))
    print len(user_ids)
    com = dbt.db_connect_col('fed', 'com')
    tags = dbt.db_connect_col('fed', 'ed_tag')
    data = []
    for uid in user_ids:
        ed_count = tags.count({'user.id': uid})
        all_count = com.find_one({'id': uid})['timeline_count']
        data.append([uid, ed_count, all_count])
    df = pd.DataFrame(data, columns=['id', 'ed_tweet_count', 'all_tweet_count'])
    df.to_csv('user-ed-stats.csv')
Example #21
def ED_followee():
    # Put all ED users' followees into follownet
    net = dbt.db_connect_col('fed', 'net2')
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    print len(users)
    tem = dbt.db_connect_col('fed', 'follownet')
    for rel in net.find():  # 'rel' avoids shadowing the re module
        if rel['follower'] in users:
            try:
                tem.insert(rel)
            except pymongo.errors.DuplicateKeyError:
                pass
Example #22
def unique_tweet(dbname, streamname, timename):
    # get unique tweets in the stream
    stream = dbt.db_connect_col(dbname, streamname)
    time = dbt.db_connect_col(dbname, timename)
    time.create_index("id", unique=True)
    for tweet in stream.find({}, no_cursor_timeout=True):
        if 'retweeted_status' in tweet:
            text = tweet['retweeted_status']['text']
        else:
            text = tweet['text']
        try:
            time.insert({'id': tweet['id'], 'text': text})
        except pymongo.errors.DuplicateKeyError:
            pass
Example #23
def insert_cluster_tweets(dbname, timename, cluster):
    # Insert the tweets of one cluster's users into a separate database collection
    time = dbt.db_connect_col(dbname, timename)
    time.create_index([('user.id', pymongo.ASCENDING),
                       ('id', pymongo.DESCENDING)])
    time.create_index([('id', pymongo.ASCENDING)], unique=True)

    ed_tweet = dbt.db_connect_col(dbname, 'ed_tag')
    for uid in cluster:
        for tweet in ed_tweet.find({'user.id': int(uid)}):
            try:
                time.insert(tweet)
            except pymongo.errors.DuplicateKeyError:
                pass
Example #24
def hashtag_users():
    com = dbt.db_connect_col('fed', 'com')
    times_ped = list(set(iot.get_values_one_field('fed', 'proed_tag', 'user.id')))
    times_rec = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    newtime = dbt.db_connect_col('fed', 'tag_com')
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)

    for users in [times_ped, times_rec]:
        for uid in users:
            user = com.find_one({'id': uid})
            if user is None:
                continue
            try:
                newtime.insert(user)
            except pymongo.errors.DuplicateKeyError:
                pass
Example #25
def random_network():
    com = dbt.db_connect_col('random2', 'com')
    com1 = dbt.db_connect_col('random2', 'com2')
    # com2 = dbt.db_connect_col('random2', 'com2')
    # com3 = dbt.db_connect_col('random2', 'com3')
    com1.create_index("id", unique=True)
    # com2.create_index("id", unique=True)
    # com3.create_index("id", unique=True)
    for user in com.find({'level': 2}, no_cursor_timeout=True):
        # if user['level'] == 2:
        try:
            com1.insert(user)
        except pymongo.errors.DuplicateKeyError:
            pass
    com.delete_many({'level': 2})
Example #26
def copy_com(dbname, com_ori_name, com_des_name):
    com_ori = dbt.db_connect_col(dbname, com_ori_name)
    com_des = dbt.db_connect_col(dbname, com_des_name)
    com_des.create_index("id", unique=True)
    com_des.create_index([('level', pymongo.ASCENDING),
                          ('following_prelevel_node', pymongo.ASCENDING)],
                         unique=False)
    com_des.create_index([('level', pymongo.ASCENDING),
                          ('follower_prelevel_node', pymongo.ASCENDING)],
                         unique=False)
    for user in com_ori.find({'level': {'$lt': 3}}, no_cursor_timeout=True):
        try:
            com_des.insert(user)
        except pymongo.errors.DuplicateKeyError:
            pass
Example #27
def friend_network_hashtag_weight(dbname, netname):
    '''
    Community detection on friendship network, weighted by hashtag similarity
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'rb'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id': source_uid})
        target_user = com.find_one({'id': target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0/(1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
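
The weight transform maps Euclidean distance into (0, 1]: identical LIWC/hashtag profiles get weight 1.0 and dissimilar profiles approach 0. A quick check of the transform:

from scipy import spatial

dis = spatial.distance.euclidean([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
print(1.0 / (1.0 + dis))  # 1.0 for identical vectors

dis = spatial.distance.euclidean([0.0, 0.0], [0.0, 4.0])
print(1.0 / (1.0 + dis))  # 0.2 -- a distance of 4 shrinks the weight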
Example #28
def read_user_time(filename):
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
         ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
         ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
         ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]

    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)

        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                average_time = float(life_time)/max(1, user['statuses_count'])
                longest_tweet_interval = user['longest_tweet_interval']

                observation_interval = diff_day(scraped_at, last_post)
                if (observation_interval - longest_tweet_interval) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at, average_time,
                             longest_tweet_interval, observation_interval, tag, death] + values)

    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at', 'average_time',
                                     'longest_time_interval', 'observation_interval', 'group',
                                     'event'] + trimed_fields)
    df.to_csv(filename)
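
The event label encodes the survival outcome: a user counts as dead (event = 1) only when the silence after the last post exceeds that user's own longest historical gap by more than 30 days. The rule in isolation:

def is_dead(observation_interval, longest_interval, slack=30):
    # Dead if the post-scrape silence exceeds the user's usual gap by `slack` days.
    return 1 if (observation_interval - longest_interval) > slack else 0

print(is_dead(100, 20))  # 1 -- 80 days beyond the user's normal rhythm
print(is_dead(40, 20))   # 0 -- still within normal posting behavior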
Example #29
def recovery_tweet():
    times = dbt.db_connect_col('fed', 'timeline')
    for tweet in times.find():
        text = tweet['text'].encode('utf8')
        text = text.strip().lower().replace("-", "").replace('_', '')
        sentences = re.split(r"\s*[;:`\"()?!{}]\s*|--+|\s*-\s+|''|\.\s|\.$|\.\.+|“|”", text)
        FLAG = False
        for sentence in sentences:
            if 'recover' in sentence:
                if 'not' not in sentence and 'don\'t' not in sentence and 'never' not in sentence \
                        and 'anti' not in sentence and 'non' not in sentence\
                        and 'relapse' not in sentence:
                    FLAG = True
            # if 'struggl' in sentence:
            #     if 'thin' not in sentence and 'weight' not in sentence \
            #             and 'mirror' not in sentence and 'figure' not in sentence \
            #             and 'food' not in sentence and 'body' not in sentence\
            #             and 'proed' not in sentence and 'proana' not in sentence and 'promia' not in sentence:
            #         FLAG = True
            # if 'fight' in sentence:
            #     if 'thin' not in sentence and 'weight' not in sentence \
            #             and 'mirror' not in sentence and 'figure' not in sentence \
            #             and 'food' not in sentence and 'body' not in sentence:
            #         FLAG = True
        # for sentence in sentences:
        #     if 'proed' in sentence or 'proana' in sentence or 'promia' in sentence:
        #         if 'not' not in sentence and \
        #                         'don\'t' not in sentence and \
        #                         'anti' not in sentence:
        #             FLAG = False
        if FLAG:
            print tweet['id'], ' '.join(tweet['text'].split()).encode('utf-8')
Example #30
def tag_jaccard(dbname, hash_time, gfilename):
    # Calculate the jaccard index of hashtag
    g = gt.Graph.Read_GraphML(gfilename + '_tag_undir.graphml')
    times = dbt.db_connect_col(dbname, hash_time)
    tag_tweets = {}
    for tweet in times.find({'$where': 'this.entities.hashtags.length>0'}):
        hashtags = tweet['entities']['hashtags']
        hash_set = set()
        for hash in hashtags:
            hash_set.add(hash['text'].encode('utf-8').lower().replace(
                '_', '').replace('-', ''))
        for hash in hash_set:
            ids = tag_tweets.get(hash, set())
            ids.add(tweet['id'])
            tag_tweets[hash] = ids
    pickle.dump(tag_tweets, open(gfilename + '.pick', 'wb'))

    g.es['jaccard'] = 0.0
    for edge in g.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_name = g.vs[source_vertex_id]['name']
        target_name = g.vs[target_vertex_id]['name']
        source_set = tag_tweets.get(source_name, set())
        target_set = tag_tweets.get(target_name, set())
        union = source_set.union(target_set)
        if union:
            edge['jaccard'] = float(len(source_set.intersection(target_set))) / len(union)
    g.write_graphml(gfilename + '_tag_undir_jaccard.graphml')
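
The per-edge score is the plain Jaccard index over the sets of tweet ids in which the two hashtags appear. A worked example:

source_set = {1, 2, 3, 4}
target_set = {3, 4, 5}
jaccard = float(len(source_set & target_set)) / len(source_set | target_set)
print(jaccard)  # |{3, 4}| / |{1, 2, 3, 4, 5}| = 2/5 = 0.4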
Example #31
def hashtag_related_networks(dbname, timename, netname):
    '''
    Extract users' behavior network for tweets that are related to hashtags of interests
    :param dbname:
    :param timename:
    :param netname:
    :return:
    '''
    hashtags = iot.read_recovery_ed_keywords()
    timeline = dbutil.db_connect_col(dbname, timename)
    network = dbutil.db_connect_col(dbname, netname)
    network.create_index([("id0", pymongo.ASCENDING),
                         ("id1", pymongo.ASCENDING),
                         ("type", pymongo.ASCENDING),
                         ("statusid", pymongo.ASCENDING)],
                        unique=True)
    query = {'$and': [{'$where': 'this.entities.hashtags.length>0'},
                      {'$where': 'this.entities.user_mentions.length>0'}]}  # avoid shadowing the filter builtin

    for tweet in timeline.find(query, no_cursor_timeout=True):
        tags = tweet['entities']['hashtags']
        hash_tag_flag = False
        part = set([])
        for tag in tags:
            tagv = tag['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            part.add(tagv)
            # if tagv in hashtags:
            hash_tag_flag = True
        if hash_tag_flag:
            # print tweet['text']
            udmention_list = []
            if ('retweeted_status' in tweet) and len(tweet['retweeted_status']['entities']['user_mentions'])>0:
                for udmention in tweet['retweeted_status']['entities']['user_mentions']:
                    udmention_list.append(udmention['id'])
            for mention in tweet['entities']['user_mentions']:
                if ('in_reply_to_user_id' in tweet) and (mention['id'] == tweet['in_reply_to_user_id']): # reply
                    add_reply_edge(network, tweet['user']['id'], tweet['in_reply_to_user_id'], tweet['created_at'], tweet['id'], list(part))

                elif ('retweeted_status' in tweet) and (mention['id'] == tweet['retweeted_status']['user']['id']): # Retweet
                    add_retweet_edge(network, tweet['user']['id'], tweet['retweeted_status']['user']['id'], tweet['created_at'], tweet['id'], list(part))

                elif mention['id'] in udmention_list:  # mentions in Retweet content
                    add_undirect_mentions_edge(network, tweet['user']['id'], mention['id'], tweet['created_at'], tweet['id'], list(part))

                else:  # original mentions
                    add_direct_mentions_edge(network, tweet['user']['id'], mention['id'], tweet['created_at'], tweet['id'], list(part))
Example #32
def get_values_one_field(dbname, colname, fieldname, filt={}):
    poi = dbt.db_connect_col(dbname, colname)
    values = []
    for item in poi.find(filt, [fieldname], no_cursor_timeout=True):
        if '.' in fieldname:
            levels = fieldname.split('.')
            t = item.get(levels[0], {})
            for level in levels[1:]:
                t = t.get(level)
                if t is None:
                    t = 0.0
                    break

            values.append(t)
        else:
            values.append(item.get(fieldname))
    # print 'The length of values is: ', len(values)
    return values
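
The dotted-field lookup walks nested dictionaries one level at a time and substitutes 0.0 as soon as a level is missing. The traversal on an in-memory document:

doc = {'liwc_anal': {'result': {'WC': 42}}}
levels = 'liwc_anal.result.WC'.split('.')
t = doc.get(levels[0], {})
for level in levels[1:]:
    t = t.get(level)
    if t is None:
        t = 0.0
        break
print(t)  # 42; a missing leaf such as 'liwc_anal.result.XX' would yield 0.0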
Example #33
def friend_community():
    net = gt.Graph.Read_GraphML('ed_weighted_follow.graphml')
    # net = gt.load_network('fed', 'snet')
    gt.net_stat(net)
    com = net.community_infomap(edge_weights='weight')
    comclus = com.subgraphs()
    print len(comclus), com.modularity
    com = dbt.db_connect_col('fed', 'scom')
    index = 0
    hash_com = {}
    for comclu in comclus:
        print '============================================================'
        # if comclu.vcount() > 10:
        for v in comclu.vs:
            user = com.find_one({'id': int(v['name'])})
            print v['name'], user['id'], user['screen_name'], ' '.join(user['description'].split()).encode('utf-8')
            hash_com[v['name']] = index
        index += 1
    return hash_com
Example #34
def refine_recovery(dbname, netname):
    '''
    refine the users who have used the hashtag #recovery
    :param dbname:
    :param netname:
    :return:
    '''
    network = dbutil.db_connect_col(dbname, netname)
    proed = set(['proed', 'proana', 'promia', 'proanorexia', 'proanamia', 'proanatips', 'proanatip'])
    proedrel = proed
    for link in network.find(no_cursor_timeout=True):
        tags = set(link['tags'])
        if len(proed.intersection(tags)) > 0:
            proedrel = proedrel.union(tags)
    print len(proedrel)
    users = iot.get_values_one_field(dbname, netname, 'id0')
    print len(users)
    for user in users:
        # print user
        utags = set()
        for link in network.find({'id0': user}):
            utags.update(link['tags'])  # add the tags themselves, not a generator object
        if len(utags.intersection(proedrel)) == 0:
            network.delete_many({'id0': user})
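
The set.update call above matters: set.add with a generator expression would store the generator object itself, so every intersection test would come up empty and all users' links would be deleted. The difference in isolation:

utags = set()
utags.add(tag for tag in ['a', 'b'])  # wrong: the set holds one generator object
print('a' in utags)                   # False

utags = set()
utags.update(['a', 'b'])              # right: the set holds the elements
print('a' in utags)                   # True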
Example #35
def friendship_community_vis(dbname, colname, filename, ctype):
    '''Out graph for vis.js visualization'''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    # fed_users = iot.get_values_one_field(dbname, 'com', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    # fg = gt.load_beh_network_subset(ed_users, dbname, colname, 'retweet')
    gt.net_stat(fg)
    # fg = fg.as_undirected(mode="mutual")
    # gt.net_stat(fg)

    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)

    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight', initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership

    # edges = fg.es.select(weight_gt=3)
    # print 'Filtered edges: %d' %len(edges)
    # fg = fg.subgraph_edges(edges)
    # gt.net_stat(fg)

    # fg.vs['degree'] = fg.degree(mode="all")
    # nodes = fg.vs.select(degree_gt=10)
    # fg = fg.subgraph(nodes)
    # gt.net_stat(fg)

    Coo = {}
    for x in fg.vs['group']:
        Coo[x] = (rand.randint(-1000, 1000), rand.randint(-1000, 1000))

    with open('data/' + ctype + '_' + filename + '_net_follow.js', 'w') as fw:
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            desc = ' '.join(user['description'].replace('\'', '').replace('\"', '').split())
            fw.write('{id: ' + str(idv+1) + ', '+
                     'label: \'' + user['screen_name'] +'\', ' +
                     'value: ' + str(fg.degree(idv, mode="in")) + ', ' +
                     'title: \'UID: ' + str(fg.vs[idv]['name']) +
                     '<br> Screen Name: ' + user['screen_name'] +
                     '<br> Followers: ' + str(user['followers_count']) +
                     '<br> Followees: ' + str(user['friends_count']) +
                     '<br> Tweets: ' + str(user['statuses_count']) +
                     '<br> Description: ' + str(desc.encode('utf-8')) +
                     '<br> Group: ' + str(fg.vs[idv]['group']) + '\', ' +
                     'x: ' + str(Coo[fg.vs[idv]['group']][0]+rand.randint(0, 300)) + ', ' +
                     'y: ' + str(Coo[fg.vs[idv]['group']][1]+rand.randint(0, 300)) + ', ' +
                     'group: ' + str(fg.vs[idv]['group']) + ', ')
            # if int(fg.vs[idv]['name']) in ed_users:
            #     fw.write('shape: ' + '\'triangle\'')
            # else:
            #     fw.write('shape: ' + '\'circle\'')
            fw.write('}, \n')
        fw.write('];\n var edges = [\n')
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source+1) + ', ' +
                     'to: ' + str(e.target+1) + ', ' +
                     'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Tags: ' + fg.vs[e.source]['name'] + ' ' + fg.vs[e.target]['name'] +
                     '<br> Co-occurrence: ' + str(fg.es[ide]['weight']) + '\', ' +
                     'value: ' + str(fg.es[ide]['weight']) +
                     '},\n') #str(fg.es[ide]['weight'])
        fw.write('];\n')
Example #36
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and followers together as variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            friends = set(network1.neighbors(str(uid))) # id or name
            if len(friends) > 0:
                friend_ids = [int(network1.vs[v]['name']) for v in friends] # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu != None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive)/len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index=False)
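
The friend-level covariates are column-wise means over each friend's attribute vector, followed by the friend count and the share still alive. The aggregation in isolation:

import numpy as np

fatts = np.array([[1.0, 4.0], [3.0, 8.0]])  # one row per qualifying friend
print(np.mean(fatts, axis=0))               # [2. 6.] -- per-attribute means
alive = 1
print(float(alive) / len(fatts))            # 0.5 -- fraction of friends still active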
Example #37
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['fr_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_'+field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_'+field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['co_'+field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' %uid
            followees = set([int(network1.vs[v]['name']) for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name']) for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    # friend_ids = [int(network1.vs[v]['name']) for v in friends]
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu != None:
                            fatt = iot.get_fields_one_doc(fu, fields) # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    row.extend([None] * attr_length)
            # friends = followers # followers
            # if len(friends) > 0:
            #     friend_ids = [int(network1.vs[v]['name']) for v in friends]
            #     print uid in friend_ids
            #     print len(friend_ids)
            #     fatts = []
            #     alive = 0
            #     for fid in friend_ids:
            #         fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
            #         fu2 = com2.find_one({'id': fid})
            #         if fu != None:
            #             fatt = iot.get_fields_one_doc(fu, fields)
            #             fatt.extend(active_days(fu))
            #             fatts.append(fatt)
            #             if fu2 is None or fu2['timeline_count'] == 0:
            #                 alive += 0
            #             else:
            #                 alive += 1
            #     if len(fatts) > 0:
            #         fatts = np.array(fatts)
            #         fmatts = np.mean(fatts, axis=0)
            #         row.extend(fmatts)
            #         row.append(len(fatts))
            #         paliv = float(alive)/len(fatts)
            #         print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
            #         row.append(paliv)
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)