Example No. 1
#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']

print 'num users: ', len(user_sns)

of = codecs.open("output_fil.tsv", "w", "utf8")
for i in range(len(user_sns)):
    #creates a Twitter User object to fill with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] +
                                  ".json",
                                  sleep_var=False)
    user.populate_followers()
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    of.write(
        tab_stringify_newline([
            user.screen_name, gt, rts,
            len(user.tweets), user.earliest_tweet_time, user.latest_tweet_time,
            user.name, user.n_total_tweets, user.creation_date,
            user.followers_count, user.following_count
        ]))

of.close()

Example No. 2

for i in range(len(user_sns)):
    #creates a Twitter User object to fill with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir+user_sns[i]+".json",
                                  sleep_var=False)
    user.populate_followers()
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    of.write(tab_stringify_newline([user.screen_name,
                                    gt,
                                    rts,
                                    len(user.tweets),
                                    user.earliest_tweet_time,
                                    user.latest_tweet_time,
                                    user.name,
                                    user.n_total_tweets,
                                    user.creation_date,
                                    user.followers_count,
                                    user.following_count]))

of.close()
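
None of these listings shows the body of tab_stringify_newline itself: every call passes a list of mixed-type fields, and the return value is written straight to a tab-separated file. A hypothetical stand-in with that behavior (stringify each field, join with tabs, append a newline) would be:

# Hypothetical stand-in, not the library's implementation: the examples only
# ever write its return value directly to a .tsv file, so the helper
# presumably stringifies each field, tab-joins them, and adds a newline.
def tab_stringify_newline(fields):
    return u"\t".join(u"" if f is None else u"{0}".format(f) for f in fields) + u"\n"

Note that Example No. 8 below scrubs embedded tabs and newlines out of free-text fields before calling it, which suggests the real helper does no escaping of its own.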



Example No. 3
    ht_ht_file = codecs.open(
        dir1 + "ht_ht_edgefile.tsv", "w", "utf8"
    )  # hashtag by hashtag network edgelist where co-occurrence in a tweet defines an edge
    mention_file = codecs.open(
        dir1 + "mention_edgefile.tsv", "w", "utf8"
    )  # user by user network edgelist of mention ties
    user_url_file = codecs.open(
        dir1 + "user_url_edgefile.tsv", "w", "utf8"
    )  # bipartite network edgelist of users and urls
    url_mention_file = codecs.open(dir1 + "url_mention_edgefile.tsv", "w", "utf8")  #
    attribute_file = codecs.open(dir1 + "attribute.tsv", "w", "utf8")  # node attribute table
    geo_file = codecs.open(dir1 + "geofile.tsv", "w", "utf8")  # a list of all geo tagged tweets
    lang_file = codecs.open(dir1 + "langfile.tsv", "w", "utf8")  # language frequencies at the user level
    retweet_file = codecs.open(dir1 + "retweet.tsv", "w", "utf8")  # user by user where an edge is a retweet

    friend_file.write(tab_stringify_newline(["Source", "Target"]))
    user_ht_file.write(tab_stringify_newline(["Source", "Target", "tweetID", "date"]))
    user_url_file.write(tab_stringify_newline(["Source", "Target", "tweetID", "date", "urls_tweet"]))
    ht_ht_file.write(tab_stringify_newline(["userID", "hashtag_A", "hashtag_B", "tweetID", "date"]))
    mention_file.write(tab_stringify_newline(["Source", "Target", "tweetID", "date"]))
    url_mention_file.write(tab_stringify_newline(["Source", "Target", "url", "tweetID", "date"]))
    geo_file.write(tab_stringify_newline(["userID", "lat", "lon", "date"]))
    lang_file.write(tab_stringify_newline(["userID", "lang", "count"]))
    retweet_file.write(tab_stringify_newline(["Source", "retweet_sn", "tweetID", "date"]))
    attribute_file.write(
        tab_stringify_newline(
            [
                "userID",
                "ScreenName",
                "followingCount",
                "followerCount",
Example No. 4
    except:
        mkdir(dir2)

    friend_file = codecs.open(dir1+"friend_edgefile.tsv", "w", "utf8")
    user_ht_file = codecs.open(dir1+"user_ht_edgefile.tsv", "w", "utf8")
    ht_ht_file = codecs.open(dir1+"ht_ht_edgefile.tsv", 'w', 'utf8')
    mention_file = codecs.open(dir1+"mention_edgefile.tsv", "w", "utf8")
    user_url_file = codecs.open(dir1+"user_url_edgefile.tsv", "w", "utf8")
    url_mention_file = codecs.open(dir1+"url_mention_edgefile.tsv", "w", "utf8")
    attribute_file = codecs.open(dir1+"attribute.tsv", "w", "utf8")
    geo_file = codecs.open(dir1+"geofile.tsv", "w", "utf8")
    lang_file = codecs.open(dir1+"langfile.tsv", "w", "utf8")
    retweet_file = codecs.open(dir1+"retweet.tsv", "w", "utf8")


    friend_file.write(tab_stringify_newline(['Source', 'Target']))
    user_ht_file.write(tab_stringify_newline(['Source', 'Target','tweetID', 'date']))
    user_url_file.write(tab_stringify_newline(['Source', 'Target','tweetID', 'date','urls_tweet']))
    ht_ht_file.write(tab_stringify_newline(['userID','hashtag_A','hashtag_B','tweetID','date']))
    mention_file.write(tab_stringify_newline(['Source', 'Target','tweetID', 'date']))
    url_mention_file.write(tab_stringify_newline(['Source', 'Target','url','tweetID', 'date']))
    geo_file.write(tab_stringify_newline(['userID', 'lat', 'lon', 'date']))
    lang_file.write(tab_stringify_newline(['userID', 'lang', 'count']))
    retweet_file.write(tab_stringify_newline(['Source', 'retweet_sn','tweetID', 'date']))
    attribute_file.write(tab_stringify_newline(['userID', 'ScreenName', 'followingCount', 'followerCount',
                                                'tweetCount', 'tweetsCollected', 'firstTweet','lastTweet', 'creation_date',
                                                'urlCount', 'mentionCount']))


    for file in onlyfiles:
        try:
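
Example No. 4 is cut off right after the column headers are written, so none of the visible listings shows a row actually being appended to attribute.tsv. The sketch below is illustrative only (not code from the repository): it fills one row from the same TwitterUser fields that Example No. 1 writes, assumes tab_stringify_newline is in scope as in the listings above, and takes user_id, url_count, and mention_count as hypothetical inputs because the truncated examples never show how those columns are computed.

def write_attribute_row(attribute_file, user, user_id, url_count, mention_count):
    # One row matching the attribute.tsv header written above; the attribute
    # names on `user` are the TwitterUser fields used in Example No. 1.
    attribute_file.write(tab_stringify_newline([
        user_id,                    # userID (hypothetical input)
        user.screen_name,           # ScreenName
        user.following_count,       # followingCount
        user.followers_count,       # followerCount
        user.n_total_tweets,        # tweetCount
        len(user.tweets),           # tweetsCollected
        user.earliest_tweet_time,   # firstTweet
        user.latest_tweet_time,     # lastTweet
        user.creation_date,         # creation_date
        url_count,                  # urlCount (hypothetical input)
        mention_count]))            # mentionCount (hypothetical input)
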
Example No. 5
    user_url_file = codecs.open(
        dir1 + "user_url_edgefile.tsv", "w",
        "utf8")  # bipartite network edgelist of users and urls
    url_mention_file = codecs.open(dir1 + "url_mention_edgefile.tsv", "w",
                                   "utf8")  #
    attribute_file = codecs.open(dir1 + "attribute.tsv", "w",
                                 "utf8")  # node attribute table
    geo_file = codecs.open(dir1 + "geofile.tsv", "w",
                           "utf8")  # a list of all geo tagged tweets
    lang_file = codecs.open(dir1 + "langfile.tsv", "w",
                            "utf8")  # language frequencies at the user level
    retweet_file = codecs.open(
        dir1 + "retweet.tsv", "w",
        "utf8")  # user by user where an edge is a retweet

    friend_file.write(tab_stringify_newline(['Source', 'Target']))
    user_ht_file.write(
        tab_stringify_newline(['Source', 'Target', 'tweetID', 'date']))
    user_url_file.write(
        tab_stringify_newline(
            ['Source', 'Target', 'tweetID', 'date', 'urls_tweet']))
    ht_ht_file.write(
        tab_stringify_newline(
            ['userID', 'hashtag_A', 'hashtag_B', 'tweetID', 'date']))
    mention_file.write(
        tab_stringify_newline(['Source', 'Target', 'tweetID', 'date']))
    url_mention_file.write(
        tab_stringify_newline(['Source', 'Target', 'url', 'tweetID', 'date']))
    geo_file.write(tab_stringify_newline(['userID', 'lat', 'lon', 'date']))
    lang_file.write(tab_stringify_newline(['userID', 'lang', 'count']))
    retweet_file.write(
Example No. 6
print "N TO FIND: ", len(user_sns)

#user_ids = [u for u in user_ids]
user_ids = [u.lower() for u in user_sns]
out_fil = io.open(sys.argv[3], "w")

i = 0
j = 0
print len(user_sns)

out_fil.write(
    tab_stringify_newline([
        "id", 'name', "screen_name", 'url', 'protected', 'location',
        'description', "followers_count", "friends_count", "favourites_count",
        "created_at", "utc_offset", 'time_zone', "statuses_count", "lang",
        "status_created_at", "status_coordinates", "status_lang",
        "profile_image_url_https", "verified"
    ]))

while i < len(user_sns):
    j += 1
    print j
    api_hook = handles[random.randint(0, len(handles) - 1)]

    curr_ids = set(user_ids[i:(i + 100)])
    user_data = api_hook.get_from_url("users/lookup.json", {
        "user_id": ",".join(curr_ids),
        "include_entities": "false"
    })
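
Example No. 6 slices user_ids in chunks of 100 because Twitter's users/lookup endpoint accepts at most 100 IDs or screen names per request; the listing is cut off before the loop advances to the next slice. A small generic batching helper (illustrative only, not part of these scripts) makes that slicing explicit and can be combined with the same api_hook.get_from_url call:

# Illustration only: yield successive slices of at most `size` identifiers,
# matching the 100-per-request cap on Twitter's users/lookup endpoint.
def batches(ids, size=100):
    for start in range(0, len(ids), size):
        yield ids[start:start + size]

# e.g.:
# for batch in batches(user_ids):
#     user_data = api_hook.get_from_url("users/lookup.json",
#                                       {"user_id": ",".join(batch),
#                                        "include_entities": "false"})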
    "processed_data/50mpaths2", "dictionaries/*/*",
    BOOTSTRAPPED_DICTIONARY_LOCATION)

features_from_conll_file, dictionary_data = get_all_features(
    CONLL_FILE, all_dictionaries, ark_clusters, sets, names)

# get ids of all random, put in a set, then iterate through
random.seed(0)
all_random_ids = get_test_ids(CONLL_FILE, 0, -1, -1)
random.shuffle(all_random_ids)

output_file = open("results/param_tuning_results.tsv", "w")

for fold in range(5):

    output, models, preds = run_all_on_test_ids(
        fold,
        all_random_ids[(fold * 150):((fold + 1) * 150)],
        model,
        features_from_conll_file,
        dictionary_data,
        eval_params=[.4, .45, .5, .55, .6],
        cutoff_params=[.0001, .0005, .001],
        use_filtered_params=[True, False],
        datasets_to_use=['x', 'wv', 'x_wv', 'all_wv', 'x_wv_ls', 'full'],
        regularization_params=[.53, .58, .6, .63, .65])
    for o in output:
        output_file.write(tab_stringify_newline(o))

output_file.close()
Example No. 8
    def run(self):
        print('Worker started')
        # do some initialization here

        while True:
            data = self.queue.get(True)
            try:
                if data is None:
                    print('ALL FINISHED!!!!', self.conn_number)
                    self.out_file.close()
                    self.user_info_out_file.close()
                    break
                print 'collecting data'
                if self.gets_user_id:
                    user_data = self.api_hook.get_from_url(
                        "users/lookup.json", {
                            "user_id": ",".join(data),
                            "include_entities": "false"
                        },
                        do_post=True)
                else:
                    user_data = self.api_hook.get_from_url(
                        "users/lookup.json", {
                            "screen_name": ",".join(data),
                            "include_entities": "false"
                        },
                        do_post=True)
                user_ret_ids = [str(u['id']) for u in user_data]
                print len(data), len(user_ret_ids)
                not_there = set.difference(set(data), set(user_ret_ids))
                print len(not_there)
                for u in not_there:
                    self.out_file.write(tab_stringify_newline([u]))
                print 'sleeping'

                for user in user_data:
                    output_data = [
                        user["id"],
                        user.get('name'), user["screen_name"],
                        user.get('url', ''), user['protected'],
                        user.get('location', ''),
                        user.get('description', ''), user["followers_count"],
                        user["friends_count"], user["created_at"],
                        user.get("utc_offset", ''),
                        user.get('time_zone',
                                 ''), user["statuses_count"], user["lang"]
                    ]
                    if 'status' in user:
                        output_data += [
                            user["status"]["created_at"],
                            user["status"]["coordinates"]
                            if user['status']['coordinates'] else '',
                            user["status"]["lang"]
                        ]
                    else:
                        output_data += ['', '', '']

                    output_data += [
                        user.get("profile_image_url_https", ""),
                        user.get("verified", "")
                    ]

                    output_data = [(x.replace("\r\n", "  ").replace(
                        "\n", "  ").replace("\r", "  ").replace("\t", "  "))
                                   if type(x) is str else x
                                   for x in output_data]
                    output_data = [(x.replace(u"\r\n", u"  ").replace(
                        u"\n", u"  ").replace(u"\r", "  ").replace(
                            u"\t", u"  ")) if type(x) is unicode else x
                                   for x in output_data]
                    to_write = tab_stringify_newline(output_data)

                    self.user_info_out_file.write(to_write)

                sleep(15)

            except KeyboardInterrupt as e:
                print e
                break
            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)
                print("*** print_exception:")

            print('finished collecting data for: ', data)
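
The run() method above is the consumer half of a producer/consumer setup: it blocks on self.queue.get(True), looks up each batch of IDs it receives, and shuts down cleanly when it pulls a None sentinel. The producer half is not shown in these listings; a minimal sketch, assuming a standard multiprocessing.Queue and one sentinel per worker, could look like this:

from multiprocessing import Queue

# Minimal producer sketch (not from the original scripts): put batches of at
# most 100 user-ID strings on the queue, then one None per worker so every
# run() loop above breaks out and closes its output files.
def feed_queue(queue, user_ids, n_workers, batch_size=100):
    for start in range(0, len(user_ids), batch_size):
        queue.put(user_ids[start:start + batch_size])
    for _ in range(n_workers):
        queue.put(None)

# usage sketch:
#   q = Queue()
#   feed_queue(q, all_user_ids, n_workers=4)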

Example No. 9

    def run(self):
        print('Worker started')
        # do some initialization here

        while True:
            data = self.queue.get(True)
            try:
                if data is None:
                    print('ALL FINISHED!!!!', self.conn_number)
                    self.out_file.close()
                    self.user_info_out_file.close()
                    break
                print 'collecting data'
                user_data = self.api_hook.get_from_url("users/lookup.json",
                                                       {"user_id": ",".join(data), "include_entities": "false"})
                user_ret_ids = [str(u['id']) for u in user_data]
                print len(data),len(user_ret_ids)
                not_there = set.difference(set(data),set(user_ret_ids))
                print len(not_there)
                for u in not_there:
                    self.out_file.write(tab_stringify_newline([u]))
                print 'sleeping'

                for user in user_data:
                    output_data = [user["id"],
                                   user.get('name'),
                                   user["screen_name"],
                                   user.get('url',''),
                                   user['protected'],
                                   user.get('location',''),
                                   user.get('description', ''),
                                   user["followers_count"],
                                   user["friends_count"],
                                   user["created_at"],
                                   user.get("utc_offset",''),
                                   user.get('time_zone',''),
                                   user["statuses_count"],
                                   user["lang"]]
                    if 'status' in user:
                        output_data += [user["status"]["created_at"],
                                        user["status"]["coordinates"] if user['status']['coordinates'] else '',
                                        user["status"]["lang"]]
                    else:
                        output_data += ['','','']

                    output_data += [user.get("profile_image_url_https",""),user.get("verified","")]

                    output_data = [(x.replace("\r\n","  ")
                                    .replace("\n","  ")
                                    .replace("\r","  ")
                                    .replace("\t","  ")) if type(x) is str else x for x in output_data ]
                    output_data = [(x.replace(u"\r\n",u"  ")
                                    .replace(u"\n",u"  ")
                                    .replace(u"\r","  ")
                                    .replace(u"\t",u"  ")) if type(x) is unicode else x for x in output_data ]
                    to_write = tab_stringify_newline(output_data)

                    self.user_info_out_file.write(to_write)

                sleep(15)


            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)
                print("*** print_exception:")

            print('finished collecting data for: ', data)
                                                                   "processed_data/50mpaths2",
                                                                   "dictionaries/*/*",
                                                                   BOOTSTRAPPED_DICTIONARY_LOCATION)

features_from_conll_file, dictionary_data = get_all_features(CONLL_FILE,all_dictionaries,ark_clusters,sets,names)

# get ids of all random, put in a set, then iterate through
random.seed(0)
all_random_ids = get_test_ids(CONLL_FILE, 0, -1, -1)
random.shuffle(all_random_ids)

output_file = open("results/param_tuning_results.tsv","w")

for fold in range(5):

    output, models, preds = run_all_on_test_ids(fold,
                                                all_random_ids[(fold*150):( (fold+1)*150)],
                                                model,
                                                features_from_conll_file,
                                                dictionary_data,
                                                eval_params = [.4,.45,.5,.55,.6],
                                                cutoff_params=[.0001,.0005,.001],
                                                use_filtered_params=[True,False],
                                                datasets_to_use = ['x','wv','x_wv','all_wv','x_wv_ls','full'],
                                                regularization_params = [.53,.58,.6,.63,.65])
    for o in output:
        output_file.write(tab_stringify_newline(o))

output_file.close()