Example 1
def build_documents_country(directory):
    '''
    Build one document per country, for constructing a term-document
    matrix (TDM), from a directory of Flume files.

    directory: a directory with Flume files
    '''

    documents = dict()

    resolver = carmen.get_resolver()
    resolver.load_locations()

    raw_tweets = get_tweets(directory)

    for tweet in raw_tweets:
        try:
            country = resolver.resolve_tweet(tweet)[1].country
        except TypeError:
            # resolve_tweet returned None: no location could be resolved
            continue

        text = clean_text(tweet["text"])
        try:
            documents[country] += text
        except KeyError:
            documents[country] = text

    return documents
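
A hedged usage sketch of the function above; the directory path is a placeholder, and get_tweets and clean_text are the helpers the function already assumes:

# Hypothetical usage: build per-country documents from a placeholder Flume directory.
docs = build_documents_country("data/flume/2020-01-01/")
for country, text in sorted(docs.items()):
    print(country, len(text.split()), "tokens")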
Example 2
    def __init__(self, *args, **kwargs) -> None:
        """Setup Carmen geotagging options, then init super."""
        with warnings.catch_warnings():
            # The default setup of carmen appears to raise several warnings;
            # we suppress them with the catch_warnings context manager.
            warnings.simplefilter("ignore")
            resolver_options = {'place': {'allow_unknown_locations': True}}
            self.geotagger = get_resolver(options=resolver_options)
            self.geotagger.load_locations()
            self.location_resolver = LocationEncoder()

        super().__init__(*args, **kwargs)  # type: ignore
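
For reference, a minimal standalone sketch of the same carmen setup outside the class; the sample tweet and its fields are made up and may or may not resolve against carmen's location database:

import warnings
from carmen import get_resolver

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # carmen's setup can emit warnings, as noted above
    resolver = get_resolver(options={'place': {'allow_unknown_locations': True}})
    resolver.load_locations()

# Made-up minimal tweet: only a user profile location is present, so it would be
# handled by carmen's profile resolver, if it resolves at all.
sample_tweet = {"user": {"location": "Baltimore, MD"}}
result = resolver.resolve_tweet(sample_tweet)
if result:
    print(result[1].country, result[1].state, result[1].city)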
def oneDayTest_revised():

    locfile = open("../data/realTwitter/testLocationChek.txt", "a+")
    loc_day = 0
    total_day = 0
    global Fcount

    # Build the resolver once; loading carmen's location database for every
    # tweet would be prohibitively slow.
    resolver = carmen.get_resolver()
    resolver.load_locations()

    for filename in os.listdir("/home/kitware/aalim/data/Twitter/"):
        print "Processing File " + filename
        count = 0
        totalcount = 0
        with gzip.open("/home/kitware/aalim/data/Twitter/" + filename,
                       "rb") as tf:
            for line in tf:

                totalcount += 1
                tweet = json.loads(line)

                # Deletion notices carry no tweet payload.
                if "delete" in tweet:
                    continue

                # Skip tweets with no geo information at all.
                if tweet['geo'] is None and tweet["place"] is None \
                        and tweet["coordinates"] is None:
                    continue

                count += 1
                label, location = resolver.resolve_tweet(tweet)
                print "<==" + filename.split("%")[0] + " .... " + str(count) + "/" + str(totalcount)
                locfile.write(str(location) + "\n")

            print(str(count) + "/" + str(totalcount))
            Fcount += 1
            loc_day += count
            total_day += totalcount

    locfile.close()
    print(str(loc_day) + "/" + str(total_day))
    return str(loc_day) + "/" + str(total_day)
    def __init__(self, **kwargs):
        '''
        Initialization for CarmenLocationAnnotator

        Parameters
        ----------
        keep_location: dict
            A dictionary of location attributes (country, state, county, city).
            For each attribute, the value is a list of accepted values, e.g.
            {'country': ['United States'], 'state': ['Maryland', 'Colorado']}
        '''
        self.carmen = carmen.get_resolver()
        self.carmen.load_locations()
        self.keep_location = kwargs.get('keep_location', None)
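
A hypothetical sketch, not part of the original annotator, of how such a keep_location dict might be applied to a resolved carmen Location:

# Hypothetical helper: keep a resolved Location only when one of its attributes
# matches an allowed value from keep_location; an empty filter keeps everything.
def location_matches(location, keep_location):
    if not keep_location:
        return True
    return any(getattr(location, attr, None) in allowed
               for attr, allowed in keep_location.items())

# e.g. location_matches(loc, {'country': ['United States'],
#                             'state': ['Maryland', 'Colorado']})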
Example 5
def main_singleFile():
    locfile = open("../data/realTwitter/testLocationChek.txt", "a+")
    count = 0
    totalcount = 0

    # Build the resolver once; loading the location database per tweet is very slow.
    resolver = carmen.get_resolver()
    resolver.load_locations()

    with gzip.open("/home/kitware/aalim/data/Twitter/tweets.2013-05-01T00%3A00.M=000.gz", "rb") as tf:
        for i, line in enumerate(tf):

            totalcount += 1
            if i <= 200:
                tweet = json.loads(line)

                # Deletion notices carry no tweet payload.
                if "delete" in tweet:
                    continue

                # Skip tweets with no geo information at all.
                if tweet['geo'] is None and tweet["place"] is None \
                        and tweet["coordinates"] is None:
                    continue

                count += 1
                location = resolver.resolve_tweet(tweet)
                print "-----------------" + str(count) + "/" + str(totalcount) + "----------------\n"
                print(location)
                print(tweet['geo'], tweet["place"], tweet["coordinates"])
                print(json.dumps(tweet, indent=4))
                locfile.write(str(location) + "\n")

                for key, value in tweet.items():
                    print key, value
            else:
                break

    locfile.close()
    print(str(count) + "/" + str(totalcount))
Example 6
import json
import carmen
import glob

# Build the resolver once; loading the location database is the expensive step.
resolver = carmen.get_resolver()
resolver.load_locations()

file_count = len(glob.glob("file-*.txt"))

for file_number in range(file_count):
    # Recompute the file name for each iteration so every file-N.txt is read.
    file_name = "file-" + str(file_number) + ".txt"
    row_number = 1
    with open(file_name) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            location = resolver.resolve_tweet(tweet)

            if location is not None:
                with open('carmenized.txt', 'a') as f:
                    f.write(str(location) + '\n')

            # Checkpoint the current row so an interrupted run can be resumed.
            with open('row_number', 'w') as f:
                f.write(str(row_number))
            row_number += 1

    # Checkpoint the index of the file that was just completed.
    with open('file_number', 'w') as f:
        f.write(str(file_number))
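
Because the loop checkpoints its progress in the row_number and file_number files, a later run could pick up where it stopped; a minimal sketch, assuming both checkpoint files exist from a previous run:

# Hedged sketch: read the checkpoints written by the loop above.
with open('file_number') as f:
    last_file = int(f.read().strip())
with open('row_number') as f:
    last_row = int(f.read().strip())
print("last completed file: file-%d.txt, last processed row: %d" % (last_file, last_row))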
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Created on Tue Nov  15 20:48:36 2015
# @author: mgaurav
# Python script to compute the geolocation of all tweets contained in a JSON file (first input argument), one tweet per line. It prints the country and state for each tweet, or null when no location can be resolved for a given tweet ID.
#

import json
import carmen
import sys

# .encode('utf-8') below is necessary to avoid: UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in position 7: ordinal not in range(128)
print "tweet_id\tuser_id\tcountry\tstate"

# Build the resolver once; reloading the location database for every tweet is slow.
resolver = carmen.get_resolver()
resolver.load_locations()

with open(sys.argv[1]) as f:
  for line in f:
    tweet = json.loads(line)
    location = resolver.resolve_tweet(tweet)
    if location and location[1].state:
      print tweet["id"], "\t", tweet["user"]["id"], "\t", location[1].country.encode('utf-8'), "\t", location[1].state.encode('utf-8')
    else:
      print tweet["id"], "\t", tweet["user"]["id"], "\t", "null", "\t", "null"
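
The script writes one tab-separated row per tweet to stdout; a hedged sketch of reading that output back, assuming it was redirected to a file named geo.tsv (a placeholder name):

# Hypothetical follow-up: parse the TSV produced by the script above.
import csv

with open("geo.tsv") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        # The Python 2 print statements above also add spaces around each tab,
        # so the field values are stripped here.
        print(row["tweet_id"].strip(), row["country"].strip(), row["state"].strip())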
## Lab Resources
import carmen  # https://github.com/mdredze/carmen-python
from demographer.gender import CensusGenderDemographer  # https://bitbucket.org/mdredze/demographer/src/master/
from demographer.indorg import IndividualOrgDemographer
from demographer import process_tweet

#######################
### Globals
#######################

## Logging
LOGGER = initialize_logger()

## Geolocation Resolver
GEO_RESOLVER = carmen.get_resolver(order=["place", "geocode", "profile"])
GEO_RESOLVER.load_locations()

## Demographers
DEMOGRAPHERS = [CensusGenderDemographer(), IndividualOrgDemographer()]

## Column Map
COLUMN_MAP = {
    "location": [
        "longitude",
        "latitude",
        "country",
        "state",
        "county",
        "city",
    ],
Example 9
def processData(json_file, rts=10, start_at=None):
    """
    takes in json file of either sdoh risk training dataset or disease subjects. 

    for each user, grabs its user profile data and the recent tweets from the tweepy api.

    adds location and gender data to the user profiles.

    returns a last_n_tweets json object and a user_profile json object.
    """
    #Twitter Creds
#     twitter_app_auth = {
#         'consumer_key': '',
#         'consumer_secret': '',
#         'access_token': '',
#         'access_token_secret': ''
#     }

    # API setup (rate-limit handling belongs on the API object, not on individual calls)
    auth = tweepy.OAuthHandler(twitter_app_auth['consumer_key'], twitter_app_auth['consumer_secret'])
    auth.set_access_token(twitter_app_auth['access_token'], twitter_app_auth['access_token_secret'])
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Carmen setup
    resolver = carmen.get_resolver()
    resolver.load_locations()

    # File setup 
    file_directory = json_file
    json_data=open(file_directory).read()
    users = json.loads(json_data)

    if start_at:
        start_indx = [users.index(user) for user in users if user['username'] == start_at]
        users = users[start_indx[0]:]

    # Mashape Key for botometer
    mashape_key = 'TonZ1SlGz7mshDB8TSdsbjQebLgHp16UAtojsnSFkac2fxpBTa'

    # Filter for Twitter profiles located in the US; results are written out in batches of 100 profiles.
    twitter_profiles = []
    all_recent_tweets = []
    usa_usernames = []
    counter = 0
    for tweet in users:
        try:
            if tweet['username'] not in usa_usernames:
                profile = api.get_user(tweet['username'])
                recent_tweets = api.user_timeline(tweet['username'], count=rts, max_id=int(tweet['marker_tweet_id'])-1)
                if recent_tweets:
                    recent_tweet = recent_tweets[0]
                    location = resolver.resolve_tweet(recent_tweet._json)
                else:
                    location = None
                if location:
                    if location[1].country == 'United States':
                        print 'processing %s...' % tweet['username']
                        print 'recent tweets for %s: %s' % (tweet['username'], len(recent_tweets))
                        profile._json['county'] = location[1].county
                        profile._json['latitude'] = location[1].latitude
                        profile._json['longitude'] = location[1].longitude
                        profile = add_gender(profile, recent_tweet)
                        # is it a bot?
                        bom = None
                        while not bom:
                            try:
                                print "checking whether or not it's a bot..."
                                bom = botometer.Botometer(wait_on_ratelimit=True, mashape_key=mashape_key, **twitter_app_auth)
                            except Exception as e:
                                print 'probably timeout error. Waiting 1 minute before trying again...'
                                time.sleep(60)
                        result = bom.check_account(tweet['username'])
                        profile._json['bot_likelihood'] = result['scores']['universal']
                        twitter_profiles.append(profile)
                        all_recent_tweets.append(recent_tweets)
                        usa_usernames.append(tweet['username'])
                        counter += 1
                        if counter == 100:
                            print '100 profiles hit; writing jsons before moving onto the next batch.'
                            usa_users = [x for x in users if x['username'] in usa_usernames]
                            final_twitter_profiles = [user._json for user in twitter_profiles]
                            final_recent_tweets = [status._json for recent_tweets in all_recent_tweets for status in recent_tweets]
                            print 'processed %s (%s) profiles.' % (counter, len(usa_users))
                            print '%s recent tweets. ' % len(final_recent_tweets)
                            write_to_json(final_twitter_profiles, final_recent_tweets, usa_users, json_file)
                            twitter_profiles = []
                            all_recent_tweets = []
                            usa_usernames = []
                            counter = 0
        except tweepy.TweepError as e:
            print e.message
            if 'Failed to send request:' in e.reason:
                print "Time out error caught."
                time.sleep(180)
            elif e.message == 'Not authorized.':
                pass
            elif e.message[0]['message'] == 'Rate limit exceeded':
                print 'rate limit exceeded. waiting 15 minutes...'
                time.sleep(60 * 15)
    usa_users = [x for x in users if x['username'] in usa_usernames]
    final_twitter_profiles = [user._json for user in twitter_profiles]
    final_recent_tweets = [status._json for recent_tweets in all_recent_tweets for status in recent_tweets]
    print 'processed %s (%s) profiles.' % (counter, len(usa_users))
    print '%s recent tweets. ' % len(final_recent_tweets)
    return final_twitter_profiles, final_recent_tweets, usa_users
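
A hedged invocation sketch; the file name is a placeholder, and the Twitter credentials plus the helper functions the example relies on (add_gender, write_to_json, botometer setup) must already be defined:

# Hypothetical call: pull 20 recent tweets per user from a placeholder input file.
profiles, recent_tweets, usa_users = processData("sdoh_users.json", rts=20)
print("%d US profiles retained, %d recent tweets collected" % (len(profiles), len(recent_tweets)))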
Example 10
def carmen_initializer(partition):
    # Runs once per partition (e.g. with Spark's mapPartitions), so the resolver
    # is built a single time and reused for every line in the partition.
    print "\nPartition\n"
    resolver = carmen.get_resolver()
    resolver.load_locations()
    for line in partition:
        yield carmenizer(line, resolver)
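
carmen_initializer is shaped like a Spark mapPartitions callback; a hedged sketch of wiring it up, assuming an existing SparkContext sc, the carmenizer helper referenced above, and placeholder paths:

# Hypothetical Spark usage: one resolver per partition instead of one per tweet.
tweets_rdd = sc.textFile("hdfs:///data/tweets/*.gz")
locations_rdd = tweets_rdd.mapPartitions(carmen_initializer)
locations_rdd.saveAsTextFile("hdfs:///data/carmenized")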
def build_tables(config):
    """
    Function to associate tweets with URLs, keywords, accounts and locations.
    Parameters:
        config (dict): A dictionary with config information about paths and filenames
    Output:
        It saves several dataframes with two columns depending on the object associated:
        1) | tweet_id | account_id |
        2) |tweet_id | location |
        ... etc
    """

    ## Initialize carmen geolocation

    resolver = get_resolver()
    resolver.load_locations()

    ## dictionaries for the association tweet -> object
    tweet_account = dict()
    tweet_location = dict()
    tweet_carmen_location = dict()
    tweet_url = defaultdict(list)
    tweet_keyword = defaultdict(list)

    ## load file of keywords
    keywords = load_keywords_file('keywords.txt')

    for file in sorted(
            glob.glob(config["PATHS"]["TW_FILES_FOLDER"] +
                      "/*json")):  # iterating over all Twitter data files
        print("Processing : " + str(file))
        start = timeit.default_timer()  # to monitor running time
        with open(file, 'r') as f:
            for line in f.readlines():
                try:
                    j = json.loads(line)
                    tweet_id = j["id_str"]

                    ## 1) extracting all URLs from tweets/retweets (including extended) ##
                    found_urls = set()
                    urls_entry = get_dict_path(j, ['entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(
                        j, ['extended_tweet', 'entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(
                        j, ['retweeted_status', 'entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(j, [
                        'retweeted_status', 'extended_tweet', 'entities',
                        'urls'
                    ])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))

                    for url in found_urls:  # iterate over the SET of found URLs
                        domain = extract_top_domain(url)
                        if domain == "twitter.com":  # ignore twitter.com
                            continue
                        # associate tweet_id and url
                        tweet_url[tweet_id].append(url)

                    ## 2) extracting account, its location and matching it with carmen ##
                    account = j["user"]
                    account_id = j["user"]["id_str"]

                    tweet_account[tweet_id] = account_id

                    tweet_location[tweet_id] = str(account["location"])

                    result = resolver.resolve_tweet({'user': account})
                    if not result:
                        match = "No match!"
                    else:
                        match = str(result[1])
                        # result[1] is a Location() object, e.g. Location(country='United Kingdom', state='England', county='London', city='London', known=True, id=2206)

                    tweet_carmen_location[tweet_id] = match

                    ## 3) match keywords in the tweet ##
                    found_keywords = search_tweet_for_keywords(j, keywords)
                    for keyword in found_keywords:
                        tweet_keyword[tweet_id].append(keyword)

                except Exception as e:
                    print(e)
                    print(line)
    print("Processed tweets: " + str(tweet_account.__len__()))

    ## Manually writing tables to .csv files ##
    names = ["account", "location", "carmen_location", "url", "keyword"]
    tables = [tweet_account, tweet_location, tweet_carmen_location, tweet_url,
              tweet_keyword]
    for name, data in zip(names, tables):  # looping over the different dictionaries
        print("Dumping table for " + str(name))

        filepath = os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"],
                                "tweet_" + name + "_table.csv")
        with open(filepath, 'w', newline='') as csvfile:
            fieldnames = ['tweet_id', name]
            writer = csv.DictWriter(csvfile,
                                    fieldnames=fieldnames,
                                    delimiter=',')
            writer.writeheader()
            for k in data:
                if name in ["url", "keyword"]:
                    # The mapping is tweet_id -> list(objects); write one row per value.
                    for val in data[k]:
                        writer.writerow({'tweet_id': k, name: val})
                else:
                    writer.writerow({'tweet_id': k, name: data[k]})
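
A hedged invocation sketch for build_tables; the paths are placeholders, and the helper functions it references (get_dict_path, get_urls, extract_top_domain, search_tweet_for_keywords, load_keywords_file) are assumed to be defined alongside it:

# Hypothetical config: only the keys read by build_tables are shown.
config = {
    "PATHS": {
        "TW_FILES_FOLDER": "data/tweets",
        "INTERMEDIATE_DATA_DIR": "data/intermediate",
    }
}
build_tables(config)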