def build_documents_country(directory):
    '''
    Build the documents separated by country to get a TDM from a directory.

    directory: a directory with Flume files
    '''
    documents = dict()
    resolver = carmen.get_resolver()
    resolver.load_locations()
    raw_tweets = get_tweets(directory)
    for tweet in raw_tweets:
        try:
            country = resolver.resolve_tweet(tweet)[1].country
        except TypeError:
            # resolve_tweet() returned None: the tweet could not be geolocated.
            continue
        text = clean_text(tweet["text"])
        try:
            documents[country] += text
        except KeyError:
            documents[country] = text
    return documents
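# Not part of the original snippet: a minimal sketch of how the returned
# documents dict (country -> concatenated text) could be turned into a
# term-document matrix, assuming scikit-learn is available. The helpers used
# above (get_tweets, clean_text) are defined elsewhere in that codebase.
from sklearn.feature_extraction.text import CountVectorizer

def documents_to_tdm(documents):
    countries = sorted(documents)
    vectorizer = CountVectorizer()
    # Rows follow the order of `countries`; columns are the learned vocabulary.
    tdm = vectorizer.fit_transform(documents[c] for c in countries)
    return countries, vectorizer, tdm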
def __init__(self, *args, **kwargs) -> None:
    """Set up Carmen geotagging options, then init super."""
    with warnings.catch_warnings():
        # The default setup of carmen appears to raise several warnings; we
        # suppress them with the catch_warnings context manager.
        warnings.simplefilter("ignore")
        resolver_options = {'place': {'allow_unknown_locations': True}}
        self.geotagger = get_resolver(options=resolver_options)
        self.geotagger.load_locations()
    self.location_resolver = LocationEncoder()
    super().__init__(*args, **kwargs)  # type: ignore
def oneDayTest_revised():
    locfile = open("../data/realTwitter/testLocationChek.txt", "a+")
    loc_day = 0
    total_day = 0
    global Fcount
    # Build the resolver once; creating and loading it per tweet is very slow.
    resolver = carmen.get_resolver()
    resolver.load_locations()
    for filename in os.listdir("/home/kitware/aalim/data/Twitter/"):
        print("Processing File " + filename)
        count = 0
        totalcount = 0
        with gzip.open("/home/kitware/aalim/data/Twitter/" + filename, "rb") as tf:
            for line in tf:
                totalcount += 1
                tweet = json.loads(line)
                if "delete" in tweet:
                    continue
                if tweet['geo'] is None and tweet["place"] is None and tweet["coordinates"] is None:
                    continue
                count += 1
                label, location = resolver.resolve_tweet(tweet)
                print("<==" + filename.split("%")[0] + " .... " + str(count) + "/" + str(totalcount))
                #print(location, tweet['geo'], tweet["place"], tweet["coordinates"])
                #print(json.dumps(tweet, indent=4))
                locfile.write(str(location) + "\n")
        print(str(count) + "/" + str(totalcount))
        Fcount += 1
        loc_day += count
        total_day += totalcount
    return str(loc_day) + "/" + str(total_day)
def __init__(self, **kwargs):
    '''
    Initialization for CarmenLocationAnnotator

    Parameters
    ----------
    keep_location: dict
        A dictionary of location attributes (country, state, county, city).
        For each attribute, the value is a list of accepted values, e.g.
        {'country': ['United States'], 'state': ['Maryland', 'Colorado']}
    '''
    self.carmen = carmen.get_resolver()
    self.carmen.load_locations()
    self.keep_location = kwargs.get('keep_location', None)
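# Not shown in the original: a hedged sketch of how keep_location might be
# applied when annotating a tweet. The method name `annotate` and the return
# shape are assumptions; only the resolver call follows carmen's API.
def annotate(self, tweet):
    result = self.carmen.resolve_tweet(tweet)
    if result is None:
        return None
    location = result[1]
    if self.keep_location:
        for attr, allowed in self.keep_location.items():
            # e.g. attr == 'country', allowed == ['United States']
            if getattr(location, attr, None) not in allowed:
                return None
    return location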
def main_singleFile():
    locfile = open("../data/realTwitter/testLocationChek.txt", "a+")
    count = 0
    totalcount = 0
    # Build the resolver once; reloading the location database per tweet is unnecessary.
    resolver = carmen.get_resolver()
    resolver.load_locations()
    with gzip.open("/home/kitware/aalim/data/Twitter/tweets.2013-05-01T00%3A00.M=000.gz", "rb") as tf:
        for i, line in enumerate(tf):
            totalcount += 1
            if i > 200:  # only inspect the first ~200 lines of the file
                break
            tweet = json.loads(line)
            if "delete" in tweet:
                continue
            if tweet['geo'] is None and tweet["place"] is None and tweet["coordinates"] is None:
                continue
            count += 1
            location = resolver.resolve_tweet(tweet)
            print("-----------------" + str(count) + "/" + str(totalcount) + "----------------\n")
            print(location)
            print(tweet['geo'], tweet["place"], tweet["coordinates"])
            print(json.dumps(tweet, indent=4))
            locfile.write(str(location) + "\n")
            for key, value in tweet.items():
                print(key, value)
    print(str(count) + "/" + str(totalcount))
import glob
import json

import carmen

# Build the resolver once and reuse it for every tweet.
resolver = carmen.get_resolver()
resolver.load_locations()

file_count = len(glob.glob("file-*.txt"))
file_number = 0
for n in range(file_count):
    # Build the file name for this iteration.
    file_name = "file-" + str(file_number) + ".txt"
    row_number = 1
    with open(file_name) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            location = resolver.resolve_tweet(tweet)
            if location is not None:
                with open('carmenized.txt', 'a') as f:
                    f.write(str(location) + '\n')
            # Checkpoint the current row so an interrupted run can be resumed.
            with open('row_number', 'w') as f:
                f.write(str(row_number))
            row_number += 1
    # Checkpoint the current file before moving on to the next one.
    with open('file_number', 'w') as f:
        f.write(str(file_number))
    file_number += 1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created on Tue Nov 15 20:48:36 2015
# @author: mgaurav
#
# Script to compute the geolocation of every tweet in a JSON file (given as the
# first argument), one tweet per line. It prints the country and state for each
# tweet, or "null" when the tweet cannot be resolved.

import json
import sys

import carmen

# Under Python 2, an explicit .encode('utf-8') was needed to avoid:
# UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in position 7: ordinal not in range(128)

print("tweet_id\tuser_id\tcountry\tstate")

# Build the resolver once for the whole file.
resolver = carmen.get_resolver()
resolver.load_locations()

with open(sys.argv[1]) as f:
    for line in f:
        tweet = json.loads(line)
        location = resolver.resolve_tweet(tweet)
        if location and location[1].state:
            print(tweet["id"], tweet["user"]["id"],
                  location[1].country, location[1].state, sep="\t")
        else:
            print(tweet["id"], tweet["user"]["id"], "null", "null", sep="\t")
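# A quick sanity check, not part of the original script: carmen can resolve a
# location from the user profile alone, so a synthetic tweet like the one below
# should typically map to a US state. Whether it actually resolves depends on
# carmen's location database; the field values here are made up.
import carmen

resolver = carmen.get_resolver()
resolver.load_locations()

fake_tweet = {"id": 1, "user": {"id": 2, "location": "Baltimore, Maryland"}}
result = resolver.resolve_tweet(fake_tweet)
if result:
    print(result[1].country, result[1].state)
else:
    print("null", "null")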
## Lab Resources
import carmen  # https://github.com/mdredze/carmen-python
from demographer.gender import CensusGenderDemographer  # https://bitbucket.org/mdredze/demographer/src/master/
from demographer.indorg import IndividualOrgDemographer
from demographer import process_tweet

#######################
### Globals
#######################

## Logging
LOGGER = initialize_logger()

## Geolocation Resolver
GEO_RESOLVER = carmen.get_resolver(order=["place", "geocode", "profile"])
GEO_RESOLVER.load_locations()

## Demographers
DEMOGRAPHERS = [CensusGenderDemographer(), IndividualOrgDemographer()]

## Column Map
COLUMN_MAP = {
    "location": [
        "longitude",
        "latitude",
        "country",
        "state",
        "county",
        "city",
    ],
def processData(json_file, rts=10, start_at=None):
    """
    Takes in a JSON file of either the SDoH risk training dataset or disease
    subjects. For each user, grabs the user profile data and recent tweets from
    the tweepy API, then adds location and gender data to the user profiles.
    Returns a last_n_tweets JSON object and a user_profile JSON object.
    """
    # Twitter creds
    # twitter_app_auth = {
    #     'consumer_key': '',
    #     'consumer_secret': '',
    #     'access_token': '',
    #     'access_token_secret': ''
    # }

    # API setup
    auth = tweepy.OAuthHandler(twitter_app_auth['consumer_key'],
                               twitter_app_auth['consumer_secret'])
    auth.set_access_token(twitter_app_auth['access_token'],
                          twitter_app_auth['access_token_secret'])
    api = tweepy.API(auth)

    # Carmen setup
    resolver = carmen.get_resolver()
    resolver.load_locations()

    # File setup
    file_directory = json_file
    json_data = open(file_directory).read()
    users = json.loads(json_data)
    if start_at:
        start_indx = [users.index(user) for user in users
                      if user['username'] == start_at]
        users = users[start_indx[0]:]

    # Mashape key for Botometer
    mashape_key = 'TonZ1SlGz7mshDB8TSdsbjQebLgHp16UAtojsnSFkac2fxpBTa'

    # Filter for Twitter profiles in the US - just do 20 profiles by default
    twitter_profiles = []
    all_recent_tweets = []
    usa_usernames = []
    counter = 0
    for tweet in users:
        try:
            if tweet['username'] not in usa_usernames:
                profile = api.get_user(tweet['username'],
                                       wait_on_rate_limit=True,
                                       wait_on_rate_limit_notify=True)
                recent_tweets = api.user_timeline(tweet['username'], count=rts,
                                                  max_id=int(tweet['marker_tweet_id']) - 1)
                if recent_tweets:
                    recent_tweet = recent_tweets[0]
                    location = resolver.resolve_tweet(recent_tweet._json)
                else:
                    location = None
                if location:
                    if location[1].country == 'United States':
                        print('processing %s...' % tweet['username'])
                        print('recent tweets for %s: %s' % (tweet['username'], len(recent_tweets)))
                        profile._json['county'] = location[1].county
                        profile._json['latitude'] = location[1].latitude
                        profile._json['longitude'] = location[1].longitude
                        profile = add_gender(profile, recent_tweet)

                        # Is it a bot?
                        bom = None
                        while not bom:
                            try:
                                print("checking whether or not it's a bot...")
                                bom = botometer.Botometer(wait_on_ratelimit=True,
                                                          mashape_key=mashape_key,
                                                          **twitter_app_auth)
                            except Exception as e:
                                print('probably a timeout error; waiting 1 minute before trying again...')
                                time.sleep(60)
                        result = bom.check_account(tweet['username'])
                        profile._json['bot_likelihood'] = result['scores']['universal']

                        twitter_profiles.append(profile)
                        all_recent_tweets.append(recent_tweets)
                        usa_usernames.append(tweet['username'])
                        counter += 1
                        if counter == 100:
                            print('100 profiles hit; writing JSONs before moving on to the next batch.')
                            usa_users = [x for x in users if x['username'] in usa_usernames]
                            final_twitter_profiles = [user._json for user in twitter_profiles]
                            final_recent_tweets = [status._json
                                                   for recent_tweets in all_recent_tweets
                                                   for status in recent_tweets]
                            print('processed %s (%s) profiles.' % (counter, len(usa_users)))
                            print('%s recent tweets.' % len(final_recent_tweets))
                            write_to_json(final_twitter_profiles, final_recent_tweets, usa_users, json_file)
                            twitter_profiles = []
                            all_recent_tweets = []
                            usa_usernames = []
                            counter = 0
        except tweepy.TweepError as e:
            print(e.message)
            if 'Failed to send request:' in e.reason:
                print("Time out error caught.")
                time.sleep(180)
            elif e.message == 'Not authorized.':
                pass
            elif e.message[0]['message'] == 'Rate limit exceeded':
                print('rate limit exceeded. waiting 15 minutes...')
                time.sleep(60 * 15)

    usa_users = [x for x in users if x['username'] in usa_usernames]
    final_twitter_profiles = [user._json for user in twitter_profiles]
    final_recent_tweets = [status._json
                           for recent_tweets in all_recent_tweets
                           for status in recent_tweets]
    print('processed %s (%s) profiles.' % (counter, len(usa_users)))
    print('%s recent tweets.' % len(final_recent_tweets))
    return final_twitter_profiles, final_recent_tweets, usa_users
def carmen_initializer(partition):
    # Build one resolver per partition and reuse it for every record in it.
    print("\nPartition\n")
    resolver = carmen.get_resolver()
    resolver.load_locations()
    for line in partition:
        yield carmenizer(line, resolver)
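# Assumed usage, not in the original snippet: carmen_initializer has the shape
# of a PySpark mapPartitions function, so one resolver is built per partition
# rather than per record. The input path and the carmenizer helper (defined
# elsewhere in that codebase) are assumptions.
from pyspark import SparkContext

sc = SparkContext(appName="carmen-geolocation")
tweets = sc.textFile("hdfs:///data/tweets/*.json")
locations = tweets.mapPartitions(carmen_initializer)
locations.saveAsTextFile("hdfs:///data/tweets-carmenized")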
def build_tables(config):
    """
    Associate tweets with URLs, keywords, accounts, and locations.

    Parameters:
        config (dict): A dictionary with config information about paths and filenames.

    Output:
        Saves several dataframes with two columns, depending on the object associated:
        1) | tweet_id | account_id |
        2) | tweet_id | location   |
        ... etc.
    """
    ## Initialize carmen geolocation
    resolver = get_resolver()
    resolver.load_locations()

    ## Dictionaries for the association tweet -> object
    tweet_account = defaultdict()
    tweet_location = defaultdict()
    tweet_carmen_location = defaultdict()
    tweet_url = defaultdict(list)
    tweet_keyword = defaultdict(list)

    ## Load the file of keywords
    keywords = load_keywords_file('keywords.txt')

    # Iterate over all Twitter data files
    for file in sorted(glob.glob(config["PATHS"]["TW_FILES_FOLDER"] + "/*json")):
        print("Processing: " + str(file))
        start = timeit.default_timer()  # to monitor running time
        tweet_url = defaultdict(list)
        with open(file, 'r') as f:
            for line in f.readlines():
                try:
                    j = json.loads(line)
                    tweet_id = j["id_str"]

                    ## 1) Extract all URLs from tweets/retweets (including extended) ##
                    found_urls = set()
                    urls_entry = get_dict_path(j, ['entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(j, ['extended_tweet', 'entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(j, ['retweeted_status', 'entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))
                    urls_entry = get_dict_path(j, ['retweeted_status', 'extended_tweet', 'entities', 'urls'])
                    if urls_entry:
                        found_urls = found_urls.union(get_urls(urls_entry))

                    for url in found_urls:  # iterate over the SET of found URLs
                        domain = extract_top_domain(url)
                        if domain == "twitter.com":  # ignore twitter.com
                            continue
                        # Associate tweet_id and url
                        tweet_url[tweet_id].append(url)

                    ## 2) Extract the account and its location, and match it with carmen ##
                    account = j["user"]
                    account_id = j["user"]["id_str"]
                    tweet_account[tweet_id] = account_id
                    tweet_location[tweet_id] = str(account["location"])
                    result = resolver.resolve_tweet({'user': account})
                    if not result:
                        match = "No match!"
                    else:
                        # result[1] is a Location() object, e.g.
                        # Location(country='United Kingdom', state='England',
                        #          county='London', city='London', known=True, id=2206)
                        match = str(result[1])
                    tweet_carmen_location[tweet_id] = match

                    ## 3) Match keywords in the tweet ##
                    found_keywords = search_tweet_for_keywords(j, keywords)
                    for keyword in found_keywords:
                        tweet_keyword[tweet_id].append(keyword)
                except Exception as e:
                    print(e)
                    print(line)

        print("Processed tweets: " + str(len(tweet_account)))

    ## Manually write the tables to .csv files ##
    names = ["account", "location", "carmen_location", "url", "keyword"]
    for i, data in enumerate([tweet_account, tweet_location, tweet_carmen_location,
                              tweet_url, tweet_keyword]):  # loop over the different dictionaries
        name = names[i]
        print("Dumping table for " + str(name))
        filepath = os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"],
                                "tweet_" + name + "_table.csv")
        with open(filepath, 'w', newline='') as csvfile:
            fieldnames = ['tweet_id', name]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')
            writer.writeheader()
            for k in data:
                if name in ["url", "keyword"]:
                    # The mapping is tweet_id -> list(objects); write one row per value.
                    for val in data[k]:
                        writer.writerow({'tweet_id': k, name: val})
                else:
                    writer.writerow({'tweet_id': k, name: data[k]})
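# Hypothetical driver, not part of build_tables: the config keys mirror the
# ones read above, but the paths are made up, and the pandas read-back is only
# to illustrate the shape of the output tables.
import os
import pandas as pd

config = {
    "PATHS": {
        "TW_FILES_FOLDER": "data/raw_tweets",
        "INTERMEDIATE_DATA_DIR": "data/intermediate",
    }
}
build_tables(config)

carmen_table = pd.read_csv(
    os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"],
                 "tweet_carmen_location_table.csv"))
print(carmen_table.head())  # columns: tweet_id, carmen_location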