def extract_and_store_tweets(csvfile, nlp, minetweet): print print "Start processing %s ..." % csvfile print "*" * 20 start = time() # measure time # LOGGING tweets_count = 0 mentions_count = 0 urls_count = 0 hashtags_count = 0 tags_count = 0 unvalid_tweets = 0 i = 1 # iteroator to remember row number on csv with open(csvfile, 'r') as f: # print 'Processing data...' next(f) # skip csv header data = csv.reader(f) # one row at a time for row in data: # create Tweet object t = Tweet() # Populate Tweet t.mid = row[0] t.retweetFromPostId = row[1] t.userId = row[2] t.retweetFromUserId = row[3] t.source = row[4] t.hasImage = row[5] t.txt = row[6] t.geo = row[7] t.created_at = row[8] t.deleted_last_seen = row[9] t.permission_denied = row[10] # Extract tweet entities mentions, urls, hashtags, clean = minetweet.extract_tweet_entities( t.txt) # add to Tweet t.mentions = mentions t.urls = urls t.hashtags = hashtags clean = clean # text-only version of the tweet for NLP # Extract keywords dico = nlp.extract_dictionary(clean) # remove stopwords and store clean dico t.dico = nlp.remove_stopwords(dico) # extract entities # TODO : ignore stopwords # t.entities=nlp.extract_named_entities_from_dico(t.dico) # Some count for stats mentions_count += len(mentions) urls_count += len(urls) hashtags_count += len(hashtags) tags_count += len(t.entities) t.row = i valid_utf8 = True try: t.txt.decode('utf-8') except UnicodeDecodeError: unvalid_tweets += 1 valid_utf8 = False print ' bad encoding : tweet ', t.mid # pprint(t) if valid_utf8 is True: try: t.save() tweets_count += 1 except bson.errors.InvalidStringData: print ' bad encoding : tweet ', t.mid # pprint(t) # LOG print print "-" * 10 print " mentions_count : %d " % mentions_count print " urls_count : %d " % urls_count print " hashtags_count : %d " % hashtags_count print " unvalid tweets : %d " % unvalid_tweets print " TOTAL tweet entities : %d " % (mentions_count + urls_count + hashtags_count) print " TOTAL named entities (NER): %d " % 
tags_count print print "-" * 10 print "TOTAL tweets processed : %d" % tweets_count print " done in %.3fs" % (time() - start) print
# NOTE(review): this is an exact duplicate of extract_and_store_tweets defined
# earlier in this file; because it appears later, this definition is the one
# bound at import time. Consider removing one of the two copies.
def extract_and_store_tweets(csvfile,nlp,minetweet):
    # Parses csvfile row by row, extracts tweet entities and keywords, and
    # saves each valid Tweet; prints a summary of counters at the end.
    print
    print "Start processing %s ..."%csvfile
    print "*"*20
    start=time() # measure time

    # LOGGING
    tweets_count=0
    mentions_count=0
    urls_count=0
    hashtags_count=0
    tags_count=0
    unvalid_tweets=0

    i=1 # iteroator to remember row number on csv
    with open(csvfile, 'r') as f:
        # print 'Processing data...'
        next(f) # skip csv header
        data = csv.reader(f)

        # one row at a time
        for row in data:

            # create Tweet object
            t=Tweet()

            # Populate Tweet (fixed column layout of the csv)
            t.mid=row[0]
            t.retweetFromPostId=row[1]
            t.userId=row[2]
            t.retweetFromUserId=row[3]
            t.source=row[4]
            t.hasImage=row[5]
            t.txt=row[6]
            t.geo=row[7]
            t.created_at=row[8]
            t.deleted_last_seen=row[9]
            t.permission_denied=row[10]

            # Extract tweet entities
            mentions,urls,hashtags,clean=minetweet.extract_tweet_entities(t.txt)

            # add to Tweet
            t.mentions=mentions
            t.urls=urls
            t.hashtags=hashtags
            # NOTE(review): no-op self-assignment kept as-is
            clean=clean # text-only version of the tweet for NLP

            # Extract keywords
            dico=nlp.extract_dictionary(clean)

            # remove stopwords and store clean dico
            t.dico=nlp.remove_stopwords(dico)

            # extract entities
            # TODO : ignore stopwords
            # t.entities=nlp.extract_named_entities_from_dico(t.dico)

            # Some count for stats
            mentions_count+=len(mentions)
            urls_count+=len(urls)
            hashtags_count+=len(hashtags)
            # NOTE(review): the NER extraction line above is commented out, so
            # this relies on Tweet providing a default 'entities' value --
            # confirm against the Tweet model, otherwise this raises.
            tags_count+=len(t.entities)

            # NOTE(review): i is never incremented, so every tweet gets row 1.
            t.row=i

            # Tweets whose text is not valid UTF-8 are counted and skipped.
            valid_utf8 = True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                unvalid_tweets+=1
                valid_utf8 = False
                print ' bad encoding : tweet ',t.mid
                # pprint(t)

            if valid_utf8 is True:
                try:
                    t.save()
                    tweets_count+=1
                except bson.errors.InvalidStringData:
                    print ' bad encoding : tweet ',t.mid
                    # pprint(t)

    # LOG
    print
    print "-"*10
    print " mentions_count : %d "%mentions_count
    print " urls_count : %d "%urls_count
    print " hashtags_count : %d "%hashtags_count
    print " unvalid tweets : %d "%unvalid_tweets
    print " TOTAL tweet entities : %d "%(mentions_count+urls_count+hashtags_count)
    print " TOTAL named entities (NER): %d "%tags_count
    print
    print "-"*10
    print "TOTAL tweets processed : %d"%tweets_count
    print " done in %.3fs"%(time()-start)
    print
def save_tweet(obj):
    """Save a tweet (and, recursively, the tweet it retweets) to the database.

    Recurses through ``retweeted_status`` so the original tweet is persisted
    first, then builds the Tweet itself together with its author, mentioned
    accounts, country and hashtags. Module-level hashmaps (``tweets_map``,
    ``accounts_map``, ``countries_map``, ``hashtags_map``) deduplicate objects
    created in this session but not yet flushed to the db.

    :param obj: a tweet dict as returned by the Twitter API (full_text mode)
    :return: the Tweet instance added to the session, or None when this
             tweet id was already processed (callers only test truthiness).
    """
    tweet_id = obj['id_str']
    if tweet_id in tweets_map:
        # Already handled earlier in this run; was an implicit None before.
        return None

    # dive into recursion until we hit the original tweet
    parent_tweet = None
    if 'retweeted_status' in obj and obj['retweeted_status'] is not None:
        parent_tweet = save_tweet(obj['retweeted_status'])

    # GeoJSON order is [longitude, latitude]; only build a point when both
    # the 'coordinates' object and its inner array are present.
    location = None
    if obj['coordinates'] and obj['coordinates']['coordinates']:
        lon = obj['coordinates']['coordinates'][0]
        lat = obj['coordinates']['coordinates'][1]
        location = WKTElement(f"POINT({lon} {lat})", srid=4326)

    tweet = Tweet(id=obj['id_str'],
                  content=obj['full_text'],
                  location=location,
                  retweet_count=obj['retweet_count'],
                  favorite_count=obj['favorite_count'],
                  happened_at=obj['created_at'])
    tweets_map[tweet_id] = True

    # Author: create a full Account on first sight, otherwise load it from
    # the db; upgrade accounts that were first seen as bare mentions.
    if obj['user'] is not None:
        user_id = obj['user']['id']
        if user_id not in accounts_map:
            account = Account(
                id=obj['user']['id'],
                screen_name=obj['user']['screen_name'],
                name=obj['user']['name'],
                description=obj['user']['description'],
                followers_count=obj['user']['followers_count'],
                friends_count=obj['user']['friends_count'],
                statuses_count=obj['user']['statuses_count'])
            accounts_map[account.id] = SavedAccountType.FULL
        else:
            # find user in database
            account = session.query(Account).filter(
                Account.id == user_id).scalar()
            # Previously saved as a user_mention: update with the attributes
            # that are not present in user_mentions objects.
            if accounts_map[user_id] == SavedAccountType.MENTION:
                account.update(obj['user'])
                accounts_map[account.id] = SavedAccountType.FULL
        # add user as the author of the tweet
        tweet.author = account

    # User mentions.
    if (obj['entities'] is not None
            and obj['entities']['user_mentions'] is not None
            and len(obj['entities']['user_mentions'])):
        mentions = []
        for mentioned_user in obj['entities']['user_mentions']:
            user_id = mentioned_user['id']
            # Skip users mentioned several times in this status (not saved to
            # the db yet, already in the local list) and self-mentions (the
            # author is saved along with the tweet itself).
            if (user_id in map(lambda x: x.id, mentions)
                    or user_id == tweet.author.id):
                continue
            if user_id not in accounts_map:
                account = Account(
                    id=mentioned_user['id'],
                    screen_name=mentioned_user['screen_name'],
                    # Fix: user_mentions entries carry 'name' directly; there
                    # is no nested 'user' key (was mentioned_user['user']['name'],
                    # which raises KeyError).
                    name=mentioned_user['name'],
                )
                accounts_map[user_id] = SavedAccountType.MENTION
            else:
                # find user in database
                account = session.query(Account).filter(
                    Account.id == user_id).scalar()
            mentions.append(account)
        # associate mentions array with tweet
        tweet.mentions = mentions

    # Country: only when the place has both code and name present.
    if (obj['place'] is not None and obj['place']['country_code']
            and obj['place']['country']):
        country_code = obj['place']['country_code']
        if country_code not in countries_map:
            country = Country(code=obj['place']['country_code'],
                              name=obj['place']['country'])
            countries_map[country.code] = True
        else:
            # find country in database
            country = session.query(Country).filter(
                Country.code == country_code).scalar()
        tweet.country = country

    # Hashtags.
    if (obj['entities'] is not None
            and obj['entities']['hashtags'] is not None
            and len(obj['entities']['hashtags'])):
        hashtags = []
        for hashtag_obj in obj['entities']['hashtags']:
            hashtag_id = hashtag_obj['text']
            # Duplicate within the current tweet: not saved to the db yet,
            # already in the local list.
            if hashtag_id in map(lambda x: x.value, hashtags):
                continue
            if hashtag_id not in hashtags_map:
                hashtags_map[hashtag_id] = True
                hashtag = Hashtag(hashtag_obj['text'])
            else:
                # find hashtag in database
                hashtag = session.query(Hashtag).filter(
                    Hashtag.value == hashtag_id).scalar()
            hashtags.append(hashtag)
        # associate hashtags array with tweet
        tweet.hashtags = hashtags

    # set the parent tweet from the recursion
    if parent_tweet:
        tweet.parent = parent_tweet

    # save tweet object into the db
    session.add(tweet)
    return tweet