from datetime import datetime

# `util`, `TweetScraper`, `Tweet` and `_add_one_day_to_date_string` are
# project-level helpers defined elsewhere in this package.


def _scrape_tweets(start_date_str, num_of_date_project, max_count_per_day):
    tweet_accounts = util.read_tweet_accounts()

    for i in range(num_of_date_project):
        print(datetime.now())
        end_date_str = _add_one_day_to_date_string(start_date_str)
        print(start_date_str)

        for screen_name in tweet_accounts:
            # print(screen_name)
            tweets = TweetScraper.get_tweets_from_user_timeline(
                screen_name, start_date_str, end_date_str, max_count_per_day)
            Tweet.init()

            len_of_tweets = len(tweets)
            print("Total length of tweets: %s" % str(len_of_tweets))

            no_of_tweets_saved = 1
            for tweet in tweets:
                try:
                    if no_of_tweets_saved % 1000 == 0:
                        print("%s tweets have been saved to database."
                              % str(no_of_tweets_saved))
                    obj = Tweet(meta={'id': tweet['id']})
                    obj.screen_name = tweet['screen_name']
                    obj.full_text = tweet['full_text']
                    obj.created_at = tweet['created_at']
                    obj.save()
                    no_of_tweets_saved += 1
                except Exception:
                    # Keep the progress counter moving even when a save fails.
                    no_of_tweets_saved += 1

        # Move the one-day scraping window forward before the next iteration.
        start_date_str = _add_one_day_to_date_string(start_date_str)
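# _scrape_tweets relies on _add_one_day_to_date_string, which is not shown in
# this section. The sketch below is a minimal, hypothetical version of that
# helper, assuming the date strings use the "YYYY-MM-DD" format; the real
# project may format dates differently.
from datetime import timedelta


def _add_one_day_to_date_string(date_str):
    # Parse the date string, shift it by one day, and format it back.
    date = datetime.strptime(date_str, "%Y-%m-%d")
    return (date + timedelta(days=1)).strftime("%Y-%m-%d")


# Example invocation (hypothetical values): scrape 7 consecutive days starting
# from 2020-01-01, keeping at most 200 tweets per account per day.
# _scrape_tweets("2020-01-01", 7, 200)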
import csv
from time import time

import bson

# `Tweet` is the project's MongoDB document model; `nlp` and `minetweet` are
# helper objects passed in by the caller. This function is written for
# Python 2 (print statements, str.decode for UTF-8 validation).


def extract_and_store_tweets(csvfile, nlp, minetweet):
    print
    print "Start processing %s ..." % csvfile
    print "*" * 20

    start = time()  # measure processing time

    # Counters for logging
    tweets_count = 0
    mentions_count = 0
    urls_count = 0
    hashtags_count = 0
    tags_count = 0
    invalid_tweets = 0
    i = 1  # iterator to remember the row number in the csv

    with open(csvfile, 'r') as f:
        # print 'Processing data...'
        next(f)  # skip csv header
        data = csv.reader(f)

        # Process one row at a time
        for row in data:
            # Create and populate the Tweet object
            t = Tweet()
            t.mid = row[0]
            t.retweetFromPostId = row[1]
            t.userId = row[2]
            t.retweetFromUserId = row[3]
            t.source = row[4]
            t.hasImage = row[5]
            t.txt = row[6]
            t.geo = row[7]
            t.created_at = row[8]
            t.deleted_last_seen = row[9]
            t.permission_denied = row[10]

            # Extract tweet entities; `clean` is the text-only version of the
            # tweet used for NLP below
            mentions, urls, hashtags, clean = minetweet.extract_tweet_entities(t.txt)
            t.mentions = mentions
            t.urls = urls
            t.hashtags = hashtags

            # Extract keywords, then remove stopwords and store the clean dico
            dico = nlp.extract_dictionary(clean)
            t.dico = nlp.remove_stopwords(dico)

            # Extract named entities
            # TODO : ignore stopwords
            # t.entities = nlp.extract_named_entities_from_dico(t.dico)

            # Counts for stats. `t.entities` is only filled by the NER step
            # above; while that step is commented out, this relies on the
            # Tweet model providing an empty default.
            mentions_count += len(mentions)
            urls_count += len(urls)
            hashtags_count += len(hashtags)
            tags_count += len(t.entities)

            t.row = i
            i += 1

            # Only save tweets whose text is valid UTF-8
            valid_utf8 = True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                invalid_tweets += 1
                valid_utf8 = False
                print ' bad encoding : tweet ', t.mid
                # pprint(t)

            if valid_utf8:
                try:
                    t.save()
                    tweets_count += 1
                except bson.errors.InvalidStringData:
                    print ' bad encoding : tweet ', t.mid
                    # pprint(t)

    # LOG
    print
    print "-" * 10
    print " mentions_count : %d " % mentions_count
    print " urls_count : %d " % urls_count
    print " hashtags_count : %d " % hashtags_count
    print " invalid tweets : %d " % invalid_tweets
    print " TOTAL tweet entities : %d " % (mentions_count + urls_count + hashtags_count)
    print " TOTAL named entities (NER): %d " % tags_count
    print
    print "-" * 10
    print "TOTAL tweets processed : %d" % tweets_count
    print " done in %.3fs" % (time() - start)
    print
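# extract_and_store_tweets assumes each csv row has at least 11 columns,
# mapped in this order onto the Tweet fields: mid, retweetFromPostId, userId,
# retweetFromUserId, source, hasImage, txt, geo, created_at,
# deleted_last_seen, permission_denied. The sanity check below is a
# hypothetical convenience, not part of the original pipeline; it can be run
# before a long import job to catch a csv with an unexpected layout.
def check_csv_layout(csvfile, expected_columns=11):
    # Read only the header row and verify it has enough columns.
    with open(csvfile, 'r') as f:
        header = next(csv.reader(f))
    if len(header) < expected_columns:
        print ' unexpected csv layout : %d columns, expected %d' \
            % (len(header), expected_columns)
        return False
    return True


# Hypothetical usage, assuming the caller builds its own nlp / minetweet
# helpers elsewhere in the project:
# if check_csv_layout('tweets_week1.csv'):
#     extract_and_store_tweets('tweets_week1.csv', nlp, minetweet)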