from datetime import datetime

# Project-local dependencies assumed by this snippet: util, TweetScraper,
# and a Tweet document model (elasticsearch-dsl style, given Tweet.init()).

def _scrape_tweets(start_date_str, num_of_date_project, max_count_per_day):
    tweet_accounts = util.read_tweet_accounts()

    for i in range(num_of_date_project):
        print(datetime.now())
        end_date_str = _add_one_day_to_date_string(start_date_str)
        print(start_date_str)

        for screen_name in tweet_accounts:
            tweets = TweetScraper.get_tweets_from_user_timeline(
                screen_name, start_date_str, end_date_str, max_count_per_day)

            # Ensure the Tweet index/mapping exists (idempotent).
            Tweet.init()

            print("Total tweets fetched: %d" % len(tweets))

            no_of_tweets_saved = 0
            for tweet in tweets:
                try:
                    obj = Tweet(meta={'id': tweet['id']})
                    obj.screen_name = tweet['screen_name']
                    obj.full_text = tweet['full_text']
                    obj.created_at = tweet['created_at']
                    obj.save()
                    no_of_tweets_saved += 1
                    if no_of_tweets_saved % 1000 == 0:
                        print("%d tweets have been saved to database." %
                              no_of_tweets_saved)
                except Exception as exc:
                    # Report failures instead of swallowing them with a bare except.
                    print("Failed to save tweet %s: %s" % (tweet.get('id'), exc))

        start_date_str = _add_one_day_to_date_string(start_date_str)
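The _add_one_day_to_date_string helper is not shown above. A minimal sketch of what it might look like, assuming an ISO "YYYY-MM-DD" date format (the format string is an assumption, not taken from the original):

from datetime import datetime, timedelta

def _add_one_day_to_date_string(date_str, fmt="%Y-%m-%d"):
    # Parse, advance one day, and format back. The "%Y-%m-%d" format
    # is an assumption; adjust it to whatever the project actually uses.
    return (datetime.strptime(date_str, fmt) + timedelta(days=1)).strftime(fmt)

Likewise, the Tweet model is only implied by the calls above (Tweet.init(), Tweet(meta={'id': ...})). A hypothetical elasticsearch-dsl definition consistent with that usage might look like:

from elasticsearch_dsl import Date, Document, Keyword, Text

class Tweet(Document):
    # Field names mirror the attributes set in _scrape_tweets.
    screen_name = Keyword()
    full_text = Text()
    created_at = Date()

    class Index:
        name = 'tweets'  # hypothetical index name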
Example #2
import csv
from time import time

import bson.errors  # for InvalidStringData raised on save

# NOTE: Python 2 code (print statements; str.decode for UTF-8 validation).
# The Tweet model, nlp and minetweet helpers are project-local dependencies.

def extract_and_store_tweets(csvfile, nlp, minetweet):
    print
    print "Start processing %s ..." % csvfile
    print "*" * 20

    start = time()  # measure elapsed time

    # Counters for the summary printed at the end
    tweets_count = 0
    mentions_count = 0
    urls_count = 0
    hashtags_count = 0
    tags_count = 0
    invalid_tweets = 0

    i = 1  # current csv row number (1-based)
    with open(csvfile, 'r') as f:

        next(f)  # skip the csv header
        data = csv.reader(f)

        # process one row at a time
        for row in data:

            # Create and populate the Tweet object
            t = Tweet()
            t.mid = row[0]
            t.retweetFromPostId = row[1]
            t.userId = row[2]
            t.retweetFromUserId = row[3]
            t.source = row[4]
            t.hasImage = row[5]
            t.txt = row[6]
            t.geo = row[7]
            t.created_at = row[8]
            t.deleted_last_seen = row[9]
            t.permission_denied = row[10]

            # Extract tweet entities; `clean` is the text-only version
            # of the tweet, used for the NLP steps below
            mentions, urls, hashtags, clean = minetweet.extract_tweet_entities(t.txt)

            t.mentions = mentions
            t.urls = urls
            t.hashtags = hashtags

            # Extract keywords, then remove stopwords and store the clean dico
            dico = nlp.extract_dictionary(clean)
            t.dico = nlp.remove_stopwords(dico)

            # Extract named entities
            # TODO : ignore stopwords
            # t.entities=nlp.extract_named_entities_from_dico(t.dico)

            # Stats counters
            mentions_count += len(mentions)
            urls_count += len(urls)
            hashtags_count += len(hashtags)
            # stays 0 while the NER extraction above is disabled
            tags_count += len(getattr(t, 'entities', []))

            t.row = i
            i += 1  # advance the csv row counter

            # Validate UTF-8 before saving (Python 2 byte string)
            valid_utf8 = True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                invalid_tweets += 1
                valid_utf8 = False
                print ' bad encoding : tweet ', t.mid

            if valid_utf8:
                try:
                    t.save()
                    tweets_count += 1
                except bson.errors.InvalidStringData:
                    print ' bad encoding : tweet ', t.mid

    # Summary log
    print
    print "-" * 10
    print " mentions_count            : %d " % mentions_count
    print " urls_count                : %d " % urls_count
    print " hashtags_count            : %d " % hashtags_count
    print " invalid tweets            : %d " % invalid_tweets
    print " TOTAL tweet entities      : %d " % (mentions_count + urls_count + hashtags_count)
    print " TOTAL named entities (NER): %d " % tags_count
    print
    print "-" * 10
    print "TOTAL tweets processed    : %d" % tweets_count
    print " done in %.3fs" % (time() - start)
    print
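The minetweet.extract_tweet_entities helper is not shown here. A minimal regex-based sketch of what it might do, assuming Twitter-style entities (the patterns, signature, and cleaning behavior are assumptions, not taken from the original):

import re

# Hypothetical patterns; the real minetweet implementation may differ.
MENTION_RE = re.compile(r'@(\w+)')
HASHTAG_RE = re.compile(r'#(\w+)')
URL_RE = re.compile(r'https?://\S+')

def extract_tweet_entities(text):
    """Return (mentions, urls, hashtags, clean_text) for a tweet."""
    mentions = MENTION_RE.findall(text)
    urls = URL_RE.findall(text)
    hashtags = HASHTAG_RE.findall(text)
    # Strip entities to leave a text-only version for the NLP steps.
    clean = URL_RE.sub('', text)
    clean = MENTION_RE.sub('', clean)
    clean = HASHTAG_RE.sub(r'\1', clean)  # keep the hashtag word itself
    return mentions, urls, hashtags, clean.strip()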