Example 1
def obtain_tweet_by_user(username,
                         NUM_TWEETS,
                         label,
                         training=1,
                         NUM_TWEETS_TRAINING=0):

    # Creating list to append tweet data to
    tweets_list1 = []
    if training:
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper('from:' +
                                               username).get_items()):
            if i > NUM_TWEETS:
                break
            tweets_list1.append([tweet.content, label])
    elif training == 0:
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper('from:' +
                                               username).get_items()):
            if i < NUM_TWEETS_TRAINING:
                continue
            tweets_list1.append([tweet.content, label])
    # Creating a dataframe from the tweets list above
    tweets_df1 = pd.DataFrame(tweets_list1, columns=['Text', 'Label'])
    tweets_df1.to_csv(username + str(training) + ".csv")
    return tweets_df1
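A minimal usage sketch for the function above (the account name and tweet counts are only illustrative; assumes snscrape.modules.twitter is imported as sntwitter and pandas as pd, as the snippet expects):

import snscrape.modules.twitter as sntwitter
import pandas as pd

# First pass: collect roughly 200 labelled tweets for training (written to nasa1.csv)
train_df = obtain_tweet_by_user('nasa', 200, label=1, training=1)
# Second pass: skip those tweets and collect the remainder for evaluation (written to nasa0.csv)
eval_df = obtain_tweet_by_user('nasa', 0, label=1, training=0, NUM_TWEETS_TRAINING=200)
print(len(train_df), len(eval_df))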
Example 2
def scrape_tweets(*, start: str, end: str, num_tweets: int, terms: list[str],
                  target: str):
    # Filters belong in the single query string passed to the scraper
    scraper = twt.TwitterSearchScraper(
        " ".join(terms) +
        f" lang:en since:{start} until:{end} -filter:replies")

    Row = namedtuple("Row", [
        "tweetId", "date", "user", "url", "contents", "weight", "pos", "neu",
        "neg"
    ])

    # Represents the output file we're writing to
    output = None
    if target[0] == "psql":
        output = writer.PSQLWriter(db_config=target[1])
    else:
        output = writer.CSVWriter(file_name=target[1],
                                  out_directory=target[2],
                                  column_names=Row._fields)

    for i, tweet in enumerate(scraper.get_items()):
        if i > num_tweets:
            break

        weight = classify.naive_weight(tweet.content)
        pos, neu, neg = classify.vader(tweet.content)

        output.append(
            Row(tweet.id, tweet.date, tweet.user, tweet.url, tweet.content,
                weight, pos, neu, neg))

    output.stop()
Example 3
    def cmd_twitter(phrase, metadata, session):
        # phrase has two comma-separated parts: the search mode ("keyword" or "username") and the search argument
        try:
            search_by = phrase.split(", ")[0]
            search_arg = phrase.split(", ")[1]
            todays_date_str = datetime.today().strftime('%Y-%m-%d')
            yesterdays_date = datetime.now() - timedelta(1)
            yesterdays_date_str = datetime.strftime(yesterdays_date,
                                                    '%Y-%m-%d')
            if search_by == "keyword":
                scrape_crit = f"{search_arg} since:{yesterdays_date_str} until:{todays_date_str}"
                body_text = f"Tweets about {search_arg}:\n\n"

            elif search_by == "username":
                scrape_crit = f"from:{search_arg}"
                body_text = f"Tweets by {search_arg}:\n\n"

            # Using TwitterSearchScraper to scrape data and append tweets to list
            for i, tweet in enumerate(
                    sntwitter.TwitterSearchScraper(scrape_crit).get_items()):
                # print(vars(tweet))
                # print(vars(tweet.user))
                if i > 9:
                    break
                date_str = tweet.date.strftime("%Y-%m-%d")
                body_text += f"Name: {tweet.user.displayname} ({tweet.user.username})\n"
                body_text += f"Date: {date_str}\n"
                body_text += f"Tweet: {tweet.content}\n\n"

        except Exception as e:
            body_text = "Error: " + str(e)
        print(body_text)
        return body_text
Example 4
def get_tweets(keywords, save_dir=SAVEDIR_NEW, maxTweets=100):
    df_keywords = pd.read_csv('keywords.csv')

    dates = pd.date_range('1/1/2020', periods=52, freq='W')

    for week_idx, date in enumerate(dates[:-1]):  # stop one short so dates[week_idx + 1] is always valid
        for label in df_keywords.columns:
            keywords = df_keywords[label]
            ss = label.replace('/', '-')
            savestr = os.path.join(save_dir, f'{ss}-{date}.json')
            start = f'{dates[week_idx]}'[:10]
            stop = f'{dates[week_idx+1]}'[:10]
            print(
                f'{date}: Fetching tweets in range {start} - {stop} for keywords: {keywords}'
            )

            tweets = []
            query = " OR ".join(
                keywords) + " lang:de" + ' since:' + start + " until:" + stop
            # Using TwitterSearchScraper to scrape data and append tweets to list
            for i, tweet in enumerate(
                    sntwitter.TwitterSearchScraper(query).get_items()):
                if i > maxTweets:
                    break
                tweets.append(tweet.__dict__)

            print(f'Found {len(tweets)} tweets')
            if len(tweets) > 0:
                pd.DataFrame(tweets).to_json(savestr,
                                             orient='records',
                                             lines=True)
Example 5
def twitter_scraper(tw_id, since="2020-01-01", to="2020-02-01"):
    """
    Scrape tweets from a given account.

    Args:
        tw_id (str): the twitter account id
        since (str, optional): from date. Defaults to "2020-01-01".
        to (str, optional): to date. Defaults to "2020-02-01".
    Returns:
        pandas.DataFrame: tweets indexed by timestamp, with "Content" and "Date"
        columns, in chronological order.
    """
    # initialize
    tweet_time = []
    tweet_dates = []
    tweets_content = []

    # scrape from twitter
    the_query = "from:" + tw_id + " since:" + since + " until:" + to
    for tweet in tw.TwitterSearchScraper(query=the_query).get_items():
        tweet_dates.append(tweet.date.strftime("%Y-%m-%d"))
        tweet_time.append(tweet.date.strftime("%Y-%m-%d %H:%M:%S"))
        tweets_content.append(tweet.content)

    # convert to dataframe
    tweets = pd.DataFrame(
        {"Time": tweet_time, "Content": tweets_content, "Date": tweet_dates}
    ).set_index("Time")
    # reverse so the oldest tweet comes first
    # (https://stackoverflow.com/questions/20444087/right-way-to-reverse-pandas-dataframe)
    tweets = tweets.iloc[::-1]

    return tweets
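A minimal usage sketch for the function above (the account name is only illustrative; assumes snscrape.modules.twitter is imported as tw and pandas as pd, as in the snippet):

import snscrape.modules.twitter as tw
import pandas as pd

# One month of tweets from a single account, oldest first
df = twitter_scraper("nasa", since="2020-01-01", to="2020-02-01")
print(df.head())
print(f"{len(df)} tweets retrieved")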
Example 6
def scrape(query, date):
    date = pd.Timestamp(date)
    next_day = date + pd.Timedelta(days=2)
    scraper = sntwitter.TwitterSearchScraper(
        f'{query} since:{date.strftime("%Y-%m-%d")} until:{next_day.strftime("%Y-%m-%d")}'
    )
    fname = f'./tweets{query}/{date.year}/{date.month:02}/{date.strftime("%Y-%m-%d")}-tweets-{query}-compressed.csv'
    start = pd.Timestamp.now()
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | Beg: {start.strftime("%Y-%m-%d %H:%M:%S")}'
    )
    tweets, header, mode = [], True, 'w'
    for i, tweet in enumerate(scraper.get_items()):
        tweets.append(tweet)
        if i % 1000 == 999:
            to_df(tweets, date).to_csv(fname,
                                       index=False,
                                       header=header,
                                       mode=mode,
                                       compression='gzip')
            tweets, header, mode = [], False, 'a'
    to_df(tweets, date).to_csv(fname,
                               index=False,
                               header=header,
                               mode=mode,
                               compression='gzip')
    final = pd.Timestamp.now()
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | End: {final.strftime("%Y-%m-%d %H:%M:%S")}'
    )
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | Dur: {(final - start).total_seconds()}s'
    )
Example 7
def authenticate():

    # twitter_configs = process_twitter_configs()
    # consumer_key = twitter_configs["consumer_key"]
    # consumer_secret = twitter_configs["consumer_key_secret"]
    # access_token = twitter_configs["access_token"]
    # access_secret = twitter_configs["access_token_secret"]
    #
    # auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # auth.set_access_token(access_token, access_secret)

    csvFile = open('15-10-2020.csv', 'a', newline='', encoding='utf8')  # file the scraped tweets are appended to
    csvWriter = csv.writer(csvFile)

    maxTweets = 1000  # the number of tweets you require
    # for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#covid19' +
    #                                                          'since:2020-10-15 until:2020-10-16'
    #                                                          ).get_items()):
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:AOC'
                                                             ).get_items()):

        if i > maxTweets:
            break
        print(tweet.date)
        csvWriter.writerow([tweet.date, tweet.content])

    csvFile.close()
Example 8
def load_tweets(q_str, max_tweets=100):

    # Creating list to append tweet data to
    tweets_list = []

    # Using TwitterSearchScraper to scrape data and append tweets to list
    for idx, tweet in enumerate(
            sntwitter.TwitterSearchScraper(q_str).get_items()):
        if idx > max_tweets:
            break
        source = get_source(tweet.source)
        text = tweet.content.lower()
        # keep only tweets posted from official Twitter clients and skip giveaway/bot posts
        if "Twit" in source and "away" not in text and "give" not in text:
            tweets_list.append([
                tweet.date, tweet.id, text, tweet.user.username,
                tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                tweet.quoteCount, source, tweet.url, tweet.user.id,
                tweet.user.description, tweet.user.followersCount,
                tweet.user.friendsCount
            ])

    # Creating a dataframe from the tweets list above
    df_tweets = pd.DataFrame(tweets_list,
                             columns=[
                                 'created_at', 'id', 'text', 'username',
                                 'replyCount', 'retweetCount', 'likeCount',
                                 'quoteCount', "source", "tweet_url",
                                 'user_id', "user_bio", 'followers_count',
                                 'friends_count'
                             ])

    return df_tweets
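A minimal usage sketch for the function above (the query string is only an example; assumes the get_source helper referenced inside is defined elsewhere in the module, and that snscrape and pandas are imported as in the snippet):

import snscrape.modules.twitter as sntwitter
import pandas as pd

df_doge = load_tweets("dogecoin lang:en since:2021-05-01 until:2021-05-02", max_tweets=200)
print(df_doge[['created_at', 'username', 'text']].head())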
Example 9
def get_tweets(maxTweets, trend, startdate, enddate):
    query = '%s since:%s until:%s' % (trend, startdate, enddate)

    tweets_list2 = []

    # Using TwitterSearchScraper to scrape data and append tweets to list
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        tweets_list2.append([tweet.content])

    tweets = pd.DataFrame(tweets_list2, columns=['Text'])

    def processed_tweet(tweet):
        return ' '.join(
            re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ',
                   tweet).split())

    tweets['processed_tweet'] = tweets['Text'].apply(processed_tweet)

    all_tweets = ' '.join(tweet for tweet in tweets['processed_tweet'])

    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color="black").generate(all_tweets)
    image = wordcloud.to_image()
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue())

    return img_str.decode('utf-8')
Example 10
def crawler(source_file_dir: str) -> None:
    with open(source_file_dir, 'r') as stock_profile_list:
        data_folder_dir = os.path.dirname(source_file_dir)
        with open(os.path.join(data_folder_dir, 'tweet.csv'), 'w') as tweet_list:
            fields = ['Stock', 'Date', 'Content', 'Author']
            tweet_writer = csv.DictWriter(tweet_list, fields)
            tweet_writer.writeheader()

            rstocklist = csv.DictReader(stock_profile_list)
            # DictReader already consumes the header row, so no extra next() call is needed

            for row in rstocklist:
                max_tweets = 10
                print(f"searching for ticker ${row['ticker']}")
                hashtag = f"${row['ticker']}"

                try:
                    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(hashtag).get_items()):
                        if i > max_tweets:
                            break
                        tweet_writer.writerow(
                            {'Stock': row['ticker'], 'Date': str(tweet.date), 'Content': tweet.content,
                             'Author': tweet.username})
                except Exception as e:  # work on python 3.x
                    print(str(e))
Example 11
def get_tweets_on_date(start_day):
    keyword = 'dogecoin OR Dogecoin OR DogeCoin'
    maxTweets = 2000
    end_day = start_day + timedelta(days=1)

    #Open/create a file to append data to
    csvFile = open('dogecoin-sentiment-' + start_day.strftime('%Y-%m-%d') +
                   '.csv',
                   'w',
                   newline='',
                   encoding='utf8')

    #Use csv writer
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow([
        'id', 'date', 'text', 'user', 'replyCount', 'retweetCount',
        'likeCount', 'quoteCount'
    ])

    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                keyword + ' lang:en since:' + start_day.strftime('%Y-%m-%d') +
                ' until:' + end_day.strftime('%Y-%m-%d') +
                ' -filter:links -filter:replies').get_items()):
        if i > maxTweets:
            break
        csvWriter.writerow([
            tweet.id, tweet.date, tweet.content, tweet.user.username,
            tweet.replyCount, tweet.retweetCount, tweet.likeCount,
            tweet.quoteCount
        ])
    csvFile.close()
Example 12
def process_twitter_details(keyword):
    start_date = str(date.today() + timedelta(days=-14))
    end_date = str(date.today() + timedelta(days=-7))

    usernames = []
    tweet_ids = []
    contents = []
    dates = []
    medium_links = []
    intext_links = []
    tweet_url = []
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(keyword + " since:" + start_date +
                                           ' until:' + end_date).get_items()):
        if i > 10:
            break
        usernames.append(tweet.username)
        tweet_ids.append(tweet.id)
        dates.append(tweet.date)
        contents.append(tweet.content)
        medium_links.append(tweet.outlinks)
        intext_links.append(tweet.tcooutlinks)
        tweet_url.append(tweet.url)

    return usernames, tweet_ids, contents, dates, medium_links, intext_links, tweet_url
Example 13
def getFirstAppearance(keywords, startDate, endDate, location, printAllTweets):
    # build query
    hashString = " OR ".join(keywords)
    # print(hashString)
    query = hashString + " since:" + startDate + " until:" + endDate
    if location is not None:
        query += ''' near:"''' + location + '''" within:30mi'''
    tweets = []


    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
            query).get_items()):
        t1 = Tweet(tweet.id,tweet.date,tweet.content,tweet.username)
        tweets.append(t1)
        if printAllTweets:
            print(t1)
        #frequencycounter+=1
        #results.append(Tweet(tweet.id,tweet.date,tweet.content))
        #id = tweet.id

    if len(tweets) == 0:
        return "No tweets found between " + str(startDate) + " and " + str(endDate)
    # return the last element since it is the earliest
    return tweets[-1]
Example 14
def submit():
    countvar = int(x1.get())
    wordsvar = x2.get()
    sincevar = x3.get()
    untilvar = x4.get()
    csvnamevar = x5.get()
    personvar = x6.get()
    locationvar = x7.get()
    tweetslist = []

    params = "'" + wordsvar + " from:" + personvar + " near:" + locationvar + " since:" + sincevar + " until:" + untilvar + "'"
    params = str(params)

    global disglobvar
    if disglobvar == "yes":
        params = "'" + wordsvar + " from:" + personvar + " near:" + locationvar + " include:nativeretweets" + " since:" + sincevar + " until:" + untilvar + "'"
        params = str(params)

    #if personvar==None:
    if len(personvar) == 0:
        params = "'" + wordsvar + " near:" + locationvar + " since:" + sincevar + " until:" + untilvar + "'"
        if disglobvar == "yes":
            params = "'" + wordsvar + " near:" + locationvar + " include:nativeretweets" + " since:" + sincevar + " until:" + untilvar + "'"
        str(params)

    #if locationvar==None:
    if len(locationvar) == 0:
        params = "'" + wordsvar + " from:" + personvar + " since:" + sincevar + " until:" + untilvar + "'"
        if disglobvar == "yes":
            params = "'" + wordsvar + " from:" + personvar + " include:nativeretweets" + " since:" + sincevar + " until:" + untilvar + "'"
        str(params)

    #if locationvar==None and personvar==None:
    if len(locationvar) == 0 and len(personvar) == 0:
        params = "'" + wordsvar + " since:" + sincevar + " until:" + untilvar + "'"
        if disglobvar == "yes":
            params = "'" + wordsvar + " include:nativeretweets" + " since:" + sincevar + " until:" + untilvar + "'"
        str(params)

    #print(params)
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(params).get_items()):
        if i > countvar:
            break
        if (i % 100 == 0):
            print("Progress:", i, "/", countvar)
        tweetslist.append([
            tweet.content, tweet.date, tweet.user.username, tweet.lang,
            tweet.user.location, tweet.id
        ])
    tweetslistdataframe = pd.DataFrame(tweetslist,
                                       columns=[
                                           "Tweet Content", "Tweet Date",
                                           "Username", "Language", "Location",
                                           'Tweet ID'
                                       ])
    tweetslistdataframe.to_csv(csvnamevar + ".csv")
    tweetslist.clear()
    del tweetslistdataframe
    print("Finished Downloading Tweets")
Example 15
def create_corpus(company):
    name = company[0]
    keyword = company[1]
    start_date = date(2018, 12, 31)
    next_date = start_date + dt.timedelta(days=1)
    end_date = date(2020, 1, 1)
    max_tweets = 10
    tweet_content = []
    tweet_dates = []
    filename = name + "_tweets.csv"
    while next_date < end_date:
        start = start_date.strftime("%Y-%m-%d")
        next = next_date.strftime("%Y-%m-%d")
        search = keyword + ' since:' + start + ' until:' + next + ' lang:en'
        for i, t in enumerate(
                sntwitter.TwitterSearchScraper(search).get_items()):
            if i > max_tweets:
                break
            print("collecting tweets...")
            tweet_content.append(clean_text(t.content))
            tweet_dates.append(t.date)
        start_date = next_date
        next_date = next_date + dt.timedelta(days=1)
        print("finished a batch!")
    tweet_data = {'Time': tweet_dates, 'Text': tweet_content}
    tweet_df = pd.DataFrame(tweet_data)
    tweet_df.to_csv("./tweets/" + filename)
Example 16
def sns(query, num_tweets=10):
    output = []
    # We decode the parameter since we are calling the function from a separate server and the parameter info is binary-encoded
    engine_query = '#' + query.decode('utf-8') + ' -filter:retweets'
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(engine_query).get_items()):
        # We need to limit the no. of tweets manually since the sns Python wrapper doesn't include the functionality
        if i > num_tweets-1:
            break
        try:
            to_translate = tweet.content
            translated = GoogleTranslator(source='auto', target='en').translate(to_translate)
            sentimiento = TextBlob(translated).sentiment
        except Exception:
            # fall back to a neutral sentiment if translation or analysis fails
            sentimiento = TextBlob("").sentiment

        output.append({
            'Tweet_No': str(i+1),
            'Date': str(tweet.date),
            'ID': tweet.id,
            'Content': tweet.content,
            'Username': tweet.username,
            'Sentiment': round(sentimiento.polarity, 2)
        })

    return json.dumps(output, indent=3)
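A minimal usage sketch for the function above (the hashtag is only an example; assumes snscrape, TextBlob, and the GoogleTranslator class used in the snippet, which matches the deep-translator package, are installed and imported as shown there):

# The query arrives as bytes (e.g. from another service), hence the encode()
print(sns("bitcoin".encode("utf-8"), num_tweets=5))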
Example 17
    def get_tweets(self, keywords, tweets_per_week, weeks, lang='pl'):
        """
        :param tweets_per_week:
        :param weeks:
        :param lang:
        :param keywords: provide keywords separated by a +, e. g. "korona+szczepienie"
        :return: list Tweet objects
        """

        now = datetime.now()
        now_str = now.strftime('%Y-%m-%d')
        until = now
        since = now

        # Open/create a file to append data to
        csvFile = open(os.path.join('en', (keywords + '-sentiment-' + now_str + '.csv')), 'a', newline='',
                       encoding='utf8')

        # Use csv writer
        csvWriter = csv.writer(csvFile)
        csvWriter.writerow(['id', 'date', 'tweet', 'retweet_count', 'like_count'])
        for _ in range(weeks):
            until = since
            until_str = until.strftime('%Y-%m-%d')
            since = until - timedelta(days=7)
            since_str = since.strftime('%Y-%m-%d')
            for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
                    keywords + ' lang:' + lang + ' since:' + since_str + ' until:' + until_str + ' -filter:links -filter:replies').get_items()):
                if i > tweets_per_week:
                    break

                csvWriter.writerow([tweet.id, tweet.date, tweet.content, tweet.retweetCount, tweet.likeCount])
        csvFile.close()
Example 18
                    def graphs():

                        tweets_list2 = []

                        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(raw_text + ' since:' + since_date +' until:' + until_date).get_items()):
                            if i>count:
                                break
                            tweets_list2.append([tweet.content])
                        
                        df = pd.DataFrame(tweets_list2, columns=['Tweet'])

                        # Create Dataframe with just tweets
                        df['cleanLinks'] = df['Tweet'].apply(lambda x: re.split(r'https:\/\/.*', str(x))[0])  # Removing URLs
                        df['cleanLinks'] = df['cleanLinks'].apply(lambda x: x.lower())  # applying lowercase to text

                        # Special Characters list
                        spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")",
                                      "*", "+", ",", "-", ".", "/", ":", ";", "<",
                                      "=", ">", "?", "@", "[", "\\", "]", "^", "_",
                                      "`", "{", "|", "}", "~", "–", '$']

                        for char in spec_chars:
                            df['cleanLinks'] = df['cleanLinks'].str.replace(char, ' ', regex=False)
                            # Counting Numbers

                        def Wordcount(cleanLinks):
                            if 'buying' in cleanLinks.lower():
                                return 'buy positions mentioned'
                            if 'selling' in cleanLinks.lower():
                                return 'sell positions mentioned'
                            if 'buy' in cleanLinks.lower():
                                return 'buy positions mentioned'
                            if 'sell' in cleanLinks.lower():
                                return 'sell positions mentioned'
                            if 'short' in cleanLinks.lower():
                                return 'short positions mentioned'
                            if 'long' in cleanLinks.lower():
                                return 'long positions mentioned'
                            if 'put' in cleanLinks.lower():
                                return 'puts mentioned'
                            if 'call' in cleanLinks.lower():
                                return 'calls mentioned'
                            else:
                                return

                        df['Market Polar Position'] = df['cleanLinks'].apply(Wordcount)

                        # Graph numbers
                        st.markdown('**Visualization of investor positions:**')
                        position_A = df['Market Polar Position'].value_counts()
                        st.write(position_A)

                        # Graph
                        plt.axis('off')
                        df['Market Polar Position'].value_counts().plot(kind='pie', autopct='%1.1f%%',
                                                                        figsize=(10, 5))
                        plt.savefig('buyers.png')
                        buy = Image.open("buyers.png")
                        return buy
Example 19
def main():
    keyword = input('Informe um usuário ou tópico para buscar: ')
    maxTweets = int(input("Selecione a quantidade de tweets para buscar: "))

    # Start an empty csv file to work with
    csvFile = open(keyword + '-sentiment-' + now + '.csv',
                   'a',
                   newline='',
                   encoding='utf8')

    # Use csv writer to open the file for writing and define its columns
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow([
        'id',
        'date',
        'tweet',
    ])

    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                keyword + ' since:' + yesterday + ' until:' + now +
                ' -filter:links -filter:replies').get_items()):
        if i > maxTweets:
            break
        csvWriter.writerow([tweet.id, tweet.date, tweet.content])
    csvFile.close()

    # Start the sentiment analysis
    analyzer = SentimentIntensityAnalyzer()

    # Read the CSV back into our program
    df = pd.read_csv('~/Documents/PycharmProjects/Diversos/webscrap/' +
                     keyword + '-sentiment-' + now + '.csv',
                     parse_dates=True,
                     index_col=0)

    # Create the sentiment columns
    df['compound'] = [
        analyzer.polarity_scores(x)['compound'] for x in df['tweet']
    ]
    df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['tweet']]
    df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['tweet']]
    df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['tweet']]

    # Take the average sentiment of each column
    avg_compound = np.average(df['compound'])
    avg_neg = np.average(df['neg']) * -1
    avg_neu = np.average(df['neu'])
    avg_pos = np.average(df['pos'])

    # Tweet count
    count = len(df.index)

    # Print the analysis results
    print("Foram encontrados ", count, "tweets sobre " + keyword, end='\n*')
    print("Sentimentos positivos:", '%.2f' % avg_pos, end='\n*')
    print("Sentimentos neutros:", '%.2f' % avg_neu, end='\n*')
    print("Sentimentos negativos:", '%.2f' % avg_neg, end='\n*')
    print("Sentimentos compostos:", '%.2f' % avg_compound, end='\n')
Example 20
def tweets_search(request):
  if request.method == 'POST':
    key_groups = []
    # Get the data submitted by the user
    username = request.POST.get('username')
    start_date = request.POST.get('startDate').split('-')
    end_date = request.POST.get('endDate').split('-')
    type_search = request.POST.get('search')
    num_search = request.POST.get('num')
    keywords = request.POST.get('words').split(',')

    # Guard against users leaving a field empty
    if keywords == [''] or start_date == [''] or end_date == ['']:
      return render(request, 'portal/tweets_search.html')

    # Convert the date from DD-MM-YYYY format to YYYY-MM-DD
    begin_date = f'{start_date[2]}-{start_date[1]}-{start_date[0]}'
    end_date = f'{end_date[2]}-{end_date[1]}-{end_date[0]}'

    num = len(keywords)
    j = 0
    # Build the string that defines the search to be run
    search = ''
    while num > j:
      # If a username was provided, prepend it to the query
      if username != '' and j == 0:
        search = search + f'from:{username}'
      # Append the first keyword
      if j == 0:
        search = search + f' {keywords[0]}'
      # Append the remaining keywords
      else:
        # Join with AND when tweets must contain all keywords
        if type_search == 'all-kw':
          search = search + f' AND {keywords[j]}'
        # Join with OR when tweets need at least one keyword
        else:
          search = search + f' OR {keywords[j]}'
      j += 1

    # Append the start and end dates of the search
    search = search+ f' since:{begin_date}' + f' until:{end_date}'

    tweets = []
    datas = []
    # Run the search defined above, limited to the maximum number of tweets
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(search).get_items()):
      if num_search != 'ilimitado' and i > int(num_search):
        break
      
      # Get and reformat the tweet's date
      data = str(tweet.date).split()[0]
      data = data.split('-')
      data = f'{data[2]}-{data[1]}-{data[0]}'
      tweets.append([data, tweet.content])

    return render(request, 'portal/tweets_list.html', {'tweets': tweets})
  else:
    return render(request, 'portal/tweets_search.html')
Example 21
                    def gen_wordcloud():

                        tweets_list2 = []

                        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(raw_text + ' since:' + since_date +' until:' + until_date).get_items()):
                            if i>count:
                                break
                            tweets_list2.append([tweet.content])
                        
                        df = pd.DataFrame(tweets_list2, columns=['Tweet'])

                        df['cleanLinks'] = df['Tweet'].apply(lambda x: re.split(r'https:\/\/.*', str(x))[0])  # Removing URLs
                        df['cleanLinks'] = df['cleanLinks'].apply(lambda x: x.lower())  # applying lowercase to text

                        # Special Character list
                        spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")",
                                      "*", "+", ",", "-", ".", "/", ":", ";", "<",
                                      "=", ">", "?", "@", "[", "\\", "]", "^", "_",
                                      "`", "{", "|", "}", "~", "–", '$']

                        for char in spec_chars:
                            df['cleanLinks'] = df['cleanLinks'].str.replace(char, ' ', regex=False)
                        # WC generation
                        words = " ".join(df['cleanLinks'])

                        # remove punctuation and stop words
                        def punctuation_stop(text):

                            filtered = []
                            stop_words = set(stopwords.words('english'))
                            word_tokens = word_tokenize(text)
                            for w in word_tokens:
                                if w not in stop_words and w.isalpha():
                                    filtered.append(w.lower())
                            return filtered

                        unwanted = [raw_text, raw_text_U, 'market', 'moving', 'average', 'economy', 'stockmarket',
                                    'stocks', 'stock', 'people', 'money', 'markets', 'today', 'http', 'the', 'to', 'and',
                                    'is',
                                    'of',
                                    'in', 'it', 'you', 'for', 'on', 'this', 'will', 'are', 'price', 'dow', 'jones',
                                    'robinhood', 'link', 'http', 'dow', 'jones', 'order', '//', 'sign', 'join', 'claim']
                        try:
                            words_filtered = punctuation_stop(words)
                            text = " ".join([ele for ele in words_filtered if ele not in unwanted])
                            wc = WordCloud(background_color="gray", stopwords=STOPWORDS, max_words=500, width=2000,
                                           height=2000)
                            wc.generate(text)
                            plt.imshow(wc, interpolation="bilinear")
                            plt.axis('off')
                            plt.savefig('WC.png')
                            gen = Image.open("WC.png")
                            plt.show()
                            return gen

                        except ValueError:
                            st.error('**Not enough tweets found to build wordcloud**')
Example 22
def extract(line, max_lim, log, repeatinfo):
    global username
    global date
    global lang
    global text
    global likes
    #global location
    global sharedata
    global url
    global media
    global repeated

    start_hash_time = time.time()
    line = re.sub(r'[ˆ\n]', r'', line)
    tweetdata = line.split(';')

    if log:
        print( "Extracting " + tweetdata[0] + " in " + str(tweetdata[1]) + " >> " + str(tweetdata[2]) + " ...")
    results = 0

    # Extract data
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(tweetdata[0] + " since:" + tweetdata[1] + " until:" + tweetdata[2]).get_items()):

        if (i > max_lim) and (max_lim > 0):   # Max limit of results
            if log:
                print("Maximum Limit of Extraction! Extraction stopped!")
            break
        
        if (text.count(tweet.content) == 0) or (username.count(tweet.user.username) == 0): # Check for duplicates
            username.append(tweet.user.username)
            date.append(tweet.date)
            lang.append(tweet.lang)
            text.append(tweet.content)
            likes.append(tweet.likeCount)
            #location.append(tweet.location)
            sharedata.append("likes=" + str(tweet.likeCount) + ";retweets=" + str(tweet.retweetCount) + ";replies=" + str(tweet.replyCount) + ";quotes=" + str(tweet.quoteCount))
            url.append(tweet.url)
            if repeatinfo != "":
                repeated.append(repeatinfo)
            
            if tweet.media:
                mediaurl = []
                for medium in tweet.media:
                    if medium.type == "photo":
                        mediaurl.append(medium.fullUrl)
                    elif medium.type == "video":
                        for v in medium.variants:
                            mediaurl.append(v.url.replace("?tag=13", "").replace("?tag=10", ""))
                media.append(mediaurl)
            else:
                media.append([])
            
            results = i

    end_hash_time = formatTime(time.time() - start_hash_time)
    if log:
        print(str(results), tweetdata[0], "tweet(s) extracted in {:0>2}:{:0>2}:{:05.2f}".format(int(end_hash_time[0]), int(end_hash_time[1]), end_hash_time[2]), "\n")
Example 23
def getAuthorityData(startDate, endDate, user):
    query = "from:" + user + " since:" + startDate + " until:" + endDate
    #print (query)

    tweets = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        tweets.append(Tweet(tweet.id, tweet.date, tweet.content, tweet.username))
        #print(tweet.id)
    return tweets
Example 24
 async def retrieve_trends(self):
     while True:
         today = datetime.date.today()
         query :str = self._hashtag + ' since:' + str(today)
         print(f'query is {query}')
         totaltweets = sntwitter.TwitterSearchScraper(query)
         for i, tweet in enumerate(totaltweets.get_items()):
             #print(tweet.date)
             #print(tweet.content)
             if tweet is None: continue
             inf :List[float]= [tweet.retweetCount, tweet.likeCount, tweet.replyCount]
             if inf == []: continue
             if self.my_criteria(inf):
                 thisuser: str = tweet.user.username
                 print(f'influencer {thisuser}')
                 untilday = tweet.date.date()
                 last_N_tweets = sntwitter.TwitterSearchScraper('from:' + thisuser +' until:'+str(untilday)).get_items()
                 infthisuer:List[List[float]] = []
                 userinf = []
                 if thisuser in self._userinf.keys():
                     userinf = self._userinf[thisuser]
                 else:
                     for N, tweet_user in enumerate(last_N_tweets):
                         if tweet_user is None: continue
                         # print(tweet_user.date)
                         if N > self._PastN:
                             break
                         #got_hashtags = self.retrive_hashtags(content=tweet_user.content)
                         if N <= self._PastN:
                             infthisuer.append([tweet_user.retweetCount, tweet_user.likeCount, tweet_user.replyCount])
                     if len(infthisuer) ==0: continue
                     infthisuer = np.array(infthisuer, dtype=np.float32)
                     avg = infthisuer.mean(0)
                     userinf = [avg[0], avg[1], avg[2]]
                 if self.alert_criteria(userinf,inf):
                     content = tweet.content.replace('\n', '').replace('\r', '')
                     userinf_str:str = str(userinf[0])+","+str(userinf[1])+","+str(userinf[2])
                     tweet_inf:str = str(inf[0])+","+str(inf[1])+","+str(inf[2])
                     alert_content = '\n'+tweet.url+' / '+str(tweet.date)+" / user inf "+ userinf_str+" / tweet inf "+tweet_inf+\
                                     " / "+tweet.user.username+" / "+content+'\n'
                     self.make_alert(alert_content)
         await asyncio.sleep(1800)
Example 25
def getEarliestTweets(hashtags, startDate, endDate, location):
    # TODO: start is always the current date


    #convert strings to actual date elements
    start = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    end = datetime.datetime.strptime(endDate,"%Y-%m-%d")

    # build query
    hashString = " OR ".join(hashtags)
    # print(hashString)

    ids = []
    results = []

    tweets_per_day = {}

    while start <= end:
        query = hashString + " since:" + start.strftime("%Y-%m-%d") + " until:" + (start + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        if location is not None:
            query += ''' near:"''' + location+'''" within:50mi'''
        #print(query)

        frequencycounter = 0

        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
                query).get_items()):
            frequencycounter+=1
            results.append(Tweet(tweet.id,tweet.date,tweet.content,tweet.username))
            ids.append(tweet.id)
            #print(tweet.id)
            #print(tweet.date)
            #print(tweet.content+"\n")

        tweets_per_day[start.strftime("%Y-%m-%d")] = frequencycounter
        start = start + datetime.timedelta(days=1)
        """for i in reversed(results):
            print(i.date)
            print(i.id)
            print(i.content+"\n")"""
    print(tweets_per_day)
    return ids
Example 26
def getPlayerTweets(handle: str, since: str, until: str) -> dict:
    """
    Retrieves player tweets within the specified time period.

    :param handle: handle of player whose tweet IDs are being retrieved
    :param since: string indicating the lower bound date
    :param until: string indicating the upper bound date
    :return: dict with tweet id as key, tweet object as value
    """
    return {tweet.id: tweet for tweet in
            sntwitter.TwitterSearchScraper('from:%s since:%s until:%s' % (handle, since, until)).get_items()}
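A minimal usage sketch for the function above (handle and dates are only illustrative; assumes snscrape.modules.twitter is imported as sntwitter):

import snscrape.modules.twitter as sntwitter

tweets_by_id = getPlayerTweets("KingJames", since="2021-01-01", until="2021-01-08")
for tweet_id, tweet in tweets_by_id.items():
    print(tweet_id, tweet.date, tweet.content[:80])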
Example 27
def twitter_scrape(handle, num_tweet):
    """
    Input:  1) handle: Twitter handle of a person without '@' (e.g. JoeBiden)
            2) num_tweet: The number of past tweets the analysis will be based on (e.g. 100)

    Outputs: 1) Dataframe containing scraped tweets
    """
    raw_tweet_df = pd.DataFrame(
        itertools.islice(sntwitter.TwitterSearchScraper('from:' + handle).get_items(), num_tweet))

    return raw_tweet_df
Example 28
def scrape(date):
    # creating generator for scraping
    tweets = twitter.TwitterSearchScraper(
        f"bitcoin since:{yest2str(date)} until:{date} filter:has_engagement lang:en"
    ).get_items()
    # iterating through all tweets
    tweets = itertools.islice(tweets, n_iter)
    # storing tweets in pandas dataframe
    df = pd.DataFrame(tweets)
    # returning the necessary columns of the dataframe
    return df[['date', 'content']]
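The generator-plus-islice pattern above caps an otherwise unbounded scrape without an explicit break. A self-contained sketch of the same idea (the query, dates, and the 100-tweet cap are only illustrative):

import itertools

import pandas as pd
import snscrape.modules.twitter as twitter

# Lazily pull tweets and keep at most the first 100
gen = twitter.TwitterSearchScraper(
    "bitcoin since:2021-03-14 until:2021-03-15 lang:en").get_items()
df = pd.DataFrame(itertools.islice(gen, 100))
print(df[['date', 'content']].head())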
Example 29
def search_tweets_sn(q,
                     since=None,
                     until=None,
                     username=None,
                     near=None,
                     radius=None,
                     lang=None,
                     max_tweets=-1,
                     quiet=False):
    """
    Search tweets according to keyword arguments specified using snscrape.

    Parameters
    ----------
    q (str): A query text to be matched.
    since (str. "yyyy-mm-dd"): A lower bound date (UTC) to restrict search. Default is 7 days before today.
    until (str. "yyyy-mm-dd"): An upper bound date (not included) to restrict search. Default is today.
    username (str): An optional username of a twitter account (with or without "@"). Default is no username restriction.
    near (str): A reference location area (e.g. Milan) from where tweets were generated. Default is no reference area.
    radius (str): A distance radius (e.g. 15km) from location specified by "near". Meaningful only if "near" is set.
    lang (str): Restrict language of the tweets retrieved. Must be an ISO 639-1 code (e.g. en, it, etc.). Default is no language restriction.
    max_tweets (int): The maximum number of tweets to be retrieved. If left at the default of -1, all available tweets will be retrieved.

    Returns
    -------
    tweets (NLPTweetList): list of tweets resulting from the search and amenable to analysis.
    """
    if until is None:
        until = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
    if since is None:
        since = datetime.datetime.strftime(
            datetime.datetime.strptime(until, '%Y-%m-%d') -
            datetime.timedelta(days=7), '%Y-%m-%d')
    if max_tweets == -1:
        max_tweets = sys.maxsize

    criteria = f"{q} since:{since} until:{until} exclude:retweets exclude:replies"

    if username is not None:
        criteria += f" from:{username}"
    if near is not None:
        criteria += f" near:{near.replace(' ', '&')}"
    if radius is not None:
        criteria += f" within:{radius}"
    if lang is not None:
        criteria += f" lang:{lang}"

    tweets = NLPTweetList(islice(
        sntwitter.TwitterSearchScraper(criteria).get_items(), max_tweets),
                          tqdm_total=max_tweets,
                          quiet=quiet)
    return tweets
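A minimal usage sketch for the function above (query and dates are only illustrative; assumes snscrape and the NLPTweetList wrapper used inside are importable):

tweets = search_tweets_sn("data science",
                          since="2021-03-01",
                          until="2021-03-08",
                          lang="en",
                          max_tweets=50)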
Example 30
def snl():
    last = 1373977398891450374
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                'vacinas OR vacina OR vacinacao OR vacinacao (@FlavioDino OR @GovernoMA) max_id:{} since:2021-01-17 until:2021-05-01'
                .format(last)).get_items()):
        if i > 5:
            break
        last = tweet.id
        print("\n")
        print("tweet id: {}".format(tweet.id))
        print("tweet text: {}".format(tweet.content))
        print("tweet date: {}".format(tweet.date))