Example 1
    def get_tweets(self, user, sample_size, output_to_file=True):
        sample = query_tweets_from_user(user, sample_size)

        if output_to_file:
            with open(f'{user}_tweets.json', 'w',
                      encoding='utf-8') as output_file:
                output_file.write(json.dumps(sample, cls=JSONEncoder))

        return sample
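The method above passes a custom JSONEncoder (cls=JSONEncoder) that is defined elsewhere in that project. A minimal sketch of such an encoder, assuming it only needs to handle datetime fields and tweet objects, could be:

import json
from datetime import datetime

class JSONEncoder(json.JSONEncoder):
    # Hypothetical encoder: serialize datetimes as ISO strings and
    # tweet objects via their attribute dictionaries.
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if hasattr(obj, '__dict__'):
            return vars(obj)
        return super().default(obj)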
Example 2
def get(name, limit=None):
    # Fetch tweets for the given user and write them to a JSON file
    file = open(f"{name}_tweets.json", "w")  # hypothetical output filename
    json_list = []
    for tweet in query_tweets_from_user(name, limit):
        tweet.timestamp = datetime.strftime(tweet.timestamp, '%Y-%m-%d %H:%M:%S')
        json_list.append(vars(tweet))
    json.dump(json_list, file)
    file.close()
    print("Done writing")
def new_tweets():
    list_of_tweets_UN = query_tweets_from_user('UN', 10)
    list_of_tweets_CDC = query_tweets_from_user('CDCgov', 10)
    list_of_tweets_WHO = query_tweets_from_user('WHO', 10)

    file = open('data/WHO_twitter_new.json', 'w')
    json.dump(list_of_tweets_WHO, file, cls=JSONEncoder)
    file.close()
    file = open('data/UN_twitter_new.json', 'w')
    json.dump(list_of_tweets_UN, file, cls=JSONEncoder)
    file.close()
    file = open('data/CDC_twitter_new.json', 'w')
    json.dump(list_of_tweets_CDC, file, cls=JSONEncoder)
    file.close()

    twit_CDC = pd.read_json('data/CDC_twitter_output.json', encoding='utf-8')
    new_twit_CDC = pd.read_json('data/CDC_twitter_new.json', encoding='utf-8')
    new_twit_CDC = tweet_parsing(new_twit_CDC)
    twit_df = pd.concat([twit_CDC, new_twit_CDC], ignore_index=True)
    new_twit_CDC = twit_df.drop_duplicates('tweet_id')
    new_twit_CDC.to_json('data/CDC_twitter_output.json', orient='records')

    twit_UN = pd.read_json('data/UN_twitter_output.json', encoding='utf-8')
    new_twit_UN = pd.read_json('data/UN_twitter_new.json', encoding='utf-8')
    new_twit_UN = tweet_parsing(new_twit_UN)
    twit_df = pd.concat([twit_UN, new_twit_UN], ignore_index=True)
    new_twit_UN = twit_df.drop_duplicates('tweet_id')
    new_twit_UN.to_json('data/UN_twitter_output.json', orient='records')

    twit_WHO = pd.read_json('data/WHO_twitter_output.json', encoding='utf-8')
    new_twit_WHO = pd.read_json('data/WHO_twitter_new.json', encoding='utf-8')
    new_twit_WHO = tweet_parsing(new_twit_WHO)
    twit_df = pd.concat([twit_WHO, new_twit_WHO], ignore_index=True)
    new_twit_WHO = twit_df.drop_duplicates('tweet_id')
    new_twit_WHO.to_json('data/WHO_twitter_output.json', orient='records')

    twit_df = pd.concat([new_twit_CDC, new_twit_UN, new_twit_WHO],
                        ignore_index=True)

    return twit_df
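The tweet_parsing helper used above is defined elsewhere in that project. A plausible minimal version, assuming it only normalizes the timestamp column before the new tweets are merged and de-duplicated, might be:

import pandas as pd

def tweet_parsing(df):
    # Hypothetical helper: normalize freshly scraped tweets before merging.
    df = df.copy()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df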
Example 4
def write_tweets(username, filename):
    file = open(filename, "w")
    count = 0
    punct = punctuation + "”“"
    for tweet in query_tweets_from_user(username, 1000):
        line = str(tweet.text.encode('utf-8'))
        # remove punctuation, etc
        line = ' '.join(word for word in line.split(' ') if (
            not word.startswith('\\') and not word.startswith('pictwitter')))
        line = line.translate(str.maketrans('', '', punct)).lower()
        if line.strip() != '':
            file.write(line[1:].strip() + "\n")
        count = count + 1
    print(str(count) + " actual tweets")
    file.close()
Example 5
 async def poll_implementation(self, bot, account, roomid, send_messages):
     try:
         tweets = query_tweets_from_user(account, limit=1)
         self.logger.info(
             f'Polling twitter account {account} - got {len(tweets)} tweets'
         )
         for tweet in tweets:
             if tweet.tweet_id not in self.known_ids:
                 if send_messages:
                     await bot.send_html(
                         bot.get_room_by_id(roomid),
                         f'<a href="https://twitter.com{tweet.tweet_url}">Twitter {account}</a>: {tweet.text}',
                         f'Twitter {account}: {tweet.text} - https://twitter.com{tweet.tweet_url}'
                     )
             self.known_ids.add(tweet.tweet_id)
     except Exception:
         self.logger.error('Polling twitter account failed:')
         traceback.print_exc(file=sys.stderr)
Example 6
    def start(self):
        try:
            user = self.config['user']
            name = self.config['name']

            if name == '' or name == user:
                name = user

                list_of_tweets = query_tweets_from_user(user, limit=10)
                for tweet in list_of_tweets:
                    if tweet.username == user:
                        name = tweet.fullname
                        break

                cursor = self.con.cursor()
                cursor.execute(
                    'UPDATE twitter_blog SET name=? WHERE user=?', (name, user))
                self.con.commit()

            for tweet in get_tweets(user, pages=1000):
                update_time = int(time.mktime(tweet['time'].timetuple()))  # timestamp
                for photo in tweet['entries']['photos']:
                    self.save_item(photo, name, update_time)
                for video in tweet['entries']['videos']:
                    print('video', video)
                    print('https://video.twimg.com/tweet_video/%s.mp4' % video['id'])

                # update the stored time
                new_time = datetime.datetime.fromtimestamp(
                    update_time).strftime("%Y-%m-%d %H:%M:%S")
                cursor = self.con.cursor()
                cursor.execute(
                    'UPDATE twitter_blog SET update_time=? WHERE user=? AND update_time<?', (new_time, user, new_time))
                self.con.commit()
        except Exception as e:
            print('Exception occurs at line ' +
                  e.__traceback__.tb_lineno.__str__(), e)
Example 7
def main():
    logger.info({'Hello world': '1'})
    try:
        config_path = os.path.split(
            os.path.realpath(__file__))[0] + os.sep + 'config.json'
        if not os.path.isfile(config_path):
            sys.exit(u'config.json not found in current path: %s' %
                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
        with open(config_path) as f:
            config = json.loads(f.read())
        validate_config(config)
        print('hello world')
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = get_user_list(user_id_list)
        for user in user_id_list:
            print(user)
            list_of_tweets = query_tweets_from_user(user, 10)
            outPutFileName = get_filepath(user, 'data') + '.csv'
            with open(outPutFileName, "w", encoding="utf-8") as output:
                writer = csv.writer(output)
                writer.writerow(["text_html", "img_url", "video_url", "links"])
                for t in list_of_tweets:
                    writer.writerow(
                        [t.text_html, t.img_urls, t.video_url, t.links])
                    for imgUrl in t.img_urls:
                        download_one_file(user, 'img', imgUrl)
                    for videoUrl in t.video_url:
                        download_one_file(user, 'video', videoUrl)
    except ValueError:
        print('config.json is not in the correct format')
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()
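get_user_list, get_filepath and download_one_file are project helpers not shown in this example. As an illustration only, a get_user_list that reads one username per line from a text file could look like:

def get_user_list(path):
    # Hypothetical helper: one twitter username per line.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]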
Example 8
from twitterscraper import query_tweets_from_user
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import AdaBoostClassifier
sid = SentimentIntensityAnalyzer()
model = AdaBoostClassifier()

features = []
labels = []

all_tweets = query_tweets_from_user("barackobama", limit=800)

training = all_tweets[:600]
testing = all_tweets[600:]

for tweet in training:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    features.append([
        int(tweetAnalysis["neg"] * 100),
        int(tweetAnalysis["pos"] * 100),
        int(tweetAnalysis["neu"] * 100),
        int(tweet.retweets / 1000)
    ])
    labels.append(int(tweet.likes / 1000))

model = model.fit(features, labels)

matches = 0
errors = 0
for test in testing:
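    # (Sketch only: the example is truncated here. A plausible evaluation body,
    # mirroring the feature layout used for training, could be:)
    analysis = sid.polarity_scores(test.text)
    predicted = model.predict([[
        int(analysis["neg"] * 100),
        int(analysis["pos"] * 100),
        int(analysis["neu"] * 100),
        int(test.retweets / 1000)
    ]])[0]
    if predicted == int(test.likes / 1000):
        matches += 1
    else:
        errors += 1

print(matches, "matches,", errors, "errors")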
Example 9
from twitterscraper import query_tweets_from_user, query_tweets

# for tweet in query_tweets("Uro, Titan", 10):
#     print(tweet)
file = open("output.txt", "w")
tweets = query_tweets_from_user("@MTGGoldfish")
print(tweets, len(tweets))

for tweet in tweets:
    file.write(str(tweet.text.encode('utf-8')))
    print(str(tweet.text.encode('utf-8')), "\n\n")

file.close()
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.neural_network import MLPClassifier

#Initialize our sentiment analyzer
sid = SentimentIntensityAnalyzer()

#Build a simple neural network: a multi-layer perceptron classifier with 3 hidden layers of 10 neurons each
mlp = MLPClassifier(hidden_layer_sizes=(10,10,10))

features = []
labels = []

#Read 40 tweets from Obama's account
all_tweets = (query_tweets_from_user("barackobama", limit=40))

#Take Obama's first 20 tweets to train our neural network; that is, for each tweet we have its sentiment and its number of likes, and we want the network, given the sentiment, to return the number of likes
training = all_tweets[:20]
#The last 20 tweets are kept for testing
testing = all_tweets[20:]

#For each tweet we take its sentiment, an array of three numbers, e.g. [0.1, 0.2, 0.8], giving the probability that the sentiment is negative, positive, or neutral. For each tweet we append its sentiment to an array called "features" and its number of likes to an array called "labels"
for tweet in training:
	tweetAnalysis = sid.polarity_scores(tweet.text)
	features.append([tweetAnalysis["neg"],tweetAnalysis["pos"],tweetAnalysis["neu"]])
	labels.append(tweet.likes)

#Train the neural network simply by calling the "fit" function
mlp = mlp.fit(features,labels)
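The example stops after training; a minimal evaluation pass over the held-out tweets, assuming the same three-score feature layout, could be:

for tweet in testing:
    scores = sid.polarity_scores(tweet.text)
    predicted_likes = mlp.predict([[scores["neg"], scores["pos"], scores["neu"]]])[0]
    print(tweet.likes, predicted_likes)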
Example 11
#Code for collecting by date
#enddate = dt.date.today() + dt.timedelta(days=1) # One day must be added, otherwise the most recent posts are missed because of the time zone offset.
#list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))
if __name__ == '__main__':
    '''
        0. Use query_tweets_from_user to pull the author's 5 most recent posts.
        1. Check whether the URL already exists in the database
        2. If it already exists, skip it and move on to the next tweet object
        3. If not, open the Boannews page and crawl it

        Downside: hammers the database...
    '''

    lately_twitter_limit = 5  # number of most recent tweets to scrape.
    list_of_tweets = query_tweets_from_user(
        "boannews", limit=lately_twitter_limit)  # fetch the 5 most recent Boannews posts.

    for tweet in list_of_tweets:
        #print(tweet.text)
        #print(tweet.timestamp)
        #print(tweet.text_html)
        tweet_text = tweet.text
        url = tweet_text[tweet_text.find(
            "http://www.boannews.com/"
        ):]  # http://www.boannews.com/media/view.asp?idx=84215\xa0…
        boannews_url = url[:len(
            url) - 2]  # http://www.boannews.com/media/view.asp?idx=84215

        # check for a duplicate URL in the database
        sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
        val = (boannews_url)
# In[1]:

from twitterscraper import query_tweets
from twitterscraper import query_tweets_from_user
import datetime as dt
import pandas as pd
import nltk

begin_date = dt.date(2017, 1, 1)
end_date = dt.date(2020, 4, 25)

limit = 1000
#lang = 'english'
user = '******'

tweets = query_tweets_from_user(user=user)

df = pd.DataFrame(t.__dict__ for t in tweets)

# In[ ]:

#csv_data = df.to_csv (r'C:\Users\44759\Documents\Anaconda\twitter_shit\tweets.csv', index = None, header = True)

# In[2]:

df.head(60)
#df = df[]

# In[3]:

##for index, row in df.iterrows():
Example 13
    file_data = OrderedDict()
    file_data['author'] = author
    file_data['post_create_datetime'] = date[8:] + ":00"  # 2015-01-01 12:10:00
    file_data['title'] = news_title
    file_data['content'] = content
    file_data['url'] = r2.url
    file_data['publisher'] = '보안뉴스'
    return file_data


if __name__ == '__main__':
    #Code for collecting by date
    #enddate = dt.date.today() + dt.timedelta(days=1) # One day must be added, otherwise the most recent posts are missed because of the time zone offset.
    #list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))

    list_of_tweets = query_tweets_from_user("boannews")  #특정 유저의 모든 데이터를 수집.
    finish = len(list_of_tweets) - 1

    for num, tweet in enumerate(list_of_tweets):
        #print(tweet.text)
        #print(tweet.timestamp)
        #print(tweet.text_html)
        tweet_text = tweet.text
        url = tweet_text[tweet_text.find(
            "http://www.boannews.com/"
        ):]  # http://www.boannews.com/media/view.asp?idx=84215\xa0…
        boannews_url = url[:len(
            url) - 2]  # http://www.boannews.com/media/view.asp?idx=84215
        session = HTMLSession()
        r2 = session.get(boannews_url)  # open a session to Boannews and fetch the page.
Example 14
    finally:
        connection.close()

    return result
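Only the tail of the database helper survives in this excerpt. For context, a hypothetical select_mydb with that shape (pymysql, with placeholder connection settings) might look like:

import pymysql

def select_mydb(sql, val):
    # Hypothetical sketch; host/user/password/db values are placeholders.
    connection = pymysql.connect(host='localhost', user='user',
                                 password='password', db='crawler',
                                 charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, val)
            result = cursor.fetchall()
    finally:
        connection.close()
    return result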


if __name__ == '__main__':
    '''
    Pass the twitter user to crawl as a parameter
    https://twitter.com/kisa118
    the_boan
    '''
    user_list = ["kisa118", "softwarecatalog"]  # twitter users to collect; "the_boan"
    for user in user_list:

        list_of_tweets = query_tweets_from_user(user=user)  # collect all of this user's tweets.

        for tweet in list_of_tweets:
            # check for a duplicate URL in the database
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (tweet.tweet_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 if it exists, 0 otherwise

            if is_exists:  # the URL is already stored, skip it
                continue
            else:  # the URL is new, insert the data.
                dict_data = OrderedDict()
                dict_data['author'] = tweet.username
                dict_data[
                    'post_create_datetime'] = tweet.timestamp  # 2015-01-01 12:10:00
                dict_data['title'] = tweet.text[:255]
Example 15
from twitterscraper import query_tweets_from_user
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import datetime as dt
begin_date = dt.date(2020, 4, 1)
end_date = dt.date(2020, 4, 2)

limit = 500
lang = "english"
user = "******"
#tweets=query_tweets_from_user("guardian", begindate=begin_date, enddate=end_date, limit=limit, lang=lang)
tweets = query_tweets_from_user("cnn", limit=limit)
df = pd.DataFrame(t.__dict__
                  for t in tweets)  #forming a DataFrame of all the tweets
lengthrows = len(df.index)
# print("row len =",lengthrows)
df = df.drop(
    [
        'screen_name', 'username', 'user_id', 'tweet_id', 'tweet_url',
        'timestamp', 'timestamp_epochs', 'text_html', 'hashtags', 'has_media',
        'img_urls', 'video_url', 'replies', 'is_replied', 'is_reply_to',
        'parent_tweet_id', 'reply_to_users'
    ],
    axis=1
)  #removing all the information we do not need and keeping only text, retweets and likes

for i in range(0, lengthrows):
Example 16
import time

import pandas as pd
import twitterscraper

print('Web Scraping .....')
time.sleep(1)
print('@joe_exotic')
time.sleep(1)
print(('...........'))
time.sleep(1)
print('----------------')
time.sleep(1)
print('The Tiger King')
time.sleep(5)

# Scrape Tweets
handle = 'joe_exotic'
out = twitterscraper.query_tweets_from_user(handle)

# Create Empty storage
df = pd.DataFrame()
Date = []
Text = []
Likes = []
Retweets = []
# Read data into respective columns
for tweet in out:
    Date.append(tweet.timestamp)
    Text.append(tweet.text)
    Likes.append(tweet.likes)
    Retweets.append(tweet.retweets)

# Turn into csv
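# (Sketch only: the script ends before this step. A minimal completion,
# with an assumed output filename, could be:)
df = pd.DataFrame({'Date': Date, 'Text': Text, 'Likes': Likes, 'Retweets': Retweets})
df.to_csv('joe_exotic_tweets.csv', index=False)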
Example 17
# import libraries
from twitterscraper import query_tweets_from_user
import pandas as pd
import extract_text_from_url as et
import os

# number of tweets
limit = 5

# user tweets
user = input('Enter twitter username without @: ')
user = user.lower()

# all tweets within limit
tweets = query_tweets_from_user(user, limit=limit)

# convert twitter object into DataFrame
df = pd.DataFrame(tweet.__dict__ for tweet in tweets)

# clean tweets and remove retweets
df['screen_name'] = df['screen_name'].apply(lambda x: x.lower())
df = df[df['screen_name'] == user]

# strip non-ASCII characters from user text ---------------------Work in progress
df['text'] = df['text'].apply(
    lambda x: x.encode('ascii', 'ignore').decode('ascii'))

# convert user text to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

Example 18
def scrape_tweets_to_db(cursor, user, limit=None):
    for tweet in query_tweets_from_user(user, limit=limit):
        cursor.execute(
            '''INSERT INTO tweets VALUES(?,?,?,?)''',
            (None, tweet.text, remove_url(tweet.text), tweet.screen_name))
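remove_url is a helper defined elsewhere in that project; a common regex-based version, given here only as a sketch, is:

import re

def remove_url(text):
    # Hypothetical helper: strip http(s) and pic.twitter.com links from tweet text.
    return re.sub(r'https?://\S+|pic\.twitter\.com/\S+', '', text).strip()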
Example 19

#Code for collecting by date
#enddate = dt.date.today() + dt.timedelta(days=1) # One day must be added, otherwise the most recent posts are missed because of the time zone offset.
#list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))
if __name__ == '__main__':
    '''
        0. Use query_tweets_from_user to pull the author's 5 most recent posts.
        1. Check whether the URL already exists in the database
        2. If it already exists, skip it and move on to the next tweet object
        3. If not, open the Boannews page and crawl it

        Downside: hammers the database...
    '''

    list_of_tweets = query_tweets_from_user(
        "AhnLab_SecuInfo")  # fetch posts from the AhnLab_SecuInfo feed.

    p = re.compile('[a-z]')

    for tweet in list_of_tweets:

        tweet_text = tweet.text
        url = tweet_text[tweet_text.find("https://asec.ahnlab.com/"):]

        if p.match(url):
            ahnlab_url = url[:len(url) - 1]
            # check for a duplicate URL in the database
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (ahnlab_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 if it exists, 0 otherwise
Example 20
        connection.close()

    return result


if __name__ == '__main__':
    '''
    Pass the twitter user to crawl as a parameter
    https://twitter.com/kisa118
    the_boan
    '''
    user_list = ["kisa118", "softwarecatalog"]  # twitter users to collect; "the_boan"
    for user in user_list:
        lately_twitter_limit = 5

        list_of_tweets = query_tweets_from_user(
            user=user, limit=lately_twitter_limit)  # collect the user's most recent tweets.

        for tweet in list_of_tweets:
            # check for a duplicate URL in the database
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (tweet.tweet_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 if it exists, 0 otherwise

            if is_exists:  # the URL is already stored, skip it
                continue
            else:  # the URL is new, insert the data.
                dict_data = OrderedDict()
                dict_data['author'] = tweet.username
                dict_data[
                    'post_create_datetime'] = tweet.timestamp  # 2015-01-01 12:10:00
                dict_data['title'] = tweet.text[:255]
Example 21
import re

from ekphrasis.classes.segmenter import Segmenter
from twitterscraper import query_tweets_from_user

#Two libraries are used: twitterscraper, for scraping Twitter, and
# ekphrasis, for sentiment analysis; here it is used specifically for hashtag segmentation


#Method to clean tweets: remove special characters, hashtags and URLs
def clean_tweet(tweet):
    tweet = re.sub(r"pic.\S+", "", tweet)
    return ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())


#Query the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

#Print the cleaned tweets and collect their hashtags
hashtagArray = []
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if len(tweetHashtag) != 0:
        hashtagArray.extend(tweetHashtag)
    print("\n")

#The corpus refers to the statistics used to segment the hashtags; in this case they are from Twitter
seg_tw = Segmenter(corpus="twitter")

print("Hashtags Segmention:\n")
Example 22
from twitterscraper import query_tweets_from_user
import pandas as pd

keyword = "@FinancialTimes"
raw_tweets = query_tweets_from_user(keyword, limit=100000000)
text, timestamp, likes, retweets, replies = [], [], [], [], []

for tweet in raw_tweets:
    text.append(tweet.text)
    timestamp.append(tweet.timestamp)
    likes.append(tweet.likes)
    retweets.append(tweet.retweets)
    replies.append(tweet.replies)

tweets = pd.DataFrame({"text": text, "timestamp": timestamp, "likes": likes, "retweets": retweets, "replies": replies})

# Don't need the exact h-m-s, cast it to date object.
tweets['timestamp'] = tweets['timestamp'].apply(lambda x: str(x.date()))
print(tweets.timestamp.unique())
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 30 01:46:37 2020

@author: Arthur Chu
"""

from twitterscraper import query_tweets_from_user
import pandas as pd

tweets = []

twitter_handle_file_path = r"C:\Users\Arthur Chu\Desktop\twitter_politics_ML\raw_data\congress_twitter_handles.csv"

twitter_handles = pd.read_csv(twitter_handle_file_path, encoding="ISO-8859-1")

twitter_handles.dropna(how="any", inplace=True)

handles = twitter_handles['Twitter Handle'].tolist()

for politician in handles:
    tweets += query_tweets_from_user(politician, limit=1000)

df = pd.DataFrame(t.__dict__ for t in tweets)

df.to_csv(
    r"C:\Users\Arthur Chu\Desktop\twitter_politics_ML\raw_data\all_tweets.csv",
    index=False)
Example 24
        'MinofHealthUG': 'Uganda',
        'MalawiGovt': 'Malawi',
        'mohgovgh': 'Ghana',
        'OMSMocambique': 'Mozambique',
        'integrateglobal': 'Togo',
        'RwandaHealth': 'Rwanda'
    }
    cols = [
        'Country', 'Twitter Handle', 'Timestamp', 'Content',
        'Likelihood of Update', 'URL'
    ]
    df = pd.DataFrame(columns=cols)

    # Pull tweets
    for twitter_handle in feeds:
        for tweet in query_tweets_from_user(twitter_handle, 10):
            translator = Translator()
            if tweet.screen_name.lower() == twitter_handle.lower():
                translated_tweet = translator.translate(tweet.text,
                                                        dest='en').text
                if test_relevence_covid(translated_tweet):
                    df = df.append(
                        pd.DataFrame([[
                            feeds[twitter_handle], twitter_handle,
                            tweet.timestamp, translated_tweet,
                            est_update(translated_tweet),
                            'https://twitter.com%s' % tweet.tweet_url
                        ]],
                                     columns=cols))

    values = df.sort_values(by='Timestamp',
Example 25
import plotly
import plotly.express as px
import plotly.graph_objs as go

#%%
# ------------------
# Got this method of pulling tweets from here:
# ------------------
# https://medium.com/@kevin.a.crystal/scraping-twitter-with-tweetscraper-and-python-ea783b40443b
# https://github.com/jenrhill/Power_Outage_Identification/blob/master/code/1_Data_Collection_and_EDA.ipynb
# https://www.youtube.com/watch?v=zF_Q2v_9zKY

user = '******'
limit = 10000

tweets = ts.query_tweets_from_user(user=user, limit=limit)


#%%
class TweetAnalyzer():
    """
    Functionality for analyzing and categorizing content from tweets.
    """

    #clean tweets
    def clean_tweet(self, tweet):
        return ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                   tweet).split())

    #creating sentiment score using TextBlob
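    # (Sketch only: the class is cut off here. A minimal sentiment method along
    # the lines the comment describes, assuming `from textblob import TextBlob`
    # at the top of the file, might be:)
    def analyze_sentiment(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        if analysis.sentiment.polarity > 0:
            return 1
        elif analysis.sentiment.polarity < 0:
            return -1
        return 0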