def get_tweets(self, user, sample_size, output_to_file=True):
    sample = query_tweets_from_user(user, sample_size)
    if output_to_file:
        with open(f'{user}_tweets.json', 'w', encoding='utf-8') as output_file:
            output_file.write(json.dumps(sample, cls=JSONEncoder))
    return sample

def get(name, limit=None):
    # Fetch tweets from the user given by `name` and write them to a JSON file.
    # NOTE: the original opened an undefined name `new`; a per-user filename is assumed here.
    file = open(f'{name}_tweets.json', "w")
    json_list = []
    for tweet in query_tweets_from_user(name, limit):
        tweet.timestamp = datetime.strftime(tweet.timestamp, '%Y-%m-%d %H:%M:%S')
        json_list.append(vars(tweet))
    json.dump(json_list, file)
    file.close()
    print("Done writing")

def new_tweets():
    list_of_tweets_UN = query_tweets_from_user('UN', 10)
    list_of_tweets_CDC = query_tweets_from_user('CDCgov', 10)
    list_of_tweets_WHO = query_tweets_from_user('WHO', 10)

    file = open('data/WHO_twitter_new.json', 'w')
    json.dump(list_of_tweets_WHO, file, cls=JSONEncoder)
    file.close()
    file = open('data/UN_twitter_new.json', 'w')
    json.dump(list_of_tweets_UN, file, cls=JSONEncoder)
    file.close()
    file = open('data/CDC_twitter_new.json', 'w')
    json.dump(list_of_tweets_CDC, file, cls=JSONEncoder)
    file.close()

    twit_CDC = pd.read_json('data/CDC_twitter_output.json', encoding='utf-8')
    new_twit_CDC = pd.read_json('data/CDC_twitter_new.json', encoding='utf-8')
    new_twit_CDC = tweet_parsing(new_twit_CDC)
    twit_df = pd.concat([twit_CDC, new_twit_CDC], ignore_index=True)
    new_twit_CDC = twit_df.drop_duplicates('tweet_id')
    new_twit_CDC.to_json('data/CDC_twitter_output.json', orient='records')

    twit_UN = pd.read_json('data/UN_twitter_output.json', encoding='utf-8')
    new_twit_UN = pd.read_json('data/UN_twitter_new.json', encoding='utf-8')
    new_twit_UN = tweet_parsing(new_twit_UN)
    twit_df = pd.concat([twit_UN, new_twit_UN], ignore_index=True)
    new_twit_UN = twit_df.drop_duplicates('tweet_id')
    new_twit_UN.to_json('data/UN_twitter_output.json', orient='records')

    twit_WHO = pd.read_json('data/WHO_twitter_output.json', encoding='utf-8')
    new_twit_WHO = pd.read_json('data/WHO_twitter_new.json', encoding='utf-8')
    new_twit_WHO = tweet_parsing(new_twit_WHO)
    twit_df = pd.concat([twit_WHO, new_twit_WHO], ignore_index=True)
    new_twit_WHO = twit_df.drop_duplicates('tweet_id')
    new_twit_WHO.to_json('data/WHO_twitter_output.json', orient='records')

    twit_df = pd.concat([new_twit_CDC, new_twit_UN, new_twit_WHO], ignore_index=True)
    return twit_df

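# The three per-account blocks above repeat the same read/parse/concat/dedup/write steps.
# A minimal refactor sketch is shown below; `refresh_account` is a hypothetical helper
# introduced here for illustration only and is not part of the original code.
def refresh_account(new_path, output_path):
    # merge the freshly scraped tweets into the existing output file, dropping duplicates
    existing_df = pd.read_json(output_path, encoding='utf-8')
    new_df = tweet_parsing(pd.read_json(new_path, encoding='utf-8'))
    merged = pd.concat([existing_df, new_df], ignore_index=True).drop_duplicates('tweet_id')
    merged.to_json(output_path, orient='records')
    return merged
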
def write_tweets(username, filename):
    file = open(filename, "w")
    count = 0
    punct = punctuation + "”“"
    for tweet in query_tweets_from_user(username, 1000):
        line = str(tweet.text.encode('utf-8'))
        # drop escape sequences and pic.twitter links, then strip punctuation
        line = ' '.join(word for word in line.split(' ')
                        if (not word.startswith('\\') and not word.startswith('pictwitter')))
        line = line.translate(str.maketrans('', '', punct)).lower()
        if line.strip() != '':
            file.write(line[1:].strip() + "\n")
            count = count + 1
    print(str(count) + " actual tweets")
    file.close()

async def poll_implementation(self, bot, account, roomid, send_messages):
    try:
        tweets = query_tweets_from_user(account, limit=1)
        self.logger.info(
            f'Polling twitter account {account} - got {len(tweets)} tweets')
        for tweet in tweets:
            if tweet.tweet_id not in self.known_ids:
                if send_messages:
                    await bot.send_html(
                        bot.get_room_by_id(roomid),
                        f'<a href="https://twitter.com{tweet.tweet_url}">Twitter {account}</a>: {tweet.text}',
                        f'Twitter {account}: {tweet.text} - https://twitter.com{tweet.tweet_url}')
                self.known_ids.add(tweet.tweet_id)
    except Exception:
        self.logger.error('Polling twitter account failed:')
        traceback.print_exc(file=sys.stderr)

def start(self):
    try:
        user = self.config['user']
        name = self.config['name']
        if name == '' or name == user:
            name = user
        list_of_tweets = query_tweets_from_user(user, limit=10)
        for tweet in list_of_tweets:
            if tweet.username == user:
                name = tweet.fullname
                break
        cursor = self.con.cursor()
        cursor.execute('UPDATE twitter_blog SET name=? WHERE user=?', (name, user))
        self.con.commit()

        for tweet in get_tweets(user, pages=1000):
            update_time = int(time.mktime(tweet['time'].timetuple()))  # timestamp
            for photo in tweet['entries']['photos']:
                self.save_item(photo, name, update_time)
            for video in tweet['entries']['videos']:
                print('video', video)
                print('https://video.twimg.com/tweet_video/%s.mp4' % video['id'])
            # refresh the stored update time
            new_time = datetime.datetime.fromtimestamp(update_time).strftime("%Y-%m-%d %H:%M:%S")
            cursor = self.con.cursor()
            cursor.execute(
                'UPDATE twitter_blog SET update_time=? WHERE user=? AND update_time<?',
                (new_time, user, new_time))
            self.con.commit()
    except Exception as e:
        print('Exception occurred at line ' + str(e.__traceback__.tb_lineno), e)

def main():
    logger.info({'Hello world': '1'})
    try:
        config_path = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config.json'
        if not os.path.isfile(config_path):
            sys.exit(u'Config file config.json does not exist in the current path: %s' %
                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
        with open(config_path) as f:
            config = json.loads(f.read())
        validate_config(config)
        print('hello world')
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = get_user_list(user_id_list)
        for user in user_id_list:
            print(user)
            list_of_tweets = query_tweets_from_user(user, 10)
            outPutFileName = get_filepath(user, 'data') + '.csv'
            with open(outPutFileName, "w", encoding="utf-8") as output:
                writer = csv.writer(output)
                writer.writerow(["text_html", "img_url", "video_url", "links"])
                for t in list_of_tweets:
                    writer.writerow([t.text_html, t.img_urls, t.video_url, t.links])
                    for imgUrl in t.img_urls:
                        download_one_file(user, 'img', imgUrl)
                    for videoUrl in t.video_url:
                        download_one_file(user, 'video', videoUrl)
    except ValueError:
        print('config.json is not valid JSON')
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()  # fixed: the original called the non-existent print_exe()

from twitterscraper import query_tweets_from_user
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import AdaBoostClassifier

sid = SentimentIntensityAnalyzer()
model = AdaBoostClassifier()
features = []
labels = []

all_tweets = query_tweets_from_user("barackobama", limit=800)
training = all_tweets[:600]
testing = all_tweets[600:]

for tweet in training:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    features.append([
        int(tweetAnalysis["neg"] * 100),
        int(tweetAnalysis["pos"] * 100),
        int(tweetAnalysis["neu"] * 100),
        int(tweet.retweets / 1000)
    ])
    labels.append(int(tweet.likes / 1000))

model = model.fit(features, labels)

matches = 0
errors = 0
for test in testing:
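    # The snippet stops at the loop header above; the body below is a hedged completion,
    # assuming the intent was to compare the predicted likes bucket with the actual one
    # and tally matches versus errors.
    testAnalysis = sid.polarity_scores(test.text)
    prediction = model.predict([[
        int(testAnalysis["neg"] * 100),
        int(testAnalysis["pos"] * 100),
        int(testAnalysis["neu"] * 100),
        int(test.retweets / 1000)
    ]])[0]
    if prediction == int(test.likes / 1000):
        matches += 1
    else:
        errors += 1
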
from twitterscraper import query_tweets_from_user, query_tweets

# for tweet in query_tweets("Uro, Titan", 10):
#     print(tweet)

file = open("output.txt", "w")
tweets = query_tweets_from_user("@MTGGoldfish")
print(tweets, len(tweets))
for tweet in tweets:
    file.write(str(tweet.text.encode('utf-8')))
    print(str(tweet.text.encode('utf-8')), "\n\n")  # restored escaped newlines lost in the original
file.close()

from twitterscraper import query_tweets_from_user
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.neural_network import MLPClassifier

# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Build a simple neural network: a multi-layer perceptron classifier with 3 hidden layers of 10 neurons each
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10))

features = []
labels = []

# Read 40 tweets from Obama's account
all_tweets = query_tweets_from_user("barackobama", limit=40)

# Take the first 20 of Obama's tweets to train the network: for each tweet we have its
# sentiment and its number of likes, and we want the network to return the number of likes
# given the sentiment
training = all_tweets[:20]
# Keep the last 20 tweets for testing
testing = all_tweets[20:]

# For each tweet take its sentiment, an array of three numbers, e.g. [0.1, 0.2, 0.8]: the
# probability that the sentiment is negative, positive, or neutral. Append the sentiment to
# "features" and the number of likes to "labels"
for tweet in training:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    features.append([tweetAnalysis["neg"], tweetAnalysis["pos"], tweetAnalysis["neu"]])
    labels.append(tweet.likes)

# Train the network simply by calling "fit"
mlp = mlp.fit(features, labels)

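# The original snippet stops after training; a hedged evaluation sketch follows, assuming
# the held-out tweets are featurised the same way and compared against their actual likes.
for tweet in testing:
    tweetAnalysis = sid.polarity_scores(tweet.text)
    predicted_likes = mlp.predict(
        [[tweetAnalysis["neg"], tweetAnalysis["pos"], tweetAnalysis["neu"]]])[0]
    print(f'predicted likes: {predicted_likes}, actual likes: {tweet.likes}')
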
# Code for collecting by date
# enddate = dt.date.today() + dt.timedelta(days=1)  # Add one day, otherwise recent posts are missed because of the time difference.
# list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))

if __name__ == '__main__':
    '''
    0. Use query_tweets_from_user to fetch the author's 5 most recent posts.
    1. Check whether the URL already exists in the database.
    2. If it does, skip it and move on to the next tweet object.
    3. If it does not, open the Boannews page and crawl it.
    Drawback: hard on the database...
    '''
    lately_twitter_limit = 5  # number of most recent tweets to scrape
    list_of_tweets = query_tweets_from_user(
        "boannews", limit=lately_twitter_limit)  # fetch 5 posts from the Boannews account

    for tweet in list_of_tweets:
        # print(tweet.text)
        # print(tweet.timestamp)
        # print(tweet.text_html)
        tweet_text = tweet.text
        url = tweet_text[tweet_text.find(
            "http://www.boannews.com/"):]  # http://www.boannews.com/media/view.asp?idx=84215\xa0…
        boannews_url = url[:len(url) - 2]  # http://www.boannews.com/media/view.asp?idx=84215

        # Check for a duplicate URL in SQL
        sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
        val = (boannews_url)

# In[1]:
from twitterscraper import query_tweets
from twitterscraper import query_tweets_from_user
import datetime as dt
import pandas as pd
import nltk

begin_date = dt.date(2017, 1, 1)
end_date = dt.date(2020, 4, 25)
limit = 1000
# lang = 'english'
user = '******'

tweets = query_tweets_from_user(user=user)
df = pd.DataFrame(t.__dict__ for t in tweets)

# In[ ]:
# csv_data = df.to_csv(r'C:\Users\44759\Documents\Anaconda\twitter_shit\tweets.csv', index=None, header=True)

# In[2]:
df.head(60)
# df = df[]

# In[3]:
## for index, row in df.iterrows():

    file_data = OrderedDict()
    file_data['author'] = author
    file_data['post_create_datetime'] = date[8:] + ":00"  # 2015-01-01 12:10:00
    file_data['title'] = news_title
    file_data['content'] = content
    file_data['url'] = r2.url
    file_data['publisher'] = '보안뉴스'  # Boannews
    return file_data


if __name__ == '__main__':
    # Code for collecting by date
    # enddate = dt.date.today() + dt.timedelta(days=1)  # Add one day, otherwise recent posts are missed because of the time difference.
    # list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))
    list_of_tweets = query_tweets_from_user("boannews")  # collect all tweets from this user
    finish = len(list_of_tweets) - 1

    for num, tweet in enumerate(list_of_tweets):
        # print(tweet.text)
        # print(tweet.timestamp)
        # print(tweet.text_html)
        tweet_text = tweet.text
        url = tweet_text[tweet_text.find(
            "http://www.boannews.com/"):]  # http://www.boannews.com/media/view.asp?idx=84215\xa0…
        boannews_url = url[:len(url) - 2]  # http://www.boannews.com/media/view.asp?idx=84215

        session = HTMLSession()
        r2 = session.get(boannews_url)  # open a session to Boannews and fetch the article

    finally:
        connection.close()
    return result


if __name__ == '__main__':
    '''
    The user to crawl is passed in as a parameter
    https://twitter.com/kisa118
    the_boan
    '''
    user_list = ["kisa118", "softwarecatalog"]  # Twitter users to collect; "the_boan"

    for user in user_list:
        list_of_tweets = query_tweets_from_user(user=user)  # collect all tweets from this user

        for tweet in list_of_tweets:
            # Check for a duplicate URL in SQL
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (tweet.tweet_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 for true / 0 for false

            if is_exists:  # skip if the URL already exists
                continue
            else:  # otherwise insert the data
                dict_data = OrderedDict()
                dict_data['author'] = tweet.username
                dict_data['post_create_datetime'] = tweet.timestamp  # 2015-01-01 12:10:00
                dict_data['title'] = tweet.text[:255]

from twitterscraper import query_tweets_from_user
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import datetime as dt

begin_date = dt.date(2020, 4, 1)
end_date = dt.date(2020, 4, 2)
limit = 500
lang = "english"
user = "******"

# tweets = query_tweets_from_user("guardian", begindate=begin_date, enddate=end_date, limit=limit, lang=lang)
tweets = query_tweets_from_user("cnn", limit=limit)
df = pd.DataFrame(t.__dict__ for t in tweets)  # form a DataFrame of all the tweets
lengthrows = len(df.index)
# print("row len =", lengthrows)

# Drop all the columns we do not need, keeping only text, retweets and likes
df = df.drop(
    ['screen_name', 'username', 'user_id', 'tweet_id', 'tweet_url', 'timestamp',
     'timestamp_epochs', 'text_html', 'hashtags', 'has_media', 'img_urls', 'video_url',
     'replies', 'is_replied', 'is_reply_to', 'parent_tweet_id', 'reply_to_users'],
    axis=1)

for i in range(0, lengthrows):
import time

import pandas as pd
import twitterscraper

print('Web Scraping .....')
time.sleep(1)
print('@joe_exotic')
time.sleep(1)
print('...........')
time.sleep(1)
print('----------------')
time.sleep(1)
print('The Tiger King')
time.sleep(5)

# Scrape tweets
handle = 'joe_exotic'
out = twitterscraper.query_tweets_from_user(handle)

# Create empty storage
df = pd.DataFrame()
Date = []
Text = []
Likes = []
Retweets = []

# Read data into respective columns
for tweet in out:
    Date.append(tweet.timestamp)
    Text.append(tweet.text)
    Likes.append(tweet.likes)
    Retweets.append(tweet.retweets)

# Turn into csv
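# Hedged completion of the step announced by the comment above: assemble the collected
# lists into the DataFrame and write it out. The output filename is an assumption.
df['Date'] = Date
df['Text'] = Text
df['Likes'] = Likes
df['Retweets'] = Retweets
df.to_csv('joe_exotic_tweets.csv', index=False)
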
# import libraries
from twitterscraper import query_tweets_from_user
import pandas as pd
import extract_text_from_url as et
import os

# number of tweets
limit = 5

# user tweets
user = input('Enter twitter username without @: ')
user = user.lower()

# all tweets within limit
tweets = query_tweets_from_user(user, limit=limit)

# convert twitter object into DataFrame
df = pd.DataFrame(tweet.__dict__ for tweet in tweets)

# clean tweets and remove retweets
df['screen_name'] = df['screen_name'].apply(lambda x: x.lower())
df = df[df['screen_name'] == user]

# normalise user text (currently strips non-ASCII characters) --------------------- Work in progress
df['text'] = df['text'].apply(
    lambda x: x.encode('ascii', 'ignore').decode('ascii'))

# convert user text to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

def scrape_tweets_to_db(cursor, user, limit=None):
    for tweet in query_tweets_from_user(user, limit=limit):
        cursor.execute(
            '''INSERT INTO tweets VALUES(?,?,?,?)''',
            (None, tweet.text, remove_url(tweet.text), tweet.screen_name))

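# The INSERT above assumes a four-column `tweets` table whose first column is an
# auto-generated id (hence the leading None). The schema is not shown in the original;
# a minimal SQLite sketch consistent with that call might look like this:
import sqlite3

def create_tweets_table(connection: sqlite3.Connection):
    # id is assigned automatically when None is inserted for it
    connection.execute(
        '''CREATE TABLE IF NOT EXISTS tweets(
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               text TEXT,
               clean_text TEXT,
               screen_name TEXT)''')
    connection.commit()
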
# Code for collecting by date
# enddate = dt.date.today() + dt.timedelta(days=1)  # Add one day, otherwise recent posts are missed because of the time difference.
# list_of_tweets2 = query_tweets("@boannews", begindate=dt.date(2018, 1, 1), enddate=dt.date(2019, 7, 29))

if __name__ == '__main__':
    '''
    0. Use query_tweets_from_user to fetch the author's recent posts.
    1. Check whether the URL already exists in the database.
    2. If it does, skip it and move on to the next tweet object.
    3. If it does not, open the page and crawl it.
    Drawback: hard on the database...
    '''
    list_of_tweets = query_tweets_from_user(
        "AhnLab_SecuInfo")  # fetch posts from the AhnLab_SecuInfo account

    p = re.compile('[a-z]')
    for tweet in list_of_tweets:
        tweet_text = tweet.text
        url = tweet_text[tweet_text.find("https://asec.ahnlab.com/"):]
        if p.match(url):
            ahnlab_url = url[:len(url) - 1]

            # Check for a duplicate URL in SQL
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (ahnlab_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 for true / 0 for false

        connection.close()
    return result


if __name__ == '__main__':
    '''
    The user to crawl is passed in as a parameter
    https://twitter.com/kisa118
    the_boan
    '''
    user_list = ["kisa118", "softwarecatalog"]  # Twitter users to collect; "the_boan"

    for user in user_list:
        lately_twitter_limit = 5
        list_of_tweets = query_tweets_from_user(
            user=user, limit=lately_twitter_limit)  # collect the most recent tweets from this user

        for tweet in list_of_tweets:
            # Check for a duplicate URL in SQL
            sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
            val = (tweet.tweet_url)
            is_exists = select_mydb(sql, val)[0][0]  # returns 1 for true / 0 for false

            if is_exists:  # skip if the URL already exists
                continue
            else:  # otherwise insert the data
                dict_data = OrderedDict()
                dict_data['author'] = tweet.username
                dict_data['post_create_datetime'] = tweet.timestamp  # 2015-01-01 12:10:00
                dict_data['title'] = tweet.text[:255]

import re
from twitterscraper import query_tweets_from_user
from ekphrasis.classes.segmenter import Segmenter

# Two libraries are used: twitterscraper to scrape Twitter and ekphrasis for sentiment
# analysis; here ekphrasis is used specifically for hashtag segmentation.


# Clean a tweet: remove special characters, hashtags and URLs
def clean_tweet(tweet):
    tweet = re.sub(r"pic.\S+", "", tweet)
    return ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())


# Query the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

hashtagArray = []  # moved above the loop; the original defined it afterwards, which raised a NameError

# Print the cleaned tweets
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if len(tweetHashtag) != 0:  # fixed: the original compared the __len__ method itself to 0
        hashtagArray.extend(tweetHashtag)
    print("\n")

# The corpus provides the statistics used to segment the hashtags; here the Twitter corpus is used
seg_tw = Segmenter(corpus="twitter")

print("Hashtags Segmentation:\n")

from twitterscraper import query_tweets_from_user
import pandas as pd

keyword = "@FinancialTimes"
raw_tweets = query_tweets_from_user(keyword, limit=100000000)

text, timestamp, likes, retweets, replies = [], [], [], [], []
for tweet in raw_tweets:
    text.append(tweet.text)
    timestamp.append(tweet.timestamp)
    likes.append(tweet.likes)
    retweets.append(tweet.retweets)
    replies.append(tweet.replies)

tweets = pd.DataFrame({"text": text, "timestamp": timestamp, "likes": likes,
                       "retweets": retweets, "replies": replies})

# Don't need the exact h-m-s, cast it to date object.
tweets['timestamp'] = tweets['timestamp'].apply(lambda x: str(x.date()))
print(tweets.timestamp.unique())

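# A small follow-on sketch (not part of the original): with timestamps reduced to dates,
# per-day engagement can be summarised with a groupby.
daily = tweets.groupby('timestamp')[['likes', 'retweets', 'replies']].sum()
print(daily.head())
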
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 30 01:46:37 2020

@author: Arthur Chu
"""
from twitterscraper import query_tweets_from_user
import pandas as pd

tweets = []
twitter_handle_file_path = r"C:\Users\Arthur Chu\Desktop\twitter_politics_ML\raw_data\congress_twitter_handles.csv"
twitter_handles = pd.read_csv(twitter_handle_file_path, encoding="ISO-8859-1")
twitter_handles.dropna(how="any", inplace=True)
handles = twitter_handles['Twitter Handle'].tolist()

for politician in handles:
    tweets += query_tweets_from_user(politician, limit=1000)

df = pd.DataFrame(t.__dict__ for t in tweets)
df.to_csv(
    r"C:\Users\Arthur Chu\Desktop\twitter_politics_ML\raw_data\all_tweets.csv",
    index=False)

# Twitter handles mapped to countries
feeds = {
    'MinofHealthUG': 'Uganda',
    'MalawiGovt': 'Malawi',
    'mohgovgh': 'Ghana',
    'OMSMocambique': 'Mozambique',
    'integrateglobal': 'Togo',
    'RwandaHealth': 'Rwanda'
}

cols = [
    'Country', 'Twitter Handle', 'Timestamp', 'Content',
    'Likelyhood of Update', 'URL'
]
df = pd.DataFrame(columns=cols)

# Pull tweets
for twitter_handle in feeds:
    for tweet in query_tweets_from_user(twitter_handle, 10):
        translator = Translator()
        if tweet.screen_name.lower() == twitter_handle.lower():
            translated_tweet = translator.translate(tweet.text, dest='en').text
            if test_relevence_covid(translated_tweet):
                df = df.append(
                    pd.DataFrame([[
                        feeds[twitter_handle], twitter_handle, tweet.timestamp,
                        translated_tweet,
                        est_update(translated_tweet),
                        'https://twitter.com%s' % tweet.tweet_url
                    ]], columns=cols))

values = df.sort_values(by='Timestamp',

import plotly
import plotly.express as px
import plotly.graph_objs as go

# %%
# ------------------
# Got this method of pulling tweets from here:
# ------------------
# https://medium.com/@kevin.a.crystal/scraping-twitter-with-tweetscraper-and-python-ea783b40443b
# https://github.com/jenrhill/Power_Outage_Identification/blob/master/code/1_Data_Collection_and_EDA.ipynb
# https://www.youtube.com/watch?v=zF_Q2v_9zKY

user = '******'
limit = 10000
tweets = ts.query_tweets_from_user(user=user, limit=limit)

# %%
class TweetAnalyzer():
    """
    Functionality for analyzing and categorizing content from tweets.
    """

    # clean tweets
    def clean_tweet(self, tweet):
        return ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    # create a sentiment score using TextBlob