def twintSearchRetweet(tweet_text):
    twint.output.tweets_list = []
    c = twint.Config()
    c.Store_object = True
    c.Hide_output = True
    # Native (official) retweets only
    c.Native_retweets = True
    c.Limit = 4000
    c.Search = tweet_text
    twint.run.Search(c)
    tweets = twint.output.tweets_list

    for tweet in tweets:
        # Look up the retweeting user's profile
        twint.output.users_list = []
        c = twint.Config()
        c.Store_object = True
        c.Hide_output = True
        c.Username = tweet.user_rt
        twint.run.Lookup(c)

        user = None
        icon_url = None
        bio = None
        following = None
        followers = None
        display_name = None
        if len(twint.output.users_list) > 0:
            user = twint.output.users_list[0]
            icon_url = user.avatar
            bio = user.bio
            following = user.following
            followers = user.followers
            display_name = user.name

        params = {
            'tweet_id_str': tweet.id_str,
            'user_id_str': tweet.user_id_str,
            'screen_name': tweet.username,
            'display_name': tweet.name,
            'tweeted_at': datetime.datetime.fromtimestamp(tweet.datetime / 1000.0),
            'rt_id_str': tweet.retweet_id,
            'rt_user_id_str': tweet.user_rt_id,
            'rt_icon_url': icon_url,
            'rt_user_bio': bio,
            'rt_following': following,
            'rt_followers': followers,
            'rt_screen_name': tweet.user_rt,
            'rt_display_name': display_name,
            'retweeted_at': datetime.datetime.fromtimestamp(
                ((int(tweet.retweet_id) >> 22) + 1288834974657) / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
        }
        setOrUpdateRetweets(params)
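
# A minimal standalone sketch of the timestamp trick used for 'retweeted_at' above:
# the upper bits of a Twitter snowflake ID encode milliseconds since the Twitter
# epoch (1288834974657 ms after the Unix epoch). The helper name is ours, not part
# of the snippet above.
import datetime

def snowflake_to_datetime(snowflake_id: int) -> datetime.datetime:
    """Recover the UTC creation time encoded in a Twitter snowflake ID."""
    TWITTER_EPOCH_MS = 1288834974657
    ms_since_unix_epoch = (int(snowflake_id) >> 22) + TWITTER_EPOCH_MS
    return datetime.datetime.utcfromtimestamp(ms_since_unix_epoch / 1000.0)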
def get_replies(tweet_id=1169681044573962240, option="twint", interval=None, sources=True, testmode=False):
    if option == "twint":
        # get tweet date and time
        start_status = api.get_status(tweet_id)
        start_time = start_status.created_at
        author = start_status.user.screen_name

        profile = []
        b = twint.Config()
        b.Username = author
        b.Since = str(start_time.date())
        b.Store_object = True
        b.Store_object_tweets_list = profile
        b.Hide_output = True
        twint.run.Profile(b)
        for entry in profile:
            if entry.id == tweet_id:
                print("found")
                print(vars(entry))
                con_id = entry.conversation_id

        if interval is None:
            T = datetime.timedelta(7, 0, 0, 0, 0, 0)
        end_time = start_time + T

        # set up a search
        repl_all = []
        query_replies = "to:@" + author
        c = twint.Config()
        # c.Search = query_replies
        c.Replies = True
        c.To = author
        c.Since = str(start_time.date())
        c.Until = str(end_time.date())
        c.Store_object = True
        c.Store_object_tweets_list = repl_all
        if testmode:
            c.Limit = 1
        c.User_full = True
        c.Hide_output = True
        twint.run.Search(c)

        A_trial = api.get_status(1170528218702471168)
        replies = []
        for status in repl_all:
            if status.conversation_id == con_id:
                replies.append(status)
        status_list = replies

    if sources:
        status_list_s = sources_find(status_list, c, con_id)
    else:
        status_list_s = status_list
    return (status_list_s, c)
def get_linktree_link(tweet):
    words = tweet.split(" ")
    for word in words:
        if word[:len('https://linktr.ee/')] == 'https://linktr.ee/':
            return word
    return ""


if __name__ == '__main__':
    until, since = get_time()

    client = twint.Config()
    client.Search = 'linktr.ee'
    client.Pandas = True
    client.Since = since
    client.Until = until
    client.Hide_output = True
    twint.run.Search(client)

    Tweets_df = twint.storage.panda.Tweets_df
    Tweets_df = Tweets_df.drop(['cashtags', 'search', 'translate', 'trans_src', 'trans_dest'], axis=1)
    Tweets_df['linktree_link'] = Tweets_df.apply(lambda x: get_linktree_link(x['tweet']),
                                                 result_type='expand', axis=1)
    Tweets_df = Tweets_df[Tweets_df['linktree_link'] != ""]
    Tweets_df = Tweets_df.drop_duplicates(subset='linktree_link')
    filename = get_file_name(until)
def download_account_tweets(username=None, limit=None, include_replies=False,
                            include_links=False, strip_usertags=False,
                            strip_hashtags=False, sentiment=0,
                            text_format="simple", api=None, w=None):
    """
    Download public Tweets from one Twitter account into a format suitable
    for training with AI text generation tools.

    :param username: Twitter @ username to gather tweets or .txt file name with multiple usernames
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param include_links: Whether to include tweets with links.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    :param sentiment: Number of sentiment categories to include in text.
    :param text_format: Type of output format for the tweet.
    :param api: Open Twitter API reference
    :param w: Open file reference to write output
    """

    print("Retrieving tweets for @{}...".format(username))

    # If a limit is specified, validate that it is a multiple of 40
    # and set the total number of tweets for the progress bar
    if limit:
        assert limit % 40 == 0, "`limit` must be a multiple of 40."
        pbar = tqdm(range(limit), desc="Oldest Tweet")
    # If no limit is specified, don't specify a total number of tweets
    else:
        pbar = tqdm()

    # Create an empty file to store the pagination id
    with open(".temp", "w", encoding="utf-8") as f:
        f.write(str(-1))

    # Set the loop's iterator
    i = 0

    # Iterate forever; break on reaching the limit or running out of tweets
    while True:

        # If a limit is specified, break once it's reached
        if limit:
            if i >= (limit // 40):
                break

        # Create an empty list to store retrieved tweet objects
        tweet_data = []

        # twint may fail; give it a few tries to return tweets
        for _ in range(0, 4):
            if len(tweet_data) == 0:
                c = twint.Config()
                c.Store_object = True
                c.Hide_output = True
                c.Username = username
                c.Limit = 40
                c.Resume = ".temp"
                c.Store_object_tweets_list = tweet_data

                twint.run.Search(c)

                # If it fails, sleep before retry.
                if len(tweet_data) == 0:
                    sleep(1.0)
            else:
                continue

        # If there are still no tweets after the retries, stop downloading
        if len(tweet_data) == 0:
            break

        # Do not filter out replies
        if include_replies:
            for tweet in tweet_data:
                tweet_text = format_text(tweet, strip_usertags, strip_hashtags,
                                         sentiment, text_format, api)
                # Do not write the tweet to file if tweet_text is empty
                if tweet_text != "":
                    # Write tweet text to file
                    w.writerow([tweet_text])
        # Filter out replies
        else:
            for tweet in tweet_data:
                if not is_reply(tweet):
                    tweet_text = format_text(tweet, strip_usertags, strip_hashtags,
                                             sentiment, text_format, api)
                    # Do not write the tweet to file if tweet_text is empty
                    if tweet_text != "":
                        # Write tweet text to file
                        w.writerow([tweet_text])

        pbar.update(40)

        oldest_tweet = datetime.utcfromtimestamp(
            tweet_data[-1].datetime / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
        pbar.set_description("Oldest Tweet: " + oldest_tweet)

        # Increase the loop's iterator
        i = i + 1

    pbar.close()
    os.remove(".temp")

    # Return 0
    return 0
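
# A hedged usage sketch for download_account_tweets() above: it expects an open
# csv writer `w` and, optionally, a Twitter API handle `api`. The account name
# and file name here are placeholders, and `api=None` assumes sentiment scoring
# is left at its default of 0.
import csv

with open("example_tweets.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["tweets"])  # header row, as the companion downloaders write
    download_account_tweets(username="some_account", limit=80,  # multiple of 40
                            api=None, w=writer)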
def download_tweets(
    username=None,
    limit=None,
    include_replies=False,
    include_links=False,
    strip_usertags=False,
    strip_hashtags=False,
):
    """Download public Tweets from a given Twitter account into a format
    suitable for training with AI text generation tools.

    :param username: Twitter @ username to gather tweets.
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    :param include_links: Whether to include tweets with links.
    :return tweets: List of tweets from the Twitter account
    """

    # If a limit is specified, validate that it is a multiple of 20
    if limit:
        assert limit % 20 == 0, "`limit` must be a multiple of 20."
    # If no limit is specified, estimate the total number of tweets from the profile.
    else:
        c_lookup = twint.Config()
        c_lookup.Username = username
        c_lookup.Store_object = True
        c_lookup.Hide_output = True
        if include_links is True:
            c_lookup.Links = "include"
        else:
            c_lookup.Links = "exclude"

        twint.run.Lookup(c_lookup)
        limit = twint.output.users_list[-1].tweets

    pattern = r"http\S+|pic\.\S+|\xa0|…"

    if strip_usertags:
        pattern += r"|@[a-zA-Z0-9_]+"

    if strip_hashtags:
        pattern += r"|#[a-zA-Z0-9_]+"

    # Create an empty file to store the pagination id
    with open(".temp", "w", encoding="utf-8") as f:
        f.write(str(-1))

    print("Retrieving tweets for @{}...".format(username))

    with open("{}_tweets.csv".format(username), "w", encoding="utf8") as f:
        w = csv.writer(f)
        w.writerow(["tweets"])  # gpt-2-simple expects a CSV header by default

        pbar = tqdm(range(limit), desc="Oldest Tweet")

        for i in range((limit // 20) - 1):
            tweet_data = []

            # twint may fail; give it a few tries to return tweets
            for _ in range(0, 4):
                if len(tweet_data) == 0:
                    c = twint.Config()
                    c.Store_object = True
                    c.Hide_output = True
                    c.Username = username
                    c.Limit = 40
                    c.Resume = ".temp"
                    c.Store_object_tweets_list = tweet_data

                    twint.run.Search(c)

                    # If it fails, sleep before retry.
                    if len(tweet_data) == 0:
                        sleep(1.0)
                else:
                    continue

            # If there are still no tweets after the retries, we're done
            if len(tweet_data) == 0:
                break

            if not include_replies:
                tweets = [
                    re.sub(pattern, "", tweet.tweet).strip()
                    for tweet in tweet_data
                    if not is_reply(tweet)
                ]

                # On older tweets, if the cleaned tweet starts with an "@",
                # it is a de-facto reply.
                for tweet in tweets:
                    if tweet != "" and not tweet.startswith("@"):
                        w.writerow([tweet])
            else:
                tweets = [
                    re.sub(pattern, "", tweet.tweet).strip() for tweet in tweet_data
                ]

                for tweet in tweets:
                    if tweet != "":
                        w.writerow([tweet])

            if i > 0:
                pbar.update(20)
            else:
                pbar.update(40)

            oldest_tweet = datetime.utcfromtimestamp(
                tweet_data[-1].datetime / 1000.0).strftime("%Y-%m-%d %H:%M:%S")
            pbar.set_description("Oldest Tweet: " + oldest_tweet)

    pbar.close()
    os.remove(".temp")
def Search(
    query: Text,
    from_date: datetime.datetime = None,
    to_date: datetime.datetime = None,
    number_of_results: int = 100,
) -> pandas.DataFrame:
    """Search tweets.

    Args:
        query: the search query.
        from_date: search from this datetime.
        to_date: search till this datetime.
        number_of_results: number of results to return.

    Returns:
        A dataframe of tweets. For columns, reference:
        {
            'id': 1371248526085226496,
            'conversation_id': '1371248036563795969',
            'created_at': '2021-03-14 23:54:59 UTC',
            'date': '2021-03-14',
            'time': '23:54:59',
            'timezone': '+0000',
            'user_id': 1233956153656332291,
            'username': '******',
            'name': 'funy guy sbungbob',
            'place': '',
            'tweet': '@Zer0Priv And stock up on Bitcoin and GameStop stocks',
            'language': 'en',
            'mentions': [],
            'urls': [],
            'photos': [],
            'replies_count': 0,
            'retweets_count': 0,
            'likes_count': 2,
            'hashtags': [],
            'cashtags': [],
            'link': 'https://twitter.com/je4ia/status/1371248526085226496',
            'retweet': False,
            'quote_url': '',
            'video': 0,
            'thumbnail': '',
            'near': '',
            'geo': '',
            'source': '',
            'user_rt_id': '',
            'user_rt': '',
            'retweet_id': '',
            'reply_to': [{'screen_name': 'Zer0Priv', 'name': 'Zer0', 'id': '1256485417744031747'}],
            'retweet_date': '',
            'translate': '',
            'trans_src': '',
            'trans_dest': '',
        },
    """
    nest_asyncio.apply()
    c = twint.Config()
    c.Search = query
    if from_date:
        c.Since = from_date.strftime('%Y-%m-%d %H:%M:%S')
    if to_date:
        c.Until = to_date.strftime('%Y-%m-%d %H:%M:%S')
    c.Limit = number_of_results
    c.Pandas = True
    c.Hide_output = True
    twint.run.Search(c)
    return twint.storage.panda.Tweets_df
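
# Example call for Search() above (a sketch): the query and date range are
# placeholders; the columns used below ('date', 'username', 'tweet') come from
# the docstring's reference record.
import datetime

df = Search(
    query="bitcoin",
    from_date=datetime.datetime(2021, 3, 1),
    to_date=datetime.datetime(2021, 3, 14),
    number_of_results=200,
)
print(df[["date", "username", "tweet"]].head())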
def busqueda2(termino):
    clf = SentimentClassifier()  # added this <----
    # print('-------')
    # print(clf.predict('Los perros son bonitos'))
    # print('-------')
    limite = 850
    asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())  # Works around a threading problem

    c = twint.Config()
    c.Search = termino
    c.Lang = 'es'
    c.Popular_tweets = True
    c.Limit = limite
    c.Store_csv = True
    c.Output = "tweets.csv"
    print("inicio Search (" + termino + ")>")

    # Keep 'twint.run.Search(c)' from printing values to the console
    sys.stdout = open(os.devnull, "w")
    twint.run.Search(c)
    sys.stdout = sys.__stdout__
    print("termino Search >")

    # Read only the 'tweet' column from "tweets.csv"
    tweets = pd.read_csv("tweets.csv", sep=',', usecols=['tweet'], squeeze=True)
    data = pd.read_csv("tweets.csv", sep=',',
                       usecols=['id', 'username', 'name', 'link', 'tweet', 'likes_count', 'date'],
                       squeeze=True).values

    data_dicc = []
    data_dicc_neg = []
    data_dicc_pos = []
    data_dicc_neu = []
    text_ant = 'asd'
    i = 0
    while i < len(data):
        dicc = {}
        if (text_ant != data[i][3]):
            text_ant = data[i][3]
            dicc['name'] = data[i][1]
            dicc['username'] = data[i][2]
            dicc['tweet'] = data[i][3]
            dicc['link'] = data[i][4]
            dicc['like'] = data[i][5]
            dicc['date'] = data[i][6]
            polaridad = clf.predict(tweets[i])
            dicc['polaridad'] = polaridad
            if polaridad <= 0.4:
                dicc['sentimiento'] = 'negativo'
                data_dicc_neg.append(dicc)
            elif polaridad <= 0.6:
                dicc['sentimiento'] = 'neutro'
                data_dicc_neu.append(dicc)
            else:
                dicc['sentimiento'] = 'positivo'
                data_dicc_pos.append(dicc)
            data_dicc.append(dicc)
        i = i + 1

    data_dicc = sorted(data_dicc, key=lambda i: i['like'], reverse=True)
    data = data_dicc
    data = data[0:50]

    data_dicc_neg = sorted(data_dicc_neg, key=lambda i: i['like'], reverse=True)
    data_neg = data_dicc_neg
    data_neg = data_neg[0:10]

    data_dicc_neu = sorted(data_dicc_neu, key=lambda i: i['like'], reverse=True)
    data_neu = data_dicc_neu
    data_neu = data_neu[0:10]

    data_dicc_pos = sorted(data_dicc_pos, key=lambda i: i['like'], reverse=True)
    data_pos = data_dicc_pos
    data_pos = data_pos[0:10]

    # List containing the text of each retrieved tweet
    listaT = tweets.values
    print(str(len(listaT)) + " tweets ")

    # Delete the csv file
    os.remove('tweets.csv')

    return listaT, data, data_neg, data_neu, data_pos
def __init__(self):
    self.c = twint.Config()
def get_replies(conversation_id, screen_name, created_at):
    # replies, likes, retweets
    replies = twint.Config()
    logging.info("screen_name {}, Created at {}, Conversation ID {}".format(screen_name, created_at, conversation_id))
    print("screen_name {}, Created at {}, Conversation ID {}".format(screen_name, created_at, conversation_id))
    replies.Retries_count = 2
    replies.Store_object = True
    replies.Store_object_tweets_list = []
    replies.Search = "(to:{})".format(screen_name)
    replies.Limit = 1000
    replies.Hide_output = True

    max_try = 2
    try_times = 0
    time_delta = 1
    df_list = []
    while try_times < max_try:
        time.sleep(1)
        if created_at:
            search_end = created_at + timedelta(time_delta)
            search_end_str = search_end.strftime("%Y-%m-%d")
            created_at_str = created_at.strftime("%Y-%m-%d")
            replies.Until = search_end_str
            replies.Since = created_at_str
        twint.run.Search(replies)

        df = pd.DataFrame([vars(i) for i in replies.Store_object_tweets_list])
        replies.search_tweet_list = []
        df = df.rename(columns={"figure-conversation-id": "conversation_id",
                                "date": "created_at",
                                "figure-item-id": "id"})
        df.drop_duplicates(inplace=True, subset=['id_str'])
        if len(df) == 0:
            time_delta = 2 * time_delta
            try_times += 1
            continue

        df['username'] = df['username'].apply(lambda x: x.replace("@", ""))
        df['nreplies'] = df['replies_count']
        df['nretweets'] = df['retweets_count']
        return_replies_df = []
        print(len(df))
        if len(df) > 0:
            df['id'] = df['id'].apply(lambda x: int(x))
            return_replies_df = df[df['conversation_id'].apply(lambda x: str(x) == str(conversation_id))]
            # return_replies_df = df
        logging.info("There are {} replies for {}, {}".format(len(return_replies_df), conversation_id, screen_name))
        df_list.append(df)
        if len(return_replies_df) < 10:
            time_delta = 2 * time_delta
            try_times += 1
        else:
            break

    if len(df_list) == 0:
        return_replies_list = []
        unrelated_replies = []
    else:
        df = pd.concat(df_list)
        df.drop_duplicates(inplace=True, subset=['id'])
        df = df.astype({"id": "int64"})
        return_replies_df = df[df['conversation_id'].apply(lambda x: str(x) == str(conversation_id))]
        return_replies_list = return_replies_df.to_dict(orient="records")
        unrelated_replies = df.to_dict(orient="records")
    print("There are {} related tweets".format(len(return_replies_list)))
    return return_replies_list, unrelated_replies
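
# Hypothetical call for get_replies() above: created_at must be a datetime
# (it is formatted with strftime); the conversation id and screen name here
# are placeholders.
from datetime import datetime

related, all_replies = get_replies(
    conversation_id=1169681044573962240,
    screen_name="some_account",
    created_at=datetime(2019, 9, 5),
)
print(len(related), "related replies,", len(all_replies), "replies collected overall")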
def __init__(self, resume_file):
    self.config = twint.Config()
    self.resume = resume_file + ".txt"
def get_user_followers(username, search):
    c = twint.Config()
    c.Username = username
    save_result(c, username + "user_followers")
    twint.run.Followers(c)
    get_user_following(username, search)
def _get_twint_config(limit):
    c = twint.Config()
    c.Limit = limit
    c.Hide_output = True
    return c
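
# Hypothetical caller for _get_twint_config() above, reusing the same twint
# attributes seen elsewhere in this file; Store_object is needed if results
# should be collected in twint.output.tweets_list.
c = _get_twint_config(limit=100)
c.Search = "linktr.ee"
c.Store_object = True
twint.run.Search(c)
print(len(twint.output.tweets_list), "tweets collected")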
def busqueda_por_fecha(termino, desde, hasta, hastaFinal, solo_busqueda, limite=0):
    clf = SentimentClassifier()

    if solo_busqueda == False:
        dAño, dMes, dDia = desde.split('/')
        hAño, hMes, hDia = hasta.split('/')
        inicio = date(int(dAño), int(dMes), int(dDia))
        final = date(int(hAño), int(hMes), int(hDia))
        delta = timedelta(days=1)
        total = 1000  # limit (a minimum must be set or it would take too long)
        diasTotales = final - inicio
        if diasTotales.days > 190:
            print("diasTotales(" + str(diasTotales.days) + ") > 190")
            return 0
        if diasTotales.days > 50:
            intervalo = (diasTotales.days - 50) / 44.33
        else:
            intervalo = 0
        limite = (total * (intervalo + 1) / diasTotales.days) * 1000
        deltaIntervalo = timedelta(days=round(intervalo))
        actual = inicio
        print("limite: " + str(limite) + "| deltaIntervalo: " + str(deltaIntervalo))
        while actual < final:
            busqueda_por_fecha(termino, actual, actual + delta, hastaFinal, True, limite)
            actual += deltaIntervalo + delta
            if actual == final:
                actual -= delta
        busqueda_por_fecha(termino, actual, actual + delta, hastaFinal, True, limite)

    elif solo_busqueda == True:
        asyncio.set_event_loop_policy(AnyThreadEventLoopPolicy())
        c = twint.Config()
        c.Search = termino
        c.Lang = 'es'
        c.Popular_tweets = True
        c.Limit = limite
        c.Store_csv = True
        c.Output = "tweets_fecha.csv"
        c.Until = str(hasta)
        c.Since = str(desde)
        sys.stdout = open(os.devnull, "w")
        twint.run.Search(c)
        sys.stdout = sys.__stdout__
        print(str(desde) + " > " + str(hasta))
        return 0

    tweetsFecha = pd.read_csv("tweets_fecha.csv", sep=',', usecols=['tweet'], squeeze=True)
    listaTFecha = tweetsFecha.values
    Fechas = pd.read_csv("tweets_fecha.csv", sep=',', usecols=['date'], squeeze=True)
    listaFechas = Fechas.values
    data = pd.read_csv("tweets_fecha.csv", sep=',',
                       usecols=['id', 'username', 'name', 'link', 'tweet', 'likes_count', 'date'],
                       squeeze=True).values

    data_dicc = []
    data_dicc_neg = []
    data_dicc_pos = []
    data_dicc_neu = []
    text_ant = 'asd'
    i = 0
    while i < len(data):
        dicc = {}
        if (text_ant != data[i][3]):
            text_ant = data[i][3]
            dicc['name'] = data[i][1]
            dicc['username'] = data[i][2]
            dicc['tweet'] = data[i][3]
            dicc['link'] = data[i][4]
            dicc['like'] = data[i][5]
            dicc['date'] = data[i][6]
            polaridad = clf.predict(tweetsFecha[i])
            dicc['polaridad'] = polaridad
            if polaridad <= 0.4:
                dicc['sentimiento'] = 'negativo'
                data_dicc_neg.append(dicc)
            elif polaridad <= 0.6:
                dicc['sentimiento'] = 'neutro'
                data_dicc_neu.append(dicc)
            else:
                dicc['sentimiento'] = 'positivo'
                data_dicc_pos.append(dicc)
            data_dicc.append(dicc)
        i = i + 1

    data_dicc = sorted(data_dicc, key=lambda i: i['like'], reverse=True)
    data = data_dicc
    data = data[0:50]

    data_dicc_neg = sorted(data_dicc_neg, key=lambda i: i['like'], reverse=True)
    data_neg = data_dicc_neg
    data_neg = data_neg[0:10]

    data_dicc_neu = sorted(data_dicc_neu, key=lambda i: i['like'], reverse=True)
    data_neu = data_dicc_neu
    data_neu = data_neu[0:10]

    data_dicc_pos = sorted(data_dicc_pos, key=lambda i: i['like'], reverse=True)
    data_pos = data_dicc_pos
    data_pos = data_pos[0:10]

    os.remove('tweets_fecha.csv')
    print(str(len(listaTFecha)) + " tweets ")
    return [listaTFecha, listaFechas, data, data_neg, data_neu, data_pos]
def scraping():
    import twint
    c = twint.Config()
    c.Search = input("Enter sentence::")
    c.Limit = 100
    c.Email = True
    c.Store_csv = True
    c.Output = "none"
    twint.run.Search(c)

    import pandas as pd
    df = pd.read_csv("none/tweets.csv")
    print(df.columns)
    # question.isnull().any()
    # question.columns

    import datetime as dt
    import nltk
    start = dt.datetime.now()
    sno = nltk.stem.SnowballStemmer("english")
    i = 0
    str1 = ""
    final_string = []
    all_positive_words = []
    all_negative_words = []
    s = ""

    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))
    excluding = [
        'against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn',
        "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
        'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
        "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shouldn',
        "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
        'wouldn', "wouldn't"
    ]
    stop = [words for words in stop if words not in excluding]

    import re

    def cleanhtml(sentence):
        cleanr = re.compile('<.*>')
        cleantext = re.sub(cleanr, '', sentence)
        return cleantext

    def cleanpunc(sentence):
        cleaned = re.sub(r'[?|!|\'|"|#]', r' ', sentence)  # see \' and combination
        cleaned = re.sub(r'[.|,|)|/|\|(]', r' ', cleaned)
        return cleaned

    print(sno.stem("tasty"))  # checking the root word of "tasty"
    print(sno)

    def preprocess1(X):
        final_string = []
        for sent in X:
            filterd_sentence = []
            sent = str(sent)
            sent = cleanhtml(sent)
            # print(sent)
            for w in sent.split():
                for cleaned_words in cleanpunc(w).split():
                    if ((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                        if (cleaned_words.lower() not in stop):
                            s = (sno.stem(cleaned_words.lower())).encode("utf8")
                            filterd_sentence.append(s)
                        else:
                            continue
                    else:
                        continue
            str1 = b" ".join(filterd_sentence)
            final_string.append(str1)
            # i=i+1
        return final_string

    def preprocess(X):
        final_string = []
        X = [X]
        for sent in X:
            filterd_sentence = []
            sent = str(sent)
            sent = cleanhtml(sent)
            # print(sent)
            for w in sent.split():
                for cleaned_words in cleanpunc(w).split():
                    if ((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                        if (cleaned_words.lower() not in stop):
                            s = (sno.stem(cleaned_words.lower())).encode("utf8")
                            filterd_sentence.append(s)
                        else:
                            continue
                    else:
                        continue
            str1 = b" ".join(filterd_sentence)
            final_string.append(str1)
            # i=i+1
        return final_string

    import pickle
    from keras.preprocessing import sequence

    def find_word_index(row, word_index_dict):
        holder = []
        for word in row.split():
            if word in word_index_dict:
                holder.append(word_index_dict[word])
            else:
                holder.append(0)
        return holder

    def predict_lstm(x):
        x = preprocess1(x)
        # print(x)
        # with open('model_lstm.pkl', 'rb') as f:
        #     model = pickle.load(f)
        # print(model.predict[x])
        with open('label_transform.pkl', 'rb') as f:
            encoder = pickle.load(f)
        from keras.models import load_model
        model = load_model('LSTM_1.ckpt')
        with open('word_index_dict.pkl', 'rb') as f:
            word_index_dict = pickle.load(f)
        text = []
        for sent in x:
            text.append(find_word_index(sent, word_index_dict))
        # print(len(x), len(text))
        x = sequence.pad_sequences(text, maxlen=500)
        pred_prob = model.predict(x)
        # print(pred_prob)
        # print(encoder.classes_)
        sentiment = []
        # print(encoder.inverse_transform(pred_prob))
        preds = encoder.inverse_transform(pred_prob)
        for pre in preds:
            if pre == 1:
                sentiment.append("Strongly Negative")
            if pre == 2:
                sentiment.append("Weekly Negative")
            if pre == 3:
                sentiment.append("Neutral")
            if pre == 4:
                sentiment.append("Weekly Positive")
            if pre == 5:
                sentiment.append("Strongly Positive")
        # print("output::", pred)
        return [sentiment, pred_prob]

    df['cleaned'] = df.tweet.apply(lambda x: preprocess(x))
    # df.head()
    q = predict_lstm(list(df.cleaned))
    df['sentiment'] = q[0]

    import numpy as np
    df['confidence'] = q[1].tolist()
    df.drop('cleaned', axis=1, inplace=True)
    df = df.astype(str)

    l = []
    data = df.copy()
    for i in data.index:
        if data['sentiment'][i] == 'Neutral':
            l.append(3)
        elif data['sentiment'][i] == 'Strongly Positive':
            l.append(5)
        elif data['sentiment'][i] == 'Weekly Positive':
            l.append(4)
        elif data['sentiment'][i] == 'Strongly Negative':
            l.append(1)
        elif data['sentiment'][i] == 'Weekly Negative':
            l.append(2)
    l = pd.DataFrame(l)
    l.columns = ['label']
    data = pd.concat([data, l], axis=1)
    data = data[['username', 'tweet', 'sentiment', 'label']]
    data.to_csv("dezzex1.csv")
def get_top_mentions_hashtags_geo(lat_long, radius, limit):
    os.chdir("Python_Scripts")
    currentDir = os.getcwd() + "/result/twitter/"
    try:
        os.mkdir(currentDir)
    except:
        pass
    os.chdir(currentDir)

    twint.output.tweets_list = []
    c = twint.Config()
    c.Hide_output = True  # hides command line verbose output
    c.Limit = 500  # maximum number of tweets to pull
    c.Geo = f"{lat_long},{radius}"
    c.Store_object = True
    currentDir = os.getcwd() + "/Python_Scripts/result/twitter/"
    # sys.path.append("/app/Python_Scripts/Python_Scripts/result/twitter/")
    # os.chdir(currentDir)
    c.Store_csv = True
    c.Output = f"{lat_long}-tweets.csv"
    twint.run.Search(c)
    tweets = twint.output.tweets_list

    mentions_dict = {}
    hashtags_dict = {}
    for tweet in tweets:
        for mention in tweet.mentions:
            if mention in mentions_dict:
                mentions_dict[mention] += 1
            else:
                mentions_dict[mention] = 1
        for hashtag in tweet.hashtags:
            if hashtag in hashtags_dict:
                hashtags_dict[hashtag] += 1
            else:
                hashtags_dict[hashtag] = 1

    top_mentions = heapq.nlargest(10, mentions_dict, key=mentions_dict.get)  # gets highest mentions
    top_hashtags = heapq.nlargest(10, hashtags_dict, key=hashtags_dict.get)  # gets highest hashtags

    # makes dictionaries of just the highest ones
    mentions_ranked = {}
    hashtags_ranked = {}
    for mention in top_mentions:
        mentions_ranked[mention] = mentions_dict[mention]
    for hashtag in top_hashtags:
        hashtags_ranked[hashtag] = hashtags_dict[hashtag]

    plt.barh(range(len(mentions_ranked)), list(mentions_ranked.values()), align='center', color='maroon')
    plt.yticks(range(len(mentions_ranked)), list(mentions_ranked.keys()))
    plt.gca().invert_yaxis()  # just to have the highest bar at the top
    plt.title("Top 10 Trending Mentions from the Geo-location: " + lat_long)
    plt.savefig(os.getcwd() + "/Python_Scripts/result/twitter/" + lat_long + '-mentions.png',
                bbox_inches='tight')  # saves the visualization as png
    # plt.savefig(seed_hashtag + '.pdf', bbox_inches='tight')

    plt.barh(range(len(hashtags_ranked)), list(hashtags_ranked.values()), align='center', color='maroon')
    plt.yticks(range(len(hashtags_ranked)), list(hashtags_ranked.keys()))
    plt.gca().invert_yaxis()  # just to have the highest bar at the top
    plt.title("Top 10 Trending Hashtags from the Geo-location:" + lat_long)
    # os.chdir(currentDir)
    plt.savefig("/app/Python_Scripts/result/twitter/" + lat_long + '-hashtags.png',
                bbox_inches='tight')  # saves the visualization as png
    # plt.savefig(seed_hashtag + '.pdf', bbox_inches='tight')

    # print("List of Top 10 mentions " + lat_long + " :")
    # print(top_mentions)  # displays the top 10 mentions as a list.
    # print("List of Top 10 hashtags " + lat_long + " :")
    # print(top_hashtags)  # displays the top 10 hashtags as a list.
    plt.close()
    exit()
def scrape(username, limit=None):
    if not os.path.exists(imagefolder):
        os.makedirs(imagefolder)
    if not os.path.exists(datafolder):
        os.makedirs(datafolder)

    # Configure
    c = twint.Config()
    c.Username = username
    c.Proxy_host = getProxy()
    c.Proxy_port = 6060
    c.Proxy_type = "http"
    c.Proxy_Username = proxyuser
    c.Proxy_Password = proxypass
    c.Media = True
    c.Pandas = True
    c.Hide_output = True
    c.Limit = limit

    # Run
    print("Scraping from " + username + "'s twitter...")
    twint.run.Search(c)
    print("Scraped!")

    df = twint.storage.panda.Tweets_df
    username = df["username"][0]
    # userid = df["user_id"][0]
    tweets = []
    for index, row in df.iterrows():
        # photos = []
        # for photo in row["photos"]:
        #     url = photo.split("/")[-1]
        #     if index <= 5:
        #         p = Process(target=downloadAndSavePhoto, args=(url,))
        #         p.start()
        #     photos.append(baseurl + imagefolder + url)
        # tweets.append({"id": str(row["id"]), "created_at": str(
        #     row["created_at"]), "tweet": row["tweet"], "photos": photos})
        if row["video"] == 0:
            embed = requests.get(
                f"https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{row['id']}"
            )
            embed = embed.json()["html"]
        else:
            embed = ""
        tweets.append({
            "id": str(row["id"]),
            "created_at": str(row["created_at"]),
            "tweet": row["tweet"],
            "photos": row["photos"],
            "video": embed
        })
    print("Converted dataframe!")
    return {"username": str(username), "tweets": tweets}
def get_user_following(username, search):
    c = twint.Config()
    c.Username = username
    save_result(c, username + "user_following")
    twint.run.Following(c)
    get_user_tweets(username, search, True)
def download_tweets(username=None, limit=None, include_replies=False,
                    strip_usertags=False, strip_hashtags=False):
    """Download public Tweets from a given Twitter account into a format
    suitable for training with AI text generation tools.

    :param username: Twitter @ username to gather tweets.
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    """

    assert username, "You must specify a username to download tweets from."

    # If a limit is specified, validate that it is a multiple of 20
    if limit:
        assert limit % 20 == 0, "`limit` must be a multiple of 20."
    # If no limit is specified, estimate the total number of tweets from the profile.
    else:
        c_lookup = twint.Config()
        c_lookup.Username = username
        c_lookup.Store_object = True
        c_lookup.Hide_output = True

        twint.run.Lookup(c_lookup)
        limit = twint.output.users_list[0].tweets

    pattern = r'http\S+|pic\.\S+|\xa0|…'

    if strip_usertags:
        pattern += r'|@[a-zA-Z0-9_]+'

    if strip_hashtags:
        pattern += r'|#[a-zA-Z0-9_]+'

    update_resume_file(-1)

    print("Retrieving tweets for @{}...".format(username))

    with open('{}_tweets.csv'.format(username), 'w', encoding='utf8') as f:
        w = csv.writer(f)
        w.writerow(['tweets'])  # gpt-2-simple expects a CSV header by default

        pbar = tqdm(range(limit), desc="Oldest Tweet")

        for i in range((limit // 20) - 1):
            tweet_data = []

            # twint may fail; give it a few tries to return tweets
            for _ in range(0, 4):
                if len(tweet_data) == 0:
                    c = twint.Config()
                    c.Store_object = True
                    c.Hide_output = True
                    c.Username = username
                    c.Limit = 40
                    c.Resume = '.temp'
                    c.Store_object_tweets_list = tweet_data

                    twint.run.Search(c)

                    # If it fails, sleep before retry.
                    if len(tweet_data) == 0:
                        sleep(1.0)
                else:
                    continue

            # If there are still no tweets after the retries, we're done
            if len(tweet_data) == 0:
                break

            if i > 0:
                tweet_data = tweet_data[20:]

            if not include_replies:
                tweets = [
                    re.sub(pattern, '', tweet.tweet).strip()
                    for tweet in tweet_data
                    if not is_reply(tweet)
                ]
            else:
                tweets = [
                    re.sub(pattern, '', tweet.tweet).strip() for tweet in tweet_data
                ]

            for tweet in tweets:
                if tweet != '':
                    w.writerow([tweet])

            if i > 0:
                pbar.update(20)
            else:
                pbar.update(40)

            oldest_tweet = (datetime.utcfromtimestamp(
                tweet_data[-1].datetime / 1000.0).strftime('%Y-%m-%d %H:%M:%S'))
            pbar.set_description("Oldest Tweet: " + oldest_tweet)

    pbar.close()
    os.remove('.temp')
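
# Example invocation of download_tweets() above (a sketch; the account name is
# a placeholder): gathers roughly 200 non-reply tweets, stripping user tags,
# and writes them to some_account_tweets.csv as the function does.
download_tweets(username="some_account", limit=200,
                include_replies=False, strip_usertags=True)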
def __init__(self, args):
    self.config = twint.Config()
    self.name_of_hashtag = args.hashtag
    self.config.Limit = 100000000000000000
    self.basic_setup_status = self.basicSetup()
import twint

c = twint.Config()
a = twint.Config()
b = twint.Config()

c.Search = "Amazon forest"
c.Lang = 'en'
c.Limit = 10000
c.Since = '2019-01-01'
c.Store_csv = True
c.Output = "Amazon forest"
c.Hide_output = True
twint.run.Search(c)

a.Search = "Amazon forest fire"
a.Lang = 'en'
a.Limit = 10000
a.Store_csv = True
a.Since = '2019-01-01'
a.Output = " Amazon forest fire"
a.Hide_output = True
twint.run.Search(a)

b.Search = "Amazon forest fire climate change"
b.Lang = 'en'
b.Limit = 10000
b.Since = '2019-01-01'
b.Store_csv = True
b.Output = " climate change"
twint.run.Search(b)
today = datetime.today().strftime('%Y-%m-%d')
date_list = pd.date_range(start="2019-08-02", end='2019-08-11')
date_list = date_list.to_series().dt.date

# %% filter by country
# how long to sleep after one country is done scraping
# how long to sleep after one day has been scraped
sleep_day = 1
number_tweets = 20000

for date in date_list:
    time1 = time.time()
    config = twint.Config()
    date1 = date
    date2 = date - pd.Timedelta(days=1)
    language = "en"
    config.Search = f"until:{date1} since:{date2} lang:{language}"
    config.Store_object = True
    # c.Store_csv = True
    config.Limit = number_tweets  # number needed to get around 1mio posts per country
    config.Store_json = True
    config.Output = f'En_NoFilter_{date2}_{language}.json'
    # run the day's search, then pause before moving to the next day
    twint.run.Search(config)
    time.sleep(sleep_day)
import twint
from datetime import datetime
import time
import os

data = datetime.now()
timestr = time.strftime("%Y%m%d")

print('-' * 70)
print('-Script para raspagem de dados do Twitter-\n'
      'Defina os parâmetros para sua busca.\n'
      'A saída gera um relatório da busca e um arquivo CSV com os dados.\n')
print('-' * 70)

parametro = input('Busca por termo (1) ou por usuário (2)? ')

c = twint.Config()  # configure twint's search parameters

if parametro == '1':
    busca = input('Digite o termo da busca: ')
    c.Search = "'" + busca + "'"
    c.Username = None
    nome = busca + '_' + timestr
    c.Output = os.path.join('DATA', nome)  # output folder
    if not os.path.exists(c.Output):  # if the folder does not exist, create the DATA folder
        os.makedirs(c.Output)
elif parametro == '2':
    busca = input('Digite o nome do usuário: ')
    c.Search = None
    c.Username = busca
    nome = 'user_' + c.Username + '_' + timestr
    c.Output = os.path.join('DATA', nome)
def get_all_retweets(self, username):
    c = twint.Config()
    c.Username = username
    c.Retweets = True
    twint.run.Profile(c)
def get_votes_and_profile_image(self, tweet, full_thread_text=None, title=None, doi=None,
                                pubmed_id=None, pmcid=None, return_votes=True):
    """
    Get profile image of tweeter and compute votes of tweet.
    """
    # Inspired by https://github.com/karpathy/arxiv-sanity-preserver
    def tprepro(tweet_text):
        # take tweet, return set of words
        t = tweet_text.lower()
        t = re.sub(r'[^\w\s@]', '', t)  # remove punctuation
        ws = set([w for w in t.split() if not (w.startswith('#') or w.startswith('@'))])
        return ws

    # Lookup the profile of the user
    users_list = []
    c = twint.Config()
    c.Username = tweet.username
    c.Store_object = True
    c.Store_object_users_list = users_list
    twint.run.Lookup(c)

    # Get number of followers and profile image url
    try:
        num_followers = users_list[0].followers
        profile_image_url = users_list[0].avatar
        bio = users_list[0].bio
    except IndexError:
        num_followers = 0
        profile_image_url = ""
        bio = ""

    if return_votes == False:
        return None, profile_image_url

    # Give low weight to retweets, tweets without comments and tweets with short length
    thread_words = set()
    if full_thread_text:
        for part in full_thread_text:
            thread_words = thread_words | tprepro(part)
    else:
        thread_words = thread_words | tprepro(tweet.tweet)

    query_words = set()
    for identifier in [title, doi, pubmed_id, pmcid]:
        if identifier is not None:
            query_words = query_words | tprepro(identifier)
    for url in tweet.urls:
        query_words = query_words | tprepro(url)

    comments = thread_words - query_words
    isok = int(not (tweet.retweet or len(tweet.tweet) < 40) and len(comments) >= 5)
    tweet_sort_bonus = 10000 if isok else 0

    research_bonus = 0
    # If bio contains keywords such as research/professor, give additional points
    if re.search(r'.*researcher.*', bio, re.IGNORECASE) or \
            re.search(r'.*professor.*', bio, re.IGNORECASE) or \
            re.search(r'.*phd.*', bio, re.IGNORECASE) or \
            re.search(r'.*postdoc.*', bio, re.IGNORECASE) or \
            re.search(r'.*scientist.*', bio, re.IGNORECASE):
        research_bonus += 500

    # Add up all contributing factors
    votes = int(tweet.likes_count) + int(tweet.retweets_count) + tweet_sort_bonus + num_followers + research_bonus

    return votes, profile_image_url
def get_retweets(tweet_id=1169681044573962240, option="twint", interval=None, word_limit=100, word_limit_low=7):
    search_objects_record = []
    print("analysing tweet " + str(tweet_id))

    # Split the tweet into sentences, then search for them, trying to find the original tweet.
    start_status = api.get_status(tweet_id, tweet_mode="extended")
    start_time = start_status.created_at
    author = start_status.user.screen_name
    print("author " + str(author))
    text = url_remove(start_status.full_text)

    # split into sentences
    sentences = text.split(".")
    if len(sentences[-1]) == 0:
        sentences = sentences[:-1]

    copies_found = []
    for sentence in sentences:
        if len(sentence) == 0:
            continue
        if sentence[0] == ' ':
            sentence = sentence[1:]
        # print(length_w)
        # Now build a search - this won't work for twint with longer queries,
        # and won't work for tweepy for posts older than 7 days.
        if option == "twint":
            sent_matches = []
            S = sentence.split(" ")
            length_w = len(S)
            if length_w > 100:
                query = " ".join(S[2:10])
                # query = "\"" + query + "\""
            else:
                query = sentence
            if length_w < word_limit_low:
                continue
            # query = "\"" + query + "\""
            query = query + " filter:nativeretweets"
            # query = 'Netanyahu West Bank Palestinian'
            # query = query + " (@" + author + ")"
            bs = twint.Config()
            bs.Search = query
            print("searching " + bs.Search)
            bs.Store_object = True
            bs.Filter_retweets = False
            bs.Hide_output = True
            bs.Store_object_tweets_list = sent_matches
            # bs.Output = False
            twint.run.Search(bs)
            search_objects_record.append(bs)
            # add filter:nativeretweets for retweets
        if option == "tweepy":
            copies_found = [status for status in tweepy.Cursor(api.search, q=sentence,
                                                               tweet_mode='extended').items(max_tweets)]
        copies_found.append(sent_matches)

    for copy in copies_found:
        print("Found " + str(len(copy)) + " tweets")
    return (copies_found, search_objects_record)
def twintScraper(from_date=None, end_date=None):
    # Configure
    c = twint.Config()
    c.Username = "******"
    # c.Search = "coronavirus"
    c.Limit = 10
    # c.Tweet_id = "1257793742540386304"
    c.Show_hashtags = True
    c.Get_replies = True
    c.Verified = True
    c.Stats = True
    c.Count = True
    c.Lang = "en"
    c.Hide_output = True
    # c.Resume = "1223026504482918405"
    # print(get_retweeters_list("1258837711806496770"))
    # twint.run.Profile(c)
    c.Store_object = True
    # c.Since = "2020-01-30 00:00:00"
    # c.Until = "2020-02-01 00:00:00"
    c.Since = from_date
    c.Until = end_date
    # c.Until = str(datetime.datetime.now())[:19]
    # print(str(datetime.datetime.now())[:19])
    # exit()
    twint.run.Search(c)

    tweets_as_objects = twint.output.tweets_list
    print("Real tweets amount: ", len(tweets_as_objects))

    dict_op = {
        "CONTENT": [], "TWEET_ID": [], "USER_NAME": [], "POST_DATE": [],
        "POST_TIME": [], "LINK": [], "URL_INCLUDED": [], "RETWEETS_COUNT": [],
        "RETWEETS_PEOPLE": [], "LIKES_AMOUNT": [], "REPLIIES_AMOUNT": [],
        "REPLAY_PEOPLE": [], "REPLAY_TIME": [], "REPLAY_CONTENT": [], "MARK": []
    }

    count = 0
    for tweet in tweets_as_objects:
        id = tweet.id
        name = tweet.username
        # print(name, "HHHHHHHHHHHHHHHHHHHHHHHH")
        likes_amount = tweet.likes_count
        retweets_count = tweet.retweets_count
        replies_count = tweet.replies_count
        replies_people = []
        replies_time = []
        replies_content = []
        replies_people, replies_time, replies_content = getReplyer(name, id)

        print(count,
              " CONTENT: ", tweet.tweet,
              " TWEET_ID: ", str(id),
              " USER_NAME: ", str(name),
              " POST_DATE: ", tweet.datestamp,
              " POST_TIME: ", tweet.timestamp,
              " LINK: ", tweet.link,
              " URL_INCLUDED: ", tweet.urls,
              " RETWEETS_COUNT: ", len(get_retweeters_list(id)),
              " RETWEETS_PEOPLE: ", get_retweeters_list(id),
              " LIKES_AMOUNT: ", likes_amount,
              " REPLIIES_AMOUNT: ", len(replies_people),
              " REPLAY_PEOPLE: ", replies_people,
              " REPLAY_TIME: ", replies_time,
              " REPLAY_CONTENT: ", replies_content)

        dict_op["CONTENT"].append(tweet.tweet)
        dict_op["TWEET_ID"].append(str(id))
        dict_op["USER_NAME"].append(str(name))
        dict_op["POST_DATE"].append(tweet.datestamp)
        dict_op["POST_TIME"].append(tweet.timestamp)
        dict_op["LINK"].append(tweet.link)
        dict_op["URL_INCLUDED"].append(tweet.urls)
        dict_op["RETWEETS_COUNT"].append(len(get_retweeters_list(id)))
        dict_op["RETWEETS_PEOPLE"].append(get_retweeters_list(id))
        dict_op["LIKES_AMOUNT"].append(likes_amount)
        dict_op["REPLIIES_AMOUNT"].append(len(replies_people))
        dict_op["REPLAY_PEOPLE"].append(replies_people)
        dict_op["REPLAY_TIME"].append(replies_time)
        dict_op["REPLAY_CONTENT"].append(replies_content)
        dict_op["MARK"].append("##############################")

        count += 1
        if (count % 200 == 0 and count != 0) or count == len(tweets_as_objects):
            lastsavedtweetid = dict_op["TWEET_ID"][len(dict_op["TWEET_ID"]) - 1]
            print(f"SAVE_MARK {count}: last saved tweet id = ", lastsavedtweetid)
            df = pd.DataFrame(data=dict_op)
            df.to_json(f"{from_date} {count} COVID-19.json", orient='records')
            dict_op = {
                "CONTENT": [], "TWEET_ID": [], "USER_NAME": [], "POST_DATE": [],
                "POST_TIME": [], "LINK": [], "URL_INCLUDED": [], "RETWEETS_COUNT": [],
                "RETWEETS_PEOPLE": [], "LIKES_AMOUNT": [], "REPLIIES_AMOUNT": [],
                "REPLAY_PEOPLE": [], "REPLAY_TIME": [], "REPLAY_CONTENT": [], "MARK": []
            }
        if count % 1000 == 0 and count != 0:
            time.sleep(60.0)
import twint
import sys
import os

c = twint.Config()
c.Username = sys.argv[1]
c.Limit = 20
c.Output = "tweets.txt"

if os.path.exists("tweets.txt"):
    os.remove("tweets.txt")

twint.run.Search(c)
def reply_influence(user, filename=os.path.join(args.file_path, args.reply_filename)):
    filename = filename.format(user)

    original_post_conf = twint.Config()
    original_post_conf.Backoff_exponent = 2
    original_post_conf.Retries_count = 40
    original_post_conf.Username = user
    original_post_conf.Search = args.keyword
    original_post_conf.Lang = args.lang
    original_post_conf.Pandas = True
    # original_post_conf.Proxy_host = 'tor'

    reply_conf = twint.Config()
    reply_conf.To = user
    reply_conf.Filter_retweets = True
    reply_conf.Backoff_exponent = 2
    reply_conf.Retries_count = 40
    reply_conf.Hide_output = True
    reply_conf.Lang = args.lang
    reply_conf.Pandas = True
    # reply_conf.Resume = 'temp.txt'
    # reply_conf.Proxy_host = 'tor'

    str_start_date, id_list = continue_date(user, filename)

    for begindate, enddate in daterange(str_start_date, args.until, args.date_format):
        print('----------------------------------------------')
        print(f'Advanced search : "{args.keyword}" on {begindate}.')
        start = time.time()
        original_post_conf.Search = args.keyword
        original_post_conf.Since = begindate
        original_post_conf.Until = enddate
        while True:
            try:
                twint.run.Search(original_post_conf)
                break
            except Exception as e:
                print(e)
        print(f'Search time: {time.time() - start} s.')

        pd_tweets = twint.storage.panda.Tweets_df
        if len(pd_tweets) != 0:
            id_list.extend(pd_tweets['id'].tolist())
        print('id_list: ', len(id_list))
        print(id_list)

        reply_list = []
        reply_count_list = []
        unique_user_count = []
        user_weight_list = []
        influence_score_list = []
        if len(id_list) != 0:
            print('----------------------------------------------')
            print(f'Searching reply posts to {user}.')
            start = time.time()
            reply_conf.Since = begindate
            reply_conf.Until = enddate
            while True:
                try:
                    twint.run.Search(reply_conf)
                    break
                except Exception as e:
                    print(e)
                    time.sleep(5)
            print(f'Search time: {time.time() - start} s.')

            pd_all_reply_tweets = twint.storage.panda.Tweets_df
            print(f'get total reply post: {len(pd_all_reply_tweets)}')
            if len(pd_all_reply_tweets) != 0:
                # id: current post id
                # conversation_id: original post id, i.e. the post id that is replied to
                # pd_reply_tweets = pd_all_reply_tweets[
                #     pd_all_reply_tweets['conversation_id'].isin(id_list)
                # ]
                # reply_list = pd_reply_tweets.groupby(
                #     'conversation_id').size().to_list()
                pd_reply_tweets_list = [
                    pd_all_reply_tweets[pd_all_reply_tweets['conversation_id'] == id]
                    for id in id_list
                ]
                pd_unique_user_list = [
                    pd_reply_tweets.groupby('user_id').size()
                    for pd_reply_tweets in pd_reply_tweets_list
                ]
                reply_count_list = [
                    len(pd_reply_tweets) for pd_reply_tweets in pd_reply_tweets_list
                ]
                unique_user_count = [
                    len(pd_unique_user) for pd_unique_user in pd_unique_user_list
                ]
                user_weight_list = [
                    0 if len(pd_unique_user) == 0
                    else (0.9 ** (pd_unique_user - 1)).sum() / len(pd_unique_user)
                    for pd_unique_user in pd_unique_user_list
                ]
                influence_score_list = [
                    0 if reply_count == 0
                    else user_weight * math.log10(reply_count + 1)
                    for user_weight, reply_count in zip(user_weight_list, reply_count_list)
                ]

        data = {
            'date': [begindate],
            'id': [id_list],
            'reply_count_list': [reply_count_list],
            'unique_user_count': [unique_user_count],
            'user_weight_list': [user_weight_list],
            'influence_score_list': [influence_score_list],
            user: [sum(reply_count_list)],
        }
        append_data = pd.DataFrame.from_dict(data)
        append_data.set_index('date', inplace=True)
        print('----------------------------------------------')
        print(append_data)
        append_data.to_csv(filename, mode='a', header=not os.path.exists(filename))
def process_usernames(new_usernames):
    for username in new_usernames:
        try:
            user_name_df = pd.DataFrame()
            user_id_list = []
            user_handle_list = []
            user_name_list = []
            user_bio_list = []
            user_profile_image_list = []

            c = twint.Config()
            c.Username = username
            c.Store_object = True
            c.User_full = False
            c.Pandas = True
            twint.run.Lookup(c)

            user_df = twint.storage.panda.User_df.drop_duplicates(subset=['id'])
            user_id = list(user_df['id'])[0]
            user_name = list(user_df['name'])[0]
            user_bio = list(user_df['bio'])[0]
            user_profile_image = list(user_df['avatar'])[0]

            user_id_list.append(user_id)
            user_handle_list.append(username)
            user_name_list.append(user_name)
            user_bio_list.append(user_bio)
            user_profile_image_list.append(user_profile_image)

            user_name_df['Twitter_Handle'] = user_handle_list
            user_name_df['Twitter_ID'] = user_id_list
            user_name_df['Twitter_Name'] = user_name_list
            user_name_df['Twitter_Bio'] = user_bio_list
            user_name_df['Twitter_Profile_Image'] = user_profile_image_list
            print(user_name_df)
            save_to_mongodb(user_name_df)
            sleep(60)
        except:
            user_name_df = pd.DataFrame()
            user_id_list = []
            user_handle_list = []
            user_name_list = []
            user_bio_list = []
            user_profile_image_list = []
            print(username)

            user_id_list.append('NA')
            user_handle_list.append(username)
            user_name_list.append('NA')
            user_bio_list.append('NA')
            user_profile_image_list.append('NA')

            user_name_df['Twitter_Handle'] = user_handle_list
            user_name_df['Twitter_ID'] = user_id_list
            user_name_df['Twitter_Name'] = user_name_list
            user_name_df['Twitter_Bio'] = user_bio_list
            user_name_df['Twitter_Profile_Image'] = user_profile_image_list
            print(user_name_df)
            save_to_mongodb(user_name_df)
            sleep(200)
def spacysmscraper(text, number):
    # print("1")
    asyncio.set_event_loop(None)  # Clear the main loop.
    loop = asyncio.new_event_loop()  # Create a new loop.
    nest_asyncio.apply(loop)
    # print("2")

    # Part 1: for Reddit threads
    reddit = praw.Reddit(client_id='7hU5ZrX236KkyQ',
                         client_secret='c6pSBGl5Z2O1nwc-j-iuFhwGwfs',
                         redirect_uri='http://*****:*****@ \n\"\'"

    # Function for removing unknown characters
    remove_unknown_chars = lambda x: ''.join(char for char in x if char in symbols_to_keep)
    # Function for removing all Twitter user tags (@ongunuzaymacar, etc.)
    remove_user_tags = lambda x: re.sub(r'@\w+', '', x)
    # Function for removing all Twitter hashtags (#freetheworld, etc.)
    remove_hash_tags = lambda x: re.sub(r'#\w+', '', x)
    # Function for removing all URLs (www.google.com, etc.)
    remove_urls = lambda x: re.sub(r'(https://|www.)[A-Za-z0-9_.]+', '', x)

    def clean_tweets(twoot):
        # Convert to lowercase and remove spaces from the beginning
        twoot = str(twoot).lstrip()
        # Remove Twitter-related data
        twoot = remove_user_tags(twoot)
        twoot = remove_urls(twoot)
        twoot = remove_hash_tags(twoot)
        # Remove unwanted characters
        twoot = remove_unknown_chars(twoot)
        # Remove spaces from the end and condense multiple spaces into one
        twoot = twoot.rstrip()
        twoot = re.sub(' +', ' ', twoot)
        return twoot

    result["Text"] = result["Text"].apply(clean_tweets)
    f = lambda x: " ".join(x["Text"].split())
    result["Text"] = result.apply(f, axis=1)

    def ner(x):
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(x)
        textually = []
        tags = []
        for ent in doc.ents:
            textually.append(ent.text)
            tags.append(ent.label_)
        spacy_dictionary = dict(zip(textually, tags))
        good_terms = []
        for key in spacy_dictionary:
            if spacy_dictionary[key] == "ORG":
                good_terms.append(key)
            if spacy_dictionary[key] == "GPE":
                good_terms.append(key)
            if spacy_dictionary[key] == "LOC":
                good_terms.append(key)
            if spacy_dictionary[key] == "PRODUCT":
                good_terms.append(key)
            if spacy_dictionary[key] == "DATE":
                good_terms.append(key)
        if ("PRODUCT" not in spacy_dictionary.values()) and ("ORG" not in spacy_dictionary.values()):
            good_terms.clear()
        return good_terms

    result["NER Model"] = result["Text"].apply(ner)
    return result