def parse_tweets(tweet):
    if len(tweet) > 0:
        keep_tweet = tweet
        tweet = tweet[0].split()
        cleaned_tweet = tweet_cleaner.clean_tweet(tweet)
        words = []
        final_words = []
        for word in cleaned_tweet:
            if len(word) > 2:
                checking_words = tweet_cleaner.check_words(word)
                if checking_words != 0:
                    words.append(checking_words)
        for word in words:
            # strip each word twice to account for doubled-up punctuation
            word = tweet_cleaner.strip_word(word)
            word = tweet_cleaner.strip_word(word)
            if len(word) > 2:
                checking_words = tweet_cleaner.check_words(word)
                if checking_words != 0:
                    final_words.append(checking_words)
        words_in_game = []
        if len(final_words) > 0:
            for word in final_words:
                # skip empty strings (comparing a str against an int would raise TypeError)
                if len(word) > 0:
                    # check the word against Wordnik
                    results = wordnik_api.check_all(word)
                    if results is not None:
                        # Wordnik returned a result, so the word goes into the game
                        words_in_game.append(word)
        if len(words_in_game) > 0:
            model.set_game_words_tweets(words_in_game, keep_tweet)
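# Hypothetical usage sketch for parse_tweets (not part of the original code).
# It assumes `tweet` is a row-like sequence whose first element is the raw
# tweet text, and that `tweet_cleaner`, `wordnik_api`, and `model` are already
# configured at module level as the function expects.
sample_row = ["Check out this tremendous wordplay, folks!"]
parse_tweets(sample_row)  # words that pass the Wordnik check are stored via model.set_game_words_tweets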
def process_json(screen_name):
    workbook = xlsxwriter.Workbook('%s_tweets.xlsx' % screen_name)
    worksheet = workbook.add_worksheet()
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'full_text')
    worksheet.write(row, 3, 'clean_text')
    row += 1
    with open('{}.json'.format(screen_name)) as json_reader:
        lines = json_reader.readlines()
        for line in lines:
            json_tweet = json.loads(line)
            if 'retweeted_status' in json_tweet:
                text = json_tweet['retweeted_status']['full_text']
            else:
                text = json_tweet['full_text']
            clean_text = tweet_cleaner.clean_tweet(text)
            clean_text = tweet_cleaner.normalize_arabic(clean_text)
            clean_text = tweet_cleaner.remove_repeating_char(clean_text)
            clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
            worksheet.write(row, col, json_tweet['id_str'])
            worksheet.write(row, col + 1, json_tweet['created_at'])
            worksheet.write(row, col + 2, text)
            worksheet.write(row, col + 3, clean_text)
            row += 1
    workbook.close()
def process_json(filters, json_file, xls_file):
    import preprocessor
    workbook = xlsxwriter.Workbook(xls_file)
    worksheet = workbook.add_worksheet()
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'text')
    worksheet.write(row, 3, 'clean_text')
    row += 1
    lines = json_file.readlines()
    for line in lines:
        json_tweet = json.loads(line)
        if 'retweeted_status' in json_tweet:
            text = json_tweet['retweeted_status']['text']
        else:
            text = json_tweet['text']
        clean_text = tweet_cleaner.clean_tweet(text)
        clean_text = tweet_cleaner.normalize_arabic(clean_text)
        clean_text = tweet_cleaner.remove_repeating_char(clean_text)
        clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
        is_filtered = filter_tweet(filters, clean_text)
        if not is_filtered:
            # write the row only if the text is not filtered out
            worksheet.write(row, col, json_tweet['id_str'])
            worksheet.write(row, col + 1, json_tweet['created_at'])
            worksheet.write(row, col + 2, text)
            worksheet.write(row, col + 3, clean_text)
            row += 1
    workbook.close()
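# filter_tweet is called above but not defined in this section. A minimal
# sketch of what it might look like, assuming `filters` is an iterable of
# terms and a tweet is filtered out when any term occurs in its cleaned text
# (this is an assumption, not the original implementation):
def filter_tweet(filters, clean_text):
    for term in filters:
        if term in clean_text:
            return True  # tweet matches a filter term, so it is skipped
    return False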
def do_embedding():
    data_pd = pd.read_csv(
        DATA_FILE_ORIGINAL_TWEETS,
        sep=',',
        skipinitialspace=True,
    )
    data_set = pd.DataFrame(data_pd)
    # print(data_set.describe())
    # print(data_pd.__len__())
    # print(data_set.head())
    col = data_pd['tweet_text']
    # prepare an empty list
    tweet_list = list()
    tweets = data_set['tweet_text'].values.tolist()
    for tweet in tweets[0:params.NBR_TWEETS]:
        print(tweet)
        words = tweet_cleaner.clean_tweet(tweet)
        tweet_list.append(words)
    model = gensim.models.Word2Vec(sentences=tweet_list,
                                   size=params.EMBEDDING_DIM,
                                   workers=4,
                                   min_count=params.EMBEDDING_MIN_SIZE)
    model.save(STATE_FILE_ORIGINAL_TWEETS)
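# The Word2Vec call above uses the pre-4.0 gensim keyword `size`; gensim 4.0
# renamed it to `vector_size`. A minimal version-tolerant sketch (this helper
# is hypothetical and assumes `params` is importable as in do_embedding):
def build_w2v_model(tweet_list):
    import gensim
    if int(gensim.__version__.split('.')[0]) >= 4:
        return gensim.models.Word2Vec(sentences=tweet_list,
                                      vector_size=params.EMBEDDING_DIM,
                                      workers=4,
                                      min_count=params.EMBEDDING_MIN_SIZE)
    return gensim.models.Word2Vec(sentences=tweet_list,
                                  size=params.EMBEDDING_DIM,
                                  workers=4,
                                  min_count=params.EMBEDDING_MIN_SIZE)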
def extract_tweets_from_json(json_reader, text_writer):
    json_tweets = json_reader.readlines()
    print('tweets in json file: {} tweets'.format(len(json_tweets)))
    tweets_list = list()
    extracted_tweets_count = 0
    if args.include_id:
        text_writer.write("id\ttweet\n")
    for json_tweet in json_tweets:
        try:
            if json_tweet:
                # load it as a Python dict
                tweet = json.loads(json_tweet)
                tid = tweet['id']
                text = tweet['text']
                text = tweet_cleaner.clean_tweet(text)
                if args.normalize:
                    text = tweet_cleaner.normalize_arabic(text)
                if args.remove_repeated_letters:
                    text = tweet_cleaner.remove_repeating_char(text)
                if args.keep_only_arabic:
                    text = tweet_cleaner.keep_only_arabic(text.split())
                if len(text.split()) > 2:
                    if args.exclude_redundant:
                        if text not in tweets_list:
                            tweets_list.append(text)
                            if args.include_id:
                                text_writer.write(str(tid) + "\t" + text + "\n")
                                # print('id:{}'.format(str(tid)))
                                # print('text:', text)
                                # print('tweet:', tweet['text'])
                                # input("press any key...")
                            else:
                                text_writer.write(text + "\n")
                            extracted_tweets_count += 1
                    else:
                        if args.include_id:
                            text_writer.write(str(tid) + "\t" + text + "\n")
                        else:
                            text_writer.write(text + "\n")
                        extracted_tweets_count += 1
        except json.decoder.JSONDecodeError:
            pass
        except UnicodeDecodeError:
            pass
    print('extracted tweets: {} tweets'.format(extracted_tweets_count))
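# The function above reads its options from a module-level `args` object. A
# minimal argparse sketch that would provide the attributes it uses; the flag
# names are inferred from those attribute names and are an assumption:
import argparse

parser = argparse.ArgumentParser(description='Extract and clean tweets from a JSON lines file')
parser.add_argument('--include-id', dest='include_id', action='store_true')
parser.add_argument('--normalize', action='store_true')
parser.add_argument('--remove-repeated-letters', dest='remove_repeated_letters', action='store_true')
parser.add_argument('--keep-only-arabic', dest='keep_only_arabic', action='store_true')
parser.add_argument('--exclude-redundant', dest='exclude_redundant', action='store_true')
args = parser.parse_args()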
def tweets_2_vectors():
    # This is the vector of documents / tweets
    document_vector_list = []
    # load the model back
    model = gensim.models.Word2Vec.load("state_files/w2v.model")
    data_pd = pd.read_csv(
        'data/tweets.csv',
        sep=',',
        skipinitialspace=True,
    )
    data_set = pd.DataFrame(data_pd)
    col = data_pd['tweet_text']
    # prepare an empty list
    tweet_list = list()
    tweets = data_set['tweet_text'].values.tolist()
    for tweet in tweets[0:40000]:
        words = tweet_cleaner.clean_tweet(tweet)
        count = 0
        tweet_vector = []
        for w in words:
            # `w in model` and `model[w]` rely on the pre-4.0 gensim API;
            # gensim >= 4.0 requires `model.wv` for these lookups
            if w in model:
                if count == 0:
                    tweet_vector = model[w]
                else:
                    tweet_vector = tweet_vector + model[w]
                count = count + 1
        if len(tweet_vector) > 0:
            document_vector_list.append(tweet_vector)
    document_vector_array = np.array(document_vector_list)
    pickle.dump(document_vector_array, open("state_files/tweets_vector.dat", "wb"))
def tweets_2_vectors(
        self, active,
        state_file_tweets_vector='state_files/tweets_vector.dat'):
    self.state_file_tweets_vector = state_file_tweets_vector
    if active:
        # This is the vector of documents / tweets
        tweets_vector_list = []
        # load the model back
        model = gensim.models.Word2Vec.load(self.model_file_w2v)
        data_pd = pd.read_csv(
            self.data_file_tweets,
            sep=',',
            skipinitialspace=True,
        )
        data_set = pd.DataFrame(data_pd)
        tweets = data_set['tweet_text'].values.tolist()
        for tweet in tweets[0:self.nbr_of_tweets]:
            words = tweet_cleaner.clean_tweet(tweet)
            count = 0
            tweet_vector = []
            for w in words:
                if w in model:
                    if count == 0:
                        tweet_vector = model[w]
                    else:
                        tweet_vector = tweet_vector + model[w]
                    count = count + 1
            if len(tweet_vector) > 0:
                tweets_vector_list.append(tweet_vector)
        document_vector_array = np.array(tweets_vector_list)
        pickle.dump(document_vector_array,
                    open(self.state_file_tweets_vector, "wb"))
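# A minimal sketch of reading the pickled tweet-vector array back for later
# steps such as clustering; the file name matches the default used above:
import pickle

with open('state_files/tweets_vector.dat', 'rb') as f:
    tweets_vector = pickle.load(f)
# each row is the sum of the word vectors of one tweet
print(tweets_vector.shape)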
def word_embedding(self, active, embedding_dim=100, embedding_min_size=30,
                   model_file_w2v='state_files/w2v.model'):
    self.embedding_dim = embedding_dim
    self.embedding_min_size = embedding_min_size
    self.model_file_w2v = model_file_w2v
    if active:
        data_pd = pd.read_csv(
            self.data_file_tweets,
            sep=',',
            skipinitialspace=True,
        )
        data_set = pd.DataFrame(data_pd)
        # print(data_set.describe())
        # print(data_pd.__len__())
        # print(data_set.head())
        # prepare an empty list
        tweet_list = list()
        tweets = data_set['tweet_text'].values.tolist()
        for tweet in tweets[0:self.nbr_of_tweets]:
            print(tweet)
            words = tweet_cleaner.clean_tweet(tweet)
            tweet_list.append(words)
        model = gensim.models.Word2Vec(sentences=tweet_list,
                                       size=self.embedding_dim,
                                       workers=4,
                                       min_count=self.embedding_min_size)
        model.save(self.model_file_w2v)
def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method
    # authorize twitter, initialize tweepy
    if api_keys.consumer_key == '' or api_keys.consumer_secret == '' \
            or api_keys.access_token == '' or api_keys.access_secret == '':
        print("API key not found. Please check api_keys.py file")
        sys.exit(-1)
    auth = tweepy.OAuthHandler(api_keys.consumer_key, api_keys.consumer_secret)
    auth.set_access_token(api_keys.access_token, api_keys.access_secret)
    api = tweepy.API(auth)
    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    # save most recent tweets
    alltweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {}".format(oldest))
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest)
        # save most recent tweets
        alltweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...{} tweets downloaded so far".format(len(alltweets)))
    # print('all tweets\n', alltweets)
    # print('first tweet:', alltweets[0])
    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[
        tweet.id_str,
        tweet.created_at.strftime('%m/%d/%Y'),
        tweet.text.encode("utf-8").decode('utf-8')
    ] for tweet in alltweets]
    outtweetsDict = [{
        'id': tweet.id_str,
        'created_at': tweet.created_at.strftime('%m/%d/%Y'),
        'text': tweet.text.encode("utf-8").decode('utf-8')
    } for tweet in alltweets]
    # print('first outtweets:', outtweets[0])
    # write the csv
    with open('%s_tweets.csv' % screen_name, 'w') as csvfile:
        fieldnames = ["id", "created_at", "text"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()
        writer.writerows(outtweetsDict)
    workbook = xlsxwriter.Workbook('%s_tweets.xlsx' % screen_name)
    worksheet = workbook.add_worksheet()
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'original_text')
    worksheet.write(row, 3, 'clean_text')
    row += 1
    for tid, tdate, text in outtweets:
        clean_text = tweet_cleaner.clean_tweet(text)
        clean_text = tweet_cleaner.normalize_arabic(clean_text)
        clean_text = tweet_cleaner.remove_repeating_char(clean_text)
        clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
        worksheet.write(row, col, tid)
        worksheet.write(row, col + 1, tdate)
        worksheet.write(row, col + 2, text)
        worksheet.write(row, col + 3, clean_text)
        row += 1
    workbook.close()
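# Hypothetical usage sketch (the handle below is a placeholder, not from the
# original code); api_keys.py must contain valid Twitter API credentials:
if __name__ == '__main__':
    get_all_tweets('some_screen_name')
    # produces some_screen_name_tweets.csv and some_screen_name_tweets.xlsx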
# This fragment assumes `model` is an already-loaded gensim Word2Vec model,
# e.g. gensim.models.Word2Vec.load("state_files/w2v.model")
document_vector_list = []
data_pd = pd.read_csv(
    'data/tweets.csv',
    sep=',',
    skipinitialspace=True,
)
data_set = pd.DataFrame(data_pd)
col = data_pd['tweet_text']
# prepare an empty list
tweet_list = list()
tweets = data_set['tweet_text'].values.tolist()
for tweet in tweets[0:40000]:
    words = tweet_cleaner.clean_tweet(tweet)
    count = 0
    tweet_vector = []
    for w in words:
        if w in model:
            if count == 0:
                tweet_vector = model[w]
            else:
                tweet_vector = tweet_vector + model[w]
            count = count + 1
    if len(tweet_vector) > 0:
        document_vector_list.append(tweet_vector)
document_vector_array = np.array(document_vector_list)