import os
from datetime import datetime
from json import loads

import pandas as pd
from kafka import KafkaConsumer

# HDFSUtil is the project's own HDFS helper class; the import path below is an
# assumption and should match wherever the helper lives in this project.
from HDFSUtil import HDFSUtil


class TweetStoringToHDFS(object):

    def __init__(self):
        super().__init__()
        self.listen_topic = "sentiment_tweet"
        self.is_listening = True
        self.consumer = KafkaConsumer(
            self.listen_topic,
            bootstrap_servers=['localhost:9092'],
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            group_id='my-group',
            value_deserializer=lambda x: loads(x.decode('utf-8')))
        self.send_topic = "new_tweet"
        self.hdfsUtil = HDFSUtil()
        self.curFile = ""
        self.df = None
        self.save_at_batches = 1000  # controls how frequently we save our file

    def write_tweet(self):
        while self.is_listening:
            for message in self.consumer:
                # Use the current date as the file name
                date_str = datetime.now().strftime("%d-%m-%Y")
                file_name = 'tweets_{0}.csv'.format(date_str)

                if self.curFile == "" or self.curFile != file_name:
                    if self.hdfsUtil.is_file_exist(file_name):
                        # Load the current HDFS file as a data frame
                        self.df, _ = self.hdfsUtil.read_file_dataframe(file_name)
                        self.curFile = file_name
                    else:
                        # TODO: Write to HDFS before creating a new file, if it contains data
                        # HDFS file does not exist, create an empty data frame
                        self.curFile = file_name
                        self.df = pd.DataFrame()

                # Read the JSON object and load it into a one-row data frame
                json_object = dict(message.value)
                cur_df = pd.DataFrame(list(json_object.values())).T
                cur_df.columns = list(json_object.keys())

                # Append the new tweet
                frames = [self.df, cur_df]
                self.df = pd.concat(frames)

                # Save the result once save_at_batches new tweets have accumulated
                print(f"{len(self.df)} / {self.save_at_batches}")
                if len(self.df) >= self.save_at_batches:
                    temp_path = os.path.join("../HDFS", self.hdfsUtil.temp_types["tweet"])
                    self.df.to_csv(temp_path, index=False)
                    self.hdfsUtil.write_file(temp_path, file_name)
                    self.df = pd.DataFrame()
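# Minimal usage sketch (not part of the original class): it assumes a Kafka broker on
# localhost:9092 and some upstream producer publishing JSON tweets to the
# "sentiment_tweet" topic. The example message shape below is inferred from the
# columns consumed elsewhere in the project and is illustrative only.
if __name__ == "__main__":
    # Example of the expected message value, one flat JSON object per tweet:
    # {"text": "...", "screen_name": "...", "created_at": "...", "compound": 0.42}
    storer = TweetStoringToHDFS()
    storer.write_tweet()  # blocks, batching incoming tweets into daily CSVs on HDFS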
import json
import re

import gensim.downloader as api
import numpy as np
import pandas as pd
import spacy
from dateutil import parser
from gensim.utils import tokenize
from pyspark import SparkContext
from pyspark.sql import SQLContext

# HDFSUtil is the project's own HDFS helper class; the import path below is an
# assumption and should match wherever the helper lives in this project.
from HDFSUtil import HDFSUtil


class SampleNLP(object):
    """
    Fixing the Java version issue:
    https://stackoverflow.com/questions/53583199/pyspark-error-unsupported-class-file-major-version-55
    """

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.hdfsUtil = HDFSUtil()
        self.df, self.df_schema = self.hdfsUtil.read_file_dataframe("tweets_20-02-2020.csv")
        self.model = api.load("glove-wiki-gigaword-50", return_path=False)
        # Spark setup
        sc = SparkContext.getOrCreate()
        self.sqlContext = SQLContext(sc)
        self.vector_list = []

    def spark_word_cloud(self, data, max_row=500):
        """
        Faster token extraction and counting. Looks at both tweets and RSS.

        :param data: dict with 'start_date' and an optional 'end_date'
        :param max_row: limit on the number of rows to sample
        :return: dictionary of token counts
        """
        start_date = data['start_date']
        if "end_date" in data.keys():
            end_date = data['end_date']
        else:
            end_date = None

        # Loading from tweets
        sample_text1 = ""
        tweet_df, tweet_df_schema = self.hdfsUtil.read_file_date(start_date=start_date,
                                                                 end_date=end_date,
                                                                 data_type='tweet')
        if tweet_df is not None and tweet_df_schema is not None:
            tweet_df = self.sqlContext.createDataFrame(tweet_df, tweet_df_schema)
            sample_text1 = " ".join(
                text.text for text in tweet_df.select("text").rdd.takeSample(False, max_row, seed=42))

        # Loading from RSS
        sample_text2 = ""
        rss_df, rss_df_schema = self.hdfsUtil.read_file_date(start_date=start_date,
                                                             end_date=end_date,
                                                             data_type='rss')
        if rss_df is not None and rss_df_schema is not None:
            rss_df = self.sqlContext.createDataFrame(rss_df, rss_df_schema)
            sample_text2 = " ".join(
                text.title for text in rss_df.select("title").rdd.takeSample(False, max_row, seed=42))

        sample_text = f"{sample_text1} {sample_text2}"

        # Extract tokens
        doc = self.nlp(sample_text)
        tokens = [str(token.lemma_.lower()) for token in doc
                  if not token.is_stop and not token.is_punct and not token.is_space
                  and len(token) >= 3 and not token.like_url and token.is_alpha]

        # Count tokens
        token_dic = {}
        token_set = set(tokens)
        for t in token_set:
            token_dic[t] = 0
        for t in tokens:
            token_dic[t] += 1
        return token_dic

    def word_cloud(self, data):
        start_date = data['start_date']
        if "end_date" in data.keys():
            end_date = data['end_date']
        else:
            end_date = None

        # read_file_date returns (data frame, schema); only the data frame is needed here
        self.df, _ = self.hdfsUtil.read_file_date(start_date=start_date,
                                                  end_date=end_date,
                                                  data_type='tweet')
        if self.df is None:
            return {'status': "200", "message": "No file found"}

        tokens = []
        for idx, row in self.df.iterrows():
            doc = self.nlp(row['text'])
            for token in doc:
                if token.is_alpha and not token.is_stop:
                    tokens.append(token.text)

        token_dic = {}
        token_set = set(tokens)
        for t in token_set:
            token_dic[t] = 0
        for t in tokens:
            token_dic[t] += 1
        return token_dic

    def contain_word(self, word):
        return word in self.model.vocab.keys()

    def sentence_vector(self, sentence, negative=None, positive=None):
        """ Use either a negative or a positive sentence to guide the sentence vector """
        if negative:
            negative = list(tokenize(negative))
            negative = [word for word in negative if word not in sentence]
            neg_vectors = [self.model[word] for word in negative if self.contain_word(word)]
            # Tokenize the sentence; we need it as a string above to extract additional words in negative
            sentence = list(tokenize(sentence))
            vectors = [self.model[word] for word in sentence if self.contain_word(word)]
            vectors = np.mean(vectors, axis=0)
            if len(neg_vectors) == 0:
                neg_vectors = np.zeros(vectors.shape)
            return vectors - np.mean(neg_vectors, axis=0)
        elif positive:
            positive = list(tokenize(positive))
            positive = [word for word in positive if word not in sentence]
            pos_vectors = [self.model[word] for word in positive if self.contain_word(word)]
            # Tokenize the sentence; we need it as a string above to extract additional words in positive
            sentence = list(tokenize(sentence))
            vectors = [self.model[word] for word in sentence if self.contain_word(word)]
            vectors = np.mean(vectors, axis=0)
            if len(pos_vectors) == 0:
                pos_vectors = np.zeros(vectors.shape)
            return vectors + np.mean(pos_vectors, axis=0)
        else:
            sentence = list(tokenize(sentence))
            vectors = [self.model[word] for word in sentence if self.contain_word(word)]
            if not vectors:
                return np.zeros(50)
            return np.mean(vectors, axis=0)

    def search_doc(self, query, context=None, top_n=3):
        try:
            # Score every document against the query and keep the top results
            query_vector = self.sentence_vector(query, positive=context)
            # result = self.model.cosine_similarities(query_vector, [v for v in self.df['vector'].values])
            result = self.model.cosine_similarities(query_vector, self.vector_list)
            self.df['score'] = result
            top_result = self.df.sort_values('score', ascending=False)[:top_n]

            # Get the closest sentence within each top document
            final_result = {}
            result = []
            for idx, row in top_result.iterrows():
                meta = {}
                sents = list(self.nlp(row['text']).sents)
                sents_vectors = []
                for sent in sents:
                    vector = self.sentence_vector(str(sent))
                    sents_vectors.append(vector)
                scores = self.model.cosine_similarities(query_vector, sents_vectors)
                scores[np.isnan(scores)] = 0
                row_datetime = parser.parse(str(row['date']))
                meta["sentence"] = str(sents[int(np.argmax(scores))])
                meta["score"] = str(scores[np.argmax(scores)])
                meta["doc"] = str(row['text'])
                meta['date'] = str(row_datetime.strftime("%d-%m-%Y_%H:%M:%S"))
                meta['author'] = str(row['source'])
                result.append(meta)
            final_result['result'] = result
            result_str = json.dumps(final_result)
            result_str = result_str.encode('utf8')
            return re.sub(rb'[^\x00-\x7f]', rb' ', result_str)
        except Exception as e:
            return "No result."
    def query_sentence(self, data):
        start_date = data['start_date']
        if "end_date" in data.keys():
            end_date = data['end_date']
        else:
            end_date = None
        query = data['query']
        if "context" in data.keys():
            context = data['context']
        else:
            context = None
        if "top_n" in data.keys():
            top_n = data['top_n']
        else:
            top_n = 3

        # Loading tweets
        sample_size = 5000
        self.vector_list = []
        df_tweet, tweet_df_schema = self.hdfsUtil.read_file_date(start_date=start_date,
                                                                 end_date=end_date,
                                                                 data_type='tweet')
        if df_tweet is None:
            df_tweet = pd.DataFrame()
        else:
            if len(df_tweet) > sample_size:
                df_tweet = df_tweet.sample(n=sample_size)
            # Compute tweet document vectors
            for doc in df_tweet['text']:
                vector = self.sentence_vector(doc)
                self.vector_list.append(vector)
            df_tweet = df_tweet[["text", "screen_name", "created_at"]]
            df_tweet.columns = ['text', "source", "date"]

        # Loading RSS
        df_rss, rss_df_schema = self.hdfsUtil.read_file_date(start_date=start_date,
                                                             end_date=end_date,
                                                             data_type="rss")
        if df_rss is not None:
            # df_rss = df_rss.sample(n=sample_size)
            # Compute RSS document vectors
            for doc in df_rss['title']:
                vector = self.sentence_vector(doc)
                self.vector_list.append(vector)
            df_rss = df_rss[['title', 'link', 'published']]
            df_rss.columns = ['text', "source", "date"]
        else:
            df_rss = pd.DataFrame()

        # No result found
        if len(df_tweet) == 0 and len(df_rss) == 0:
            return "{}"

        # Combine tweets and RSS into a single data frame
        self.df = pd.concat([df_tweet, df_rss])

        # Search for the closest documents
        return self.search_doc(query, context, top_n)

    def get_sentiment(self, data):
        start_date = data['start_date']
        if "end_date" in data.keys():
            end_date = data['end_date']
        else:
            end_date = None
        is_positive = data['is_positive']
        top_n = data['top_n']

        df_tweet, tweet_df_schema = self.hdfsUtil.read_file_date(start_date=start_date,
                                                                 end_date=end_date,
                                                                 data_type='tweet')
        if df_tweet is None:
            # No tweets found for the requested range
            return "[]"

        sample_size = 10000
        if len(df_tweet) > sample_size:
            df_tweet = df_tweet.sample(n=sample_size)

        if "compound" in list(df_tweet.columns) and is_positive:
            df_result = df_tweet.sort_values(by=['compound'], ascending=False)[:top_n]
        else:
            df_result = df_tweet.sort_values(by=['compound'], ascending=True)[:top_n]
        df_result = df_result[["text", "screen_name", "created_at", "compound"]]
        return df_result.to_json(orient="records")
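# Minimal usage sketch (not part of the original class): the date strings are assumed
# to follow the "%d-%m-%Y" format used by the HDFS file names above, and the query,
# context, and top_n values are placeholders. 'end_date', 'context', and 'top_n' are
# optional, as handled in query_sentence().
if __name__ == "__main__":
    nlp_service = SampleNLP()

    # Word-cloud token counts over a date range
    counts = nlp_service.spark_word_cloud({"start_date": "20-02-2020",
                                           "end_date": "21-02-2020"})

    # Semantic search over tweets and RSS items
    matches = nlp_service.query_sentence({"start_date": "20-02-2020",
                                          "query": "stock market outlook",
                                          "context": "finance",
                                          "top_n": 3})

    # Most positive tweets by compound sentiment score
    top_tweets = nlp_service.get_sentiment({"start_date": "20-02-2020",
                                            "is_positive": True,
                                            "top_n": 5})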