def preprocess_categorical_data(data: pd.DataFrame, cat_cols: list,
                                default_value: str, link_information: list,
                                document_col: str, join_attr: str = ". "):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        df_cat: pd.DataFrame = data.select_dtypes(["object"])
        df_cat = df_cat[cat_cols]
        df_cat.fillna(value=default_value, inplace=True)

        # Add document
        df_cat[document_col] = df_cat.apply(
            FeatureExtraction.extract_document_from_categorical_series,
            axis=1,
            args=(link_information, join_attr))
        data_transformed = df_cat.copy()
    except Exception as e:
        logger.error(e)
    return data_transformed

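# Minimal pandas sketch of the fillna + row-wise apply pattern above; the toy
# columns and join separator are illustrative, not the project's real schema.
import pandas as pd

df = pd.DataFrame({"name": ["alice", None], "description": [None, "reporter"]})
df = df.fillna("unknown")
# Join the categorical fields of each row into a single document string.
df["document"] = df.apply(lambda row: ". ".join(row.astype(str)), axis=1)
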
def start_streaming(self,
                    thread_name: str,
                    collection_name: str = "default",
                    es_index_name: str = "default") -> StreamingServiceOutput:
    output: StreamingServiceOutput = object.__new__(StreamingServiceOutput)
    try:
        set_name: str = collection_name if self.storage == "mongoDB" else es_index_name
        data = {
            "languages": self.languages,
            "track": self.track,
            "storage": self.storage,
            "collection_names": {
                "status": f"{set_name}_tweets",
                "user": "******"
            },
            "mongo_db_name": self.mongo_db_name
        }
        response: StreamingProcessOutput = self.twst_api.start_new_streaming_process(
            thread_name=thread_name, data=data)

        # Generate the response
        output = StreamingServiceOutput(
            status_code=response.status_code,
            message=response.message,
            data=response.data)
    except Exception as e:
        logger.error(e)
    return output

def get_available_languages(api: API) -> list:
    languages: list = []
    try:
        languages = api.supported_languages()
    except Exception as e:
        logger.error(e)
    return languages

def retrieve_data_from_index_by_searching(self, index, search_key, search_value,
                                          fuzzy_threshold=95, request_timeout=30):
    response = {}
    try:
        query = {
            "query": {
                "match": {
                    search_key: {
                        "query": search_value,
                        "fuzziness": "0"
                    }
                }
            }
        }
        results = self.es.search(body=query, index=index,
                                 request_timeout=request_timeout)
        if results['hits']['total'] > 0:
            res_name = results['hits']['hits'][0]['_source'][search_key]
            distance = fuzz.ratio(
                res_name.replace(' ', '').lower(),
                search_value.replace(' ', '').lower())
            if distance >= fuzzy_threshold:
                response = {
                    "source": results['hits']['hits'][0]['_source'],
                    "id": results['hits']['hits'][0]["_id"]
                }
    except Exception as e:
        logger.error(e)
    return response

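# Minimal sketch of the fuzzy post-filter applied to the top hit above,
# assuming the fuzzywuzzy package; the helper name and sample strings are
# illustrative, not part of the original module.
from fuzzywuzzy import fuzz

def is_close_match(candidate: str, target: str, threshold: int = 95) -> bool:
    # Normalise both strings (drop spaces, lowercase) before scoring.
    score = fuzz.ratio(candidate.replace(' ', '').lower(),
                       target.replace(' ', '').lower())
    return score >= threshold

# Example: is_close_match("The Guardian", "theguardian") -> True
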
def write_json_file(data: dict, filename: str):
    try:
        with open(filename, 'w') as file:
            # Serialize the dict directly; fall back to str() for
            # non-JSON-serializable values.
            json.dump(data, file, default=str)
    except Exception as e:
        logger.error(e)

def generate_user_feature_vector(self, user: User) -> np.ndarray:
    doc_emb: np.ndarray = np.array([])
    try:
        index_val: int = 0

        # 1. Extract features
        user_features: dict = self.extract_features_from_user_account(
            user_data=user)

        # 2. Preprocess features
        data_transformed: pd.DataFrame = self.preprocess_data(
            user_features=user_features,
            popularity_metric=popularity_metric,
            boolean_cols=boolean_cols,
            drop_num_cols=drop_num_cols,
            scaler_filename=scaler_filename,
            cat_cols=cat_cols,
            default_value=default_value,
            link_information=link_information,
            document_col=document_col,
            join_attr=join_attr,
            index_val=index_val)

        # 3. Generate input embedding
        doc_emb = self.get_embedding_from_dataframe(
            data_transformed=data_transformed, index_val=index_val)
    except Exception as e:
        logger.error(e)
    return doc_emb

def get_textblob_sentiment_analysis(doc: str) -> TextBlobSentOutput:
    output: TextBlobSentOutput = TextBlobSentOutput()
    try:
        was_translated: bool = False

        # 1. Get the language of the document
        lang: str = SentimentAnalysisAPI.get_document_language(doc=doc)
        if lang != "en":
            # Translate
            doc = SentimentAnalysisAPI.translate_to_english(
                src_doc=doc, src_lang=lang)
            was_translated = True

        sentences: list = SentimentAnalysisAPI.make_sentences(text=doc)
        polarity_scores: list = []
        subjectivity_scores: list = []
        for sent in sentences:
            subjectivity: float = TextBlob(sent).sentiment.subjectivity
            polarity: float = TextBlob(sent).sentiment.polarity
            polarity_scores.append(polarity)
            subjectivity_scores.append(subjectivity)

        final_subjectivity: float = round(
            float(np.mean(subjectivity_scores)), 3)
        final_polarity: float = round(float(np.mean(polarity_scores)), 3)
        output = TextBlobSentOutput(
            analysed=True,
            polarity=final_polarity,
            subjectivity=final_subjectivity,
            translated=was_translated)
    except Exception as e:
        logger.error(e)
    return output

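# Minimal standalone sketch of the per-sentence TextBlob scoring above,
# assuming the textblob and numpy packages; sentence splitting is done with
# TextBlob itself here instead of the project's make_sentences helper, and
# may require the TextBlob corpora (python -m textblob.download_corpora).
import numpy as np
from textblob import TextBlob

def average_sentiment(text: str) -> tuple:
    blob = TextBlob(text)
    polarities = [s.sentiment.polarity for s in blob.sentences]
    subjectivities = [s.sentiment.subjectivity for s in blob.sentences]
    # Average the per-sentence scores and round to three decimals.
    return (round(float(np.mean(polarities)), 3),
            round(float(np.mean(subjectivities)), 3))
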
def remove_all_documents_from_collection(self, collection_name: str):
    try:
        docs: DeleteResult = self.db[collection_name].delete_many({})
        logger.info("%s documents deleted from %s", docs.deleted_count,
                    collection_name)
    except Exception as e:
        logger.error(e)

def find_document_by_filter(self, collection_name: str, filter: dict):
    doc: Optional[Cursor] = None
    try:
        doc = self.db[collection_name].find(filter)
    except Exception as e:
        logger.error(e)
    return doc

def convert_bool_to_int(data: pd.DataFrame, boolean_cols: list):
    try:
        for col in boolean_cols:
            data[col] = data[col].astype(int)
    except Exception as e:
        logger.error(e)
    return data

def on_error(self, status_code: int):
    if status_code == 420:
        logger.warning(
            "Enhance Your Calm; the app is being rate limited for making too many requests!")
        return False
    else:
        logger.error(f"Error {status_code} when ingesting a tweet")
        sys.exit()

def get_doc2vec_embedding(data_input: Optional[pd.DataFrame], index_val: int,
                          document_col: str, all_num_cols: list):
    doc_embedding_np: np.ndarray = np.array([])
    try:
        if isinstance(data_input, pd.DataFrame):
            document: str = data_input.loc[index_val, document_col]
            x_num: list = [data_input.loc[index_val, j] for j in all_num_cols]
        elif isinstance(data_input, pd.Series):
            document = data_input[document_col]
            x_num = [data_input[j] for j in all_num_cols]
        else:
            document = data_input.get(document_col, "")
            # Look up each numerical column, falling back to -1 when missing
            x_num = [data_input.get(j, -1) for j in all_num_cols]

        # Document embedding from a string
        doc_emb: np.ndarray = FeatureExtraction.generate_doc_embedding(
            document=document,
            embeddings=FeatureExtraction.get_flair_embeddings())
        x_doc_emb: list = list(doc_emb.tolist())
        if len(x_doc_emb) > 0:
            # 3.3 Concatenate doc embedding + numerical cols
            doc_embedding_np = np.array([x_num + x_doc_emb]).reshape((1, -1))
    except Exception as e:
        logger.error(e)
    return doc_embedding_np

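# Minimal numpy sketch of the concatenation step above; the toy values stand
# in for real scaled numerical features and a document embedding.
import numpy as np

x_num = [10.0, 0.75, 3.0]              # e.g. scaled numerical features
x_doc_emb = [0.1, -0.2, 0.05, 0.4]     # e.g. document embedding values
feature_vector = np.array([x_num + x_doc_emb]).reshape((1, -1))
# feature_vector.shape == (1, 7): a single row ready for a downstream model
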
def load_scaler_object(filename: str):
    scaler_obj: Optional[StandardScaler] = None
    try:
        scaler_obj = joblib.load(filename)
    except Exception as e:
        logger.error(e)
    return scaler_obj

def scale_data(data: pd.DataFrame, fit: bool = True, filename: str = ""):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        if fit:
            scaler: StandardScaler = StandardScaler()
            scaler.fit(data)
            prepare_directory(os.sep.join(filename.split(os.sep)[0:-1]))

            # Save scaler
            logger.info("Saving scaler object at %s", filename)
            FeatureExtraction.save_scaler_object(scaler_obj=scaler,
                                                 filename=filename)
        else:
            scaler = FeatureExtraction.load_scaler_object(filename=filename)

        # Transform data
        res_transformed: np.ndarray = scaler.transform(data)
        data_transformed = pd.DataFrame(res_transformed, columns=data.columns)
    except Exception as e:
        logger.error(e)
    return data_transformed

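# Minimal standalone sketch of the fit / persist / reuse pattern above,
# assuming scikit-learn, joblib and pandas; the column names and file path
# are illustrative.
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.DataFrame({"followers": [10, 200, 3000], "statuses": [5, 50, 500]})

scaler = StandardScaler().fit(train)           # fit on training data
joblib.dump(scaler, "scaler.pkl")              # persist for prediction time

reloaded = joblib.load("scaler.pkl")           # reuse the saved scaler
scaled = pd.DataFrame(reloaded.transform(train), columns=train.columns)
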
def preprocess_data(user_features: dict, popularity_metric: str,
                    boolean_cols: list, drop_num_cols: list,
                    scaler_filename: str, cat_cols: list, default_value: str,
                    link_information: list, document_col: str,
                    join_attr: str = ". ", index_val: int = 0):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        data_transformed = FeatureExtraction.preprocess_prediction_data(
            data_dct=user_features,
            popularity_metric=popularity_metric,
            boolean_cols=boolean_cols,
            drop_num_cols=drop_num_cols,
            scaler_filename=scaler_filename,
            cat_cols=cat_cols,
            default_value=default_value,
            link_information=link_information,
            document_col=document_col,
            join_attr=join_attr,
            index_val=index_val)
    except Exception as e:
        logger.error(e)
    return data_transformed

def find_one_document(self, collection_name: str) -> dict:
    doc: dict = {}
    try:
        doc = self.db[collection_name].find_one()
    except Exception as e:
        logger.error(e)
    return doc

def generate_embedding(data: pd.DataFrame, document_col: str, id_col: str,
                       embeddings: list, doc2vec: str = "transformer_roberta"):
    doc_embedding_np: np.ndarray = np.array([])
    try:
        all_num_cols: list = FeatureExtraction.get_numerical_columns(data=data)
        all_num_cols.remove(id_col)
        document: str = data[document_col]

        # 3.1 Generate doc embedding
        doc_emb: np.ndarray = FeatureExtraction.generate_doc_embedding(
            document=document, embeddings=embeddings, doc2vec=doc2vec)
        x_doc_emb: list = list(doc_emb.tolist())
        x_num: list = [data[j] for j in all_num_cols]
        if len(x_doc_emb) > 0:
            # 3.3 Concatenate doc embedding + numerical cols
            doc_embedding_np = np.array([x_num + x_doc_emb]).reshape((1, -1))
            logger.debug(doc_embedding_np.shape)
    except Exception as e:
        logger.error(e)
    return doc_embedding_np

def get_all_data_by_filter_field(self, collection_name: str, filter: dict):
    doc: Optional[Cursor] = None
    try:
        doc = self.db[collection_name].find({}, filter)
    except Exception as e:
        logger.error(e)
    return doc

def embedding_cosine_similarity(x: np.ndarray, y: np.ndarray) -> float:
    similarity: float = 0.0
    try:
        similarity = round(float(1 - distance.cosine(x, y)), 3)
    except Exception as e:
        logger.error(e)
    return similarity

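# Minimal usage sketch, assuming numpy and scipy; 1 - cosine distance gives
# the cosine similarity of two embedding vectors. The vectors are toy values.
import numpy as np
from scipy.spatial import distance

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
similarity = round(float(1 - distance.cosine(a, b)), 3)  # 0.5 for these vectors
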
def create_index_at_collection(self, collection_name: str,
                               unique_uuid_col: str):
    try:
        self.db[collection_name].create_index(
            [(unique_uuid_col, ASCENDING)], unique=True)
    except Exception as e:
        logger.error(e)

def get_document_language(doc: str) -> str:
    language: str = ""
    try:
        language = detect(doc)
    except Exception as e:
        logger.error(e)
    return language

def clean_threads():
    try:
        logger.info("Cleaning all threads")
        # Clear the whole list; popping while iterating would skip entries
        global_streaming_threads.clear()
    except Exception as e:
        logger.error(e)

def scroll_data_from_elasticsearch(self, index, body, scroll="1m", size=1000):
    items = []
    try:
        res = self.es.search(index=index, body=body, scroll=scroll)
        if res:
            # update data
            items += res["hits"]["hits"]
            total_elements = res["hits"]["total"]["value"]
            rest_elements = total_elements - size
            if rest_elements > 0:
                keep_scrolling = True
                prev = res
                while keep_scrolling:
                    new_items = self.es.scroll(scroll_id=prev['_scroll_id'],
                                               scroll=scroll)
                    if len(new_items["hits"]["hits"]) > 0:
                        items += new_items["hits"]["hits"]
                        prev = new_items
                    else:
                        keep_scrolling = False
    except Exception as e:
        logger.error(e)
    return items

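# Minimal usage sketch of the scroll pattern above, assuming the
# elasticsearch-py client (7.x-style calls) and a reachable cluster; the
# host, index name and query are illustrative.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
body = {"size": 1000, "query": {"match_all": {}}}

res = es.search(index="tweets", body=body, scroll="1m")
hits = res["hits"]["hits"]
while True:
    res = es.scroll(scroll_id=res["_scroll_id"], scroll="1m")
    if not res["hits"]["hits"]:
        break
    hits += res["hits"]["hits"]
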
def build_output(self):
    output = {}
    try:
        output = {
            "identifier": self.identifier,
            "headline": self.headline,
            "articleBody": self.articleBody,
            "url": self.url,
            "language": self.language,
            "images": self.images,
            "videos": self.videos,
            "dateCreated": self.dateCreated,
            "dateModified": self.dateModified,
            "datePublished": self.datePublished,
            "publishDateEstimated": self.publishDateEstimated,
            "authors": self.authors,
            "publisher": self.publisher,
            "sourceDomain": self.sourceDomain,
            "country": self.country,
            "nationality": self.nationality,
            "calculatedRating": self.calculatedRating,
            "calculatedRatingDetail": self.calculatedRatingDetail
        }

        # Add fakeness when it is available
        if self.fakeness != default_field and self.fakeness is not None:
            output["fakeness"] = self.fakeness
    except Exception as e:
        logger.error(e)
    return output

def process_analysis(self, publishers: list, authors: list, article_uuid: str,
                     article_url: str,
                     elasticsearch_connector: ElasticsearchConnector,
                     source_rank: SourceRank):
    try:
        # 1. Perform analysis
        response: dict = self.perform_source_credibility_analysis(
            publishers=publishers,
            authors=authors,
            article_url=article_url,
            elasticsearch_connector=elasticsearch_connector,
            source_rank=source_rank)
        if response.get("code") == 200:
            # 2. Call Fusion Score in a background thread
            logger.info("Calling Fusion Score Service")
            ThreadsProcessor.start_new_streaming_process(
                thread_name="fusion_score",
                target_func=self.update_fusion_score,
                params=(fusion_score_server, fusion_score_port,
                        fusion_score_endpoint, article_uuid))
            # Synchronous alternative:
            # response_fusion_score: dict = self.update_fusion_score(
            #     server=fusion_score_server, port=fusion_score_port,
            #     endpoint=fusion_score_endpoint, article_uuid=article_uuid)
    except Exception as e:
        logger.error(e)

def start_new_streaming_process(self, thread_name: str,
                                data: dict) -> StreamingProcessOutput:
    output: StreamingProcessOutput = object.__new__(StreamingProcessOutput)
    try:
        # 1. Generate Streaming Processor
        api: API = self.set_up_twitter_api()
        streaming_processor: StreamingProcessor = StreamingProcessor(
            api=api,
            languages=data.get("languages", []),
            track=data.get("track", []),
            storage=data.get("storage", "mongoDB"),
            collection_names=data.get("collection_names", {}),
            mongo_db_name=data.get("mongo_db_name", "default"))

        # 2. Create new Thread
        response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
            thread_name=thread_name,
            target_func=streaming_processor.run_twitter_streaming)

        # 3. Generate Output
        output = StreamingProcessOutput(
            message=response.message,
            status_code=response.status_code,
            data=response.data)
    except Exception as e:
        logger.error(e)
    return output

def stop_streaming_process(thread_name: str):
    response: GeneralAPIResponse = object.__new__(GeneralAPIResponse)
    try:
        not_found = True
        stream_threads: list = ThreadsProcessor.get_available_threads()

        # If the list of threads is not empty
        if stream_threads:
            # Check if there is a thread with the same name running
            thread_names = [i.name for i in stream_threads]

            # Thread with the same name
            if thread_name in thread_names:
                thread_idx = thread_names.index(thread_name)

                # Stop selected thread
                current_thread: StreamingThread = stream_threads[thread_idx]
                response_kill: GeneralAPIResponse = ThreadsProcessor.kill_streaming_thread(
                    streaming_thread=current_thread)
                if response_kill.status_code == 200:
                    # Remove from list
                    stream_threads.pop(thread_idx)
                    not_found = False

        # Build the response
        if not_found:
            status_code: int = 400
            message: str = f"Thread {thread_name} was not found!"
        else:
            status_code = 200
            message = f"Thread {thread_name} was stopped successfully!"
        response = GeneralAPIResponse(
            message=message, status_code=status_code, data={})
    except Exception as e:
        logger.error(e)
    return response

def start_new_searching_process(self, thread_name: str,
                                data: dict) -> StreamingProcessOutput:
    output: StreamingProcessOutput = object.__new__(StreamingProcessOutput)
    try:
        # 1. Generate Searching Processor (lookup keys mirror the
        # constructor argument names)
        api: API = self.set_up_twitter_api()
        searching_processor: SearchingProcessor = SearchingProcessor(
            twitter_connector=self.twitter_connector,
            mongo_db_name=data.get("mongo_db_name"),
            collection_names=data.get("collection_names"),
            local_storage=data.get("local_storage"),
            dest_storage=data.get("dest_storage"))

        # 2. Create new Thread
        response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
            thread_name=thread_name,
            target_func=searching_processor.run_twitter_searching)

        # 3. Generate Output
        output = StreamingProcessOutput(
            message=response.message,
            status_code=response.status_code,
            data=response.data)
    except Exception as e:
        logger.error(e)
    return output

def tweet_gathering(api: API, query: str, date_since: str, lang: str = 'en'):
    try:
        logger.info("Retrieving Tweets ... ")

        # Collect tweets
        tweets = Cursor(api.search,
                        lang=lang,
                        q=query,
                        include_entities=True,
                        monitor_rate_limit=True,
                        wait_on_rate_limit_notify=True,
                        wait_on_rate_limit=True,
                        result_type="recent",
                        tweet_mode='extended').items()
        while True:
            try:
                tweet: Status = tweets.next()
                logger.debug(tweet)
                yield tweet
            except RateLimitError:
                # Wait 15 minutes for the rate-limit window to reset
                time.sleep(60 * 15)
                continue
            except StopIteration:
                break
    except Exception as e:
        logger.error(e)

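# Minimal consumption sketch for the generator above, assuming `api` is an
# already-authenticated tweepy API object (hypothetical here); the query and
# date are illustrative.
for i, tweet in enumerate(tweet_gathering(api, query="#news",
                                          date_since="2021-01-01")):
    logger.info("Got tweet %s", tweet.id)
    if i >= 9:  # stop after ten tweets
        break
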
def offline_service(self):
    output: GraphAnalyzerOutputDoc = GraphAnalyzerOutputDoc(
        message=http_response_500, status=500)
    try:
        self.service_task: str = offline_service_name
        if self.data_connector is None:
            # 1. Set up data manager
            self.set_up_data_manager(service=self.service_task)

        # 2. Verify Connection
        connection_error: dict = self.verify_external_server_connections()

        # 3. If there is an error in any external connection
        if True in list(connection_error.values()):
            status: int = 403
            message: str = http_response_403
        else:
            # 4. Start offline process
            response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
                thread_name=self.service_task,
                target_func=self.data_connector.start_kafka_offline_process)
            status = response.status_code
            message = response.message

        # 5. Build output
        output.message = message
        output.status = status
    except Exception as e:
        logger.error(e)
    return output