def process_analysis(self, publishers: list, authors: list,
                     article_uuid: str, article_url: str,
                     elasticsearch_connector: ElasticsearchConnector,
                     source_rank: SourceRank):
    try:
        # 1. Perform the source credibility analysis
        response: dict = self.perform_source_credibility_analysis(
            publishers=publishers,
            authors=authors,
            article_url=article_url,
            elasticsearch_connector=elasticsearch_connector,
            source_rank=source_rank)
        if response.get("code") == 200:
            # 2. Call the Fusion Score service in a background thread
            logger.info("Calling Fusion Score Service")
            ThreadsProcessor.start_new_streaming_process(
                thread_name="fusion_score",
                target_func=self.update_fusion_score,
                params=(fusion_score_server, fusion_score_port,
                        fusion_score_endpoint, article_uuid))
    except Exception as e:
        logger.error(e)

def remove_all_documents_from_collection(self, collection_name: str):
    try:
        docs: DeleteResult = self.db[collection_name].delete_many({})
        logger.info("%s documents deleted from %s", docs.deleted_count,
                    collection_name)
    except Exception as e:
        logger.error(e)

def clean_threads():
    try:
        logger.info("Cleaning all threads")
        # Clear the registry in place; popping while iterating over the same
        # list would skip every other entry
        global_streaming_threads.clear()
    except Exception as e:
        logger.error(e)

def tweet_gathering(api: API, query: str, date_since: str, lang: str = 'en'):
    try:
        logger.info("Retrieving Tweets ... ")
        # Collect recent tweets matching the query
        tweets = Cursor(api.search,
                        lang=lang,
                        q=query,
                        include_entities=True,
                        result_type="recent",
                        tweet_mode='extended').items()
        while True:
            try:
                tweet: Status = tweets.next()
                yield tweet
            except RateLimitError:
                # Back off for a full rate-limit window (15 minutes)
                time.sleep(60 * 15)
                continue
            except StopIteration:
                break
    except Exception as e:
        logger.error(e)

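# Usage sketch for tweet_gathering, not part of the original pipeline: stream
# matching tweets and stop after a fixed number of items. The helper name and
# query are hypothetical; `api` is assumed to come from
# set_up_twitter_api_connection, and tweet_mode='extended' statuses expose
# `full_text`.
def collect_tweets_sample(api: API, query: str, limit: int = 100) -> list:
    collected: list = []
    for tweet in tweet_gathering(api=api, query=query,
                                 date_since="2021-01-01", lang="en"):
        collected.append(tweet.full_text)
        if len(collected) >= limit:
            break
    return collected
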
def scale_data(data: pd.DataFrame, fit: bool = True, filename: str = ""):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        if fit:
            # Fit a new scaler and persist it for later reuse
            scaler: StandardScaler = StandardScaler()
            scaler.fit(data)
            prepare_directory(os.sep.join(filename.split(os.sep)[0:-1]))
            logger.info("Saving scaler object at %s", filename)
            FeatureExtraction.save_scaler_object(scaler_obj=scaler,
                                                 filename=filename)
        else:
            # Load a previously fitted scaler
            scaler: StandardScaler = FeatureExtraction.load_scaler_object(
                filename=filename)
        # Transform the data
        res_transformed: np.ndarray = scaler.transform(data)
        data_transformed = pd.DataFrame(res_transformed,
                                        columns=data.columns)
    except Exception as e:
        logger.error(e)
    return data_transformed

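# Usage sketch (hypothetical helper and file path): fit the scaler on the
# training split and reuse the persisted object for the test split, so both
# share exactly the same transformation.
def scale_train_test(train: pd.DataFrame, test: pd.DataFrame) -> tuple:
    scaler_path: str = os.sep.join(["models", "scaler.pkl"])
    train_scaled: pd.DataFrame = scale_data(data=train, fit=True,
                                            filename=scaler_path)
    test_scaled: pd.DataFrame = scale_data(data=test, fit=False,
                                           filename=scaler_path)
    return train_scaled, test_scaled
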
def establish_client_connection(self):
    try:
        self.db_client: MongoClient = MongoClient(self.db_host, self.db_port)
        logger.info("Connection established with MongoDB at %s:%s",
                    self.db_host, self.db_port)
    except Exception as e:
        logger.error(e)

def remove_index(self, index):
    res = True
    try:
        logger.info('Removing index %s from Elasticsearch', str(index))
        # Ignore "bad request" / "index not found" responses
        self.es.indices.delete(index=index, ignore=[400, 404])
    except Exception as e:
        logger.error(e)
        res = False
    return res

def check_dbManager(self):
    """Checks whether dbManager has started. If not, initializes it."""
    try:
        if self.db_client is None:
            # If the database does not exist yet, a new one is created
            if not self.check_database_exists():
                logger.info("A new DB has been created")
            self.establish_client_connection()
            self.db = self.db_client[self.db_name]
    except Exception as e:
        logger.error(e)
        self.status = 500

def set_up_twitter_api_connection(self) -> API:
    # Placeholder so an API object is always returned, even on failure
    api: API = object.__new__(API)
    try:
        logger.info("Connecting to Twitter API ... ")
        auth: OAuthHandler = OAuthHandler(self.consumer_key,
                                          self.consumer_secret)
        auth.set_access_token(self.access_token, self.access_token_secret)
        api = API(auth,
                  wait_on_rate_limit=True,
                  wait_on_rate_limit_notify=True)
        self.api = api
    except Exception as e:
        logger.error(e)
    return api

def storage_data(data: TwitterDataOutput, collection_names: dict,
                 storage: str, mongodb_connector: MongoDBConnector,
                 elasticsearch_connector: ElasticsearchConnector,
                 identifier_key: str):
    if storage == "mongoDB":
        # 1. Insert the Status (only if it is not already stored)
        non_exist_status: bool = mongodb_connector.verify_documents_in_collection(
            entity=data.status,
            collection_name=collection_names.get("status"),
            identifier_key=identifier_key)
        if non_exist_status:
            mongodb_connector.insert_document_to_collection(
                document=data.status,
                collection_name=collection_names.get("status"))
        # 2. Insert the User (only if it is not already stored)
        non_exist_user: bool = mongodb_connector.verify_documents_in_collection(
            entity=data.user,
            collection_name=collection_names.get("user"),
            identifier_key=identifier_key)
        if non_exist_user:
            mongodb_connector.insert_document_to_collection(
                document=data.user,
                collection_name=collection_names.get("user"))
        logger.info("Inserted data into MongoDB successfully!")
    elif storage == "elasticsearch":
        # Insert the Status
        res_status: dict = elasticsearch_connector.bulk_data_into_index(
            index=collection_names.get("status"),
            uuid=data.status.get("uuid"),
            source_data=data.status)
        # Insert the User
        res_user: dict = elasticsearch_connector.bulk_data_into_index(
            index=collection_names.get("user"),
            uuid=data.user.get("uuid"),
            source_data=data.user)
        if res_status and res_user:
            logger.info("Inserted data into Elasticsearch successfully!")
    else:
        raise ValueError("JSON storage not implemented yet")

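# Usage sketch (hypothetical index names): route one processed tweet to
# Elasticsearch through the dispatcher above. The MongoDB connector can be
# omitted because the "elasticsearch" branch never touches it.
def store_in_elasticsearch(output: TwitterDataOutput,
                           es_connector: ElasticsearchConnector):
    collection_names: dict = {"status": "twitter-status",
                              "user": "twitter-user"}
    storage_data(data=output,
                 collection_names=collection_names,
                 storage="elasticsearch",
                 mongodb_connector=None,
                 elasticsearch_connector=es_connector,
                 identifier_key="id")
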
def extract_data_from_elasticsearch(self, index, query, scroll="1m",
                                    size=10000):
    data = None
    try:
        logger.info('Extracting data from index %s of Elasticsearch ...',
                    index)
        items = self.scroll_data_from_elasticsearch(index=index,
                                                    body=query,
                                                    scroll=scroll,
                                                    size=size)
        if len(items) > 0:
            data = self.parse_data(items, index)
    except Exception as e:
        logger.error(e)
    return data

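# Usage sketch: a match_all query is enough to dump an entire index through
# the scroll helper above. The wrapper name and index name are hypothetical.
def extract_full_index(connector, index: str = "fdg-article"):
    query: dict = {"query": {"match_all": {}}}
    return connector.extract_data_from_elasticsearch(index=index,
                                                     query=query,
                                                     scroll="1m",
                                                     size=10000)
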
def get_user_profiles_by_screen_names(api: API, screen_names: list) -> list:
    max_iter: int = 100
    users_data: list = []
    try:
        logger.info("Retrieve user profiles from Twitter API ... ")
        if len(screen_names) > max_iter:
            # lookup_users accepts at most 100 profiles per call
            n: int = int(np.ceil(len(screen_names) / max_iter))
            data_chunks: list = chunks_from_list(data_ls=screen_names, n=n)
            for data in data_chunks:
                response: ResultSet = api.lookup_users(screen_names=data)
                users_data.append(response)
        else:
            response: ResultSet = api.lookup_users(screen_names=screen_names)
            users_data.append(response)
    except Exception as e:
        logger.error(e)
    return users_data

def connect(self):
    try:
        self.es = Elasticsearch([{'host': self.host, 'port': self.port}],
                                timeout=1000)
        if self.es.ping(request_timeout=1):
            self.connection: bool = True
            logger.info("Connected to Elasticsearch at '%s:%s'.",
                        self.host, self.port)
        else:
            self.connection: bool = False
            logger.warning("It was not possible to connect to '%s:%s'.",
                           self.host, self.port)
    except Exception as e:
        logger.error(e)
    return self

def run_twitter_searching(self):
    try:
        # 1. Set up the connection
        if self.twitter_connector.api is None:
            self.twitter_connector.set_up_twitter_api_connection()
        # 2. Load the locally stored Tweet IDs
        logger.info(f"Loading files from {self.local_storage}")
        data_tweets_ids: iter = read_all_files_from_local_storage(
            local_storage_path=self.local_storage, column_index=0)
        # 3. For each Tweet ID
        for tweet_id in data_tweets_ids:
            logger.info(f"1. Loading Status with ID {tweet_id}")
            # 3.1 Check whether the data is already in the storage
            non_exists: bool = self.check_data_in_storage(
                entity_id=tweet_id,
                storage=self.dest_storage,
                collection_name=self.collection_names.get("status"),
                identifier_key="id")
            if non_exists:
                logger.info(f"2. Pre-processing Status with ID {tweet_id}")
                # 3.2 Retrieve the Status
                tweet_status: Status = TwitterConnector.get_tweet_data_from_tweet_id(
                    api=self.twitter_connector.api, tweet_id=tweet_id)
                # If there is data
                if tweet_status is not None:
                    # 3.3 Generate the output
                    output: TwitterDataOutput = TwitterDataProcessor.process_twitter_data(
                        tweet_status,
                        add_sentiment=self.add_sentiment,
                        add_bot_analysis=self.add_bot_analysis,
                        twitter_credentials=self.get_twitter_credentials())
                    # Only insert if both elements are available
                    if output.status and output.user:
                        # 3.4 Store the data
                        logger.info(f"3. Storing Status with ID {tweet_id} "
                                    f"in {self.dest_storage.title()}")
                        self.storage_data(
                            data=output,
                            collection_names=self.collection_names,
                            storage=self.dest_storage,
                            mongodb_connector=self.mongodb_connector,
                            elasticsearch_connector=self.elasticsearch_connector,
                            identifier_key=self.identifier_key)
    except Exception as e:
        logger.error(e)

def get_user_profiles_by_ids(api: API, user_ids: list) -> list:
    max_iter: int = 100
    users_data: list = []
    try:
        logger.info("Retrieve user profiles from Twitter API ... ")
        if len(user_ids) > max_iter:
            # lookup_users accepts at most 100 profiles per call
            n: int = int(np.ceil(len(user_ids) / max_iter))
            data_chunks: list = chunks_from_list(data_ls=user_ids, n=n)
            for data in data_chunks:
                response: ResultSet = api.lookup_users(user_ids=data)
                users_data.append(response)
        else:
            response: ResultSet = api.lookup_users(user_ids=user_ids)
            users_data.append(response)
    except Exception as e:
        logger.error(e)
    return users_data

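# Minimal sketch of the chunks_from_list helper assumed by the two lookup
# functions above (the real implementation is not shown here): it splits
# data_ls into n roughly equal chunks so each lookup_users call stays within
# the 100-profile API limit.
def chunks_from_list(data_ls: list, n: int) -> list:
    size: int = int(np.ceil(len(data_ls) / n))
    return [data_ls[i:i + size] for i in range(0, len(data_ls), size)]
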
def generate_doc_embedding(document: str, embeddings: list,
                           doc2vec: str = "transformer_roberta"):
    doc_embedding: np.ndarray = np.array([])
    try:
        logger.info("Generating embedding for document .... ")
        # 1. Initialise the document embedding
        # a) Pooling over word embeddings
        if doc2vec == "pool":
            document_embeddings = DocumentPoolEmbeddings(
                embeddings=embeddings)
        # b) Recurrent network over word embeddings
        elif doc2vec == "rnn":
            document_embeddings = DocumentRNNEmbeddings(
                embeddings=embeddings, hidden_size=256, rnn_type='LSTM')
        # c) Transformer-based document embeddings
        elif doc2vec == "transformer_bert":
            document_embeddings = TransformerDocumentEmbeddings(
                'bert-base-multilingual-cased')
        else:
            document_embeddings = TransformerDocumentEmbeddings(
                'roberta-base')
        # 2. Create a Sentence from the document
        sentence: Sentence = Sentence(document)
        # 3. Embed the sentence with the document embedding
        document_embeddings.embed(sentence)
        # 4. Move the embedding to CPU if it lives on a GPU
        if "cuda" in str(flair.device).lower():
            doc_emb_cpu: Tensor = sentence.embedding.cpu()
            # 5. Convert it to a numpy array
            doc_embedding = doc_emb_cpu.detach().numpy()
        else:
            doc_embedding = sentence.get_embedding().detach().numpy()
    except Exception as e:
        logger.error(e)
    return doc_embedding

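# Usage sketch: the "pool" mode is the only one that actually consumes the
# `embeddings` list; the transformer modes ignore it. GloVe is an illustrative
# choice and assumes flair's WordEmbeddings is imported alongside the document
# embedding classes used above.
def embed_with_pooling(document: str) -> np.ndarray:
    word_embeddings: list = [WordEmbeddings('glove')]
    return generate_doc_embedding(document=document,
                                  embeddings=word_embeddings,
                                  doc2vec="pool")
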
def generate_twitter_account_database(user_ids: list, api: API,
                                      extended_info: bool = False):
    user_info: list = []
    try:
        user_ids = __class__.preprocess_user_profile_list(user_ids)
        response: list = __class__.get_user_profiles_by_ids(
            api=api, user_ids=user_ids)
        logger.info("Extract user profile information")
        if not extended_info:
            user_info = [
                __class__.retrieve_user_data(user)
                for result in response for user in result
            ]
        else:
            user_info = [
                __class__.retrieve_extended_user_data(user)
                for result in response for user in result
            ]
    except Exception as e:
        logger.error(e)
    return user_info

def preprocess_dataframe_train(data: pd.DataFrame, popularity_metric: str,
                               boolean_cols: list, drop_num_cols: list,
                               scaler_filename: str, cat_cols: list,
                               default_value: str, link_information: list,
                               document_col: str, target_col: str,
                               join_attr: str = ". "):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        # Save the target variable
        data_target: pd.DataFrame = data[[target_col]]
        # Convert boolean columns to int
        data = FeatureExtraction.convert_bool_to_int(
            data=data, boolean_cols=boolean_cols)
        # Extract the popularity metric
        data[popularity_metric] = data.apply(
            FeatureExtraction.compute_popularity_metric, axis=1)
        df_features: pd.DataFrame = data.copy()
        logger.info("Preprocessing Numerical Columns")
        # Preprocess numerical features
        data_transformed_num: pd.DataFrame = FeatureExtraction.preprocess_numerical_data(
            data=df_features,
            drop_cols=drop_num_cols,
            scaler_filename=scaler_filename)
        logger.info("Preprocessing Categorical Columns")
        # Preprocess categorical features
        data_transformed_cat: pd.DataFrame = FeatureExtraction.preprocess_categorical_data(
            data=df_features,
            cat_cols=cat_cols,
            default_value=default_value,
            link_information=link_information,
            document_col=document_col,
            join_attr=join_attr)
        logger.info("Concatenating Transformed Columns")
        # Concatenate the transformed DataFrames with the target
        data_transformed = pd.concat(
            [data_transformed_num, data_transformed_cat, data_target],
            axis=1,
            sort=False)
    except Exception as e:
        logger.error(e)
    return data_transformed

def on_status(self, status: Status):
    logger.info(f"1. Loading Status with ID {status.id}")
    # 1. Check whether the status is already in the storage
    non_exists: bool = self.check_data_in_storage(
        entity_id=status.id,
        storage=self.storage,
        collection_name=self.collection_names.get("status"),
        identifier_key="id")
    if non_exists:
        # 2. Process the Tweet and its User
        logger.info(f"2. Pre-processing Status with ID {status.id}")
        data: TwitterDataOutput = self.process_status(
            status=status,
            add_sentiment=self.add_sentiment,
            add_bot_analysis=self.add_bot_analysis)
        # 3. Store the data
        logger.info(f"3. Storing Status with ID {status.id} "
                    f"in {self.storage.title()}")
        self.storage_data(data=data,
                          collection_names=self.collection_names,
                          storage=self.storage,
                          mongodb_connector=self.mongodb_connector,
                          elasticsearch_connector=self.elasticsearch_connector,
                          identifier_key=self.identifier_key)

def set_up_flair_cpu_device():
    # flair.device is expected to be a torch.device rather than a plain
    # string (requires `import torch`)
    flair.device = torch.device("cpu")
    logger.info(f"Flair device: {flair.device}")

def perform_source_credibility_analysis(
        self, publishers: list, authors: list, article_url: str,
        elasticsearch_connector: ElasticsearchConnector,
        source_rank: SourceRank) -> dict:
    response: dict = {"message": http_response_500, "code": 500}
    try:
        # ====================================================================
        # Publisher Analysis
        # ====================================================================
        output_pub_trustworthiness: TrustworthinessDoc = TrustworthinessDoc(
            trustworthiness=normalize_value(),
            relevance=normalize_value(mu=0.1, sigma=0.05))
        for publisher in publishers:
            logger.info(f"Analysing Publisher {publisher.get('identifier')}")
            # 1. Check whether the publisher already has stored features;
            # static features are only computed the first time
            non_exist: bool = elasticsearch_connector.check_document_in_index_by_id(
                index=org_es_index_features,
                uuid=publisher.get("identifier"))
            analyse_static: bool = non_exist
            # 2. Compute the publisher features
            publisher_features: PublisherFeaturesDoc = self.get_publisher_features(
                elasticsearch_connector=elasticsearch_connector,
                source_rank=source_rank,
                publisher=publisher,
                article_url=article_url,
                analyse_static=analyse_static)
            # 3. Update the features in Elasticsearch
            if analyse_static:
                elasticsearch_connector.bulk_data_into_index(
                    index=org_es_index_features,
                    uuid=publisher.get("identifier"),
                    source_data=publisher_features.__dict__)
            else:
                # Update only the non-static features
                params = {
                    "text_rank_analysis":
                        publisher_features.text_rank_analysis,
                    "anonymous_rank_analysis":
                        publisher_features.anonymous_rank_analysis,
                    "last_updated": publisher_features.last_updated
                }
                body = {"doc": params}
                elasticsearch_connector.update_fields_to_index(
                    index=org_es_index_features,
                    uuid=publisher.get("identifier"),
                    body=body)
            # 4. Compute trustworthiness & relevance
            publisher_features_dict = {
                "open_rank": publisher_features.open_rank_analysis.get("rank"),
                "suffix_rank": publisher_features.suffix_rank_analysis.get("rank"),
                "category_rank": publisher_features.category_rank_analysis.get("rank"),
                "twitter_rank": publisher_features.twitter_rank_analysis.get("rank"),
                "whois_rank": publisher_features.whois_rank_analysis.get("rank"),
                "text_rank": publisher_features.text_rank_analysis.get("rank"),
                "anonymous_rank": publisher_features.anonymous_rank_analysis.get("rank")
            }
            output_pub_trustworthiness = self.get_trustworthiness_from_features(
                features=publisher_features_dict,
                metrics=self.get_publisher_metrics_importances(),
                total_articles=publisher_features.text_rank_analysis.get(
                    "total_articles"))
            # 5. Update the publisher scores in Elasticsearch
            params = {
                score_key: 100 * output_pub_trustworthiness.trustworthiness,
                "relevance": output_pub_trustworthiness.relevance,
                "status": "done"
            }
            body = {"doc": params}
            elasticsearch_connector.update_fields_to_index(
                index=org_es_index,
                uuid=publisher.get("identifier"),
                body=body)
        # ====================================================================
        # Author Analysis
        # ====================================================================
        publisher_rank_analysis: dict = {
            "rank": output_pub_trustworthiness.trustworthiness,
            "relevance": output_pub_trustworthiness.relevance
        }
        for author in authors:
            logger.info(f"Analysing Author {author.get('identifier')}")
            # 1. Compute the author features
            author_features: AuthorFeatureDoc = self.get_author_features(
                elasticsearch_connector=elasticsearch_connector,
                author=author,
                publisher_rank_analysis=publisher_rank_analysis)
            # 2. Compute trustworthiness & relevance
            author_features_dict = {
                "text_rank": author_features.text_rank_analysis.get("rank"),
                "publisher_rank": publisher_rank_analysis.get("rank")
            }
            output_aut_trustworthiness: TrustworthinessDoc = self.get_trustworthiness_from_features(
                features=author_features_dict,
                metrics=self.get_author_metrics_importances(),
                total_articles=author_features.text_rank_analysis.get(
                    "total_articles"))
            # 3. Verify whether the author exists in fdg-person-features
            non_exist_auth: bool = elasticsearch_connector.check_document_in_index_by_id(
                index=auth_es_index_features,
                uuid=author.get("identifier"))
            if non_exist_auth:
                # Generate a new entry
                elasticsearch_connector.bulk_data_into_index(
                    index=auth_es_index_features,
                    uuid=author.get("identifier"),
                    source_data=author_features.__dict__)
            else:
                # Update the stored features
                body = {"doc": author_features.__dict__}
                elasticsearch_connector.update_fields_to_index(
                    index=auth_es_index_features,
                    uuid=author.get("identifier"),
                    body=body)
            # 4. Update the author scores in Elasticsearch
            params = {
                score_key: 100 * output_aut_trustworthiness.trustworthiness,
                "relevance": output_aut_trustworthiness.relevance,
                "status": "done"
            }
            body = {"doc": params}
            elasticsearch_connector.update_fields_to_index(
                index=person_es_index,
                uuid=author.get("identifier"),
                body=body)
        response["message"] = http_response_200
        response["code"] = 200
    except Exception as e:
        logger.error(e)
    return response

def start_kafka_offline_process(self):
    run: bool = True
    while run:
        try:
            # 1. Check whether the consumer was initialised
            if self.kafka_manager.consumer is None:
                self.kafka_manager.init_kafka_consumer()
            # 2. Check whether the producer was initialised
            if self.kafka_manager.producer is None:
                self.kafka_manager.init_kafka_producer()
            # 3. Read messages from Kafka
            for msg in self.kafka_manager.consumer:
                try:
                    # 4. Process the message
                    logger.info('Loading Kafka Message')
                    document: dict = loads(msg.value)
                    # 5. Commit the message offset
                    self.kafka_manager.consumer.commit()
                    # 6. Execute the analysis
                    if document.get("status", 400) == 200:
                        logger.info('Executing Source credibility analysis')
                        response: GraphAnalyzerOutputDoc = self.process_source_credibility_analysis(
                            document=document)
                        # 6.1 Everything went fine
                        if response.status == 200:
                            output_doc: dict = response.__dict__
                            logger.info(
                                'Putting authors/publisher scores into Kafka')
                            self.kafka_manager.put_data_into_topic(
                                data=output_doc)
                            logger.info('Done!')
                # Handle connection errors
                except ConnectionError as er:
                    logger.error(er)
                    sys.exit(1)
                # Handle commit failures
                except CommitFailedError as commit_err:
                    logger.error("Not able to make a commit ... %s",
                                 commit_err)
                    # Restart the Kafka clients and go back to the while loop
                    self.kafka_manager.consumer = None
                    self.kafka_manager.producer = None
                    break
                # Handle any other exception
                except Exception as e:
                    logger.error(e)
                    # Commit and continue with the next message
                    self.kafka_manager.consumer.commit()
                    continue
        # Handle while-loop exceptions
        except ConnectionError as er:
            logger.error(er)
            sys.exit(1)
        except Exception as e:
            logger.warning(e)
            self.kafka_manager.consumer = None
            self.kafka_manager.producer = None