def process_analysis(self, publishers: list, authors: list,
                         article_uuid: str, article_url: str,
                         elasticsearch_connector: ElasticsearchConnector,
                         source_rank: SourceRank):
        try:
            # 1. Perform analysis
            response: dict = self.perform_source_credibility_analysis(
                publishers=publishers,
                authors=authors,
                article_url=article_url,
                elasticsearch_connector=elasticsearch_connector,
                source_rank=source_rank)

            if response.get("code") == 200:
                # 2. Call Fusion Score
                logger.info("Calling Fusion Score Service")
                ThreadsProcessor.start_new_streaming_process(
                    thread_name="fusion_score",
                    target_func=self.update_fusion_score,
                    params=(
                        fusion_score_server,
                        fusion_score_port,
                        fusion_score_endpoint,
                        article_uuid,
                    ))
                """response_fusion_score: dict = self.update_fusion_score(
                    server=fusion_score_server,
                    port=fusion_score_port,
                    endpoint=fusion_score_endpoint,
                    article_uuid=article_uuid)"""
        except Exception as e:
            logger.error(e)
Example no. 2
 def remove_all_documents_from_collection(self, collection_name: str):
     try:
         docs: DeleteResult = self.db[collection_name].delete_many({})
         logger.info("%s documents deleted from %s", docs.deleted_count,
                     collection_name)
     except Exception as e:
         logger.error(e)
Example no. 3
 def clean_threads():
     try:
         logger.info("Cleaning all threads")
         # Popping while iterating over the same list skips elements;
         # clearing the list removes every registered thread reference
         global_streaming_threads.clear()
     except Exception as e:
         logger.error(e)
Example no. 4
 def tweet_gathering(api: API,
                     query: str,
                     date_since: str,
                     lang: str = 'en'):
     try:
         logger.info("Retrieving Tweets ... ")
         # Collect tweets
         tweets = Cursor(api.search,
                         lang=lang,
                         q=query,
                         include_entities=True,
                         monitor_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         wait_on_rate_limit=True,
                         result_type="recent",
                         tweet_mode='extended').items()
         while True:
             try:
                 tweet: Status = tweets.next()
                 print(tweet)
                 yield tweet
             except RateLimitError:
                 time.sleep(60 * 15)
                 continue
             except StopIteration:
                 break
     except Exception as e:
         logger.error(e)
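
A hypothetical call to tweet_gathering, assuming it is importable as a plain function: the credential strings and the query are placeholders, and the setup relies on the pre-4.0 tweepy API (api.search, wait_on_rate_limit_notify) used throughout these examples.

from tweepy import API, OAuthHandler

auth = OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder credentials
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# The generator yields tweepy Status objects until the cursor is exhausted
for status in tweet_gathering(api, query="#fakenews", date_since="2020-01-01"):
    print(status.id)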
    def scale_data(data: pd.DataFrame, fit: bool = True, filename: str = ""):
        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            if fit:
                scaler: StandardScaler = StandardScaler()
                scaler.fit(data)

                prepare_directory(os.sep.join(filename.split(os.sep)[0:-1]))
                # Save scaler
                logger.info("Saving scaler object at %s", filename)

                FeatureExtraction.save_scaler_object(scaler_obj=scaler,
                                                     filename=filename)
            else:

                scaler: StandardScaler = FeatureExtraction.load_scaler_object(
                    filename=filename)

            # Transform data
            res_transformed: np.ndarray = scaler.transform(data)
            data_transformed: pd.DataFrame = pd.DataFrame(res_transformed,
                                                          columns=data.columns)
        except Exception as e:
            logger.error(e)
        return data_transformed
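
A minimal usage sketch for scale_data, assuming it is exposed as a static helper on the FeatureExtraction class it already calls into; the column names and the scaler path are placeholders.

import pandas as pd

df = pd.DataFrame({"followers": [10, 250, 4000], "friends": [5, 120, 900]})

# Fit a new StandardScaler and persist it to the (placeholder) path
train_scaled = FeatureExtraction.scale_data(df, fit=True,
                                            filename="models/scaler.pkl")

# Reuse the stored scaler on new data without refitting
new_scaled = FeatureExtraction.scale_data(df, fit=False,
                                          filename="models/scaler.pkl")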
Example no. 6
 def establish_client_connection(self):
     try:
         self.db_client: MongoClient = MongoClient(self.db_host,
                                                   self.db_port)
         logger.info("Connection established with MongoDB at {}:{}".format(
             self.db_host, self.db_port))
     except Exception as e:
         logger.error(e)
 def remove_index(self, index):
     res = True
     try:
         logger.info('Removing index %s from Elasticsearch', str(index))
         self.es.indices.delete(index=index, ignore=[400, 404])
     except Exception as e:
         logger.error(e)
         res = False
     return res
Example no. 8
 def check_dbManager(self):
     """
     Checks if dbManager has started. If not, initialize dbManager.
     """
     try:
         if self.db_client is None:
             # If the database does not exist yet
             if not self.check_database_exists():
                 logger.info("A new DB has been created")
             self.establish_client_connection()
             self.db = self.db_client[self.db_name]
     except Exception as e:
         logger.error(e)
         self.status = 500
Example no. 9
 def set_up_twitter_api_connection(self) -> API:
     api: API = object.__new__(API)
     try:
         logger.info("Connecting to Twitter API ... ")
         auth: OAuthHandler = OAuthHandler(self.consumer_key,
                                           self.consumer_secret)
         auth.set_access_token(self.access_token, self.access_token_secret)
         api: API = API(auth,
                        wait_on_rate_limit=True,
                        wait_on_rate_limit_notify=True)
         self.api: API = api
     except Exception as e:
         logger.error(e)
     return api
Example no. 10
    def storage_data(data: TwitterDataOutput, collection_names: dict,
                     storage: str, mongodb_connector: MongoDBConnector,
                     elasticsearch_connector: ElasticsearchConnector,
                     identifier_key: str):

        if storage == "mongoDB":
            # 1. Insert Status
            non_exist_status: bool = mongodb_connector.verify_documents_in_collection(
                entity=data.status,
                collection_name=collection_names.get("status"),
                identifier_key=identifier_key)
            if non_exist_status:
                mongodb_connector.insert_document_to_collection(
                    document=data.status,
                    collection_name=collection_names.get("status"))

            # 2. Insert User
            non_exist_user: bool = mongodb_connector.verify_documents_in_collection(
                entity=data.user,
                collection_name=collection_names.get("user"),
                identifier_key=identifier_key)
            if non_exist_user:
                mongodb_connector.insert_document_to_collection(
                    document=data.user,
                    collection_name=collection_names.get("user"))

            logger.info("Inserted data into MongoDB with success!")

        elif storage == "elasticsearch":
            # Insert status
            res_status: dict = elasticsearch_connector.bulk_data_into_index(
                index=collection_names.get("status"),
                uuid=data.status.get("uuid"),
                source_data=data.status)

            # Insert User
            res_user: dict = elasticsearch_connector.bulk_data_into_index(
                index=collection_names.get("user"),
                uuid=data.user.get("uuid"),
                source_data=data.user)

            if res_status and res_user:
                logger.info("Inserted data into ES with success!")
        else:
            raise ValueError("JSON Storage Not Implemented yet")
 def extract_data_from_elasticsearch(self,
                                     index,
                                     query,
                                     scroll="1m",
                                     size=10000):
     data = None
     try:
         logger.info('Extracting data from index %s of Elasticsearch ...',
                     index)
         items = self.scroll_data_from_elasticsearch(index=index,
                                                     body=query,
                                                     scroll=scroll,
                                                     size=size)
         if len(items) > 0:
             data = self.parse_data(items, index)
     except Exception as e:
         logger.error(e)
     return data
Example no. 12
 def get_user_profiles_by_screen_names(api: API, screen_names: list):
     max_iter: int = 100
     users_data: list = []
     try:
         logger.info("Retrieve user profiles from Twitter API ... ")
         if len(screen_names) > max_iter:
             n: int = int(np.ceil(len(screen_names) / max_iter))
             data_chunks: list = chunks_from_list(data_ls=screen_names, n=n)
             for i, data in enumerate(data_chunks):
                 response: ResultSet = api.lookup_users(screen_names=data)
                 users_data.append(response)
         else:
             response: ResultSet = api.lookup_users(
                 screen_names=screen_names)
             users_data.append(response)
     except Exception as e:
         logger.error(e)
     return users_data
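
chunks_from_list is used above but not shown; a plausible sketch, under the assumption that n is the number of roughly equal chunks implied by n = ceil(len(screen_names) / max_iter), could be:

import numpy as np

def chunks_from_list(data_ls: list, n: int) -> list:
    # Assumed behaviour: split data_ls into n roughly equal chunks so each
    # chunk stays within the 100-item limit accepted by api.lookup_users
    return [list(chunk) for chunk in np.array_split(data_ls, n)]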
 def connect(self):
     try:
         self.es = Elasticsearch([{
             'host': self.host,
             'port': self.port
         }],
                                 timeout=1000)
         if self.es.ping(request_timeout=1):
             self.connection: bool = True
             logger.info('Connected to ElasticSearch at \'%s:%s\'.',
                         self.host, self.port)
         else:
             self.connection: bool = False
             logger.info('It was not possible to connect to \'%s:%s\'.',
                         self.host, self.port)
     except Exception as e:
         logger.error(e)
     return self
Example no. 14
    def run_twitter_searching(self):
        try:
            # 1. Set up connection
            if self.twitter_connector.api is None:
                self.twitter_connector.set_up_twitter_api_connection()

            # 2. Run searching
            logger.info(f"Loading files from {self.local_storage}")
            data_tweets_ids: iter = read_all_files_from_local_storage(
                local_storage_path=self.local_storage, column_index=0)

            # 3. For each Tweet ID
            for tweet_id in data_tweets_ids:
                logger.info(f"1. Loading Status with ID {tweet_id}")

                # 3.1 Check if the data is already in the storage

                non_exists: bool = self.check_data_in_storage(
                    entity_id=tweet_id,
                    storage=self.dest_storage,
                    collection_name=self.collection_names.get("status"),
                    identifier_key="id")

                if non_exists:
                    logger.info(f"2. Pre-processing Status with ID {tweet_id}")
                    # 3.2 Retrieve Status
                    tweet_status: Status = TwitterConnector.get_tweet_data_from_tweet_id(
                        api=self.twitter_connector.api, tweet_id=tweet_id)

                    # If there is data
                    if tweet_status is not None:
                        # 3.3 Generate Output
                        output: TwitterDataOutput = TwitterDataProcessor.process_twitter_data(
                            tweet_status,
                            add_sentiment=self.add_sentiment,
                            add_bot_analysis=self.add_bot_analysis,
                            twitter_credentials=self.get_twitter_credentials())

                        # Only insert if both elements are available
                        if output.status and output.user:
                            # 3.4 Store data
                            logger.info(
                                f"3. Storing Status with ID {tweet_id} in {self.dest_storage.title()}"
                            )
                            self.storage_data(
                                data=output,
                                collection_names=self.collection_names,
                                storage=self.dest_storage,
                                mongodb_connector=self.mongodb_connector,
                                elasticsearch_connector=self.elasticsearch_connector,
                                identifier_key=self.identifier_key)
        except Exception as e:
            logger.error(e)
Example no. 15
 def get_user_profiles_by_ids(api: API, user_ids: list) -> list:
     max_iter: int = 100
     users_data: list = []
     try:
         if len(user_ids) > max_iter:
             n: int = int(np.ceil(len(user_ids) / max_iter))
             data_chunks: list = chunks_from_list(data_ls=user_ids, n=n)
             logger.info("Retrieve user profiles from Twitter API ... ")
             for i, data in enumerate(data_chunks):
                 partial_user_ids: list = data
                 response: ResultSet = api.lookup_users(
                     user_ids=partial_user_ids)
                 users_data.append(response)
         else:
             response: ResultSet = api.lookup_users(user_ids=user_ids)
             users_data.append(response)
     except Exception as e:
         logger.error(e)
     return users_data
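
Because users_data collects one ResultSet per chunk, callers typically flatten it, just as generate_twitter_account_database does in the example further down. A hypothetical call (the owning class name and the IDs are placeholders):

user_ids = [123456789, 987654321]  # placeholder IDs
pages = TwitterConnector.get_user_profiles_by_ids(api=api, user_ids=user_ids)

# Flatten the per-chunk ResultSet pages into a single list of tweepy User objects
users = [user for page in pages for user in page]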
    def generate_doc_embedding(document: str,
                               embeddings: list,
                               doc2vec="transformer_roberta"):
        doc_embedding: np.ndarray = np.array([])
        try:
            logger.info("Generating embedding for document .... ")
            # 1. Initialise Document Embedding

            # a) Pooling
            if doc2vec == "pool":
                document_embeddings: DocumentPoolEmbeddings = DocumentPoolEmbeddings(
                    embeddings=embeddings)
            elif doc2vec == "rnn":
                document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
                    embeddings=embeddings, hidden_size=256, rnn_type='LSTM')

            # b) Transformer
            elif doc2vec == "transformer_bert":
                document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings(
                    'bert-base-multilingual-cased')
            else:
                document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings(
                    'roberta-base')

            # 2. Create an example sentence
            sentence: Sentence = Sentence(document)

            # 3. Embed the sentence with our document embedding
            document_embeddings.embed(sentence)

            # 4. Save embedding into CPU
            if "cuda" in str(flair.device).lower():
                doc_emb_cpu: Tensor = sentence.embedding.cpu()
                # 5. Convert to numpy array
                doc_embedding: np.ndarray = doc_emb_cpu.detach().numpy()
            else:
                doc_embedding: np.ndarray = sentence.get_embedding().detach(
                ).numpy()
        except Exception as e:
            logger.error(e)
        return doc_embedding
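
One way to call generate_doc_embedding, sketched under a couple of assumptions: the owning class name is taken to be FeatureExtraction (as with the other preprocessing helpers), and GloVe is just an example token embedding, since the embeddings list only matters for the 'pool' and 'rnn' modes and is ignored by the transformer modes.

from flair.embeddings import WordEmbeddings

glove = WordEmbeddings("glove")  # token embeddings for the pooled document model

vector = FeatureExtraction.generate_doc_embedding(
    document="Sample headline about source credibility.",
    embeddings=[glove],
    doc2vec="pool")
print(vector.shape)  # 1-D numpy array; empty if the embedding step failed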
Example no. 17
 def generate_twitter_account_database(user_ids: list,
                                       api: API,
                                       extended_info: bool = False):
     user_info: list = []
     try:
         user_ids: list = __class__.preprocess_user_profile_list(user_ids)
         response: list = __class__.get_user_profiles_by_ids(
             api=api, user_ids=user_ids)
         logger.info("Extract user profile information")
         if not extended_info:
             user_info: list = [
                 __class__.retrieve_user_data(user) for result in response
                 for user in result
             ]
         else:
             user_info: list = [
                 __class__.retrieve_extended_user_data(user)
                 for result in response for user in result
             ]
     except Exception as e:
         logger.error(e)
     return user_info
    def preprocess_dataframe_train(data: pd.DataFrame,
                                   popularity_metric: str,
                                   boolean_cols: list,
                                   drop_num_cols: list,
                                   scaler_filename: str,
                                   cat_cols: list,
                                   default_value: str,
                                   link_information: list,
                                   document_col: str,
                                   target_col: str,
                                   join_attr: str = ". "):

        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            # Save target variable
            data_target: pd.DataFrame = data[[target_col]]

            # Convert to int boolean
            data: pd.DataFrame = FeatureExtraction.convert_bool_to_int(
                data=data, boolean_cols=boolean_cols)

            # Extract popularity metric
            data[popularity_metric] = data.apply(
                FeatureExtraction.compute_popularity_metric, axis=1)

            df_features: pd.DataFrame = data.copy()

            logger.info("Preprocessing Numerical Columns")
            # Preprocess Numerical Features
            data_transformed_num: pd.DataFrame = FeatureExtraction.preprocess_numerical_data(
                data=df_features,
                drop_cols=drop_num_cols,
                scaler_filename=scaler_filename)

            logger.info("Preprocessing Categorical Columns")
            # Preprocess Categorical Features
            data_transformed_cat: pd.DataFrame = FeatureExtraction.preprocess_categorical_data(
                data=df_features,
                cat_cols=cat_cols,
                default_value=default_value,
                link_information=link_information,
                document_col=document_col,
                join_attr=join_attr)

            logger.info("Concatenating Transformed Columns")
            # Concatenate DataFrames
            data_transformed: pd.DataFrame = pd.concat(
                [data_transformed_num, data_transformed_cat, data_target],
                axis=1,
                sort=False)

        except Exception as e:
            logger.error(e)
        return data_transformed
Example no. 19
    def on_status(self, status: Status):

        logger.info(f"1. Loading Status with ID {status.id}")

        # 1. Check whether the status is already in the storage
        non_exists: bool = self.check_data_in_storage(
            entity_id=status.id,
            storage=self.storage,
            collection_name=self.collection_names.get("status"),
            identifier_key="id")

        if non_exists:
            # 2. Process Tweet and User
            logger.info(f"2. Pre-processing Status with ID {status.id}")
            data: TwitterDataOutput = self.process_status(
                status=status, add_sentiment=self.add_sentiment,
                add_bot_analysis=self.add_bot_analysis)

            logger.info(f"3. Storing Status with ID {status.id} in {self.storage.title()}")
            # 3. Store data
            self.storage_data(data=data, collection_names=self.collection_names,
                              storage=self.storage, mongodb_connector=self.mongodb_connector,
                              elasticsearch_connector=self.elasticsearch_connector,
                              identifier_key=self.identifier_key)
 def set_up_flair_cpu_device():
     flair.device = "cpu"
     logger.info(f"Flair device: {flair.device}")
    def perform_source_credibility_analysis(
            self, publishers: list, authors: list, article_url: str,
            elasticsearch_connector: ElasticsearchConnector,
            source_rank: SourceRank) -> dict:
        response: dict = {"message": http_response_500, "code": 500}
        try:
            # ====================================================================
            # Publisher Analysis
            # ====================================================================
            output_pub_trustworthiness: TrustworthinessDoc = TrustworthinessDoc(
                trustworthiness=normalize_value(),
                relevance=normalize_value(mu=0.1, sigma=0.05))
            for publisher in publishers:
                logger.info(
                    f"Analysing Publisher {publisher.get('identifier')}")

                # 1. Check if the publisher contains data
                non_exist: bool = elasticsearch_connector.check_document_in_index_by_id(
                    index=org_es_index_features,
                    uuid=publisher.get("identifier"))

                # Run the full static analysis only when the publisher has no stored features
                analyse_static: bool = non_exist

                # 4. Compute publisher features
                publisher_features: PublisherFeaturesDoc = self.get_publisher_features(
                    elasticsearch_connector=elasticsearch_connector,
                    source_rank=source_rank,
                    publisher=publisher,
                    article_url=article_url,
                    analyse_static=analyse_static)

                # 5. Update features in Elasticsearch
                if analyse_static:
                    elasticsearch_connector.bulk_data_into_index(
                        index=org_es_index_features,
                        uuid=publisher.get("identifier"),
                        source_data=publisher_features.__dict__)
                else:
                    # Update only non-static features
                    params = {
                        "text_rank_analysis":
                        publisher_features.text_rank_analysis,
                        "anonymous_rank_analysis":
                        publisher_features.anonymous_rank_analysis,
                        "last_updated": publisher_features.last_updated
                    }
                    body = {"doc": params}
                    elasticsearch_connector.update_fields_to_index(
                        index=org_es_index_features,
                        uuid=publisher.get("identifier"),
                        body=body)

                # 6. Compute Trustworthiness & relevance
                publisher_features_dict = {
                    "open_rank":
                    publisher_features.open_rank_analysis.get("rank"),
                    "suffix_rank":
                    publisher_features.suffix_rank_analysis.get("rank"),
                    "category_rank":
                    publisher_features.category_rank_analysis.get("rank"),
                    "twitter_rank":
                    publisher_features.twitter_rank_analysis.get("rank"),
                    "whois_rank":
                    publisher_features.whois_rank_analysis.get("rank"),
                    "text_rank":
                    publisher_features.text_rank_analysis.get("rank"),
                    "anonymous_rank":
                    publisher_features.anonymous_rank_analysis.get("rank")
                }
                output_pub_trustworthiness: TrustworthinessDoc = self.get_trustworthiness_from_features(
                    features=publisher_features_dict,
                    metrics=self.get_publisher_metrics_importances(),
                    total_articles=publisher_features.text_rank_analysis.get(
                        "total_articles"))

                # 7. Update publisher score and relevance in Elasticsearch
                params = {
                    score_key:
                    100 * output_pub_trustworthiness.trustworthiness,
                    "relevance": output_pub_trustworthiness.relevance,
                    "status": "done"
                }
                body = {"doc": params}
                elasticsearch_connector.update_fields_to_index(
                    index=org_es_index,
                    uuid=publisher.get("identifier"),
                    body=body)

            # ====================================================================
            # Author Analysis
            # ====================================================================
            publisher_rank_analysis: dict = {
                "rank": output_pub_trustworthiness.trustworthiness,
                "relevance": output_pub_trustworthiness.relevance
            }
            for author in authors:
                logger.info(f"Analysing Author {author.get('identifier')}")

                # 1. Compute author features
                author_features: AuthorFeatureDoc = self.get_author_features(
                    elasticsearch_connector=elasticsearch_connector,
                    author=author,
                    publisher_rank_analysis=publisher_rank_analysis)

                # 2. Compute Trustworthiness & relevance
                author_features_dict = {
                    "text_rank":
                    author_features.text_rank_analysis.get("rank"),
                    "publisher_rank": publisher_rank_analysis.get("rank")
                }
                output_aut_trustworthiness: TrustworthinessDoc = self.get_trustworthiness_from_features(
                    features=author_features_dict,
                    metrics=self.get_author_metrics_importances(),
                    total_articles=author_features.text_rank_analysis.get(
                        "total_articles"))

                # 3. Verify if author exists in fdg-person-features
                non_exist_auth: bool = elasticsearch_connector.check_document_in_index_by_id(
                    index=auth_es_index_features,
                    uuid=author.get("identifier"))

                if non_exist_auth:
                    # Generate entry
                    elasticsearch_connector.bulk_data_into_index(
                        index=auth_es_index_features,
                        uuid=author.get("identifier"),
                        source_data=author_features.__dict__)
                else:
                    # Update features in Elasticsearch
                    body = {"doc": author_features.__dict__}
                    elasticsearch_connector.update_fields_to_index(
                        index=auth_es_index_features,
                        uuid=author.get("identifier"),
                        body=body)
                # 4. Update scores in Elasticsearch fdg-ap-person-features
                params = {
                    score_key:
                    100 * output_aut_trustworthiness.trustworthiness,
                    "relevance": output_aut_trustworthiness.relevance,
                    "status": "done"
                }
                body = {"doc": params}
                elasticsearch_connector.update_fields_to_index(
                    index=person_es_index,
                    uuid=author.get("identifier"),
                    body=body)
            response["message"]: str = http_response_200
            response["code"]: int = 200
        except Exception as e:
            logger.error(e)
        return response
Example no. 22
    def start_kafka_offline_process(self):
        run: bool = True
        while run:
            try:
                # 1. Check if the consumer was initialised
                if self.kafka_manager.consumer is None:
                    self.kafka_manager.init_kafka_consumer()

                # 2. Check if the producer was initialised
                if self.kafka_manager.producer is None:
                    self.kafka_manager.init_kafka_producer()

                # 3. Read messages from Kafka
                for msg in self.kafka_manager.consumer:
                    try:
                        # 2. Process message
                        logger.info('Loading Kafka Message')
                        document: dict = loads(msg.value)

                        # 3. Commit document
                        self.kafka_manager.consumer.commit()

                        # 4. Execute Analysis
                        if document.get("status", 400) == 200:
                            logger.info(
                                'Executing Source credibility analysis')
                            response: GraphAnalyzerOutputDoc = self.process_source_credibility_analysis(
                                document=document)

                            # 4.1 Everything was fine
                            if response.status == 200:
                                output_doc: dict = response.__dict__
                                logger.info(
                                    'Putting authors/publisher scores into Kafka'
                                )
                                self.kafka_manager.put_data_into_topic(
                                    data=output_doc)
                                logger.info('Done!')

                    # Handle Connection Exception
                    except ConnectionError as er:
                        logger.error(er)
                        sys.exit(1)

                    # Handle CommitFailedError Exception
                    except CommitFailedError as commitErr:
                        logger.error("Not able to make a commit ..." +
                                     str(commitErr))
                        # restart kafka elements and go back to while loop
                        self.kafka_manager.consumer = None
                        self.kafka_manager.producer = None
                        # Go out of the for loop
                        break

                    # Handle any other Exception
                    except Exception as e:
                        logger.error(e)
                        # Perform commit and continue with next message
                        self.kafka_manager.consumer.commit()
                        continue

            # Handle While loop exceptions
            except ConnectionError as er:
                logger.error(er)
                sys.exit(1)
            except Exception as e:
                logger.warning(e)
                self.kafka_manager.consumer = None
                self.kafka_manager.producer = None
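
The loop above leans on a few imports that are not shown; a plausible set, assuming kafka-python is the consumer/producer client in use:

import sys
from json import loads
from kafka.errors import CommitFailedError

# ConnectionError caught above is the Python 3 built-in, so it needs no import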