def __init__(self, identifier=None, name=None, url=None, country=None,
             nationality=None, bias="", parent_organization="",
             status="inProgress", trustworthiness=None, relevance=None):
    """Store the supplied field values on the instance.

    Args:
        identifier: Stored unchanged on ``self.identifier``.
        name: Stored unchanged on ``self.name``.
        url: A single value or a list. For a list only the first element
            is kept; an empty list is stored as ``None``.
        country: Stored unchanged on ``self.country``.
        nationality: Stored unchanged on ``self.nationality``.
        bias: Stored unchanged on ``self.bias``.
        parent_organization: Stored unchanged on
            ``self.parent_organization``.
        status: Stored unchanged on ``self.status``.
        trustworthiness: Value to store; when ``None`` the result of
            ``normalize_value()`` is stored instead.
        relevance: Value to store; when ``None`` the value ``1`` is stored.
    """
    self.identifier = identifier
    self.name = name

    # Only the first URL of a list is kept. An empty list is treated as
    # "no URL" rather than letting url[0] raise IndexError.
    if isinstance(url, list):
        self.url = url[0] if url else None
    else:
        self.url = url

    self.country = country
    self.nationality = nationality
    self.bias = bias
    self.parent_organization = parent_organization
    self.status = status

    # Explicit `is not None` checks so that falsy-but-valid values such as
    # 0 or 0.0 passed by the caller are preserved.
    if trustworthiness is not None:
        self.trustworthiness = trustworthiness
    else:
        self.trustworthiness = normalize_value()

    if relevance is not None:
        self.relevance = relevance
    else:
        self.relevance = 1
def __init__(self, identifier=None, name=None, affiliation=None, url=None,
             nationality="", bias="", job_title="", gender="",
             status="inProgress", trustworthiness=None, relevance=None):
    """Store the supplied field values on the instance.

    Args:
        identifier: Stored unchanged on ``self.identifier``.
        name: Stored unchanged on ``self.name``.
        affiliation: A single value or a list. For a list only the first
            element is kept; an empty list is stored as ``None``.
        url: Stored unchanged on ``self.url``.
        nationality: Stored unchanged on ``self.nationality``.
        bias: Stored unchanged on ``self.bias``.
        job_title: Stored unchanged on ``self.job_title``.
        gender: Stored unchanged on ``self.gender``.
        status: Stored unchanged on ``self.status``.
        trustworthiness: Value to store; when ``None`` the result of
            ``normalize_value()`` is stored instead.
        relevance: Value to store; when ``None`` the value ``1`` is stored.
    """
    self.identifier = identifier
    self.name = name

    # Only the first affiliation of a list is kept. An empty list is
    # treated as "no affiliation" rather than letting affiliation[0]
    # raise IndexError.
    if isinstance(affiliation, list):
        self.affiliation = affiliation[0] if affiliation else None
    else:
        self.affiliation = affiliation

    self.url = url
    self.nationality = nationality
    self.bias = bias
    self.job_title = job_title
    self.gender = gender
    self.status = status

    # Explicit `is not None` checks so that falsy-but-valid values such as
    # 0 or 0.0 passed by the caller are preserved.
    if trustworthiness is not None:
        self.trustworthiness = trustworthiness
    else:
        self.trustworthiness = normalize_value()

    if relevance is not None:
        self.relevance = relevance
    else:
        self.relevance = 1
def get_trustworthiness_from_features(
        features: dict, metrics: dict,
        total_articles: int) -> TrustworthinessDoc:
    """Combine per-metric ranks into a trustworthiness score and relevance.

    The score is the average of the feature values weighted by
    ``compute_importance_from_exponential_distribution(x=importance)`` for
    each entry of ``metrics``. The relevance comes from
    ``relevance_mapping(x=total_articles)``.

    Args:
        features: Maps metric name to its rank value. A metric that is
            missing, or present with a ``None`` value, contributes ``0.0``.
        metrics: Maps metric name to the importance passed to
            ``compute_importance_from_exponential_distribution``.
        total_articles: Article count passed to ``relevance_mapping``.

    Returns:
        A ``TrustworthinessDoc``. On success its ``trustworthiness`` and
        ``relevance`` are rounded to 3 decimals, ``analysed`` is ``True``
        and ``error`` is ``False``. If the weights sum to zero or any
        exception occurs, the problem is logged and the document keeps the
        ``normalize_value`` fallbacks it was created with.
    """
    output_trustworthiness: TrustworthinessDoc = TrustworthinessDoc(
        trustworthiness=normalize_value(),
        relevance=normalize_value(mu=0.1, sigma=0.05))
    try:
        # =====================================================
        # Compute Final trustworthiness
        # =====================================================
        trustworthiness: float = 0.0
        weights_sum: float = 0.0
        for metric, importance in metrics.items():
            # 1. Retrieve values. dict.get's default only covers a missing
            # key, so a key stored with None is mapped to 0.0 explicitly.
            rank_value = features.get(metric)
            if rank_value is None:
                rank_value = 0.0
            importance_value: float = (
                compute_importance_from_exponential_distribution(
                    x=importance))

            # 2. Update score
            trustworthiness += rank_value * importance_value
            weights_sum += importance_value

        # Without any weight there is nothing to average; keep the
        # fallback document instead of relying on ZeroDivisionError.
        if weights_sum == 0:
            logger.error(
                "Cannot compute trustworthiness: metric weights sum to zero")
            return output_trustworthiness

        # 3. Relevance
        relevance: float = relevance_mapping(x=total_articles)
        if relevance == 0:
            relevance = normalize_value(mu=0.1, sigma=0.05)

        output_trustworthiness.trustworthiness = round(
            trustworthiness / weights_sum, 3)
        output_trustworthiness.relevance = round(relevance, 3)
        output_trustworthiness.analysed = True
        output_trustworthiness.error = False
    except Exception as e:
        # Best effort: callers always receive a document.
        logger.error(e)
    return output_trustworthiness
def get_anonymous_rank(elasticsearch_connector: ElasticsearchConnector,
                       index: str, authors_index: str, field: str,
                       query: str, anonymous_key: str) -> dict:
    """Rate an entity by the share of its authors that are not anonymous.

    Articles matching ``query`` on ``field`` are fetched from ``index``,
    their author ids are de-duplicated, and the matching author documents
    are fetched from ``authors_index``. The rank is
    ``1 - anonymous_authors / total_authors`` rounded to 3 decimals, where
    an author is anonymous when its ``name`` equals ``anonymous_key``.

    Args:
        elasticsearch_connector: Connector used for both queries.
        index: Index holding the articles.
        authors_index: Index holding the author documents.
        field: Article field the query is matched against.
        query: Value to match.
        anonymous_key: Author name that marks an anonymous author.

    Returns:
        A dict with keys ``"rank"`` and ``"total_authors"``. When no
        article matches, ``"rank"`` is
        ``normalize_value(mu=0.1, sigma=0.05)``. When the matched articles
        list no authors, or any exception occurs (which is logged), the
        initial ``{"rank": 0.0, "total_authors": 0}`` is returned.
    """
    anonymous_rank_analysis: dict = {"rank": 0.0, "total_authors": 0}
    try:
        # 1. Retrieve all articles associated to the entity
        response: Response = (
            elasticsearch_connector.search_data_from_elasticsearch_by_matching(
                index=index, fields=[field], query=query))
        response_dct: dict = response.to_dict()

        if response_dct.get("hits").get("total").get("value") > 0:
            # 2. Count authors. An article without an "authors" value is
            # treated as having none, so one incomplete document does not
            # abort the whole analysis.
            authors: list = [
                hit.get("_source").get("authors") or []
                for hit in response_dct.get("hits").get("hits")
            ]
            authors_uuids: list = list(
                set(itertools.chain.from_iterable(authors)))
            total_authors: int = len(authors_uuids)

            # No authors at all: the ratio is undefined. Return the
            # initial result (what the division-by-zero path produced
            # before) without issuing a second query.
            if total_authors == 0:
                return anonymous_rank_analysis

            # 3. Count how many of them are anonymous
            total_anonymous: int = 0
            response_authors: Response = (
                elasticsearch_connector.filter_data_from_index_by_uuid(
                    index=authors_index, uuids=authors_uuids))
            response_authors_dct: dict = response_authors.to_dict()
            if response_authors_dct.get("hits").get("total").get(
                    "value") > 0:
                total_anonymous = sum(
                    1
                    for hit in response_authors_dct.get("hits").get("hits")
                    if hit.get("_source").get("name") == anonymous_key)

            anonymous_rank_analysis["rank"] = round(
                1 - (total_anonymous / total_authors), 3)
            anonymous_rank_analysis["total_authors"] = total_authors
        else:
            # No article found for the entity: use the fallback value.
            anonymous_rank_analysis["rank"] = normalize_value(
                mu=0.1, sigma=0.05)
    except Exception as e:
        # Best effort: callers always receive a result dict.
        logger.error(e)
    return anonymous_rank_analysis
def get_text_rank(elasticsearch_connector: ElasticsearchConnector, query: str,
                  field: str, es_art_index: str, es_score_index) -> dict:
    """Average the relevance-weighted text scores of an entity's articles.

    Articles matching ``query`` on ``field`` are fetched from
    ``es_art_index``; their ids are then looked up in ``es_score_index``.
    For every score document the product ``textScore * relevance`` is
    taken, and the rank is the sum of those products divided by the number
    of scored documents, rounded to 3 decimals.

    Args:
        elasticsearch_connector: Connector used for both queries.
        query: Value to match.
        field: Article field the query is matched against.
        es_art_index: Index holding the articles.
        es_score_index: Index holding the per-article scores.

    Returns:
        A dict with keys ``"rank"`` and ``"total_articles"`` (the number of
        article hits returned by the first query). When no usable score is
        available, ``"rank"`` is ``normalize_value(mu=0.1, sigma=0.05)``.
        If an exception occurs it is logged and the initial
        ``{"rank": 0.0, "total_articles": 0}`` is returned.
    """
    text_rank_analysis: dict = {"rank": 0.0, "total_articles": 0}
    try:
        # 1. Retrieve all articles associated to the entity
        response: Response = (
            elasticsearch_connector.search_data_from_elasticsearch_by_matching(
                index=es_art_index, fields=[field], query=query))
        response_dct: dict = response.to_dict()

        # Fallback values used whenever no score can be computed.
        text_rank: float = normalize_value(mu=0.1, sigma=0.05)
        total_articles: int = 0

        if response_dct.get("hits").get("total").get("value") > 0:
            articles_uuids: list = [
                hit.get("_id")
                for hit in response_dct.get("hits").get("hits")
            ]
            total_articles = len(articles_uuids)

            # 2. For each article id, retrieve score if available
            response_articles: Response = (
                elasticsearch_connector.filter_data_from_index_by_uuid(
                    index=es_score_index, uuids=articles_uuids))
            response_articles_dct: dict = response_articles.to_dict()
            if response_articles_dct.get("hits").get("total").get(
                    "value") > 0:
                text_scores: list = []
                for hit in response_articles_dct.get("hits").get("hits"):
                    source: dict = hit.get("_source") or {}
                    text_score = source.get("textScore")
                    relevance = source.get("relevance")
                    # Skip incomplete score documents instead of letting a
                    # single None abort the whole computation.
                    if text_score is None or relevance is None:
                        continue
                    text_scores.append(np.multiply(text_score, relevance))

                # 3. Average the relevance-weighted scores. The guard keeps
                # the fallback rank when no hit was usable and avoids a
                # division by zero.
                if text_scores:
                    text_rank = round(
                        float(sum(text_scores) / len(text_scores)), 3)

        text_rank_analysis["rank"] = text_rank
        text_rank_analysis["total_articles"] = total_articles
    except Exception as e:
        # Best effort: callers always receive a result dict.
        logger.error(e)
    return text_rank_analysis
def perform_source_credibility_analysis(
        self, publishers: list, authors: list, article_url: str,
        elasticsearch_connector: ElasticsearchConnector,
        source_rank: SourceRank) -> dict:
    """Score every publisher and author of an article and persist the results.

    For each publisher the features are computed and written to
    ``org_es_index_features`` (a full document when none exists yet,
    otherwise only the text-rank, anonymous-rank and ``last_updated``
    fields), then the trustworthiness and relevance are written to
    ``org_es_index``. Each author is then scored using the trustworthiness
    document of the last publisher processed (or the initial fallback
    document when ``publishers`` is empty); features go to
    ``auth_es_index_features`` and scores to ``person_es_index``.

    Args:
        publishers: Publisher mappings; each is read via
            ``.get("identifier")``.
        authors: Author mappings; each is read via ``.get("identifier")``.
        article_url: Forwarded to ``self.get_publisher_features``.
        elasticsearch_connector: Connector used for all index operations.
        source_rank: Forwarded to ``self.get_publisher_features``.

    Returns:
        ``{"message": http_response_200, "code": 200}`` when everything
        completed, otherwise the initial
        ``{"message": http_response_500, "code": 500}``. Exceptions are
        logged and not propagated.
    """
    response: dict = {"message": http_response_500, "code": 500}
    try:
        # ----------------------------------------------------------------
        # Publishers
        # ----------------------------------------------------------------
        # Fallback document, used for the authors if there is no publisher.
        output_pub_trustworthiness: TrustworthinessDoc = TrustworthinessDoc(
            trustworthiness=normalize_value(),
            relevance=normalize_value(mu=0.1, sigma=0.05))

        for publisher in publishers:
            logger.info(
                f"Analysing Publisher {publisher.get('identifier')}")
            publisher_id = publisher.get("identifier")

            # Static features are only analysed when the check reports
            # that the publisher has no features document yet.
            analyse_static: bool = bool(
                elasticsearch_connector.check_document_in_index_by_id(
                    index=org_es_index_features, uuid=publisher_id))

            publisher_features: PublisherFeaturesDoc = (
                self.get_publisher_features(
                    elasticsearch_connector=elasticsearch_connector,
                    source_rank=source_rank,
                    publisher=publisher,
                    article_url=article_url,
                    analyse_static=analyse_static))

            # Persist the features: whole document on first sight,
            # otherwise only the non-static fields.
            if analyse_static:
                elasticsearch_connector.bulk_data_into_index(
                    index=org_es_index_features,
                    uuid=publisher_id,
                    source_data=publisher_features.__dict__)
            else:
                elasticsearch_connector.update_fields_to_index(
                    index=org_es_index_features,
                    uuid=publisher_id,
                    body={"doc": {
                        "text_rank_analysis":
                            publisher_features.text_rank_analysis,
                        "anonymous_rank_analysis":
                            publisher_features.anonymous_rank_analysis,
                        "last_updated": publisher_features.last_updated,
                    }})

            # Collect the individual ranks and turn them into a score.
            publisher_ranks = {
                "open_rank":
                    publisher_features.open_rank_analysis.get("rank"),
                "suffix_rank":
                    publisher_features.suffix_rank_analysis.get("rank"),
                "category_rank":
                    publisher_features.category_rank_analysis.get("rank"),
                "twitter_rank":
                    publisher_features.twitter_rank_analysis.get("rank"),
                "whois_rank":
                    publisher_features.whois_rank_analysis.get("rank"),
                "text_rank":
                    publisher_features.text_rank_analysis.get("rank"),
                "anonymous_rank":
                    publisher_features.anonymous_rank_analysis.get("rank"),
            }
            output_pub_trustworthiness = (
                self.get_trustworthiness_from_features(
                    features=publisher_ranks,
                    metrics=self.get_publisher_metrics_importances(),
                    total_articles=publisher_features.text_rank_analysis.get(
                        "total_articles")))

            # Store score, relevance and completion status.
            elasticsearch_connector.update_fields_to_index(
                index=org_es_index,
                uuid=publisher_id,
                body={"doc": {
                    score_key:
                        100 * output_pub_trustworthiness.trustworthiness,
                    "relevance": output_pub_trustworthiness.relevance,
                    "status": "done",
                }})

        # ----------------------------------------------------------------
        # Authors
        # ----------------------------------------------------------------
        publisher_rank_analysis: dict = {
            "rank": output_pub_trustworthiness.trustworthiness,
            "relevance": output_pub_trustworthiness.relevance,
        }

        for author in authors:
            logger.info(f"Analysing Author {author.get('identifier')}")
            author_id = author.get("identifier")

            author_features: AuthorFeatureDoc = self.get_author_features(
                elasticsearch_connector=elasticsearch_connector,
                author=author,
                publisher_rank_analysis=publisher_rank_analysis)

            author_ranks = {
                "text_rank": author_features.text_rank_analysis.get("rank"),
                "publisher_rank": publisher_rank_analysis.get("rank"),
            }
            output_aut_trustworthiness: TrustworthinessDoc = (
                self.get_trustworthiness_from_features(
                    features=author_ranks,
                    metrics=self.get_author_metrics_importances(),
                    total_articles=author_features.text_rank_analysis.get(
                        "total_articles")))

            # Create the features document, or update it if it already
            # exists.
            author_is_new: bool = (
                elasticsearch_connector.check_document_in_index_by_id(
                    index=auth_es_index_features, uuid=author_id))
            if author_is_new:
                elasticsearch_connector.bulk_data_into_index(
                    index=auth_es_index_features,
                    uuid=author_id,
                    source_data=author_features.__dict__)
            else:
                elasticsearch_connector.update_fields_to_index(
                    index=auth_es_index_features,
                    uuid=author_id,
                    body={"doc": author_features.__dict__})

            # Store score, relevance and completion status.
            elasticsearch_connector.update_fields_to_index(
                index=person_es_index,
                uuid=author_id,
                body={"doc": {
                    score_key:
                        100 * output_aut_trustworthiness.trustworthiness,
                    "relevance": output_aut_trustworthiness.relevance,
                    "status": "done",
                }})

        # Reached only when every publisher and author was processed.
        response["message"] = http_response_200
        response["code"] = 200
    except Exception as e:
        logger.error(e)
    return response