Example #1
    def storage_data(data: TwitterDataOutput, collection_names: dict,
                     storage: str, mongodb_connector: MongoDBConnector,
                     elasticsearch_connector: ElasticsearchConnector,
                     identifier_key: str):

        if storage == "mongoDB":
            # 1. Insert Status
            non_exist_status: bool = mongodb_connector.verify_documents_in_collection(
                entity=data.status,
                collection_name=collection_names.get("status"),
                identifier_key=identifier_key)
            if non_exist_status:
                mongodb_connector.insert_document_to_collection(
                    document=data.status,
                    collection_name=collection_names.get("status"))

            # 2. Insert User
            non_exist_user: bool = mongodb_connector.verify_documents_in_collection(
                entity=data.user,
                collection_name=collection_names.get("user"),
                identifier_key=identifier_key)
            if non_exist_user:
                mongodb_connector.insert_document_to_collection(
                    document=data.user,
                    collection_name=collection_names.get("user"))

            logger.info("Inserted data into MongoDB with success!")

        elif storage == "elasticsearch":
            # Insert status
            res_status: dict = elasticsearch_connector.bulk_data_into_index(
                index=collection_names.get("status"),
                uuid=data.status.get("uuid"),
                source_data=data.status)

            # Insert User
            res_user: dict = elasticsearch_connector.bulk_data_into_index(
                index=collection_names.get("user"),
                uuid=data.user.get("uuid"),
                source_data=data.user)

            if res_status and res_user:
                logger.info("Inserted data into ES with success!")
        else:
            raise ValueError("JSON Storage Not Implemented yet")
Example #2

    def get_anonymous_rank(elasticsearch_connector: ElasticsearchConnector,
                           index: str, authors_index: str, field: str,
                           query: str, anonymous_key: str) -> dict:
        anonymous_rank_analysis: dict = {"rank": 0.0, "total_authors": 0}
        try:
            # 1. Retrieve all articles associated with the entity
            response: Response = elasticsearch_connector.search_data_from_elasticsearch_by_matching(
                index=index, fields=[field], query=query)
            response_dct: dict = response.to_dict()
            if response_dct.get("hits").get("total").get("value") > 0:

                # 2. Count authors
                authors: list = [
                    i.get("_source").get("authors")
                    for i in response_dct.get("hits").get("hits")
                ]
                authors_uuids: list = list(
                    set(itertools.chain.from_iterable(authors)))
                total_authors: int = len(authors_uuids)
                total_anonymous: int = 0

                # 3. Count how many of them are anonymous
                response_authors: Response = elasticsearch_connector.filter_data_from_index_by_uuid(
                    index=authors_index, uuids=authors_uuids)
                response_authors_dct: dict = response_authors.to_dict()

                if response_authors_dct.get("hits").get("total").get(
                        "value") > 0:
                    total_anonymous: int = len([
                        i for i in response_authors_dct.get("hits").get("hits")
                        if i.get("_source").get("name") == anonymous_key
                    ])

                anonymous_rank: float = round(
                    1 - (total_anonymous / total_authors), 3)
                anonymous_rank_analysis["rank"]: float = anonymous_rank
                anonymous_rank_analysis["total_authors"]: int = total_authors
            else:
                anonymous_rank_analysis["rank"]: float = normalize_value(
                    mu=0.1, sigma=0.05)

        except Exception as e:
            logger.error(e)
        return anonymous_rank_analysis
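For intuition, the rank rewards identified authorship: with 10 distinct author uuids of which 2 resolve to the anonymous key, the result is 0.8. A minimal sketch of the same arithmetic (the counts are made up):

# Illustrative arithmetic only; counts are hypothetical.
total_authors = 10
total_anonymous = 2
anonymous_rank = round(1 - (total_anonymous / total_authors), 3)
assert anonymous_rank == 0.8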
Example #3

    def get_text_rank(elasticsearch_connector: ElasticsearchConnector,
                      query: str, field: str, es_art_index: str,
                      es_score_index: str) -> dict:
        text_rank_analysis: dict = {"rank": 0.0, "total_articles": 0}
        try:
            # 1. Retrieve all articles associated with the entity
            response: Response = elasticsearch_connector.search_data_from_elasticsearch_by_matching(
                index=es_art_index, fields=[field], query=query)

            response_dct: dict = response.to_dict()
            text_rank: float = normalize_value(mu=0.1, sigma=0.05)
            total_articles: int = 0
            if response_dct.get("hits").get("total").get("value") > 0:
                articles_uuids: list = [
                    i.get("_id") for i in response_dct.get("hits").get("hits")
                ]
                total_articles: int = len(articles_uuids)

                # 2. For each article id, retrieve score if available
                response_articles: Response = elasticsearch_connector.filter_data_from_index_by_uuid(
                    index=es_score_index, uuids=articles_uuids)
                response_articles_dct: dict = response_articles.to_dict()

                if response_articles_dct.get("hits").get("total").get(
                        "value") > 0:
                    text_scores: list = [
                        np.multiply(
                            i.get("_source").get("textScore"),
                            i.get("_source").get("relevance"))
                        for i in response_articles_dct.get("hits").get("hits")
                    ]
                    # 3. Compute weighted mean
                    text_rank: float = round(
                        float(sum(text_scores) / len(text_scores)), 3)
            text_rank_analysis["rank"]: float = text_rank
            text_rank_analysis["total_articles"]: int = total_articles
        except Exception as e:
            logger.error(e)
        return text_rank_analysis
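The text rank averages the per-article product textScore * relevance. A minimal sketch with made-up score pairs, mirroring the function's use of np.multiply:

import numpy as np

# Hypothetical (textScore, relevance) pairs for three articles.
pairs = [(0.9, 1.0), (0.6, 0.5), (0.8, 0.8)]
text_scores = [np.multiply(score, relevance) for score, relevance in pairs]
text_rank = round(float(sum(text_scores) / len(text_scores)), 3)
print(text_rank)  # 0.613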
Example #4
    def __init__(self, service: str):
        self.service: str = service

        # Kafka Parameters
        self.topic_consumer: str = topic_consumer
        self.topic_producer: str = topic_producer
        self.group_id: str = group_id
        self.kafka_server: str = kafka_server
        self.enable_auto_commit: bool = False
        self.auto_offset_reset: str = "earliest"
        self.kafka_manager: Optional[KafkaConnector] = None

        # Elasticsearch Parameters
        self.es_port: str = es_port
        self.es_host: str = es_host
        self.elasticsearch_connector: ElasticsearchConnector = ElasticsearchConnector(
            host=self.es_host, port=self.es_port)
        self.source_credibility_connector: SourceCredibilityConnector = SourceCredibilityConnector(
            elasticsearch_connector=self.elasticsearch_connector)
Example #5
 def __init__(self,
              twitter_connector: TwitterConnector,
              db_name: str,
              collection_names: dict,
              local_storage: str,
              dest_storage: str = "mongoDB",
              add_sentiment: bool = True,
              add_bot_analysis: bool = True):
     self.twitter_connector: TwitterConnector = twitter_connector
     self.local_storage: str = local_storage
     self.dest_storage: str = dest_storage
     self.collection_names: dict = collection_names
     self.add_sentiment: bool = add_sentiment
     self.add_bot_analysis: bool = add_bot_analysis
     self.identifier_key: str = "uuid"
     self.mongodb_connector: MongoDBConnector = MongoDBConnector(
         host=mongo_host, port=mongo_port, db_name=db_name)
     self.elasticsearch_connector: ElasticsearchConnector = ElasticsearchConnector(
         host=es_host, port=es_port)
     self.set_up_storage_connection()
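A hypothetical instantiation; the class name TwitterCollector and all argument values are illustrative, since only the __init__ is shown:

# Purely illustrative: class name, connector, and values are assumptions.
collector = TwitterCollector(
    twitter_connector=twitter_connector,
    db_name="twitter_db",
    collection_names={"status": "twitter-status", "user": "twitter-user"},
    local_storage="/tmp/tweets",
    dest_storage="elasticsearch")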
Example #6
    def __init__(self, api: API, languages: list, track: list, db_name: str,
                 collection_names: dict, storage: str = "mongoDB",
                 add_sentiment: bool = True, add_bot_analysis: bool = True):
        self.api: API = api
        self.languages: list = languages
        self.track: list = track
        self.tweet_mode: str = "extended"
        self.storage: str = storage
        self.collection_names: dict = collection_names
        self.add_sentiment: bool = add_sentiment
        self.add_bot_analysis: bool = add_bot_analysis

        self.mongodb_connector: MongoDBConnector = MongoDBConnector(
            host=mongo_host, port=mongo_port, db_name=db_name)
        self.elasticsearch_connector: ElasticsearchConnector = ElasticsearchConnector(
            host=es_host, port=es_port)
        self.set_up_storage_connection()
        self.stream_listener: TwitterStreamListener = TwitterStreamListener(
            storage=storage, collection_names=self.collection_names,
            mongodb_connector=self.mongodb_connector,
            elasticsearch_connector=self.elasticsearch_connector,
            add_sentiment=self.add_sentiment, add_bot_analysis=self.add_bot_analysis)
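The class presumably starts the stream elsewhere; a hypothetical method for that, assuming tweepy 3.x (where Stream takes an auth handler and a listener), could look like:

    def start_streaming(self):
        # Hypothetical sketch, assuming tweepy 3.x's Stream API.
        from tweepy import Stream
        stream = Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=self.track, languages=self.languages)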
Example #7
    def initialize(self):
        if not Config.initialize('/etc/quest/config.yaml'):
            raise Exception('Can\'t read config')

        self.aerospike_connector = AerospikeConnector(
            max_record_size=Config.AEROSPIKE_MAX_RECORD_SIZE,
            rw_timeout_ms=Config.AEROSPIKE_RW_TIMEOUT_MS,
            connection_timeout_ms=Config.AEROSPIKE_CONNECTION_TIMEOUT_MS)
        if not self.aerospike_connector.connect(
                Config.AEROSPIKE_HOSTS, Config.AEROSPIKE_LUA_USER_PATH):
            raise Exception('Can\'t connect to aerospike')

        self.access_rules = AccessRules(self)
        self.avatar_generator = AvatarGenerator(
            'data/avatar_palette.png', 'data/SanFranciscoDisplay-Regular.ttf')
        self.twilio_connector = TwilioConnector()

        self.s3connector = S3Connector(Config.S3_ENDPOINT_URL,
                                       Config.S3_ACCESS_KEY_ID,
                                       Config.S3_SECRET_ACCESS_KEY)

        self.elasticsearch_connector = ElasticsearchConnector(
            Config.ELASTICSEARCH_HOSTS,
            Config.ELASTICSEARCH_USERS_INDEX,
            Config.ELASTICSEARCH_TIMEOUT_SEC,
            enabled=Config.ELASTICSEARCH_ENABLED)
Example #8

    def perform_source_credibility_analysis(
            self, publishers: list, authors: list, article_url: str,
            elasticsearch_connector: ElasticsearchConnector,
            source_rank: SourceRank) -> dict:
        response: dict = {"message": http_response_500, "code": 500}
        try:
            # ====================================================================
            # Publisher Analysis
            # ====================================================================
            output_pub_trustworthiness: TrustworthinessDoc = TrustworthinessDoc(
                trustworthiness=normalize_value(),
                relevance=normalize_value(mu=0.1, sigma=0.05))
            for publisher in publishers:
                logger.info(
                    f"Analysing Publisher {publisher.get('identifier')}")

                # 1. Check whether the publisher already has a features document
                non_exist: bool = elasticsearch_connector.check_document_in_index_by_id(
                    index=org_es_index_features,
                    uuid=publisher.get("identifier"))

                analyse_static: bool = non_exist

                # 4. Compute publisher features
                publisher_features: PublisherFeaturesDoc = self.get_publisher_features(
                    elasticsearch_connector=elasticsearch_connector,
                    source_rank=source_rank,
                    publisher=publisher,
                    article_url=article_url,
                    analyse_static=analyse_static)

                # 5. Update features in Elasticsearch
                if analyse_static:
                    elasticsearch_connector.bulk_data_into_index(
                        index=org_es_index_features,
                        uuid=publisher.get("identifier"),
                        source_data=publisher_features.__dict__)
                else:
                    # Update only non-static features
                    params = {
                        "text_rank_analysis":
                        publisher_features.text_rank_analysis,
                        "anonymous_rank_analysis":
                        publisher_features.anonymous_rank_analysis,
                        "last_updated": publisher_features.last_updated
                    }
                    body = {"doc": params}
                    elasticsearch_connector.update_fields_to_index(
                        index=org_es_index_features,
                        uuid=publisher.get("identifier"),
                        body=body)

                # 6. Compute Trustworthiness & relevance
                publisher_features_dict = {
                    "open_rank":
                    publisher_features.open_rank_analysis.get("rank"),
                    "suffix_rank":
                    publisher_features.suffix_rank_analysis.get("rank"),
                    "category_rank":
                    publisher_features.category_rank_analysis.get("rank"),
                    "twitter_rank":
                    publisher_features.twitter_rank_analysis.get("rank"),
                    "whois_rank":
                    publisher_features.whois_rank_analysis.get("rank"),
                    "text_rank":
                    publisher_features.text_rank_analysis.get("rank"),
                    "anonymous_rank":
                    publisher_features.anonymous_rank_analysis.get("rank")
                }
                output_pub_trustworthiness: TrustworthinessDoc = self.get_trustworthiness_from_features(
                    features=publisher_features_dict,
                    metrics=self.get_publisher_metrics_importances(),
                    total_articles=publisher_features.text_rank_analysis.get(
                        "total_articles"))

                # 7. Update only non-static features
                params = {
                    score_key:
                    100 * output_pub_trustworthiness.trustworthiness,
                    "relevance": output_pub_trustworthiness.relevance,
                    "status": "done"
                }
                body = {"doc": params}
                elasticsearch_connector.update_fields_to_index(
                    index=org_es_index,
                    uuid=publisher.get("identifier"),
                    body=body)

            # ====================================================================
            # Author Analysis
            # ====================================================================
            publisher_rank_analysis: dict = {
                "rank": output_pub_trustworthiness.trustworthiness,
                "relevance": output_pub_trustworthiness.relevance
            }
            for author in authors:
                logger.info(f"Analysing Author {author.get('identifier')}")

                # 1. Compute author features
                author_features: AuthorFeatureDoc = self.get_author_features(
                    elasticsearch_connector=elasticsearch_connector,
                    author=author,
                    publisher_rank_analysis=publisher_rank_analysis)

                # 2. Compute Trustworthiness & relevance
                author_features_dict = {
                    "text_rank":
                    author_features.text_rank_analysis.get("rank"),
                    "publisher_rank": publisher_rank_analysis.get("rank")
                }
                output_aut_trustworthiness: TrustworthinessDoc = self.get_trustworthiness_from_features(
                    features=author_features_dict,
                    metrics=self.get_author_metrics_importances(),
                    total_articles=author_features.text_rank_analysis.get(
                        "total_articles"))

                # 3. Verify if author exists in fdg-person-features
                non_exist_auth: bool = elasticsearch_connector.check_document_in_index_by_id(
                    index=auth_es_index_features,
                    uuid=author.get("identifier"))

                if non_exist_auth:
                    # Generate entry
                    elasticsearch_connector.bulk_data_into_index(
                        index=auth_es_index_features,
                        uuid=author.get("identifier"),
                        source_data=author_features.__dict__)
                else:
                    # Update features in Elasticsearch
                    body = {"doc": author_features.__dict__}
                    elasticsearch_connector.update_fields_to_index(
                        index=auth_es_index_features,
                        uuid=author.get("identifier"),
                        body=body)
                # 4. Update scores in Elasticsearch fdg-ap-person-features
                params = {
                    score_key:
                    100 * output_aut_trustworthiness.trustworthiness,
                    "relevance": output_aut_trustworthiness.relevance,
                    "status": "done"
                }
                body = {"doc": params}
                elasticsearch_connector.update_fields_to_index(
                    index=person_es_index,
                    uuid=author.get("identifier"),
                    body=body)
            response["message"]: str = http_response_200
            response["code"]: int = 200
        except Exception as e:
            logger.error(e)
        return response
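A hypothetical invocation, with made-up identifiers and a source_rank instance assumed to be constructed elsewhere; the dictionary shapes follow the .get() calls above:

        # Illustrative only: identifiers and URLs are hypothetical.
        response = self.perform_source_credibility_analysis(
            publishers=[{"identifier": "pub-1",
                         "source_data": {"url": "https://example.com"}}],
            authors=[{"identifier": "auth-1"}],
            article_url="https://example.com/article-1",
            elasticsearch_connector=self.elasticsearch_connector,
            source_rank=source_rank)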
Example #9

    def get_publisher_features(self,
                               elasticsearch_connector: ElasticsearchConnector,
                               source_rank: SourceRank, publisher: dict,
                               article_url: str,
                               analyse_static: bool) -> PublisherFeaturesDoc:
        publisher_features: PublisherFeaturesDoc = object.__new__(
            PublisherFeaturesDoc)
        try:
            # Compute analysis

            # Parse the URL
            url: str = publisher.get("source_data").get("url")
            # country: str = publisher.get("source_data").get("country")
            # country_code: Optional[str] = None
            uuid: str = publisher.get("identifier")
            field: str = "publisher"

            res_parser = source_rank.process_url(url=url)

            # Only recompute static features when necessary
            if analyse_static:
                # 1. Open Rank Analysis
                domain: str = res_parser.registered_domain
                fqdn: str = res_parser.fqdn
                res_open_rank: dict = source_rank.get_open_rank_analysis(
                    domain=domain).__dict__

                # 2. Suffix Analysis
                res_suffix: dict = source_rank.get_suffix_analysis(
                    suffix=res_parser.suffix).__dict__

                # 3. WHOIS Analysis
                res_whois: dict = source_rank.get_whois_analysis(
                    domain=domain).__dict__

                # 4. Category Analysis
                res_category: dict = source_rank.get_category_analysis(
                    url=fqdn).__dict__

                # 5. Retrieve Twitter info from URL
                res_twitter: dict = source_rank.get_twitter_analysis(
                    url=article_url).__dict__
            else:
                # 1. Retrieve data from ES
                res_org: dict = elasticsearch_connector.retrieve_data_from_index_by_id(
                    index=org_es_index_features, uuid=uuid)
                res_open_rank: dict = res_org.get("open_rank_analysis")
                res_suffix: dict = res_org.get("suffix_rank_analysis")
                res_whois: dict = res_org.get("whois_rank_analysis")
                res_category: dict = res_org.get("category_rank_analysis")
                res_twitter: dict = res_org.get("twitter_rank_analysis")

            # 6. Text rank
            res_text_rank: dict = self.get_text_rank(
                elasticsearch_connector=elasticsearch_connector,
                query=uuid,
                field=field,
                es_art_index=art_es_index,
                es_score_index=score_art_es_index)

            # 7. Anonymous rank
            res_anonymous_rank: dict = self.get_anonymous_rank(
                elasticsearch_connector=elasticsearch_connector,
                index=art_es_index,
                authors_index=person_es_index,
                field=field,
                query=uuid,
                anonymous_key=default_name)

            # 8. Generate Output
            publisher_features: PublisherFeaturesDoc = PublisherFeaturesDoc(
                open_rank_analysis=res_open_rank,
                suffix_rank_analysis=res_suffix,
                whois_rank_analysis=res_whois,
                category_rank_analysis=res_category,
                twitter_rank_analysis=res_twitter,
                anonymous_rank_analysis=res_anonymous_rank,
                text_rank_analysis=res_text_rank)

        except Exception as e:
            logger.error(e)
        return publisher_features
Example #10
from connectors.source_credibility_connector import SourceCredibilityConnector
from connectors.elasticsearch_connector import ElasticsearchConnector
from unitest.input_examples.input_preprocessed_examples import data_preprocessed
from fandango_models.article import Article
import time

# ==============================================================================
elasticsearch_connector = ElasticsearchConnector(host="localhost", port="9220")
elasticsearch_connector.connect()

# 1. Create input document
art_obj = data_preprocessed.get("data")

art_doc: Article = Article()
document: Article = art_doc.article_from_dict(data=art_obj)

# 2. Create object
source_credibility_connector = SourceCredibilityConnector(
    elasticsearch_connector=elasticsearch_connector)

# 3. Apply analysis
start_time = time.time()
print(f"init: {start_time}")
res = source_credibility_connector.apply_analysis(document=document)
end_time = time.time()
print(f"end: {end_time}")
print(f"Duration: {end_time-start_time}")
Example #11
from config import Config
from connectors.elasticsearch_connector import ElasticsearchConnector
import sys

if not Config.initialize('/etc/quest/config.yaml'):
    raise Exception('Can\'t read config')

if not Config.ELASTICSEARCH_ENABLED:
    print('Elasticsearch is disabled, do nothing')
    sys.exit()

elasticsearch_connector = ElasticsearchConnector(Config.ELASTICSEARCH_HOSTS,
                                                 Config.ELASTICSEARCH_USERS_INDEX,
                                                 Config.ELASTICSEARCH_TIMEOUT_SEC,
                                                 enabled=Config.ELASTICSEARCH_ENABLED)

with open('pre-install/elasticsearch_index_settings.json', 'r') as content_file:
    content = content_file.read()

elasticsearch_connector.create_index(content)
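The settings file itself is not shown; a minimal illustration of the shape such an Elasticsearch index-settings document usually takes (not the project's actual file):

# Illustrative settings only; the real file lives in pre-install/.
content = '''
{
  "settings": {"number_of_shards": 1, "number_of_replicas": 0},
  "mappings": {"properties": {"name": {"type": "keyword"}}}
}
'''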
Example #12
from connectors.elasticsearch_connector import ElasticsearchConnector
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch

es_man = ElasticsearchConnector(host="localhost", port="9220")
es_man.connect()

index = "fdg-article"
search_key = "publisher"
uuid = "7457b5f27a46e69e3f891767d01d2c6f6c5829132a4c03e78f4858828d539fc983a70a3c43ff8a082a8650c0972349eafb50bda7a44062428e2b405249a387f3"

a = "7457b5f27a46e69e3f891767d01d2c6f6c5829132a4c03e78f4858828d539fc983a70a3c43ff8a082a8650c0972349eafb50bda7a44062428e2b405249a387f3"
s = Search(using=es_man.es, index=index).query("match", publisher=uuid)
s = s.query("match", publisher=uuid)
s = s.execute()

multi_match = MultiMatch(query=uuid, fields=['publisher'])

s2 = Search(using=es_man.es, index=index).query(multi_match)
s2 = s2.execute()

art_index = "fdg-textscore"
art_id1 = "8796aff2a14a1ea1539265f76b044f1faf00304d6d9e237aaa21da6c4bab2166f0bed8a2eb99a05d18bc945f811f250da1f4c5a4acbfff1a213bc773894edcd1"
art_id2 = "8796aff2a14a1ea1539265f76b044f1faf00304d6d9e237aaa21da6c4bab2166be47575aa0278fdaaccf0d0aac645381b6db09383e58519fc3589398b63777b3"

sss = Search(using=es_man.es, index=art_index) \
    .filter("terms", _id=[art_id1, art_id2])

response = sss.execute()

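To inspect what any of these searches returned, the standard elasticsearch_dsl Response iteration works:

for hit in response:
    print(hit.meta.id, hit.meta.score)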
Example #13
 def get_es_uuid_from_entity_id(elasticsearch_connector: ElasticsearchConnector,
                                entity_id: str):
     return elasticsearch_connector.generate_128_uuid_from_string(data_uuid=str(entity_id))
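A hypothetical call, assuming a connected ElasticsearchConnector as in Example #10 and an arbitrary entity id:

# Illustrative only: the entity id is made up.
es_uuid = get_es_uuid_from_entity_id(
    elasticsearch_connector=elasticsearch_connector,
    entity_id="12345")
print(es_uuid)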