Example #1
def preprocess_categorical_data(data: pd.DataFrame,
                                    cat_cols: list,
                                    default_value: str,
                                    link_information: list,
                                    document_col: str,
                                    join_attr: str = ". "):
        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            df_cat: pd.DataFrame = data.select_dtypes(["object"])
            # Select the requested categorical columns and fill missing
            # values on the copy to avoid chained-assignment warnings
            df_cat = df_cat[cat_cols].fillna(value=default_value)

            # Add document
            df_cat[document_col] = df_cat.apply(
                FeatureExtraction.extract_document_from_categorical_series,
                axis=1,
                args=(
                    link_information,
                    join_attr,
                ))

            data_transformed: pd.DataFrame = df_cat.copy()

        except Exception as e:
            logger.error(e)
        return data_transformed
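A minimal usage sketch for the helper above, assuming it is a static helper on the same class as FeatureExtraction.extract_document_from_categorical_series; the column names are illustrative only:

    import pandas as pd

    df = pd.DataFrame({
        "name": ["alice", None],
        "description": ["data scientist", "journalist"],
        "followers": [10, 20],  # numeric, excluded by select_dtypes(["object"])
    })
    result = FeatureExtraction.preprocess_categorical_data(
        data=df,
        cat_cols=["name", "description"],
        default_value="unknown",
        link_information=["name", "description"],
        document_col="document",
        join_attr=". ")
    # result holds the filled categorical columns plus a joined "document" column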
Example #2
    def start_streaming(
            self,
            thread_name: str,
            collection_name: str = "default",
            es_index_name: str = "default") -> StreamingServiceOutput:
        output: StreamingServiceOutput = object.__new__(StreamingServiceOutput)
        try:
            set_name: str = collection_name if self.storage == "mongoDB" else es_index_name
            data = {
                "languages": self.languages,
                "track": self.track,
                "storage": self.storage,
                "collection_names": {
                    "status": f"{set_name}_tweets",
                    "user": "******"
                },
                "mongo_db_name": self.mongo_db_name
            }

            response: StreamingProcessOutput = self.twst_api.start_new_streaming_process(
                thread_name=thread_name, data=data)

            # Generate the response
            output: StreamingServiceOutput = StreamingServiceOutput(
                status_code=response.status_code,
                message=response.message,
                data=response.data)
        except Exception as e:
            logger.error(e)
        return output
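A hypothetical call, assuming an already-constructed service instance (names are illustrative):

    result = streaming_service.start_streaming(
        thread_name="covid_stream",
        collection_name="covid")
    print(result.status_code, result.message)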
Example #3
 def get_available_languages(api: API) -> list:
     languages: list = []
     try:
         languages: list = api.supported_languages()
     except Exception as e:
         logger.error(e)
     return languages
Example #4
 def retrieve_data_from_index_by_searching(self,
                                           index,
                                           search_key,
                                           search_value,
                                           fuzzy_threshold=95,
                                           request_timeout=30):
     response = {}
     try:
         query = {
             "query": {
                 "match": {
                     search_key: {
                         "query": search_value,
                         "fuzziness": "0"
                     }
                 }
             }
         }
         results = self.es.search(body=query,
                                  index=index,
                                  request_timeout=request_timeout)
         # Elasticsearch 7+ reports the hit count under hits.total.value
         if results['hits']['total']['value'] > 0:
             res_name = results['hits']['hits'][0]['_source'][search_key]
             distance = fuzz.ratio(
                 res_name.replace(' ', '').lower(),
                 search_value.replace(' ', '').lower())
             if distance >= fuzzy_threshold:
                 response = {
                     "source": results['hits']['hits'][0]['_source'],
                     "id": results['hits']['hits'][0]["_id"]
                 }
     except Exception as e:
         logger.error(e)
     return response
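A hypothetical call, assuming the surrounding class wraps an elasticsearch.Elasticsearch client as self.es (the index and field names are made up for illustration):

    match = es_connector.retrieve_data_from_index_by_searching(
        index="publishers",
        search_key="name",
        search_value="The Example Times",
        fuzzy_threshold=90)
    if match:
        print(match["id"], match["source"])  # best hit above the fuzzy threshold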
Example #5
def write_json_file(data: dict, filename: str):
    try:
        # Serialise the dict directly; stringifying it first and dumping
        # the result again would double-encode the payload
        with open(filename, 'w') as file:
            json.dump(data, file)
    except Exception as e:
        logger.error(e)
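For example (file name chosen for illustration):

    write_json_file({"status": "ok", "count": 3}, "output.json")
    # output.json now contains {"status": "ok", "count": 3}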
Example #6
    def generate_user_feature_vector(self, user: User) -> np.ndarray:
        doc_emb: np.ndarray = np.array([])
        try:
            index_val: int = 0

            # 1. Extract features
            user_features: dict = self.extract_features_from_user_account(
                user_data=user)

            # 2. Preprocess features (the arguments below are assumed to be
            # module-level configuration constants in the source)
            data_transformed: pd.DataFrame = self.preprocess_data(
                user_features=user_features,
                popularity_metric=popularity_metric,
                boolean_cols=boolean_cols,
                drop_num_cols=drop_num_cols,
                scaler_filename=scaler_filename,
                cat_cols=cat_cols,
                default_value=default_value,
                link_information=link_information,
                document_col=document_col,
                join_attr=join_attr,
                index_val=index_val)

            # 3. Generate input embedding
            doc_emb: np.ndarray = self.get_embedding_from_dataframe(
                data_transformed=data_transformed, index_val=index_val)

        except Exception as e:
            logger.error(e)
        return doc_emb
Example #7
    def get_textblob_sentiment_analysis(doc: str) -> TextBlobSentOutput:
        output: TextBlobSentOutput = TextBlobSentOutput()
        try:
            was_translated: bool = False
            # 1. Get the language of the document
            lang: str = SentimentAnalysisAPI.get_document_language(doc=doc)
            if lang != "en":
                # Translate
                doc: str = SentimentAnalysisAPI.translate_to_english(
                    src_doc=doc, src_lang=lang)
                was_translated: bool = True

            sentences: list = SentimentAnalysisAPI.make_sentences(text=doc)
            polarity_scores: list = []
            subjectivity_scores: list = []
            for sent in sentences:
                subjectivity: float = TextBlob(sent).sentiment.subjectivity
                polarity: float = TextBlob(sent).sentiment.polarity
                polarity_scores.append(polarity)
                subjectivity_scores.append(subjectivity)

            final_subjectivity: float = round(
                float(np.mean(subjectivity_scores)), 3)
            final_polarity: float = round(float(np.mean(polarity_scores)), 3)

            output: TextBlobSentOutput = TextBlobSentOutput(
                analysed=True,
                polarity=final_polarity,
                subjectivity=final_subjectivity,
                translated=was_translated)

        except Exception as e:
            logger.error(e)
        return output
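A usage sketch, assuming the method is a static member of SentimentAnalysisAPI, as its internal calls suggest:

    result = SentimentAnalysisAPI.get_textblob_sentiment_analysis(
        doc="I love this product. It works great.")
    # result.polarity is the mean sentence polarity in [-1, 1];
    # result.subjectivity is the mean subjectivity in [0, 1]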
Example #8
 def remove_all_documents_from_collection(self, collection_name: str):
     try:
         docs: DeleteResult = self.db[collection_name].delete_many({})
         logger.info("%s documents deleted from %s", docs.deleted_count,
                     collection_name)
     except Exception as e:
         logger.error(e)
Example #9
 def find_document_by_filter(self, collection_name: str, filter: dict):
     doc: Optional[Cursor] = None
     try:
         doc: Cursor = self.db[collection_name].find(filter)
     except Exception as e:
         logger.error(e)
     return doc
Example #10
 def convert_bool_to_int(data: pd.DataFrame, boolean_cols: list):
     try:
         for col in boolean_cols:
             data[col] = data[col].astype(int)
     except Exception as e:
         logger.error(e)
     return data
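For example:

    import pandas as pd

    df = pd.DataFrame({"verified": [True, False], "protected": [False, True]})
    df = convert_bool_to_int(df, boolean_cols=["verified", "protected"])
    # both columns are now 0/1 integers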
Example #11
 def on_error(self, status_code: int):
     if status_code == 420:
          logger.warning("Enhance Your Calm; the app is being rate limited for making too many requests!")
          return False
      else:
          logger.error(f"Error {status_code} when ingesting a tweet")
         sys.exit()
Example #12
    def get_doc2vec_embedding(data_input: Optional[pd.DataFrame],
                              index_val: int, document_col: str,
                              all_num_cols: list):
        doc_embedding_np: np.ndarray = np.array([])
        try:
            if isinstance(data_input, pd.DataFrame):
                document: str = data_input.loc[index_val, document_col]
                x_num: list = [
                    data_input.loc[index_val, j] for j in all_num_cols
                ]
            elif isinstance(data_input, pd.Series):
                document: str = data_input[document_col]
                x_num: list = [data_input[j] for j in all_num_cols]
            else:
                # Assume a dict-like input and fetch each numerical field by key
                document: str = data_input.get(document_col, "")
                x_num: list = [
                    data_input.get(j, -1) for j in all_num_cols
                ]

            # Document embedding from a string
            doc_emb: np.ndarray = FeatureExtraction.generate_doc_embedding(
                document=document,
                embeddings=FeatureExtraction.get_flair_embeddings())

            x_doc_emb: list = list(doc_emb.tolist())

            if len(x_doc_emb) > 0:
                # 3.3 Concatenate doc embedding + numerical cols
                doc_embedding_np: np.ndarray = np.array([x_num + x_doc_emb
                                                         ]).reshape((1, -1))
        except Exception as e:
            logger.error(e)
        return doc_embedding_np
Example #13
 def load_scaler_object(filename: str):
     scaler_obj: Optional[StandardScaler] = None
     try:
         scaler_obj: StandardScaler = joblib.load(filename)
     except Exception as e:
         logger.error(e)
     return scaler_obj
Example #14
    def scale_data(data: pd.DataFrame, fit: bool = True, filename: str = ""):
        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            if fit:
                scaler: StandardScaler = StandardScaler()
                scaler.fit(data)

                prepare_directory(os.sep.join(filename.split(os.sep)[0:-1]))
                # Save scaler
                logger.info("Saving scaler object at %s", filename)

                FeatureExtraction.save_scaler_object(scaler_obj=scaler,
                                                     filename=filename)
            else:
                # Load a previously fitted scaler
                scaler: StandardScaler = FeatureExtraction.load_scaler_object(
                    filename=filename)

            # Transform data
            res_transformed: np.ndarray = scaler.transform(data)
            data_transformed: pd.DataFrame = pd.DataFrame(res_transformed,
                                                          columns=data.columns)
        except Exception as e:
            logger.error(e)
        return data_transformed
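A sketch of the intended round trip, assuming scale_data lives on FeatureExtraction as its internal calls suggest (the scaler path is illustrative):

    # Fit on training data and persist the scaler
    train_scaled = FeatureExtraction.scale_data(
        train_df, fit=True, filename="models/scaler.pkl")
    # Reuse the saved scaler at prediction time
    pred_scaled = FeatureExtraction.scale_data(
        pred_df, fit=False, filename="models/scaler.pkl")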
Example #15
    def preprocess_data(user_features: dict,
                        popularity_metric: str,
                        boolean_cols: list,
                        drop_num_cols: list,
                        scaler_filename: str,
                        cat_cols: list,
                        default_value: str,
                        link_information: list,
                        document_col: str,
                        join_attr: str = ". ",
                        index_val: int = 0):

        data_transformed: pd.DataFrame = pd.DataFrame([])
        try:
            data_transformed: pd.DataFrame = FeatureExtraction.preprocess_prediction_data(
                data_dct=user_features,
                popularity_metric=popularity_metric,
                boolean_cols=boolean_cols,
                drop_num_cols=drop_num_cols,
                scaler_filename=scaler_filename,
                cat_cols=cat_cols,
                default_value=default_value,
                link_information=link_information,
                document_col=document_col,
                join_attr=join_attr,
                index_val=index_val)
        except Exception as e:
            logger.error(e)
        return data_transformed
Example #16
 def find_one_document(self, collection_name: str) -> dict:
     doc: dict = {}
     try:
         doc: dict = self.db[collection_name].find_one()
     except Exception as e:
         logger.error(e)
     return doc
Example #17
    def generate_embedding(data: pd.DataFrame,
                           document_col: str,
                           id_col: str,
                           embeddings: list,
                           doc2vec: str = "transformer_roberta"):
        doc_embedding_np: np.ndarray = np.array([])
        try:
            all_num_cols: list = FeatureExtraction.get_numerical_columns(
                data=data)
            all_num_cols.remove(id_col)

            document: str = data[document_col]

            # 3.1 Generate doc embedding
            doc_emb: np.ndarray = FeatureExtraction.generate_doc_embedding(
                document=document, embeddings=embeddings, doc2vec=doc2vec)
            x_doc_emb: list = list(doc_emb.tolist())
            x_num: list = [data[j] for j in all_num_cols]

            if len(x_doc_emb) > 0:
                # 3.3 Concatenate doc embedding + numerical cols
                doc_embedding_np: np.ndarray = np.array([x_num + x_doc_emb
                                                         ]).reshape((1, -1))
                logger.debug("Embedding shape: %s", doc_embedding_np.shape)
        except Exception as e:
            logger.error(e)
        return doc_embedding_np
Example #18
 def get_all_data_by_filter_field(self, collection_name: str, filter: dict):
     doc: Optional[Cursor] = None
     try:
         doc: Cursor = self.db[collection_name].find({}, filter)
     except Exception as e:
         logger.error(e)
     return doc
Example #19
 def embedding_cosine_similarity(x: np.ndarray, y: np.ndarray) -> float:
     similarity: float = 0.0
     try:
         similarity: float = round(float(1 - distance.cosine(x, y)), 3)
     except Exception as e:
         logger.error(e)
     return similarity
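For example:

    import numpy as np

    a = np.array([1.0, 0.0, 1.0])
    b = np.array([1.0, 1.0, 0.0])
    embedding_cosine_similarity(a, b)  # 0.5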
Example #20
 def create_index_at_collection(self, collection_name: str,
                                unique_uuid_col: str):
     try:
         self.db[collection_name].create_index(
             [(unique_uuid_col, ASCENDING)], unique=True)
     except Exception as e:
         logger.error(e)
Example #21
 def get_document_language(doc: str) -> str:
     language: str = ""
     try:
         language: str = detect(doc)
     except Exception as e:
         logger.error(e)
     return language
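For example, assuming detect comes from the langdetect package:

    get_document_language("This is an English sentence.")  # "en"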
Example #22
 def clean_threads():
     try:
         logger.info("Cleaning all threads")
         # Popping while iterating skips every other element;
         # clear() removes all thread references at once
         global_streaming_threads.clear()
     except Exception as e:
         logger.error(e)
Example #23
 def scroll_data_from_elasticsearch(self,
                                    index,
                                    body,
                                    scroll="1m",
                                    size=1000):
     items = []
     try:
         res = self.es.search(index=index, body=body, scroll=scroll)
         if res:
             # update data
             items += res["hits"]["hits"]
             total_elements = res["hits"]["total"]["value"]
             rest_elements = (total_elements - size)
             if rest_elements > 0:
                 has_more = True
                 prev = res
                 # Keep scrolling until a page comes back empty
                 while has_more:
                     new_items = self.es.scroll(
                         scroll_id=prev['_scroll_id'], scroll=scroll)
                     if len(new_items["hits"]["hits"]) > 0:
                         items += new_items["hits"]["hits"]
                         prev = new_items
                     else:
                         has_more = False
     except Exception as e:
         logger.error(e)
     return items
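A hypothetical call that drains an index page by page (index name and query are illustrative):

    body = {"size": 1000, "query": {"match_all": {}}}
    hits = es_connector.scroll_data_from_elasticsearch(index="articles", body=body)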
Example #24
    def build_output(self):
        output = {}
        try:
            output = {
                "identifier": self.identifier,
                "headline": self.headline,
                "articleBody": self.articleBody,
                'url': self.url,
                "language": self.language,
                "images": self.images,
                "videos": self.videos,
                "dateCreated": self.dateCreated,
                "dateModified": self.dateModified,
                "datePublished": self.datePublished,
                "publishDateEstimated": self.publishDateEstimated,
                "authors": self.authors,
                "publisher": self.publisher,
                "sourceDomain": self.sourceDomain,
                "country": self.country,
                "nationality": self.nationality,
                "calculatedRating": self.calculatedRating,
                "calculatedRatingDetail": self.calculatedRatingDetail
            }

            # Add fakeness only when it is available
            if self.fakeness != default_field and self.fakeness is not None:
                output['fakeness'] = self.fakeness
        except Exception as e:
            logger.error(e)
        return output
Example #25
    def process_analysis(self, publishers: list, authors: list,
                         article_uuid: str, article_url: str,
                         elasticsearch_connector: ElasticsearchConnector,
                         source_rank: SourceRank):
        try:
            # 1. Perform analysis
            response: dict = self.perform_source_credibility_analysis(
                publishers=publishers,
                authors=authors,
                article_url=article_url,
                elasticsearch_connector=elasticsearch_connector,
                source_rank=source_rank)

            if response.get("code") == 200:
                # 2. Call Fusion Score
                logger.info("Calling Fusion Score Service")
                ThreadsProcessor.start_new_streaming_process(
                    thread_name="fusion_score",
                    target_func=self.update_fusion_score,
                    params=(
                        fusion_score_server,
                        fusion_score_port,
                        fusion_score_endpoint,
                        article_uuid,
                    ))
                """response_fusion_score: dict = self.update_fusion_score(
                    server=fusion_score_server,
                    port=fusion_score_port,
                    endpoint=fusion_score_endpoint,
                    article_uuid=article_uuid)"""
        except Exception as e:
            logger.error(e)
Example #26
    def start_new_streaming_process(self, thread_name: str,
                                    data: dict) -> StreamingProcessOutput:
        output: StreamingProcessOutput = object.__new__(StreamingProcessOutput)
        try:
            # 1. Generate Streaming Processor
            api: API = self.set_up_twitter_api()
            streaming_processor: StreamingProcessor = StreamingProcessor(
                api=api,
                languages=data.get("languages", []),
                track=data.get("track", []),
                storage=data.get("storage", "mongoDB"),
                collection_names=data.get("collection_names", {}),
                mongo_db_name=data.get("mongo_db_name", "default"))

            # 2. Create new Thread
            response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
                thread_name=thread_name,
                target_func=streaming_processor.run_twitter_streaming)

            # 3. Generate Output
            output: StreamingProcessOutput = StreamingProcessOutput(
                message=response.message,
                status_code=response.status_code,
                data=response.data)
        except Exception as e:
            logger.error(e)
        return output
Example #27
    def stop_streaming_process(thread_name: str):
        response: GeneralAPIResponse = object.__new__(GeneralAPIResponse)
        try:
            not_found = True
            stream_threads: list = ThreadsProcessor.get_available_threads()

            # If the list of threads is not empty
            if stream_threads:
                # Check if there is a thread with the same name running
                thread_names = [i.name for i in stream_threads]

                # Thread with the same name
                if thread_name in thread_names:
                    thread_idx = thread_names.index(thread_name)
                    # Stop selected thread
                    current_thread: StreamingThread = stream_threads[thread_idx]
                    response_kill: GeneralAPIResponse = ThreadsProcessor.kill_streaming_thread(
                        streaming_thread=current_thread)
                    if response_kill.status_code == 200:
                        # Remove from list
                        stream_threads.pop(thread_idx)
                        not_found = False
            # Build the response depending on whether the thread was found
            if not_found:
                status_code: int = 400
                message: str = f"Thread {thread_name} was not found!"
            else:
                status_code: int = 200
                message: str = f"Thread {thread_name} was stopped successfully!"

            response: GeneralAPIResponse = GeneralAPIResponse(
                message=message, status_code=status_code, data={})
        except Exception as e:
            logger.error(e)
        return response
Example #28
    def start_new_searching_process(self, thread_name: str,
                                    data: dict) -> StreamingProcessOutput:
        output: StreamingProcessOutput = object.__new__(StreamingProcessOutput)
        try:
            # 1. Generate Streaming Processor
            api: API = self.set_up_twitter_api()

            # NOTE: the data keys below are assumptions; they mirror the
            # constructor parameter names
            searching_processor: SearchingProcessor = SearchingProcessor(
                twitter_connector=self.twitter_connector,
                mongo_db_name=data.get("mongo_db_name", "default"),
                collection_names=data.get("collection_names", {}),
                local_storage=data.get("local_storage"),
                dest_storage=data.get("dest_storage"))

            # 2. Create new Thread
            response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
                thread_name=thread_name,
                target_func=searching_processor.run_twitter_searching)

            # 3. Generate Output
            output: StreamingProcessOutput = StreamingProcessOutput(
                message=response.message,
                status_code=response.status_code,
                data=response.data)
        except Exception as e:
            logger.error(e)
        return output
Example #29
 def tweet_gathering(api: API,
                     query: str,
                     date_since: str,
                     lang: str = 'en'):
     try:
         logger.info("Retrieving Tweets ... ")
         # Collect tweets
         tweets = Cursor(api.search,
                         lang=lang,
                         q=query,
                         include_entities=True,
                         monitor_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         wait_on_rate_limit=True,
                         result_type="recent",
                         tweet_mode='extended').items()
         while True:
             try:
                 tweet: Status = tweets.next()
                 logger.debug(tweet)
                 yield tweet
             except RateLimitError:
                 # Back off for Twitter's 15-minute rate-limit window
                 time.sleep(60 * 15)
                 continue
             except StopIteration:
                 break
     except Exception as e:
         logger.error(e)
Example #30
    def offline_service(self):
        output: GraphAnalyzerOutputDoc = GraphAnalyzerOutputDoc(
            message=http_response_500, status=500)
        try:
            self.service_task: str = offline_service_name
            if self.data_connector is None:
                # 1. Set up data manager
                self.set_up_data_manager(service=self.service_task)

            # 2. Verify Connection
            connection_error: dict = self.verify_external_server_connections()

            # 3. If there is an error in any external connection
            if True in list(connection_error.values()):
                status: int = 403
                message: str = http_response_403
            else:
                # 4. Start offline process
                response: GeneralAPIResponse = ThreadsProcessor.start_new_streaming_process(
                    thread_name=self.service_task,
                    target_func=self.data_connector.start_kafka_offline_process
                )

                status: int = response.status_code
                message: str = response.message

            # 5. Build output
            output.message = message
            output.status = status

        except Exception as e:
            logger.error(e)
        return output