Beispiel #1
0
def push_data_to_elasticsearch():
    """Upload each configured app's processed reviews to Elasticsearch.

    For every app config listed in the app config file, the processed
    reviews are loaded, converted to ``Review`` objects, filtered by review
    channel and by time (cutoff: ``email_config.email_time_span`` days
    before now), shuffled, and pushed to the app's index in slices of
    ``constants.BULK_UPLOAD_SIZE`` reviews. The index is created first when
    it is not among the indices reported by ``get_indices``. A response
    whose status code is not 200 is printed and the upload carries on with
    the next slice.
    """
    config_list_path = constants.APP_CONFIG_FILE.format(
        file_name=constants.APP_CONFIG_FILE_NAME
    )
    for config_path in utils.open_json(config_list_path):
        app_config = AppConfig(utils.open_json(config_path))

        # File holding the processed user reviews for this app.
        data_config = app_config.fawkes_internal_config.data
        reviews_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load the raw JSON entries and turn each one into a Review object.
        reviews = [
            Review.from_review_json(raw_review)
            for raw_review in utils.open_json(reviews_path)
        ]

        # Keep only the applicable reviews: first by channel, then by time.
        channels = filter_utils.filter_disabled_review_channels(app_config)
        reviews = filter_utils.filter_reviews_by_channel(reviews, channels)
        cutoff = datetime.now(timezone.utc) - timedelta(
            days=app_config.email_config.email_time_span
        )
        reviews = filter_utils.filter_reviews_by_time(reviews, cutoff)

        # Shuffle before uploading. The original note only says this is
        # needed because of how Elasticsearch works; the exact reason is not
        # recorded here.
        random.shuffle(reviews)

        # Make sure the target index exists before pushing anything to it.
        elastic_config = app_config.elastic_config
        existing_indices = get_indices(elastic_config.elastic_search_url)
        if elastic_config.index not in existing_indices:
            create_index(elastic_config.elastic_search_url, elastic_config.index)

        # Push the reviews one fixed-size slice at a time. Slicing clamps the
        # upper bound, so the last slice may simply be shorter.
        batch_size = constants.BULK_UPLOAD_SIZE
        offset = 0
        while offset < len(reviews):
            response = bulk_push_to_elastic(
                elastic_config.elastic_search_url,
                elastic_config.index,
                reviews[offset:offset + batch_size],
            )
            if response.status_code != 200:
                print(
                    "[Error] push_data_to_elasticsearch :: Got status code : ",
                    response.status_code)
                print("[Error] push_data_to_elasticsearch :: Response is : ",
                      response.text)
            offset += batch_size
Beispiel #2
0
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        # Path where the user reviews were stored after parsing.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.
            processed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc) -
            timedelta(days=app_config.email_config.email_time_span))
        if len(reviews) == 0:
            continue

        review_by_category = queries.getVocByCategory(reviews)

        top_categories = sorted([(len(review_by_category[key]), key)
Beispiel #3
0
def run_algo():
    """Run the review-analysis steps for every configured app and save the results.

    For each app config listed in the app config file this:

    1. loads the parsed reviews and converts them to ``Review`` objects,
    2. filters them by review channel and by time, using a cutoff of
       ``algorithm_config.algorithm_days_filter`` days before now,
    3. adds a sentiment score to every review,
    4. runs text-match categorization when both a categorization algorithm
       and a category keywords/weights file are configured,
    5. runs bug/feature classification when its keywords/weights file is
       configured,
    6. runs LSTM categorization when the configured algorithm is
       ``CategorizationAlgorithms.LSTM_CLASSIFICATION``, and
    7. writes the resulting reviews as JSON to the processed-reviews path,
       creating the parent directories if needed.

    Steps 3-5 are mapped over the reviews with a ``Pool`` of
    ``num_processes`` workers.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(file_name=constants.APP_CONFIG_FILE_NAME)
    )

    # Number of worker processes. os.cpu_count() returns None when the CPU
    # count cannot be determined, which would make min() raise a TypeError,
    # so fall back to a single worker in that case.
    num_processes = min(constants.PROCESS_NUMBER, os.cpu_count() or 1)
    if constants.CIRCLECI in os.environ:
        num_processes = 2

    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        algorithm_config = app_config.algorithm_config
        data_config = app_config.fawkes_internal_config.data

        # Path where the user reviews were stored after parsing.
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(parsed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc)
            - timedelta(days=algorithm_config.algorithm_days_filter),
        )

        # Adding sentiment
        with Pool(num_processes) as process:
            reviews = process.map(add_review_sentiment_score, reviews)

        if (algorithm_config.categorization_algorithm is not None
                and algorithm_config.category_keywords_weights_file is not None):
            # The category keywords/weights are read from the topic file.
            topics = utils.open_json(
                algorithm_config.category_keywords_weights_file
            )

            # Adding text-match categorization
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(
                        text_match_categortization,
                        app_config=app_config,
                        topics=topics,
                    ),
                    reviews,
                )

        if algorithm_config.bug_feature_keywords_weights_file is not None:
            # The bug/feature keywords/weights are read from the topic file.
            topics = utils.open_json(
                algorithm_config.bug_feature_keywords_weights_file
            )

            # Adding bug/feature classification
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(bug_feature_classification, topics=topics),
                    reviews,
                )

        if algorithm_config.categorization_algorithm == CategorizationAlgorithms.LSTM_CLASSIFICATION:
            # The model and both tokenizers live in the same models folder,
            # so their paths share the same format arguments.
            model_path_args = {
                "base_folder": data_config.base_folder,
                "dir_name": data_config.models_folder,
                "app_name": app_config.app.name,
            }

            # Load the TensorFlow model
            model = tf.keras.models.load_model(
                constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(**model_path_args)
            )

            # Load the article tokenizer file
            article_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                utils.open_json(
                    constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                        **model_path_args
                    )
                )
            )

            # Load the label tokenizer file
            label_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                utils.open_json(
                    constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                        **model_path_args
                    )
                )
            )

            # Map each cleaned label (non-word characters removed,
            # lower-cased) back to the category label it was derived from.
            cleaned_labels = {}
            for review in reviews:
                label = review.derived_insight.category
                cleaned_labels[re.sub(r'\W+', '', label).lower()] = label

            # Adding LSTM categorization
            reviews = lstm_classification(
                reviews,
                model,
                article_tokenizer,
                label_tokenizer,
                cleaned_labels,
            )

        # Create the intermediate folders
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        dir_name = os.path.dirname(processed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [review.to_dict() for review in reviews],
            processed_user_reviews_file_path,
        )
Beispiel #4
0
def train_lstm_model():
    """Train and save an LSTM categorization model for each app that uses one.

    Every app config listed in the app config file is visited. Apps whose
    configured categorization algorithm is not
    ``CategorizationAlgorithms.LSTM_CLASSIFICATION`` are skipped. For the
    rest, the processed reviews are loaded, articles and labels are
    extracted from them, a model is trained, and the model plus the article
    and label tokenizers are written to the app's models folder (created if
    missing).
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        print("[LOG] going through app config ", app_config.app.name)

        # Only apps configured for LSTM classification get a model trained.
        # An unset (None) algorithm is not equal to the enum member either,
        # so no separate None check is needed.
        if (app_config.algorithm_config.categorization_algorithm
                != CategorizationAlgorithms.LSTM_CLASSIFICATION):
            continue

        data_config = app_config.fawkes_internal_config.data

        # Path of the processed user reviews file for this app.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # NOTE: the reviews are used unfiltered for training.
        # The third value returned (the cleaned labels) is not used here.
        articles, labels, _ = get_articles_and_labels(reviews)

        trained_model, article_tokenizer, label_tokenizer = train(
            articles, labels)

        trained_lstm_categorization_model_file_path = constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.models_folder,
            app_name=app_config.app.name,
        )

        # Make sure the models folder exists before saving into it.
        dir_name = os.path.dirname(trained_lstm_categorization_model_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        trained_model.save(trained_lstm_categorization_model_file_path)

        # Saving the article tokenizer
        utils.dump_json(
            article_tokenizer.to_json(),
            constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                base_folder=data_config.base_folder,
                dir_name=data_config.models_folder,
                app_name=app_config.app.name,
            ),
        )

        # Saving the label tokenizer
        utils.dump_json(
            label_tokenizer.to_json(),
            constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                base_folder=data_config.base_folder,
                dir_name=data_config.models_folder,
                app_name=app_config.app.name,
            ),
        )