コード例 #1
0
def push_data_to_elasticsearch():
    """Push every app's processed reviews to its Elasticsearch index.

    For each app config listed in the app-config file: load the processed
    reviews, convert them to ``Review`` objects, filter them by channel and
    by the email time span, shuffle them, make sure the target index exists
    and upload them in batches of ``constants.BULK_UPLOAD_SIZE``. A batch
    whose response status is not 200 is reported on stdout and the upload
    moves on to the next batch.
    """
    config_list_path = constants.APP_CONFIG_FILE.format(
        file_name=constants.APP_CONFIG_FILE_NAME
    )
    for app_config_file in utils.open_json(config_list_path):
        app_config = AppConfig(utils.open_json(app_config_file))

        # Location of the reviews written by the processing step.
        data_config = app_config.fawkes_internal_config.data
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load the JSON records and turn each one into a Review object.
        review_records = utils.open_json(processed_user_reviews_file_path)
        reviews = [Review.from_review_json(record) for record in review_records]

        # Keep only the reviews that pass the channel filter and the time
        # cut-off (now, in UTC, minus the configured email time span in days).
        channel_filter = filter_utils.filter_disabled_review_channels(app_config)
        reviews = filter_utils.filter_reviews_by_channel(reviews, channel_filter)
        oldest_allowed = datetime.now(timezone.utc) - timedelta(
            days=app_config.email_config.email_time_span
        )
        reviews = filter_utils.filter_reviews_by_time(reviews, oldest_allowed)

        # The upload order is randomised on purpose.
        # NOTE(review): the original comment attributes this to how
        # Elasticsearch behaves but does not say more - confirm the reason.
        random.shuffle(reviews)

        # Create the index when it is not among the existing ones.
        elastic_url = app_config.elastic_config.elastic_search_url
        index_name = app_config.elastic_config.index
        if index_name not in get_indices(elastic_url):
            create_index(elastic_url, index_name)

        # Upload batch by batch; the last batch may be shorter.
        batch_size = constants.BULK_UPLOAD_SIZE
        for start in range(0, len(reviews), batch_size):
            response = bulk_push_to_elastic(
                elastic_url,
                index_name,
                reviews[start:start + batch_size],
            )
            if response.status_code == 200:
                continue
            print(
                "[Error] push_data_to_elasticsearch :: Got status code : ",
                response.status_code)
            print("[Error] push_data_to_elasticsearch :: Response is : ",
                  response.text)
コード例 #2
0
def parse_reviews():
    """Parse the raw reviews of every configured app into one file per app.

    For each app config listed in the app-config file, every enabled,
    non-blank review channel is parsed with ``parse_json`` or ``parse_csv``
    depending on the channel's ``file_type``. If the app config names a
    custom code module, its ``run_custom_code_post_parse`` hook is applied
    to the combined list. The result is written as JSON to the path built
    from ``constants.PARSED_USER_REVIEWS_FILE_PATH``.

    Raises:
        ValueError: If an enabled channel has a ``file_type`` that is neither
            ``constants.JSON`` nor ``constants.CSV``.
    """
    # Read all the app-config file names
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        parsed_reviews = []

        # We now read the review details for each channel
        for review_channel in app_config.review_channels:
            # Channels are parsed only if they are enabled and not blank.
            if not (review_channel.is_channel_enabled
                    and review_channel.channel_type != ReviewChannelTypes.BLANK):
                continue

            raw_user_reviews_file_path = constants.RAW_USER_REVIEWS_FILE_PATH.format(
                base_folder=data_config.base_folder,
                dir_name=data_config.raw_data_folder,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                extension=review_channel.file_type)

            if review_channel.file_type == constants.JSON:  # Parse JSON
                channel_reviews = parse_json(raw_user_reviews_file_path,
                                             review_channel, app_config)
            elif review_channel.file_type == constants.CSV:  # Parse CSV
                channel_reviews = parse_csv(raw_user_reviews_file_path,
                                            review_channel, app_config)
            else:  # Unsupported file format
                # A bare string cannot be raised; use a real exception type.
                raise ValueError(
                    "Format not supported exception. Check your file-type key in your config."
                )
            parsed_reviews += channel_reviews

        # Executing custom code after parsing.
        if app_config.custom_code_module_path is not None:
            custom_code_module = importlib.import_module(
                app_config.custom_code_module_path, package=None)
            parsed_reviews = custom_code_module.run_custom_code_post_parse(
                parsed_reviews)

        # After parsing the reviews of all channels, we dump them into a file.
        # The file has a particular format.
        # {base_folder}/{dir_name}/{app_name}/parsed-user-feedback.{extension}
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Create the intermediate folders
        dir_name = os.path.dirname(parsed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [parsed_review.to_dict() for parsed_review in parsed_reviews],
            parsed_user_reviews_file_path)
コード例 #3
0
ファイル: lifetime_rating.py プロジェクト: prajwalppv/fawkes
def dump_lifetime_ratings():
    """Refresh the lifetime store ratings stored in Elasticsearch.

    For every app config that defines a ``lifetime_rating_index``, the
    Play Store and App Store lifetime ratings are fetched, wrapped in
    ``Review`` documents, and written to that index. Each document is
    deleted first and then created again under the same hash id, so a
    rerun overrides the previous value.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        if app_config.elastic_config.lifetime_rating_index is None:
            continue

        # The documents are stamped with "now minus one day".
        timestamp = datetime.strftime(datetime.now() - timedelta(1),
                                      constants.TIMESTAMP_FORMAT)

        playstore_rating = getPlayStoreLifetimeRating(app_config)
        appstore_rating = getAppStoreLifetimeRating(app_config)

        # Creating template for uploading lifetime rating
        playstore_doc = Review(
            {},
            timestamp=timestamp,
            rating=playstore_rating,
            app_name=app_config.app.name,
            channel_name="playstore-lifetime",
            channel_type="playstore-lifetime",
            hash_id=utils.calculate_hash(app_config.app.name +
                                         ReviewChannelTypes.ANDROID))
        appstore_doc = Review(
            {},
            timestamp=timestamp,
            # Use the App Store rating here; the Play Store value was
            # previously passed by mistake.
            rating=appstore_rating,
            app_name=app_config.app.name,
            channel_name="appstore-lifetime",
            channel_type="appstore-lifetime",
            hash_id=utils.calculate_hash(app_config.app.name +
                                         ReviewChannelTypes.IOS))
        documents = (playstore_doc, appstore_doc)

        # Deleting document to override
        for document in documents:
            elasticsearch.delete_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                document.hash_id)

        # Uploading again
        for document in documents:
            elasticsearch.create_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                document.hash_id, document)
コード例 #4
0
def parse_keywords_file(keyword_file_name, enable_remove_stop_words=True):
    """Build a ``{topic name: {lemmatized word: 1}}`` mapping from a keywords file.

    Args:
        keyword_file_name: Path of a JSON file whose keys are topic names and
            whose values are sequences of keyword strings.
        enable_remove_stop_words: When true, the cleaned words are passed
            through ``utils.remove_stop_words`` before weights are assigned.

    Returns:
        A dict keyed by topic name; each value maps a lemmatized word to
        the weight ``1``.
    """
    keywords_by_topic = utils.open_json(keyword_file_name)
    topics = {}
    for topic_name in keywords_by_topic:
        # Merge the topic's keywords into one lower-cased, trimmed string.
        text = " ".join(keywords_by_topic[topic_name]).lower().strip()
        # Turn every run of non-alphabet characters into a space, squeeze
        # repeated spaces, then split on whitespace to get the words.
        letters_only = re.sub("[^a-zA-Z]+", " ", text)
        words = re.sub(" +", " ", letters_only).split()
        if enable_remove_stop_words:
            words = utils.remove_stop_words(words)
        # Duplicates collapse through the set; each remaining word is
        # lemmatized and given a weight of 1.
        topics[topic_name] = {
            lmtzr.lemmatize(word.lower()): 1 for word in set(words)
        }
    return topics
コード例 #5
0
def parse_json(raw_user_reviews_file_path, review_channel, app_config):
    """Parse a raw JSON reviews file into a list of ``Review`` objects.

    Args:
        raw_user_reviews_file_path: Path of the JSON file holding the raw
            reviews of one channel.
        review_channel: Channel configuration. Its ``message_key``,
            ``timestamp_key`` and optional ``rating_key`` are dot-separated
            key paths that are looked up in each raw review.
        app_config: App configuration; only ``app_config.app.name`` is read.

    Returns:
        One ``Review`` per entry of the raw file, in file order. ``rating``
        is ``None`` when the channel defines no ``rating_key``.
    """
    reviews = utils.open_json(raw_user_reviews_file_path)
    parsed_reviews = []

    for review in reviews:
        # TODO: Convert this to a standard format like jsonpath
        message = utils.get_json_key_value(
            review, review_channel.message_key.split("."))
        timestamp = utils.get_json_key_value(
            review, review_channel.timestamp_key.split("."))
        # The rating is optional: it is only looked up when a key is configured.
        rating = None
        if review_channel.rating_key is not None:
            rating = utils.get_json_key_value(
                review, review_channel.rating_key.split("."))

        # Add the review object to the parsed reviews
        parsed_reviews.append(
            Review(
                review,
                message=message,
                timestamp=timestamp,
                rating=rating,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                channel_type=review_channel.channel_type,
                review_timezone=review_channel.timezone,
                timestamp_format=review_channel.timestamp_format,
            ))

    return parsed_reviews
コード例 #6
0
def generate_keyword_weights():
    """Write the keyword-weight files for every configured app.

    For each app config listed in the app-config file, the category keywords
    file and the bug/feature keywords file are run through
    ``parse_keywords_file`` and the results are dumped as JSON to the
    matching ``*_weights_file`` paths from the algorithm config. The
    bug/feature keywords are parsed with stop-word removal turned off.
    """
    config_list_path = constants.APP_CONFIG_FILE.format(
        file_name=constants.APP_CONFIG_FILE_NAME)
    for app_config_file in utils.open_json(config_list_path):
        algorithm_config = AppConfig(
            utils.open_json(app_config_file)).algorithm_config

        # Category keywords first (stop words removed by default).
        category_weights = parse_keywords_file(
            algorithm_config.category_keywords_file)
        utils.dump_json(
            category_weights,
            algorithm_config.category_keywords_weights_file,
        )

        # Then the bug-feature keywords, keeping stop words.
        bug_feature_weights = parse_keywords_file(
            algorithm_config.bug_feature_keywords_file, False)
        utils.dump_json(
            bug_feature_weights,
            algorithm_config.bug_feature_keywords_weights_file,
        )
コード例 #7
0
ファイル: fetch.py プロジェクト: prajwalppv/fawkes
def fetch_reviews():
    """Fetch the raw reviews of every enabled channel and store them on disk.

    For each app config listed in the app-config file, every enabled,
    non-blank review channel is fetched with the fetcher that matches its
    channel type; channels of any other type are skipped. The fetched data
    is written to the path built from ``constants.RAW_USER_REVIEWS_FILE_PATH``
    (as JSON when the channel's ``file_type`` is ``constants.JSON``,
    otherwise written as-is). Finally, if the app config names a custom code
    module, its ``run_custom_code_post_fetch`` hook is called.
    """
    # Read the app-config.json file.
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(file_name=constants.APP_CONFIG_FILE_NAME)
    )
    # Every app registered in app-config.json is handled in turn.
    for app_config_file in app_configs:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data

        # Each app has a list of review channels from which the user reviews are fetched.
        for review_channel in app_config.review_channels:
            if not (review_channel.is_channel_enabled
                    and review_channel.channel_type != ReviewChannelTypes.BLANK):
                continue

            # Depending on the channel type, we have different "fetchers" to get the data.
            channel_type = review_channel.channel_type
            if channel_type == ReviewChannelTypes.TWITTER:
                reviews = tweets.fetch(review_channel)
            elif channel_type == ReviewChannelTypes.SALESFORCE:
                reviews = salesforce.fetch(review_channel)
            elif channel_type == ReviewChannelTypes.SPREADSHEET:
                reviews = spreadsheet.fetch(review_channel)
            elif channel_type == ReviewChannelTypes.CSV:
                reviews = comma_separated_values.fetch(review_channel)
            elif channel_type == ReviewChannelTypes.ANDROID:
                reviews = playstore.fetch(review_channel)
            elif channel_type == ReviewChannelTypes.IOS:
                reviews = appstore.fetch(review_channel)
            else:
                # No fetcher for this channel type.
                continue

            # After fetching the review for that particular channel, we dump it into a file.
            # The file has a particular format.
            # {base_folder}/{dir_name}/{app_name}/{channel_name}-raw-feedback.{extension}
            raw_user_reviews_file_path = constants.RAW_USER_REVIEWS_FILE_PATH.format(
                base_folder=data_config.base_folder,
                dir_name=data_config.raw_data_folder,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                extension=review_channel.file_type)

            # Create the intermediate folders
            dir_name = os.path.dirname(raw_user_reviews_file_path)
            pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

            if review_channel.file_type == constants.JSON:
                utils.dump_json(reviews, raw_user_reviews_file_path)
            else:
                # The file must be opened for writing; the default mode is
                # read-only and would make write() fail.
                with open(raw_user_reviews_file_path, "w") as file:
                    file.write(reviews)

        # There are lot of use-cases where we need to execute custom code after the data is fetched.
        # This might include data-transformation, cleanup etc.
        # This is the right place to do that.
        if app_config.custom_code_module_path is not None:
            custom_code_module = importlib.import_module(
                app_config.custom_code_module_path, package=None)
            # The hook takes no arguments here and its result is not used.
            custom_code_module.run_custom_code_post_fetch()
コード例 #8
0
    category = review1.derived_insight.category
    # If the category has not been found, it will be "uncategorized"
    # All reviews in uncategorized have a score of 0
    # So we return True in such cases
    if category != constants.CATEGORY_NOT_FOUND:
        return (review2.derived_insight.extra_properties[
            constants.CATEGORY_SCORES][category] -
                review1.derived_insight.extra_properties[
                    constants.CATEGORY_SCORES][category])
    else:
        return True


if __name__ == "__main__":
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        # Path where the user reviews were stored after parsing.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.
            processed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
コード例 #9
0
def run_algo():
    """Run the analysis algorithms over the parsed reviews of every app.

    For each app config listed in the app-config file the function:

    1. loads the parsed reviews and converts them to ``Review`` objects;
    2. filters them by channel and by ``algorithm_days_filter``;
    3. adds a sentiment score to every review (in a process pool);
    4. applies text-match categorization when a categorization algorithm and
       a category keyword-weights file are configured;
    5. applies bug/feature classification when a bug-feature keyword-weights
       file is configured;
    6. applies LSTM categorization when the configured algorithm is
       ``CategorizationAlgorithms.LSTM_CLASSIFICATION``;
    7. dumps the resulting reviews as JSON to the processed-reviews path.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(file_name=constants.APP_CONFIG_FILE_NAME)
    )
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        algorithm_config = app_config.algorithm_config

        # Path where the user reviews were stored after parsing.
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(parsed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc) - timedelta(days=algorithm_config.algorithm_days_filter)
        )

        # Number of processes to use. os.cpu_count() can return None when the
        # count cannot be determined; fall back to a single process then.
        num_processes = min(constants.PROCESS_NUMBER, os.cpu_count() or 1)

        # When the CIRCLECI variable is present the pool size is pinned to 2.
        if constants.CIRCLECI in os.environ:
            num_processes = 2

        # Adding sentiment
        with Pool(num_processes) as process:
            reviews = process.map(add_review_sentiment_score, reviews)

        if (algorithm_config.categorization_algorithm is not None
                and algorithm_config.category_keywords_weights_file is not None):
            # We read from the topic file first
            topics = utils.open_json(algorithm_config.category_keywords_weights_file)

            # Adding text-match categorization
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(
                        text_match_categortization,
                        app_config=app_config,
                        topics=topics
                    ),
                    reviews
                )

        if algorithm_config.bug_feature_keywords_weights_file is not None:
            # We read from the topic file first
            topics = utils.open_json(algorithm_config.bug_feature_keywords_weights_file)

            # Adding bug/feature classification
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(
                        bug_feature_classification,
                        topics=topics
                    ),
                    reviews
                )

        if algorithm_config.categorization_algorithm == CategorizationAlgorithms.LSTM_CLASSIFICATION:
            # The model and both tokenizer files are addressed with the same
            # placeholders.
            model_path_args = {
                "base_folder": data_config.base_folder,
                "dir_name": data_config.models_folder,
                "app_name": app_config.app.name,
            }

            # Load the TensorFlow model
            model = tf.keras.models.load_model(
                constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(**model_path_args)
            )

            # Load the article tokenizer file
            article_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                utils.open_json(
                    constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(**model_path_args)
                )
            )

            # Load the label tokenizer file
            label_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                utils.open_json(
                    constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(**model_path_args)
                )
            )

            # Map each cleaned label (non-word characters removed,
            # lower-cased) back to the category it was derived from.
            cleaned_labels = {}
            for review in reviews:
                label = review.derived_insight.category
                cleaned_label = re.sub(r'\W+', '', label).lower()
                cleaned_labels[cleaned_label] = label

            # Adding LSTM categorization
            reviews = lstm_classification(
                reviews,
                model,
                article_tokenizer,
                label_tokenizer,
                cleaned_labels
            )

        # Path where the processed reviews are written.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Create the intermediate folders
        dir_name = os.path.dirname(processed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [review.to_dict() for review in reviews],
            processed_user_reviews_file_path,
        )
コード例 #10
0
ファイル: lstm_classifier.py プロジェクト: prajwalppv/fawkes
def train_lstm_model():
    """Train and save the LSTM categorization model for every eligible app.

    Apps whose categorization algorithm is not
    ``CategorizationAlgorithms.LSTM_CLASSIFICATION`` are skipped. For the
    others, the processed reviews are loaded and converted to ``Review``
    objects, articles and labels are extracted from them, a model is trained,
    and the model plus the article and label tokenizers are saved under the
    app's models folder.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        print("[LOG] going through app config ", app_config.app.name)

        # Only apps configured for LSTM classification are trained. A value
        # of None is not equal to the LSTM constant, so no separate None
        # check is needed.
        if (app_config.algorithm_config.categorization_algorithm
                != CategorizationAlgorithms.LSTM_CLASSIFICATION):
            continue

        data_config = app_config.fawkes_internal_config.data

        # Path where the user reviews were stored after processing.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # The third returned value (cleaned labels) is not needed for training.
        articles, labels, _ = get_articles_and_labels(reviews)

        trained_model, article_tokenizer, label_tokenizer = train(
            articles, labels)

        # The model and both tokenizer files share the same placeholders.
        model_path_args = {
            "base_folder": data_config.base_folder,
            "dir_name": data_config.models_folder,
            "app_name": app_config.app.name,
        }

        trained_lstm_categorization_model_file_path = (
            constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(**model_path_args))

        # Create the intermediate folders
        dir_name = os.path.dirname(trained_lstm_categorization_model_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        trained_model.save(trained_lstm_categorization_model_file_path)

        # Saving the article tokenizer
        utils.dump_json(
            article_tokenizer.to_json(),
            constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                **model_path_args),
        )

        # Saving the label tokenizer
        utils.dump_json(
            label_tokenizer.to_json(),
            constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                **model_path_args),
        )
コード例 #11
0
ファイル: sanity.py プロジェクト: prajwalppv/fawkes
    def test_sanity(self):
        """
        Sanity check: parse the sample data, generate the keyword weights,
        run the algorithms, and compare both output files with the expected
        records.
        """
        # Fields that are identical in the parsed and the processed record.
        shared_fields = {
            "message":
            "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still cant add all of my financial institutions so my budget is kind of skewed. But other that I can say Im more aware of my spending",
            "timestamp": "2020/03/15 14:13:17",
            "rating": 5,
            "app_name": "sample-mint",
            "channel_name": "appstore",
            "channel_type": "ios",
        }

        # Step 1: parsing. The parsed record carries no derived insight yet.
        parse.parse_reviews()
        actual_parsed = utils.open_json(
            "data/parsed_data/sample-mint/parsed-user-feedback.json")
        parsed_record = {
            **shared_fields,
            "hash_id": "de848685d11742dbea77e1e5ad7b892088ada9c9",
            "derived_insight": {
                "sentiment": None,
                "category": "uncategorized",
                "extra_properties": {},
            },
        }
        self.assertEqual(actual_parsed, [parsed_record])

        # Step 2: the keyword weights must exist before the algorithms run.
        generate_keyword_weights.generate_keyword_weights()

        # Step 3: run the algorithms and check the processed record.
        algo.run_algo()
        actual_processed = utils.open_json(
            "data/processed_data/sample-mint/processed-user-feedback.json")
        processed_record = {
            **shared_fields,
            "hash_id": "6dde3aa82726c0a9e3777623854d839184767571",
            "derived_insight": {
                "sentiment": {
                    "neg": 0.0,
                    "neu": 0.928,
                    "pos": 0.072,
                    "compound": 0.4767,
                },
                "category": "Application",
                "extra_properties": {
                    "category_scores": {
                        "User Experience": 0,
                        "sign-in/sign-up": 0,
                        "Notification": 0,
                        "Application": 1,
                        "ads": 0,
                    },
                    "bug_feature": "feature",
                },
            },
        }
        self.assertEqual(actual_processed, [processed_record])
コード例 #12
0
ファイル: send_email.py プロジェクト: prajwalppv/fawkes
                      sendgrid_api_key):
    message = Mail(from_email=from_email_address,
                   to_emails=to_email,
                   subject=subject,
                   html_content=html)
    try:
        sg = SendGridAPIClient(sendgrid_api_key)
        response = sg.send(message)
        print("[LOG] Email to ", to_email, response.status_code)
    except Exception as e:
        print(e.message)


if __name__ == "__main__":
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(file_name=constants.APP_CONFIG_FILE_NAME)
    )
    for app_config_file in app_configs:
        app_config = AppConfig(
            utils.open_json(
                app_config_file
            )
        )
        # Path where the generated email in html format will be stored
        email_summary_generated_file_path = constants.EMAIL_SUMMARY_GENERATED_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.emails_folder,
            app_name=app_config.app.name,
        )

        dir_name = os.path.dirname(email_summary_generated_file_path)