def query_from_elasticsearch(fawkes_config_file=constants.FAWKES_CONFIG_FILE, query_term="", format=constants.JSON):
    """
    Query the Elasticsearch endpoint of every registered app and store the response.

    @param{string}: fawkes_config_file - config file path
    @param{string}: query_term - optional path segment placed between the
        configured Elasticsearch URL and the search suffix; when empty, the
        search suffix is appended to the URL directly
    @param{string}: format - handed to utils.write_query_results and used as
        the extension of the results file
    @returns: the decoded JSON response of the last app that was queried, or
        None when the config registers no apps

    NOTE(review): when several apps are registered, only the last app's
    response is returned (each response is still written to its own file) -
    confirm this is intended.
    """
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    # Bound up front so that an empty app list returns None instead of
    # failing with UnboundLocalError at the final `return`.
    results = None

    # For every app registered in app-config.json we run the query.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))

        # Build the search endpoint, scoped by the query term when one is given.
        search_url = app_config.elastic_config.elastic_search_url
        if query_term == "":
            endpoint = search_url + "_" + constants.SEARCH
        else:
            endpoint = search_url + query_term + "/" + "_" + constants.SEARCH

        response = requests.get(endpoint)
        results = json.loads(response.text)

        # Path where the query response of this app is stored.
        query_response_file = constants.ELASTICSEARCH_FETCH_DATA_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.query_folder,
            app_name=app_config.app.name,
            extension=format,
        )
        utils.write_query_results(results, query_response_file, format)

    return results
def send_email(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Send the previously generated email summary of every app to its recipients.

    @param{string}: fawkes_config_file - config file path

    For each registered app the generated HTML summary is read from disk and
    passed to send_email_helper once per address in the app's email list.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        email_config = app_config.email_config

        # Location of the generated HTML summary for this app.
        summary_path = constants.EMAIL_SUMMARY_GENERATED_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.emails_folder,
            app_name=app_config.app.name,
        )

        # The containing folder is created if missing; the summary file
        # itself must already exist for the read below to succeed.
        summary_dir = os.path.dirname(summary_path)
        pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)

        with open(summary_path, "r") as summary_file:
            email_body = summary_file.read()

        # One email per configured recipient.
        for recipient in email_config.email_list:
            send_email_helper(
                email_config.sender_email_address,
                recipient,
                email_config.email_subject_name,
                email_body,
                email_config.sendgrid_api_key,
            )
def get_similar_reviews_for_app(app_config_file, query, num_results):
    """
    Find the reviews of one app that are similar to `query` and dump them to a file.

    @param{string}: app_config_file - path of the app config file
    @param: query - query handed to get_similar_reviews; its hash is part of the results file name
    @param: num_results - handed to get_similar_reviews unchanged

    The results are written as a JSON list of {"score", "review"} entries;
    nothing is returned.
    """
    app_config = AppConfig(utils.open_json(app_config_file))
    data_config = app_config.fawkes_internal_config.data
    app_name = app_config.app.name

    # Mark the start of the query in the logs.
    logging.info(logs.QUERY_START, FawkesActions.QUERY_SIMILAR_REVIEWS, "ALL", app_name)

    # Load the processed reviews and turn every JSON entry into a Review object.
    processed_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
        base_folder=data_config.base_folder,
        dir_name=data_config.processed_data_folder,
        app_name=app_name,
    )
    reviews = [Review.from_review_json(entry) for entry in utils.open_json(processed_path)]

    # Filter by channel first, then by the algorithm time window.
    channel_filtered = filter_utils.filter_reviews_by_channel(
        reviews,
        filter_utils.filter_disabled_review_channels(app_config),
    )
    oldest_allowed = datetime.now(timezone.utc) - timedelta(
        days=app_config.algorithm_config.algorithm_days_filter)
    reviews = filter_utils.filter_reviews_by_time(channel_filtered, oldest_allowed)

    similar_reviews = get_similar_reviews(reviews, query, num_results)

    # Mark the end of the query in the logs.
    logging.info(logs.QUERY_END, FawkesActions.QUERY_SIMILAR_REVIEWS, "ALL", app_name)

    # Create the intermediate folders before writing the results.
    results_path = constants.QUERY_RESULTS_FILE_PATH.format(
        base_folder=data_config.base_folder,
        dir_name=data_config.query_folder,
        app_name=app_name,
        query_hash=utils.calculate_hash(query),
    )
    pathlib.Path(os.path.dirname(results_path)).mkdir(parents=True, exist_ok=True)

    serialized = []
    for score, review in similar_reviews:
        serialized.append({
            "score": score,
            "review": review.to_dict(),
        })
    utils.dump_json(serialized, results_path)
def dump_lifetime_ratings(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Upload the lifetime Play Store and App Store ratings of every app to Elasticsearch.

    @param{string}: fawkes_config_file - config file path

    Apps without a lifetime rating index configured are skipped. For the
    others, one document per store is built with a hash id derived from the
    app name and the store's channel type; the existing document with that id
    is deleted and the new one is created in its place.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    # For every app registered in app-config.json we upload the ratings.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))
        elastic_config = app_config.elastic_config

        if elastic_config.lifetime_rating_index is None:
            continue

        # The documents are stamped with the time one day before now.
        rating_timestamp = datetime.strftime(
            datetime.now() - timedelta(1), constants.TIMESTAMP_FORMAT)

        playstore_rating = getPlayStoreLifetimeRating(app_config)
        appstore_rating = getAppStoreLifetimeRating(app_config)

        # Creating template for uploading lifetime rating
        playstore_doc = Review(
            {},
            timestamp=rating_timestamp,
            rating=playstore_rating,
            app_name=app_config.app.name,
            channel_name="playstore-lifetime",
            channel_type="playstore-lifetime",
            hash_id=utils.calculate_hash(app_config.app.name + ReviewChannelTypes.ANDROID))
        appstore_doc = Review(
            {},
            timestamp=rating_timestamp,
            # Fixed: this document previously carried the Play Store rating,
            # leaving the fetched App Store rating unused.
            rating=appstore_rating,
            app_name=app_config.app.name,
            channel_name="appstore-lifetime",
            channel_type="appstore-lifetime",
            hash_id=utils.calculate_hash(app_config.app.name + ReviewChannelTypes.IOS))

        lifetime_docs = (playstore_doc, appstore_doc)

        # Deleting document to override
        for doc in lifetime_docs:
            elasticsearch.delete_document(
                elastic_config.elastic_search_url,
                elastic_config.lifetime_rating_index,
                "_doc",
                doc.hash_id)

        # Uploading again
        for doc in lifetime_docs:
            elasticsearch.create_document(
                elastic_config.elastic_search_url,
                elastic_config.lifetime_rating_index,
                "_doc",
                doc.hash_id,
                doc)
def send_reviews_to_slack(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Post the recent processed reviews of every app to its Slack channel.

    @param{string}: fawkes_config_file - config file path

    Reviews are filtered by channel and by the Slack run interval (in
    minutes), ordered by their compound sentiment score in descending order
    and sent one by one through send_review_to_slack.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        slack_config = app_config.slack_config

        # Log the current operation which is being performed.
        logging.info(logs.OPERATION, FawkesActions.PUSH_SLACK, "ALL", app_config.app.name)

        # Path where the processed user reviews are stored.
        processed_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load the reviews and convert each JSON entry to a Review object.
        reviews = [Review.from_review_json(entry) for entry in utils.open_json(processed_path)]

        # Keep only the reviews that pass the channel filter and fall inside
        # the run interval.
        channel_filtered = filter_utils.filter_reviews_by_channel(
            reviews,
            filter_utils.filter_disabled_review_channels(app_config),
        )
        oldest_allowed = datetime.now(timezone.utc) - timedelta(
            minutes=slack_config.slack_run_interval)
        reviews = filter_utils.filter_reviews_by_time(channel_filtered, oldest_allowed)

        # Log the number of reviews we got.
        logging.info(logs.NUM_REVIEWS, len(reviews), "ALL", app_config.app.name)

        # Highest compound sentiment first.
        ordered_reviews = sorted(
            reviews,
            key=lambda item: item.derived_insight.sentiment["compound"],
            reverse=True,
        )

        for item in ordered_reviews:
            send_review_to_slack(
                slack_config.slack_hook_url,
                slack_config.slack_channel,
                item,
                app_config,
            )
def run_bug_feature_categorization(reviews, app_config, num_processes):
    """
    Apply bug/feature classification to the reviews when that algorithm is enabled.

    @param: reviews - reviews to classify
    @param: app_config - app config; decides whether the algorithm runs and
        names the keyword weights file
    @param{int}: num_processes - size of the process pool used for the classification
    @returns: the classified reviews, or the input reviews unchanged when the
        algorithm is not listed in algorithms_to_run
    """
    enabled_algorithms = app_config.algorithm_config.algorithms_to_run
    if Algorithms.BUG_FEATURE_CATEGORIZATION not in enabled_algorithms:
        return reviews

    # Log that the algorithm is starting.
    logging.info(
        logs.CURRENT_ALGORITHM_START,
        Algorithms.BUG_FEATURE_CATEGORIZATION,
        "ALL",
        app_config.app.name)

    # We read the keyword weights from the topic file first.
    keyword_weights = utils.open_json(
        app_config.algorithm_config.categorization.bug_feature_keywords_weights_file)

    # Adding bug/feature classification, spread over a pool of worker processes.
    classify = partial(bug_feature_classification, topics=keyword_weights)
    with Pool(num_processes) as pool:
        reviews = pool.map(classify, reviews)

    # Log that the algorithm has finished.
    logging.info(
        logs.CURRENT_ALGORITHM_END,
        Algorithms.BUG_FEATURE_CATEGORIZATION,
        "ALL",
        app_config.app.name)

    return reviews
def parse_keywords_file(keyword_file_name, enable_remove_stop_words=True):
    """
    Build a per-topic word-weight mapping from a keywords JSON file.

    @param{string}: keyword_file_name - path of a JSON file mapping each topic
        name to a list of keyword strings
    @param{bool}: enable_remove_stop_words - when true, the words are passed
        through utils.remove_stop_words before weighting
    @returns{map<string,map<string,int>>}: topics - for every topic, the
        lemmatized words of its keywords, each with weight 1
    """
    # Hoisted out of the loop: one pattern object for all topics.
    non_alphabet = re.compile(r"[^a-zA-Z]+")

    # Topics is a dict, key = Topic Name. value = dict of words and weights.
    topics = {}
    for topic_name, keywords in utils.open_json(keyword_file_name).items():
        # Lower-case the keywords and remove all leading and trailing white space.
        line = " ".join(keywords).lower().strip()

        # Replace every run of non-alphabet characters with a single space and
        # split on white space. Runs of spaces are non-alphabet runs too, so
        # no separate space-collapsing pass is needed.
        words = non_alphabet.sub(" ", line).split()

        # Remove the stopwords.
        if enable_remove_stop_words:
            words = utils.remove_stop_words(words)

        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the generated mapping has the same key order on every run (iterating
        # a set of strings does not guarantee that across interpreter runs).
        topics[topic_name] = {
            lmtzr.lemmatize(word.lower()): 1
            for word in dict.fromkeys(words)
        }

    return topics
def parse_json(raw_user_reviews_file_path, review_channel, app_config):
    """
    Parses the JSON file of a review channel into Review objects.

    @param{string}: raw_user_reviews_file_path - path of the raw reviews JSON file
    @param: review_channel - channel config; supplies the dotted key paths of
        the message, timestamp and (optional) rating, plus channel metadata
    @param: app_config - app config; supplies the app name
    @returns{list<Review>}: parsed_reviews - one Review per entry of the file
    """
    reviews = utils.open_json(raw_user_reviews_file_path)

    parsed_reviews = []
    for review in reviews:
        # TODO: Convert this to a standard format like jsonpath.
        # Extract the message.
        message = utils.get_json_key_value(
            review, review_channel.message_key.split("."))

        # Extract the timestamp.
        timestamp = utils.get_json_key_value(
            review, review_channel.timestamp_key.split("."))

        # Extract the rating if the channel defines a rating key.
        rating = None
        if review_channel.rating_key is not None:
            rating = utils.get_json_key_value(
                review, review_channel.rating_key.split("."))

        # Add the review object to the parsed reviews
        parsed_reviews.append(
            Review(
                review,
                message=message,
                timestamp=timestamp,
                rating=rating,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                channel_type=review_channel.channel_type,
                review_timezone=review_channel.timezone,
                timestamp_format=review_channel.timestamp_format,
            )
        )

    return parsed_reviews
def validate_app_config_schema(self, document):
    """
    Validate an app config document against the app config JSON schema.

    @param: document - the parsed app config to validate
    @raises{ValidationError}: when the document does not conform to the
        schema; the message is prefixed with
        "App config schema validation failed: "
    """
    schema = utils.open_json(constants.APP_CONFIG_SCHEMA_FILE)

    # Only the validation call is guarded, so a problem while loading the
    # schema file surfaces as its own error.
    try:
        jsonschema.validate(document, schema)
    except ValidationError as e:
        # Chain explicitly so the original validation error stays attached
        # as the cause of the re-raised one.
        raise ValidationError(
            "App config schema validation failed: " + str(e.message)) from e
def generate_keyword_weights(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Generate the keyword weights files of every registered app.

    @param{string}: fawkes_config_file - config file path

    For each app, the category keywords file and the bug-feature keywords
    file are parsed with parse_keywords_file and the results are dumped to
    the corresponding weights files named in the algorithm config.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        algorithm_config = AppConfig(utils.open_json(app_config_file)).algorithm_config

        # Category keywords come first.
        category_weights = parse_keywords_file(algorithm_config.category_keywords_file)
        utils.dump_json(category_weights, algorithm_config.category_keywords_weights_file)

        # Bug-feature keywords follow; the second argument is passed as False
        # (in parse_keywords_file that position is enable_remove_stop_words).
        bug_feature_weights = parse_keywords_file(
            algorithm_config.bug_feature_keywords_file, False)
        utils.dump_json(
            bug_feature_weights, algorithm_config.bug_feature_keywords_weights_file)
def _classify_reviews_with_lstm(reviews, app_config):
    """Load the app's LSTM category model and tokenizers and run lstm_classification."""
    data_config = app_config.fawkes_internal_config.data
    # All three model artifacts live under the same folder and are addressed
    # with the same format arguments.
    path_args = {
        "base_folder": data_config.base_folder,
        "dir_name": data_config.models_folder,
        "app_name": app_config.app.name,
    }

    # Load the TensorFlow model
    model = tf.keras.models.load_model(
        constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(**path_args))

    # Load the article tokenizer file
    article_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
        utils.open_json(
            constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(**path_args)))

    # Load the label tokenizer file
    label_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
        utils.open_json(
            constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(**path_args)))

    # Map each label stripped of non-word characters and lower-cased back to
    # the original category label found on the reviews.
    cleaned_labels = {}
    for review in reviews:
        label = review.derived_insight.category
        cleaned_labels[re.sub(r'\W+', '', label).lower()] = label

    # Adding LSTM categorization
    return lstm_classification(
        reviews, model, article_tokenizer, label_tokenizer, cleaned_labels)


def run_algo(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Run the configured algorithms on the parsed reviews of every app.

    @param{string}: fawkes_config_file - config file path

    For each app the parsed reviews are loaded, filtered by channel and by
    the algorithm time window, and then passed through: sentiment scoring,
    text-match categorization (when a categorization algorithm and a category
    weights file are configured), bug/feature classification (when a
    bug-feature weights file is configured) and LSTM classification (when
    that is the configured categorization algorithm). The result is dumped to
    the processed reviews file.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    # Number of processes to make. os.cpu_count() may return None when the
    # count cannot be determined; fall back to 1 so min() does not raise.
    num_processes = min(constants.PROCESS_NUMBER, os.cpu_count() or 1)
    if constants.CIRCLECI in os.environ:
        num_processes = 2

    # For every app registered in app-config.json we run the algorithms.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        algorithm_config = app_config.algorithm_config

        # Path where the user reviews were stored after parsing.
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(parsed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc) -
            timedelta(days=algorithm_config.algorithm_days_filter))

        # Adding sentiment
        with Pool(num_processes) as process:
            reviews = process.map(add_review_sentiment_score, reviews)

        if (algorithm_config.categorization_algorithm is not None
                and algorithm_config.category_keywords_weights_file is not None):
            # We read from the topic file first
            topics = utils.open_json(algorithm_config.category_keywords_weights_file)

            # Adding text-match categorization
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(text_match_categortization,
                            app_config=app_config,
                            topics=topics),
                    reviews)

        if algorithm_config.bug_feature_keywords_weights_file is not None:
            # We read from the topic file first
            topics = utils.open_json(algorithm_config.bug_feature_keywords_weights_file)

            # Adding bug/feature classification
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(bug_feature_classification, topics=topics),
                    reviews)

        if algorithm_config.categorization_algorithm == CategorizationAlgorithms.LSTM_CLASSIFICATION:
            reviews = _classify_reviews_with_lstm(reviews, app_config)

        # Create the intermediate folders
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )
        dir_name = os.path.dirname(processed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [review.to_dict() for review in reviews],
            processed_user_reviews_file_path,
        )
def fetch_reviews(fawkes_config_file = constants.FAWKES_CONFIG_FILE):
    """
    Fetch the raw user reviews of every enabled review channel of every app.

    @param{string}: fawkes_config_file - config file path

    The fetched data of each channel is written to the channel's raw reviews
    file: as JSON when the channel's file type is JSON, otherwise written
    as-is in text mode. Channels with an unknown channel type are skipped.
    After the channels of an app are handled, the app's custom post-fetch
    hook is run when a custom code module is configured.
    """
    # Depending on the channel type, we have different "fetchers" to get the data.
    fetchers = {
        ReviewChannelTypes.TWITTER: tweets.fetch,
        ReviewChannelTypes.SALESFORCE: salesforce.fetch,
        ReviewChannelTypes.SPREADSHEET: spreadsheet.fetch,
        ReviewChannelTypes.CSV: comma_separated_values.fetch,
        ReviewChannelTypes.ANDROID: playstore.fetch,
        ReviewChannelTypes.IOS: appstore.fetch,
        ReviewChannelTypes.SPLUNK: splunk.fetch,
        ReviewChannelTypes.REMOTE_FILE: remote.fetch,
        ReviewChannelTypes.VERTICA: vertica.fetch,
    }

    # Read the app-config.json file.
    fawkes_config = FawkesConfig(
        utils.open_json(fawkes_config_file)
    )

    # For every app registered in app-config.json we fetch the reviews.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(
            utils.open_json(app_config_file)
        )

        # Each app has a list of review channels from which the user reviews are fetched.
        for review_channel in app_config.review_channels:
            if not review_channel.is_channel_enabled:
                continue
            if review_channel.channel_type == ReviewChannelTypes.BLANK:
                continue

            # Log the current operation which is being performed.
            logging.info(logs.OPERATION, FawkesActions.FETCH, review_channel.channel_name, app_config.app.name)

            # Channel types without a fetcher are skipped.
            fetch = fetchers.get(review_channel.channel_type)
            if fetch is None:
                continue
            reviews = fetch(review_channel)

            # Log the number of reviews we got.
            logging.info(logs.NUM_REVIEWS, len(reviews), review_channel.channel_name, app_config.app.name)

            # After fetching the review for that particular channel, we dump it into a file.
            # The file has a particular format.
            # {base_folder}/{dir_name}/{app_name}/{channel_name}-raw-feedback.{extension}
            raw_user_reviews_file_path = constants.RAW_USER_REVIEWS_FILE_PATH.format(
                base_folder=app_config.fawkes_internal_config.data.base_folder,
                dir_name=app_config.fawkes_internal_config.data.raw_data_folder,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                extension=review_channel.file_type)

            # Create the intermediate folders
            dir_name = os.path.dirname(raw_user_reviews_file_path)
            pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

            if review_channel.file_type == constants.JSON:
                utils.dump_json(reviews, raw_user_reviews_file_path)
            else:
                with open(raw_user_reviews_file_path, "w") as raw_file:
                    raw_file.write(reviews)

        # There are lot of use-cases where we need to execute custom code after the data is fetched.
        # This might include data-transformation, cleanup etc.
        # This is the right place to do that.
        # NOTE(review): the hook is run once per app, after all of its
        # channels - confirm this matches the intended placement.
        if app_config.custom_code_module_path is not None:
            custom_code_module = importlib.import_module(app_config.custom_code_module_path, package=None)
            # The hook's return value was never used, so it is not kept.
            custom_code_module.run_custom_code_post_fetch()
def parse_reviews(fawkes_config_file = constants.FAWKES_CONFIG_FILE):
    """
    Parse the raw reviews of every enabled channel of every app into one file per app.

    @param{string}: fawkes_config_file - config file path
    @raises{ValueError}: when an enabled channel has a file type other than
        JSON, CSV or JSON lines

    The reviews of all channels of an app are collected, optionally passed
    through the app's custom post-parse hook, and dumped to the app's parsed
    reviews file.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(
        utils.open_json(fawkes_config_file)
    )

    # For every app registered in app-config.json we parse the reviews.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(
            utils.open_json(app_config_file)
        )
        parsed_reviews = []

        # We now read the review details for each channel
        for review_channel in app_config.review_channels:
            # We parse the channels only if its enabled!
            if not review_channel.is_channel_enabled:
                continue
            if review_channel.channel_type == ReviewChannelTypes.BLANK:
                continue

            raw_user_reviews_file_path = constants.RAW_USER_REVIEWS_FILE_PATH.format(
                base_folder=app_config.fawkes_internal_config.data.base_folder,
                dir_name=app_config.fawkes_internal_config.data.raw_data_folder,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                extension=review_channel.file_type
            )

            if review_channel.file_type == constants.JSON:
                # Parse JSON
                channel_reviews = parse_json(
                    raw_user_reviews_file_path,
                    review_channel,
                    app_config
                )
            elif review_channel.file_type == constants.CSV:
                # Parse CSV
                channel_reviews = parse_csv(
                    raw_user_reviews_file_path,
                    review_channel,
                    app_config
                )
            elif review_channel.file_type == constants.JSON_LINES:
                # Parse JSON lines
                channel_reviews = parse_json_lines(
                    raw_user_reviews_file_path,
                    review_channel,
                    app_config
                )
            else:
                # Unsupported file format. A real exception is raised here:
                # raising a bare string fails with an unrelated TypeError and
                # the intended message is lost.
                raise ValueError(
                    "Format not supported exception. Check your file-type key in your config."
                )

            parsed_reviews += channel_reviews

        # Executing custom code after parsing.
        if app_config.custom_code_module_path is not None:
            custom_code_module = importlib.import_module(app_config.custom_code_module_path, package=None)
            parsed_reviews = custom_code_module.run_custom_code_post_parse(
                parsed_reviews)

        # After parsing the reviews for that all channels, we dump it into a file.
        # The file has a particular format.
        # {base_folder}/{dir_name}/{app_name}/parsed-user-feedback.{extension}
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Create the intermediate folders
        dir_name = os.path.dirname(parsed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [parsed_review.to_dict() for parsed_review in parsed_reviews],
            parsed_user_reviews_file_path
        )
def push_data_to_elasticsearch(
        fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Push the processed reviews of every app to its Elasticsearch index.

    @param{string}: fawkes_config_file - config file path

    Reviews are filtered by channel and by the Elasticsearch day window,
    shuffled, and uploaded in slices of constants.BULK_UPLOAD_SIZE. The index
    is created first when it is not among the existing indices. A non-200
    response is printed and the upload continues with the next slice.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data
        elastic_config = app_config.elastic_config

        # Log the current operation which is being performed.
        logging.info(logs.OPERATION, FawkesActions.PUSH_ELASTICSEARCH, "ALL",
                     app_config.app.name)

        # Path where the processed user reviews are stored.
        processed_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load the reviews and convert each JSON entry to a Review object.
        reviews = [Review.from_review_json(entry) for entry in utils.open_json(processed_path)]

        # Keep only the applicable reviews: channel filter first, then the
        # time window.
        channel_filtered = filter_utils.filter_reviews_by_channel(
            reviews,
            filter_utils.filter_disabled_review_channels(app_config),
        )
        oldest_allowed = datetime.now(timezone.utc) - timedelta(
            days=elastic_config.elastic_search_days_filter)
        reviews = filter_utils.filter_reviews_by_time(channel_filtered, oldest_allowed)

        # Log the number of reviews we got.
        logging.info(logs.NUM_REVIEWS, len(reviews), "ALL",
                     app_config.app.name)

        # We shuffle the reviews before uploading.
        # NOTE(review): the original comment giving the reason for the
        # shuffle was incomplete - confirm why Elasticsearch needs this.
        random.shuffle(reviews)

        # Create the index when it is not listed among the existing ones.
        existing_indices = get_indices(elastic_config.elastic_search_url)
        if elastic_config.index not in existing_indices:
            create_index(elastic_config.elastic_search_url, elastic_config.index)

        # Bulk push the data, one slice at a time.
        offset = 0
        while offset < len(reviews):
            batch = reviews[offset:offset + constants.BULK_UPLOAD_SIZE]
            response = bulk_push_to_elastic(
                elastic_config.elastic_search_url,
                elastic_config.index,
                batch)
            if response.status_code != 200:
                print(
                    "[Error] push_data_to_elasticsearch :: Got status code : ",
                    response.status_code)
                print("[Error] push_data_to_elasticsearch :: Response is : ",
                      response.text)
            offset += constants.BULK_UPLOAD_SIZE
def generate_summary(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    @param{string}: fawkes_config_file - config file path
    @returns{map<string,list<string>>}: summarized_reviews - summarized reviews per category

    Main function to create a summary of reviews
        - queries to get reviews
        - preprocess reviews based on each category
        - cluster similar reviews
        - rank and summarize amongst cluster to provide a summary

    The summary of every app is written to that app's review summary file.
    The returned map is the one of the last app processed; it is empty when
    no app is registered.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    # Bound up front so that an empty app list returns an empty map instead
    # of failing with UnboundLocalError at the final `return`.
    summarized_reviews = {}

    # For every app registered in app-config.json we generate a summary.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))
        summarization_config = app_config.algorithm_config.summarization

        # Path where the processed user reviews are stored.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        reviews_by_category = queries.getVocByCategory(reviews)

        summarized_reviews = {}

        # For each category, generate a summary
        for category in reviews_by_category:
            summarized_category_review = []

            # Preprocess the reviews of this category
            sentences = preprocess_review(reviews_by_category[category])

            # number of sentences in a category should be at least the
            # number of clusters
            if len(sentences) > summarization_config.num_clusters - 1:
                clustered_sentences = k_means_classification(
                    sentences, summarization_config.num_clusters)

                for cluster in clustered_sentences.values():
                    # Clusters below the minimum size are not summarized.
                    if len(cluster) < constants.minimum_reviews_per_cluster:
                        continue
                    text = ". ".join(cluster)
                    gen_summary = summarize_text(
                        text,
                        summarization_config.summary_length_per_cluster,
                    )
                    summarized_category_review.append(gen_summary)
            else:
                logging.info(logs.INSUFFICIENT_DATA, category)

            summarized_reviews[category] = summarized_category_review

        query_results_file_path = constants.REVIEW_SUMMARY_RESULTS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.query_folder,
            app_name=app_config.app.name,
        )

        # Create the intermediate folders
        dir_name = os.path.dirname(query_results_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json([{
            "summarized_reviews": summarized_reviews
        }], query_results_file_path)

    return summarized_reviews
def test_sanity(self):
    """
    Test for sanity that parsing and algorithms are working.

    The steps depend on each other and run in a fixed order: the sample data
    is parsed and compared with the expected parsed output, the keyword
    weights are generated, the algorithms are run, and the processed output
    is compared with the expected processed output.
    """
    # First we parse the sample data.
    parse.parse_reviews()
    parsed_output = utils.open_json(
        "data/parsed_data/sample-mint/parsed-user-feedback.json")
    # Expected result of parsing: no sentiment yet and the category still
    # "uncategorized".
    expected_parsed_output = [{
        "message": "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still cant add all of my financial institutions so my budget is kind of skewed. But other that I can say Im more aware of my spending",
        "timestamp": "2020/03/15 22:06:17",
        "rating": 5.0,
        "user_id": None,
        "app_name": "sample-mint",
        "channel_name": "appstore",
        "channel_type": "ios",
        "hash_id": "a5461e62ee4eccbab92900ba01d49d9ed0642dcc",
        "derived_insight": {
            "sentiment": None,
            "category": "uncategorized",
            "review_message_encoding": None,
            "extra_properties": {}
        },
        "raw_review": {
            "updated": "2020-03-15 14:13:17",
            "rating": 5,
            "version": "7.1.0",
            "content": "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still can\u00e2\u20ac\u2122t add all of my financial institutions so my budget is kind of skewed. But other that I can say I\u00e2\u20ac\u2122m more aware of my spending"
        }
    }]
    self.assertEqual(parsed_output, expected_parsed_output)

    # Before running the algorithms, we generate the keyword weights.
    text_match_trainer.generate_keyword_weights()

    # We run the algorithms on that data
    algo.run_algo()
    processed_output = utils.open_json(
        "data/processed_data/sample-mint/processed-user-feedback.json")
    # Expected result of the algorithms: sentiment scores, a category with
    # its per-category scores, and a bug/feature label are filled in.
    expected_processed_output = [{
        "message": "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still cant add all of my financial institutions so my budget is kind of skewed. But other that I can say Im more aware of my spending",
        "timestamp": "2020/03/15 22:06:17",
        "rating": 5.0,
        "user_id": None,
        "app_name": "sample-mint",
        "channel_name": "appstore",
        "channel_type": "ios",
        "hash_id": "a5461e62ee4eccbab92900ba01d49d9ed0642dcc",
        "derived_insight": {
            "sentiment": {
                "neg": 0.0,
                "neu": 0.928,
                "pos": 0.072,
                "compound": 0.4767
            },
            "category": "Application",
            "review_message_encoding": None,
            "extra_properties": {
                "category_scores": {
                    "User Experience": 0,
                    "sign-in/sign-up": 0,
                    "Notification": 0,
                    "Application": 1,
                    "ads": 0
                },
                "bug_feature": "feature"
            }
        },
        "raw_review": {
            "updated": "2020-03-15 14:13:17",
            "rating": 5,
            "version": "7.1.0",
            "content": "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still can\u00e2\u20ac\u2122t add all of my financial institutions so my budget is kind of skewed. But other that I can say I\u00e2\u20ac\u2122m more aware of my spending"
        }
    }]
    self.assertEqual(processed_output, expected_processed_output)
def generate_email_summary_detailed(
        fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """
    Generate the detailed HTML email summary of every app and store it on disk.

    @param{string}: fawkes_config_file - config file path

    Reviews are filtered by channel and by the email time span (in days).
    Apps left with no reviews are skipped. For up to five of the largest
    categories a review block is rendered, then the whole email is rendered
    from the app's email template and written to the generated summary file.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data

        # Path where the processed user reviews are stored.
        processed_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load the reviews and convert each JSON entry to a Review object.
        reviews = [Review.from_review_json(entry) for entry in utils.open_json(processed_path)]

        # Keep only the applicable reviews: channel filter first, then the
        # time window.
        channel_filtered = filter_utils.filter_reviews_by_channel(
            reviews,
            filter_utils.filter_disabled_review_channels(app_config),
        )
        oldest_allowed = datetime.now(timezone.utc) - timedelta(
            days=app_config.email_config.email_time_span)
        reviews = filter_utils.filter_reviews_by_time(channel_filtered, oldest_allowed)

        if len(reviews) == 0:
            continue

        review_by_category = queries.getVocByCategory(reviews)

        # (review count, category) pairs, largest first; keep the top five.
        category_sizes = [
            (len(review_by_category[name]), name) for name in review_by_category
        ]
        category_sizes.sort(reverse=True)
        top_categories = category_sizes[:5]

        # For each top category, the review ranked first by
        # compare_review_by_category_score.
        first_ranked_review = {}
        for _, name in top_categories:
            ranked = sorted(
                review_by_category[name],
                key=functools.cmp_to_key(compare_review_by_category_score))
            first_ranked_review[name] = ranked[0]

        # Render one HTML block per top category, skipping the
        # "category not found" bucket.
        review_blocks = []
        for _, name in top_categories:
            if name == constants.CATEGORY_NOT_FOUND:
                continue
            block_data = {
                "catetgoryName": name,
                "upOrDown": "down",
                "upDownPercentage": 19,
                "reviewText": first_ranked_review[name].message,
                "usersTalking": len(review_by_category[name])
            }
            review_blocks.append(email_utils.generate_email(
                constants.WEEKLY_EMAIL_DETAILED_REVIEW_BLOCK_TEMPLATE,
                block_data))
        review_block_html = "".join(review_blocks)

        # We get all the data.
        template_data = {
            "appStoreRating": "{0:.2f}".format(queries.appStoreRating(reviews)),
            "playStoreRating": "{0:.2f}".format(queries.playStoreRating(reviews)),
            "positiveReview": queries.positiveReview(reviews),
            "neutralReview": queries.neutralReview(reviews),
            "negativeReview": queries.negativeReview(reviews),
            "fromDate": queries.fromDate(reviews),
            "toDate": queries.toDate(reviews),
            "appLogo": app_config.app.logo,
            "timeSpanWords": app_config.email_config.email_time_span_in_words,
            "reviewBlock": review_block_html,
            "appStoreNumberOfReview": queries.appStoreNumberReview(reviews),
            "playStoreNumberOfReview": queries.playStoreNumberReview(reviews),
            "appStoreLifetimeRating": lifetime.getAppStoreLifetimeRating(app_config),
            "playStoreLifetimeRating": lifetime.getPlayStoreLifetimeRating(app_config),
            "kibanaDashboardURL": app_config.elastic_config.kibana_url
        }

        # Render the full email from the app's template.
        email_html = email_utils.generate_email(
            app_config.email_config.email_template_file, template_data)

        # Path where the generated email in html format will be stored
        summary_path = constants.EMAIL_SUMMARY_GENERATED_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.emails_folder,
            app_name=app_config.app.name,
        )
        pathlib.Path(os.path.dirname(summary_path)).mkdir(parents=True, exist_ok=True)

        with open(summary_path, "w") as summary_file:
            summary_file.write(email_html)
def generate_email_summary(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """Render the summary email for every registered app.

    For each app config listed in ``fawkes_config_file`` the processed
    reviews are loaded, filtered by review channel and by the
    ``email_time_span`` window, summarised through the ``queries`` helpers,
    rendered with the app's email template and written to the
    generated-email file path.

    Args:
        fawkes_config_file: Path of the JSON file listing the app config
            files.
    """
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    for app_config_file in fawkes_config.apps:
        app_config = AppConfig(utils.open_json(app_config_file))
        data_config = app_config.fawkes_internal_config.data

        # Load the processed reviews and turn them into Review objects.
        reviews_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.processed_data_folder,
            app_name=app_config.app.name,
        )
        reviews = [
            Review.from_review_json(raw_review)
            for raw_review in utils.open_json(reviews_path)
        ]

        # Keep only reviews from the applicable channels that fall inside
        # the email time span.
        review_channels = filter_utils.filter_disabled_review_channels(
            app_config)
        channel_reviews = filter_utils.filter_reviews_by_channel(
            reviews, review_channels)
        oldest_allowed = datetime.now(timezone.utc) - timedelta(
            days=app_config.email_config.email_time_span)
        reviews = filter_utils.filter_reviews_by_time(
            channel_reviews, oldest_allowed)

        # NOTE(review): there is no early exit for an empty review list
        # here; confirm the queries helpers below cope with that case.
        # NOTE(review): "timeSpanWords" is fed the raw email_time_span
        # value; confirm that is what the template expects.
        template_data = {
            "numberOfReview": queries.numberOfReview(reviews),
            "topCategory": queries.topCategory(reviews),
            "numFeatureReq": queries.numFeatureReq(reviews),
            "numBugsReported": queries.numBugsReported(reviews),
            "appStoreRating":
            "{0:.2f}".format(queries.appStoreRating(reviews)),
            "playStoreRating":
            "{0:.2f}".format(queries.playStoreRating(reviews)),
            "happyReview1": queries.happyReview1(reviews),
            "unhappyReview1": queries.unhappyReview1(reviews),
            "positiveReview": queries.positiveReview(reviews),
            "neutralReview": queries.neutralReview(reviews),
            "negativeReview": queries.negativeReview(reviews),
            "topCategoryNumberOfReview":
            queries.topCategoryNumberOfReview(reviews),
            "fromDate": queries.fromDate(reviews),
            "toDate": queries.toDate(reviews),
            "appLogo": app_config.app.logo,
            "timeSpanWords": app_config.email_config.email_time_span,
            "kibanaDashboardURL": app_config.elastic_config.kibana_url
        }

        # Fill the app's email template with the collected data.
        email_html = email_utils.generate_email(
            app_config.email_config.email_template_file, template_data)

        # Make sure the output folder exists, then store the generated HTML.
        output_path = constants.EMAIL_SUMMARY_GENERATED_FILE_PATH.format(
            base_folder=data_config.base_folder,
            dir_name=data_config.emails_folder,
            app_name=app_config.app.name,
        )
        pathlib.Path(os.path.dirname(output_path)).mkdir(
            parents=True, exist_ok=True)

        with open(output_path, "w") as email_file_handle:
            email_file_handle.write(email_html)
def run_categorization(reviews, app_config, num_processes):
    """Assign a category to each review using the configured algorithm.

    Nothing is done unless ``Algorithms.CATEGORIZATION`` is listed in
    ``app_config.algorithm_config.algorithms_to_run``. Otherwise either the
    text-match classifier (run through a process pool) or the LSTM
    classifier (run with a saved TensorFlow model and its two tokenizers)
    is applied, depending on ``algorithm_config.categorization.algorithm``.

    Args:
        reviews: The reviews to categorize.
        app_config: Configuration of the app the reviews belong to.
        num_processes: Size of the process pool used for text matching.

    Returns:
        The reviews returned by the selected classifier, or ``reviews``
        unchanged when categorization is not enabled.
    """
    algorithm_config = app_config.algorithm_config
    if Algorithms.CATEGORIZATION not in algorithm_config.algorithms_to_run:
        return reviews

    app_name = app_config.app.name

    if algorithm_config.categorization.algorithm == CategorizationAlgorithms.TEXT_MATCH_CLASSIFICATION:
        # Announce that the text-match classifier is starting.
        logging.info(logs.CURRENT_ALGORITHM_START,
                     CategorizationAlgorithms.TEXT_MATCH_CLASSIFICATION,
                     "ALL", app_name)

        # The category keyword weights drive the text matching.
        topics = utils.open_json(
            algorithm_config.categorization.category_keywords_weights_file)

        categorize = partial(text_match_categortization,
                             app_config=app_config,
                             topics=topics)
        with Pool(num_processes) as pool:
            reviews = pool.map(categorize, reviews)

        # Announce that the text-match classifier has finished.
        logging.info(logs.CURRENT_ALGORITHM_END,
                     CategorizationAlgorithms.TEXT_MATCH_CLASSIFICATION,
                     "ALL", app_name)

    elif algorithm_config.categorization.algorithm == CategorizationAlgorithms.LSTM_CLASSIFICATION:
        # TensorFlow is imported lazily so it is only loaded when needed.
        import tensorflow as tf

        # Announce that the LSTM classifier is starting.
        logging.info(logs.CURRENT_ALGORITHM_START,
                     CategorizationAlgorithms.LSTM_CLASSIFICATION, "ALL",
                     app_name)

        # The model and both tokenizers are addressed with the same
        # base folder / models folder / app name triple.
        model_location = {
            "base_folder": app_config.fawkes_internal_config.data.base_folder,
            "dir_name": app_config.fawkes_internal_config.data.models_folder,
            "app_name": app_name,
        }

        model = tf.keras.models.load_model(
            constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(**model_location))

        article_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
            utils.open_json(
                constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                    **model_location)))

        label_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
            utils.open_json(
                constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                    **model_location)))

        # Map each lower-cased label stripped of non-word characters back
        # to the original category label.
        cleaned_labels = {}
        for review in reviews:
            original_label = review.derived_insight.category
            cleaned_labels[re.sub(r'\W+', '',
                                  original_label).lower()] = original_label

        reviews = lstm_classification(reviews, model, article_tokenizer,
                                      label_tokenizer, cleaned_labels)

        # Announce that the LSTM classifier has finished.
        logging.info(logs.CURRENT_ALGORITHM_END,
                     CategorizationAlgorithms.LSTM_CLASSIFICATION, "ALL",
                     app_name)

    # Log the number of reviews we got.
    logging.info(logs.NUM_REVIEWS, len(reviews), "ALL", app_name)

    return reviews
def test_sanity(self):
    """
    Sanity check that review parsing and the algorithms produce the
    expected output for the sample data.
    """
    # Fields that the expected parsed and processed records have in common.
    shared_fields = {
        "message":
        "I just heard about this budgeting app. So I gave it a try. I am impressed thus far. However I still cant add all of my financial institutions so my budget is kind of skewed. But other that I can say Im more aware of my spending",
        "timestamp": "2020/03/15 14:13:17",
        "rating": 5,
        "app_name": "sample-mint",
        "channel_name": "appstore",
        "channel_type": "ios",
    }

    # Step 1: parse the sample data and compare with the expected record.
    parse.parse_reviews()
    parsed_output = utils.open_json(
        "data/parsed_data/sample-mint/parsed-user-feedback.json")
    expected_parsed_output = [
        dict(
            shared_fields,
            hash_id="de848685d11742dbea77e1e5ad7b892088ada9c9",
            derived_insight={
                "sentiment": None,
                "category": "uncategorized",
                "extra_properties": {}
            },
        )
    ]
    self.assertEqual(parsed_output, expected_parsed_output)

    # Step 2: the keyword weights have to exist before the algorithms run.
    text_match_trainer.generate_keyword_weights()

    # Step 3: run the algorithms and compare with the expected record.
    algo.run_algo()
    processed_output = utils.open_json(
        "data/processed_data/sample-mint/processed-user-feedback.json")
    expected_processed_output = [
        dict(
            shared_fields,
            hash_id="6dde3aa82726c0a9e3777623854d839184767571",
            derived_insight={
                "sentiment": {
                    "neg": 0.0,
                    "neu": 0.928,
                    "pos": 0.072,
                    "compound": 0.4767
                },
                "category": "Application",
                "extra_properties": {
                    "category_scores": {
                        "User Experience": 0,
                        "sign-in/sign-up": 0,
                        "Notification": 0,
                        "Application": 1,
                        "ads": 0
                    },
                    "bug_feature": "feature"
                }
            },
        )
    ]
    self.assertEqual(processed_output, expected_processed_output)
def run_algo(fawkes_config_file=constants.FAWKES_CONFIG_FILE):
    """Run the configured algorithms on the parsed reviews of every app.

    For each app config listed in ``fawkes_config_file`` the parsed reviews
    are loaded, filtered by review channel and by the
    ``algorithm_days_filter`` window, passed through sentiment analysis,
    categorization, bug/feature categorization and review text encoding
    (in that order), and dumped as JSON to the processed-reviews file path.

    Args:
        fawkes_config_file: Path of the JSON file listing the app config
            files.
    """
    # Read the app-config.json file.
    fawkes_config = FawkesConfig(utils.open_json(fawkes_config_file))

    # Number of worker processes. It does not depend on the app, so it is
    # computed once. os.cpu_count() may return None when the CPU count
    # cannot be determined; fall back to a single process in that case
    # instead of letting min() fail on an int/None comparison.
    num_processes = min(constants.PROCESS_NUMBER, os.cpu_count() or 1)
    if constants.CIRCLECI in os.environ:
        num_processes = 2

    # For every app registered in app-config.json we run the algorithms.
    for app_config_file in fawkes_config.apps:
        # Creating an AppConfig object
        app_config = AppConfig(utils.open_json(app_config_file))

        # Log the current operation which is being performed.
        logging.info(logs.OPERATION, FawkesActions.RUN_ALGO, "ALL",
                     app_config.app.name)

        # Path where the user reviews were stored after parsing.
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(parsed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc) -
            timedelta(days=app_config.algorithm_config.algorithm_days_filter))

        # Log the number of reviews we got.
        logging.info(logs.NUM_REVIEWS, len(reviews), "ALL",
                     app_config.app.name)

        # Running sentiment analysis
        reviews = run_sentiment_analysis(reviews, app_config, num_processes)

        # Running categorization
        reviews = run_categorization(reviews, app_config, num_processes)

        # Running bug/feature categorization
        reviews = run_bug_feature_categorization(reviews, app_config,
                                                 num_processes)

        # Running the message encoding
        reviews = run_review_text_encoding(reviews, app_config, num_processes)

        # Create the intermediate folders
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.
            processed_data_folder,
            app_name=app_config.app.name,
        )

        dir_name = os.path.dirname(processed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        # Store the processed reviews as JSON.
        utils.dump_json(
            [review.to_dict() for review in reviews],
            processed_user_reviews_file_path,
        )