def dump_lifetime_ratings():
    """Fetch the lifetime playstore/appstore ratings for every configured app
    and (re-)upload them to that app's lifetime-rating elasticsearch index.

    For each app config that defines ``lifetime_rating_index``, two Review
    documents are built (one per store), the old documents are deleted by
    hash id, and the fresh ones are created in their place.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        if app_config.elastic_config.lifetime_rating_index is not None:
            # Timestamp the documents with yesterday's date.
            time = datetime.strftime(datetime.now() - timedelta(1),
                                     constants.TIMESTAMP_FORMAT)
            playstore_rating = getPlayStoreLifetimeRating(app_config)
            appstore_rating = getAppStoreLifetimeRating(app_config)
            # Creating template for uploading lifetime rating
            playstore_doc = Review(
                {},
                timestamp=time,
                rating=playstore_rating,
                app_name=app_config.app.name,
                channel_name="playstore-lifetime",
                channel_type="playstore-lifetime",
                hash_id=utils.calculate_hash(app_config.app.name +
                                             ReviewChannelTypes.ANDROID))
            appstore_doc = Review(
                {},
                timestamp=time,
                # BUGFIX: this previously used playstore_rating, so the
                # appstore document carried the wrong store's rating.
                rating=appstore_rating,
                app_name=app_config.app.name,
                channel_name="appstore-lifetime",
                channel_type="appstore-lifetime",
                hash_id=utils.calculate_hash(app_config.app.name +
                                             ReviewChannelTypes.IOS))
            # Deleting document to override
            elasticsearch.delete_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                playstore_doc.hash_id)
            elasticsearch.delete_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                appstore_doc.hash_id)
            # Uploading again
            elasticsearch.create_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                playstore_doc.hash_id, playstore_doc)
            elasticsearch.create_document(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.lifetime_rating_index, "_doc",
                appstore_doc.hash_id, appstore_doc)
def create_parsed_pkl_from_Json(category, random_seed, sample_size,
                                category_label_number):
    """Parse the raw JSON review dump for *category*, sample *sample_size*
    reviews deterministically (seeded with *random_seed*) and write them out
    as a parsed PKL file.

    Reviews with an empty or missing ``reviewText`` are skipped.
    """
    file_path = get_source_data_path(category)
    review_set = set()
    with open(file_path) as file:
        for line in tqdm(file,
                         total=get_num_lines(file_path),
                         desc="Parsing / Sampling / Writing PKL" +
                         ntpath.basename(file_path)):
            sample = json.loads(line)
            reviewer_id = sample['reviewerID']
            asin = sample['asin']
            helpful = sample['helpful']
            text_raw = sample['reviewText']
            overall = sample['overall']
            summary = sample['summary']
            # Only keep reviews whose text is present and non-empty.
            if text_raw is not None and text_raw != '':
                # create Amazon Review object
                review = Review(category_label_number, reviewer_id, asin,
                                helpful, text_raw, overall, summary)
                review_set.add(review)
    # NOTE: removed the redundant file.close() — the `with` block already
    # closes the file on exit.
    random.seed(random_seed)
    # random.sample() requires a sequence; passing a set raises TypeError
    # on Python >= 3.11, so convert first.
    sampled_review_set = random.sample(list(review_set), sample_size)
    write_reviews_parsed(category, sampled_review_set)
def parse_json(raw_user_reviews_file_path, review_channel, app_config):
    """Parse a raw JSON reviews file into a list of Review objects.

    :param raw_user_reviews_file_path: path to the raw JSON reviews file.
    :param review_channel: channel config providing the JSON key paths
        (dot-separated) for message, timestamp and (optionally) rating.
    :param app_config: the app's configuration (used for the app name).
    :returns: list of parsed Review objects.
    """
    reviews = utils.open_json(raw_user_reviews_file_path)
    parsed_reviews = []
    for review in reviews:
        # TODO: Convert this to a standard format like jsonpath
        message = utils.get_json_key_value(
            review, review_channel.message_key.split("."))
        timestamp = utils.get_json_key_value(
            review, review_channel.timestamp_key.split("."))
        rating = None
        # Rating is optional; only extract it when a key is configured.
        if review_channel.rating_key is not None:
            rating = utils.get_json_key_value(
                review, review_channel.rating_key.split("."))
        # Add the review object to the parsed reviews
        parsed_reviews.append(
            Review(
                review,
                message=message,
                timestamp=timestamp,
                rating=rating,
                app_name=app_config.app.name,
                channel_name=review_channel.channel_name,
                channel_type=review_channel.channel_type,
                review_timezone=review_channel.timezone,
                timestamp_format=review_channel.timestamp_format,
            ))
    return parsed_reviews
def push_data_to_elasticsearch():
    """Push each app's processed reviews to its elasticsearch index.

    Loads the processed reviews per app config, filters them down to the
    enabled channels within the configured email time span, shuffles them,
    ensures the target index exists, and bulk-uploads them in chunks of
    ``constants.BULK_UPLOAD_SIZE``.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))

        # Path where the user reviews were stored after parsing.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Load and deserialize the processed reviews.
        raw_reviews = utils.open_json(processed_user_reviews_file_path)
        reviews = [Review.from_review_json(item) for item in raw_reviews]

        # Keep only reviews from enabled channels within the email time span.
        cutoff = datetime.now(timezone.utc) - timedelta(
            days=app_config.email_config.email_time_span)
        enabled_channels = filter_utils.filter_disabled_review_channels(
            app_config)
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(reviews, enabled_channels),
            cutoff)

        # We shuffle the reviews. This is because of how elastic search.
        random.shuffle(reviews)

        # Create the index if it does not exist yet.
        existing_indices = get_indices(
            app_config.elastic_config.elastic_search_url)
        if app_config.elastic_config.index not in existing_indices:
            create_index(app_config.elastic_config.elastic_search_url,
                         app_config.elastic_config.index)

        # Bulk push the data in fixed-size batches (slicing clamps at the
        # end of the list, so no explicit min() is needed).
        batch_size = constants.BULK_UPLOAD_SIZE
        for start in range(0, len(reviews), batch_size):
            response = bulk_push_to_elastic(
                app_config.elastic_config.elastic_search_url,
                app_config.elastic_config.index,
                reviews[start:start + batch_size])
            if response.status_code != 200:
                print(
                    "[Error] push_data_to_elasticsearch :: Got status code : ",
                    response.status_code)
                print("[Error] push_data_to_elasticsearch :: Response is : ",
                      response.text)
def parse_csv(raw_user_reviews_file_path, review_channel, app_config):
    """Parse a CSV reviews file into a list of Review objects.

    The first row is expected to contain the column names; subsequent rows
    are matched column-by-column against the channel's configured
    timestamp/message/rating keys.

    :param raw_user_reviews_file_path: path to the raw CSV reviews file.
    :param review_channel: channel config providing the column names for
        message, timestamp and rating.
    :param app_config: the app's configuration (used for the app name).
    :returns: list of parsed Review objects.
    """
    with open(raw_user_reviews_file_path, "r") as file_handle:
        # Read all the reviews from the CSV file
        reviews = csv.reader(file_handle, delimiter=",")

        # We expect the first row to contain the column names.
        # TODO: We should change this to be taken from the configuration.
        # There might be usecases where column names are not present in the data.
        # People might want to indicate the message, timestamp keys using integer indices to the columns.
        #
        # BUGFIX: csv.reader returns an iterator, which does not support
        # indexing — the old reviews[0] / reviews[1:] raised TypeError.
        # Use next() to consume the header row instead.
        json_keys_list = next(reviews, [])

        parsed_reviews = []

        # Iterate through all the remaining (data) rows.
        for row in reviews:
            review = {}
            timestamp = ""
            message = ""
            rating = None
            # There are some csvs for which the last column is empty.
            # Hence we need to take the min below
            for i in range(min(len(json_keys_list), len(row))):
                if json_keys_list[i] == review_channel.timestamp_key:
                    # Storing the timestamp
                    timestamp = row[i]
                elif json_keys_list[i] == review_channel.message_key:
                    # Storing the message
                    message = row[i]
                elif json_keys_list[i] == review_channel.rating_key:
                    rating = row[i]
                # Storing the raw review as received from the source.
                review[json_keys_list[i]] = row[i]
            # Add the review object to the parsed reviews
            parsed_reviews.append(
                Review(
                    review,
                    message=message,
                    timestamp=timestamp,
                    rating=rating,
                    app_name=app_config.app.name,
                    channel_name=review_channel.channel_name,
                    channel_type=review_channel.channel_type,
                ))
        return parsed_reviews
file_name=constants.APP_CONFIG_FILE_NAME)) for app_config_file in app_configs: app_config = AppConfig(utils.open_json(app_config_file)) # Path where the user reviews were stored after parsing. processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format( base_folder=app_config.fawkes_internal_config.data.base_folder, dir_name=app_config.fawkes_internal_config.data. processed_data_folder, app_name=app_config.app.name, ) # Loading the reviews reviews = utils.open_json(processed_user_reviews_file_path) # Converting the json object to Review object reviews = [Review.from_review_json(review) for review in reviews] # Filtering out reviews which are not applicable. reviews = filter_utils.filter_reviews_by_time( filter_utils.filter_reviews_by_channel( reviews, filter_utils.filter_disabled_review_channels(app_config), ), datetime.now(timezone.utc) - timedelta(days=app_config.email_config.email_time_span)) if len(reviews) == 0: continue review_by_category = queries.getVocByCategory(reviews) top_categories = sorted([(len(review_by_category[key]), key)
def run_algo():
    """Run the configured NLP algorithms over every app's parsed reviews.

    For each app config: load the parsed reviews, filter them to enabled
    channels within the algorithm time window, add sentiment scores in
    parallel, optionally run keyword-weight categorization and bug/feature
    classification, optionally run LSTM categorization, and dump the
    processed reviews to the processed-data folder.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))

        # Path where the user reviews were stored after parsing.
        parsed_user_reviews_file_path = constants.PARSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.parsed_data_folder,
            app_name=app_config.app.name,
        )

        # Loading the reviews
        reviews = utils.open_json(parsed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # Filtering out reviews which are not applicable.
        reviews = filter_utils.filter_reviews_by_time(
            filter_utils.filter_reviews_by_channel(
                reviews,
                filter_utils.filter_disabled_review_channels(app_config),
            ),
            datetime.now(timezone.utc) -
            timedelta(days=app_config.algorithm_config.algorithm_days_filter))

        # Number of processes to use for the parallel steps.
        num_processes = min(constants.PROCESS_NUMBER, os.cpu_count())
        if constants.CIRCLECI in os.environ:
            # CI machines report many CPUs but allow few; cap at 2.
            num_processes = 2

        # Adding sentiment
        with Pool(num_processes) as process:
            reviews = process.map(add_review_sentiment_score, reviews)

        if (app_config.algorithm_config.categorization_algorithm is not None
                and app_config.algorithm_config.category_keywords_weights_file
                is not None):
            # We read from the topic file first
            topics = utils.open_json(
                app_config.algorithm_config.category_keywords_weights_file)
            # Adding text-match categorization
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(text_match_categortization,
                            app_config=app_config,
                            topics=topics), reviews)

        if app_config.algorithm_config.bug_feature_keywords_weights_file is not None:
            # We read from the topic file first
            topics = utils.open_json(
                app_config.algorithm_config.bug_feature_keywords_weights_file)
            # Adding bug/feature classification
            with Pool(num_processes) as process:
                reviews = process.map(
                    partial(bug_feature_classification, topics=topics),
                    reviews)

        if app_config.algorithm_config.categorization_algorithm == CategorizationAlgorithms.LSTM_CLASSIFICATION:
            # Load the TensorFlow model
            model = tf.keras.models.load_model(
                constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(
                    base_folder=app_config.fawkes_internal_config.data.base_folder,
                    dir_name=app_config.fawkes_internal_config.data.models_folder,
                    app_name=app_config.app.name,
                ))
            # Load the article tokenizer file
            tokenizer_json = utils.open_json(
                constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                    base_folder=app_config.fawkes_internal_config.data.base_folder,
                    dir_name=app_config.fawkes_internal_config.data.models_folder,
                    app_name=app_config.app.name,
                ))
            article_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                tokenizer_json)
            # Load the label tokenizer file
            tokenizer_json = utils.open_json(
                constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                    base_folder=app_config.fawkes_internal_config.data.base_folder,
                    dir_name=app_config.fawkes_internal_config.data.models_folder,
                    app_name=app_config.app.name,
                ))
            label_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                tokenizer_json)

            # Map normalized (alphanumeric, lowercase) labels back to the
            # original category labels for the classifier output.
            cleaned_labels = {}
            for review in reviews:
                label = review.derived_insight.category
                cleaned_label = re.sub(r'\W+', '', label).lower()
                cleaned_labels[cleaned_label] = label

            # Adding LSTM categorization
            reviews = lstm_classification(reviews, model, article_tokenizer,
                                          label_tokenizer, cleaned_labels)

        # Create the intermediate folders
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.processed_data_folder,
            app_name=app_config.app.name,
        )
        dir_name = os.path.dirname(processed_user_reviews_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        utils.dump_json(
            [review.to_dict() for review in reviews],
            processed_user_reviews_file_path,
        )
def train_lstm_model():
    """Train and persist the LSTM categorization model for each app.

    Skips apps whose categorization algorithm is not LSTM. For the rest:
    loads the processed reviews, extracts article/label pairs, trains the
    model, and saves the model plus the article and label tokenizers to the
    app's models folder.
    """
    app_configs = utils.open_json(
        constants.APP_CONFIG_FILE.format(
            file_name=constants.APP_CONFIG_FILE_NAME))
    for app_config_file in app_configs:
        app_config = AppConfig(utils.open_json(app_config_file))
        print("[LOG] going through app config ", app_config.app.name)

        # Path where the user reviews were stored after parsing.
        processed_user_reviews_file_path = constants.PROCESSED_USER_REVIEWS_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.processed_data_folder,
            app_name=app_config.app.name,
        )

        # Only LSTM-configured apps are trained. (Simplified from the
        # redundant `x != None and x == LSTM` check: equality with the
        # LSTM constant already rules out None.)
        if app_config.algorithm_config.categorization_algorithm != CategorizationAlgorithms.LSTM_CLASSIFICATION:
            continue

        # Loading the reviews
        reviews = utils.open_json(processed_user_reviews_file_path)

        # Converting the json object to Review object
        reviews = [Review.from_review_json(review) for review in reviews]

        # reviews = utils.filter_reviews(reviews, app_config)

        articles, labels, cleaned_labels = get_articles_and_labels(reviews)

        trained_model, article_tokenizer, label_tokenizer = train(
            articles, labels)

        trained_lstm_categorization_model_file_path = constants.LSTM_CATEGORY_MODEL_FILE_PATH.format(
            base_folder=app_config.fawkes_internal_config.data.base_folder,
            dir_name=app_config.fawkes_internal_config.data.models_folder,
            app_name=app_config.app.name,
        )
        dir_name = os.path.dirname(trained_lstm_categorization_model_file_path)
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

        trained_model.save(trained_lstm_categorization_model_file_path)

        # Saving the tokenizers
        utils.dump_json(
            article_tokenizer.to_json(),
            constants.LSTM_CATEGORY_ARTICLE_TOKENIZER_FILE_PATH.format(
                base_folder=app_config.fawkes_internal_config.data.base_folder,
                dir_name=app_config.fawkes_internal_config.data.models_folder,
                app_name=app_config.app.name,
            ),
        )
        # Saving the tokenizers
        utils.dump_json(
            label_tokenizer.to_json(),
            constants.LSTM_CATEGORY_LABEL_TOKENIZER_FILE_PATH.format(
                base_folder=app_config.fawkes_internal_config.data.base_folder,
                dir_name=app_config.fawkes_internal_config.data.models_folder,
                app_name=app_config.app.name,
            ),
        )