def calculate_topic_stability(records, sample_ratio=0.8):
    """Measure the stability of the topic model trained on ``records``.

    A reference topic model is trained on the full record list, then
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` additional models are
    trained on random subsamples of the records. The top-term rankings of
    all models are collected and compared by ``calculate_stability``.

    :param records: the review records used to train the topic models
    :param sample_ratio: fraction of the records used for each subsample.
        Defaults to 0.8, the value that was previously hard-coded, so
        existing one-argument callers are unaffected.
    :return: the stability score produced by ``calculate_stability``
    """
    # Shift the seeds so this evaluation does not reuse the exact RNG
    # stream of the main experiments.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: keeps the slice index an int on Python 3 as well
        # (records[:num_records / 2] raises TypeError there).
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    # Reference ranking from the model trained on all records.
    context_extractor =\
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
def get_records_to_predict(self, use_random_seeds):
    """Select the records to predict for the configured evaluation metric.

    :param use_random_seeds: when True, re-seed the random number
        generators before choosing the records
    :raises ValueError: if ``Constants.EVALUATION_METRIC`` is not one of
        ``topn_recall``, ``rmse`` or ``mae``
    """
    if use_random_seeds:
        utilities.plant_seeds()

    metric = Constants.EVALUATION_METRIC
    if metric == 'topn_recall':
        self.get_records_to_predict_topn()
    elif metric in ('rmse', 'mae'):
        self.get_records_to_predict_rmse()
    else:
        raise ValueError('Unrecognized evaluation metric')
def full_cycle(self):
    """Run the complete record-preprocessing pipeline.

    If caching is enabled and the processed-records file already exists,
    the records are loaded from it; otherwise the raw records go through
    the full cleaning/annotation pipeline below. Afterwards some dataset
    statistics are printed and, when configured, the records are split
    into topic-model and recommender-system subsets.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.load_records()

        # Dataset-specific normalization of the raw records.
        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        # Cleaning / filtering / annotation steps; the order matters
        # (e.g. language tagging must precede foreign-review removal).
        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()
    self.count_specific_generic_ratio()
    # self.export_to_triplet()

    # Dataset statistics (printed for both the cached and fresh paths).
    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def main():
    """Benchmark every resampler/classifier pair at each document level.

    For each document level the records are loaded and transformed once,
    then every (resampler, classifier) combination is evaluated via
    ``test_classifier``. All results are printed and saved to a CSV file.
    """
    utilities.plant_seeds()

    resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn',
    ]

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100),
    ]

    document_levels = ['review', 'sentence', 1]

    total_cycles = len(resamplers) * len(classifiers) * len(document_levels)
    cycle_number = 1
    results_list = []
    for document_level in document_levels:
        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)
        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(
                resamplers, classifiers):
            print('Cycle %d/%d' % (cycle_number, total_cycles))
            classification_results = test_classifier(
                x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            cycle_number += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
        '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def analyze_topics():
    """Analyze the trained topic model and score its topics.

    Loads the recsys topic-model records, renders the topic/term strings
    for the configured topic model type, computes per-topic ratio and
    weighted-frequency statistics via ``ContextExtractor``, and returns a
    dict of aggregate scores (mean probability score and cycle time).

    :return: a dict with ``num_topics``, ``probability_score`` and
        ``cycle_time``
    :raises ValueError: if ``Constants.TOPIC_MODEL_TYPE`` is neither
        ``ensemble`` nor ``lda``
    """
    start_time = time.time()
    utilities.plant_seeds()

    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    else:
        # Fail fast with a clear message instead of crashing later on an
        # unbound/None topic_model_string (consistent with the ValueError
        # raised by evaluate_topic_model for this case).
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)

    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []
    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time
    print("Cycle time = %f seconds" % cycle_time)

    return scores
def run_single_fold(self, parameters):
    """Run a single cross-validation fold described by ``parameters``.

    The fold index is taken from ``parameters['fold']``; the remaining
    entries are pushed into ``Constants`` before seeding, so every fold
    run is reproducible in isolation.

    :param parameters: dict of property overrides; must contain ``fold``
    :return: the metrics dict produced by ``self.evaluate()``
    """
    fold = parameters['fold']

    Constants.update_properties(parameters)
    Constants.print_properties()
    utilities.plant_seeds()
    self.load()
    records = self.original_records

    # self.plant_seeds()
    total_cycle_time = 0.0
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))
    # Work on a deep copy so shuffling never mutates the originals.
    self.records = copy.deepcopy(records)
    if Constants.SHUFFLE_DATA:
        self.shuffle(self.records)

    fold_start = time.time()
    # The test window for this fold starts at fold/num_folds of the data.
    cv_start = float(fold) / num_folds
    print('\nFold: %d/%d' % ((fold + 1), num_folds))
    self.create_tmp_file_names(0, fold)
    self.train_records, self.test_records = \
        ETLUtils.split_train_test_copy(
            self.records, split=split, start=cv_start)
    # subsample_size = int(len(self.train_records)*0.5)
    # self.train_records = self.train_records[:subsample_size]
    self.get_records_to_predict(True)
    if Constants.USE_CONTEXT:
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            # Topic model was trained on a separate record set; reuse it.
            self.load_cache_context_topics(None, None)
        else:
            context_extractor = self.train_topic_model(0, fold)
            self.find_reviews_topics(context_extractor, 0, fold)
    else:
        self.context_rich_topics = []
    self.predict()
    metrics = self.evaluate()
    fold_end = time.time()
    fold_time = fold_end - fold_start
    total_cycle_time += fold_time
    self.clear()
    print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

    return metrics
def full_cycle(self):
    """Run the preprocessing cycle, reusing cached records when possible.

    Loads the already-processed records from disk when caching is enabled
    and the file exists; otherwise runs the full ``preprocess`` pipeline.
    Finally splits the records for topic model / recsys use if configured.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    cache_hit = self.use_cache and os.path.exists(
        Constants.PROCESSED_RECORDS_FILE)
    if cache_hit:
        print('Records have already been processed')
        self.records = ETLUtils.load_json_file(
            Constants.PROCESSED_RECORDS_FILE)
    else:
        self.preprocess()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def main():
    """Train a topic model on the processed records and export it as PDF.

    When the topic-model and recsys reviews are kept separate, only the
    first half of the records (the topic-model share) is used.
    """
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: keeps the slice index an int on Python 3 as well
        # (records[:num_records / 2] raises TypeError there).
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
def test_classifier(x_matrix, y_vector, sampler_type, my_classifier):
    """Evaluate a single resampler/classifier combination.

    The data set is resampled with ``sampler_type``, then 10-fold
    cross-validated predictions are produced twice: once with hard labels
    to build a confusion matrix, and once with class probabilities to
    compute ROC-curve metrics.

    :param x_matrix: the feature matrix
    :param y_vector: the label vector
    :param sampler_type: name of the resampling strategy (or None)
    :param my_classifier: the (unfitted) classifier instance to evaluate
    :return: a dict with the resampler name, the classifier class name and
        the collected evaluation metrics
    """
    utilities.plant_seeds()

    results = {
        'resampler': sampler_type,
        'classifier': type(my_classifier).__name__,
    }

    resampled_x, resampled_y = resample(x_matrix, y_vector, sampler_type)
    print('num samples: %d' % len(resampled_y))

    # Hard predictions feed the confusion matrix.
    y_predictions, y_true_values = cross_validation_predict(
        my_classifier, resampled_x, resampled_y, 10, sampler_type, 'predict')
    results.update(print_confusion_matrix(y_true_values, y_predictions))

    # Probability of the positive class feeds the ROC curve.
    y_probabilities, y_true_values = cross_validation_predict(
        my_classifier, resampled_x, resampled_y, 10, sampler_type,
        'predict_proba')
    positive_class_probabilities = y_probabilities[:, 1]
    results.update(
        plot_roc_curve(y_true_values, positive_class_probabilities))

    return results
def full_cycle():
    """Run the end-to-end pipeline: preprocess the data, then train models.

    Loads the train/test data, rescales and cleans it, builds the feature
    matrices and finally trains the regression models.
    """
    utilities.plant_seeds()
    print("%s: Starting process" % (time.strftime("%Y/%m/%d-%H:%M:%S")))

    train, test = load_data()
    train = rescale_sale_price(train)
    train = remove_outliers(train)

    all_features, train_labels = split_features(train, test)
    all_features = normalize_numeric_features(all_features)
    X, X_test = create_x_sets(all_features, train_labels)
    print("%s: Finished data preprocessing" %
          (time.strftime("%Y/%m/%d-%H:%M:%S")))

    trained = train_models(X, train_labels)
    ridge_model, svr_model, gbr_model, xgb_model, rf_model, \
        stack_gen_model = trained
    print("%s: Finished training models" %
          (time.strftime("%Y/%m/%d-%H:%M:%S")))
def run(self):
    """Run cross-validation according to the configured strategy.

    Under ``nested_validate`` only the training share of the nested cycle
    is cross-validated; under ``nested_test`` all original records are
    used.

    :return: the result of ``perform_cross_validation``
    :raises ValueError: for an unknown cross-validation strategy
    """
    utilities.plant_seeds()
    self.load()
    records = self.original_records

    strategy = Constants.CROSS_VALIDATION_STRATEGY
    if strategy == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(
            self.original_records, split, cv_start)
        return self.perform_cross_validation(records)
    if strategy == 'nested_test':
        return self.perform_cross_validation(records)
    raise ValueError('Unknown cross-validation strategy')
def create_topic_model(num_topics):
    """Build the ensemble topic model for the given number of topics.

    Shifts the random seeds, updates the topic count in ``Constants`` and
    delegates to the topic-ensemble scripts. Skips the work entirely when
    the resulting factors file is already on disk.

    :param num_topics: the number of topics for the ensemble model
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
    })
    utilities.plant_seeds()
    Constants.print_properties()

    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    if os.path.exists(file_path):
        # Nothing to do: the ensemble output already exists.
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """Create the topic model for one (cycle, fold) of cross-validation.

    Reproduces the exact train/test split of the given cycle and fold by
    re-seeding and replaying the per-cycle shuffles, then trains a topic
    model on the training share only.

    :param cycle_index: zero-based cross-validation cycle
    :param fold_index: zero-based fold within the cycle
    :param check_exists: forwarded to ``create_topic_model``
    :raises ValueError: if used with separate topic-model/recsys reviews,
        or with an unknown cross-validation strategy
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        # Keep only the training share of the nested cycle.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    # Replay the shuffle once per completed cycle (plus the current one)
    # so the RNG state — and therefore the record order — matches the
    # state the cross-validation loop would have at this cycle.
    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
def evaluate_topic_model(metric):
    """Evaluate the topic model with the given metric.

    Builds the term rankings for the configured topic model type and
    dispatches to the evaluation function matching ``metric``.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the score produced by the chosen evaluation function
    :raises ValueError: for an unknown topic model type or metric
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this evaluation does not reuse the exact RNG
    # stream of the main experiments.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: keeps the slice index an int on Python 3 as well
        # (records[:num_records / 2] raises TypeError there).
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    # elif (not if) for the second branch: the chain is mutually exclusive.
    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def evaluate_topic_model(metric):
    """Evaluate the topic model with the given metric.

    Builds the term rankings for the configured topic model type and
    dispatches to the evaluation function matching ``metric``.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the score produced by the chosen evaluation function
    :raises ValueError: for an unknown topic model type or metric
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Shift the seeds so this evaluation does not reuse the exact RNG
    # stream of the main experiments.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: keeps the slice index an int on Python 3 as well
        # (records[:num_records / 2] raises TypeError there).
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' %
            topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    # elif (not if) for the second branch: the chain is mutually exclusive.
    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def perform_cross_validation(self):
    """Run the full cross-validation experiment and persist the results.

    Runs ``Constants.NUM_CYCLES`` cycles of ``num_folds``-fold
    cross-validation, accumulating recall metrics over every fold, then
    prints the averages and appends them (plus a copy of the current
    properties) to the CSV results file.
    """
    Constants.print_properties()

    utilities.plant_seeds()

    # Accumulators over every (cycle, fold) iteration.
    total_recall = 0.0
    total_specific_recall = 0.0
    total_generic_recall = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    split = 1 - (1/float(num_folds))
    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        # Fresh deep copies each cycle so folds never see mutated data.
        self.records = copy.deepcopy(self.original_records)
        self.reviews = copy.deepcopy(self.original_reviews)

        for j in range(num_folds):
            fold_start = time.time()
            # The test window of fold j starts at j/num_folds of the data.
            cv_start = float(j) / num_folds

            print('\nFold: %d/%d' % ((j+1), num_folds))
            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.train_reviews, self.test_reviews = \
                ETLUtils.split_train_test_copy(
                    self.reviews, split=split, start=cv_start)
            self.export()
            if Constants.USE_CONTEXT:
                lda_based_context = self.train_word_model()
                self.find_reviews_topics(lda_based_context)
            self.prepare()
            self.predict()
            self.evaluate()
            recall = self.top_n_evaluator.recall
            specific_recall = self.top_n_evaluator.specific_recall
            generic_recall = self.top_n_evaluator.generic_recall
            total_recall += recall
            total_specific_recall += specific_recall
            total_generic_recall += generic_recall

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j+1), fold_time))

    average_recall = total_recall / total_iterations
    average_specific_recall = total_specific_recall / total_iterations
    average_generic_recall = total_generic_recall / total_iterations
    average_cycle_time = total_cycle_time / total_iterations
    print('average recall: %f' % average_recall)
    print('average specific recall: %f' % average_specific_recall)
    print('average generic recall: %f' % average_generic_recall)
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = Constants.get_properties_copy()
    results['recall'] = average_recall
    results['specific_recall'] = average_specific_recall
    results['generic_recall'] = average_generic_recall
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    # Create the file with a header on first use, append afterwards.
    # NOTE(review): 'wb' for a csv writer is a Python 2 idiom — on
    # Python 3 this would need 'w' with newline='' — confirm the target
    # interpreter before changing.
    if not os.path.exists(Constants.CSV_RESULTS_FILE):
        with open(Constants.CSV_RESULTS_FILE, 'wb') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writeheader()
            w.writerow(results)
    else:
        with open(Constants.CSV_RESULTS_FILE, 'a') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writerow(results)
def analyze_topics(include_stability=True):
    """Analyze the trained topic model and compute its quality scores.

    Renders the topic/term strings for the configured model type, gathers
    per-topic ratio and weighted-frequency statistics via
    ``ContextExtractor``, and derives aggregate scores: mean probability
    score, separation scores between context-rich and context-poor
    topics, optionally a stability score, and a combined score.

    :param include_stability: when True, also compute the (expensive)
        topic stability score
    :return: a dict of scores
    """
    start_time = time.time()
    utilities.plant_seeds()

    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model(num_terms)
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]

    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []
    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    # generate_excel_file(topic_data)
    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    # Mean probability score of topics above/below the context ratio
    # threshold (context-rich vs context-poor topics).
    high_ratio_mean_score = data_frame[(
        data_frame.ratio > Constants.CONTEXT_EXTRACTOR_BETA
    )]['probability_score'].mean()
    low_ratio_mean_score = data_frame[(
        data_frame.ratio < Constants.CONTEXT_EXTRACTOR_BETA
    )]['probability_score'].mean()

    stability = None
    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO
    if include_stability:
        # NOTE(review): calculate_topic_stability is called with a sample
        # ratio here — confirm the imported version accepts this argument.
        stability = calculate_topic_stability(records, sample_ratio).mean()
    scores['stability'] = stability

    # separation_score =\
    #     (high_ratio_mean_score / low_ratio_mean_score)\
    #     if low_ratio_mean_score != 0\
    #     else 'N/A'
    # Weighted blend: rewards high scores for context-rich topics and low
    # scores for context-poor ones.
    gamma = 0.5
    separation_score = gamma * high_ratio_mean_score + \
        (1 - gamma) * (1 - low_ratio_mean_score)
    joint_separation_score =\
        (high_ratio_mean_score + (1 - low_ratio_mean_score)) / 2
    scores['separation_score'] = separation_score
    scores['joint_separation_score'] = joint_separation_score
    scores['combined_score'] =\
        (probability_score * separation_score)\
        if probability_score != 'N/A' and separation_score != 'N/A'\
        else 'N/A'
    print('probability score: %f' % scores['probability_score'])
    print('separation score:', scores['separation_score'])
    print('combined score:', scores['combined_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time
    print("Cycle time = %f seconds" % cycle_time)

    return scores
def analyze_topics(include_stability=True):
    """Analyze the trained topic model and compute its quality scores.

    Renders the topic/term strings for the configured model type, gathers
    per-topic ratio and weighted-frequency statistics via
    ``ContextExtractor``, and derives aggregate scores: mean probability
    score, separation scores between context-rich and context-poor
    topics, optionally a stability score, and a combined score.

    :param include_stability: when True, also compute the (expensive)
        topic stability score
    :return: a dict of scores
    """
    start_time = time.time()
    utilities.plant_seeds()

    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model(num_terms)
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]

    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []
    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    # generate_excel_file(topic_data)
    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    # Mean probability score of topics above/below the context ratio
    # threshold (context-rich vs context-poor topics).
    high_ratio_mean_score = data_frame[
        (data_frame.ratio > Constants.CONTEXT_EXTRACTOR_BETA)
    ]['probability_score'].mean()
    low_ratio_mean_score = data_frame[
        (data_frame.ratio < Constants.CONTEXT_EXTRACTOR_BETA)
    ]['probability_score'].mean()

    stability = None
    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO
    if include_stability:
        # NOTE(review): calculate_topic_stability is called with a sample
        # ratio here — confirm the imported version accepts this argument.
        stability = calculate_topic_stability(records, sample_ratio).mean()
    scores['stability'] = stability

    # separation_score =\
    #     (high_ratio_mean_score / low_ratio_mean_score)\
    #     if low_ratio_mean_score != 0\
    #     else 'N/A'
    # Weighted blend: rewards high scores for context-rich topics and low
    # scores for context-poor ones.
    gamma = 0.5
    separation_score = \
        gamma*high_ratio_mean_score + (1 - gamma)*(1-low_ratio_mean_score)
    joint_separation_score =\
        (high_ratio_mean_score + (1 - low_ratio_mean_score)) / 2
    scores['separation_score'] = separation_score
    scores['joint_separation_score'] = joint_separation_score
    scores['combined_score'] =\
        (probability_score * separation_score)\
        if probability_score != 'N/A' and separation_score != 'N/A'\
        else 'N/A'
    print('probability score: %f' % scores['probability_score'])
    print('separation score:', scores['separation_score'])
    print('combined score:', scores['combined_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time
    print("Cycle time = %f seconds" % cycle_time)

    return scores