def setup(opt):
    if opt.dataset_name.lower() in ('cmu_mosi', 'cmu_mosei'):
        from data.reader import DataReader
        reader = DataReader(opt)
        reader = reader.prepare_data()
        return reader
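# Usage sketch (hypothetical): `setup` only reads `opt.dataset_name`, so a
# bare argparse Namespace is enough to exercise it. The names below are
# illustrative, not part of the original code. Note that any dataset other
# than CMU-MOSI/CMU-MOSEI falls through and the function returns None.
from argparse import Namespace

opt = Namespace(dataset_name='CMU_MOSI')  # matching is case-insensitive
reader = setup(opt)
if reader is None:
    raise ValueError('unsupported dataset: %s' % opt.dataset_name)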
def execute(self, args):
    try:
        # This call validates inputs. If a required arg isn't there
        # or an additional, unexpected, arg is present it will except.
        execute_args = super()._parse_execute_arguments(args)
        if IFunction.GLOBAL_HELP in execute_args.keys():
            # Regardless of anything else, if help is there, show it and quit
            self.get_help(1)
        else:
            start_month = date_range(1, -1)
            end_month = date_range(50, -1)

            # Filter the files and get the data from the last one since
            # we are just going over global stats.
            region = execute_args['-r']
            files = DataReader.get_dated_files(self.datasets, start_month, end_month)
            region_data = GlobalDataParser.parse_region_data([files[-1]], region)

            print("Province information for region:", region)
            if len(region_data):
                columns = [6, 30, 7, 8, 11, 11]
                headers = ['Rank', 'Province', 'Cases', 'Deaths', 'Recovered', 'Mortality']
                PrintTable.print_banner(columns, headers)

                region_info = region_data[0]
                rows = []
                for province in region_info.provinces.keys():
                    mortality = "%.2f" % region_info.provinces[province].get_mortality()
                    row = [
                        province,
                        region_info.provinces[province].confirmed_cases,
                        region_info.provinces[province].deaths,
                        region_info.provinces[province].recovered,
                        mortality + " %"
                    ]
                    rows.append(row)

                # Sort by the numeric mortality rate (row[4] holds "NN.NN %")
                rows = sorted(rows, reverse=True, key=lambda x: float(x[4].split()[0]))

                # Now print them
                rank = 1
                for row in rows:
                    row.insert(0, rank)
                    PrintTable.print_row(columns, row)
                    rank += 1
            else:
                print("No data found.")
    except Exception as ex:
        print(str(ex))
def main():
    # read the data
    reader = DataReader(main_path)
    questions = reader.questions
    workers = reader.workers
    answers = reader.answers
    print("Finished reading answers from " + reader.answer)
    print("Total questions: " + str(len(questions)))
    print("Total workers: " + str(len(workers)))
    print("Total answers: " + str(len(answers)))
    print("class 0:",
          sum(1 for v in questions.values() if v == 0) / len(questions))

    # split into train/validation/test
    (q_train, q_validation, q_test) = SplitData(split_seed, list(questions.items()))
    (full, train, validation, test) = ArrangeData(questions, q_train, q_validation,
                                                  q_test, workers, answers)
    print("Train size", len(train.questions), len(train.workers), len(train.answers))
    print("Validation size", len(validation.questions), len(validation.workers),
          len(validation.answers))
    print("Test size", len(test.questions), len(test.workers), len(test.answers))

    # cut ground truth from the test set but keep a local copy
    test_set = deepcopy(test)
    for q in test.questions.keys():
        test.questions[q] = None
        full.questions[q] = None
    #end for

    # initialize algorithms
    algos = []
    algos.append(MajorityVoting(full, train, validation, test))
    algos.append(DawidSkene(full, train, validation, test, ds_seed, "mv_w", 100))

    for alg in algos:
        print("")
        print("Algorithm - " + alg.name)
        # run the algo
        test_ans = alg.run()
        # get back metrics
        (precision, recall, f1score, accuracy) = alg.validate(test_set, test_ans)
        print(("precision", precision))
        print(("recall", recall))
        print(("f1score", f1score))
        print(("accuracy", accuracy))
def execute(self, args):
    try:
        # This call validates inputs. If a required arg isn't there
        # or an additional, unexpected, arg is present it will except.
        execute_args = super()._parse_execute_arguments(args)
        if IFunction.GLOBAL_HELP in execute_args.keys():
            # Regardless of anything else, if help is there, show it and quit
            self.get_help(1)
        else:
            start_month = date_range(1, -1)
            end_month = date_range(50, -1)

            # Filter the files and get the data from the last one since
            # we are just going over global stats.
            files = DataReader.get_dated_files(self.datasets, start_month, end_month)
            region_data = GlobalDataParser.parse_region_data([files[-1]])

            # Accumulate province data for each region
            regional_details = {}
            for reg in region_data:
                regional_details[reg.region] = reg.condense_provinces()

            # Get the mortality for each region
            mortality_stats = []
            for reg in regional_details.keys():
                mortality = regional_details[reg].get_mortality()
                mortality_stats.append(mortality_overall(reg, mortality))

            sorted_mortality_stats = sorted(mortality_stats,
                                            reverse=True,
                                            key=lambda x: x.rate)

            '''
            Table 1: Prints out each region with its rolled-up stats
            across all provinces.
            '''
            columns = [6, 35, 11]
            headers = ['Rank', 'Region', 'Mortality']
            PrintTable.print_banner(columns, headers)

            entry = 1
            for stat in sorted_mortality_stats:
                mortality_rate = "%.2f" % stat.rate
                row = [entry, stat.region, '{}%'.format(mortality_rate)]
                PrintTable.print_row(columns, row)
                entry += 1
    except Exception as ex:
        print(str(ex))
        raise
def load_data(path, date_col, frequency):
    """
    Loads the dataset, indexes it and imputes the missing values

    :param path: path to the data set on disk
    :param date_col: date column name in the dataset
    :param frequency: frequency of the dates
    :return: pandas dataframe
    """
    dataframe = DataReader.read_data(path, date_col=date_col)
    dataframe = Indexer.index_dates(dataframe, date_col, frequency=frequency)
    dataframe = Imputer.impute(dataframe)
    return dataframe
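# Usage sketch, assuming the project's DataReader/Indexer/Imputer helpers are
# importable; the path, column name, and 'D' (daily) frequency below are
# illustrative values, not taken from the original code.
df = load_data('data/sales.csv', date_col='date', frequency='D')
print(df.head())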
def convert_set(self, reader: DataReader):
    return [(self.convert_graph(d.graph), self.convert_plan(d.plan))
            for d in reader.copy().data]
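# Usage sketch (hypothetical): convert_set pairs each datum's converted graph
# with its converted plan, e.g. to build a training set. 'converter' and
# 'reader' stand in for instances of the (unshown) converter class and
# DataReader; they are assumptions, not part of the original code.
pairs = converter.convert_set(reader)
graphs, plans = zip(*pairs) if pairs else ((), ())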
def execute(self, args):
    try:
        # This call validates inputs. If a required arg isn't there
        # or an additional, unexpected, arg is present it will except.
        execute_args = super()._parse_execute_arguments(args)
        if IFunction.GLOBAL_HELP in execute_args.keys():
            # Regardless of anything else, if help is there, show it and quit
            self.get_help(1)
        else:
            start_month = date_range(1, -1)
            end_month = date_range(50, -1)

            # Filter the files and get the data from the last one since
            # we are just going over global stats.
            files = DataReader.get_dated_files(self.datasets, start_month, end_month)
            region_data = GlobalDataParser.parse_region_data([files[-1]])

            # Accumulate province data for each region
            regional_details = {}
            for reg in region_data:
                regional_details[reg.region] = reg.condense_provinces()

            '''
            Table 1: Prints out each region with its rolled-up stats
            across all provinces.
            '''
            columns = [35, 11, 11, 11]
            headers = ['Region', 'Confirmed', 'Deaths', 'Mortality']
            PrintTable.print_banner(columns, headers)

            overall_stats = OverallStats()
            data_rows = []
            for region_key in regional_details.keys():
                data_row = [region_key]
                data_row.append(regional_details[region_key].confirmed_cases)
                data_row.append(regional_details[region_key].deaths)
                #data_row.append(regional_details[region_key].recovered)

                mortality = '0.00'
                try:
                    mortality = (regional_details[region_key].deaths /
                                 regional_details[region_key].confirmed_cases) * 100
                    mortality = "%.2f" % mortality
                except ZeroDivisionError:
                    # No confirmed cases yet; leave mortality at '0.00'
                    pass
                data_row.append(mortality)

                overall_stats.add_deaths(region_key,
                                         regional_details[region_key].deaths)
                overall_stats.add_mortality(region_key, float(mortality))
                overall_stats.add_confirmed(
                    region_key, regional_details[region_key].confirmed_cases)
                overall_stats.recovered_cases.append(
                    regional_details[region_key].recovered)

                data_rows.append(data_row)

            # Optional sort: -c by confirmed cases, -d by deaths
            sort_column = -1
            if '-c' in execute_args:
                sort_column = 1
            elif '-d' in execute_args:
                sort_column = 2
            if sort_column != -1:
                data_rows = sorted(data_rows, reverse=True,
                                   key=lambda x: x[sort_column])

            for row in data_rows:
                PrintTable.print_row(columns, row)

            '''
            Table 2: Global overall stats focusing on US vs World
            '''
            print("")
            columns = [36, 26]
            headers = ['General Statistic', 'Value']
            banner, cols = PrintTable.print_banner(columns, headers)

            total_cases = (overall_stats.us_confirmed_cases +
                           sum(overall_stats.non_us_confirmed_cases))
            total_deaths = overall_stats.us_deaths + sum(overall_stats.non_us_deaths)

            us_cases = "%d (%.3f)" % (
                overall_stats.us_confirmed_cases,
                float(overall_stats.us_confirmed_cases / total_cases) * 100)
            non_us_cases = "%d (%.3f)" % (
                sum(overall_stats.non_us_confirmed_cases),
                float(sum(overall_stats.non_us_confirmed_cases) / total_cases) * 100)
            us_deaths = "%d (%.3f)" % (
                overall_stats.us_deaths,
                float(overall_stats.us_deaths / total_deaths) * 100)
            non_us_deaths = "%d (%.3f)" % (
                sum(overall_stats.non_us_deaths),
                float(sum(overall_stats.non_us_deaths) / total_deaths) * 100)

            PrintTable.print_row(columns, ["Total Global Confirmed Cases", total_cases])
            PrintTable.print_row(columns, ["US Confirmed Cases", us_cases])
            PrintTable.print_row(columns, ["Non-US Confirmed Cases", non_us_cases])
            PrintTable.print_row(columns, ["Total Global Deaths", total_deaths])
            PrintTable.print_row(columns, ["US Deaths", us_deaths])
            PrintTable.print_row(columns, ["Non-US Deaths", non_us_deaths])
            #PrintTable.print_row(columns, ["Total Recovered", sum(overall_stats.recovered_cases)])
            print(banner)
            PrintTable.print_row(
                columns, ["Highest Mortality", overall_stats.mortality_highest])
            PrintTable.print_row(
                columns, ["Highest Mortality Region", overall_stats.mortality_winner])
            print(banner)
            PrintTable.print_row(
                columns, ["Highest Reported Deaths", overall_stats.total_cases_highest])
            PrintTable.print_row(
                columns, ["Highest Reported Region", overall_stats.total_cases_winner])
            print(banner)
            print("")
    except Exception as ex:
        print(str(ex))
        raise
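# For reference: the mortality figure used throughout these commands is the
# simple case-fatality ratio, deaths / confirmed * 100, as computed inline
# above. A minimal standalone equivalent (hypothetical helper, not part of
# the original classes):
def mortality_percent(deaths, confirmed):
    """Deaths as a percentage of confirmed cases; 0.0 when there are no cases."""
    return (deaths / confirmed) * 100 if confirmed else 0.0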
def execute(self, args):
    try:
        # This call validates inputs. If a required arg isn't there
        # or an additional, unexpected, arg is present it will except.
        execute_args = super()._parse_execute_arguments(args)
        if IFunction.GLOBAL_HELP in execute_args.keys():
            # Regardless of anything else, if help is there, show it and quit
            self.get_help(1)
        else:
            start_month = date_range(int(execute_args['-s']), -1)
            end_month = date_range(-1, -1)
            if '-e' in execute_args.keys():
                end_month = date_range(int(execute_args['-e']), -1)

            files = DataReader.get_dated_files(self.datasets, start_month, end_month)

            # Get the region and then collect the data
            desired_region = execute_args['-r']
            region_data = GlobalDataParser.parse_region_data(files, desired_region)

            # Output table header
            columns = [16, 19, 19, 19, 11]
            headers = ['File', 'Confirmed', 'Deaths', 'Recovered', 'Mortality']
            PrintTable.print_banner(columns, headers)

            last_confirmed = -1
            last_death = -1
            last_recovered = -1

            # If -sum, only show the first and last files
            data_to_scan = region_data
            if '-sum' in execute_args.keys():
                data_to_scan = [region_data[0], region_data[-1]]

            for rdata in data_to_scan:
                file_name = rdata.data_file.file_name

                # Accumulate all of the province/state data
                condensed = rdata.condense_provinces()
                confirmed = condensed.confirmed_cases
                deaths = condensed.deaths
                recovered = condensed.recovered

                # Mortality rate
                mrate = '%.2f' % condensed.get_mortality()

                # If we have a previous entry, show the delta from it
                if last_confirmed != -1:
                    confirmed = "{} ({})".format(confirmed, confirmed - last_confirmed)
                    deaths = "{} ({})".format(deaths, deaths - last_death)
                    recovered = "{} ({})".format(recovered, recovered - last_recovered)

                last_confirmed = condensed.confirmed_cases
                last_death = condensed.deaths
                last_recovered = condensed.recovered

                output_data = [file_name, confirmed, deaths, recovered, mrate + ' %']
                PrintTable.print_row(columns, output_data)
    except Exception as ex:
        print(str(ex))
def main():
    # read the data
    reader = DataReader(main_path)
    questions = reader.questions
    workers = reader.workers
    answers = reader.answers
    print("Finished reading answers from " + reader.answer)
    print("Total questions: " + str(len(questions)))
    print("Total workers: " + str(len(workers)))
    print("Total answers: " + str(len(answers)))
    print()

    # questions
    print("class 0:",
          sum(1 for v in questions.values() if v == 0) / len(questions))
    print("class 1:",
          sum(1 for v in questions.values() if v == 1) / len(questions))
    print()

    # workers
    w_accuracy = {}
    w_answers = {}
    w_count = {}
    for w, ans in workers.items():
        # `answer_counts` (renamed from the shadowing `answers`) tallies how
        # often this worker answered 0 vs 1.
        answer_counts = [0.0, 0.0]
        total = 0.0
        correct = 0.0
        for (q, a) in ans:
            # accuracy
            total += 1.0
            correct += 1.0 if a == questions[q] else 0.0
            answer_counts[a] += 1.0
        #end for
        w_accuracy[w] = correct / total
        w_answers[w] = answer_counts[1] / (answer_counts[0] + answer_counts[1])
        w_count[w] = total
    #end for

    # values
    print("# of answers per worker")
    print(("mean", stats.mean(w_count.values())),
          ("stddev", stats.stdev(w_count.values())),
          ("median", stats.median(w_count.values())))
    print(("min", min(w_count.values())), ("max", max(w_count.values())))
    print()

    w_count_s = sorted(w_count.values())
    count_90 = (len(w_count_s) * 90) / 100
    w_count_s = w_count_s[:int(count_90)]
    print("# of answers per worker (90th percentile)")
    print(("mean", stats.mean(w_count_s)), ("max", max(w_count_s)))
    print()

    print("accuracy per worker")
    print(("mean", stats.mean(w_accuracy.values())),
          ("stddev", stats.stdev(w_accuracy.values())),
          ("median", stats.median(w_accuracy.values())))
    print(("min", min(w_accuracy.values())), ("max", max(w_accuracy.values())))
    print()
    #exit()

    # plotting (dict views are converted to lists for matplotlib/numpy)
    plt.hist(list(w_count.values()))
    plt.title("Distribution of worker answer count")
    plt.xlabel("# answers")
    plt.ylabel("# workers")
    plt.show()

    # bins whose areas are centered around 0.0, 0.1, ... 1.0
    bins = (np.arange(0, 13) / 11.0) - 0.05
    plt.hist(list(w_accuracy.values()), bins=bins)
    plt.title("Distribution of worker accuracy")
    plt.xlabel("% of correct answers")
    plt.ylabel("# workers")
    plt.show()

    # shift the bins so the prior line sits closer to the center of a block
    plt.hist(list(w_answers.values()), bins=(bins + 0.02158749248346362))
    plt.axvline(x=0.12158749248346362, ymin=0, ymax=1, color="red")
    plt.title("Distribution of worker answers")
    plt.xlabel("% of positive answers")
    plt.ylabel("# workers")
    plt.show()

    plt.scatter(list(w_answers.values()), list(w_accuracy.values()))
    plt.title("Distribution of worker answers to worker accuracy")
    plt.xlabel("% of positive answers")
    plt.ylabel("% of correct answers")
    plt.show()

    plt.scatter(list(w_count.values()), list(w_accuracy.values()))
    plt.title("Distribution of worker activity to worker accuracy")
    plt.xlabel("# answers")
    plt.ylabel("% of correct answers")
    plt.show()
def main(FLAGS):
    # Define data reader.
    dataset = DataReader(FLAGS.DATA_DIR, FLAGS.SAMPLING_MINUTE,
                         FLAGS.SAMPLING_SIZE, FLAGS.BATCH_SIZE, FLAGS.USE_JSON)

    # Define model.
    model = RNN_cell(FLAGS.INPUT_SIZE, FLAGS.HIDDEN_LAYER_SIZE, FLAGS.TARGET_SIZE)

    # Define label and model output.
    label = tf.placeholder(tf.float32, shape=[None], name='labels')
    label_one_hot = tf.one_hot(tf.cast(label, tf.int32), depth=FLAGS.TARGET_SIZE)
    outputs = model.get_outputs()
    last_output = outputs[-1]
    output = tf.nn.softmax(last_output)
    output_class = tf.argmax(output, 1)

    # Define cross entropy loss.
    cross_entropy = -tf.reduce_mean(label_one_hot * tf.log(output))
    #cross_entropy = FLAGS.tf_Weighted_RMSE(label, output)
    #cross_entropy = tf.reduce_mean(tf.square(output - label))
    #cross_entropy = tf.reduce_mean(label*output)

    # Define adam training optimizer.
    learning_rate = tf.placeholder(tf.float32)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

    # Track the best validation score (and where it occurred) across training.
    saved_eval = 10.0
    saved_epoch = 0
    saved_num = 0

    # Define log summary
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('loss', cross_entropy)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # initialize_all_variables is deprecated in TF 1.x
        sess.run(tf.global_variables_initializer())
        merged = tf.summary.merge_all()
        if FLAGS.SAVE_SUMMARY:
            writer = tf.summary.FileWriter(FLAGS.LOG_PATH, sess.graph)
        saver = tf.train.Saver(max_to_keep=10)

        for epoch in range(0, FLAGS.EPOCH):
            start_time = time.time()
            for num in range(0, dataset.train_batch_num):
                step = epoch * dataset.train_batch_num + num
                lr = FLAGS.LEARNING_RATE * (
                    0.5**(np.floor(epoch / FLAGS.LR_DECAY_EPOCH)))

                # Train model
                train_x, train_y = dataset.get_train_batch()
                loss, summary, _ = sess.run(
                    [cross_entropy, merged, train_step],
                    feed_dict={
                        model._inputs: train_x,
                        label: train_y,
                        learning_rate: lr
                    })

                # Evaluate on validation set.
                valid_x = dataset.X_valid
                valid_y = dataset.Y_valid
                valid_predict = sess.run(output_class,
                                         feed_dict={model._inputs: valid_x})
                valid_eval = FLAGS.Weighted_RMSE(valid_y, valid_predict)

                if FLAGS.SAVE_SUMMARY:
                    writer.add_summary(summary, step)

                # Save model setting
                if step % FLAGS.SAVE_STEP == 0:
                    saver.save(sess,
                               os.path.join(FLAGS.MODEL_PATH, 'model'),
                               global_step=step)

                # Track the best validation score seen so far (the checkpoint
                # at that step is not necessarily saved).
                if valid_eval <= saved_eval:
                    saved_epoch = epoch
                    saved_num = num
                    saved_eval = valid_eval

                # Display setting
                if step % FLAGS.DISPLAY_STEP == 0:
                    # start_time resets each epoch, so measure throughput with
                    # the number of batches completed in this epoch.
                    rate = (num + 1) * FLAGS.BATCH_SIZE / (time.time() - start_time)
                    remaining = ((FLAGS.EPOCH * dataset.train_batch_num - step) *
                                 FLAGS.BATCH_SIZE / rate)
                    print("###################################################")
                    print("progress epoch %d step %d / %d image/sec %0.1f remaining %0.1fm"
                          % (epoch, num, dataset.train_batch_num, rate, remaining / 60))
                    print("- Loss =", loss)
                    print("- Weighted RMSE on validation =", valid_eval)
                    print("- Accuracy on validation =",
                          np.sum(valid_y == valid_predict) / np.shape(valid_y)[0])
                    print("- Best (but not necessarily saved) Weighted RMSE on validation =",
                          saved_eval)
                    print("- Best model on validation at epoch =", saved_epoch,
                          "step =", saved_num)
                    print("- Min kp-index on validation set :", np.min(valid_predict))
                    print("- Max kp-index on validation set :", np.max(valid_predict))
    print("Finish!")
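# Aside (hypothetical refactor, not part of the original script): the
# hand-rolled cross entropy above can produce NaNs when `output` contains
# exact zeros; TF 1.x provides a numerically stable fused op that works on
# the pre-softmax logits directly:
#
#   cross_entropy = tf.reduce_mean(
#       tf.nn.softmax_cross_entropy_with_logits_v2(
#           labels=label_one_hot, logits=last_output))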
def run():
    result = pd.DataFrame(columns=result_col_names)
    # variance_thresholds = [1, 0.25, 0.50, 0.75]
    variance_thresholds = [1]
    # combinations = list(itertools.combinations([1, 2], 2))
    # borda combinations
    # combinations = []
    combinations = [
        BordaCombinations.LS_SPEC,
        BordaCombinations.LS_IDETECT,
        BordaCombinations.LS_SPEC_IDETECT,
        BordaCombinations.SPEC_IDETECT,
        BordaCombinations.GLSPFS_LS,
        BordaCombinations.GLSPFS_SPEC,
        BordaCombinations.GLSPFS_IDETECT,
        BordaCombinations.GLSPFS_LS_SPEC,
        BordaCombinations.GLSPFS_LS_IDETECT,
        BordaCombinations.GLSPFS_SPEC_IDETECT,
        BordaCombinations.GLSPFS_LS_SPEC_IDETECT
    ]

    for variance_threshold in variance_thresholds:
        for dataset_name in dataSets:
            reader = DataReader(dataset_name)
            time.start_time()
            dataset, n_features, y_true = reader.get_preprocessed_data()
            time.end_time("read dataset")

            # default_values = state of the art
            control = init.get_initial_variables(is_default_values=False)
            rankings = np.zeros([n_features, 1])

            for method in methods:
                control.best_silhouette = 0
                current_result, best_rank = feat_selection.run_and_evaluate_fs_methods(
                    dataset, dataset_name, method, y_true, result_col_names,
                    variance_threshold, control)
                result = pd.concat([result, current_result])

                results_filename = "result_best_fs_" + dataset_name + ".csv"
                result.to_csv(FoldersLocation.results.value + results_filename, sep=" ")

                rank = pd.Series(best_rank)
                logger.log("best rank: " + str(rank.shape), False)
                rankings = np.append(rankings, rank[:, None], 1)

            # Adding Borda Count results
            if not control.state_of_art:
                for comb in combinations:
                    borda_results = borda.get_borda_results(rankings, dataset,
                                                            dataset_name,
                                                            result_col_names,
                                                            y_true,
                                                            combination=comb)
                    result = pd.concat([result, borda_results])

            variance_results_filename = ("result_after_" + str(variance_threshold) +
                                         "_variance_best_fs_" + dataset_name + ".csv")
            results_filename = FoldersLocation.results.value + variance_results_filename
            result.to_csv(results_filename, sep=" ")