import random
import sys
import uuid

import parse_data
from decision_tree import DecisionTree  # assumed local module layout


def main(argc, argv):
    """Entry point: train decision trees on bootstrap samples, writing each to JSON."""
    if argc < 3 or argc > 4:
        sys.exit(f"Usage: python3 {argv[0]} <training_file> <output_dir> <random_features?>")
    _, training_y, training_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",")
    random_features = None
    if argc >= 4:
        random_features = int(argv[3])
    num_rows = len(training_y)
    while True:
        tree = DecisionTree()
        # Bootstrap sample: draw num_rows row indexes with replacement.
        rows_to_evaluate = random.choices(range(num_rows), k=num_rows)
        tree.train(rows_to_evaluate, training_x, training_y, random_features=random_features)
        filename = f"{argv[2]}/{uuid.uuid4()}.json"
        with open(filename, "w") as out_file:
            out_file.write(tree.to_json())
        print(filename)
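
# A likely launcher for the entry point above, given its (argc, argv)
# signature; the original invocation is not shown, so this is an assumption.
if __name__ == "__main__":
    main(len(sys.argv), sys.argv)
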
import random
import sys

import parse_data  # assumed local module


def main(argv):
    if len(argv) != 5:
        sys.exit(f"Usage: python3 {argv[0]} <file> <percent_validation> <training_filename> <validation_filename>")
    rna_ids, y_data, x_data = parse_data.read_data(argv[1])
    pct_validation = float(argv[2])
    # Group row indexes by label so the validation split is stratified.
    y_counts = {}
    for index, y in enumerate(y_data):
        key = repr(y)
        if key not in y_counts:
            y_counts[key] = []
        y_counts[key].append(index)
    validation_indexes = []
    training_indexes = []
    for key in y_counts:
        random.shuffle(y_counts[key])
        split_point = int(len(y_counts[key]) * pct_validation)
        validation_indexes.extend(y_counts[key][:split_point])
        training_indexes.extend(y_counts[key][split_point:])
    create_csv(argv[3], training_indexes, rna_ids, y_data, x_data)
    create_csv(argv[4], validation_indexes, rna_ids, y_data, x_data)
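
# A minimal sketch of the create_csv helper used above, assuming rna_ids,
# y_data, and x_data are parallel sequences; the id,label,features column
# order is an assumption.
import csv

def create_csv(filename, indexes, rna_ids, y_data, x_data):
    with open(filename, "w", newline="") as out_file:
        writer = csv.writer(out_file)
        for index in indexes:
            writer.writerow([rna_ids[index], y_data[index], *x_data[index]])
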
import parse_data as pd  # assumed: pd aliases the project's parse_data module


def main():
    data = pd.read_data()
    products = data["products_weights"]
    warehouses = data["warehouses"]
    max_dron_size = data["max_payload"]
    drons = []
    for i in range(data["drones_num"]):
        # Every drone is assumed to start at the first warehouse.
        drons.append({"id": i, "position": warehouses[0]["position"], "cooldown": 0})
        drons[-1]["warehouse"] = get_dron_start_warehouse(drons[-1], warehouses)
    turn = 0
    while turn < data["turns"]:
        while has_free_drons(drons):
            dron = get_free_dron(drons)
            order = get_nearest_order(data["orders"], dron["warehouse"])
            while order is None:
                # No deliverable order from this warehouse; move on to the next.
                dron["warehouse"]["disabled"] = True
                if get_not_disabled_warehouse(dron, warehouses) is None:
                    break
                dron["warehouse"] = get_not_disabled_warehouse(dron, warehouses)
                order = get_nearest_order(data["orders"], dron["warehouse"])
            if order is None:
                break  # every warehouse is exhausted; stop dispatching
            loads, delivers = pack_dron(dron, max_dron_size, order, products, dron["warehouse"])
            output_commands(loads)
            output_commands(delivers)
        for dron in drons:
            if dron["cooldown"] > 0:
                dron["cooldown"] -= 1
        turn += 1
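
# Minimal sketches of the drone-pool helpers used above. Treating a drone as
# free when its cooldown is zero is an assumption inferred from the cooldown
# bookkeeping in the main loop; pack_dron is assumed to set the cooldown.
def has_free_drons(drons):
    return any(dron["cooldown"] == 0 for dron in drons)


def get_free_dron(drons):
    for dron in drons:
        if dron["cooldown"] == 0:
            return dron
    return None
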
import glob
import json
import sys

import parse_data
from decision_tree import DecisionTree  # assumed local module layout
from random_forest import RandomForest


def main(argv):
    if len(argv) != 3:
        sys.exit(f"Usage: python3 {argv[0]} <testing_file> <tree_json_dir>")
    rna_ids, _, test_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",")
    json_files = glob.glob(f"{argv[2]}/*.json")
    forest = RandomForest()
    # tree_weights.json holds per-tree weights rather than a serialized tree,
    # so exclude it from the tree files.
    weights_filename = f"{argv[2]}/tree_weights.json"
    json_files.remove(weights_filename)
    with open(weights_filename, "r") as weights_file:
        weights = json.loads(weights_file.read())
    for filename in json_files:
        with open(filename, "r") as tree_file:
            tree = DecisionTree.from_json(tree_file.read())
        forest.add_tree(tree)
        forest.weights.append(weights[filename])
    for i, x in enumerate(test_x):
        prediction, confidence = forest.predict_with_confidence(x)
        if prediction == 0.0:
            # Report confidence as the probability of the positive class.
            confidence = 1 - confidence
        print(f"{rna_ids[i]},{confidence}")
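
# The weights file read above appears to map each tree's JSON path to a float
# weight. A sketch of how such a file might be produced, assuming per-tree
# validation scores are already in hand (write_tree_weights and scores are
# hypothetical names):
def write_tree_weights(tree_dir, scores):
    """scores: dict mapping tree filename -> weight for that tree."""
    with open(f"{tree_dir}/tree_weights.json", "w") as weights_file:
        weights_file.write(json.dumps(scores))
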
import glob
import sys

import parse_data
from decision_tree import DecisionTree  # assumed local module layout
from random_forest import RandomForest


def main(argv):
    """Entry point: report forest accuracy on a labeled test file."""
    if len(argv) != 3:
        sys.exit(f"Usage: python3 {argv[0]} <testing_file> <tree_json_dir>")
    _, test_y, test_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",")
    json_files = glob.glob(f"{argv[2]}/*.json")
    forest = RandomForest()
    for filename in json_files:
        with open(filename, "r") as tree_file:
            tree = DecisionTree.from_json(tree_file.read())
        forest.add_tree(tree)
    total_right = 0
    for i, point in enumerate(test_x):
        expected = forest.predict(point)
        if test_y[i] == expected:
            total_right += 1
    accuracy = total_right / len(test_y)
    print(f"Accuracy: {accuracy}")
from os import path

from pymongo import MongoClient


def import_data():
    clear_db()
    with MongoClient() as client:
        db = client[DB_NAME]
        training_collection = db['training_data']
        test_collection = db['test_data']
        # Training files arrive paired with a labels file; test files do not.
        for data_file, labels_file in TRAINING_DATA:
            read_data(
                data_file_path=path.join(DATA_DIR, data_file),
                db_collection=training_collection,
                labels_file_path=path.join(DATA_DIR, labels_file),
            )
        for data_file in TEST_DATA:
            read_data(
                data_file_path=path.join(DATA_DIR, data_file),
                db_collection=test_collection,
            )
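
# A minimal sketch of the clear_db helper called above, assuming it simply
# drops the two collections this module writes to.
def clear_db():
    with MongoClient() as client:
        client[DB_NAME].drop_collection('training_data')
        client[DB_NAME].drop_collection('test_data')
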
import glob
import os
import sys

import numpy

import parse_data
from decision_tree import DecisionTree  # assumed local module layout


def main(argv):
    if len(argv) != 3:
        sys.exit(f"Usage: python3 {argv[0]} <testing_file> <tree_json_dir>")
    _, test_y, test_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",")
    tree_files = glob.glob(f"{argv[2]}/*.json")
    forest = []
    for filename in tree_files:
        with open(filename, "r") as tree_file:
            forest.append(DecisionTree.from_json(tree_file.read()))
    # Score each tree by the distance between its predictions and the labels.
    diffs = []
    forest_predictions = []
    for tree in forest:
        tree_predictions = [tree.predict(x) for x in test_x]
        forest_predictions.append(tree_predictions)
        diffs.append(dist(test_y, tree_predictions))
    sorted_refs = list(range(len(forest)))
    sorted_refs.sort(key=lambda ref: diffs[ref])
    # Do not need diffs anymore.
    del diffs
    # Greedily grow the ensemble from the best tree, keeping a candidate only
    # if adding it moves the averaged predictions closer to the ground truth.
    prediction_sum = numpy.array(forest_predictions[sorted_refs[0]])
    smallest_dist = dist(test_y, prediction_sum)
    print(smallest_dist)
    best_trees = [tree_files[sorted_refs[0]]]
    for ref in sorted_refs[1:]:
        new_combination = numpy.add(prediction_sum, forest_predictions[ref])
        normalized_combination = new_combination / (len(best_trees) + 1)
        new_dist = dist(test_y, normalized_combination)
        # Might need to make this <= due to floating-point ties.
        if new_dist < smallest_dist:
            prediction_sum = new_combination
            smallest_dist = new_dist
            print(smallest_dist)
            best_trees.append(tree_files[ref])
    # best_trees now holds the subset whose averaged predictions sit closest to
    # the ground truth (i.e. the test data). One caveat with multiple labels:
    # this distance can be biased against labels that are numerically farther
    # apart (e.g. 2 vs 0 compared with 1 vs 0). Prune every tree that did not
    # make the cut.
    for tree_file in tree_files:
        if tree_file not in best_trees:
            print(f"removing {tree_file}")
            os.remove(tree_file)
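
# A minimal sketch of the dist helper used above, assuming plain Euclidean
# distance between the label vector and a prediction vector.
def dist(expected, predicted):
    return float(numpy.linalg.norm(numpy.subtract(expected, predicted)))
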
def test_read_data(self):
    (h, d) = pd.read_data(test_file)
    assert h is not None
    logger.debug("h: {}".format(h))
    assert "pool_id" in h
    assert d is not None
    logger.debug("d: {}".format(d))
    assert len(d) > 0
import ast
import os

import numpy as np

import exceptions  # assumed local module defining StupidityError
import parse_data


def interpret_and_run(args):
    if args.subcommand == 'data' or args.subcommand == 'npz':
        # Colour limits arrive as literal strings such as "(0, 1)". They are
        # parsed with ast.literal_eval rather than eval, on the assumption
        # that they are plain Python literals.
        if args.amplitude_colour_limit is None:
            amplitude_colour_limit = None
        else:
            amplitude_colour_limit = ast.literal_eval(args.amplitude_colour_limit)
        if args.norm_squared_colour_limit is None:
            norm_squared_colour_limit = None
        else:
            norm_squared_colour_limit = ast.literal_eval(args.norm_squared_colour_limit)
        if args.subcommand == 'data':
            if args.data_file is None:
                raise exceptions.StupidityError('No data_file entered.')
            unoptimized_size = ast.literal_eval(args.size)
            center = ast.literal_eval(args.center)
            data = parse_data.read_data(args.data_file)
            args_dict = {
                'frequency': args.frequency,
                'distance': args.distance,
                'resolution': args.resolution,
                'size': unoptimized_size,
                'center': center,
                'source_locations': data[0],
                'source_amplitudes': data[1],
            }
            main_directory_path = initialize_main_directory(
                args.data_file, args.directory_save_location,
                args.save_directory_name, args.new_save_directory)
            spherical_interpreter = SphericalInterpreter("raw_data", args_dict)
            spherical_interpreter.to_analysis_file(
                os.path.join(main_directory_path, 'analysis_file.txt'))
            spherical_interpreter.to_npz_file(
                os.path.join(main_directory_path, 'npz_file.npz'))
            colour_plot_directory_path = initialize_colour_plot_directory(
                args.colour_plot_directory_name, main_directory_path,
                args.new_colour_plot_directory)
            to_colour_plot(spherical_interpreter, colour_plot_directory_path,
                           amplitude_colour_limit, norm_squared_colour_limit)
        elif args.subcommand == 'npz':
            if args.npz_file is None:
                raise exceptions.StupidityError('No npz_file entered.')
            if os.path.exists(args.npz_file):
                main_directory_path = os.path.dirname(args.npz_file)
                args_dict = {'npz_dict': np.load(args.npz_file)}
            else:
                raise ValueError('The specified npz_file does not exist.')
            spherical_interpreter = SphericalInterpreter('npz_file', args_dict)
            spherical_interpreter.to_analysis_file(
                os.path.join(main_directory_path, 'analysis_file.txt'))
            colour_plot_directory_path = initialize_colour_plot_directory(
                args.colour_plot_directory_name, main_directory_path,
                args.new_colour_plot_directory)
            to_colour_plot(spherical_interpreter, colour_plot_directory_path,
                           amplitude_colour_limit, norm_squared_colour_limit)
def read_prism_cell_from_file(row_metadata_file, items):
    filepath = row_metadata_file
    (headers, data) = parse_data.read_data(filepath)
    data = [x for x in data if x[0][0] != "#"]
    header_map = parse_data.generate_header_map(headers, items, False)
    logger.debug("header_map: {}".format(header_map))
    return parse_data.parse_data(header_map, data, PrismCell)
def main():
    args = parse_args()
    if args.runOnScenario != 'all':
        # Scenario names are case insensitive, so deduplicate them and
        # transform them all to upper case.
        args.runOnScenario = [i.upper() for i in set(args.runOnScenario)]
    if args.checkStatistic != -1:
        args.checkStatistic = list(set(args.checkStatistic))
    nodes = read_data(args.fileName, args.ignore)
    check_options(args.checkStatistic, nodes, args.runOnScenario,
                  args.reversePrecondPostcond, args.reverseTrigDesc,
                  args.top, args.noCPU, args.healScenarios)
def _read_perturbagen_from_file(filepath, do_keep_all):
    (headers, data) = parse_data.read_data(filepath)
    # TODO: think about other checks / better notification of wrong map type
    if "well_position" in headers:
        raise Exception(
            "Merino no longer supports CM map type, please convert map to CMap map type"
        )
    header_map = parse_data.generate_header_map(headers, None, do_keep_all)
    logger.debug("header_map: {}".format(header_map))
    return parse_data.parse_data(header_map, data, Perturbagen)
import matplotlib.pyplot as plt

import parse_data as pd  # assumed: pd aliases the project's parse_data module


def main():
    data = pd.read_data()
    warehouses_pos = split_position(data['warehouses'])
    orders_pos = split_position(data['orders'])
    # Total weight of each order: per-product item count times product weight.
    orders_sizes = []
    for order in data['orders']:
        items_total_weight = 0
        for i in range(len(order['items'])):
            items_total_weight += data['products_weights'][i] * order['items'][i]
        orders_sizes.append(items_total_weight)
    # Scale marker sizes to 0-100 relative to the heaviest order.
    max_orders_size = max(orders_sizes)
    orders_sizes = [int(size / float(max_orders_size) * 100) for size in orders_sizes]
    plt.scatter(orders_pos[0], orders_pos[1], s=orders_sizes, c='b')
    plt.scatter(warehouses_pos[0], warehouses_pos[1], c='r')
    plt.axis([0, data['field_size'][0], 0, data['field_size'][1]])
    plt.show()
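
# A minimal sketch of the split_position helper used above, assuming each
# warehouse and order carries an (x, y) pair under its 'position' key.
def split_position(entities):
    xs = [entity['position'][0] for entity in entities]
    ys = [entity['position'][1] for entity in entities]
    return xs, ys
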
import os
import pickle

from sklearn.model_selection import train_test_split
# Assumed: TextBlob's NaiveBayesClassifier, which matches the accuracy() and
# show_informative_features() calls below.
from textblob.classifiers import NaiveBayesClassifier


def train_NBC(filepath):
    new_df = read_data(filepath)
    new_train_test = new_df.values.tolist()
    x_train, x_test = train_test_split(new_train_test, test_size=0.1)
    cl = NaiveBayesClassifier(x_train)
    # print(cl.classify("Please create an assignment and forward it by EOD"))
    # print(cl.classify("Im not a dessert person but the warm butter cake should be illegal its so good."))
    print("Achieved a test accuracy of: %s" % cl.accuracy(x_test))
    # Details of the most informative features learned during training.
    cl.show_informative_features()
    if not os.path.isdir("./models"):
        os.mkdir("./models")
    # Persist the trained model.
    with open("./models/cl_NBC.obj", "wb") as model_file:
        pickle.dump(cl, model_file)
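
# A matching sketch for loading the pickled classifier back for inference
# (load_NBC is a hypothetical name; only the path comes from the code above).
def load_NBC():
    with open("./models/cl_NBC.obj", "rb") as model_file:
        return pickle.load(model_file)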