def generate_dataset_and_parser():
    """Generate a genomic edit-distance dataset pickle and the matching CLI args.

    Creates a randomly named folder path, builds a 20-string DNA dataset split
    into train/val/test, saves it as a pickle, and returns an argparse namespace
    pre-configured for a short (2-epoch) training run with MSA-tree construction.

    Returns:
        tuple: (folder_name, edit_dataset_name, args) — the random folder prefix,
        the pickle path, and the parsed/patched argument namespace.
    """
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    edit_dataset_name = folder_name + '/test_ed.pkl'
    # NOTE: the original also built an unused IndependentGenerator here; the
    # strings in this variant come solely from generate_random_dna, so it was
    # dead code and has been removed.
    # One fixed-length string (50 bases) plus 19 of variable length (10-50 bases).
    strings = [generate_random_dna(50)] + [generate_random_dna(random.randint(10, 50)) for _ in range(19)]
    strings_dict = {'train': strings[:10], 'val': strings[10:15], 'test': strings[15:]}
    edit_dataset = EditDistanceGenomicDatasetGenerator(strings=strings_dict)
    edit_dataset.save_as_pickle(edit_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.construct_msa_tree = 'True'
    return folder_name, edit_dataset_name, args
def generate_dataset_and_parser():
    """Build small edit-distance and hierarchical-clustering test datasets.

    Generates both datasets from the same seeded IndependentGenerator, pickles
    them under a randomly named folder prefix, and returns an argparse namespace
    configured for a short (2-epoch) run that also points at the hierarchical
    dataset.

    Returns:
        tuple: (folder_name, edit_dataset_name, hc_dataset_name, args).
    """
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    string_gen = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    # Edit-distance dataset: 2 batches per split, 10-char sequences.
    splits = ("train", "val", "test")
    edit_dataset_name = folder_name + '/test_ed.pkl'
    edit_dataset = EditDistanceDatasetGenerator(
        N_batches=dict.fromkeys(splits, 2),
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence=dict.fromkeys(splits, 10),
        max_changes=dict.fromkeys(splits, 2),
        string_generator=string_gen,
        seed=0,
    )
    edit_dataset.save_as_pickle(edit_dataset_name)

    # Hierarchical-clustering dataset: 3 references, 4 leaves each.
    hc_dataset_name = folder_name + '/test_hc.pkl'
    hc_dataset = HierarchicalClusteringDatasetGenerator(
        N_reference=3,
        N_leaves=4,
        len_sequence=10,
        min_changes=2,
        max_changes=4,
        string_generator=string_gen,
        seed=0,
    )
    hc_dataset.save_as_pickle(hc_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.hierarchical_data_path = hc_dataset_name
    return folder_name, edit_dataset_name, hc_dataset_name, args
def generate_dataset_and_parser():
    """Create an edit-distance dataset pickle and args for a euclidean-distance run.

    Builds a small seeded dataset (2 batches per split, 10-char sequences),
    saves it under a randomly named folder prefix, and returns an argparse
    namespace configured for a 2-epoch run with the euclidean distance.

    Returns:
        tuple: (folder_name, dataset_name, args).
    """
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    string_gen = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    dataset_name = folder_name + '/test_ed_model.pkl'
    splits = ("train", "val", "test")
    dataset = EditDistanceDatasetGenerator(
        N_batches=dict.fromkeys(splits, 2),
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence=dict.fromkeys(splits, 10),
        max_changes=dict.fromkeys(splits, 2),
        string_generator=string_gen,
        seed=0,
    )
    dataset.save_as_pickle(dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = dataset_name
    args.epochs = 2
    args.print_every = 1
    args.distance = "euclidean"
    return folder_name, dataset_name, args
"""Entry-point script: train an MLPEncoder on an edit-distance dataset.

Extends the shared training argument parser with MLP-specific
hyperparameters, validates the string-encoded boolean flag, and launches
training via execute_train.
"""
from edit_distance.models.feedforward.model import MLPEncoder
from edit_distance.train import execute_train, general_arg_parser
from util.data_handling.data_loader import BOOL_CHOICE

parser = general_arg_parser()
parser.add_argument('--layers', type=int, default=1, help='Number of fully connected layers')
parser.add_argument('--hidden_size', type=int, default=100, help='Size of hidden layers')
# Boolean flag passed as a string ('True'/'False'), matching BOOL_CHOICE.
parser.add_argument('--batch_norm', type=str, default='False', help='Batch normalization')
args = parser.parse_args()

assert args.batch_norm in BOOL_CHOICE, "Boolean values have to be either 'True' or 'False' "

execute_train(model_class=MLPEncoder,
              model_args=dict(layers=args.layers,
                              hidden_size=args.hidden_size,
                              # Convert the string flag to a bool directly;
                              # the redundant `True if ... else False` ternary
                              # was simplified to the comparison itself.
                              batch_norm=args.batch_norm == 'True'),
              args=args)