def test_equal_independent(self):
    # Edit distance of a sequence with itself must be 0 for both implementations.
    for seed in range(TEST_CASES):
        generator = IndependentGenerator(alphabet_size=4, seed=seed)
        random.seed(seed)
        sequence1 = generator.generate(length=random.randint(10, 100))
        distance1 = dp_edit_distance(sequence1, sequence1)
        string1 = "".join(chr(s + 97) for s in sequence1)
        distance2 = Levenshtein.distance(string1, string1)
        assert distance1 == 0, 'dp_edit_distance on equal sequences different from 0'
        assert distance2 == 0, 'Levenshtein.distance on equal sequences different from 0'
def test_100_independent(self):
    # dp_edit_distance must agree with the Levenshtein library on random pairs of
    # length-100 sequences (mapped to lowercase letters for the library call).
    for seed in range(TEST_CASES):
        generator = IndependentGenerator(alphabet_size=4, seed=seed)
        sequence1 = generator.generate(length=100)
        sequence2 = generator.generate(length=100)
        distance1 = dp_edit_distance(sequence1, sequence2)
        string1 = "".join(chr(s + 97) for s in sequence1)
        string2 = "".join(chr(s + 97) for s in sequence2)
        distance2 = Levenshtein.distance(string1, string2)
        assert distance1 == distance2, \
            'Mismatch between dp_edit_distance and Levenshtein.distance'
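
# dp_edit_distance itself is not shown in this excerpt. The reference below is a
# minimal textbook version of the Levenshtein dynamic-programming recurrence the
# tests expect it to agree with; the name reference_edit_distance is illustrative
# and not part of the codebase.
def reference_edit_distance(seq1, seq2):
    """Classic O(n * m) Levenshtein DP over two integer (or character) sequences."""
    n, m = len(seq1), len(seq2)
    # dp[i][j] = edit distance between seq1[:i] and seq2[:j]
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i  # delete all of seq1[:i]
    for j in range(m + 1):
        dp[0][j] = j  # insert all of seq2[:j]
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,                 # deletion
                           dp[i][j - 1] + 1,                 # insertion
                           dp[i - 1][j - 1] + substitution)  # match / substitution
    return dp[n][m]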
def test_100independent_5independent(self):
    # Every exact-matching algorithm must report the same match positions
    # as the brute-force baseline on a random text/pattern pair.
    for seed in range(TEST_CASES):
        generator = IndependentGenerator(alphabet_size=4, seed=seed)
        sequence = generator.generate(length=100)
        pattern = generator.generate(length=5)
        random.seed(seed)
        matches = {}
        for algorithm, match_fn in EXACT_MATCHING_ALGORITHMS.items():
            matches[algorithm] = sorted(match_fn(pattern, sequence))
        for algorithm in EXACT_MATCHING_ALGORITHMS:
            if algorithm != 'brute_force':
                assert matches[algorithm] == matches['brute_force'], \
                    f'Mismatch between brute force and {algorithm}: {matches}'
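
# The 'brute_force' entry in EXACT_MATCHING_ALGORITHMS serves as ground truth above.
# The sketch below illustrates the contract it is assumed to satisfy: return every
# start index at which the pattern occurs in the text. The function name and exact
# signature are assumptions for illustration only.
def brute_force_matching(pattern, text):
    """Return every index i with text[i:i + len(pattern)] == pattern."""
    matches = []
    for i in range(len(text) - len(pattern) + 1):
        if all(text[i + j] == pattern[j] for j in range(len(pattern))):
            matches.append(i)
    return matches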
def __init__(self, methodName):
    super().__init__(methodName)
    self.generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)
    self.dataset = EditDistanceDatasetGenerator(
        N_batches={"train": 4, "val": 2, "test": 3},
        batch_size={"train": 5, "val": 3, "test": 4},
        len_sequence={"train": 10, "val": 10, "test": 10},
        max_changes={"train": 4, "val": 4, "test": 4},
        string_generator=self.generator,
        seed=0)
def __init__(self, methodName):
    super().__init__(methodName)
    self.generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)
    self.dataset = ClosestStringDatasetGenerator(
        N_reference=10,
        N_query=15,
        len_sequence=20,
        min_changes=3,
        max_changes=10,
        initials=3,
        string_generator=self.generator,
        seed=0)
def generate_dataset_and_parser():
    # Build a small genomic edit-distance dataset in a random temporary folder
    # and prepare the matching command-line arguments.
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    edit_dataset_name = folder_name + '/test_ed.pkl'
    strings = [generate_random_dna(50)] + \
        [generate_random_dna(random.randint(10, 50)) for _ in range(19)]
    strings_dict = {'train': strings[:10], 'val': strings[10:15], 'test': strings[15:]}
    edit_dataset = EditDistanceGenomicDatasetGenerator(strings=strings_dict)
    edit_dataset.save_as_pickle(edit_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.construct_msa_tree = 'True'

    return folder_name, edit_dataset_name, args
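
# generate_random_dna is used above but not defined in this excerpt. A plausible
# minimal helper, assuming it simply draws characters uniformly from the four
# nucleotides, could look like this (illustrative only):
def generate_random_dna(length):
    """Return a uniform random DNA string of the given length."""
    return ''.join(random.choice('ACGT') for _ in range(length))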
def generate_dataset_and_parser():
    # Build a small synthetic edit-distance dataset plus a hierarchical-clustering
    # dataset in a random temporary folder, and prepare the matching CLI arguments.
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    edit_dataset_name = folder_name + '/test_ed.pkl'
    edit_dataset = EditDistanceDatasetGenerator(
        N_batches={"train": 2, "val": 2, "test": 2},
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence={"train": 10, "val": 10, "test": 10},
        max_changes={"train": 2, "val": 2, "test": 2},
        string_generator=generator,
        seed=0)
    edit_dataset.save_as_pickle(edit_dataset_name)

    hc_dataset_name = folder_name + '/test_hc.pkl'
    hc_dataset = HierarchicalClusteringDatasetGenerator(
        N_reference=3,
        N_leaves=4,
        len_sequence=10,
        min_changes=2,
        max_changes=4,
        string_generator=generator,
        seed=0)
    hc_dataset.save_as_pickle(hc_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.hierarchical_data_path = hc_dataset_name

    return folder_name, edit_dataset_name, hc_dataset_name, args
def generate_dataset_and_parser():
    folder_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    dataset_name = folder_name + '/test_ed_model.pkl'
    dataset = EditDistanceDatasetGenerator(
        N_batches={"train": 2, "val": 2, "test": 2},
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence={"train": 10, "val": 10, "test": 10},
        max_changes={"train": 2, "val": 2, "test": 2},
        string_generator=generator,
        seed=0)
    dataset.save_as_pickle(dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = dataset_name
    args.epochs = 2
    args.print_every = 1
    args.distance = "euclidean"

    return folder_name, dataset_name, args
def __init__(self, methodName):
    super().__init__(methodName)
    self.generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)
    self.dataset = HierarchicalClusteringDatasetGenerator(
        N_reference=10,
        N_leaves=15,
        len_sequence=20,
        min_changes=3,
        max_changes=10,
        string_generator=self.generator,
        seed=0)
                    help='Number of references')
parser.add_argument('--N_query', type=int, default=1000,
                    help='Number of queries')
parser.add_argument('--len_sequence', type=int, default=1024,
                    help='Length of sequences')
parser.add_argument('--min_changes', type=float, default=50,
                    help='Minimum number of mutations')
parser.add_argument('--max_changes', type=float, default=600,
                    help='Maximum number of mutations')
parser.add_argument('--initials', type=float, default=200,
                    help='Initial independently generated sequences')
parser.add_argument('--seed', type=int, default=0,
                    help='Random seed')
args = parser.parse_args()

generator = IndependentGenerator(seed=args.seed)
data = ClosestStringDatasetGenerator(
    N_reference=args.N_reference,
    N_query=args.N_query,
    len_sequence=args.len_sequence,
    min_changes=args.min_changes,
    max_changes=args.max_changes,
    seed=args.seed,
    string_generator=generator,
    initials=args.initials)
data.save_as_pickle(args.out)
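
# A downstream consumer would typically reload the dataset written by save_as_pickle.
# The snippet below assumes the file is a standard pickle (suggested by the method
# name); the filename and the structure of the stored object are assumptions and are
# not shown in this excerpt.
import pickle

with open('closest_string_dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)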