Code example #1
0
    def test_equal_independent(self):
        """A sequence compared against itself must have edit distance zero."""
        for seed in range(TEST_CASES):
            gen = IndependentGenerator(alphabet_size=4, seed=seed)
            random.seed(seed)

            seq = gen.generate(length=random.randint(10, 100))
            dp_result = dp_edit_distance(seq, seq)

            # Map symbols 0..3 onto letters 'a'.. so the reference
            # library can compare the same data as strings.
            text = "".join(chr(symbol + 97) for symbol in seq)
            lib_result = Levenshtein.distance(text, text)

            assert dp_result == 0, 'dp_edit_distance on equal sequences different from 0'
            assert lib_result == 0, 'Levenshtein.distance on equal sequences different from 0'
Code example #2
0
    def test_100_independent(self):
        """dp_edit_distance must match the Levenshtein library on random pairs."""
        for seed in range(TEST_CASES):
            gen = IndependentGenerator(alphabet_size=4, seed=seed)
            seq_a = gen.generate(length=100)
            seq_b = gen.generate(length=100)

            dp_result = dp_edit_distance(seq_a, seq_b)

            # Encode both integer sequences as lowercase strings for the
            # reference implementation.
            text_a = "".join(chr(symbol + 97) for symbol in seq_a)
            text_b = "".join(chr(symbol + 97) for symbol in seq_b)
            lib_result = Levenshtein.distance(text_a, text_b)

            assert dp_result == lib_result, \
                'Mismatch between dp_edit_distance and Levenshtein.distance'
Code example #3
0
    def test_100independent_5independent(self):
        """Every exact-matching algorithm must agree with brute force."""
        for seed in range(TEST_CASES):
            gen = IndependentGenerator(alphabet_size=4, seed=seed)
            text = gen.generate(length=100)
            needle = gen.generate(length=5)

            random.seed(seed)
            # Collect (sorted) match positions from every registered algorithm.
            matches = {
                name: sorted(search(needle, text))
                for name, search in EXACT_MATCHING_ALGORITHMS.items()
            }

            reference = matches['brute_force']
            for name in matches:
                if name == 'brute_force':
                    continue
                assert matches[name] == reference, \
                    'Mismatch between brute force and ' + name + ": " + str(matches)
Code example #4
0
 def __init__(self, methodName):
     """Build a small, fully deterministic edit-distance dataset fixture."""
     super().__init__(methodName)
     self.generator = IndependentGenerator(
         alphabet_size=ALPHABET_SIZE, seed=0)
     # Tiny per-split sizes keep the tests fast; seed=0 keeps them repeatable.
     self.dataset = EditDistanceDatasetGenerator(
         N_batches={"train": 4, "val": 2, "test": 3},
         batch_size={"train": 5, "val": 3, "test": 4},
         len_sequence={"train": 10, "val": 10, "test": 10},
         max_changes={"train": 4, "val": 4, "test": 4},
         string_generator=self.generator,
         seed=0)
Code example #5
0
 def __init__(self, methodName):
     """Prepare a deterministic closest-string dataset for the tests."""
     super().__init__(methodName)
     self.generator = IndependentGenerator(
         alphabet_size=ALPHABET_SIZE, seed=0)
     # Fixed seed so each test method sees exactly the same queries
     # and references.
     self.dataset = ClosestStringDatasetGenerator(
         N_reference=10, N_query=15,
         len_sequence=20,
         min_changes=3, max_changes=10,
         initials=3,
         string_generator=self.generator,
         seed=0)
Code example #6
0
def generate_dataset_and_parser():
    """Write a throwaway genomic edit-distance dataset and build CLI args.

    Returns:
        tuple: (folder_name, edit_dataset_name, args) — the random scratch
        folder, the pickle path of the saved dataset, and parsed arguments
        pointing at it with a short 2-epoch configuration.
    """
    # Random scratch folder name so concurrent test runs do not collide.
    folder_name = ''.join(
        random.choice(string.ascii_lowercase) for _ in range(10))
    edit_dataset_name = folder_name + '/test_ed.pkl'

    # NOTE(review): the original also built an unused
    # IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0); the data
    # actually comes from generate_random_dna, so the dead local is removed.
    # One fixed-length string plus 19 variable-length ones, split 10/5/5.
    strings = [generate_random_dna(50)] + [
        generate_random_dna(random.randint(10, 50)) for _ in range(19)]
    strings_dict = {
        'train': strings[:10], 'val': strings[10:15], 'test': strings[15:]}
    edit_dataset = EditDistanceGenomicDatasetGenerator(strings=strings_dict)
    edit_dataset.save_as_pickle(edit_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.construct_msa_tree = 'True'
    return folder_name, edit_dataset_name, args
Code example #7
0
def generate_dataset_and_parser():
    """Write edit-distance and hierarchical-clustering test datasets to disk.

    Returns the scratch folder name, both pickle paths, and parsed CLI
    arguments wired to the generated files.
    """
    # Random scratch folder so parallel runs don't clash.
    folder_name = ''.join(
        random.choice(string.ascii_lowercase) for _ in range(10))
    generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)

    # Small edit-distance dataset: identical sequence length per split.
    edit_dataset_name = folder_name + '/test_ed.pkl'
    edit_dataset = EditDistanceDatasetGenerator(
        N_batches={"train": 2, "val": 2, "test": 2},
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence={"train": 10, "val": 10, "test": 10},
        max_changes={"train": 2, "val": 2, "test": 2},
        string_generator=generator,
        seed=0)
    edit_dataset.save_as_pickle(edit_dataset_name)

    # Companion hierarchical-clustering dataset in the same folder.
    hc_dataset_name = folder_name + '/test_hc.pkl'
    hc_dataset = HierarchicalClusteringDatasetGenerator(
        N_reference=3,
        N_leaves=4,
        len_sequence=10,
        min_changes=2,
        max_changes=4,
        string_generator=generator,
        seed=0)
    hc_dataset.save_as_pickle(hc_dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = edit_dataset_name
    args.epochs = 2
    args.print_every = 1
    args.hierarchical_data_path = hc_dataset_name
    return folder_name, edit_dataset_name, hc_dataset_name, args
Code example #8
0
def generate_dataset_and_parser():
    """Write a small edit-distance dataset pickle and build matching CLI args."""
    # Random scratch folder name avoids collisions between runs.
    folder_name = ''.join(
        random.choice(string.ascii_lowercase) for _ in range(10))
    generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)
    dataset_name = folder_name + '/test_ed_model.pkl'

    # Tiny deterministic dataset (seed=0) so the model test runs quickly.
    dataset = EditDistanceDatasetGenerator(
        N_batches={"train": 2, "val": 2, "test": 2},
        batch_size={"train": 5, "val": 3, "test": 3},
        len_sequence={"train": 10, "val": 10, "test": 10},
        max_changes={"train": 2, "val": 2, "test": 2},
        string_generator=generator,
        seed=0)
    dataset.save_as_pickle(dataset_name)

    parser = general_arg_parser()
    args = parser.parse_args()
    args.data = dataset_name
    args.epochs = 2
    args.print_every = 1
    args.distance = "euclidean"
    return folder_name, dataset_name, args
Code example #9
0
 def __init__(self, methodName):
     """Build a deterministic hierarchical-clustering dataset fixture."""
     super().__init__(methodName)
     self.generator = IndependentGenerator(alphabet_size=ALPHABET_SIZE, seed=0)
     # seed=0 everywhere keeps the tree reproducible across test runs.
     self.dataset = HierarchicalClusteringDatasetGenerator(
         N_reference=10,
         N_leaves=15,
         len_sequence=20,
         min_changes=3,
         max_changes=10,
         string_generator=self.generator,
         seed=0)
Code example #10
0
                        help='Number of references')
    parser.add_argument('--N_query',
                        type=int,
                        default=1000,
                        help='Number of queries')
    parser.add_argument('--len_sequence',
                        type=int,
                        default=1024,
                        help='Length of sequences')
    parser.add_argument('--min_changes',
                        type=float,
                        default=50,
                        help='Minimum number of mutations')
    parser.add_argument('--max_changes',
                        type=float,
                        default=600,
                        help='Maximum number of mutations')
    parser.add_argument('--initials',
                        type=float,
                        default=200,
                        help='Initial independently generated sequences')
    parser.add_argument('--seed', type=int, default=0, help='Random seed')
    args = parser.parse_args()

    generator = IndependentGenerator(seed=args.seed)
    data = ClosestStringDatasetGenerator\
        (N_reference=args.N_reference, N_query=args.N_query,
         len_sequence=args.len_sequence, min_changes=args.min_changes, max_changes=args.max_changes,
         seed=args.seed, string_generator=generator, initials=args.initials)
    data.save_as_pickle(args.out)