def _iter_genome_records(lines):
    """Yield ``(declaration_line, blocks)`` for every genome declaration in *lines*.

    The line directly after a declaration is taken as its data line and parsed
    with ``GRIMMReader.parse_data_string``; only the second element of that
    result (the parsed blocks) is yielded. Lines that are not genome
    declarations are skipped.

    Raises:
        IndexError: if a declaration is the very last line (no data line follows).
    """
    i = 0
    while i < len(lines):
        line = lines[i]
        if GRIMMReader.is_genome_declaration_string(line):
            yield line, GRIMMReader.parse_data_string(lines[i + 1])[1]
            # The data line was consumed together with its declaration.
            i += 2
        else:
            i += 1


def grimm_filter_unique_gene(in_file, out_file):
    """Rewrite a GRIMM file keeping only the blocks retained by ``Unique_Filter``.

    The input is scanned twice: the first pass feeds every genome's blocks to
    ``Unique_Filter.update_allowed_blocks``; the second pass writes each genome
    declaration followed by the blocks returned by ``filter_unique``.

    Args:
        in_file: path of the GRIMM file to read.
        out_file: path of the filtered GRIMM file to write (overwritten).

    Returns:
        The filter's ``allowed_blocks`` converted to a list of ints.

    Note:
        Every written data line is terminated with ``@``; the terminator parsed
        from the input line is not carried over.
    """
    # Read through a context manager so the input handle is closed right away
    # instead of being left for the garbage collector.
    with open(in_file) as source:
        lines = source.read().split('\n')

    # First pass: let the filter see the blocks of every genome.
    flt = Unique_Filter()
    for _, blocks in _iter_genome_records(lines):
        flt.update_allowed_blocks(blocks)

    # Second pass: write each declaration with its filtered blocks.
    with open(out_file, 'w') as f:
        for declaration, blocks in _iter_genome_records(lines):
            kept = flt.filter_unique(blocks)

            print(declaration, file=f)
            # Each block is written as orientation sign immediately followed by its name.
            print(' '.join(p[0] + p[1] for p in kept), '@', file=f)

    return list(map(int, flt.allowed_blocks))
Esempio n. 2
0
    def test_parse_data_string_correct(self):
        # A data string is parsed into the fragment terminator plus the ordered blocks (genes),
        # each paired with its individual orientation; string based processing is performed.
        # If no orientation is specified explicitly, positive orientation is assumed.

        # Single-gene strings: only the terminator symbol differs between the cases.
        single_gene_cases = (
            ("a $", "$"),
            ("a @", "@"),
        )
        for data_string, terminator in single_gene_cases:
            result = GRIMMReader.parse_data_string(data_string)
            self.assertEqual(result[0], terminator)
            self.assertEqual(result[1][0][0], "+")
            self.assertEqual(result[1][0][1], "a")
            self.assertEqual(len(result[0]), 1)
            self.assertEqual(len(result[1]), 1)

        # Multi-gene strings: surrounding whitespace and everything after the first
        # terminator must not affect the parsed genes or their signs.
        multi_gene_cases = (
            ("     a -b c -d @ e f     ", "@"),
            ("     a -b +c -d $ e f     ", "$"),
            ("     a -b c -d @ e f $ g -h    ", "@"),
        )
        for data_string, terminator in multi_gene_cases:
            result = GRIMMReader.parse_data_string(data_string)
            self.assertEqual(result[0], terminator)
            reference_genes = ["a", "b", "c", "d"]
            result_genes = [entry[1] for entry in result[1]]
            reference_signs = ["+", "-", "+", "-"]
            result_signs = [entry[0] for entry in result[1]]
            self.assertListEqual(result_genes, reference_genes)
            self.assertListEqual(result_signs, reference_signs)
Esempio n. 3
0
 def test_parse_data_string_error(self):
     # A data string must contain a fragment termination symbol ($ or @) and
     # space separated gene order information before that symbol.
     # Every string below is expected to be rejected with ValueError.
     malformed_strings = (
         "   a b c d e    ",
         "",
         " a -b -c d -e ",
         "$",
         "@",
         "@ a d s d",
         "$a d s d",
         "$-a d s d",
         "@+a d s d",
         "a b - -c d e $",
     )
     for data_string in malformed_strings:
         with self.assertRaises(ValueError):
             GRIMMReader.parse_data_string(data_string)
Esempio n. 4
0
def get_block_neighbours(grimm_file):
    """Collect the neighbouring block ends of every block occurrence in a GRIMM file.

    For each genome declaration the following line is parsed as its data line.
    Positions are indexed modulo the number of blocks, so the first and last
    blocks of a data line are treated as adjacent. A run of consecutive equal
    block names (tandem copies) is reported as a single occurrence.

    Args:
        grimm_file: path of the GRIMM file to read.

    Returns:
        A nested ``defaultdict``: ``int(block) -> genome name -> list`` of tuples
        ``(left_neighbour, right_neighbour, tandem_copies, orientations)``.
        A neighbour is the adjacent block's name suffixed with ``'h'`` or ``'t'``
        (the end of that block facing the current one); ``orientations`` holds
        one sign per tandem copy.

    Raises:
        IndexError: if a genome declaration is the last line of the file.
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))
    with open(grimm_file) as f:
        ls = f.readlines()

    i = 0
    while i < len(ls):
        line = ls[i]
        if GRIMMReader.is_genome_declaration_string(line):
            genome = GRIMMReader.parse_genome_declaration_string(line)
            data_line = ls[i + 1]
            bs = GRIMMReader.parse_data_string(data_line)[1]

            n = len(bs)
            j = 0

            while j < n:
                tandem_copies = 1
                # Window of three consecutive positions; the middle one is the
                # block being described.
                prev_or, prev_block = bs[j % n]
                _, curr_block = bs[(j + 1) % n]
                next_or, next_block = bs[(j + 2) % n]

                # Not at the start of a tandem run: the run is reported from
                # its first copy only.
                if curr_block == prev_block:
                    j += 1
                    continue

                # Extend over all tandem copies until a different block follows.
                while curr_block == next_block:
                    j += 1
                    tandem_copies += 1
                    next_or, next_block = bs[(j + 2) % n]

                neighbours = (prev_block + ('h' if prev_or == '+' else 't'),
                              next_block + ('t' if next_or == '+' else 'h'))

                # Orientation signs of every copy in the run, in data-line order.
                orientations = tuple(bs[(k + 1) % n][0]
                                     for k in range(j - tandem_copies + 1, j +
                                                    1))

                if orientations[0] == '-':
                    # View the run from the opposite strand: swap the two
                    # neighbours, reverse the copy order and flip every sign.
                    # (Both branches used to yield '+', which discarded the
                    # orientation of mixed-sign runs.)
                    neighbours = (neighbours[1], neighbours[0])
                    orientations = tuple('+' if or_ == '-' else '-'
                                         for or_ in orientations[::-1])

                block_neighbours[int(curr_block)][genome.name].append(
                    (*neighbours, tandem_copies, orientations))

                j += 1

            # Skip the data line that belonged to this declaration.
            i += 2
        else:
            i += 1

    return block_neighbours
def get_genomes_contain_blocks_grimm(grimm_file):
    """Summarise which genomes and blocks occur in a GRIMM file.

    The file is consumed as strict pairs of lines: a genome declaration
    followed by its data line.

    Args:
        grimm_file: path of the GRIMM file to read.

    Returns:
        A 3-tuple ``(genomes, blocks, block_genome_count)``: the sorted genome
        names, the sorted block ids (as ints), and a ``defaultdict(Counter)``
        mapping ``block id -> genome name -> number of occurrences``.
    """
    genome_names = set()
    block_ids = set()

    with open(grimm_file) as handle:
        lines = handle.readlines()
    block_genome_count = defaultdict(Counter)

    for start in range(0, len(lines), 2):
        genome_name = GRIMMReader.parse_genome_declaration_string(lines[start]).name
        parsed_blocks = GRIMMReader.parse_data_string(lines[start + 1])[1]
        genome_names.add(genome_name)
        for _orientation, block in parsed_blocks:
            block_id = int(block)
            block_ids.add(block_id)
            block_genome_count[block_id][genome_name] += 1

    return sorted(genome_names), sorted(block_ids), block_genome_count
Esempio n. 6
0
     with open(file_name, "rt") as source:
         current_genome = None
         for line in source:
             line = line.strip()
             if len(line) == 0 or GRIMMReader.is_comment_string(
                     data_string=line):
                 continue
             if GRIMMReader.is_genome_declaration_string(data_string=line):
                 current_genome = GRIMMReader.parse_genome_declaration_string(
                     data_string=line).name
                 if args.trim_names:
                     current_genome = current_genome.split(
                         args.trimmer_char, 1)[0]
             elif current_genome is not None:
                 current_chromosome = []
                 chr_type, blocks = GRIMMReader.parse_data_string(
                     data_string=line)
                 genomes[current_genome].append((chr_type, blocks))
 if args.good_genomes != "":
     good_genomes = args.good_genomes.split(",")
     if args.trim_names:
         good_genomes = [
             genome_name.split(args.trimmer_char, 1)[0]
             for genome_name in good_genomes
         ]
     for genome_name in list(genomes.keys()):
         if genome_name not in good_genomes:
             del genomes[genome_name]
 if args.bad_genomes != "":
     bad_genomes = args.bad_genomes.split(",")
     if args.trim_names:
         bad_genomes = [
Esempio n. 7
0
 # Read every GRIMM file listed in args.grimm into `genomes`:
 # genome name -> list of (chr_type, blocks) tuples, one per data line, in file order.
 genomes = defaultdict(list)
 for file_name in args.grimm:
     logger.info("Processing file \"{file_name}\"".format(file_name=file_name))
     with open(file_name, "rt") as source:
         # Name of the genome whose data lines are currently being read;
         # None until the first declaration line of this file is seen.
         current_genome = None
         for line in source:
             line = line.strip()
             # Blank lines and comment lines carry no data.
             if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line):
                 continue
             if GRIMMReader.is_genome_declaration_string(data_string=line):
                 current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name
                 if args.trim_names:
                     # Keep only the part of the name before the first trimmer character.
                     current_genome = current_genome.split(args.trimmer_char, 1)[0]
             elif current_genome is not None:
                 # Data lines that precede any declaration are ignored (current_genome is None).
                 # NOTE(review): current_chromosome is assigned but never read in the visible code.
                 current_chromosome = []
                 chr_type, blocks = GRIMMReader.parse_data_string(data_string=line)
                 genomes[current_genome].append((chr_type, blocks))
 # Optional whitelist: a comma separated list of genome names to keep; all others are dropped.
 if args.good_genomes != "":
     good_genomes = args.good_genomes.split(",")
     if args.trim_names:
         # Trim the listed names the same way the parsed names were trimmed.
         good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes]
     # Iterate over a snapshot of the keys so entries can be deleted while looping.
     for genome_name in list(genomes.keys()):
         if genome_name not in good_genomes:
             del genomes[genome_name]
 # Optional blacklist: a comma separated list of genome names to drop.
 if args.bad_genomes != "":
     bad_genomes = args.bad_genomes.split(",")
     if args.trim_names:
         bad_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in bad_genomes]
     # Iterate over a snapshot of the keys so entries can be deleted while looping.
     for genome_name in list(genomes.keys()):
         if genome_name in bad_genomes:
             del genomes[genome_name]