Esempio n. 1
0
 def test_parse_genome_declaration_string(self):
     """Check the genome name extracted from a GRIMM declaration line."""
     # Each pair is (declaration line, expected genome name). The cases
     # cover: a plain declaration, surrounding whitespace, underscores, a
     # second ">" inside the name, and assorted punctuation in the name.
     declaration_to_name = (
         (">genome", "genome"),
         ("  >genome  ", "genome"),
         (">genome__genome", "genome__genome"),
         (">genome>genome", "genome>genome"),
         (">genome.!/.#4", "genome.!/.#4"),
     )
     for declaration, expected_name in declaration_to_name:
         parsed = GRIMMReader.parse_genome_declaration_string(declaration)
         self.assertEqual(parsed, BGGenome(expected_name))
Esempio n. 2
0
def get_block_neighbours(grimm_file):
    """Collect the flanking blocks of every block in every genome.

    The file is scanned line by line. Each line recognised by
    ``GRIMMReader.is_genome_declaration_string`` is treated as a genome
    header and the line directly after it as that genome's block sequence;
    any other line is skipped.

    The block sequence is walked with indices wrapping modulo its length,
    so the last block is followed by the first one. Consecutive copies of
    the same block are collapsed into one tandem run, which produces a
    single record.

    Args:
        grimm_file: path of the GRIMM file to read.

    Returns:
        A nested ``defaultdict`` where ``result[block_id][genome_name]`` is
        a list of ``(left, right, tandem_copies, orientations)`` tuples:

        * ``left`` / ``right`` -- the neighbouring block id with an ``'h'``
          or ``'t'`` suffix chosen from the neighbour's orientation
          (presumably head/tail -- confirm against the consumers);
        * ``tandem_copies`` -- number of consecutive copies in the run;
        * ``orientations`` -- tuple of ``'+'``/``'-'`` signs, one per copy.

        A run whose first copy is ``'-'`` is stored mirrored: neighbours
        swapped, orientations reversed and each sign flipped.

    Raises:
        IndexError: if a genome declaration is the last line of the file
            (there is no data line to read after it).
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))
    with open(grimm_file) as f:
        lines = f.readlines()

    i = 0
    while i < len(lines):
        line = lines[i]
        if not GRIMMReader.is_genome_declaration_string(line):
            i += 1
            continue

        genome = GRIMMReader.parse_genome_declaration_string(line)
        # Second element of the parsed data line: (orientation, block) pairs.
        blocks = GRIMMReader.parse_data_string(lines[i + 1])[1]

        n = len(blocks)
        j = 0

        while j < n:
            tandem_copies = 1
            prev_or, prev_block = blocks[j % n]
            _, curr_block = blocks[(j + 1) % n]
            next_or, next_block = blocks[(j + 2) % n]

            # We are in the middle of a tandem run; move on until the
            # previous block differs, i.e. until the run's first copy.
            if curr_block == prev_block:
                j += 1
                continue

            # Extend the run to the right while the same block repeats.
            while curr_block == next_block:
                j += 1
                tandem_copies += 1
                next_or, next_block = blocks[(j + 2) % n]

            neighbours = (prev_block + ('h' if prev_or == '+' else 't'),
                          next_block + ('t' if next_or == '+' else 'h'))

            # Signs of every copy in the run, in reading order.
            orientations = tuple(
                blocks[(k + 1) % n][0]
                for k in range(j - tandem_copies + 1, j + 1))

            if orientations[0] == '-':
                # Mirror the record: swap the flanks, reverse the run and
                # flip every sign. (The previous code mapped both signs to
                # '+', losing the orientation information.)
                neighbours = (neighbours[1], neighbours[0])
                orientations = tuple('+' if or_ == '-' else '-'
                                     for or_ in orientations[::-1])

            block_neighbours[int(curr_block)][genome.name].append(
                (*neighbours, tandem_copies, orientations))

            j += 1

        # Skip both the declaration line and its data line.
        i += 2

    return block_neighbours
def get_genomes_contain_blocks_grimm(grimm_file):
    """Summarise which genomes and blocks occur in a GRIMM file.

    The file is read as strictly alternating lines: every even-indexed
    line is parsed as a genome declaration and the line after it as that
    genome's block data. An odd number of lines therefore raises
    ``IndexError`` on the final, unpaired line.

    Args:
        grimm_file: path of the GRIMM file to read.

    Returns:
        A 3-tuple ``(genomes, blocks, block_genome_count)``: the sorted
        list of genome names, the sorted list of integer block ids, and a
        ``defaultdict(Counter)`` where ``block_genome_count[block][genome]``
        is how many times the block occurs in data lines of that genome.
    """
    genome_names = set()
    block_ids = set()

    with open(grimm_file) as handle:
        lines = handle.readlines()
    copies_per_block = defaultdict(Counter)

    for header_idx in range(0, len(lines), 2):
        genome = GRIMMReader.parse_genome_declaration_string(lines[header_idx])
        genome_name = genome.name
        signed_blocks = GRIMMReader.parse_data_string(lines[header_idx + 1])[1]
        genome_names.add(genome_name)
        for _, raw_block in signed_blocks:
            block_id = int(raw_block)
            block_ids.add(block_id)
            copies_per_block[block_id][genome_name] += 1

    return sorted(genome_names), sorted(block_ids), copies_per_block
Esempio n. 4
0
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info(
            "Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(
                        data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(
                        data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(
                            args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(
                        data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
            good_genomes = [
                genome_name.split(args.trimmer_char, 1)[0]
                for genome_name in good_genomes
            ]
Esempio n. 5
0
    logger.info(full_description)
    logger.info(parser.format_values())
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info("Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
            good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes]
        for genome_name in list(genomes.keys()):
            if genome_name not in good_genomes:
                del genomes[genome_name]
    if args.bad_genomes != "":
        bad_genomes = args.bad_genomes.split(",")