def grimm_filter_unique_gene(in_file, out_file):
    """Write a copy of a GRIMM file that keeps only the filter-approved blocks.

    The input is scanned twice. The first pass feeds the blocks of every
    genome into a ``Unique_Filter`` so it can decide which blocks are
    allowed; the second pass writes each genome declaration followed by its
    filtered block line (terminated with ``@``) to ``out_file``.

    NOTE(review): judging by the names, the filter presumably keeps blocks
    that are unique within every genome -- confirm against ``Unique_Filter``.

    :param in_file: path of the GRIMM file to read.
    :param out_file: path of the filtered GRIMM file to write.
    :return: ``flt.allowed_blocks`` converted to a list of ``int``.
    :raises IndexError: if a genome declaration is the last line of the
        input, i.e. it has no data line after it.
    """
    # Read through a context manager so the input handle is closed right
    # away instead of being left for the garbage collector.
    with open(in_file) as source:
        lines = source.read().split('\n')

    def genome_records():
        """Yield ``(declaration_line, data_line)`` for each genome in ``lines``."""
        i = 0
        while i < len(lines):
            line = lines[i]
            if GRIMMReader.is_genome_declaration_string(line):
                # The line right after a declaration is taken as its data.
                yield line, lines[i + 1]
                i += 2
            else:
                i += 1

    # First pass: let the filter see every genome before anything is written.
    flt = Unique_Filter()
    for _, data_line in genome_records():
        flt.update_allowed_blocks(GRIMMReader.parse_data_string(data_line)[1])

    # Second pass: write the declarations and the surviving blocks.
    with open(out_file, 'w') as f:
        for declaration, data_line in genome_records():
            parsed = GRIMMReader.parse_data_string(data_line)[1]
            parsed = flt.filter_unique(parsed)

            print(declaration, file=f)
            # Each parsed entry is a pair whose two parts are concatenated
            # (presumably orientation sign followed by block name).
            print(' '.join(p[0] + p[1] for p in parsed), '@', file=f)

    return list(map(int, flt.allowed_blocks))
Ejemplo n.º 2
0
def get_block_neighbours(grimm_file):
    """Collect, per block and per genome, the neighbours of every block run.

    Each genome declaration line in ``grimm_file`` is expected to be followed
    by one data line. The blocks of that line are walked with wrap-around
    indexing (``% n``), so the first and last blocks are treated as adjacent.
    Consecutive copies of the same block are collapsed into a single tandem
    run.

    :param grimm_file: path of the GRIMM file to read.
    :return: nested ``defaultdict`` such that
        ``result[int(block)][genome_name]`` is a list of tuples
        ``(left_neighbour, right_neighbour, tandem_copies, orientations)``.
        A neighbour is the neighbouring block id with an ``'h'`` or ``'t'``
        suffix chosen from that neighbour's orientation; ``orientations``
        is a tuple of ``'+'``/``'-'`` signs, one per copy in the run.
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))
    with open(grimm_file) as f:
        ls = f.readlines()

    i = 0
    while i < len(ls):
        line = ls[i]
        if GRIMMReader.is_genome_declaration_string(line):
            genome = GRIMMReader.parse_genome_declaration_string(line)
            data_line = ls[i + 1]
            bs = GRIMMReader.parse_data_string(data_line)[1]

            n = len(bs)
            j = 0

            while j < n:
                tandem_copies = 1
                # Window of three consecutive blocks; the middle one is the
                # block whose neighbours are being recorded.
                prev_or, prev_block = bs[j % n]
                _, curr_block = bs[(j + 1) % n]
                next_or, next_block = bs[(j + 2) % n]

                # Only start a record at the first copy of a tandem run.
                if curr_block == prev_block:
                    j += 1
                    continue

                # Extend the run over all following copies of the same block.
                while curr_block == next_block:
                    j += 1
                    tandem_copies += 1
                    next_or, next_block = bs[(j + 2) % n]

                neighbours = (prev_block + ('h' if prev_or == '+' else 't'),
                              next_block + ('t' if next_or == '+' else 'h'))

                # Orientation sign of each copy in the run, in file order.
                orientations = tuple(
                    bs[(k + 1) % n][0]
                    for k in range(j - tandem_copies + 1, j + 1))

                # A run that starts with '-' is recorded as read from the
                # opposite strand: neighbours swap sides, the copies are
                # visited in reverse order and every sign is inverted.
                if orientations[0] == '-':
                    neighbours = (neighbours[1], neighbours[0])
                    orientations = tuple('+' if or_ == '-' else '-'
                                         for or_ in orientations[::-1])

                block_neighbours[int(curr_block)][genome.name].append(
                    (*neighbours, tandem_copies, orientations))

                j += 1

            i += 2
        else:
            i += 1

    return block_neighbours
Ejemplo n.º 3
0
 def test_is_genome_declaration_string(self):
     # A string counts as a genome declaration when its first non-whitespace
     # character is ">" and a non-empty genome name follows it.
     accepted = (
         ">genome",
         "    >genome",
         "  \t  >genome",
         ">genome   \t",
         ">genome   ",
         "   >genome   ",
     )
     # Rejected: something other than whitespace precedes ">", there is no
     # ">" at all, or the genome name after ">" is empty.
     rejected = (
         "\tt   >genome",
         "  t\t>genome",
         "  t>genome",
         "genome",
         ">",
         "     >   ",
         "     >",
         ">   ",
     )
     for candidate in accepted:
         self.assertTrue(
             GRIMMReader.is_genome_declaration_string(candidate))
     for candidate in rejected:
         self.assertFalse(
             GRIMMReader.is_genome_declaration_string(candidate))
Ejemplo n.º 4
0
    logger.info(parser.format_values())
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info(
            "Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(
                        data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(
                        data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(
                            args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(
                        data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
            good_genomes = [
                genome_name.split(args.trimmer_char, 1)[0]
Ejemplo n.º 5
0
    logger.addHandler(ch)
    logger.info(full_description)
    logger.info(parser.format_values())
    ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
    logger.info("Starting the converting process")

    genomes = defaultdict(list)
    for file_name in args.grimm:
        logger.info("Processing file \"{file_name}\"".format(file_name=file_name))
        with open(file_name, "rt") as source:
            current_genome = None
            for line in source:
                line = line.strip()
                if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line):
                    continue
                if GRIMMReader.is_genome_declaration_string(data_string=line):
                    current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name
                    if args.trim_names:
                        current_genome = current_genome.split(args.trimmer_char, 1)[0]
                elif current_genome is not None:
                    current_chromosome = []
                    chr_type, blocks = GRIMMReader.parse_data_string(data_string=line)
                    genomes[current_genome].append((chr_type, blocks))
    if args.good_genomes != "":
        good_genomes = args.good_genomes.split(",")
        if args.trim_names:
            good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes]
        for genome_name in list(genomes.keys()):
            if genome_name not in good_genomes:
                del genomes[genome_name]
    if args.bad_genomes != "":