Ejemplo n.º 1
0
def make_gaps_tree(in_file):
    """
    Build one interval tree of gap coordinates for every record of a FASTA file.

    :param in_file: path handed to SeqReader. When the name ends with ".gz" the
        reader's parse_gzip_fasta() is used, otherwise parse_fasta().
    :return: dict mapping each record name (header with its first character
        dropped and cut at the first space) to an IntervalTree. Every interval
        spans the (start, end) of a match reported by
        GapSequence.get_gap_coords() and stores that same tuple as its data.
    """
    reader = SeqReader(in_file)
    # Both parsers are consumed as (header, sequence) pairs, so choose one up
    # front and run a single loop over it.
    if in_file.endswith(".gz"):
        records = reader.parse_gzip_fasta()
    else:
        records = reader.parse_fasta()

    trees_by_name = {}
    for raw_header, sequence in records:
        # Drop the leading '>' and keep only the first space-delimited token.
        name = raw_header[1:].split(' ')[0]
        tree = IntervalTree()
        trees_by_name[name] = tree
        gap_spans = [(match.start(0), match.end(0))
                     for match in GapSequence(sequence).get_gap_coords()]
        for start, end in gap_spans:
            tree[start:end] = (start, end)
    return trees_by_name
Ejemplo n.º 2
0
def read_gz_contigs(in_file):
    """
    Load all records yielded by SeqReader.parse_gzip_fasta() into a dictionary.

    :param in_file: path handed to SeqReader.
    :return: dict mapping each record name to its sequence. The name is the
        header with every '>' removed, cut at the first space. If two records
        share a name, the later one wins.
    """
    reader = SeqReader(in_file)
    return {
        raw_header.replace('>', '').split(' ')[0]: sequence
        for raw_header, sequence in reader.parse_gzip_fasta()
    }
Ejemplo n.º 3
0
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size, chr0=True):
    """
    Join ordered contigs into pseudomolecules and write them to 'ragoo.fasta'.

    For every reference chromosome named in ``in_unique_contigs``, the contigs
    listed in 'orderings/<chrom>_orderings.txt' are concatenated in order
    (reverse complemented when their orientation column is '-'), separated by
    ``gap_size`` 'N' characters. Contigs that no orderings file mentions are
    either merged into a single 'Chr0' sequence or emitted individually, and
    matching groupings/orderings files are written for them.

    NOTE: the original author flagged that a translation table for easy
    lift-over still needs to be made; this function does not produce one.

    :param in_contigs_file: contigs FASTA path. It is opened as
        '../' + in_contigs_file, i.e. relative to the parent of the current
        working directory. Names ending in ".gz" are read with
        parse_gzip_fasta(), all others with parse_fasta().
    :param in_unique_contigs: mapping whose values expose a ``ref_chrom``
        attribute; only the set of those chromosome names is used here.
    :param gap_size: number of 'N' characters placed between adjacent contigs.
    :param chr0: when True (default) all unplaced contigs are joined into one
        'Chr0' sequence; when False each unplaced contig becomes its own
        output sequence.
    :return: None. Results are written to 'ragoo.fasta' and to files under
        'groupings/' and 'orderings/' in the current working directory.
    :raises ValueError: if an orderings file names a contig that is not in the
        contigs file or that has already been placed.
    """
    # First, read all of the contigs into memory, keyed by the first
    # space-delimited token of the header. The key keeps the header's leading
    # character, which is why lookups below prepend '>'.
    all_seqs = OrderedDict()
    x = SeqReader('../' + in_contigs_file)
    if in_contigs_file.endswith(".gz"):
        records = x.parse_gzip_fasta()
    else:
        records = x.parse_fasta()
    for header, seq in records:
        all_seqs[header.split(' ')[0]] = seq

    # Contigs not yet placed. An ordered dict keeps the file order for the
    # Chr0 output while giving O(1) removal (a list needs an O(n) index + pop
    # for every placed contig).
    remaining_contig_headers = OrderedDict.fromkeys(all_seqs)

    # Get all reference chromosomes
    all_chroms = sorted({contig.ref_chrom for contig in in_unique_contigs.values()})

    # Iterate through each orderings file and store sequence in a dictionary
    all_pms = dict()
    pad = 'N' * gap_size
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        if not orderings:
            continue

        seq_list = []
        for line in orderings:
            contig_key = '>' + line[0]
            # Mark that we have seen this contig
            if contig_key not in remaining_contig_headers:
                raise ValueError(
                    "contig '" + line[0] + "' in orderings for " + this_chrom +
                    " is unknown or was already placed"
                )
            del remaining_contig_headers[contig_key]

            if line[1] == '+':
                seq_list.append(all_seqs[contig_key])
            else:
                assert line[1] == '-'
                seq_list.append(reverse_complement(all_seqs[contig_key]))
        all_pms[this_chrom] = pad.join(seq_list) + '\n'

    # Get unincorporated sequences and place them in Chr0
    if remaining_contig_headers:
        if chr0:
            chr0_headers = list(remaining_contig_headers)
            all_pms['Chr0'] = pad.join(all_seqs[header] for header in chr0_headers) + '\n'

            # Write out the list of chr0 headers; 'with' closes both files
            # even if a write fails part-way through.
            with open('groupings/Chr0_contigs.txt', 'w') as f_chr0_g:
                with open('orderings/Chr0_orderings.txt', 'w') as f_chr0_o:
                    for header in chr0_headers:
                        f_chr0_g.write(header[1:] + "\t" + "0" + '\n')
                        f_chr0_o.write(header[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')
        else:
            # Instead of making a chromosome 0, add the unplaced sequences as is.
            for header in remaining_contig_headers:
                name = header[1:]
                all_pms[name] = all_seqs[header] + "\n"
                with open('groupings/' + name + '_contigs.txt', 'w') as f_chr0_g:
                    with open('orderings/' + name + '_orderings.txt', 'w') as f_chr0_o:
                        f_chr0_g.write(name + "\t" + "0" + '\n')
                        f_chr0_o.write(name + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')

    # Write the final sequences out to a file. Each stored sequence already
    # ends with a newline.
    with open('ragoo.fasta', 'w') as f:
        for out_header, out_seq in all_pms.items():
            f.write(">" + out_header + "_RaGOO\n")
            f.write(out_seq)