Beispiel #1
0
def write_misasm_broken_ctgs(contigs_file,
                             breaks,
                             out_prefix,
                             in_gff=None,
                             in_gff_name=None):
    """
    Write a FASTA file in which selected contigs are split at break points.

    The function changes into the 'ctg_alignments' directory for the duration
    of the call, so every relative path below is resolved from there. The
    original working directory is restored on exit, even if an error occurs.

    :param contigs_file: contigs FASTA path; it is opened as
        "../../" + contigs_file after changing into 'ctg_alignments'.
    :param breaks: mapping from contig header (without the leading character,
        which is stripped from each parsed header) to an iterable of break
        positions. Contigs whose header is not a key are written unchanged.
    :param out_prefix: output is written to out_prefix + ".misasm.break.fa".
    :param in_gff: optional mapping whose values are iterables of records;
        each record is written with str() on its own line.
    :param in_gff_name: file to write the in_gff records to. Records are only
        written when both in_gff and in_gff_name are truthy.
    :return: None
    """
    current_path = os.getcwd()
    os.chdir('ctg_alignments')
    try:
        if in_gff and in_gff_name:
            with open(in_gff_name, 'w') as gff_out:
                for key in in_gff:
                    for record in in_gff[key]:
                        gff_out.write(str(record) + '\n')

        reader = SeqReader("../../" + contigs_file)
        # The context manager guarantees the output is flushed and closed.
        with open(out_prefix + ".misasm.break.fa", 'w') as fa_out:
            for header, seq in reader.parse_fasta():
                # Drop the first character of the header (the '>' marker,
                # presumably; confirm against SeqReader.parse_fasta).
                header = header[1:]
                if header not in breaks:
                    fa_out.write(">" + header + "\n")
                    fa_out.write(seq + "\n")
                    continue

                # Break the contig: bracket the sorted break points with the
                # contig start and end, then emit each consecutive slice.
                bounds = [0] + sorted(breaks[header]) + [len(seq)]
                for start, end in zip(bounds, bounds[1:]):
                    fa_out.write(">" + header + "_misasm_break:" +
                                 str(start) + "-" + str(end) + "\n")
                    fa_out.write(seq[start:end] + "\n")
    finally:
        # Always return to where the caller was, even on failure.
        os.chdir(current_path)
Beispiel #2
0
def make_gaps_tree(in_file):
    """
    Build one interval tree of gap coordinates per sequence in a FASTA file.

    :param in_file: path to the FASTA file. A name ending in ".gz" is read
        with SeqReader.parse_gzip_fasta(), anything else with parse_fasta().
    :return: dict mapping each header (first character removed, truncated at
        the first space) to an IntervalTree. Each interval spans one match
        reported by GapSequence.get_gap_coords() and stores its
        (start, end) tuple as data.
    """
    reader = SeqReader(in_file)
    # Choose the parser once; the per-record work is the same for both.
    if in_file.endswith(".gz"):
        records = reader.parse_gzip_fasta()
    else:
        records = reader.parse_fasta()

    trees_by_header = {}
    for raw_header, sequence in records:
        # Remove the greater than sign and keep only the first
        # space-delimited token.
        name = raw_header[1:].split(' ')[0]
        tree = IntervalTree()
        trees_by_header[name] = tree
        spans = [(match.start(0), match.end(0))
                 for match in GapSequence(sequence).get_gap_coords()]
        for span in spans:
            begin, end = span
            tree[begin:end] = span
    return trees_by_header
Beispiel #3
0
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size):
    """
    Concatenate ordered contigs into pseudomolecules and write 'ragoo.fasta'.

    Need to make a translation table for easy lift-over.

    For every reference chromosome, the contigs listed in
    'orderings/<chrom>_orderings.txt' are joined in order (reverse
    complemented when the orientation is '-'), each followed by gap_size 'N'
    characters. Contigs not mentioned in any orderings file are joined the
    same way under 'Chr0', which is written first.

    :param in_contigs_file: contigs FASTA path; opened as
        '../' + in_contigs_file. Headers are truncated at the first space and
        are looked up as '>' + contig name from the orderings files.
    :param in_unique_contigs: mapping whose values expose a ref_chrom
        attribute naming the reference chromosome.
    :param gap_size: number of 'N' characters appended after each contig.
    :return: None
    """
    # First, read all of the contigs into memory
    remaining_contig_headers = []
    all_seqs = dict()
    x = SeqReader('../' + in_contigs_file)
    for header, seq in x.parse_fasta():
        short_header = header.split(' ')[0]
        remaining_contig_headers.append(short_header)
        all_seqs[short_header] = seq

    # Get all reference chromosomes
    all_chroms = sorted(
        {in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()})

    # The gap padding is the same everywhere, so build it once.
    pad = 'N' * gap_size

    # Iterate through each orderings file and store sequence in a dictionary.
    # Pieces are collected in a list and joined once to avoid repeatedly
    # copying an ever-growing string.
    all_pms = dict()
    for this_chrom in all_chroms:
        pieces = []
        orderings_file = 'orderings/' + this_chrom + '_orderings.txt'
        orderings = get_orderings(orderings_file)
        for line in orderings:
            contig_key = '>' + line[0]
            # Mark that we have seen this contig
            remaining_contig_headers.remove(contig_key)
            if line[1] == '+':
                pieces.append(all_seqs[contig_key])
            else:
                assert line[1] == '-'
                pieces.append(reverse_complement(all_seqs[contig_key]))
            pieces.append(pad)
        pieces.append('\n')
        all_pms[this_chrom] = ''.join(pieces)

    # Get unincorporated sequences and place them in Chr0
    chr0_pieces = []
    for header in remaining_contig_headers:
        chr0_pieces.append(all_seqs[header])
        chr0_pieces.append(pad)
    chr0_pieces.append('\n')
    all_pms['Chr0'] = ''.join(chr0_pieces)

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        f.write('>Chr0_RaGOO\n')
        f.write(all_pms['Chr0'])
        for header in all_chroms:
            f.write('>' + header + '_RaGOO\n')
            f.write(all_pms[header])
Beispiel #4
0
def create_pseudomolecules(in_contigs_file,
                           in_unique_contigs,
                           gap_size,
                           chr0=True):
    """
    Concatenate ordered contigs into pseudomolecules and write 'ragoo.fasta'.

    Need to make a translation table for easy lift-over.

    For every reference chromosome with a non-empty orderings file
    ('orderings/<chrom>_orderings.txt'), the listed contigs are joined in
    order, reverse complemented when the orientation is '-', with gap_size
    'N' characters between neighbours. Contigs that appear in no orderings
    file are handled according to chr0, and matching groupings/orderings
    files are written for them.

    :param in_contigs_file: contigs FASTA path. Headers are truncated at the
        first space and are looked up as '>' + contig name from the
        orderings files.
    :param in_unique_contigs: mapping whose values expose a ref_chrom
        attribute naming the reference chromosome.
    :param gap_size: number of 'N' characters placed between joined contigs.
    :param chr0: when True, unplaced contigs are joined into a single 'Chr0'
        sequence; when False, each unplaced contig is written as its own
        sequence.
    :return: None
    """
    # First, read all of the contigs into memory
    remaining_contig_headers = []
    all_seqs = OrderedDict()
    x = SeqReader(in_contigs_file)
    for header, seq in x.parse_fasta():
        short_header = header.split(' ')[0]
        remaining_contig_headers.append(short_header)
        all_seqs[short_header] = seq

    # Get all reference chromosomes
    all_chroms = sorted(
        {in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()})

    # Iterate through each orderings file and store sequence in a dictionary
    all_pms = dict()
    pad = 'N' * gap_size
    for this_chrom in all_chroms:
        orderings_file = 'orderings/' + this_chrom + '_orderings.txt'
        orderings = get_orderings(orderings_file)
        # Chromosomes with no orderings get no output record at all.
        if not orderings:
            continue

        seq_list = []
        for line in orderings:
            contig_key = '>' + line[0]
            # Mark that we have seen this contig
            remaining_contig_headers.remove(contig_key)
            if line[1] == '+':
                seq_list.append(all_seqs[contig_key])
            else:
                assert line[1] == '-'
                seq_list.append(reverse_complement(all_seqs[contig_key]))
        all_pms[this_chrom] = pad.join(seq_list) + '\n'

    # Get unincorporated sequences and place them in Chr0
    if remaining_contig_headers:
        if chr0:
            all_pms['Chr0'] = pad.join(
                [all_seqs[header] for header in remaining_contig_headers])
            all_pms['Chr0'] += '\n'

            # Write out the list of chr0 headers. The context manager closes
            # both files even if a write fails part-way through.
            with open('groupings/Chr0_contigs.txt', 'w') as f_chr0_g, \
                    open('orderings/Chr0_orderings.txt', 'w') as f_chr0_o:
                for header in remaining_contig_headers:
                    name = header[1:]
                    f_chr0_g.write(name + "\t" + "0" + '\n')
                    f_chr0_o.write(name + '\t' + "+" + '\t' + "0" + '\t' +
                                   "0" + '\n')
        else:
            # Instead of making a chromosome 0, add the unplaced sequences as is.
            for header in remaining_contig_headers:
                name = header[1:]
                all_pms[name] = all_seqs[header] + "\n"
                with open('groupings/' + name + '_contigs.txt',
                          'w') as f_chr0_g, \
                        open('orderings/' + name + '_orderings.txt',
                             'w') as f_chr0_o:
                    f_chr0_g.write(name + "\t" + "0" + '\n')
                    f_chr0_o.write(name + '\t' + "+" + '\t' + "0" + '\t' +
                                   "0" + '\n')

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        for out_header in all_pms:
            f.write(">" + out_header + "_RaGOO\n")
            f.write(all_pms[out_header])
Beispiel #5
0
def read_gz_contigs(in_file):
    """
    Load the sequences of a gzipped FASTA file into a dictionary.

    :param in_file: path handed to SeqReader; parsed with parse_gzip_fasta().
    :return: dict mapping each header (all '>' characters removed, truncated
        at the first space) to its sequence. A repeated header keeps the
        sequence that was parsed last.
    """
    reader = SeqReader(in_file)
    return {
        raw_header.replace('>', '').split(' ')[0]: sequence
        for raw_header, sequence in reader.parse_gzip_fasta()
    }