コード例 #1
0
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size):
    """
    Build one pseudomolecule per reference chromosome and write them all to
    'ragoo.fasta' in the current working directory.

    For every reference chromosome the contigs listed in
    'orderings/<chrom>_orderings.txt' are concatenated in file order, reverse
    complemented when their orientation is '-', and each contig is followed by
    ``gap_size`` 'N' characters.  Contigs that are not named by any orderings
    file are concatenated the same way, in FASTA order, into a 'Chr0' record
    which is written first.

    :param in_contigs_file: name of the contigs FASTA file; it is handed to
        SeqReader as '../' + in_contigs_file.
    :param in_unique_contigs: mapping whose values expose a ``ref_chrom``
        attribute; the distinct ``ref_chrom`` values decide which orderings
        files are read.
    :param gap_size: number of 'N' characters written after every contig.
    :return: None
    :raises ValueError: if an orderings file names a contig that is not in the
        FASTA file or that was already placed, or gives an orientation other
        than '+' or '-'.
    """
    # Read every contig into memory, keyed by the first space-delimited token
    # of its header (the token still carries the leading '>').
    contig_headers = []
    all_seqs = dict()
    reader = SeqReader('../' + in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        contig_headers.append(name)
        all_seqs[name] = seq

    # A set gives O(1) "mark as placed"; contig_headers keeps the FASTA order
    # needed later for Chr0.
    unplaced = set(contig_headers)

    # Get all reference chromosomes
    all_chroms = sorted({hit.ref_chrom for hit in in_unique_contigs.values()})

    # The gap is identical for every contig, so build it once.
    pad = 'N' * gap_size

    # Iterate through each orderings file and store sequence in a dictionary
    all_pms = dict()
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        # Collect the pieces and join once instead of growing a string.
        pieces = []
        for line in orderings:
            key = '>' + line[0]
            if key not in unplaced:
                raise ValueError(
                    'contig {!r} in the orderings of {!r} is unknown or was '
                    'already placed'.format(line[0], this_chrom))
            # Mark that we have seen this contig
            unplaced.remove(key)
            if line[1] == '+':
                pieces.append(all_seqs[key])
            elif line[1] == '-':
                pieces.append(reverse_complement(all_seqs[key]))
            else:
                raise ValueError(
                    'unexpected orientation {!r} for contig {!r}'.format(
                        line[1], line[0]))
            # Every contig, including the last one, is followed by the gap;
            # this keeps the existing output layout unchanged.
            pieces.append(pad)
        all_pms[this_chrom] = ''.join(pieces) + '\n'

    # Get unincorporated sequences and place them in Chr0
    chr0_pieces = []
    for header in contig_headers:
        if header in unplaced:
            chr0_pieces.append(all_seqs[header])
            chr0_pieces.append(pad)
    all_pms['Chr0'] = ''.join(chr0_pieces) + '\n'

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        f.write('>Chr0_RaGOO\n')
        f.write(all_pms['Chr0'])
        for chrom in all_chroms:
            f.write('>' + chrom + '_RaGOO\n')
            f.write(all_pms[chrom])
コード例 #2
0
def create_pseudomolecules(in_contigs_file,
                           in_unique_contigs,
                           gap_size,
                           chr0=True):
    """
    Build one pseudomolecule per reference chromosome and write them all to
    'ragoo.fasta' in the current working directory.

    For every reference chromosome with a non-empty
    'orderings/<chrom>_orderings.txt', the listed contigs are concatenated in
    file order (reverse complemented when their orientation is '-') with
    ``gap_size`` 'N' characters between consecutive contigs.  Contigs named by
    no orderings file are handled according to ``chr0``, and matching
    'groupings/..._contigs.txt' and 'orderings/..._orderings.txt' files are
    written for them.

    :param in_contigs_file: path of the contigs FASTA file, handed to
        SeqReader.
    :param in_unique_contigs: mapping whose values expose a ``ref_chrom``
        attribute; the distinct ``ref_chrom`` values decide which orderings
        files are read.
    :param gap_size: number of 'N' characters inserted between contigs.
    :param chr0: if true, join all unplaced contigs into a single 'Chr0'
        record; otherwise write every unplaced contig as its own record.
    :return: None
    :raises ValueError: if an orderings file names a contig that is not in the
        FASTA file or that was already placed, or gives an orientation other
        than '+' or '-'.
    """
    # First, read all of the contigs into memory, keyed by the first
    # space-delimited token of the header (it still carries the leading '>').
    contig_headers = []
    all_seqs = OrderedDict()
    reader = SeqReader(in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        contig_headers.append(name)
        all_seqs[name] = seq

    # A set gives O(1) "mark as placed"; contig_headers keeps the FASTA order
    # used for the unplaced contigs below.
    unplaced = set(contig_headers)

    # Get all reference chromosomes
    all_chroms = sorted({hit.ref_chrom for hit in in_unique_contigs.values()})

    # Records are written in insertion order, so make that order explicit.
    all_pms = OrderedDict()
    pad = 'N' * gap_size

    # Iterate through each orderings file and store sequence in a dictionary
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        if not orderings:
            continue
        seq_list = []
        for line in orderings:
            key = '>' + line[0]
            if key not in unplaced:
                raise ValueError(
                    'contig {!r} in the orderings of {!r} is unknown or was '
                    'already placed'.format(line[0], this_chrom))
            # Mark that we have seen this contig
            unplaced.remove(key)
            if line[1] == '+':
                seq_list.append(all_seqs[key])
            elif line[1] == '-':
                seq_list.append(reverse_complement(all_seqs[key]))
            else:
                raise ValueError(
                    'unexpected orientation {!r} for contig {!r}'.format(
                        line[1], line[0]))
        all_pms[this_chrom] = pad.join(seq_list) + '\n'

    # Get unincorporated sequences, in FASTA order
    remaining = [header for header in contig_headers if header in unplaced]
    if remaining:
        if chr0:
            # Place all of them in Chr0
            all_pms['Chr0'] = pad.join(
                all_seqs[header] for header in remaining) + '\n'

            # Write out the list of chr0 headers; header[1:] drops the '>'.
            with open('groupings/Chr0_contigs.txt', 'w') as f_chr0_g, \
                    open('orderings/Chr0_orderings.txt', 'w') as f_chr0_o:
                for header in remaining:
                    f_chr0_g.write(header[1:] + '\t0\n')
                    f_chr0_o.write(header[1:] + '\t+\t0\t0\n')
        else:
            # Instead of making a chromosome 0, add the unplaced sequences as is.
            for header in remaining:
                name = header[1:]
                all_pms[name] = all_seqs[header] + '\n'
                with open('groupings/' + name + '_contigs.txt', 'w') as f_g, \
                        open('orderings/' + name + '_orderings.txt',
                             'w') as f_o:
                    f_g.write(name + '\t0\n')
                    f_o.write(name + '\t+\t0\t0\n')

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        for out_header, out_seq in all_pms.items():
            f.write('>' + out_header + '_RaGOO\n')
            f.write(out_seq)
コード例 #3
0
ファイル: ragoo.py プロジェクト: lucventurini/RaGOO
def _write_fasta_record(outfile, header, sequences, pad, width=60,
                        flush_at=10**7):
    """
    Write one FASTA record whose sequence is ``pad.join(sequences)``.

    The sequence is wrapped to ``width`` columns.  Whole lines are written out
    each time about ``flush_at`` bases have been buffered, so a complete
    pseudomolecule never has to be held in memory at once.

    :param outfile: writable text file object.
    :param header: record name, without the leading '>'.
    :param sequences: iterable of sequence strings; it is consumed lazily.
    :param pad: gap string inserted between consecutive sequences.
    :param width: number of bases per output line.
    :param flush_at: number of buffered bases that triggers a partial write.
    :return: None
    """

    def write_lines(text):
        # Skip empty text so that no blank line ends up inside the record.
        if text:
            outfile.write('\n'.join(
                text[start:start + width]
                for start in range(0, len(text), width)) + '\n')

    outfile.write('>' + header + '\n')
    pending = []
    pending_len = 0
    for seq in sequences:
        pending.append(seq)
        pending_len += len(seq)
        if pending_len >= flush_at:
            joined = pad.join(pending)
            # Only whole lines go out now.  The unfinished tail stays buffered
            # as its own piece, so the next join still puts the gap between it
            # and the following contig.
            whole = len(joined) - len(joined) % width
            write_lines(joined[:whole])
            pending = [joined[whole:]]
            pending_len = len(pending[0])
    write_lines(pad.join(pending))


def create_pseudomolecules(in_contigs_file,
                           out_folder,
                           in_ref,
                           gap_size=100,
                           chr0=True):
    """
    Build one pseudomolecule per reference chromosome and write them all to
    'ragoo.fasta' inside ``out_folder``.

    The working directory of the process is changed to ``out_folder``.  For
    every 'orderings/<chrom>_orderings.txt' file whose <chrom> is a sequence
    name in ``in_ref`` and whose orderings are non-empty, the listed contigs
    are concatenated in file order (reverse complemented when their
    orientation is '-') with ``gap_size`` 'N' characters between consecutive
    contigs, and written as record '<chrom>_RaGOO' wrapped to 60 columns.
    Contigs named by no orderings file are handled according to ``chr0``, in
    the order in which they appear in the contigs FASTA index, and matching
    'groupings/..._contigs.txt' and 'orderings/..._orderings.txt' files are
    written for them.

    :param in_contigs_file: path of the contigs FASTA file (opened with
        pysam.FastaFile before the directory change).
    :param out_folder: directory that holds the 'orderings' and 'groupings'
        sub-directories and receives 'ragoo.fasta'.
    :param in_ref: path of the reference FASTA file; only its sequence names
        are used.
    :param gap_size: number of 'N' characters inserted between contigs.
    :param chr0: if true, join all unplaced contigs into one 'Chr0_RaGOO'
        record; otherwise write every unplaced contig as its own record under
        its original name.
    :return: None
    :raises KeyError: if an orderings file names a contig that is not in the
        contigs FASTA file or that was already placed.
    :raises ValueError: if an orderings file gives an orientation other than
        '+' or '-'.
    """
    contigs = pysam.FastaFile(in_contigs_file)
    try:
        # Only the reference sequence names are needed, so release that
        # handle immediately.
        reference = pysam.FastaFile(in_ref)
        try:
            ref_names = set(reference.references)
        finally:
            reference.close()

        # contig_names keeps the FASTA index order so that the unplaced
        # contigs come out in a reproducible order; the set gives O(1)
        # "mark as placed".
        contig_names = list(contigs.references)
        unplaced = set(contig_names)

        os.chdir(out_folder)

        # Reference chromosomes are those that have an orderings file.
        suffix = '_orderings.txt'
        all_chroms = sorted({
            fname[:-len(suffix)]
            for fname in os.listdir('orderings')
            if fname.endswith(suffix) and fname[:-len(suffix)] in ref_names
        })

        pad = 'N' * gap_size

        def placed_sequences(orderings):
            # Yield the oriented sequence of every contig in ``orderings``
            # and mark each contig as placed along the way.
            for line in orderings:
                name, strand = line[0], line[1]
                if strand not in ('+', '-'):
                    raise ValueError(
                        'unexpected orientation {!r} for contig {!r}'.format(
                            strand, name))
                # Mark that we have seen this contig
                unplaced.remove(name)
                seq = contigs.fetch(name)
                yield seq if strand == '+' else reverse_complement(seq)

        with open('ragoo.fasta', 'w') as outfile:
            for this_chrom in all_chroms:
                orderings = get_orderings(
                    os.path.join('orderings', this_chrom + suffix))
                if orderings:
                    _write_fasta_record(outfile, this_chrom + '_RaGOO',
                                        placed_sequences(orderings), pad)

            # Get unincorporated sequences
            remaining = [name for name in contig_names if name in unplaced]
            if remaining:
                if chr0:
                    # Place them in Chr0, under a header of their own so they
                    # are not appended to the last chromosome's record.
                    _write_fasta_record(
                        outfile, 'Chr0_RaGOO',
                        (contigs.fetch(name) for name in remaining), pad)

                    # Write out the list of chr0 headers.  pysam reference
                    # names carry no '>' prefix, so they are written unsliced.
                    with open(os.path.join('groupings', 'Chr0_contigs.txt'),
                              'w') as f_chr0_g, \
                            open(os.path.join('orderings',
                                              'Chr0_orderings.txt'),
                                 'w') as f_chr0_o:
                        for name in remaining:
                            f_chr0_g.write(name + '\t0\n')
                            f_chr0_o.write(name + '\t+\t0\t0\n')
                else:
                    # Instead of making a chromosome 0, add the unplaced
                    # sequences as is.
                    # NOTE(review): these records keep the plain contig name,
                    # without the '_RaGOO' suffix used for the other records;
                    # confirm this is intended.
                    for name in remaining:
                        _write_fasta_record(outfile, name,
                                            [contigs.fetch(name)], pad)
                        with open(os.path.join('groupings',
                                               name + '_contigs.txt'),
                                  'w') as f_g, \
                                open(os.path.join('orderings',
                                                  name + '_orderings.txt'),
                                     'w') as f_o:
                            f_g.write(name + '\t0\n')
                            f_o.write(name + '\t+\t0\t0\n')
    finally:
        contigs.close()