def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size):
    """
    Concatenate ordered, oriented contigs into pseudomolecules and write ``ragoo.fasta``.

    Contigs are read from ``'../' + in_contigs_file``. For every reference
    chromosome named by ``in_unique_contigs``, the file
    ``orderings/<chrom>_orderings.txt`` is read with ``get_orderings`` and its
    contigs are concatenated in that order, reverse complementing the ones on
    the '-' strand. Every contig, including the last one of a record, is
    followed by ``gap_size`` 'N' characters. Contigs named in no orderings
    file are concatenated the same way into ``Chr0``, which is always written
    first, even when it holds no contig.

    Note: a translation table for easy lift-over still needs to be made.

    :param in_contigs_file: contigs FASTA path; it is prefixed with ``'../'`` before being opened
    :param in_unique_contigs: mapping whose values expose a ``ref_chrom`` attribute
    :param gap_size: number of 'N' characters written after each contig
    :return: None; the sequences are written to ``ragoo.fasta``
    :raises ValueError: if an orderings file names a contig that is unknown or
        already placed, or gives a strand other than '+' or '-'
    """
    # First, read all of the contigs into memory, remembering their file order
    # so the unplaced contigs can later be emitted in that same order.
    contig_order = []
    all_seqs = dict()
    reader = SeqReader('../' + in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        contig_order.append(name)
        all_seqs[name] = seq

    # Get all reference chromosomes
    all_chroms = sorted(
        {in_unique_contigs[key].ref_chrom for key in in_unique_contigs.keys()})

    pad = 'N' * gap_size

    # Iterate through each orderings file and store sequence in a dictionary
    placed = set()  # contigs already used; a set keeps each check O(1)
    all_pms = dict()
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        pieces = []
        for line in orderings:
            name = '>' + line[0]
            # Mark that we have seen this contig
            if name not in all_seqs or name in placed:
                raise ValueError(
                    'contig {!r} in orderings of {!r} is unknown or already placed'
                    .format(line[0], this_chrom))
            placed.add(name)

            if line[1] == '+':
                pieces.append(all_seqs[name])
            elif line[1] == '-':
                pieces.append(reverse_complement(all_seqs[name]))
            else:
                # An explicit raise, because assert is stripped under -O.
                raise ValueError(
                    'invalid strand {!r} for contig {!r}'.format(line[1], line[0]))
            pieces.append(pad)
        pieces.append('\n')
        # Join once instead of growing a string piece by piece.
        all_pms[this_chrom] = ''.join(pieces)

    # Get unincorporated sequences and place them in Chr0
    chr0_pieces = []
    for name in contig_order:
        if name not in placed:
            chr0_pieces.append(all_seqs[name])
            chr0_pieces.append(pad)
    chr0_pieces.append('\n')
    all_pms['Chr0'] = ''.join(chr0_pieces)

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        f.write('>Chr0_RaGOO\n')
        f.write(all_pms['Chr0'])
        for header in all_chroms:
            f.write('>' + header + '_RaGOO\n')
            f.write(all_pms[header])
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size, chr0=True):
    """
    Concatenate ordered, oriented contigs into pseudomolecules and write ``ragoo.fasta``.

    For every reference chromosome named by ``in_unique_contigs``, the file
    ``orderings/<chrom>_orderings.txt`` is read with ``get_orderings``. When it
    yields any entries, its contigs are joined in that order with ``gap_size``
    'N' characters between neighbours, reverse complementing the ones on the
    '-' strand. Contigs named in no orderings file are either joined into a
    single ``Chr0`` record or emitted as one record each, and matching
    ``groupings/`` and ``orderings/`` files are written for them. Contig keys
    are the FASTA header up to the first space and are looked up as
    ``'>' + name``, so headers are expected to keep their leading '>'.

    Note: a translation table for easy lift-over still needs to be made.

    :param in_contigs_file: path of the contigs FASTA
    :param in_unique_contigs: mapping whose values expose a ``ref_chrom`` attribute
    :param gap_size: number of 'N' characters placed between adjacent contigs
    :param chr0: if true, join unplaced contigs into one ``Chr0`` record;
        otherwise write each unplaced contig as its own record
    :return: None; the sequences are written to ``ragoo.fasta``
    :raises ValueError: if an orderings file names a contig that is unknown or
        already placed, or gives a strand other than '+' or '-'
    """
    # First, read all of the contigs into memory, remembering their file order.
    contig_order = []
    all_seqs = OrderedDict()
    reader = SeqReader(in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        contig_order.append(name)
        all_seqs[name] = seq

    # Get all reference chromosomes
    all_chroms = sorted(
        {in_unique_contigs[key].ref_chrom for key in in_unique_contigs.keys()})

    # Iterate through each orderings file and store sequence in a dictionary.
    # An OrderedDict keeps the output record order equal to insertion order.
    all_pms = OrderedDict()
    pad = 'N' * gap_size
    placed = set()  # contigs already used; a set keeps each check O(1)
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        if not orderings:
            continue

        seq_list = []
        for line in orderings:
            name = '>' + line[0]
            # Mark that we have seen this contig
            if name not in all_seqs or name in placed:
                raise ValueError(
                    'contig {!r} in orderings of {!r} is unknown or already placed'
                    .format(line[0], this_chrom))
            placed.add(name)

            if line[1] == '+':
                seq_list.append(all_seqs[name])
            elif line[1] == '-':
                seq_list.append(reverse_complement(all_seqs[name]))
            else:
                # An explicit raise, because assert is stripped under -O.
                raise ValueError(
                    'invalid strand {!r} for contig {!r}'.format(line[1], line[0]))
        all_pms[this_chrom] = pad.join(seq_list) + '\n'

    # Get unincorporated sequences, in contig file order
    remaining = [name for name in contig_order if name not in placed]
    if remaining:
        if chr0:
            # Place them all in Chr0
            all_pms['Chr0'] = pad.join(all_seqs[name] for name in remaining) + '\n'

            # Write out the list of chr0 headers (without the leading '>')
            with open('groupings/Chr0_contigs.txt', 'w') as f_chr0_g:
                with open('orderings/Chr0_orderings.txt', 'w') as f_chr0_o:
                    for name in remaining:
                        f_chr0_g.write(name[1:] + '\t0\n')
                        f_chr0_o.write(name[1:] + '\t+\t0\t0\n')
        else:
            # Instead of making a chromosome 0, add the unplaced sequences as is.
            for name in remaining:
                contig = name[1:]  # drop the leading '>'
                all_pms[contig] = all_seqs[name] + '\n'
                with open('groupings/' + contig + '_contigs.txt', 'w') as f_chr0_g:
                    f_chr0_g.write(contig + '\t0\n')
                with open('orderings/' + contig + '_orderings.txt', 'w') as f_chr0_o:
                    f_chr0_o.write(contig + '\t+\t0\t0\n')

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        for out_header, out_seq in all_pms.items():
            f.write('>' + out_header + '_RaGOO\n')
            f.write(out_seq)
def _flush_full_lines(outfile, text, width):
    """Write every complete ``width``-column line of ``text``; return the unwritten tail."""
    cut = len(text) - len(text) % width
    if cut:
        outfile.write('\n'.join(text[i:i + width] for i in range(0, cut, width)))
        outfile.write('\n')
    return text[cut:]


def _write_wrapped_record(outfile, title, sequences, pad, width=60,
                          flush_threshold=10 ** 7):
    """Write one FASTA record: ``sequences`` joined by ``pad``, wrapped at ``width`` columns."""
    outfile.write('>' + title + '\n')
    carry = ''  # partial last line (shorter than ``width``) not yet written
    buffered = []
    buffered_len = 0
    first = True
    for seq in sequences:
        if not first:
            buffered.append(pad)
        first = False
        buffered.append(seq)
        buffered_len += len(seq)
        # Write out roughly every ``flush_threshold`` bases to bound memory use.
        if buffered_len >= flush_threshold:
            carry = _flush_full_lines(outfile, carry + ''.join(buffered), width)
            buffered = []
            buffered_len = 0
    carry = _flush_full_lines(outfile, carry + ''.join(buffered), width)
    if carry:
        outfile.write(carry + '\n')


def _oriented_contigs(contigs, orderings, remaining):
    """Yield each ordered contig in its stated orientation, removing it from ``remaining``."""
    for line in orderings:
        name, strand = line[0], line[1]
        # Mark that we have seen this contig (KeyError if unknown or repeated)
        remaining.remove(name)
        seq = contigs.fetch(name)
        if strand == '+':
            yield seq
        elif strand == '-':
            yield reverse_complement(seq)
        else:
            # An explicit raise, because assert is stripped under -O.
            raise ValueError(
                'invalid strand {!r} for contig {!r}'.format(strand, name))


def create_pseudomolecules(in_contigs_file, out_folder, in_ref, gap_size=100, chr0=True):
    """
    Stream ordered, oriented contigs into pseudomolecules written to ``ragoo.fasta``.

    The working directory is changed to ``out_folder`` and is not restored.
    Every ``orderings/<chrom>_orderings.txt`` file there whose ``<chrom>`` is a
    sequence name of ``in_ref`` is read with ``get_orderings``. When it yields
    any entries, its contigs are joined in that order with ``gap_size`` 'N'
    characters between neighbours, reverse complementing the ones on the '-'
    strand, and written as record ``<chrom>_RaGOO`` wrapped at 60 columns.
    Contigs named in no orderings file are then written, in contig file order,
    either as a single ``Chr0_RaGOO`` record or as one record per contig under
    the contig's own name. Matching ``groupings/`` and ``orderings/`` files are
    written for them.

    Note: a translation table for easy lift-over still needs to be made.

    :param in_contigs_file: path of the indexed contigs FASTA, opened with pysam
        before the directory change
    :param out_folder: directory holding ``orderings/`` and ``groupings/``;
        ``ragoo.fasta`` is created inside it
    :param in_ref: path of the reference FASTA; only its sequence names are used
    :param gap_size: number of 'N' characters placed between adjacent contigs
    :param chr0: if true, join unplaced contigs into one ``Chr0`` record;
        otherwise write each unplaced contig as its own record
    :return: None
    :raises KeyError: if an orderings file names a contig that is unknown or
        already placed
    :raises ValueError: if an orderings file gives a strand other than '+' or '-'
    """
    contigs = pysam.FastaFile(in_contigs_file)
    try:
        # Only the reference sequence names are needed, so close it right away.
        reference = pysam.FastaFile(in_ref)
        try:
            ref_names = set(reference.references)
        finally:
            reference.close()

        contig_names = list(contigs.references)
        remaining_contig_headers = set(contig_names)

        os.chdir(out_folder)

        # Get all reference chromosomes that have an orderings file
        suffix = '_orderings.txt'
        all_chroms = sorted(
            fname[:-len(suffix)]
            for fname in os.listdir('orderings')
            if fname.endswith(suffix) and fname[:-len(suffix)] in ref_names
        )

        pad = 'N' * gap_size
        with open('ragoo.fasta', 'w') as outfile:
            # Iterate through each orderings file and stream its pseudomolecule
            for this_chrom in all_chroms:
                orderings = get_orderings(
                    os.path.join('orderings', this_chrom + '_orderings.txt'))
                if not orderings:
                    continue
                _write_wrapped_record(
                    outfile,
                    this_chrom + '_RaGOO',
                    _oriented_contigs(contigs, orderings, remaining_contig_headers),
                    pad,
                )

            # Get unincorporated sequences, in contig file order so the
            # output does not depend on set iteration order.
            unplaced = [name for name in contig_names
                        if name in remaining_contig_headers]
            if unplaced:
                if chr0:
                    # Place them all in Chr0, under its own record header
                    _write_wrapped_record(
                        outfile,
                        'Chr0_RaGOO',
                        (contigs.fetch(name) for name in unplaced),
                        pad,
                    )

                    # Write out the list of chr0 headers. pysam names carry no
                    # leading '>', so they are written unmodified.
                    with open(os.path.join('groupings', 'Chr0_contigs.txt'), 'w') as f_chr0_g:
                        with open(os.path.join('orderings', 'Chr0_orderings.txt'), 'w') as f_chr0_o:
                            for name in unplaced:
                                f_chr0_g.write(name + '\t0\n')
                                f_chr0_o.write(name + '\t+\t0\t0\n')
                else:
                    # Instead of making a chromosome 0, add the unplaced sequences as is.
                    for name in unplaced:
                        _write_wrapped_record(
                            outfile, name, [contigs.fetch(name)], pad)
                        with open(os.path.join('groupings', name + '_contigs.txt'), 'w') as f_chr0_g:
                            f_chr0_g.write(name + '\t0\n')
                        with open(os.path.join('orderings', name + '_orderings.txt'), 'w') as f_chr0_o:
                            f_chr0_o.write(name + '\t+\t0\t0\n')
    finally:
        contigs.close()