def get_1001pseudogenome_by_range(accs, chrom, start, end, out_dir,
                                  ref_file=data_path, num_col=0, name_col=1,
                                  delim='', header=True):
    """Fetch a pseudogenome region for ``accs`` and save it as FASTA.

    The output file is ``<out_dir>/chr<chrom>_p<start>-<end>.fasta``. The
    sequences are retrieved by ``get_1001pseudogenome_raw`` (the module-level
    ``chrom_pref`` is prepended to ``chrom`` for that call) and the resulting
    file is then read back and rewritten through ``fasta_manip``.

    Args:
        accs: accessions, forwarded untouched to ``get_1001pseudogenome_raw``.
        chrom: chromosome identifier; concatenated to ``chrom_pref`` with
            ``+``, so it has to be a string.
        start, end: range bounds, forwarded untouched.
        out_dir: directory the FASTA file is written to.
        ref_file, num_col, name_col, delim, header: accession-table options,
            forwarded untouched to ``get_1001pseudogenome_raw``.

    Returns:
        None. The output path is printed once the file has been written.
    """
    file_name = "chr{}_p{}-{}.fasta".format(chrom, start, end)
    fasta_out = os.path.join(out_dir, file_name)
    accession_table_opts = {
        "ref_file": ref_file,
        "num_col": num_col,
        "name_col": name_col,
        "delim": delim,
        "header": header,
    }
    get_1001pseudogenome_raw(accs, chrom_pref + chrom, start, end, fasta_out,
                             **accession_table_opts)
    ## ensure nice line width: round-trip the file through the FASTA helpers
    from fasta_manip import fasta_to_dict
    from fasta_manip import dict_to_fasta
    dict_to_fasta(fasta_to_dict(fasta_out), fasta_out)
    print("Sequences were successfully written to: {}".format(fasta_out))
def get_domain_seqs(domain_f, fasta, fout, header=[], domain='', adj_dir=False):
    """Extract the ranges listed in a domain table from a FASTA file.

    ``domain_f`` is a tab-separated table whose first row names the columns.
    The columns read here are ``contig``, ``accID``, ``hit.start``,
    ``hit.end`` and, when ``adj_dir`` is set, ``strand``. One output sequence
    is written per data row, named
    ``contig|accID|<domain>|<row number>|<hit.start>-<hit.end>[|revcomp]``.

    Args:
        domain_f: path to the tab-separated domain table.
        fasta: path to the FASTA file holding the contig sequences.
        fout: path of the FASTA file to write.
        header: unused; kept so existing callers keep working.
        domain: label inserted into every output sequence name.
        adj_dir: if true, rows whose ``strand`` is ``'-'`` are
            reverse-complemented and tagged ``revcomp``.

    Returns:
        None. The sequences are written to ``fout``.
    """
    from fasta_manip import dict_to_fasta, fasta_to_dict

    ## get domain ranges
    ## Only the line terminator is stripped (the previous ``x[:-1]`` removed a
    ## real character from the last row when the file had no trailing
    ## newline); blank lines are skipped instead of breaking the lookups.
    with open(domain_f, 'r') as f:
        rows = [line.rstrip('\n').split('\t') for line in f
                if line.rstrip('\n')]
    ## NOTE(review): make_custom_get is defined elsewhere; it is assumed to
    ## return numeric values for hit.start/hit.end (they are used in
    ## arithmetic below) and a list when several fields are requested.
    get = make_custom_get(rows[0])
    domain_dat = rows[1:]

    ## parse fasta file
    seqs = fasta_to_dict(fasta)

    ## get sequences
    output = {}
    for num, entry in enumerate(domain_dat, start=1):
        is_revcomp = adj_dir and get(entry, "strand") == '-'
        ## note: domain_dat is 1-indexed, start and end inclusive, but Python
        ## slicing is 0-indexed
        contig_seq = seqs[get(entry, "contig")]
        seq = contig_seq[get(entry, "hit.start") - 1:get(entry, "hit.end")]
        if is_revcomp:
            seq = seq.reverse_complement()
        hit_range = '-'.join(
            str(x) for x in get(entry, "hit.start", "hit.end"))
        name_fields = (get(entry, "contig", "accID") + [domain, num] +
                       [hit_range] + (["revcomp"] if is_revcomp else []))
        output['|'.join(str(x) for x in name_fields)] = seq

    dict_to_fasta(output, fout)
def get_ref_raw(chrom, start, end, fasta_out, encoding="utf-8",
                ref_fasta_files=ref_fasta):
    """Write one slice of a reference chromosome to ``fasta_out``.

    The FASTA file registered under ``chrom`` in ``ref_fasta_files`` is
    parsed, its first record is sliced as ``[start:end]`` and the result is
    written as a single record named ``Col-0_ref|<chrom>:<start+1>..<end>``.

    Args:
        chrom: key into ``ref_fasta_files``.
        start, end: slice bounds applied directly to the sequence.
        fasta_out: path of the FASTA file to write.
        encoding: accepted but not used by this function.
        ref_fasta_files: mapping from chromosome to reference FASTA path.

    Returns:
        None.
    """
    from fasta_manip import fasta_to_dict
    chrom_records = fasta_to_dict(ref_fasta_files[chrom])
    first_record = list(chrom_records.values())[0]
    ref_seq = first_record[start:end]  ## 0-indexed
    from fasta_manip import dict_to_fasta
    ## the record name reports the same range with a 1-indexed start
    record_name = "Col-0_ref|{}:{}..{}".format(chrom, start + 1, end)
    dict_to_fasta({record_name: ref_seq}, fasta_out)
def get_ref_by_gene(gene, feature, out_dir, bed=bed_path, encoding="utf-8",
                    ref_fasta_files=ref_fasta, complete=False, domain="",
                    domain_f="", start_inc=True, end_inc=True, merge=False,
                    translate=False, adj_dir=False, by_gene=False,
                    **for_get_domain_in_genome_coords):
    """Write the reference sequence(s) of ``gene`` to a FASTA file.

    Annotation rows for ``gene`` are looked up with ``grep_bedmerge``. One
    temporary FASTA file is produced per isoform and the files are then
    concatenated into
    ``<out_dir>/<gene>_ref_<feature>[_complete][_<domain>][_protein].fasta``.
    If no isoform yields a sequence, that file is created empty.

    Args:
        gene: gene identifier to look up.
        feature: annotation feature to extract; forced to ``"CDS"`` when
            ``domain_f`` is given.
        out_dir: directory for temporary and final files.
        bed: annotation file handed to ``grep_bedmerge``.
        encoding: forwarded to the helper functions.
        ref_fasta_files: mapping from chromosome to reference FASTA path.
        complete: keep the whole start-to-end span instead of only the
            feature's own ranges.
        domain: domain label used in file and sequence names (the literal
            ``"domain"`` is used when empty).
        domain_f: domain table; when given, only domain ranges are extracted.
        start_inc, end_inc: forwarded to ``get_domain_in_genome_coords``.
        merge: forwarded to ``grep_bedmerge``.
        translate: translate the sequence (only done for ``CDS`` without
            ``complete``; otherwise a message is printed).
        adj_dir: reverse-complement minus-strand sequences and tag them
            ``revcomp``.
        by_gene: collapse isoform sequences with overlapping ranges into one
            record per range group.
        **for_get_domain_in_genome_coords: only ``qname_dname`` and
            ``qstart_qend`` are forwarded to ``get_domain_in_genome_coords``.

    Returns:
        None. Also returns early, writing nothing, when ``grep_bedmerge``
        finds no data.
    """
    if domain_f:
        feature = "CDS"
    data = grep_bedmerge(gene, bed, feature, encoding, out_dir,
                         merge=merge)["data"]
    ## extract sequences from fasta file
    if not data:
        return
    ## imported once here so that every later use (including the by_gene
    ## post-processing) does not depend on an import buried inside a loop
    from fasta_manip import dict_to_fasta, extract_ranges, fasta_to_dict

    ## NOTE(review): rows are read as (chrom, start, end, strand, ..., name);
    ## coordinates are presumably 0-indexed BED-style -- confirm.
    chrom = data[0][0]
    start = min(int(x[1]) for x in data)
    end = max(int(x[2]) for x in data)
    strand = data[0][3]
    if feature == "gene":
        isoforms = {gene: data}
    else:
        ## raw string: "\." and "\d" are invalid escapes in a plain literal
        isoforms = {
            re.search(r"=(.+\.\d+)(?=[,;]|$)", isoform).group(1):
            [x for x in data if x[-1] == isoform]
            for isoform in set(x[-1] for x in data)
        }

    domain_label = domain if domain else "domain"
    complete_tag = "_complete" if complete else ''
    domain_tag = ('_' + domain_label) if domain_f else ''
    domain_kwargs = {
        k: v
        for k, v in for_get_domain_in_genome_coords.items()
        if k in ["qname_dname", "qstart_qend"]
    }

    fasta_out_l = []
    seq_ranges = {}
    ## iterate through isoforms
    for isoform, isoform_dat in isoforms.items():
        ## if extracting only domain-specific range
        if domain_f:
            domain_data = get_domain_in_genome_coords(
                gene, domain, domain_f, out_dir, bed=bed, encoding=encoding,
                isoform=isoform, start_inc=start_inc, end_inc=end_inc,
                **domain_kwargs)
            if not domain_data:
                continue
        else:
            domain_data = [(start, end)]

        ## get fasta file of sequence data
        fasta_out = os.path.join(
            out_dir,
            isoform + "_ref_" + feature + complete_tag + domain_tag +
            ("_protein" if (translate and (feature == "CDS")) else '') +
            ".fasta")
        get_ref_raw(chrom, start, end, fasta_out, encoding=encoding,
                    ref_fasta_files=ref_fasta_files)

        seqs_to_write = {}
        for i, domain_range in enumerate(domain_data):
            ## fasta_out still holds the untrimmed start..end slice here; it
            ## is only overwritten after this inner loop finishes
            ref_seq = list(fasta_to_dict(fasta_out).values())[0]
            d_start, d_end = domain_range
            ranges = [(d_start, d_end)]
            ## trim sequence if complete flag not raised or if domain required
            if (not complete) or domain_f:
                ## ranges are made relative to `start` because ref_seq begins
                ## at genome position `start`
                if complete and domain_f and d_start and d_end:
                    ranges = [(max(start, d_start) - start,
                               min(end, d_end) - start)]
                elif domain_f and d_start and d_end:
                    ranges = [(max(int(x[1]), d_start) - start,
                               min(int(x[2]), d_end) - start)
                              for x in isoform_dat
                              if has_overlap((int(x[1]), int(x[2])),
                                             (d_start, d_end))]
                else:
                    ranges = [(int(x[1]) - start, int(x[2]) - start)
                              for x in isoform_dat]
                ref_seq = extract_ranges(ref_seq, ranges)
            if (adj_dir or translate) and strand == '-':
                ref_seq = ref_seq.reverse_complement()
            ## translate sequence if translate flag raised AND feature is CDS
            if translate:
                if feature == "CDS" and not complete:
                    ref_seq = ref_seq.translate(to_stop=True)
                else:
                    print(
                        "Translation is only possible when the selected feature is 'CDS' and the flag 'complete' is not raised."
                    )
            seq_name = "Col-0_ref|{}|{}|{}".format(gene, feature, isoform) + \
                       (('|' + domain_label + f"|{i+1}") if domain_f else '') + \
                       ("|complete" if complete else '') + \
                       ("|revcomp" if adj_dir and strand == '-' else '')
            seqs_to_write[seq_name] = ref_seq

            ## for by_gene: group sequence names by (merged) overlapping ranges
            if by_gene:
                overlap_ranges = []
                overlap_seq_names = []
                for logged_ranges, logged_seq_names in seq_ranges.items():
                    if has_any_overlap(ranges, logged_ranges):
                        overlap_ranges.append(logged_ranges)
                        overlap_seq_names.extend(logged_seq_names)
                if overlap_ranges:
                    for logged_ranges in overlap_ranges:
                        del seq_ranges[logged_ranges]
                    ranges = merge_ranges(*overlap_ranges)
                    key = tuple(sorted(ranges))
                    seq_ranges[key] = (seq_ranges.get(key, []) + [seq_name] +
                                       overlap_seq_names)
                else:
                    key = tuple(sorted(ranges))
                    seq_ranges[key] = seq_ranges.get(key, []) + [seq_name]

        dict_to_fasta(seqs_to_write, fasta_out)
        fasta_out_l.append(fasta_out)

    fasta_out_final = os.path.join(
        out_dir,
        gene + "_ref_" + feature + complete_tag + domain_tag +
        ("_protein"
         if (translate and (feature == "CDS") and not complete) else '') +
        ".fasta")
    if fasta_out_l:
        ## when the only per-isoform file already is the final file there is
        ## nothing to concatenate or delete
        if fasta_out_l[0] != fasta_out_final:
            from file_manip import cat_files
            cat_files(sorted(fasta_out_l), fasta_out_final)
            for fasta_out in fasta_out_l:
                os.remove(fasta_out)
        if by_gene:
            ## keep one representative sequence per range group and rename it
            ## after the ranges instead of the isoform
            isoform_seqs = fasta_to_dict(fasta_out_final)
            final_seqs = {}
            for i, (ranges, seq_names) in enumerate(
                    sorted(seq_ranges.items())):
                seq_name_l = seq_names[0].split('|')
                seq_name_l[3] = ','.join(f'{r[0]}-{r[1]}' for r in ranges)
                if domain_f:
                    seq_name_l[5] = str(i + 1)
                seq_name = '|'.join(seq_name_l)
                final_seqs[seq_name] = isoform_seqs[seq_names[0]]
            dict_to_fasta(final_seqs, fasta_out_final)
        print("Sequences were successfully written to: {}".format(
            fasta_out_final))
    else:
        ## no isoform produced a sequence: leave an empty placeholder file
        with open(fasta_out_final, "w+") as f:
            f.write('')
        print("{} is an empty file".format(fasta_out_final))
def get_1001pseudogenome_by_gene(accs, gene, feature, out_dir, bed=bed_path,
                                 encoding="utf-8", ref_file=data_path,
                                 num_col=0, name_col=1, delim='', header=True,
                                 complete=False, domain="", domain_f="",
                                 merge=False, adj_dir=False, start_inc=True,
                                 end_inc=True,
                                 **for_get_domain_in_genome_coords):
    """Write pseudogenome sequences of ``gene`` for ``accs`` to a FASTA file.

    Annotation rows for ``gene`` are looked up with ``grep_bedmerge``. One
    temporary FASTA file is produced per isoform and the files are then
    concatenated into
    ``<out_dir>/<gene>_<feature>[_complete][_<domain>].fasta``.

    Args:
        accs: accessions, forwarded to ``get_1001pseudogenome_raw`` and
            ``raw_accs_to_id``.
        gene: gene identifier to look up.
        feature: annotation feature to extract.
        out_dir: directory for temporary and final files.
        bed: annotation file handed to ``grep_bedmerge``.
        encoding: forwarded to the helper functions.
        ref_file, num_col, name_col, delim, header: accepted but currently
            not forwarded (see the NOTE in the body).
        complete: keep the whole start-to-end span instead of only the
            feature's own ranges.
        domain: domain label used in file and sequence names (the literal
            ``"domain"`` is used when empty).
        domain_f: domain table; when given, only the domain range is kept.
        merge: forwarded to ``grep_bedmerge``.
        adj_dir: reverse-complement minus-strand sequences and tag them
            ``revcomp``.
        start_inc, end_inc: forwarded to ``get_domain_in_genome_coords``.
            They were previously referenced without being defined, which
            raised ``NameError`` whenever ``domain_f`` was set.
        **for_get_domain_in_genome_coords: only ``qname_dname`` and
            ``qstart_qend`` are forwarded to ``get_domain_in_genome_coords``.

    Returns:
        None. Also returns early, writing nothing, when ``grep_bedmerge``
        finds no data.
    """
    data = grep_bedmerge(gene, bed, feature, encoding, out_dir,
                         merge=merge)["data"]
    ## same guard as get_ref_by_gene: no annotation rows, nothing to extract
    if not data:
        return
    from fasta_manip import dict_to_fasta, extract_ranges, fasta_to_dict

    ## extract sequences
    chrom = data[0][0]
    start = min(int(x[1]) for x in data)
    end = max(int(x[2]) for x in data)
    strand = data[0][3]
    print(data[0], chrom, start, end)
    if feature == "gene":
        isoforms = {gene: data}
    else:
        ## raw string: "\." and "\d" are invalid escapes in a plain literal
        isoforms = {
            re.search(r"=(.+\.\d+)(?=[,;]|$)", isoform).group(1):
            [x for x in data if x[-1] == isoform]
            for isoform in set(x[-1] for x in data)
        }

    domain_label = domain if domain else "domain"
    complete_tag = "_complete" if complete else ''
    domain_tag = ('_' + domain_label) if domain_f else ''
    ## Both suffixes are appended independently. The previous expression
    ## parsed as `A if domain_f else ('' + complete_suffix)`, which dropped
    ## "|complete" whenever domain_f was set.
    name_suffix = (('|' + domain_label) if domain_f else '') + \
                  ("|complete" if complete else '')
    domain_kwargs = {
        k: v
        for k, v in for_get_domain_in_genome_coords.items()
        if k in ["qname_dname", "qstart_qend"]
    }

    fasta_out_l = []
    ## iterate through isoforms
    for isoform, isoform_dat in isoforms.items():
        d_start = d_end = None
        ## if extracting only domain-specific range
        if domain_f:
            ## NOTE(review): other callers in this file iterate the result of
            ## get_domain_in_genome_coords as a list of (start, end) ranges;
            ## this two-value unpacking may be stale -- confirm.
            d_start, d_end = get_domain_in_genome_coords(
                gene, domain, domain_f, out_dir, bed=bed, encoding=encoding,
                isoform=isoform, start_inc=start_inc, end_inc=end_inc,
                **domain_kwargs)
            if (not d_start) or (not d_end):
                continue

        ## get fasta file of sequence data
        fasta_out = os.path.join(
            out_dir,
            isoform + '_' + feature + complete_tag + domain_tag + ".fasta")
        get_1001pseudogenome_raw(accs, chrom, start + 1, end, fasta_out)
        ## Kept under its own name: rebinding `accs` here made every isoform
        ## after the first receive the ID mapping instead of the caller's
        ## accessions.
        ## NOTE(review): the accession-table options are hard-coded here and
        ## the ref_file/num_col/name_col/delim/header arguments are ignored;
        ## left as found because the helper defaults are not visible.
        acc_ids = raw_accs_to_id(accs, ref_file=data_path, num_col=0,
                                 name_col=1, delim='', header=True)

        ## rename seqs + trim if required
        tmp_seqs = fasta_to_dict(fasta_out)
        seqs = {}
        for k, seq in tmp_seqs.items():
            ## the 4th '|'-separated field of the raw name is used as the
            ## lookup key into acc_ids
            acc = k.split('|')[3]
            seq_name = "{}|{}|{}|{}|{}".format(acc, acc_ids[acc], gene,
                                               feature, isoform) + name_suffix
            seqs[seq_name] = seq

        if (not complete) or domain_f:
            ## ranges are made relative to `start`, the first base fetched
            if complete and domain_f and d_start and d_end:
                ranges = [(max(start, d_start) - start,
                           min(end, d_end) - start)]
            elif domain_f and d_start and d_end:
                ranges = [(max(int(x[1]), d_start) - start,
                           min(int(x[2]), d_end) - start)
                          for x in isoform_dat
                          if has_overlap((int(x[1]), int(x[2])),
                                         (d_start, d_end))]
            else:
                ranges = [(int(x[1]) - start, int(x[2]) - start)
                          for x in isoform_dat]
            seqs = {k: extract_ranges(seq, ranges) for k, seq in seqs.items()}
        if adj_dir and strand == '-':
            seqs = {
                k + "|revcomp": seq.reverse_complement()
                for k, seq in seqs.items()
            }
        dict_to_fasta(seqs, fasta_out)
        fasta_out_l.append(fasta_out)

    fasta_out_final = os.path.join(
        out_dir, gene + '_' + feature + complete_tag + domain_tag + ".fasta")
    ## when the only per-isoform file already is the final file there is
    ## nothing to concatenate or delete
    if fasta_out_l and fasta_out_l[0] != fasta_out_final:
        from file_manip import cat_files
        cat_files(sorted(fasta_out_l), fasta_out_final)
        for fasta_out in fasta_out_l:
            os.remove(fasta_out)
    print("Sequences were successfully written to: {}".format(fasta_out_final))
def get_cds(fout, fasta, bed, domain_f='', domain='', complete=False,
            adjust_dir=False, translate=False, protein_id_field="protein_id",
            domain_pid_f=lambda x: x, **kwargs):
    """Write the CDS (or domain-restricted CDS) of each isoform to ``fout``.

    CDS rows come from ``grep_bedmerge`` and are grouped by the value found
    after ``<protein_id_field>=`` in each row's name field. When both
    ``domain`` and ``domain_f`` are given, only isoforms listed for that
    domain in the table are kept and their CDS ranges are clipped to each
    domain range. Sequences are cut from ``fasta`` and written to ``fout``.

    Args:
        fout: path of the FASTA file to write.
        fasta: reference FASTA; record names are cut at the first space.
        bed: annotation file handed to ``grep_bedmerge``.
        domain_f: tab-separated domain table with a header row; the columns
            ``domain`` and ``qseqid`` are read here.
        domain: domain name to select in ``domain_f``.
        complete: collapse each isoform to one min-start..max-end range.
        adjust_dir: reverse-complement minus-strand sequences.
        translate: translate sequences (ignored when ``complete`` is set).
        protein_id_field: attribute name that precedes the isoform ID.
        domain_pid_f: function applied to each ``qseqid`` to get the
            isoform ID.
        **kwargs: forwarded to ``get_domain_in_genome_coords``.

    Returns:
        None.
    """
    ## sibling functions import these helpers locally; do the same here so
    ## this function does not depend on a module-level import
    from fasta_manip import dict_to_fasta, extract_ranges, fasta_to_dict

    ## extract isoform ranges
    ## NOTE(review): the result is used directly as a sortable list of rows
    ## here, whereas other callers in this file index it with ["data"] and
    ## pass different arguments -- confirm which signature applies.
    data = grep_bedmerge(bed, fasta, "CDS")
    data.sort(key=lambda x: x[-1])
    get = make_custom_get(["chrom", "start", "end", "strand", "phase", "name"])

    def _isoform_id(entry):
        ## isoform ID = text after "<protein_id_field>=" up to the next ';'
        return split_extract(
            split_extract(get(entry, "name"), protein_id_field + '=', 1),
            ';', 0)

    def _finalise(records):
        ## if complete, get min, max of combined CDS in the isoform
        if not complete:
            return records
        merged = records[0]
        merged[1] = min(get(records, "start"))
        merged[2] = max(get(records, "end"))
        return [merged]

    ## extract domain ranges
    if domain and domain_f:
        ## strip only the line terminator (``x[:-1]`` removed a real character
        ## from the last row when the file had no trailing newline)
        with open(domain_f) as f:
            domain_data = [line.rstrip('\n').split('\t') for line in f
                           if line.rstrip('\n')]
        domain_get = make_custom_get(domain_data[0])
        domain_data = [
            x for x in domain_data[1:] if domain_get(x, "domain") == domain
        ]
        domain_seqs = set(
            domain_pid_f(x) for x in domain_get(domain_data, "qseqid"))
        data = [x for x in data if _isoform_id(x) in domain_seqs]

    ## group consecutive rows by isoform (data is sorted by its name field)
    isoforms_dat = {}
    isoform = _isoform_id(data[0])
    isoform_dat = []
    for entry in data:
        curr_isoform = _isoform_id(entry)
        if curr_isoform != isoform:
            isoforms_dat[isoform] = _finalise(isoform_dat)
            isoform_dat = []
            isoform = curr_isoform
        isoform_dat.append(
            get(entry, "chrom", "start", "end", "strand", "phase"))
    ## The last isoform goes through the same finalisation as the others;
    ## previously it was stored uncollapsed even when `complete` was set.
    isoforms_dat[isoform] = _finalise(isoform_dat)

    ## domain magic: clip each isoform's CDS ranges to every domain range
    if domain_f and domain:
        updated_isoforms_dat = {}
        for isoform in domain_seqs:
            if isoform not in isoforms_dat:
                continue
            isoform_dat = isoforms_dat[isoform]
            chrom, strand = get(isoform_dat[0], "chrom", "strand")
            domain_ranges = get_domain_in_genome_coords(
                isoform, domain, domain_f, os.path.dirname(fout),
                qname_dname=("qseqid", "domain"),
                qstart_qend=("qstart", "qend"),
                bed=bed, fasta=fasta, **kwargs)
            for i, domain_dat in enumerate(domain_ranges):
                d_start, d_end = domain_dat
                domain_cds = [[
                    chrom,
                    max(min(x), d_start),
                    min(max(x), d_end), strand
                ] for x in get(isoform_dat, "start", "end")
                              if has_overlap(x, (d_start, d_end))]
                updated_isoforms_dat[isoform + f"|{domain}|{i+1}"] = domain_cds
        isoforms_dat = updated_isoforms_dat

    ## extract sequences
    ref_seqs = {
        split_extract(k, ' ', 0): v
        for k, v in fasta_to_dict(fasta).items()
    }
    ## translation needs the coding strand, so it also forces revcomp
    orient = adjust_dir or (translate and not complete)
    isoforms_seq = {}
    for isoform, isoform_dat in isoforms_dat.items():
        if not isoform_dat:
            continue
        chrom, strand = get(isoform_dat[0], "chrom", "strand")
        isoform_seq = extract_ranges(ref_seqs[str(chrom)],
                                     get(isoform_dat, "start", "end"))
        if orient and strand == '-':
            isoform_seq = isoform_seq.reverse_complement()
        if translate and not complete:
            isoform_seq = isoform_seq.translate()
        seq_name = (isoform +
                    ("|revcomp" if strand == '-' and orient else '') +
                    ("|complete" if complete else ''))
        isoforms_seq[seq_name] = isoform_seq
    dict_to_fasta(isoforms_seq, fout)
    return