def test_renamed(self):
    """
    Check if sequences in a FASTA file are properly renamed.

    The input FASTA is renamed, the renaming is reversed with a second
    renamer, and the round trip must restore the original sequence
    names. Finally, a name missing from the renaming dictionary must
    raise MissingSeqNameError.
    """
    try:
        renamer = bioformats.seqname.FastaSeqRenamer()
        renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(self.__fasta):
                output_fasta.write(line)

        # perform the reverse renaming with its own renamer instance, so
        # the check does not depend on state of the forward renamer
        rev_renamer = bioformats.seqname.FastaSeqRenamer()
        rev_renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__rev_output, "w") as rev_output_fasta:
            for line in rev_renamer.renamed(self.__output, reverse=True):
                rev_output_fasta.write(line)

        # compare the original and reverse-renamed FASTA files; whole
        # lists are compared because zip() would silently ignore missing
        # or extra sequences
        original_names = list(Fasta(self.__fasta).keys())
        restored_names = list(Fasta(self.__rev_output).keys())
        self.assertEqual(original_names, restored_names)

        # check if the missing sequence exception is raised
        del renamer.renaming_dict["seq2"]
        with self.assertRaises(MissingSeqNameError):
            for _ in renamer.renamed(self.__fasta):
                pass
    finally:
        # remove the temporary outputs even if an assertion failed
        for temp_path in (self.__output, self.__rev_output):
            if os.path.isfile(temp_path):
                os.unlink(temp_path)
def fasta_to_df(fasta: pyfaidx.Fasta) -> pd.DataFrame:
    """Convert the FASTA file from seqextractor to a Pandas DataFrame

    Each record name is split on "_" into six fields (gene name, feature
    id, strand, start, stop, sequence hash); the record sequence and its
    reverse complement are added as two further columns.

    Parameters
    ----------
    fasta : :class:`pyfaidx.Fasta`
        Parsed FASTA with sequences to examine for spacers that needs
        to be converted to a Pandas DataFrame

    Returns
    -------
    :class:`pd.DataFrame`
        One row per FASTA record.
    """
    record_ids = list(fasta.keys())

    # one list of six header fields per record
    header_fields = [fasta[record_id].name.split("_") for record_id in record_ids]
    column_names = ["gene_name", "feature_id", "strand", "start", "stop", "seq_hash"]
    df = pd.DataFrame(header_fields, columns=column_names)

    column_types = {
        "feature_id": "category",
        "gene_name": "category",
        "strand": "category",
        "start": np.uint32,
        "stop": np.uint32,
        "seq_hash": np.int32,
    }
    df = df.astype(column_types, copy=False)

    forward = [fasta[record_id][:].seq for record_id in record_ids]
    reverse = [
        fasta[record_id][:].reverse.complement.seq for record_id in record_ids
    ]
    df["sequence"] = pd.Series(forward)
    df["reverse_complement"] = pd.Series(reverse)
    return df
def test_renamed(self):
    """
    Check NCBI sequence renaming for every pair of source/target formats,
    the error paths for unknown formats and incorrect accession
    dictionaries, and the removal of sequence versions.
    """
    def write_renamed(seq_renamer, source_path):
        # write the renamed FASTA produced by seq_renamer to the output file
        with open(self.__output, 'w') as handle:
            for renamed_line in seq_renamer.renamed(source_path):
                handle.write(renamed_line)

    def drop_indices(*fasta_paths):
        # remove existing .fai index files of the given FASTA files
        for fasta_path in fasta_paths:
            index_path = fasta_path + '.fai'
            if os.path.isfile(index_path):
                os.unlink(index_path)

    all_formats = self.__formats
    for src_fmt, dst_fmt in itertools.product(all_formats[:-1], all_formats):
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for acc_file in self.__acc_num_files:
            renamer.read_ncbi_acc_num(acc_file, src_fmt, dst_fmt)
        # convert sequence IDs
        input_file = os.path.join(self.__test_dir, 'ncbi_' + src_fmt + '.fa')
        write_renamed(renamer, input_file)
        example_file = os.path.join(self.__test_dir, 'ncbi_' + dst_fmt + '.fa')
        drop_indices(self.__output, example_file)
        output_fasta = Fasta(self.__output)
        example_fasta = Fasta(example_file)
        # compare the obtained file to the example
        self.assertEqual(output_fasta.keys(), example_fasta.keys())

    # test for an incorrect format
    # NOTE(review): the argument order here (format, format, path) differs
    # from the (file, from, to) order used in the loop above - confirm that
    # these calls raise for the intended reason
    refseq_fasta = os.path.join(self.__test_dir, 'ncbi_chr_refseq.fa')
    with self.assertRaises(SeqRenameError):
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        renamer.read_ncbi_acc_num('unknown', 'chr_refseq', refseq_fasta)
    with self.assertRaises(SeqRenameError):
        renamer.read_ncbi_acc_num('chr_refseq', 'unknown', refseq_fasta)

    # test for an incorrect NCBI accession number dictionary
    with self.assertRaises(IncorrectDictError):
        renamer.read_ncbi_acc_num(
            self.__chr_incorrect, 'refseq_full', 'chr_refseq')

    # check if sequence versions are removed
    renamer = bioformats.seqname.NcbiFastaSeqRenamer()
    for acc_file in self.__acc_num_files:
        renamer.read_ncbi_acc_num(
            acc_file, 'chr', 'genbank', remove_seq_version=True)
    input_file = os.path.join(self.__test_dir, 'ncbi_chr.fa')
    example_file = os.path.join(self.__test_dir, 'ncbi_genbank_nover.fa')
    write_renamed(renamer, input_file)
    drop_indices(self.__output, example_file)
    output_fasta = Fasta(self.__output)
    example_fasta = Fasta(example_file)
    self.assertEqual(output_fasta.keys(), example_fasta.keys())

    os.unlink(example_file + '.fai')
def test_renamed(self):
    """
    Check if sequences in a FASTA file are properly renamed.

    Forward renaming followed by reverse renaming must give back the
    original sequence names, and a sequence name absent from the
    renaming dictionary must raise MissingSeqNameError.
    """
    temporary_files = (self.__output, self.__rev_output)
    try:
        renamer = bioformats.seqname.FastaSeqRenamer()
        renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__output, 'w') as output_fasta:
            for line in renamer.renamed(self.__fasta):
                output_fasta.write(line)

        # perform the reverse renaming; the dedicated reverse renamer is
        # the one that is used, so it is no longer a dead variable
        rev_renamer = bioformats.seqname.FastaSeqRenamer()
        rev_renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__rev_output, 'w') as rev_output_fasta:
            for line in rev_renamer.renamed(self.__output, reverse=True):
                rev_output_fasta.write(line)

        # compare the original and reverse-renamed FASTA files as whole
        # lists: a pairwise zip() stops at the shorter file and would
        # not notice lost sequences
        original_fasta = Fasta(self.__fasta)
        rev_renamed_fasta = Fasta(self.__rev_output)
        self.assertEqual(list(original_fasta.keys()),
                         list(rev_renamed_fasta.keys()))

        # check if the missing sequence exception is raised
        del renamer.renaming_dict['seq2']
        with self.assertRaises(MissingSeqNameError):
            for _ in renamer.renamed(self.__fasta):
                pass
    finally:
        # clean up regardless of the test outcome
        for temporary_file in temporary_files:
            if os.path.isfile(temporary_file):
                os.unlink(temporary_file)
class GenVCF:
    """Pair a reference FASTA with a VCF file and collect variant tables.

    ``keys`` holds the VCF sequence names that also exist in the
    reference; if there are none, all reference names are used.
    """

    def __init__(self, ref_fasta_path, vcf_path, kmer_size, nprocs):
        """Open the reference and the VCF and work out the shared names.

        :param ref_fasta_path: path to the reference FASTA file
        :param vcf_path: path to the VCF file
        :param kmer_size: k-mer size forwarded to the region workers
        :param nprocs: number of worker processes; 0 means "use all CPUs"
        """
        self.vcf_path = vcf_path
        self.fasta_path = ref_fasta_path
        self.ref = Fasta(ref_fasta_path)
        self.vcf = VCF(vcf_path)
        self.kmer_size = kmer_size
        self.nprocs = nprocs
        self.keys = [c for c in self.vcf.seqnames if c in self.ref.keys()]
        self.directory = None
        if not self.keys:
            self.keys = self.ref.keys()
            print('No common keys found. Using reference.')

    def set_destination(self, path):
        """Set the parent directory passed to kt.prepare_directory()."""
        self.directory = path

    def full_string(self):
        """Return the sequences of all selected chromosomes concatenated."""
        # join() avoids the quadratic cost of repeated string +=
        return "".join(str(self.ref[chrom]) for chrom in self.keys)

    def get_kmer_frequency(self, klen):
        """Return the k-mer counter of the concatenated reference."""
        # returns a counter
        return kt.get_kmer_count(self.full_string(), klen)

    def vcf_scan(self):
        """Process the VCF in parallel regions and save the merged tables.

        Every worker result is a dict; the values under the keys 'all',
        'singletons', 'not_singletons' and 'mismatches' are gathered
        into labelled lists which are handed to merge_and_save().
        Returns None.
        """
        if self.nprocs == 0:
            self.nprocs = mp.cpu_count()
        regions = kt.get_split_vcf_regions(self.vcf_path, self.nprocs)
        args = [[region, self.vcf_path, self.fasta_path, self.kmer_size]
                for region in regions]
        # the context manager releases the workers even if a task fails
        with mp.Pool(self.nprocs) as pool:
            region_results = pool.starmap(process_region, args)

        # the first element of every list is its label
        all_vars = ['all_vars']
        singletons = ['singleton_vars']
        not_singletons = ['notsingleton_vars']
        all_transitions = ['all_transitions']
        singleton_transitions = ['singleton_transitions']
        notsingleton_transitions = ['notsingleton_transitions']
        mismatches = ['mismatches']

        for result in region_results:
            for key, value in result.items():
                if key == 'all':
                    all_transitions.append(value[0])
                    all_vars.append(value[1])
                elif key == 'singletons':
                    singleton_transitions.append(value[0])
                    singletons.append(value[1])
                elif key == 'not_singletons':
                    notsingleton_transitions.append(value[0])
                    not_singletons.append(value[1])
                elif key == 'mismatches':
                    mismatches.append(value[0])

        all_results = [all_vars, singletons, not_singletons,
                       all_transitions, singleton_transitions,
                       notsingleton_transitions, mismatches]
        destination = kt.prepare_directory(parent=self.directory)
        for res in all_results:
            merge_and_save(res, destination)
        return
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Filter fasta file based on regex.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.

    outfa : str
        Filename of output fasta file. Cannot be the same as infa.

    regex : str, optional
        Regular expression used for selecting sequences.

    v : bool, optional
        If set to True, select all sequence *not* matching regex.

    force : bool, optional
        If set to True, overwrite outfa if it already exists.

    Returns
    -------
    fasta : Fasta instance
        pyfaidx Fasta instance of newly created file

    Raises
    ------
    ValueError
        If infa and outfa are the same, if outfa exists and force is
        not set, or if no sequence is left after filtering.
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    if os.path.exists(outfa):
        if not force:
            raise ValueError(
                "{} already exists, set force to True to overwrite".format(
                    outfa))
        # overwrite requested: drop the old file and its index
        os.unlink(outfa)
        old_index = outfa + ".fai"
        if os.path.exists(old_index):
            os.unlink(old_index)

    matcher = re.compile(regex).search
    source = Fasta(infa, filt_function=matcher)
    selected = source.keys()
    if v:
        # inverted selection: everything that did not match the regex
        unfiltered = Fasta(infa)
        selected = [name for name in unfiltered.keys() if name not in selected]
        source = unfiltered

    if len(selected) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as out:
        for name in selected:
            record = source[name]
            out.write(">{}\n".format(record.name))
            out.write("{}\n".format(record[:].seq))

    return Fasta(outfa)
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Write the sequences of a FASTA file selected by a regex to a new file.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.

    outfa : str
        Filename of output fasta file. Cannot be the same as infa.

    regex : str, optional
        Regular expression used for selecting sequences.

    v : bool, optional
        If set to True, select all sequence *not* matching regex.

    force : bool, optional
        If set to True, overwrite outfa if it already exists.

    Returns
    -------
    fasta : Fasta instance
        pyfaidx Fasta instance of newly created file
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    output_exists = os.path.exists(outfa)
    if output_exists and not force:
        raise ValueError(
            "{} already exists, set force to True to overwrite".format(outfa))
    if output_exists:
        os.unlink(outfa)
        # the index is only cleaned up together with the FASTA file
        if os.path.exists(outfa + ".fai"):
            os.unlink(outfa + ".fai")

    fa = Fasta(infa, filt_function=re.compile(regex).search)
    seqs = fa.keys()
    if v:
        matching = seqs
        fa = Fasta(infa)
        seqs = [seq_id for seq_id in fa.keys() if seq_id not in matching]

    if len(seqs) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as handle:
        for seq_id in seqs:
            handle.write(">{}\n".format(fa[seq_id].name))
            handle.write("{}\n".format(fa[seq_id][:].seq))

    return Fasta(outfa)
def get_sequence_fasta(region, reference=None, padding=True):
    """Return the sequence of a region from an indexed FASTA file.

    The chromosome name of the region is adapted to the naming of the
    reference: a "chr" prefix is removed or added depending on whether
    the first reference sequence name contains "chr".

    :param region: object providing chr, start, stop, start_w_padding
        and stop_w_padding attributes
    :param reference: path of the reference FASTA file
    :param padding: use the padded coordinates when True
    :return: the sequence as a string
    """
    ref = Fasta(reference)
    first_ref_name = list(ref.keys())[0]
    ref_uses_chr = "chr" in first_ref_name
    region_uses_chr = "chr" in region.chr

    if region_uses_chr and not ref_uses_chr:
        chrom = region.chr.split("chr")[1]
    elif ref_uses_chr and not region_uses_chr:
        chrom = "chr" + region.chr
    else:
        chrom = region.chr

    if padding:
        return ref[chrom][region.start_w_padding:region.stop_w_padding].seq
    return ref[chrom][region.start:region.stop].seq
def write_sequence(args):
    """Fetch the requested regions and write them to the selected output.

    Regions come from split_regions(args); when none are given, every
    sequence passing the --regex filter is written. With invert_match
    the named sequences are excluded instead. Output goes to one file
    per region (split_files), to args.out, or to stdout.

    Raises FetchError (with a hint about --lazy) when a region cannot
    be fetched. Opened files and the FASTA handle are released even
    when an error occurs.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta,
                  default_seq=args.default_seq,
                  strict_bounds=not args.lazy,
                  split_char=args.delimiter,
                  filt_function=filt_function,
                  rebuild=not args.no_rebuild)
    try:
        regions_to_fetch, split_function = split_regions(args)
        if not regions_to_fetch:
            regions_to_fetch = fasta.keys()
        if args.invert_match:
            sequences_to_exclude = set(split_function(region)[0]
                                       for region in regions_to_fetch)
            # release the filtered handle before reopening without filter
            fasta.__exit__()
            fasta = Fasta(args.fasta,
                          default_seq=args.default_seq,
                          strict_bounds=not args.lazy,
                          split_char=args.delimiter,
                          rebuild=not args.no_rebuild)
            regions_to_fetch = (key for key in fasta.keys()
                                if key not in sequences_to_exclude)
            split_function = ucsc_split

        header = False
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.size_range:
                if start is not None and end is not None:
                    sequence_len = end - start
                else:
                    sequence_len = len(fasta[name])
                if (args.size_range[0] > sequence_len
                        or args.size_range[1] < sequence_len):
                    continue
            if args.split_files:
                # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename
                                   if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            elif args.out:
                outfile = args.out
            else:
                outfile = sys.stdout
            try:
                if args.transform:
                    if not header and args.transform == 'nucleotide':
                        outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                        header = True
                    outfile.write(
                        transform_sequence(args, fasta, name, start, end))
                else:
                    for line in fetch_sequence(args, fasta, name, start, end):
                        outfile.write(line)
            except FetchError as e:
                # leading space keeps the hint apart from the original text
                raise FetchError(e.msg.rstrip() + " Try setting --lazy.\n")
            finally:
                # only files opened here are closed; args.out and stdout
                # belong to the caller
                if args.split_files:
                    outfile.close()
    finally:
        fasta.__exit__()
def write_sequence(args):
    """Fetch the requested regions and write them to the selected output.

    Same flow as a plain region export, with record keys derived by the
    function given in args.header_function. Regions come from
    split_regions(args); without regions every sequence passing the
    --regex filter is written, and invert_match excludes the named
    sequences instead. Raises FetchError with a hint about --lazy when
    a region cannot be fetched. Files opened for split output and the
    FASTA handle are released even on error.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    # NOTE(security): eval() runs arbitrary code taken from the command
    # line; acceptable only because the invoking user is trusted. Do not
    # feed this from untrusted input.
    key_function = eval(args.header_function)
    fasta = Fasta(args.fasta,
                  default_seq=args.default_seq,
                  key_function=key_function,
                  strict_bounds=not args.lazy,
                  split_char=args.delimiter,
                  filt_function=filt_function,
                  rebuild=not args.no_rebuild)
    try:
        regions_to_fetch, split_function = split_regions(args)
        if not regions_to_fetch:
            regions_to_fetch = fasta.keys()
        if args.invert_match:
            sequences_to_exclude = set(split_function(region)[0]
                                       for region in regions_to_fetch)
            # release the filtered handle before reopening without filter
            fasta.__exit__()
            fasta = Fasta(args.fasta,
                          default_seq=args.default_seq,
                          key_function=eval(args.header_function),
                          strict_bounds=not args.lazy,
                          split_char=args.delimiter,
                          rebuild=not args.no_rebuild)
            regions_to_fetch = (key for key in fasta.keys()
                                if key not in sequences_to_exclude)
            split_function = ucsc_split

        header = False
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.size_range:
                if start is not None and end is not None:
                    sequence_len = end - start
                else:
                    sequence_len = len(fasta[name])
                if (args.size_range[0] > sequence_len
                        or args.size_range[1] < sequence_len):
                    continue
            if args.split_files:
                # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename
                                   if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            elif args.out:
                outfile = args.out
            else:
                outfile = sys.stdout
            try:
                if args.transform:
                    if not header and args.transform == 'nucleotide':
                        outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                        header = True
                    outfile.write(
                        transform_sequence(args, fasta, name, start, end))
                else:
                    for line in fetch_sequence(args, fasta, name, start, end):
                        outfile.write(line)
            except FetchError as e:
                raise FetchError(str(e) + " Try setting --lazy.\n")
            finally:
                # only files opened here are closed; args.out and stdout
                # belong to the caller
                if args.split_files:
                    outfile.close()
    finally:
        fasta.__exit__()
def prepare_reference_dict(fasta_path, variants, delim='\t', primary_chroms=True, nprocs=6,
                           final_dir='/uufs/chpc.utah.edu/common/home/u0319040/longo_scratch/output/chroms/'):
    """
    Process every selected chromosome of the reference in a worker pool.

    :param fasta_path: path to file containing reference sequence
    :param variants: path to bed file containing the variants in the reference sequence
        (currently unused by this function; kept for interface compatibility)
    :param delim: indicates how your variant file is separated (currently unused)
    :param primary_chroms: boolean True means only include original autosomal chromosomes,
        False includes everything
    :param nprocs: number of CPUs to use
    :param final_dir: directory passed to every process_chrom call as its second argument;
        the default keeps the previously hard-coded location
    :return: the directory created by prepare_directory(); the workers are given this
        directory and are expected to write one file per chromosome
    """
    start = time.time()
    # opened up front although only needed for the key list below
    # NOTE(review): presumably this also makes sure the .fai index exists
    # before the workers start - confirm
    fa = Fasta(fasta_path)
    if primary_chroms:
        keys = get_primary_chroms_grch38(fasta_path)
    else:
        keys = fa.keys()
    directory = prepare_directory(new_folder='./ref_var_dict/')
    args = [(key, final_dir, fasta_path, directory) for key in keys]
    # the context manager releases the workers even if a task fails;
    # the return values of the workers are not used
    with mp.Pool(nprocs) as pool:
        pool.starmap(process_chrom, args)
    print('Done processing variants in %f' % (time.time() - start), flush=True)
    return directory
def calc_bkgd_counts(fasta_filename, region_size_min, region_size_max,
                     ignore_chroms, only_chroms, verbose):
    ''' calculate nuc frequencies for normalization.

    Every window of each size from region_size_min to region_size_max
    (inclusive) is counted at every position of the selected
    chromosomes. Windows cut short by the end of a sequence are removed
    afterwards.

    Args:
        fasta_filename: path of the FASTA file
        region_size_min: smallest window size
        region_size_max: largest window size
        ignore_chroms: chromosome names to skip
        only_chroms: if non-empty, only these chromosomes are used
        verbose: unused here, kept for interface compatibility

    Returns: dict of nucleotide frequencies, keyed by region size and
    then by window sequence.
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename, as_raw=True)

    for chrom in fasta.keys():

        # skip data based on specified chromosomes
        if chrom in ignore_chroms:
            continue

        if only_chroms and chrom not in only_chroms:
            continue

        record = fasta[chrom]
        seq_len = len(record)

        for idx in range(seq_len + 1):
            for region_size in range(region_size_min, region_size_max + 1):
                nucs = record[idx:idx + region_size]
                nuc_counts[region_size][nucs] += 1

    # remove entries that are not equal to region_size; the keys to drop
    # are collected first because removing entries while iterating over
    # the same dict raises RuntimeError on Python 3
    for region_size, nuc_dict in nuc_counts.items():
        truncated = [nuc for nuc in nuc_dict if len(nuc) != region_size]
        for nuc in truncated:
            del nuc_dict[nuc]

    return nuc_counts
def calc_bkgd_counts(fasta_filename, region_size_min, region_size_max,
                     ignore_chroms, only_chroms, verbose):
    ''' calculate nuc frequencies for normalization.

    Counts every full-length window of each size between
    region_size_min and region_size_max (inclusive) on the selected
    chromosomes. ``verbose`` is accepted for interface compatibility
    and is not used.

    Returns: dict of nucleotide frequencies (region size -> Counter of
    window sequences).
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename, as_raw=True)

    region_sizes = range(region_size_min, region_size_max + 1)

    for chrom in fasta.keys():

        # skip data based on specified chromosomes
        if chrom in ignore_chroms:
            continue

        if only_chroms and chrom not in only_chroms:
            continue

        record = fasta[chrom]
        seq_len = len(record)

        for idx in range(seq_len + 1):
            for region_size in region_sizes:
                # looking the counter up first keeps an (possibly empty)
                # entry per region size, as before
                size_counts = nuc_counts[region_size]
                nucs = record[idx:idx + region_size]
                # windows truncated by the sequence end are never counted;
                # the former clean-up pass deleted them while iterating
                # over the dict, which raises RuntimeError on Python 3
                if len(nucs) == region_size:
                    size_counts[nucs] += 1

    return nuc_counts
def test_split_seq(self):
    """ Fetch sequence by blocks

    The blocks of the first BED12 record are converted to intervals and
    the spliced, reverse-complemented sequence must equal the first
    record of the gene FASTA.
    """
    fa = Fasta('data/chr17.hg19.part.fa')
    gene = Fasta("data/gene.bed12.fasta")
    first_gene = list(gene.keys())[0]
    expect = gene[first_gene][:].seq

    bed = "data/gene.bed12"
    with open(bed) as fi:
        record = fi.readline().strip().split("\t")

    chrom = record[0]
    gene_start = int(record[1])
    strand = record[5]

    # parse bed12 format (block lists end with a trailing comma)
    block_offsets = [int(field) for field in record[11].split(",")[:-1]]
    block_sizes = [int(field) for field in record[10].split(",")[:-1]]
    block_starts = [gene_start + offset for offset in block_offsets]
    block_ends = [begin + size for begin, size in zip(block_starts, block_sizes)]

    # bed half-open
    if strand == "-":
        block_starts = [begin + 1 for begin in block_starts]
    else:
        block_ends = [finish - 1 for finish in block_ends]

    intervals = zip(block_starts, block_ends)
    result = fa.get_spliced_seq(chrom, intervals, rc=True)
    print(result.seq)
    print("====")
    print(expect)

    assert result.seq == expect
def pairwise_align(folder, work_dir):
    """Align the first sequence of every .fasta file against the others.

    For each ``*.fasta`` file in ``folder`` the file is normalised with
    reformat.sh into ``temp.fa`` (current directory), its first sequence
    is written to ``<work_dir>/main.fa``, the remaining sequences to
    ``<work_dir>/temp.fa``, and lastz writes the alignment to
    ``<work_dir>/<n>.maf`` where n is the position of the file in the
    directory listing.

    External commands are run without a shell, so file names containing
    spaces or shell metacharacters are passed through unchanged.
    """
    try:
        os.mkdir(work_dir)
    except OSError:
        # usually "already exists"; any other problem shows up when the
        # output files are opened below
        pass

    fastas = [
        folder + '/' + entry for entry in os.listdir(folder)
        if entry.endswith('.fasta')
    ]
    main_fa = '%s/main.fa' % work_dir
    others_fa = '%s/temp.fa' % work_dir

    for count, fasta in enumerate(fastas):
        # -f: a missing index (first iteration) must not prevent the
        # reformat step, as "rm temp.fa.fai && ..." used to do
        subprocess.call(['rm', '-f', 'temp.fa.fai'])
        subprocess.call(['reformat.sh', 'in=%s' % fasta, 'out=temp.fa',
                         'addunderscore', 'overwrite=true'])

        # keys() is not indexable on Python 3, so take a list
        seqs = list(Fasta('temp.fa').keys())

        with open(main_fa, 'w') as out:
            subprocess.call(['samtools', 'faidx', 'temp.fa', seqs[0]],
                            stdout=out)
        with open(others_fa, 'w') as out:
            status = subprocess.call(
                ['samtools', 'faidx', 'temp.fa'] + seqs[1:], stdout=out)
        # lastz only runs when the extraction succeeded (former "&&")
        if status == 0:
            with open('%s/%d.maf' % (work_dir, count), 'w') as out:
                subprocess.call(
                    ['lastz', '--format=maf', main_fa, others_fa],
                    stdout=out)
class TestFeatureKeyFunction:
    """Tests for FASTA access when record keys come from a key function."""

    def __init__(self):
        # both readers map the record names with get_gene_name
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys are the names produced by the key function."""
        expect = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4',
            'NM_000465.3', 'NM_001282543.1', 'NM_001282545.1',
            'NM_001282548.1', 'NM_001282549.1',
            'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        assert sorted(self.genes.keys()) == expect

    def test_key_function_by_dictionary_get_key(self):
        """Slicing a record looked up by its mapped key."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        region = self.genes['MDM4'][99:150]
        assert str(region) == expect

    def test_key_function_by_fetch(self):
        """Fetching by mapped key through the Faidx interface."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        region = self.faidx.fetch('MDM4', 100, 150)
        assert str(region) == expect

    @raises(ValueError)
    def test_duplicated_keys(self):
        """Opening with get_duplicated_gene_name must raise ValueError."""
        genes = Fasta(self.fasta, key_function=get_duplicated_gene_name)
def get_refseq(ref):
    """Return the sequence of the last record of a FASTA file as a string.

    For a single-record reference this is simply its sequence. If the
    file holds several records, the last one is returned (as before);
    records that would be discarded are no longer read. Returns None
    when the file has no records.
    """
    fa = Fasta(ref)
    genome_ids = list(fa.keys())
    if not genome_ids:
        return None
    return str(fa[genome_ids[-1]])
def remakeProt(fasta, outfile, idfile, id):
    """Write the longest isoform of every gene under a numbered name.

    Record names are split into gene and isoform with stripName(); for
    each gene the longest record wins (the first one on ties). Each
    selected record is written to ``outfile`` as ``>{id}_{n}`` and the
    mapping ``{id}_{n}: gene`` is appended to ``idfile``.
    """
    fasta_index = Fasta(fasta)

    # gene -> (length, isoform) of the longest record seen so far
    longest = {}
    for protein in fasta_index.keys():
        size = len(fasta_index[protein])
        gene, isoform = stripName(protein)
        current = longest.get(gene)
        if current is None or size > current[0]:
            longest[gene] = (size, isoform)

    with open(outfile, "w") as seq_out, open(idfile, "a") as id_out:
        for number, (gene, (_, isoform)) in enumerate(longest.items()):
            # try the "_P" name first, then fall back to the bare gene
            # name or the "_T" name
            name = gene + "_P" + isoform
            if name not in fasta_index:
                name = gene if gene == isoform else gene + "_T" + isoform
            id_out.write("{}_{}: {}\n".format(id, number, gene))
            seq_out.write(">{}_{}\n".format(id, number))
            for line in fasta_index[name]:
                seq_out.write("{}\n".format(str(line)))
class FastaChunkReader:
    """Iterate over a FASTA file in overlapping chunks.

    Each item is a dict with the sequence name, the start and end of
    the chunk and the chunk sequence. Consecutive chunks of one
    sequence overlap by ``kmer_size`` positions. The FASTA file is
    closed when the iteration is exhausted.
    """

    def __init__(self, filename, chunk_size=10000, kmer_size=31):
        self.fasta = Fasta(filename)
        self.seqnames = list(self.fasta.keys())
        self.chunk_size = chunk_size
        self.kmer_size = kmer_size
        # position of the next chunk: sequence index and start offset
        self.current_ref = 0
        self.current_start = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.current_ref == len(self.seqnames):
            self.fasta.close()
            raise StopIteration

        seqname = self.seqnames[self.current_ref]
        record = self.fasta[seqname]
        seq_len = len(record)

        start = self.current_start
        end = start + self.chunk_size
        if end >= seq_len:
            # last chunk of this sequence: clip and move to the next one
            end = seq_len
            self.current_start = 0
            self.current_ref += 1
        else:
            # step back so that the next chunk overlaps this one
            self.current_start = end - self.kmer_size

        return {
            "seqname": seqname,
            "start": start,
            "end": end,
            "seq": record[start:end].seq,
        }
def write_sequence(args):
    """Fetch the requested regions and write them out.

    Regions come from split_regions(args); without regions every
    sequence of the file is written. Output goes to one file per region
    when args.split_files is set, otherwise to stdout. Raises
    FetchError with a hint about --lazy when a region cannot be
    fetched. Files opened for split output and the FASTA handle are
    released even on error.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    fasta = Fasta(args.fasta,
                  default_seq=args.default_seq,
                  strict_bounds=not args.lazy,
                  split_char=args.delimiter)
    try:
        regions_to_fetch, split_function = split_regions(args)
        if not regions_to_fetch:
            regions_to_fetch = tuple(fasta.keys())

        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.split_files:
                # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename
                                   if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            else:
                outfile = sys.stdout
            try:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
            except FetchError as e:
                # leading space keeps the hint apart from the original text
                raise FetchError(e.msg.rstrip() + " Try setting --lazy.\n")
            finally:
                # stdout is never closed here
                if args.split_files:
                    outfile.close()
    finally:
        fasta.__exit__()
def main(options):
    """Print a per-base strand annotation for every reference sequence.

    For each chromosome a FASTA-like record is printed to stdout: the
    header line and one string with a symbol per base taken from MAP,
    indexed by 1 * (covered on "+") + 2 * (covered on "-"). Progress
    messages go to stderr. Transcripts are looked up under the name
    "chr" + chromosome.
    """
    transcripts = read_strand_file(options.strand)
    ref = Fasta(options.ref)
    for chrom in ref.keys():
        print(chrom, file=sys.stderr)
        print(">" + chrom)

        chrom_len = len(ref[chrom])
        plus = np.array([False] * chrom_len)
        minus = np.array([False] * chrom_len)

        for ti, transcript in enumerate(transcripts["chr" + chrom]):
            if ti % 1000 == 0:
                print("\r" + chrom + ":trans" + str(ti), file=sys.stderr)
            strand = transcript[0]
            if strand == "+":
                plus[transcript[1]:transcript[2]] = True
            elif strand == "-":
                minus[transcript[1]:transcript[2]] = True

        print(chrom + ":writing", file=sys.stderr)
        strand_codes = 1 * plus + 2 * minus
        print("".join(MAP[strand_codes]))
        print(chrom + ":done", file=sys.stderr)
def flaimapper(self, settings, first_task):
    '''
    Make bam files and fragment them. Use Flaimapper on fragmented bam.
    Combine Flaimapper output and count reads using Bedtools intersect.

    The per-library work is done by self.flaimapper_by_library, which is
    run for every library (in sorted order) in a process pool of
    settings["CPUs"] workers. Exceptions raised in a worker are
    re-raised here.
    '''
    overlap = settings["overlap_range"]
    size_range = settings["size_range"]

    # get chrom lengths
    genome = Fasta(settings["genome"], one_based_attributes=False)
    genome_lengths = {chrom: len(genome[chrom]) for chrom in genome.keys()}

    # Alternative values that allow 2bp non-overlapping pp up to length 1418:
    #   overlap    = [0.888, 0.916, 0.939, 0.957, 0.97, 0.979, 0.985, 0.9898,
    #                 0.9931, 0.9953, 0.99685, 0.99789]
    #   size_range = [24, 33, 47, 67, 97, 140, 197, 292, 432, 636, 950]
    # (size_range has to be shorter by 1 than overlap)

    pool = mp.Pool(processes=settings["CPUs"])
    results = []
    for library in sorted(settings["libraries"]):
        task_args = (settings, library, overlap, size_range,
                     genome_lengths, first_task)
        results.append(
            pool.apply_async(self.flaimapper_by_library, args=task_args))
    pool.close()
    pool.join()

    # get() re-raises any exception of the corresponding worker
    for r in results:
        r.get()
def test_split_seq(self):
    """ Fetch sequence by blocks

    Builds the block intervals of the first BED12 record and checks the
    spliced reverse-complement sequence against the gene FASTA.
    """
    fa = Fasta('data/chr17.hg19.part.fa')
    gene = Fasta("data/gene.bed12.fasta")
    expect = gene[list(gene.keys())[0]][:].seq

    with open("data/gene.bed12") as fi:
        fields = fi.readline().strip().split("\t")
    chrom, strand = fields[0], fields[5]
    tx_start = int(fields[1])

    # parse bed12 format: column 11 holds block sizes, column 12 block
    # starts relative to the record start; both end with a comma
    sizes = [int(x) for x in fields[10].split(",")[:-1]]
    starts = [tx_start + int(x) for x in fields[11].split(",")[:-1]]
    ends = [block_start + block_size
            for block_start, block_size in zip(starts, sizes)]

    # bed half-open
    minus_strand = strand == "-"
    if minus_strand:
        starts = [block_start + 1 for block_start in starts]
    else:
        ends = [block_end - 1 for block_end in ends]

    result = fa.get_spliced_seq(chrom, zip(starts, ends), rc=True)
    print(result.seq)
    print("====")
    print(expect)

    assert result.seq == expect
def prepare_reference_dict(fasta_path, variants, delim='\t', primary_chroms=True):
    """
    Combine reference and variant information per chromosome and write one CSV per chromosome.

    :param fasta_path: path to file containing reference sequence
    :param variants: path to bed file containing the variants in the reference sequence
    :param delim: indicates how your variant file is separated
    :param primary_chroms: boolean True means only include original autosomal chromosomes,
        False includes everything
    :return: the directory (from prepare_directory) holding the written '<chromosome>.csv' files;
        each file has the header 'REF_fasta, ALT, REF_vcf' (tab separated) followed by one
        tab separated line per record returned by zip_chrom
    """
    start = time.time()
    fa = Fasta(fasta_path)
    var_df = pd.read_csv(variants, sep=delim)
    if primary_chroms:
        keys = get_primary_chroms(fasta_path)
    else:
        keys = fa.keys()
    directory = prepare_directory(new_folder='./ref_var_dict/')
    # every worker gets only the variant rows of its own chromosome
    args = [(key, var_df[var_df.iloc[:, 0] == key], fasta_path, directory)
            for key in keys]
    # the context manager releases the workers even if a task fails
    with mp.Pool(mp.cpu_count()) as pool:
        chrom_results = pool.starmap(zip_chrom, args)
    print('Done processing variants in %f' % (time.time() - start))
    directory = prepare_directory(parent='./ref_var_dict/')
    for chrom_key in chrom_results:
        with open(directory + chrom_key[0] + '.csv', 'w') as fp:
            # the header needs its own line; without the newline the
            # first record was appended to the header line
            fp.write('REF_fasta\tALT\tREF_vcf\n')
            for rec in chrom_key[1]:
                for i in rec:
                    fp.write(str(i) + '\t')
                fp.write('\n')
    print('Reference dictionary prepared in %f' % (time.time() - start))
    return directory
def parse_fasta(f, output, window_size, chromosome, line_len):
    """Split every record of a FASTA file into fixed-size window files.

    Window i of each record is appended to
    ``output/<chromosome>/<chromosome>_<start>_<end>.fasta`` as a record
    named after the source header, wrapped to ``line_len`` characters.
    While the first record is processed, window files that already
    exist are deleted first, so each window file ends up holding one
    entry per input record.

    ``output`` must support the ``/`` operator and ``is_file()`` on the
    result (e.g. a pathlib.Path).
    """
    # ---- Load in input information ----
    fasta_file = Fasta(str(f))
    first_record = True
    for header in list(fasta_file.keys()):
        print(
            f"Parsing out {header} into {window_size} base pair windows")
        sequence = fasta_file[str(header)][:].seq
        window_count = len(sequence) // window_size + 1
        for window_index in range(window_count):
            start_pos = window_index * window_size
            end_pos = start_pos + window_size
            current_file_path = output / f"{chromosome}/{chromosome}_{start_pos}_{end_pos}.fasta"
            # stale files are only removed during the first record
            if first_record and current_file_path.is_file():
                os.remove(current_file_path)
            with open(current_file_path, 'a') as current_file:
                wrapped = textwrap.wrap(sequence[start_pos:end_pos], line_len)
                # an empty trailing window still creates (touches) its file
                if len(wrapped) != 0:
                    current_file.write(">{}\n".format(header))
                    current_file.write("{}\n".format("\n".join(wrapped)))
        first_record = False
    return
def locate(args):
    """Report every occurrence of the given k-mers in a sequence database.

    ``args.kmer`` is a comma separated list of k-mers; both the k-mers
    and their reverse complements are searched. ``args.db`` is the FASTA
    database, ``args.fg`` an optional file whose first column lists the
    sequence ids to search (a line starting with 'gid' is treated as
    header, ids missing from the database are skipped); with an empty
    ``args.fg`` all sequences are searched. Matches are written to
    ``args.out`` as a tab separated table with 1-based start
    coordinates; strand is "+" when the matched text is one of the given
    k-mers and "-" otherwise.

    NOTE: the k-mers are inserted into the regular expression as given,
    so regex metacharacters in them are interpreted as such.
    """
    kmers, fd, fo = args.kmer, args.db, args.out
    fg = args.fg
    db = Fasta(fd)

    kseqs = kmers.split(',')
    kseqs2 = [Sequence(name='kmer', seq=kseq).reverse.complement.seq
              for kseq in kseqs]
    ptn = re.compile("|".join(["(" + k + ")" for k in kseqs + kseqs2]))
    forward_kmers = set(kseqs)

    if fg != '':
        seqs = []
        with open(fg, 'r') as fhg:
            for line in fhg:
                fields = line.rstrip("\n").split()
                # skip empty and whitespace-only lines
                if not fields:
                    continue
                gid = fields[0]
                if gid == 'gid':
                    continue
                if gid not in db:
                    continue
                seqs.append(gid)
    else:
        seqs = db.keys()

    with open(fo, 'w') as fho:
        fho.write('kmer\tsid\tstart\tend\tsrd\n')
        for seqid in seqs:
            seq = db[seqid][0:].seq
            for m in ptn.finditer(seq):
                start, end = m.start() + 1, m.end()
                srd = "+" if m.group(0) in forward_kmers else "-"
                fho.write(f"{m.group(0)}\t{seqid}\t{start}\t{end}\t{srd}\n")
def get_fasta_length(filename):
    """Get length of reference sequence (the first record of the file)"""
    refseq = Fasta(filename)
    first_name = list(refseq.keys())[0]
    return len(refseq[first_name])
def __init__(self, assembly, data_manager):
    """Locate and open the FASTA file of an assembly.

    The assembly name is normalised by removing '.' and '_'; any name
    starting with 'R6' is mapped to 'R627'. The FASTA configuration of
    ``data_manager`` is searched for a sub type with that name. If none
    is found, a warning is logged and the process exits with status 3.

    If the opened FASTA has no sequences, the index file is deleted and
    the file is reopened every 6 seconds until sequences are present.
    NOTE(review): presumably this waits for a file that is still being
    written; the loop has no upper bound - confirm this is intended.
    """
    sub_type = assembly.replace('.', '').replace('_', '')
    if sub_type.startswith('R6'):
        sub_type = 'R627'

    # stays None when no configured sub type matches; without this
    # default the check below failed with UnboundLocalError
    filepath = None
    fasta_config = data_manager.get_config('FASTA')
    for sub_type_config in fasta_config.get_sub_type_objects():
        if not sub_type == sub_type_config.get_sub_data_type():
            self.logger.info(sub_type_config.get_sub_data_type())
            continue
        filepath = sub_type_config.get_filepath()
        self.logger.info(filepath)
        break

    if filepath is None:
        self.logger.warning("Can't find Assembly filepath for %s", assembly)
        sys.exit(3)

    self.assembly = assembly
    self.filepath = filepath

    fasta_data = Fasta(filepath)
    while len(fasta_data.keys()) == 0:
        time.sleep(6)
        # force the index to be rebuilt on the next open
        os.remove(filepath + ".fai")
        fasta_data = Fasta(filepath)
    self.fasta_data = fasta_data
def generate_sizes(name, genome_dir):
    """Generate a sizes file with length of sequences in FASTA file.

    The FASTA file is ``<genome_dir>/<name>/<name>.fa``; the output is
    written next to it with the suffix ``.sizes`` and holds one
    ``name<TAB>length`` line per sequence.
    """
    fasta_path = os.path.join(genome_dir, name, "{}.fa".format(name))
    genome = Fasta(fasta_path)
    with open(fasta_path + ".sizes", "w") as handle:
        handle.writelines(
            "{}\t{}\n".format(seqname, len(genome[seqname]))
            for seqname in genome.keys()
        )
def ref_genome_as_string(ref_fasta, keys=None):
    """Return the concatenated sequences of a reference FASTA file.

    :param ref_fasta: path of the reference FASTA file
    :param keys: iterable of sequence names to concatenate, in order;
        all sequences of the file are used when None
    :return: one string holding the selected sequences back to back
    """
    ref_genome = Fasta(ref_fasta)
    if keys is None:
        keys = ref_genome.keys()
    # join() builds the result in one pass instead of copying the growing
    # string for every chromosome
    return "".join(str(ref_genome[chrom]) for chrom in keys)
class FastaWrapper(GenomeWrapper):
    """Sequence access to a FASTA file with three storage modes.

    * in memory: all sequences are loaded as character arrays and the
      file is closed;
    * thread safe: only the file name is kept and the file is opened
      for every request;
    * default: one open Fasta handle is shared by all requests.

    NOTE(review): self.in_mem and self.thread_safe are read below and
    are presumably set by GenomeWrapper.__init__ - confirm.
    """

    def __init__(self, fasta_file, alpha='dna', one_hot=True, channel_last=True, in_mem=False, thread_safe=False,
                 read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)

        self.fasta = Fasta(fasta_file, as_raw=True, sequence_always_upper=True, read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead

        if in_mem:
            in_memory_seqs = self._encode_seqs(self.fasta)
            self.fasta.close()
            self.fasta = in_memory_seqs
            # nothing is left to close, so treat it like the thread-safe mode
            self.thread_safe = True
        elif thread_safe:
            # keep only the path; _get_seq opens the file per request
            self.fasta.close()
            self.fasta = fasta_file

    def close(self):
        """Close the shared Fasta handle (only the default mode has one)."""
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        """Load every record of ``fasta`` into memory.

        Returns a dict mapping record names to numpy arrays holding one
        character per base. No one-hot encoding is done here.
        """
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            fasta_dict[record.name] = np.array(list(record[:]))
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        """Return the bases of chrom[start:stop] as a numpy character array."""
        if self.in_mem:
            return self.fasta[chrom][start:stop]
        if self.thread_safe:
            # self.fasta is the file name in this mode
            fasta = Fasta(self.fasta, as_raw=True, sequence_always_upper=True, read_ahead=self.read_ahead)
            try:
                return np.array(list(fasta[chrom][start:stop]))
            finally:
                # do not leak the handle when the lookup fails
                fasta.close()
        return np.array(list(self.fasta[chrom][start:stop]))
def test_ncbiseqrename_fasta(self):
    """
    Check if NCBI sequence names in a FASTA file are properly changed.

    Runs the 'ncbirenameseq' command line tool (genbank -> ucsc) and
    compares the sequence names of its output with the UCSC example
    file.
    """
    saved_argv = sys.argv
    sys.argv = [
        '', 'ncbirenameseq', self.__fasta, 'genbank', self.__output,
        'ucsc', '--chr', self.__chr, '--unloc', self.__unloc, '--unpl',
        self.__unpl, '--fasta'
    ]
    try:
        bioformats.cli.bioformats()
    finally:
        # do not leak the fake command line into other tests
        sys.argv = saved_argv

    # check if the obtained and original files are the same; whole lists
    # are compared because zip() would ignore missing or extra sequences
    original_names = list(Fasta(self.__ucsc_fasta).keys())
    renamed_names = list(Fasta(self.__output).keys())
    self.assertEqual(original_names, renamed_names)
def get_lengths(input, cutoff):
    """Return a lookup of the contigs shorter than ``cutoff``.

    ``input`` is the path of a FASTA file. The result maps the name of
    every contig whose length is below ``cutoff`` to 1.
    """
    contigs = Fasta(input)
    return {
        contig: 1
        for contig in contigs.keys()
        if len(contigs[contig]) < cutoff
    }
def get_fasta_sequence(filename, start, end, key=0):
    """Get chunk of indexed fasta sequence at start/end points

    ``key`` selects the record: an int is the position of the record in
    the file, any other value is used directly as the record name
    (previously a non-int key failed with UnboundLocalError).
    """
    from pyfaidx import Fasta

    refseq = Fasta(filename)
    if isinstance(key, int):
        chrom = list(refseq.keys())[key]
    else:
        chrom = key
    return refseq[chrom][start:end].seq
def tadpole_fastqs(f1, out, verbose=False, k=66, threads=1, tadpole_bin='tadpole.sh', bm1=1, bm2=1,
                   mincontig="auto", mincountseed=100, return_contigs=False):
    ''' use tadpole from bbtools to assemble a cloud of sequences controlled by a UMI

    Example command line:
    tadpole.sh in=tmp/f1_ACTTCGCCAGAGTTGG_GTGCGAGAGGGTA.fastq out=mini k=66 overwrite=True bm1=1 bm2=1 mincountseed=4

    The stderr output of tadpole is parsed for the number of generated
    contigs and saved as a log file next to ``out`` ('contig' in the
    name is replaced by 'log', otherwise '.log' is appended).

    Returns False when tadpole reports a RuntimeException, prints no
    "Contigs generated:" summary, or generates no contigs. Otherwise
    returns True, or a list of (name, sequence) tuples of the assembled
    contigs when ``return_contigs`` is set.
    '''
    cmd = f"{tadpole_bin} in={f1} out={out} k={k} overwrite=True bm1={bm1} bm2={bm2} t={threads} -Xmx6g mincontig={mincontig} mincountseed={mincountseed} rcomp=f"
    if verbose:
        print(cmd)
    pp = subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stderr_text = pp.stderr.decode()

    if "RuntimeException" in stderr_text:
        # means the command failed
        print("RunTimeEx")
        return False

    summary_marker = "Contigs generated:"
    if summary_marker not in stderr_text:
        # no summary at all: treat like a failed assembly instead of
        # crashing with IndexError while parsing
        return False
    contigs = int(stderr_text.split(summary_marker)[1].split('\n')[0].split('\t')[1])
    if contigs == 0:
        return False

    if "contig" in out:
        log_ofn = out.replace('contig', 'log')
    else:
        log_ofn = out + '.log'
    if verbose:
        print(stderr_text)
        print("tadpole log %s" % (log_ofn))
    with open(log_ofn, 'w') as logout:
        logout.write(stderr_text)

    if return_contigs:
        from pyfaidx import Fasta
        assembled = Fasta(out, as_raw=True)
        return [(name, assembled[name][:]) for name in assembled.keys()]
    return True
def format_alleles_from_sam(ref_path, fa_ofn):
    """Prepend @SQ header lines to an alignment file and tabulate its reads.

    Step 1: a temporary file is written containing one ``@SQ`` line per
    sequence of the reference FASTA ``ref_path`` followed by the current
    contents of ``fa_ofn``; the temporary file then replaces ``fa_ofn`` via
    ``mv``.

    Step 2: ``fa_ofn`` is read with ``pysam.AlignmentFile`` and one
    tab-separated row per read is written to
    ``fa_ofn.replace('fa', 'concise')``. The read name is expected to consist
    of exactly seven ``_``-separated fields (cbc, umi, molid, contig_str,
    contig_cov, reads, contig_len).

    NOTE(review): ``str.replace('fa', 'concise')`` rewrites every occurrence
    of "fa" in the path, not just the extension -- confirm this is intended.
    """
    # we add the right header
    #TODO improve
    import uuid
    from pyfaidx import Fasta
    import subprocess
    import pysam
    # random name so that the temporary file does not clash with existing files
    unique_filename = "tmpfa_"+str(uuid.uuid4())
    with open(unique_filename, 'w') as ofa, open(fa_ofn) as infile:
        FF = Fasta(ref_path)
        # (sequence name, sequence length) for every reference record
        entries = [(f, len(FF[f][:])) for f in FF.keys()]
        for entry in entries:
            ofa.write(f'@SQ\tSN:{entry[0]}\tLN:{entry[1]}\n')
            print(entry)
        # copy the original records after the header lines
        for line in infile.readlines():
            #print(line)
            ofa.write(line)
    # NOTE(review): whitespace in either path would break this split command
    subprocess.run(f'mv {unique_filename} {fa_ofn}'.split())
    with open(fa_ofn.replace('fa', 'concise'), 'w') as tab:
        # column header of the summary table
        tab.write("\t".join(['cbc', 'umi', 'molid', 'contig_str', 'contig_cov', 'reads', 'contig_len','cigar', 'allele', 'pruneda']) + '\n')
        for read in pysam.AlignmentFile(fa_ofn):
            start = read.reference_start
            # hard-coded window bounds; presumably amplicon coordinates -- TODO confirm
            trim_start = 1111
            trim_end = 1515
            # refi is never updated and quei is only accumulated; neither is read afterwards
            refi = 0
            quei = 0
            #if start<trim_start:
            # running coordinate, advanced by the length of every M/I/D operation
            current = start
            allele = []
            for i in read.cigartuples:
                if i[0] == 0: #match
                    quei += i[1]
                    if len(allele) == 0:
                        # first entry is expressed relative to trim_start
                        allele.append(f'{current+i[1]-trim_start}M')
                    # NOTE(review): allele gains at most one entry per CIGAR
                    # tuple, so this length test looks unreachable -- confirm
                    elif len(allele) == 100*len(read.cigartuples)-1:
                        allele.append(f'{trim_end -current+i[1]}M')
                    else:
                        allele.append(f'{current+i[1]}M')
                    current = current + i[1]
                if i[0] == 1: #insertion
                    allele.append(f'{current+i[1]}I')
                    current = current + i[1]
                if i[0] == 2: #deletion
                    allele.append(f'{current+i[1]}D')
                    current = current + i[1]
            # raises ValueError unless the read name has exactly seven fields
            cbc, umi, molid, contig_str, contig_cov, reads, contig_len = read.query_name.split('_')
            # "pruned" allele: the entries without the first and the last one
            pruneda = ":".join(allele[1:-1])
            if len(pruneda) == 0 or len(allele) == 0:
                pruneda = 'NA'
            tab.write("\t".join([cbc, umi, molid, contig_str, contig_cov, reads, contig_len, read.cigarstring, ':'.join(allele), pruneda]) + '\n')
def main(options):
    """Print the reference as FASTA with every motif match replaced by N.

    Reads the FASTA file at ``options.ref``. For each sequence a ``>name``
    header is printed, followed by the upper-cased sequence in which each
    match of the module-level ``motif`` pattern is replaced by
    ``motif_length`` N characters.
    """
    genome = Fasta(options.ref)
    pattern = re.compile(motif)
    for name in genome.keys():
        print(">" + name)
        print(pattern.sub("N" * motif_length, genome[name][:].seq.upper()))
def get_prot_lens(faa_file, phage):
    """Map the locus tag of every record in ``faa_file`` to its length.

    The locus tag is produced by ``get_locus_tag`` from the record name, the
    ``digits`` value returned by ``get_digits(faa_file)`` and ``phage``; the
    length is the length of the record rendered as a string.
    """
    digits = get_digits(faa_file)
    proteins = Fasta(faa_file)
    return {
        get_locus_tag(header, digits=digits, phage=phage): len(str(proteins[header]))
        for header in proteins.keys()
    }
def get_prot_lens(faa_file, phage):
    """Return ``{locus_tag: sequence_length}`` for the records of ``faa_file``.

    Locus tags come from ``get_locus_tag(record_name, digits=..., phage=...)``
    where ``digits`` is whatever ``get_digits(faa_file)`` returns. When two
    records yield the same tag, the later one wins.
    """
    tag_width = get_digits(faa_file)
    records = Fasta(faa_file)
    lengths_by_tag = {}
    for record_id in records.keys():
        locus_tag = get_locus_tag(record_id, digits=tag_width, phage=phage)
        lengths_by_tag[locus_tag] = len(str(records[record_id]))
    return lengths_by_tag
def clean_fasta(fn, prefix):
    """Rewrite the FASTA file ``fn`` in place with one line per sequence.

    Every record is written as ``>name`` followed by its sequence (with any
    embedded newlines removed) to ``prefix + 'tmp.fa'``; that temporary file
    is then moved over ``fn``.

    Parameters
    ----------
    fn
        Path of the FASTA file to rewrite.
    prefix
        String prepended to ``'tmp.fa'`` to name the temporary file.
    """
    # local import: the surrounding file is not known to import shutil
    import shutil

    tmp_fa_fn = prefix + 'tmp.fa'
    fa = Fasta(fn)
    try:
        # context manager closes the temporary file even if a write fails
        with open(tmp_fa_fn, 'w') as tmp_fa:
            for entry in fa.keys():
                tmp_fa.write(">%s\n%s\n" % (entry, fa[entry][:].seq.replace('\n', '')))
    finally:
        fa.close()
    # shutil.move handles paths containing whitespace (the former
    # "mv a b".split() did not) and raises if the move fails
    shutil.move(tmp_fa_fn, fn)
def test_ncbiseqrename_fasta(self):
    """
    Run the ncbirenameseq command-line entry point on a FASTA file
    (genbank -> ucsc) and check that the resulting sequence names match
    those of the reference UCSC FASTA file.
    """
    cli_args = ['', self.__fasta, 'genbank', self.__output, 'ucsc']
    cli_args += ['--chr', self.__chr]
    cli_args += ['--unloc', self.__unloc]
    cli_args += ['--unpl', self.__unpl]
    cli_args.append('--fasta')
    sys.argv = cli_args
    bioformats.cli.ncbirenameseq()

    # the renamed file must list the same names, in the same order
    expected = Fasta(self.__ucsc_fasta)
    obtained = Fasta(self.__output)
    for expected_name, obtained_name in zip(expected.keys(), obtained.keys()):
        self.assertEqual(expected_name, obtained_name)

    # remove the output and the index files created while reading
    for leftover in (self.__ucsc_fasta + '.fai',
                     self.__output,
                     self.__output + '.fai'):
        os.unlink(leftover)
def binding_sites(kmer, genome_fp):
    """Collect, per genome record, where ``kmer`` and ``revcomp(kmer)`` occur.

    Parameters
    ----------
    kmer
        Query; converted with ``str`` before searching.
    genome_fp
        Path of the genome FASTA file.

    Returns
    -------
    dict
        Maps each record name to the result of ``substr_indices`` for the
        k-mer, extended in place by the result for ``revcomp(kmer)``.

    Raises
    ------
    ValueError
        If the resulting dict is empty, i.e. the genome has no records.
    """
    genome = Fasta(genome_fp)
    kmer = str(kmer)
    locations = {}
    for record in genome.keys():
        sequence = str(genome[record])
        hits = substr_indices(kmer, sequence)
        # append reversed primer locations as well
        hits += substr_indices(revcomp(kmer), sequence)
        locations[record] = hits
    if not locations:
        raise ValueError(
            "No locations for {} found in fg genome!".format(kmer))
    return locations
def write_read_lengths_to_file(read_fasta_files, output_file):
    """Write a table of sequence lengths for a collection of FASTA files.

    Parameters
    ----------
    read_fasta_files
        Iterable of FASTA file paths.
    output_file
        Path of the tab-separated table to create. It holds the columns
        ``fasta_file`` (last ``/``-separated component of the path),
        ``seq_id`` and ``read_len``.
    """
    # the context manager closes the table even when a FASTA file cannot be
    # opened part-way through (the handle used to leak in that case)
    with open(output_file, "w") as out:
        out.write("fasta_file\tseq_id\tread_len\n")
        for path in read_fasta_files:
            records = Fasta(path)
            fasta_name = path.split("/")[-1]
            for seq_id in records.keys():
                length = len(str(records[seq_id]))
                out.write("%s\t%s\t%s\n" % (fasta_name, seq_id, length))
def chromosome_ends(genome_fp):
    '''
    Compute where each record of a genome starts and ends when all records
    are laid end to end in file order.

    Returns a dict mapping every record name to ``[first, last]``, the
    0-based inclusive positions of that record in the concatenation (the
    second record therefore starts at the length of the first one).
    '''
    genome = Fasta(genome_fp)
    offset = 0
    spans = {}
    for name in genome.keys():
        length = len(genome[name])
        spans[name] = [offset, offset + length - 1]
        offset += length
    return spans
def split_fasta(number_files, fasta_file):
    """Split a FASTA file into ``number_files`` consecutive chunks.

    Each chunk holds ``ceil(n_sequences / number_files)`` records (the last
    non-empty chunk takes the remainder; trailing chunks may be empty).
    Chunk ``i`` (1-based) is written to ``fasta_file`` with everything from
    ``.fa`` onwards replaced by ``.<i>.fasta``; every record is written as a
    header line plus a single sequence line.

    Prints "could not open fasta" and exits if the file cannot be opened.
    """
    try:
        fasta = Fasta(fasta_file)
    except Exception:
        # parenthesised form prints the same text on Python 2 and 3
        print("could not open fasta")
        raise SystemExit()

    # keys() is not sliceable on Python 3
    names = list(fasta.keys())
    number_seqs = len(names)
    # ceiling division on integers; the former ceil(a / b) floored first
    # under Python 2 integer division
    chunk_size = -(-number_seqs // number_files)

    for i in range(number_files):
        # explicit bounds per chunk: the former boundary list could be one
        # entry short (IndexError when the count divided evenly)
        start = min(i * chunk_size, number_seqs)
        stop = min(start + chunk_size, number_seqs)
        label = re.sub(r"\.fa.*", "." + str(i + 1) + ".fasta", fasta_file)
        with open(label, "w") as out:
            for name in names[start:stop]:
                out.write(">" + name + "\n" + str(fasta[name]) + "\n")
def fillgaps(consensusdict, fasta):
    """Overwrite regions of a FASTA file with consensus sequences.

    ``fasta`` is opened with ``mutable=True``. Every key of
    ``consensusdict`` is read as ``"start:end"``; for each sequence in the
    file the slice ``[start:end]`` is asserted to be ``end - start`` long and
    is then replaced by the corresponding dictionary value.

    Returns ``None``.
    """
    print("filling consensus...")
    scaffolds = Fasta(fasta, mutable=True)
    for name in scaffolds.keys():
        for region, replacement in consensusdict.items():
            bounds = region.split(":")
            begin = int(bounds[0])
            end = int(bounds[1])
            assert (end - begin) == len(scaffolds[name][begin:end].seq)
            scaffolds[name][begin:end] = replacement
    return None
def generate_gap_bed(fname, outname):
    """
    Write the location of every run of ``N`` in a FASTA file as BED.

    Only upper-case ``N`` is matched. Each run becomes one line holding the
    sequence name, the start and the end of the match, separated by tabs.

    Parameters
    ----------
    fname : str
        Filename of input FASTA file.

    outname : str
        Filename of output BED file.
    """
    genome = Fasta(fname)
    with open(outname, "w") as bed:
        for chrom in genome.keys():
            sequence = genome[chrom][:].seq
            for gap in re.finditer(r'N+', sequence):
                begin, end = gap.span()
                bed.write("{}\t{}\t{}\n".format(chrom, begin, end))
def fasta_stats(fasta_fp):
    """
    Return ``(length, nrecords)`` for a FASTA file: the value computed by
    ``fasta_len_quick`` and the number of records.

    Opening the file with ``Fasta`` also creates a FASTA index (.fai) for
    later searching. May be slow for very large files.

    Any failure is reported in red on the terminal and then re-raised.
    """
    # pyfaidx can't handle blank lines within records, so we have to check :(
    check_empty_lines(fasta_fp)
    try:
        records = Fasta(fasta_fp)
        total_length = fasta_len_quick(fasta_fp)
        record_count = len(records.keys())
    except:
        click.secho(
            "\nError reading %s: invalid FASTA format?" % fasta_fp, fg="red")
        raise
    return total_length, record_count
def readFASTA(x, splitKey = None):
    """
    Load sequences from ``x`` when it names an existing file, otherwise
    hand ``x`` back unchanged (it is then taken to be a sequence).

    A file is opened with ``Fasta`` using the first whitespace-separated
    token of each header as key. With ``splitKey`` given, a plain dict is
    returned instead whose keys are the part of each name before the first
    ``splitKey``.

    Raises TypeError when ``x`` is not exactly of type ``str``.
    """
    if type(x) is not str:
        raise TypeError("input must be type str. filename or sequence")
    if not os.path.isfile(x):
        return x
    indexed = Fasta(x, key_function=lambda key: key.split()[0])
    if splitKey is None:
        return indexed
    return {name.split(splitKey)[0]: indexed[name] for name in indexed.keys()}
def size(args):
    """Print the size of every sequence in ``args.fi``.

    Output is ``seqid<TAB>size`` per sequence, or the three-column form
    ``seqid<TAB>0<TAB>size`` when ``args.bed`` is set; ``args.header`` adds a
    ``seqid<TAB>size`` header line first.

    Standard input (``stdin`` / ``-``) and ``.gz`` / ``.bz2`` files are
    parsed with ``SeqIO``; files whose extension is listed in ``FastaExt``
    are read through ``pyfaidx``. Any other extension is logged as an error.
    """
    def emit(seqid, length):
        if args.bed:
            print("%s\t%d\t%d" % (seqid, 0, length))
        else:
            print("%s\t%d" % (seqid, length))

    if args.header:
        print("seqid\tsize")

    fext = op.splitext(args.fi)[1]
    if args.fi in ['stdin', '-'] or fext in ['.gz','.bz2']:
        handle = must_open(args.fi)
        for record in SeqIO.parse(handle, "fasta"):
            emit(record.id, len(record))
    elif fext in [".%s" % x for x in FastaExt]:
        from pyfaidx import Fasta
        indexed = Fasta(args.fi)
        for seqid in indexed.keys():
            emit(seqid, len(indexed[seqid]))
    else:
        logging.error("%s is not a supported format" % fext)
EXAMPLE: genomeiden_combined.py my_fasta.fa [OPTIONS]' print >>sys.stderr, msg sys.exit(1) options = { 'windowed': False } # genomeiden_combined.py my_fasta.fa [OPTIONS] fasta_filename = sys.argv[1] if len(sys.argv) >2: coord_format = sys.argv[2] if coord_format == '--windowed': options['windowed'] = True test_fasta = Fasta(fasta_filename) names = test_fasta.keys() print names def find_all_slippery_seq(seq,n=7): slippery_seqs = [] seqs = [] # searches for slip seq every nucleotide for i in xrange(len(seq)): # forward and revcomp seqs strands = ('+', '-') starts = (i, i + n) zipped_seqs = zip( (seq[i:i+n], -seq[i:i+n]), strands, starts ) map(seqs.append, zipped_seqs) i = 0 for zseq, zstrand, zidx in seqs:
def test_keys(self):
    """Keys of a file opened with split_char='|' are the pieces of each header."""
    fasta = Fasta('data/genes.fasta', split_char='|')
    observed = sorted(fasta.keys())
    expected = [
        '530364724', '530364725', '530364726', '530373235', '530373237',
        '530384534', '530384536', '530384538', '530384540', '543583738',
        '543583740', '543583785', '543583786', '543583788', '543583794',
        '543583795', '543583796', '557361097', '557361099', '563317589',
        'AB821309.1', 'KF435149.1', 'KF435150.1', 'NM_000465.3',
        'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
        'NM_001282549.1', 'NR_104212.1', 'NR_104215.1', 'NR_104216.1',
        'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
        'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
        'XR_241079.1', 'XR_241080.1', 'XR_241081.1', 'dbj',
    ]
    assert observed == expected
#!/usr/bin/python
# Translate every sequence of the OM-RGC nucleotide FASTA file and write the
# results as FASTA to ./databases/OM-RGC_seq.translated.fasta.
from pyfaidx import Fasta
from Bio.Seq import translate

tara = Fasta("./databases/OM-RGC_seq.release.fna")
# the context manager closes the output even if a translation fails
with open("./databases/OM-RGC_seq.translated.fasta", "w") as tara_aa:
    for s in tara.keys():
        # translate() is handed the record as a plain string rather than a
        # pyfaidx record object
        tara_aa.write(">" + s + "\n" + translate(str(tara[s])) + "\n")
def test_keys(self):
    """Keys of a file opened with key_function=get_gene_name, in sorted order."""
    expected_names = [
        'BARD1',
        'FGFR2',
        'MDM4',
        'gi|530364724|ref|XR_241079.1|',
        'gi|530364725|ref|XR_241080.1|',
        'gi|530364726|ref|XR_241081.1|',
        'gi|530373235|ref|XM_005265507.1|',
        'gi|530373237|ref|XM_005265508.1|',
        'gi|530384534|ref|XM_005249642.1|',
        'gi|530384536|ref|XM_005249643.1|',
        'gi|530384538|ref|XM_005249644.1|',
        'gi|530384540|ref|XM_005249645.1|',
        'gi|543583738|ref|NM_001282548.1|',
        'gi|543583740|ref|NM_001282549.1|',
        'gi|543583785|ref|NM_000465.3|',
        'gi|543583786|ref|NM_001282543.1|',
        'gi|543583788|ref|NM_001282545.1|',
        'gi|543583794|ref|NR_104212.1|',
        'gi|543583795|ref|NR_104215.1|',
        'gi|557361097|gb|KF435149.1|',
    ]
    genes = Fasta('data/genes.fasta', key_function=get_gene_name)
    assert sorted(genes.keys()) == expected_names
# Script section (Python 2 print statements): load the input sequences and
# open the cluster file named on the command line.
CRICK_MAX = count_crickMAX(args)
print "now starting Fasta import"
seq_in = Fasta(args.seqin)
print "done with Fasta import"
# handle on the cluster file; it is not closed in the code visible here
clusters = open(args.clusters)
# path of the SAM output
outsam = args.samout
# path = '/Volumes/data/epiGBS/Baseclear/Athal/'
# path = '/Volumes/data/epiGBS/DNAVISION/Project_DNA11032___140919_SN170_0407_AC52R6ACXX/Sample_DNA11032-001-L1/output/seqykJJfz/scabiosa/'
# path = '/tmp/'
# path = '/Volumes/data/epiGBS/FINAL/Scabiosa/BASECLEAR/'
# seq_in = Fasta(path+'Scabiosa_combined.fa')
#fasta_in = SeqIO.parse(open('/tmp/test.fa', 'r'), 'fasta')
# map the part of each record name before the first ';' to the full name
seq_in_keymap = {}
for key in seq_in.keys():
    seq_in_keymap[key.split(';')[0]] = key
    # NOTE(review): faidx_rec is rebound on every pass and is not read in
    # the code visible here -- confirm whether it is still needed
    faidx_rec = seq_in[key]
# clusters = open(path +'derep.uc', 'r')
# outsam = path+'derep_out.sam'
#clusters = open('/Volumes/data/epiGBS/test_scabi/cluster_sorted_a.uc', 'r')
#out_fa = open('/Volumes/data/epiGBS/test_scabi/output3.fa', 'w')
#outsam = '/Volumes/data/epiGBS/test_scabi/output3.sam'
#seq_in = SeqIO.parse(open('/Volumes/data/galaxy/database/files/009/dataset_9152.dat', 'r'), 'fastq')
#clusters = open('/Volumes/data/epiGBS/test_scabi/cluster_923.uc', 'r')
#cluster_records = pickle.load(open( "/tmp/save.p", "rb" ))
#
#print 'boe'
def rename(args):
    """Rename the sequences of a FASTA file and write two mapping tables.

    Reads ``args.fi`` and classifies every sequence:

    * chromosomes -- the id starts with an optional ``chr`` followed by one
      or two digits, or the long name contains ``chromosome <n>`` (both
      case-insensitive);
    * everything else -- numbered in file order starting at 1.

    Chromosomes are written to ``args.fo`` as ``prefix_chr`` plus the
    zero-padded chromosome number. The remaining sequences are either
    concatenated into a single record separated by ``args.gap`` N's
    (``args.merge_short``) or renamed to ``prefix_ctg`` plus a zero-padded
    counter. ``args.fmf`` receives the old-to-new coordinate mapping and
    ``args.fmb`` the new-to-old mapping, one tab-separated line per sequence.

    Exits with status 1 when no chromosome is detected or when the input
    extension is not listed in ``FastaExt``.
    """
    import re
    from pyfaidx import Fasta

    fi, fo, fmf, fmb = args.fi, args.fo, args.fmf, args.fmb
    merge_short, gap = args.merge_short, args.gap
    prefix_chr, prefix_ctg = args.prefix_chr, args.prefix_ctg

    db = Fasta(fi)
    # chromosome number taken from the sequence id itself ...
    ptn1 = "^(chr)?([0-9]{1,2})"
    # ... or, failing that, from the long name of the record
    ptn2 = "chromosome *([0-9]{1,2})"

    # sdic: chromosome id -> [chromosome number, size]
    # cdic: other id      -> [running counter, size]
    sdic, cdic = dict(), dict()
    ccnt = 1
    for sid in db.keys():
        size = len(db[sid])
        res1 = re.search(ptn1, sid, re.IGNORECASE)
        if res1:
            sdic[sid] = [int(res1.group(2)), size]
        else:
            sid_long = db[sid].long_name
            res2 = re.search(ptn2, sid_long, re.IGNORECASE)
            if res2:
                sdic[sid] = [int(res2.group(1)), size]
            else:
                cdic[sid] = [ccnt, size]
                ccnt += 1

    if len(sdic.keys()) == 0:
        print("Error: no chromosomes detected")
        sys.exit(1)

    # both lists are ordered by their number / counter
    slst = sorted(sdic.items(), key = lambda t: t[1][0])
    clst = sorted(cdic.items(), key = lambda t: t[1][0])

    # highest chromosome number; ndigit presumably returns its number of
    # decimal digits (helper defined elsewhere) -- TODO confirm
    nchrom = slst[-1][1][0]
    sdigits = ndigit(slst[-1][1][0])
    cdigits = ndigit(clst[-1][1][0]) if len(clst) > 0 else 1
    # e.g. prefix "chr" with 2 digits gives the template "chr%02d"
    sfmt = "%s%%0%dd" % (prefix_chr, sdigits)
    cfmt = "%s%%0%dd" % (prefix_ctg, cdigits)
    logging.debug("%d chromosomes, %d scaffolds/contigs" % (len(sdic), len(cdic)))

    # NOTE(review): the extension is validated only after the whole file has
    # already been opened and scanned above
    fname, fext = op.splitext(fi)
    if fext not in [".%s" % x for x in FastaExt]:
        logging.error("%s is not a supported format" % fext)
        sys.exit(1)

    # output FASTA, forward mapping, backward mapping
    fho = open(fo, "w")
    fhf = open(fmf, "w")
    fhb = open(fmb, "w")

    # chromosomes: renamed one to one
    for sid, sval in slst:
        scnt, size = sval
        nsid = sfmt % scnt
        fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (sid, 0, size, nsid, 0, size, scnt))
        fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (nsid, 0, size, sid, 0, size, scnt))
        nrcd = SeqRecord(Seq(str(db[sid])), id = nsid, description = '')
        SeqIO.write(nrcd, fho, "fasta")

    # the last column of the mapping lines continues after the chromosomes
    i = nchrom + 1
    if len(clst) > 0 and merge_short:
        # all remaining sequences are merged into one record named zid
        zid = "%sx" % prefix_chr
        if sdigits == 2:
            zid = "%s99" % prefix_chr
        else:
            assert sdigits == 1, "wrong number of chroms: %d" % sdigits
        pos = 0
        seq = ''
        for cid, sval in clst:
            ccnt, size = sval
            start, end = pos, pos + size
            if pos > 0:
                # every sequence but the first is preceded by a gap of N's
                start += gap
                end += gap
                seq += "N" * gap
            seq += str(db[cid])
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, zid, start, end, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (zid, start, end, cid, 0, size, i))
            pos = end
            i += 1
        nrcd = SeqRecord(Seq(seq), id = zid, description = '')
        SeqIO.write(nrcd, fho, "fasta")
    else:
        # remaining sequences: renamed one to one with the contig prefix
        for cid, sval in clst:
            ccnt, size = sval
            ncid = cfmt % ccnt
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, ncid, 0, size, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (ncid, 0, size, cid, 0, size, i))
            nrcd = SeqRecord(Seq(str(db[cid])), id = ncid, description = '')
            SeqIO.write(nrcd, fho, "fasta")
            i += 1

    fhf.close()
    fhb.close()
    fho.close()
from pyfaidx import Fasta

# Open the FASTA file at the path 'name' with pyfaidx.
maysFasta = Fasta('name')
# Look up the sequence names. The result is not stored, so this line only
# shows something when run in an interactive session.
maysFasta.keys()
# Per-species totals (Python 2 print statements). `species`, `x`, `y`, `z`,
# `protId` and `fastaFolders` are defined outside the code visible here.
print species
x[species] = 0
y[species] = 0
z[species] = []
# x[species]: sum of the second tab-separated column over every line of each
# '.fai' file in the current directory whose name contains protId[species]
# (presumably the sequence-length column of a faidx index -- TODO confirm)
for file in os.listdir('.'):
    if file.endswith('.fai') and protId[species] in file:
        with open(file,'r') as f:
            lines = f.readlines()
            for line in lines:
                if line:
                    x[species] += int(line.split('\t')[1])#abs(int(line.split('\t')[3]) - int(line.split('\t')[2]))#max(map(int,line.split('\t')[2:4]))-min(map(int,line.split('\t')[2:4]))
# y[species]: total sequence length of '<species>.fa' in every folder of
# fastaFolders that contains such a file
for folder in [folder2 for folder2 in fastaFolders if species+'.fa' in os.listdir(folder2)]:
    try:
        fa = Fasta(folder+species+'.fa')
        #bedText = '\n'.join('\t'.join(['_'.join(line.split('_')[:-2])] + line.split('_')[-2:]) for line in fa.keys())
        y[species] += sum([len(fa[key][:].seq) for key in fa.keys()])#findlen(BedTool(bedText, from_string=True))
    except:
        # any failure is only reported; the loop moves on to the next folder
        print 'Error for ' + folder+species+'.fa'
print y[species]
# The string below is disabled code kept as a bare string literal.
""" with open('finalSyntenyMultipleSpecies.bed','r') as f: print 'Bed Open...' lineOut = [] for line in f.readlines(): lineOut.append('-'.join(line.split('\t')[0:4])+'|'+line[line.rfind('\t')+1:]) for line in lineOut: for seq in line.split('|'): y[specId[seq.split('-')[0]]] += abs(int(seq.split('-')[3]) - int(seq.split('-')[2])) """