def __init__(self, fwd, rev, parent=None):
    """Build the object from a forward and a reverse sequence file.

    :param fwd: Handed to `FASTA` to build the forward file object.
    :param rev: Handed to `FASTA` to build the reverse file object.
    :param parent: Optional owner object, stored untouched.
    """
    # The two mates #
    forward, reverse = FASTA(fwd), FASTA(rev)
    self.fwd = forward
    self.rev = reverse
    # The compression flag is read from the forward file only #
    self.gzipped = forward.gzipped
    # Back-reference #
    self.parent = parent
def generate_values(path, progress=False):
    """Yield an ``(id, description, sequence)`` tuple for every record
    parsed from the fasta file at `path`.

    :param path: Location of the fasta file, passed to `SeqIO.parse`.
    :param progress: When true, the records are iterated through `tqdm`.
    """
    records = SeqIO.parse(path, 'fasta')
    if progress:
        # The parser is wrapped together with the record count of the file;
        # presumably so that tqdm can display a total -- confirm against
        # the definition of GenWithLength.
        records = tqdm(GenWithLength(records, len(FASTA(path))))
    for record in records:
        yield (record.id, record.description, str(record.seq))
def __init__(self,
             query_path,
             db_path=pfam.hmm_db,
             seq_type='prot',
             e_value=0.001,
             params=None,
             out_path=None,
             executable=None,
             cpus=None):
    """Configure a search of `query_path` against an HMM database.

    :param query_path: The input sequences, wrapped in a `FASTA` object.
    :param db_path: The database to search. The short names ``'pfam'`` and
                    ``'tigrfam'`` are replaced by `pfam.hmm_db` and
                    `tigrfam.hmm_db` respectively.
    :param seq_type: The sequence type of the query file, either ``'prot'``
                     or ``'nucl'``. Defaults to ``'prot'``.
    :param e_value: The search threshold.
    :param params: Extra parameters for the command line (a dictionary);
                   an empty dictionary is used when nothing is given.
    :param out_path: Where the results will be dropped. When `None`, the
                     query's `prefix_path` plus ``'.hmmout'`` is used. When
                     it ends with ``'/'`` it is treated as a directory and
                     the query's `prefix` plus ``'.hmmout'`` is appended.
    :param executable: Path to a specific binary, if one is wanted.
    :param cpus: The number of threads to use. When `None`, the machine's
                 CPU count is used, capped at 32.
    """
    # Save attributes #
    self.query = FASTA(query_path)
    self.db = FilePath(db_path)
    self.params = params if params else {}
    self.e_value = e_value
    self.seq_type = seq_type
    self.executable = FilePath(executable)
    # Cores to use #
    if cpus is None:
        self.cpus = min(multiprocessing.cpu_count(), 32)
    else:
        self.cpus = cpus
    # Auto detect database short name #
    if db_path == 'pfam':
        self.db = pfam.hmm_db
    if db_path == 'tigrfam':
        self.db = tigrfam.hmm_db
    # Output #
    if out_path is None:
        self.out_path = FilePath(self.query.prefix_path + '.hmmout')
    elif out_path.endswith('/'):
        self.out_path = FilePath(out_path + self.query.prefix + '.hmmout')
    else:
        self.out_path = FilePath(out_path)
def to_fasta(self, path, verbose=False):
    """Write every record obtained by iterating over `self` to a new
    fasta file.

    :param path: Destination file, opened in write mode.
    :param verbose: When true, the iteration is wrapped in `tqdm.tqdm`.
    :returns: A `FASTA` object built from `path`.
    """
    # tqdm is imported on every call, even when verbose is off #
    import tqdm
    # Do it #
    with open(path, 'w') as out_file:
        if verbose:
            records = tqdm.tqdm(self)
        else:
            records = self
        for record in records:
            SeqIO.write(record, out_file, 'fasta')
    # Return #
    return FASTA(path)
def fresh_fasta(self):
    """A file containing all the fresh water genes.

    The file at `self.p.fresh_fasta` is built when it does not exist yet,
    by decompressing every genome flagged as `fresh` into it with
    ``gunzip -c``. The resulting sequence count is then checked against
    the sum of the lengths of the source genomes.

    :returns: The `FASTA` object for `self.p.fresh_fasta`.
    """
    fasta = FASTA(self.p.fresh_fasta)
    if not fasta.exists:
        # Parenthesised so that the line is valid on Python 2 and 3 alike
        print("Building fasta file with all fresh genes...")
        fresh = [g for g in genomes.values() if g.fresh]
        # NOTE(review): the genome objects are joined as strings, so they
        # are presumably str-like paths; the paths are not shell-quoted,
        # so a path containing spaces would break this command -- confirm.
        shell_output('gunzip -c %s > %s' % (' '.join(fresh), fasta))
        assert len(fasta) == sum(len(g) for g in fresh)
        self.timer.print_elapsed()
    return fasta
def fasta(self):
    """Fasta file holding the filtered genes of this cluster.

    When the file object at `self.p.fasta` evaluates as false (presumably
    missing or empty -- confirm against `FASTA`), it is created and one
    entry is added per gene in `self.filtered_genes`, using the string form
    of the gene as content and `gene.name` as the record name.

    :returns: The `FASTA` object for `self.p.fasta`.
    """
    cluster_fasta = FASTA(self.p.fasta)
    # Nothing to build #
    if cluster_fasta:
        return cluster_fasta
    # Build it #
    cluster_fasta.create()
    for gene in self.filtered_genes:
        cluster_fasta.add_str(str(gene), name=gene.name)
    cluster_fasta.close()
    return cluster_fasta
def __init__(self, version, seq_type, base_dir=None):
    """Set up the names and locations used for one release of the database.

    :param version: Release identifier; it is appended to `short_name` and
                    interpolated in the URL and in the file names.
    :param seq_type: Stored untouched as `self.seq_type`.
    :param base_dir: Root directory under which ``databases/<short_name>/``
                     is placed. When `None`, `home` is used.
    """
    # Attributes #
    self.version = version
    self.seq_type = seq_type
    self.short_name = self.short_name + "_" + self.version
    # Base directory #
    root = home if base_dir is None else base_dir
    self.base_dir = root + 'databases/' + self.short_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # URL #
    self.url = "release_%s/Exports/" % self.version
    # The database: compressed download and its name without '.gz' #
    self.nr99_name = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
    nr99_gz = self.base_dir + self.nr99_name
    nr99_plain = self.base_dir + self.nr99_name[:-3]
    self.nr99_dest = FASTA(nr99_gz)
    self.nr99 = FASTA(nr99_plain)
    # The alignment: compressed download and its name without '.gz' #
    self.aligned_name = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
    aligned_gz = self.base_dir + self.aligned_name
    aligned_plain = self.base_dir + self.aligned_name[:-3]
    self.aligned_dest = FASTA(aligned_gz)
    self.aligned = FASTA(aligned_plain)
def fasta(self):
    """Fasta file with the sequences related to this family.

    When the file at `self.p.proteins` does not exist, it is created and
    filled with every sequence of `pfam.fasta` whose description contains
    `self.fam_name`. The freshly built file must then evaluate as true.

    :returns: The `FASTA` object for `self.p.proteins`.
    """
    family_fasta = FASTA(self.p.proteins)
    # Already there #
    if family_fasta.exists:
        return family_fasta
    # Build it #
    family_fasta.create()
    related = (seq for seq in pfam.fasta if self.fam_name in seq.description)
    for seq in related:
        family_fasta.add_seq(seq)
    family_fasta.close()
    assert family_fasta
    # Return #
    return family_fasta
def read_file(self, fp):
    '''
    Read the first FASTA record from the content of fp, and set the
    chromosome name and sequence using the set_chromosome method.

    The first character of the record name is dropped before it is
    handed to set_chromosome.

    :param fp: Open file object; its `name` attribute is used in errors.
    :raises NoChromosomeFoundError: when both the record name and the
        record sequence are empty.
    :raises ChromosomeFASTAFromatError: when only one of the two is empty.
    '''
    if self.verbose:
        # Explicit write: the `print >> stream` form only works on Python 2
        stderr.write("reading a FASTA record to set a chromosome\n")
    fasta = FASTA(fp=fp, verbose=self.verbose)
    chr_name, chr_seq = fasta.get_record()
    if chr_name and chr_seq:
        # Presumably strips the leading '>' of the header -- confirm
        self.set_chromosome(chr_name[1:], chr_seq)
    elif not chr_name and not chr_seq:
        raise NoChromosomeFoundError(fp.name, chr_name, chr_seq)
    else:
        raise ChromosomeFASTAFromatError(fp.name, chr_name, chr_seq)
def main():
    """Print one encoded line per structure and record which ids were used.

    For every id returned by `read_datafile(args.t)` that is also present
    in the dihedrals, the sequence is encoded with `oneHotEncoding`: the
    peptide alone when `args.pep` is set, otherwise the universal groove
    followed by the peptide. Depending on `args.label`, either the encoded
    sequence (``'x'``) or the dihedral values (``'y'``) are printed,
    comma separated, and the id is appended to the file `args.pdbids`.
    Any other label value prints and records nothing.
    """
    args = parse_args()
    dihedrals = read_dihedrals()
    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    # NOTE(review): the two results below are not used afterwards; the
    # calls are kept in case they have side effects -- confirm and drop.
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)
    # The context manager closes the file even if an id fails to encode
    with open(args.pdbids, 'w') as outputfilehandler:
        for pdbid in pdbids:
            if pdbid not in dihedrals:
                continue
            if args.pep:
                sequence = peptides[pdbid]
            else:
                sequence = universalGrooves[pdbid] + peptides[pdbid]
            finalSeqCode = oneHotEncoding(sequence)
            finalLabelCode = dihedrals[pdbid]
            if args.label == 'x':
                print(', '.join(finalSeqCode))
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
            else:
                # Unknown label: nothing printed, nothing recorded
                continue
            outputfilehandler.write(pdbid + '\n')
def read_fasta(args):
    """Collect the chain id and sequence of every nine-residue entry.

    Headers are split on ``'|'``; the first field is used as the pdb id
    and the second as the chain id. Entries whose sequence is not exactly
    nine characters long are ignored.

    :param args: Object whose `fasta` attribute is passed to `FASTA`.
    :returns: A tuple ``(pep_chain, pep_seq)`` of two dictionaries keyed by
              pdb id, holding the chain id and the sequence respectively.
    """
    source = FASTA(args.fasta)
    source.read()
    pep_chain, pep_seq = {}, {}
    for header in source.get_headers():
        parts = header.split('|')
        pdbid = parts[0]
        chainid = parts[1]
        sequence = source.get_sequence(header)
        if len(sequence) != 9:
            continue
        pep_chain[pdbid] = chainid
        pep_seq[pdbid] = sequence
    return (pep_chain, pep_seq)
def __init__(self,
             query_path,
             db_path,
             seq_type='prot',
             params=None,
             algorithm="blastn",
             out_path=None,
             executable=None,
             cpus=None,
             num=None,
             _out=None,
             _err=None):
    """Configure a search of `query_path` against the database `db_path`.

    :param query_path: The input sequences, wrapped in a `FASTA` object.
    :param db_path: The database to search against.
    :param seq_type: The sequence type of the query file, either ``'prot'``
                     or ``'nucl'``. Defaults to ``'prot'``.
    :param params: Extra parameters for the command line (a dictionary);
                   an empty dictionary is used when nothing is given.
    :param algorithm: Either ``"blastn"`` or ``"blastp"``. Defaults to
                      ``"blastn"``. NOTE(review): said to be auto-determined
                      from `seq_type`, which does not happen in this
                      method -- confirm where it is done.
    :param out_path: Where the results will be dropped. When `None`, the
                     query's `prefix_path` plus `self.extension` is used.
                     When it ends with ``'/'`` it is treated as a directory
                     and the query's `prefix` plus `self.extension` is
                     appended.
    :param executable: Path to a specific binary, if one is wanted.
    :param cpus: The number of threads to use. When `None`, the machine's
                 CPU count is used, capped at 32.
    :param num: When parallelized, the number of this thread.
    :param _out: Store the stdout at this path. The value `True` selects
                 the output path plus ``'.stdout'``.
    :param _err: Store the stderr at this path. The value `True` selects
                 the output path plus ``'.stderr'``.
    """
    # Main input #
    self.query = FASTA(query_path)
    # The database to search against #
    self.db = FilePath(db_path)
    # Other attributes #
    self.seq_type = seq_type
    self.algorithm = algorithm
    self.num = num
    self.params = params if params else {}
    # The standard output and error #
    self._out = _out
    self._err = _err
    # Output defaults #
    if out_path is None:
        self.out_path = self.query.prefix_path + self.extension
    elif out_path.endswith('/'):
        self.out_path = out_path + self.query.prefix + self.extension
    else:
        self.out_path = out_path
    # Make it a file path #
    self.out_path = FilePath(self.out_path)
    # Executable #
    self.executable = FilePath(executable)
    # Cores to use #
    if cpus is None:
        self.cpus = min(multiprocessing.cpu_count(), 32)
    else:
        self.cpus = cpus
    # Save the output somewhere #
    if self._out is True:
        self._out = self.out_path + '.stdout'
    if self._err is True:
        self._err = self.out_path + '.stderr'
def main():
    """Print one encoded line per labelled pair of structures.

    Each key of the labels read from `args.rms` is split on ``'_'`` into
    two ids. A pair is processed when at least one of its ids appears in
    the ids read from `args.t`. The two sequences (peptides alone when
    `args.pep` is set, otherwise universal groove followed by peptide) are
    joined with ``'|'`` and passed to `oneHotEncoding` together with the
    pair's label and the index data of the Aaindex records 'FASG890101'
    and 'KLEP840101'. The encoded sequence is printed when `args.label`
    is ``'x'``, the encoded label when it is ``'y'``.
    """
    args = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    peptides, mhc_seq, mhc_allele = totalNineMers(fasta)
    universal_grooves = universalGroove(args.grooves, mhc_seq, peptides)
    intersect_grooves = IntersectionGroove(args.grooves, mhc_seq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)
    # Two amino acid index tables handed to the encoder
    aaindex = Aaindex()
    index_data = aaindex.get('FASG890101').index_data
    charge_data = aaindex.get('KLEP840101').index_data
    for pair in labels:
        (first, second) = pair.split('_')
        # Skip pairs where neither member was selected
        if first not in pdbids and second not in pdbids:
            continue
        if args.pep:
            sequence = peptides[first] + '|' + peptides[second]
        else:
            sequence = (universal_grooves[first] + peptides[first] + '|' +
                        universal_grooves[second] + peptides[second])
        seq_code, label_code = oneHotEncoding(
            sequence, labels[pair], index_data, charge_data)
        if args.label == 'x':
            print(', '.join(seq_code))
        elif args.label == 'y':
            print(', '.join(label_code))
def main():
    """Split the ids found in the fasta file into a train and a test set.

    At most ``int(args.percent * number_of_ids)`` ids go to the test set:
    an id is sent there when the test set is not full yet and a fresh
    random draw is below 0.5; every other id goes to the train set. The
    two lists are written to 'train/90_10/train.txt' and
    'test/90_10/test.txt'.
    """
    args = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    peptides = totalNineMers(fasta)
    # A real list: the view returned by keys() cannot be indexed on Python 3
    pdbids = list(peptides.keys())
    testsetlen = int(args.percent * len(pdbids))
    trainset = []
    testset = []
    for pdbid in pdbids:
        # One draw per id, whether or not the test set is already full
        r = random()
        if len(testset) < testsetlen and r < 0.5:
            testset.append(pdbid)
        else:
            trainset.append(pdbid)
    write_to_file('train/90_10/train.txt', trainset)
    write_to_file('test/90_10/test.txt', testset)
def main():
    """Split the ids found in the fasta file into a train and a test set.

    An id goes to the test set when three conditions hold: the test set
    holds fewer than ``int(args.percent * number_of_ids)`` ids, a fresh
    random draw is below 0.5, and the allele of the id is 'A0201'. Every
    other id goes to the train set. The lists are written to 'train.txt'
    and 'test.txt'.
    """
    args = parse_args()
    source = FASTA(args.fasta)
    source.read()
    peptides, alleles = totalNineMers(source)
    pdbids = peptides.keys()
    quota = int(args.percent * len(pdbids))
    trainset, testset = [], []
    for pdbid in pdbids:
        draw = random()
        # The allele is only looked up when the first two conditions hold
        picked = (len(testset) < quota
                  and draw < 0.5
                  and alleles[pdbid] == 'A0201')
        destination = testset if picked else trainset
        destination.append(pdbid)
    write_to_file('train.txt', trainset)
    write_to_file('test.txt', testset)
def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
    """Describe how the file at `path` is divided into numbered parts.

    :param path: The file to split, stored as `self.path`.
    :param num_parts: How many parts to make.
    :param part_size: Target size of one part, parsed with
                      `humanfriendly.parse_size`. When given, the number
                      of parts is recomputed as `self.count_bytes` divided
                      by that size, rounded up, and replaces `num_parts`.
    :param base_dir: Directory that will hold the parts. When `None`,
                     `path` followed by ``'.parts/'`` is used.

    Parts are named ``<base_dir>NNN/part.fasta`` with ``NNN`` starting at
    001, while the `num` attribute given to each part starts at 0.
    """
    # Basic #
    self.path = path
    # Directory #
    self.base_dir = path + '.parts/' if base_dir is None else base_dir
    # Num parts #
    if num_parts is not None:
        self.num_parts = num_parts
    # Evaluate size #
    if part_size is not None:
        self.bytes_target = humanfriendly.parse_size(part_size)
        ratio = self.count_bytes / self.bytes_target
        self.num_parts = int(math.ceil(ratio))
    # Naming scheme #
    self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
    # Make parts #
    parts = [FASTA(self.make_name(number))
             for number in range(1, self.num_parts + 1)]
    # Give a number to each part #
    for position in range(len(parts)):
        parts[position].num = position
    self.parts = parts
def main():
    """Print one encoded line per labelled pair of structures.

    Each key of the labels read from `args.rms` is split on ``'_'`` into
    two ids. A pair is processed when at least one of its ids appears in
    the ids read from `args.t`. The two sequences (peptides alone when
    `args.pep` is set, otherwise universal groove followed by peptide) are
    joined with ``'|'`` and passed to `oneHotEncoding` together with the
    pair's label. The encoded sequence is printed when `args.label` is
    ``'x'``, the encoded label when it is ``'y'``.
    """
    args = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    peptides, mhc_seq, mhc_allele = totalNineMers(fasta)
    universal_grooves = universalGroove(args.grooves, mhc_seq, peptides)
    intersect_grooves = IntersectionGroove(args.grooves, mhc_seq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)
    for pair in labels:
        (first, second) = pair.split('_')
        # Skip pairs where neither member was selected
        if first not in pdbids and second not in pdbids:
            continue
        if args.pep:
            sequence = peptides[first] + '|' + peptides[second]
        else:
            sequence = (universal_grooves[first] + peptides[first] + '|' +
                        universal_grooves[second] + peptides[second])
        seq_code, label_code = oneHotEncoding(sequence, labels[pair])
        if args.label == 'x':
            print(', '.join(seq_code))
        elif args.label == 'y':
            print(', '.join(label_code))
def test(self):
    """Run a single search end to end as a smoke test.

    One nucleotide sequence is written to 'input.fasta' inside a new
    temporary directory, searched against `self.blast_db` with the results
    sent to 'output.blast' in the same directory, and "Success" is printed
    along with the directory once the run returns.
    """
    # Where everything goes #
    tmp_dir = new_temp_dir()
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
             CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
             CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
             AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
             CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
             TTTAATTACAGACCTGAA"""
    # Strip all line breaks and spaces from the literal #
    seq = seq.replace('\n','').replace(' ','')
    # The query file #
    query = FASTA(tmp_dir + 'input.fasta')
    query.create()
    query.add_str(seq, "My test sequence")
    query.close()
    # The result file and the extra command line parameters #
    result_path = tmp_dir + 'output.blast'
    extra = {'-outfmt': 0, '-evalue': 1e-5, '-perc_identity': 99}
    # Build the search and run it #
    searcher = SeqSearch(query,
                         self.blast_db,
                         'nucl',
                         'blast',
                         num_threads=1,
                         out_path=result_path,
                         params=extra)
    searcher.run()
    # Report #
    print("Success", tmp_dir)
def seeds(self):
    """Return a `FASTA` object built from the path `self.autopaths.seed`."""
    return FASTA(self.autopaths.seed)
def fasta(self):
    """Return a `FASTA` object built from the path `self.autopaths.fasta`."""
    return FASTA(self.autopaths.fasta)
def subsampled(self):
    """Return the subsampled file at `self.p.subsampled`, building it
    when it does not exist.

    Building means calling `self.fasta.subsample` with ``down_to=30`` and
    the new file as destination, then `self.add_taxonomy` on the result.
    """
    result = FASTA(self.p.subsampled)
    # Already built #
    if result.exists:
        return result
    # Build it #
    self.fasta.subsample(down_to=30, new_path=result)
    self.add_taxonomy(result)
    return result
'TTDB/TriTrypDB-46_TcruziCLBrenerEsmeraldo-like_AnnotatedCDSs.fasta', 'organism': 'TcruziCLBrenerEsmeraldo-like' } non_emeraldo = { 'genome_filename': 'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_Genome.fasta', 'regions_filename': 'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_AnnotatedCDSs.fasta', 'organism': 'TcruziCLBrenerNon-Esmeraldo-like' } organism = emeraldo_like if __name__ == "__main__": # Load FASTA files genome = FASTA(organism['genome_filename']) genome.load() regions = FASTA(organism['regions_filename']) regions.load() # Load database file sqlite = sqlite3.connect(SQLite_DB) # Create MFASeq Folder Organism_MFASeq_folder = f"{MFASeq_folder}/MFA-Seq_{organism['organism']}" if not os.path.isdir(Organism_MFASeq_folder): os.mkdir(Organism_MFASeq_folder) # Create MFASeq Files for chromosome_id in genome.data.keys():
"--basepairs", action='store_true', help="Use base pairs instead of genome counts") args = parser.parse_args() protein_fasta = args.fasta or '/home/seijihariki/Documents/TCC/TTDB/TriTrypDB-46_TcruziCLBrenerEsmeraldo-like_AnnotatedTranscripts.fasta' simulation_folder = args.simulation search = args.search or 'DGF-1' base_pairs = args.basepairs simulation_cnt = args.count or 50 chromosomes_cnt = args.chromosomes or 41 print('Loading annotations:') transcripts = FASTA(protein_fasta) transcripts.load() collisions = {} print('Detecting collisions:') for chromosome in range(chromosomes_cnt): chromosome_name = f"TcChr{chromosome + 1}-S" collisions[chromosome_name] = [] for simulation in range(simulation_cnt): with open( f"{simulation_folder}simulation_{simulation}/{chromosome_name}.cseq" ) as times: start, end = -2, -2 current_location = 0
""" # Built-in modules # import inspect, os # Internal modules # from seqsearch.databases.ncbi_16s import ncbi_16s from seqsearch.search.blast import BLASTquery # First party modules # from fasta import FASTA # Get current directory # file_name = inspect.getframeinfo(inspect.currentframe()).filename this_dir = os.path.dirname(os.path.abspath(file_name)) + '/' ############################################################################### if __name__ == "__main__": # Main input # seqs = FASTA(this_dir + 'seqs.fasta') # The database to search against # db = ncbi_16s.blast_db # Create search # query = BLASTquery(seqs, db) # Run # query.run()
"""We explore the client given inputs, check for problems, then format them and store them in the repository as immutable artifacts (compressed text files)""" import inspect, os, glob, pandas from fasta import FASTA current_script = inspect.getframeinfo(inspect.currentframe()).filename current_dir = os.path.dirname(os.path.abspath(current_script)) + '/' genomes_dir = current_dir + '../ld12/data/genomes/' input_dir = "/proj/b2013274/mcl/" faa_paths = sorted(glob.glob(input_dir + '*.faa')) fna_paths = sorted(glob.glob(input_dir + '*.fna')) faas = [FASTA(faa) for faa in faa_paths if '647533246' not in faa] fnas = [FASTA(fna) for fna in fna_paths if '647533246' not in fna] faas_nums = [int(g.short_prefix) for g in faas] fnas_nums = [int(g.short_prefix) for g in fnas] metadata = pandas.io.parsers.read_csv(current_dir + '../ld12/data/metadata.tsv', sep='\t', index_col=0, encoding='utf-8') meta_nums = list(metadata.index) print set(faas_nums) ^ set(fnas_nums) print set(faas_nums) ^ set(meta_nums) def strip(seq):
from fasta import FASTA, AlignedFASTA

# Build one object per input file. Both paths are relative.
# NOTE(review): presumably resolved against the current working
# directory when the files are accessed -- confirm.
community = FASTA('community.fasta')
alignment = AlignedFASTA('alignment.fasta')
def all_proteins(self):
    """The main fasta file: a `FASTA` object built from the path
    `self.p.unzipped_proteins`."""
    proteins = FASTA(self.p.unzipped_proteins)
    return proteins
def to_fasta(self, path):
    """Write every record obtained by iterating over `self` to a new
    fasta file.

    :param path: Destination file, opened in write mode.
    :returns: A `FASTA` object built from `path`.
    """
    with open(path, 'w') as out_file:
        records = iter(self)
        for record in records:
            SeqIO.write(record, out_file, 'fasta')
    result = FASTA(path)
    return result