def _make_db(self):
    os.mkdir(self.tmp)
    copy(self.genome, os.path.join(self.tmp, 'genome.fa'))
    self.genome = os.path.join(self.tmp, 'genome.fa')
    cline = NcbimakeblastdbCommandline(input_file=self.genome, dbtype='nucl')
    cline()
def blast_func(samplecfg):
    blastlog = ['BLAST']
    # create the index if it does not exist yet
    if not (os.path.exists(samplecfg.genome + '.nin')
            and os.path.exists(samplecfg.genome + '.nhr')
            and os.path.exists(samplecfg.genome + '.nsq')):
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            dbtype='nucl',
                                            input_file=samplecfg.genome)
        stdout, stderr = makedb()
        blastlog.append('\n\nNcbimakeblastdb\n\n')
        blastlog.append(stdout)
        blastlog.append(stderr)
    # blast
    sampleblast = NcbiblastnCommandline(
        task='blastn',
        query=samplecfg.fasta,
        db=samplecfg.genome,
        outfmt=samplecfg.view,
        evalue=samplecfg.evalue,
        out=mydir + '/blast_' + samplecfg.sample + '.' + samplecfg.suffix + '.txt')
    stdout, stderr = sampleblast()
    blastlog.append('\n\nNcbiblastn\n\n')
    blastlog.append(stdout)
    blastlog.append(stderr)
    return blastlog
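# A minimal usage sketch for blast_func(). The attribute names are inferred
# from the function body; SimpleNamespace stands in for whatever config class
# the original project used, and the module-level `mydir` output directory is
# an assumption for illustration.
from types import SimpleNamespace

mydir = '.'  # assumed output directory referenced inside blast_func()

samplecfg = SimpleNamespace(genome='genome.fa',   # indexed in place (.nin/.nhr/.nsq)
                            fasta='reads.fasta',  # query sequences
                            view=6,               # BLAST outfmt (tabular)
                            evalue=1e-5,
                            sample='sample1',
                            suffix='nt')
log = blast_func(samplecfg)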
def blastdb(db_file):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.9.0+/bin/makeblastdb',
        dbtype='nucl',
        input_file=db_file)
    make_db_cmd()
def create_db(db_name: str) -> bool:
    blastsets = {
        'TAIR10_Whole_Genome': 'TAIR10_bac_con_20101028',
        'TAIR10_CDS': 'TAIR10_cds_20101214_updated',
        'TAIR10_Genes': 'TAIR10_cdna_20110103_representative_gene_model_updated'
    }
    # create Database directory
    if not os.path.isdir(db_file_path):
        os.makedirs(db_file_path)
        logger.info('Database dir has been created.')
    # create blastsets directory
    if not os.path.isdir(ref_path):
        os.makedirs(ref_path)
        logger.info('blastsets dir has been created.')
    if db_name not in blastsets:
        logger.error('Entered a database that is not registered.')
        raise ValueError(f'unknown database: {db_name}')
    source = blastsets[db_name]
    source_path = os.path.join(ref_path, source)
    if not os.path.exists(source_path):
        subprocess.run(
            ['curl', '-O',
             'ftp://ftp.arabidopsis.org/home/tair/Sequences/blast_datasets/TAIR10_blastsets/' + source],
            cwd=ref_path)
        logger.info('The source download is finished.')
    cline = NcbimakeblastdbCommandline(input_file=source_path,
                                       dbtype='nucl',
                                       parse_seqids=True,
                                       out=os.path.join(db_file_path, db_name))
    stdout, stderr = cline()
    if stderr:
        logger.debug(stderr)
        return False
    logger.debug(stdout)
    return True
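# Hedged usage sketch for create_db(). The function relies on module-level
# names (db_file_path, ref_path, logger, os, subprocess), so they are stubbed
# here for illustration; the directory names are assumptions.
import logging
import os
import subprocess

db_file_path = 'Database'
ref_path = 'blastsets'
logger = logging.getLogger(__name__)

if create_db('TAIR10_CDS'):
    print('BLAST database ready under', db_file_path)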
def _reverse_blast_iden(self, threads):
    os.symlink(os.path.abspath(self.ref),
               os.path.join(self.tmp_dir, 'ref.fasta'))
    makeblastdb_cline = NcbimakeblastdbCommandline(
        dbtype='prot', input_file=os.path.join(self.tmp_dir, 'ref.fasta'))
    blastp_cline = NcbiblastpCommandline(
        query=os.path.join(self.tmp_dir, 'putative.fasta'),
        db=os.path.join(self.tmp_dir, 'ref.fasta'),
        evalue='1e-3',
        outfmt="6 qacc sacc qlen slen length pident evalue",
        max_hsps=1,
        num_threads=threads,
        out=os.path.join(self.tmp_dir, 'reverse_blast.tbl'))
    makeblastdb_cline()
    blastp_cline()
    blast_result = pd.read_table(
        os.path.join(self.tmp_dir, 'reverse_blast.tbl'),
        header=None,
        names=['qacc', 'sacc', 'qlen', 'slen', 'length', 'pident', 'evalue'])
    seq_idx_lst = [
        _[0] for _ in enumerate(blast_result['sacc'].to_list())
        if _[1] in self.seed
    ]
    _tmp_lst = blast_result['qacc'].to_list()
    seq_lst = [_tmp_lst[_] for _ in seq_idx_lst]
    return set(seq_lst), blast_result[blast_result['qacc'].isin(seq_lst)]
def create(self) -> tuple:
    """Make a local BLAST database.

    Creates a database from the files found in source_dir.

    Returns:
        tuple(Database, str): Database object, output from makeblastdb.

    Creates:
        (*.nhr, *.nin, *.nsq): the created database's files in LMDB format.

    Raises:
        SubprocessError: when makeblastdb returns an error or the input
            file does not exist.
    """
    self._aggregate(self.source_dir, Path("blast_input.fasta"))
    try:
        cmd = NcbimakeblastdbCommandline(input_file="blast_input.fasta",
                                         dbtype="nucl",
                                         title=self.name,
                                         out=self.name)
        makeblastdb_output = subprocess.run(str(cmd),
                                            capture_output=True,
                                            shell=True)
        if makeblastdb_output.stderr:
            raise subprocess.SubprocessError(
                f"Makeblastdb returned error: {makeblastdb_output.stderr.decode()}")
    finally:
        if Path("blast_input.fasta").exists():
            Path("blast_input.fasta").unlink()
    return self, makeblastdb_output.stdout.decode()
def blast_iden(self, threads=2):
    shutil.copyfile(self.db, os.path.join(self.tmp_dir, 'database.fasta'))
    makeblastdb_cline = NcbimakeblastdbCommandline(
        dbtype='prot', input_file=os.path.join(self.tmp_dir, 'database.fasta'))
    blastp_cline = NcbiblastpCommandline(
        query=self.seed,
        db=os.path.join(self.tmp_dir, 'database.fasta'),
        evalue='1e-5',
        outfmt="6 qacc sacc qlen slen length pident evalue",
        max_hsps=1,
        num_threads=threads,
        out=os.path.join(self.tmp_dir, 'blast.tbl'))
    makeblastdb_cline()
    blastp_cline()
    blast_result = pd.read_table(
        os.path.join(self.tmp_dir, 'blast.tbl'),
        header=None,
        names=['qacc', 'sacc', 'qlen', 'slen', 'length', 'pident', 'evalue'])
    blast_result = blast_result[
        (blast_result['pident'] > 50)
        & (blast_result['length'] / blast_result['slen'] > 0.5)
        & (blast_result['length'] / blast_result['qlen'] > 0.5)]
    blast_result.to_csv(os.path.join(self.tmp_dir, 'blast2.tbl'),
                        sep='\t', index=False)
    seq_list = [
        _ for _ in SeqIO.parse(self.db, 'fasta')
        if _.id in blast_result['sacc'].to_list()
    ]
    SeqIO.write(seq_list, os.path.join(self.tmp_dir, 'subgenes.fasta'), 'fasta')
def database_blast(self):
    database_cmd = NcbimakeblastdbCommandline(
        cmd=os.path.join(self.exec, 'makeblastdb'),
        dbtype='nucl',
        input_file=self.out_path)
    database_cmd()
    print('IdenDSS database created successfully!')
def create(self, input_file: Path):
    """Make a local BLAST database.

    Creates a database from tRNAs retrieved from tRNAscan-SE.

    Args:
        input_file (Path): path to the file containing DB sequences.

    Returns:
        str: output from makeblastdb.

    Creates:
        (*.nhr, *.nin, *.nsq): the created database's files in LMDB format.

    Raises:
        SubprocessError: when makeblastdb returns an error or the input
            file does not exist.
    """
    try:
        cmd = NcbimakeblastdbCommandline(input_file=str(input_file),
                                         dbtype="nucl",
                                         title=self.name,
                                         out=self.name)
        # run makeblastdb once, via subprocess, so stdout/stderr are captured
        # (calling cmd() here as well would build the database twice)
        makeblastdb_output = subprocess.run(str(cmd),
                                            capture_output=True,
                                            shell=True)
        if makeblastdb_output.stderr:
            raise subprocess.SubprocessError(
                f"Makeblastdb returned error: {makeblastdb_output.stderr.decode()}")
    finally:
        if input_file.exists():
            input_file.unlink()
    return makeblastdb_output.stdout.decode()
def blast():
    with open(concat_exons) as concatenated:
        concatenated_exons = SeqIO.to_dict(
            SeqIO.parse(concatenated, 'fasta', generic_dna))
    with open(f"{concat_exons.split('.')[0]}_names_corrected.fas", 'w') as corrected:
        for key in concatenated_exons.keys():
            corrected.write(
                f">{key.split('-')[1]}-{key.split('-')[3]}\n"
                f"{str(concatenated_exons[key].seq)}\n")
    print('Building database for %s...' % concat_exons)
    NcbimakeblastdbCommandline(
        dbtype='nucl',
        input_file=f"{concat_exons.split('.')[0]}_names_corrected.fas",
        out=concat_exons,
        parse_seqids=True)()
    print('Done')
    print(f'Blasting {probes} against {concat_exons}')
    NcbiblastnCommandline(
        task=blast_task,
        query=probes,
        db=concat_exons,
        out=f'{probes}_against_{concat_exons}.txt',
        outfmt="6 qaccver saccver pident qcovhsp evalue bitscore",
        num_threads=4)()
    print('Done')
def blastdb(in_file, db_file):
    make_db_cmd = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                             dbtype='nucl',
                                             input_file=in_file,
                                             out=db_file)
    make_db_cmd()
def blastdb(species_id_path):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/makeblastdb',
        dbtype='nucl',
        input_file=species_id_path,
        out=str(species_db_dir / species_id_path.stem))
    make_db_cmd()
def make_blastdb(seqs_file):
    """Make a BLAST database from a protein FASTA file.

    Args:
        seqs_file (str): protein sequence FASTA file path.
    """
    makeblastdb = NcbimakeblastdbCommandline(dbtype="prot", input_file=seqs_file)
    out, err = makeblastdb()
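# Hedged usage sketch for make_blastdb(); 'proteins.fasta' is a hypothetical
# input file. makeblastdb writes its .phr/.pin/.psq index files next to the
# input FASTA, so the database can then be queried under the same path.
import os

make_blastdb('proteins.fasta')
assert os.path.exists('proteins.fasta.phr')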
def make_blast_database(in_filename: str, db_filename: str):
    """Make local BLAST database from given file."""
    NcbimakeblastdbCommandline(input_file=in_filename,
                               parse_seqids=True,
                               title='e_coli_genome',
                               dbtype='nucl',
                               out=db_filename)()
def create_blast_db(fa_file_path=None, dbtype="nucl"):
    """Create a new BLAST db.

    Takes the path of the collection of *.fa files as input.
    """
    print("Creating the blast DB...")
    create = NcbimakeblastdbCommandline(input_file=fa_file_path, dbtype=dbtype)
    create()
    print(f"Blast DB created at {fa_file_path}.")
def blast(query, database, dbtype, title, evalue, outfmt, out):
    # build the BLAST database
    cline = NcbimakeblastdbCommandline(dbtype=dbtype,
                                       input_file=database,
                                       title=title)
    cline()
    # run tblastn against that database
    blastx_cline = NcbitblastnCommandline(query=query,
                                          db=database,
                                          evalue=evalue,
                                          outfmt=outfmt,
                                          out=out)
    blastx_cline()
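# Hedged usage sketch for blast(); all file names here are hypothetical.
# tblastn searches a protein query against the translated nucleotide
# database that the function builds first.
blast(query='query_proteins.fasta',
      database='contigs.fasta',
      dbtype='nucl',
      title='contigs_db',
      evalue=1e-5,
      outfmt=6,
      out='tblastn_hits.tsv')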
def blast_pair(self):
    print('Screen SSR start')
    # make a temporary directory for BLAST
    os.mkdir(self._tmpdir)
    sequences = SeqIO.to_dict(SeqIO.parse(self._seq_path, 'fasta'))
    lengths = {_id: len(_seq.seq) for _id, _seq in sequences.items()}
    ssr_info = pd.read_table(self._ssr_info)
    tmp_lst = []
    for _ in self._assembly.values():
        tmp_lst += _
    ssr_info = ssr_info[ssr_info['seqid'].isin(tmp_lst)]
    del tmp_lst
    if not self._circular:
        ssr_info = ssr_info[ssr_info.apply(
            lambda x: (x['start'] > 200) & (x['end'] < lengths[x['seqid']] - 200),
            axis=1)]
    # prepare sequences
    print('(1/3) prepare sequence for BLAST')
    seqlist = []
    for _idx, _item in ssr_info.iterrows():
        genome_seq = sequences[_item['seqid']]
        _sequence_id = str(genome_seq.id) + "_" + str(_item['start']) + "_" + str(_item['end'])
        _sequence_template = str(get_seq(genome_seq, _item['start'], _item['end']))
        seqlist.append('>' + _sequence_id)
        seqlist.append(_sequence_template)
    with open(os.path.join(self._tmpdir, 'query.fasta'), 'w') as f:
        f.write('\n'.join(seqlist))
        f.write('\n')
    # BLAST
    print('(2/3) BLAST start')
    database_cmd = NcbimakeblastdbCommandline(
        dbtype='nucl', input_file=os.path.join(self._tmpdir, 'query.fasta'))
    blastn_cmd = NcbiblastnCommandline(
        query=os.path.join(self._tmpdir, 'query.fasta'),
        db=os.path.join(self._tmpdir, 'query.fasta'),
        dust='no',
        outfmt='\"6 qacc sacc length pident evalue\"',
        num_threads=self._threads,
        evalue='1e-3',
        out=os.path.join(self._tmpdir, 'blast_result.txt'),
        max_hsps=1)
    database_cmd()
    blastn_cmd()
    print('(2/3) BLAST Done')
def run(self):
    num_cases = len(list(SeqIO.parse(self.fasta, 'fasta')))
    # Make Database
    clinedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                         dbtype='prot',
                                         input_file=self.fasta,
                                         input_type='fasta',
                                         out=self.fasta)
    clinedb()
    # Calculation
    clinec = NcbiblastpCommandline(
        cmd='blastp',
        query=self.fasta,
        db=self.fasta,
        evalue=10,
        outfmt='6 qseqid sseqid pident evalue bitscore score',
        max_target_seqs=num_cases,
        max_hsps=1,
        num_threads=self.cpucount,
        out='all_vs_all.tsv')
    clinec()
    # Data Processing
    data = pd.read_csv(
        'all_vs_all.tsv',
        delimiter='\t',
        names=['seq1', 'seq2', 'pident', 'evalue', 'bitscore', 'score'])
    no_dups = self.clean_duplicates(data)
    pivoted = no_dups.pivot(index='seq1',
                            columns='seq2',
                            values=self.type[self.matrix_type])
    for column in pivoted.columns:
        pos = pivoted.index.get_loc(column) + 1
        for index in pivoted.index[pos:]:
            if column != index:
                pivoted.loc[index, column] = pivoted.loc[column, index]
    os.remove('all_vs_all.tsv')
    os.remove('{}.phr'.format(self.fasta))
    os.remove('{}.pin'.format(self.fasta))
    os.remove('{}.psq'.format(self.fasta))
    pivoted_round = pivoted.round(2)
    self.onfinished.emit(pivoted_round)
def write_output(protein_records, cache_dir, args):
    """Write out the extracted protein sequences to the specified output(s).

    :param protein_records: list of SeqRecords
    :param cache_dir: Path, cache directory
    :param args: cmd-line args parser

    Return nothing.
    """
    logger = logging.getLogger(__name__)

    if args.fasta_file:
        SeqIO.write(protein_records, args.fasta_file, "fasta")

    if args.fasta_dir:
        for record in protein_records:
            accession = record.id
            target_path = args.fasta_dir / f'{accession}.fasta'
            SeqIO.write([record], target_path, "fasta")

    if args.blastdb:
        fasta_name = args.blastdb / 'blastdb.fasta'
        # write the records to the db input file (not the per-record target_path)
        SeqIO.write(protein_records, fasta_name, "fasta")
        cmd_makedb = NcbimakeblastdbCommandline(
            cmd='makeblastdb',
            dbtype='prot',
            input_file=fasta_name,
        )
        stdout, stderr = cmd_makedb()

        # check the command was successfully executed
        if len(stderr) != 0:
            logger.warning(
                f"Could not build non-CAZyme db.\nstdout={stdout}\nstderr={stderr}")

    cache_path = cache_dir / 'extracted_sequences.txt'
    with open(cache_path, 'a') as fh:
        for record in protein_records:
            fh.write(f"{record.id}\n")
    return
def run_blast(query_file, species_id_path, species_out_path):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/makeblastdb',
        dbtype='nucl',
        input_file=species_id_path,
        out=str(species_out_path / "blastdb" / species_id_path.stem))
    blast_cmd = NcbiblastnCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn',
        query=query_file,
        db=species_out_path / "blastdb" / species_id_path.stem,
        outfmt=11,
        out=species_out_path / "asn" / (species_id_path.stem + ".asn")
        # perc_identity=95
    )
    blast_xml_cmd = NcbiblastformatterCommandline(
        archive=species_out_path / "asn" / (species_id_path.stem + ".asn"),
        outfmt=5,
        out=species_out_path / "xml" / (species_id_path.stem + ".xml"),
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter')
    blast_txt_cmd = NcbiblastformatterCommandline(
        archive=species_out_path / "asn" / (species_id_path.stem + ".asn"),
        outfmt=7,
        out=species_out_path / "txt" / (species_id_path.stem + ".txt"),
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter')
    db_file = species_out_path / "blastdb" / (species_id_path.stem + ".ndb")
    if not (species_out_path / "xml" / (species_id_path.stem + ".xml")).exists():
        if not db_file.exists():
            make_db_cmd()
        # on failure, print the command that actually failed
        try:
            blast_cmd()
        except ApplicationError:
            print(blast_cmd)
        try:
            blast_txt_cmd()
        except ApplicationError:
            print(blast_txt_cmd)
        try:
            blast_xml_cmd()
        except ApplicationError:
            print(blast_xml_cmd)
def _forward_blast_iden(self, threads):
    all_proteins = SeqIO.to_dict(SeqIO.parse(self.ref, 'fasta'))
    if self.seed:
        seed_proteins = [all_proteins.get(_) for _ in self.seed]
    else:
        # no accessions supplied: every sequence in the ref fasta is a seed
        # (the dict values are the SeqRecords that SeqIO.write expects)
        seed_proteins = list(all_proteins.values())
    SeqIO.write(seed_proteins, os.path.join(self.tmp_dir, 'seed.fasta'), 'fasta')
    del all_proteins, seed_proteins
    os.symlink(os.path.abspath(self.query),
               os.path.join(self.tmp_dir, 'query.fasta'))
    makeblastdb_cline = NcbimakeblastdbCommandline(
        dbtype='prot', input_file=os.path.join(self.tmp_dir, 'query.fasta'))
    blastp_cline = NcbiblastpCommandline(
        query=os.path.join(self.tmp_dir, 'seed.fasta'),
        db=os.path.join(self.tmp_dir, 'query.fasta'),
        evalue='1e-3',
        outfmt="6 qacc sacc qlen slen length pident evalue",
        max_hsps=1,
        num_threads=threads,
        out=os.path.join(self.tmp_dir, 'blast.tbl'))
    makeblastdb_cline()
    blastp_cline()
    blast_result = pd.read_table(
        os.path.join(self.tmp_dir, 'blast.tbl'),
        header=None,
        names=['qacc', 'sacc', 'qlen', 'slen', 'length', 'pident', 'evalue'])
    seq_list = [
        _ for _ in SeqIO.parse(self.query, 'fasta')
        if _.id in blast_result['sacc'].to_list()
    ]
    SeqIO.write(seq_list, os.path.join(self.tmp_dir, 'putative.fasta'), 'fasta')
def blast_self(self):
    os.mkdir(self._tmpdir)
    # prepare fasta
    seq_list = []
    for _idx, _row in self._primer_info.iterrows():
        try:
            seq_list.append(
                SeqRecord(Seq(_row['Forward']),
                          id=_row['ID'] + '_F',
                          name='',
                          description=''))
            seq_list.append(
                SeqRecord(Seq(_row['Reverse']),
                          id=_row['ID'] + '_R',
                          name='',
                          description=''))
        except TypeError:
            continue
    SeqIO.write(seq_list, os.path.join(self._tmpdir, 'query.fasta'), 'fasta')
    # BLAST
    database_cmd = NcbimakeblastdbCommandline(dbtype='nucl',
                                              input_file=self._seq_path)
    blastn_cmd = NcbiblastnCommandline(
        query=os.path.join(self._tmpdir, 'query.fasta'),
        db=self._seq_path,
        task='blastn-short',
        outfmt=5,
        num_threads=self._threads,
        evalue='10',
        out=os.path.join(self._tmpdir, 'blast_result.xml'),
        max_hsps=1,
        max_target_seqs=2)
    try:
        database_cmd()
        blastn_cmd()
    except IOError:
        print('please check your file and/or its permission')
def create_hit_tables(fasta_file, probe_exons, n_cpu, length_cover, log_file):
    """Run BLAST on every FASTA file with contigs.

    The probe file is blasted against the contigs and the BLAST results are
    saved in a text file.
    """
    logger = create_logger(log_file)
    path = os.path.dirname(fasta_file)
    fasta_file: str = os.path.basename(fasta_file)
    sample: str = Path(fasta_file).stem
    NcbimakeblastdbCommandline(
        dbtype="nucl",
        input_file=os.path.join(path, fasta_file),
        out=os.path.join(path, sample),
        parse_seqids=True,
    )()
    logger.info(f"\t\tCreating hit table for {sample}. Running BLAST...")
    NcbiblastnCommandline(
        task="blastn",
        query=probe_exons,
        db=os.path.join(path, sample),
        out=os.path.join(path, f"reference_in_{sample}_contigs.txt"),
        qcov_hsp_perc=length_cover,
        outfmt="6 qaccver saccver pident qcovhsp evalue bitscore sstart send",
    )()
    logger.info(f"\t\tHit table for {sample} is ready")
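# Hedged usage sketch for create_hit_tables(); the paths are hypothetical.
# It builds a BLAST db from the contig assembly, then searches the probe
# exons against it with the requested minimum HSP query coverage.
create_hit_tables(fasta_file='assemblies/sample1.fasta',
                  probe_exons='probes/probe_exons.fasta',
                  n_cpu=4,
                  length_cover=75,
                  log_file='hit_tables.log')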
fh_perc_id_out = open('perc_id.txt', 'a')
header = "\t".join(["OG_group", "ORF_id", "Gene_id", "Perc_id", "Closest_SDP"])
fh_perc_id_out.write(header + '\n')
count_aln_results = 0
count_species_recruits = dict()
for orf_file in orf_files:
    print('Processing orf-file: ', orf_file)
    filename_split = orf_file.split('_')
    OG = filename_split[0]
    core_aln_file = OG + '_aln_nuc.fasta'
    core_aln_file_fullname = core_aln_dir + '/' + core_aln_file
    core_ffn_file = OG + '.ffn'
    core_ffn_file_full = core_aln_dir + '/' + core_ffn_file
    # copy the core ffn-file as a temporary file to the SDP dir, then makeblastdb
    copyfile(core_ffn_file_full, "temp.ffn")
    makeblastdb_cmd = NcbimakeblastdbCommandline(dbtype="nucl", input_file="temp.ffn")
    makeblastdb_cmd()
    # Loop over orfs in the current orf-file: write each orf to a temporary
    # fasta-file, blast it against the core-seqs, and get the
    # species-affiliation of the first blast-hit.
    for seq_record in SeqIO.parse(orf_file, "fasta"):
        SeqIO.write(seq_record, "temp_orf.ffn", "fasta")
        blast_result = get_best_blast_hit('temp_orf.ffn', 'temp.ffn')
        if blast_result is None:
            continue
        # Add the orf to the core alignment with muscle and get the max perc-id
        aln_result = add_orf_perc_id(core_aln_file_fullname, 'temp_orf.ffn')
        if aln_result[2] != 0:
            orf_max_perc_id = round(aln_result[2], 2)
            orf_id = aln_result[0]
            best_hit_to_species_gene_id = aln_result[1]
            best_species = "other"
            if blast_result in species_dict:
                best_species = species_dict[blast_result]
shutil.copyfileobj(f_in, f_out)

if not exists(result):
    if not exists(name_subject):
        print(name_subject, 'file does not exist in path, downloading...')
        file_hpv = wget.download(url_hpv)
    if not exists(name_query):
        print(name_query, 'file does not exist in path, downloading...')
        file_cov = wget.download(url_cov)
    unzip(name_subject)
    unzip(name_query)
    subject_cline = NcbimakeblastdbCommandline(cmd=makeblast,
                                               dbtype="prot",
                                               input_file=subject,
                                               out=subject_out)
    query_cline = NcbimakeblastdbCommandline(cmd=makeblast,
                                             dbtype="prot",
                                             input_file=query,
                                             out=query_out)
    print(subject_cline)
    s_stdout, s_stderr = subject_cline()
    q_stdout, q_stderr = query_cline()
    # cov2 = SeqIO.parse(gzip.open(name_query, mode), format=format)
    # hpv = SeqIO.parse(gzip.open(name_subject, mode), format=format)
    result_cline = NcbiblastpCommandline(cmd=blast,
                                         query=query,
                                         db=subject_out,
                                         out=result,
def validateFullLengthSequencesUsingBlast(referenceSequences=None,
                                          fullLengthSequences=None,
                                          outputDirectory=None,
                                          threadCount=1,
                                          batchSize=50,
                                          verbose=False,
                                          delimiter='\t',
                                          keepBlastFiles=False):
    # TODO: Blast is hopefully faster than pairwise alignments,
    # but it only does local alignments.
    print('Validating ' + str(len(fullLengthSequences)) + ' sequences against '
          + str(len(referenceSequences))
          + ' Reference Sequences using Blast Alignments (threads='
          + str(threadCount) + ')')

    # Start a thread pool
    queryBatches = []
    batchResults = []
    pool = multiprocessing.Pool(threadCount)
    before = currentMillis()

    # Create Blast Reference
    blastDirectory = join(outputDirectory, 'blast_results')
    if not isdir(blastDirectory):
        makedirs(blastDirectory)
    referenceFileName = join(blastDirectory, 'BlastReference.fasta')
    printSequences(alleleSequences=referenceSequences,
                   outputFilename=referenceFileName,
                   verbose=verbose)
    cline = NcbimakeblastdbCommandline(dbtype="nucl",
                                       input_file=referenceFileName)
    stdout, stderr = cline()
    if verbose:
        print('MakeDB Commandline:\n' + str(cline))
        print('Output:' + str(stdout))
        print('Errors?:' + str(stderr))

    # Split Query Sequences into Batches
    if verbose:
        print('Splitting ' + str(len(fullLengthSequences))
              + ' sequences into batches of size ' + str(batchSize))
    newBatch = []
    for sequenceIndex, sequence in enumerate(fullLengthSequences):
        newBatch.append(sequence)
        if len(newBatch) >= batchSize or sequenceIndex == len(fullLengthSequences) - 1:
            # Done with this batch. Write it to file.
            batchFileName = join(
                blastDirectory,
                'Batch' + str(len(queryBatches)) + 'Sequences.fasta')
            printSequences(alleleSequences=newBatch,
                           outputFilename=batchFileName,
                           verbose=verbose)
            queryBatches.append(newBatch)
            newBatch = []
    if verbose:
        print('Found ' + str(len(queryBatches)) + ' batches of size <= '
              + str(batchSize))

    # For each Batch
    for batchIndex, batch in enumerate(queryBatches):
        batchFileName = join(blastDirectory,
                             'Batch' + str(batchIndex) + 'Sequences.fasta')
        # Start thread to run blast against references
        if threadCount > 1:
            batchResults.append(
                pool.starmap_async(
                    findBestReferenceSequence,
                    [[referenceFileName, batchFileName, verbose]]))
        else:
            batchResults.append(
                findBestReferenceSequence(referenceFileName=referenceFileName,
                                          batchFileName=batchFileName,
                                          verbose=verbose))
    pool.close()
    pool.join()

    # Delete blast output files
    if not keepBlastFiles:
        cleanupBlastOutputFiles(blastDirectory=blastDirectory,
                                referenceFileName=referenceFileName,
                                queryBatches=queryBatches)

    # Create output file
    sequenceValidationResultsFile = open(
        join(outputDirectory, 'ReferenceFinderValidationResults.csv'), 'w')
    sequenceValidationResultsFile.write('Query_Name' + delimiter
                                        + 'Best_Reference' + delimiter
                                        + 'Alignment_Score\n')

    # Each batch result should be a dictionary. Take those results and write them.
    for batchResult in batchResults:
        if threadCount > 1:
            # If it's multi-threaded we need to "get" the value
            currentBatchResults = batchResult.get()[0]
        else:
            currentBatchResults = batchResult
        for queryAlleleName in currentBatchResults.keys():
            bestReferenceName, alignmentScore = currentBatchResults[queryAlleleName]
            sequenceValidationResultsFile.write(
                str(queryAlleleName) + delimiter + str(bestReferenceName)
                + delimiter + str(alignmentScore) + '\n')
    sequenceValidationResultsFile.close()

    if verbose:
        after = currentMillis()
        print('Finding References for ' + str(len(fullLengthSequences))
              + ' sequences took ' + str(after - before) + ' milliseconds.')
def main(argv):
    argsgiven = 0
    query = ''
    subject = ''
    build_DB = True
    usage = ('seq_uniq_seek.py -q <queryfile>.fasta -s <subjectfile>.fasta '
             '-B [build database true/false]')
    verbal = True
    opts, args = getopt.getopt(argv, "xmhq:s:o:", ["subject=", "query="])
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt == '-x':
            build_DB = False
        elif opt in ("-q", "--query"):
            query = arg
            argsgiven += 1
        elif opt in ("-m", "--mute"):
            verbal = False
            argsgiven += 1
        elif opt in ("-s", "--subject"):
            subject = arg
            argsgiven += 1
        elif opt in ("-o", "--output"):
            output = arg
            argsgiven += 1
    if argsgiven < 3:
        print(usage)
        sys.exit(2)
    if verbal:
        print("\n ---- ==== SEEK UNIQ SEQ ==== ---- \n"
              "Finding sequences occurring in " + query
              + " that are not occurring in " + subject
              + " and saving in " + output + ".fasta\n")
    if build_DB:
        if verbal:
            print("Building blast database for subject file (" + subject + ")")
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            input_file=subject,
                                            dbtype='nucl',
                                            parse_seqids=True)
        makedb()
        if verbal:
            print("Done.\n")
    else:
        if verbal:
            print("Not building database. Hoping for the best")
    if verbal:
        print("Blasting query (" + query + ") against subject database ("
              + subject + ")")
    if verbal:
        print("Splitting query into multiple files to save memory.")
    shutil.rmtree("chunks", ignore_errors=True)
    os.mkdir("chunks")
    record_iter = SeqIO.parse(open(query), "fasta")
    for i, batch in enumerate(batch_iterator(record_iter, 10000)):
        filename = "chunks/chunk_%i.fasta" % (i + 1)
        with open(filename, "w") as handle:
            count = SeqIO.write(batch, handle, "fasta")
    if verbal:
        print("Building query index dictionary")
    q_dict = SeqIO.index(query, "fasta")
    hits = []
    chunks = glob.glob('chunks/chunk*')
    for i, file in enumerate(chunks):
        now = datetime.now()
        dt_string = now.strftime("%d-%m_%H:%M:%S")
        print("[xenoseq_blast " + dt_string
              + "] So anyway... I'm busy blasting... "
              + str(round(i / len(chunks) * 100, 2)) + "%")
        blastn_cline = NcbiblastnCommandline(cmd='blastn',
                                             query=file,
                                             db=subject,
                                             num_threads=8,
                                             evalue=1e-5,
                                             perc_identity=90,
                                             outfmt=5,
                                             out="reads_all_vs_all.xml")
        blastn_cline()
        # Bit below is from:
        # https://biopython.org/wiki/Retrieve_nonmatching_blast_queries
        for record in NCBIXML.parse(open("reads_all_vs_all.xml")):
            for alignment in record.alignments:
                if alignment.length > 100:
                    hits.append(record.query.split()[0])
        os.remove("reads_all_vs_all.xml")
    shutil.rmtree("chunks")
    if verbal:
        print("Subtracting hits from query dict keys")
    misses = set(q_dict.keys()) - set(hits)
    orphans = [q_dict[name] for name in misses]
    if verbal:
        print("%i out of %i records in query are unique" % (len(misses), len(q_dict)))
    if verbal:
        print("Writing to file %s" % (output))
    SeqIO.write(orphans, output, 'fasta')
    if verbal:
        print("Done. Hoping for the best.\n")
def determine_DGR_activity_from_metagenome(rawdatafile, reference_genomefile,
                                           VRs, TRs, output_folder,
                                           rawdatafile2):
    # Create a temp directory in output_folder to store information
    temp_folder = '%s/temp' % (output_folder)
    Path(temp_folder).mkdir(parents=True, exist_ok=True)

    # Keep track of files that need to be deleted at the end of the analysis
    temp_files = []
    temp_blast_db = []

    # Format the reference genome file for easier searching later
    formatted_ref_genomefile = '%s/formatted_ref_genome.fasta' % (temp_folder)
    temp_files.append(formatted_ref_genomefile)
    utils.format_ref_genome_file(reference_genomefile, formatted_ref_genomefile)

    # Get the name of the reference genome file and remove the file extension
    reference_genome_name = '.'.join(
        reference_genomefile.split('/')[-1].split('.')[:-1])
    reference_genome_name = reference_genome_name.replace('_contigs', '')

    # Get the name of the rawdata file and remove the file extension
    rawdata_name = rawdatafile.split('/')[-1].split('.')[0]

    # Create an error list for later debugging
    errors = []
    error_file = '%s/DGR_analysis_%s_errors.txt' % (output_folder, rawdata_name)

    # Determine if rawdatafile is in .gz format
    if rawdatafile.split('.')[-1] == 'gz':
        print('Uncompressing raw data')
        os.system('cp %s %s/%s.fastq.gz' % (rawdatafile, temp_folder, rawdata_name))
        os.system('unpigz -p 4 %s/%s.fastq.gz' % (temp_folder, rawdata_name))
        temp_files.append('%s/%s.fastq' % (temp_folder, rawdata_name))
        if rawdatafile2 is not None:
            os.system('cp %s %s/%s_2.fastq.gz' % (rawdatafile, temp_folder, rawdata_name))
            os.system('unpigz -p 4 %s/%s_2.fastq.gz' % (temp_folder, rawdata_name))
            os.system('cat %s/%s_2.fastq >> %s/%s.fastq' %
                      (temp_folder, rawdata_name, temp_folder, rawdata_name))
            os.system('rm %s/%s_2.fastq' % (temp_folder, rawdata_name))
        rawdatafile = '%s/%s.fastq' % (temp_folder, rawdata_name)

    # If the rawdata file is in FASTQ, it needs to be converted to FASTA for
    # blast, and that file is deleted at the end. Otherwise just use the
    # supplied fasta file, which does not need to be deleted.
    if utils.is_fastq_file(rawdatafile):
        print('Converting rawdata from FASTQ to FASTA')
        rawdata_fasta = '%s/%s.fa' % (temp_folder, rawdata_name)
        temp_files.append(rawdata_fasta)
        with open(rawdata_fasta, 'w') as f:
            data = SeqIO.parse(rawdatafile, 'fastq')
            i = 1
            for seq in data:
                f.write('>Sequence%i\n%s\n' % (i, str(seq.seq)))
                i += 1
    else:
        rawdata_fasta = rawdatafile

    vr_100_filename = '%s/VR-100bp.fasta' % (temp_folder)
    temp_files.append(vr_100_filename)
    # Create an empty file
    with open(vr_100_filename, 'w') as vr_100:
        pass

    for VR_file, TR_file in zip(VRs, TRs):
        if Path(VR_file).stat().st_size > 0:
            VR_start, VR_end, VR_contig_num, VR_seq = utils.extract_sequence(
                VR_file, formatted_ref_genomefile)
            TR_start, TR_end, TR_contig_num, TR_seq = utils.extract_sequence(
                TR_file, formatted_ref_genomefile)
            if len(VR_seq) != len(TR_seq):
                raise ValueError('VR and TR lengths are not equal')
            unique_name = '%s-Contig%s_%s_%s' % (
                reference_genome_name, VR_contig_num, VR_start, VR_end)
            print('Creating VR area files for %s' % (unique_name))
            # Define the VR area +/- 100 bp in order to map to the VR region
            with open(vr_100_filename, 'a') as vr_100:
                with open(formatted_ref_genomefile, 'r') as ref_genome:
                    rg_parser = SeqIO.parse(ref_genome, 'fasta')
                    for contig in rg_parser:
                        if contig.name == 'Contig%i' % (VR_contig_num):
                            vr_area_start = VR_start - 100
                            if vr_area_start < 0:
                                vr_area_start = 0
                            vr_area_end = VR_end + 100
                            if vr_area_end > len(contig.seq):
                                vr_area_end = len(contig.seq)
                            vr_100.write('>VR_area_100-%s\n%s' % (
                                unique_name,
                                str(contig.seq[vr_area_start:vr_area_end])))
                            break
        else:
            errors.append('%s in %s did not contain a VR' % (VR_file, rawdata_name))

    # Store all sequences from raw data that may potentially match the VR area +/- 100 bp
    sequences_matched_to_vr_100_area = '%s/%s-seqs_match_VR100.fasta' % (
        temp_folder, rawdata_name)
    vr_100_blast_database = '%s/vr100_blastdb' % (temp_folder)
    blastoutput_from_vr100_blast = '%s/blastoutput_vr100.txt' % (temp_folder)

    # Delete these files at the end during cleanup
    temp_files.append(sequences_matched_to_vr_100_area)
    temp_blast_db.append(vr_100_blast_database)
    temp_files.append(blastoutput_from_vr100_blast)

    # Create a blast database with the VR +/- 100 area
    cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                       input_file=vr_100_filename,
                                       out=vr_100_blast_database)
    cline()

    print('Finding potential raw data sequences that match the surrounding VR area')
    # Set up blast output options
    # output_options = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
    #                   'gapopen', 'qstart', 'qend', 'sstart', 'send',
    #                   'evalue', 'bitscore', 'sseq', 'qseq']
    output_options = ['qseqid', 'length', 'qlen']
    blast_out_str = ' '.join(output_options)

    # Blast all rawdata against the VR +/- 100 blast database
    cline = NcbiblastnCommandline(out=blastoutput_from_vr100_blast,
                                  db=vr_100_blast_database,
                                  query=rawdata_fasta,
                                  outfmt='6 %s' % (blast_out_str),
                                  word_size=8,
                                  reward=1,
                                  penalty=-1,
                                  evalue=1e-4,
                                  gapopen=6,
                                  gapextend=6,
                                  perc_identity=80,
                                  task='blastn',
                                  dust='no')
    cline()

    # Find sequences where at least 80% of the read aligned
    # (gets rid of partially aligned sequences)
    potential_seqs = []
    with open(blastoutput_from_vr100_blast, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for query_id, align_len, query_len in reader:
            if int(align_len) >= (0.8 * int(query_len)):
                potential_seqs.append(query_id)
    if len(potential_seqs) == 0:
        utils.cleanup(temp_files, temp_blast_db)
        print('There were no sequences that matched to VR')

    # Now read in the rawdata file and create a new data file with only the
    # sequences that aligned to the VR area +/- 100 bp.
    # Output: sequences_matched_to_vr_100_area now contains all rawdata reads
    # that potentially match VR.
    with open(sequences_matched_to_vr_100_area, 'w') as vr_100_writer:
        data = SeqIO.parse(rawdata_fasta, 'fasta')
        for sequence in data:
            if sequence.name == potential_seqs[0]:
                vr_100_writer.write('>%s\n%s\n' % (sequence.name, str(sequence.seq)))
                if len(potential_seqs) > 1:
                    potential_seqs.pop(0)
                else:
                    break

    print('Determining which candidate sequences match best to VR')
    # Go through the rawdata reads that potentially map to VR and determine
    # whether there is a better match somewhere else, using the entire ref genome.
    # First create a new blast database with the entire reference genome.
    entire_ref_genome_blast_database = '%s/ref_gen_blastdb' % (temp_folder)
    temp_blast_db.append(entire_ref_genome_blast_database)
    cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                       input_file=formatted_ref_genomefile,
                                       out=entire_ref_genome_blast_database)
    time.sleep(3)
    cline()

    # Next create files for output; the XML output needs to be parsed to
    # determine whether the best match is somewhere else.
    best_alignments_blastoutput = '%s/best_alignments_blastoutput.xml' % (temp_folder)
    temp_files.append(best_alignments_blastoutput)

    # Blast the candidate reads against the entire genome, using a more
    # stringent alignment. If a read doesn't match anywhere else, or matches
    # best to the VR region, it will be used for downstream analysis.
    # Input file: sequences_matched_to_vr_100_area
    # To do: find the optimum search settings for more stringency
    cline = NcbiblastnCommandline(out=best_alignments_blastoutput,
                                  db=entire_ref_genome_blast_database,
                                  query=sequences_matched_to_vr_100_area,
                                  outfmt=5,
                                  word_size=20,
                                  reward=1,
                                  penalty=-2,
                                  evalue=1e-4,
                                  gapopen=6,
                                  gapextend=2,
                                  perc_identity=80)
    cline()

    # Now parse the XML file and find the best match (or no match).
    # Output includes all transcripts that match VR (but some may also match
    # TR and thus need to be filtered).
    vr_tr_transcripts = '%s/vr_tr_transcripts.fasta' % (temp_folder)
    reads_that_matched_better_somewhere_else = (
        '%s/reads_that_matched_better_somewhere_else.fasta' % (temp_folder))
    temp_files.append(vr_tr_transcripts)
    # temp_files.append(reads_that_matched_better_somewhere_else)

    for VR_file, TR_file in zip(VRs, TRs):
        VR_start, VR_end, VR_contig_num, VR_seq = utils.extract_sequence(
            VR_file, formatted_ref_genomefile)
        TR_start, TR_end, TR_contig_num, TR_seq = utils.extract_sequence(
            TR_file, formatted_ref_genomefile)
        unique_name = '%s-Contig%s_%s_%s' % (reference_genome_name,
                                             VR_contig_num, VR_start, VR_end)
        print('Processing blast output for %s' % (unique_name))
        vr_tr_names = []
        with open(best_alignments_blastoutput, 'r') as blastoutput:
            parser = NCBIXML.parse(blastoutput)
            for result in parser:
                # For each sequence, find the best hit
                if len(result.alignments) > 0:
                    lowest = 1
                    best = [0, 0]
                    for anum, alignment in enumerate(result.alignments):
                        for hnum, hsp in enumerate(alignment.hsps):
                            if hsp.expect < lowest:
                                lowest = hsp.expect
                                best = [anum, hnum]
                    alignment = result.alignments[best[0]]
                    hsp = alignment.hsps[best[1]]
                    if alignment.hit_def == 'Contig%i' % (VR_contig_num):
                        within = False
                        if hsp.sbjct_start >= VR_start and hsp.sbjct_start <= VR_end:
                            within = True
                        if hsp.sbjct_end >= VR_start and hsp.sbjct_end <= VR_end:
                            within = True
                        if hsp.sbjct_start <= VR_start and hsp.sbjct_end >= VR_end:
                            within = True
                        if within:
                            vr_tr_names.append(result.query)
        if len(vr_tr_names) == 0:
            utils.cleanup(temp_files, temp_blast_db)
            print('There were no sequences that matched to VR')
        else:
            with open(vr_tr_transcripts, 'w') as vr_tr_writer:
                sequences = SeqIO.parse(sequences_matched_to_vr_100_area, 'fasta')
                for sequence in sequences:
                    if sequence.name == vr_tr_names[0]:
                        vr_tr_writer.write('>%s\n%s\n' %
                                           (sequence.name, str(sequence.seq)))
                        if len(vr_tr_names) > 1:
                            vr_tr_names.pop(0)
                        else:
                            break

        # Remove any potential TR sequences.
        # Extract the TR area +/- 100 bp, then see if any of the
        # tr_vr_sequences align perfectly to TR.
        # To do: allow for 1-2 mismatches
        sequences = SeqIO.parse(formatted_ref_genomefile, 'fasta')
        for sequence in sequences:
            if sequence.name == 'Contig%i' % (TR_contig_num):
                start = TR_start - 100
                if start < 0:
                    start = 0
                end = TR_end + 100
                if end > len(sequence.seq):
                    end = len(sequence.seq)
                tr_area = sequence.seq[start:end]
                tr_area_rev = tr_area.reverse_complement()

        vr_sequences = '%s/VR_sequences-%s.fasta' % (output_folder, unique_name)
        num_trs = 0
        with open(vr_sequences, 'w') as vr_writer:
            sequences = SeqIO.parse(vr_tr_transcripts, 'fasta')
            for sequence in sequences:
                if sequence.seq in tr_area or sequence.seq in tr_area_rev:
                    num_trs += 1
                else:
                    vr_writer.write('>%s\n%s\n' % (sequence.name, str(sequence.seq)))

        # Align VR sequences and orient the VR/TR pair
        aligned_VR_sequences = '%s/aligned_VR_sequences-%s.fa' % (output_folder,
                                                                  unique_name)
        vr_blast_database = '%s/vrblastdb' % (temp_folder)
        vr_blast_output = '%s/vr_aligning_blastout.xml' % (temp_folder)
        temp_blast_db.append(vr_blast_database)
        temp_files.append(vr_blast_output)
        cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                           input_file=VR_file,
                                           out=vr_blast_database)
        cline()
        cline = NcbiblastnCommandline(out=vr_blast_output,
                                      db=vr_blast_database,
                                      query=vr_sequences,
                                      outfmt=5,
                                      word_size=8,
                                      reward=1,
                                      penalty=-1,
                                      evalue=1e-4,
                                      gapopen=2,
                                      gapextend=1,
                                      perc_identity=50)
        cline()

        # Count A/T mismatches between TR and VR to decide the orientation
        amiss, tmiss = 0, 0
        for i in range(len(VR_seq)):
            if VR_seq[i] != TR_seq[i]:
                if TR_seq[i] == 'A':
                    amiss += 1
                elif TR_seq[i] == 'T':
                    tmiss += 1
        reverse = False
        if tmiss > amiss:
            reverse = True

        with open(vr_blast_output, 'r') as f:
            results = NCBIXML.parse(f)
            with open(aligned_VR_sequences, 'w') as aligned_VR_writer:
                vr_oriented = VR_seq
                if reverse:
                    vr_oriented = str(Seq(VR_seq).reverse_complement())
                aligned_VR_writer.write('>%s\n%s\n' % ('VR', vr_oriented))
                for result in results:
                    if len(result.alignments) > 0:
                        lowest = 1
                        best = [0, 0]
                        for anum, alignment in enumerate(result.alignments):
                            for hnum, hsp in enumerate(alignment.hsps):
                                if hsp.expect < lowest:
                                    lowest = hsp.expect
                                    best = [anum, hnum]
                        alignment = result.alignments[best[0]]
                        hsp = alignment.hsps[best[1]]
                        seq_out = ''
                        if hsp.sbjct_start < hsp.sbjct_end:
                            start = hsp.sbjct_start - 1
                        else:
                            start = hsp.sbjct_end - 1
                        for i in range(start):
                            seq_out += '-'
                        if hsp.sbjct_start < hsp.sbjct_end:
                            seq_out += hsp.query
                        else:
                            seq_out += str(Seq(hsp.query).reverse_complement())
                        end = len(VR_seq) - start - len(hsp.query)
                        for i in range(end):
                            seq_out += '-'
                        if reverse:
                            seq_out = str(Seq(seq_out).reverse_complement())
                        aligned_VR_writer.write('>%s\n%s\n' % (result.query, seq_out))

    utils.cleanup(temp_files, temp_blast_db)
    if len(errors) > 0:
        utils.print_errors(error_file, errors)
def generate_profiles(in_dataframe, out_path):
    """Generate PSSM profiles from a given set of sequences.

    Rather complicated and, quite honestly, ugly-looking function; intended
    to be used internally.
    """
    out_path = Path(out_path)
    dataset = in_dataframe
    s = Sultan()
    print('Unpacking and generating Uniprot DB.')
    s.gunzip('-fk ../data/swiss-prot/uniprot_sprot.fasta.gz').run()
    cmd = NcbimakeblastdbCommandline(
        input_file='../data/swiss-prot/uniprot_sprot.fasta', dbtype='prot')
    cmd()
    if not (out_path / 'profile').exists():
        s.mkdir(out_path / 'profile').run()
    with TemporaryDirectory() as psi_temp:
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            with NamedTemporaryFile(mode='w') as blast_in:
                if isinstance(sample.name, tuple):
                    sample_id, chain = sample.name[0], sample.name[1]
                    out_name = f'{sample_id}_{chain}'
                    dump_path = out_path / 'full_test_summary.joblib'
                else:
                    sample_id = sample.name
                    out_name = sample_id
                    dump_path = out_path / 'jpred_summary.joblib'
                sequence, structure = sample[['Sequence', 'Structure']]
                structure = ' ' + structure
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                blast_in.seek(0)
                cmd = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db='../data/swiss-prot/uniprot_sprot.fasta',
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=f'{psi_temp}/{out_name}.pssm',
                    num_descriptions=10000,
                    num_alignments=10000,
                    # out=f'{psi_temp}{out_name}.alns.blast',
                    num_threads=8)
                cmd()
                if not os.path.exists(os.path.join(psi_temp, out_name + '.pssm')):
                    tqdm.write(f'Unable to generate profile for {out_name}. '
                               'No hits in the database.')
                    dataset.drop(index=sample.name, inplace=True)
                    continue
                with open(f'{psi_temp}/{out_name}.pssm', 'r') as pssm_file:
                    # skip the two header lines of the PSSM file
                    pssm_file.readline()
                    pssm_file.readline()
                    profile = []
                    offset = False
                    position = 0
                    for line in pssm_file:
                        line = line.rstrip()
                        if not line:
                            break
                        line = line.split()
                        line.append(structure[position])
                        position += 1
                        if not offset:
                            for i in range(2):
                                line.insert(0, '')
                            offset = True
                        profile.append(line)
                profile = pd.DataFrame(profile)
                profile.drop((profile.columns[col] for col in range(2, 22)),
                             axis=1, inplace=True)
                profile.drop(profile.columns[-3:-1], axis=1, inplace=True)
                profile.drop(profile.columns[0], axis=1, inplace=True)
                profile.columns = profile.iloc[0]
                profile = profile[1:]
                profile.rename(columns={profile.columns[0]: "Sequence"},
                               inplace=True)
                profile.rename(columns={profile.columns[-1]: "Structure"},
                               inplace=True)
                profile = profile[['Structure'] +
                                  [col for col in profile.columns if col != 'Structure']]
                profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(float).divide(100)
                profile.to_csv(out_path / 'profile' / (out_name + '.profile'),
                               sep='\t', index=False)
    print(f'Dumping clean test to {dump_path}. '
          f'Profiles are generated in {out_path}/profile')
    dump(dataset, dump_path)
def iden_main(args):
    if args.circular:
        probe_generate = probe_generate_c
    else:
        probe_generate = probe_generate_l
    seq_dict = {_.id: _ for _ in SeqIO.parse(args.database, 'fasta')}
    _table = pd.read_table(args.meta,
                           names=['group', 'sample', 'assembly'],
                           dtype=str)
    # by group, for probe generation
    _group_table = _table.groupby(['group'])['assembly'] \
        .apply(list).reset_index(name='assembly')
    # by sample, for pre-BLAST
    _group_sample_table = _table.groupby(['group', 'sample'])['assembly'] \
        .apply(list).reset_index(name='assembly')
    for _idx, _row in _group_table.iterrows():
        os.mkdir(args.tmp)
        # pre-probe
        _super_probe = defaultdict(str)
        for _asm in _row['assembly']:
            for _k, _v in probe_generate(seq_dict[_asm], args.length).items():
                _super_probe[_k] = _v
        _probe_lines = ['>' + seq_id + '\n' + seq
                        for seq, seq_id in _super_probe.items()]
        with open(os.path.join(args.tmp, 'pre_probe.fasta'), 'w') as f:
            f.write('\n'.join(_probe_lines))
        del _super_probe, _probe_lines
        # pre-blast
        # make the blast database
        _db_lines = ['>' + seq_id + '\n' + str(seq_dict[seq_id].seq)
                     for seq_id in _row['assembly']]
        with open(os.path.join(args.tmp, 'pre_blast_db.fasta'), 'w') as f:
            f.write('\n'.join(_db_lines))
        del _db_lines
        _database_cmd = NcbimakeblastdbCommandline(
            cmd=os.path.join(args.blast, 'makeblastdb'),
            dbtype='nucl',
            input_file=os.path.join(args.tmp, 'pre_blast_db.fasta'))
        _database_cmd()
        # blast
        _blastn_cmd = NcbiblastnCommandline(
            cmd=os.path.join(args.blast, 'blastn'),
            query=os.path.join(args.tmp, 'pre_probe.fasta'),
            db=os.path.join(args.tmp, 'pre_blast_db.fasta'),
            task='blastn-short',
            dust='no',
            word_size=min(round(args.length / 2) - 1, 11),
            outfmt='\"6 qacc sacc length pident evalue\"',
            num_threads=args.threads,
            evalue=10,
            out=os.path.join(args.tmp, 'pre_blast.txt'),
            max_hsps=1,
            max_target_seqs=len(_row['assembly']))
        _blastn_cmd()
        # parse the result
        _blast_df = pd.read_table(
            os.path.join(args.tmp, 'pre_blast.txt'),
            names=['query', 'subject', 'length', 'identity', 'evalue'])
        _blast_df = _blast_df[(_blast_df['length'].values == args.length)
                              & (_blast_df['identity'].values == 100)]
        _check_list = _group_sample_table.loc[
            _group_sample_table['group'].values == _row['group'], 'assembly'].to_list()
        _probe_list = []
        for _sample in _check_list:
            _probe_list.append(set(
                _blast_df.loc[_blast_df['subject'].map(lambda x: x in _sample)]
                ['query'].to_list()))
        # keep only the probes found in every sample of the group
        _probe_result = list(reduce(lambda x, y: x & y, _probe_list))
        del _probe_list
        _tmp_dict = {_.id: _ for _ in SeqIO.parse(
            os.path.join(args.tmp, 'pre_probe.fasta'), 'fasta')}
        _probe_dict = {}
        for _id in _probe_result:
            _probe_dict.setdefault(_id, _tmp_dict[_id])
        SeqIO.write(list(_probe_dict.values()),
                    os.path.join(args.tmp, 'probe.fasta'), 'fasta')
        del _tmp_dict, _probe_dict
        # BLAST
        ap = BLASTParse(os.path.join(args.tmp, 'probe.fasta'), _row['assembly'],
                        args.database, args.tmp, args.length, args.threads,
                        args.blast)
        ap.blast_cmd()
        _result_tb = ap.blast_parse()
        _result_tb['group'] = _row['group']
        _result_tb['assembly'] = _row['assembly'][0]
        _result_tb[['group', 'assembly', 'seq', 'position', 'GC']].to_csv(
            os.path.join(args.output, _row['group'] + '.txt'),
            sep='\t', index=False)
        del_dir(args.tmp)
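# A hedged sketch of the argparse namespace iden_main() expects. Every field
# name below is inferred from the function body, and all defaults and paths
# are hypothetical illustrations, not the tool's documented interface.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--circular', action='store_true')    # circular assemblies
parser.add_argument('--database', default='assemblies.fasta')
parser.add_argument('--meta', default='meta.tsv')         # group/sample/assembly table
parser.add_argument('--length', type=int, default=40)     # probe length
parser.add_argument('--tmp', default='tmp_iden')          # scratch dir, removed per group
parser.add_argument('--blast', default='/usr/local/bin')  # dir holding makeblastdb/blastn
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--output', default='results')
iden_main(parser.parse_args())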