Code example #1
 def _make_db(self):
     os.mkdir(self.tmp)
     copy(self.genome, os.path.join(self.tmp, 'genome.fa'))
     self.genome = os.path.join(self.tmp, 'genome.fa')
     cline = NcbimakeblastdbCommandline(input_file=self.genome,
                                        dbtype='nucl')
     cline()
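Calling a Bio.Blast.Applications command object returns a (stdout, stderr) tuple, which the snippet above discards. A minimal error-checking sketch, reusing the `cline` built above (treating any stderr output as fatal is an assumption; makeblastdb can also emit non-fatal warnings there):

stdout, stderr = cline()  # the wrapper returns (stdout, stderr) as strings
if stderr:
    # assumption: any stderr output means the database build failed
    raise RuntimeError('makeblastdb failed: ' + stderr)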
Code example #2
File: blast.py  Project: NICHD-BSPC/httools
def blast_func(samplecfg):
    blastlog = ['BLAST']
    # create the index if it does not exist
    if not (os.path.exists(samplecfg.genome + '.nin')
            and os.path.exists(samplecfg.genome + '.nhr')
            and os.path.exists(samplecfg.genome + '.nsq')):
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            dbtype='nucl',
                                            input_file=samplecfg.genome)
        stdout, stderr = makedb()
        blastlog.append('\n\nNcbimakeblastdb\n\n')
        blastlog.append(stdout)
        blastlog.append(stderr)
    # blast
    sampleblast = NcbiblastnCommandline(
        task='blastn',
        query=samplecfg.fasta,
        db=samplecfg.genome,
        outfmt=samplecfg.view,
        evalue=samplecfg.evalue,
        out=mydir + '/blast_' + samplecfg.sample + '.' + samplecfg.suffix +
        '.txt')
    stdout, stderr = sampleblast()
    blastlog.append('\n\nNcbiblastn\n\n')
    blastlog.append(stdout)
    blastlog.append(stderr)
    return blastlog
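The existence check above covers the classic .nin/.nhr/.nsq index triplet. A small helper makes it reusable; a sketch assuming v4-style nucleotide databases (v5 databases from newer BLAST+ releases write additional files such as .ndb, so the default extension list is an assumption to adjust):

import os

def blast_index_exists(genome, exts=('.nin', '.nhr', '.nsq')):
    """Return True if every expected index file sits next to `genome`."""
    return all(os.path.exists(genome + ext) for ext in exts)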
Code example #3
def blastdb(db_file):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.9.0+/bin/makeblastdb',
        dbtype='nucl',
        input_file=db_file
    )
    make_db_cmd()
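A hedged variant of the same function that resolves the binary from PATH instead of hard-coding the WSL install location (`blastdb_from_path` is a hypothetical name; shutil.which returns None when the tool is missing):

import shutil
from Bio.Blast.Applications import NcbimakeblastdbCommandline

def blastdb_from_path(db_file):
    exe = shutil.which('makeblastdb')
    if exe is None:
        raise FileNotFoundError('makeblastdb not found on PATH')
    NcbimakeblastdbCommandline(cmd=exe, dbtype='nucl', input_file=db_file)()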
Code example #4
File: blast.py  Project: KaKazuki/Leaves
def create_db(db_name: str) -> bool:
    blastsets = {
        'TAIR10_Whole_Genome': 'TAIR10_bac_con_20101028',
        'TAIR10_CDS': 'TAIR10_cds_20101214_updated',
        'TAIR10_Genes': 'TAIR10_cdna_20110103_representative_gene_model_updated'
    }
    # create Database directory
    if not os.path.isdir(db_file_path):
        os.makedirs(db_file_path)
        logger.info('Now, Database dir has been created!')
    # create blastsets directory
    if not os.path.isdir(ref_path):
        os.makedirs(ref_path)
        logger.info('Now, blastsets dir has been created!')
    if db_name not in blastsets:
        logger.error('Entered a database that is not registered.')
        raise ValueError(f'Unknown database: {db_name}')
    source = blastsets[db_name]
    source_path = os.path.join(ref_path, source)
    if not os.path.exists(source_path):
        subprocess.run(['curl', '-O',
                        'ftp://ftp.arabidopsis.org/home/tair/Sequences/blast_datasets/TAIR10_blastsets/' + source],
                       cwd=ref_path)
        logger.info('The source download is finished.')
    cline = NcbimakeblastdbCommandline(input_file=source_path, dbtype='nucl',
                                       parse_seqids=True, out=os.path.join(db_file_path, db_name))
    stdout, stderr = cline()
    if stderr:
        logger.debug(stderr)
        return False
    logger.debug(stdout)
    return True
Code example #5
 def _reverse_blast_iden(self, threads):
     os.symlink(os.path.abspath(self.ref),
                os.path.join(self.tmp_dir, 'ref.fasta'))
     makeblastdb_cline = NcbimakeblastdbCommandline(dbtype='prot',
                                                    input_file=os.path.join(
                                                        self.tmp_dir,
                                                        'ref.fasta'))
     blastp_cline = NcbiblastpCommandline(
         query=os.path.join(self.tmp_dir, 'putative.fasta'),
         db=os.path.join(self.tmp_dir, 'ref.fasta'),
         evalue='1e-3',
         outfmt="6 qacc sacc qlen slen length pident evalue",
         max_hsps=1,
         num_threads=threads,
         out=os.path.join(self.tmp_dir, 'reverse_blast.tbl'))
     makeblastdb_cline()
     blastp_cline()
     blast_result = pd.read_table(os.path.join(self.tmp_dir,
                                               'reverse_blast.tbl'),
                                  header=None,
                                  names=[
                                      'qacc', 'sacc', 'qlen', 'slen',
                                      'length', 'pident', 'evalue'
                                  ])
     seq_idx_lst = [
         _[0] for _ in enumerate(blast_result['sacc'].to_list())
         if _[1] in self.seed
     ]
     _tmp_lst = blast_result['qacc'].to_list()
     seq_lst = [_tmp_lst[_] for _ in seq_idx_lst]
     return set(seq_lst), blast_result[blast_result['qacc'].isin(seq_lst)]
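The enumerate-based seed matching at the end of the method has a more direct pandas equivalent; a drop-in sketch for those last lines (same `blast_result` and `self.seed` as above):

# rows whose subject accession is one of the seed proteins
matched = blast_result[blast_result['sacc'].isin(self.seed)]
seq_lst = matched['qacc'].to_list()
# every hit whose query accession survived the seed filter
return set(seq_lst), blast_result[blast_result['qacc'].isin(seq_lst)]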
Code example #6
    def create(self) -> tuple:
        """Function for making local blast database.

        This function creates database from files found in source_dir.

        Returns:
            tuple(Database, str): Database object, output from makeblastdb.
        Creates:
            (*.nhr, *.nin, *.nsq): Created database's files in LMBD format.
        Raises:
            SubprocessError: When makeblastdb returns error or when input file does not exist.
        """

        self._aggregate(self.source_dir, Path("blast_input.fasta"))
        try:
            cmd = NcbimakeblastdbCommandline(input_file="blast_input.fasta",
                                             dbtype="nucl",
                                             title=self.name,
                                             out=self.name)
            makeblastdb_output = subprocess.run(str(cmd),
                                                capture_output=True,
                                                shell=True)
            if makeblastdb_output.stderr:
                raise subprocess.SubprocessError(
                    f"Makeblastdb returned error: {makeblastdb_output.stderr.decode()}"
                )
        finally:
            # Clean up the temporary FASTA whether or not makeblastdb
            # succeeded; the return sits after the block because a return
            # inside `finally` would swallow the exception raised above.
            if Path("blast_input.fasta").exists():
                Path("blast_input.fasta").unlink()
        return self, makeblastdb_output.stdout.decode()
Code example #7
 def blast_iden(self, threads=2):
     shutil.copyfile(self.db, os.path.join(self.tmp_dir, 'database.fasta'))
     makeblastdb_cline = NcbimakeblastdbCommandline(dbtype='prot',
                                                    input_file=os.path.join(
                                                        self.tmp_dir,
                                                        'database.fasta'))
     blastp_cline = NcbiblastpCommandline(
         query=self.seed,
         db=os.path.join(self.tmp_dir, 'database.fasta'),
         evalue='1e-5',
         outfmt="6 qacc sacc qlen slen length pident evalue",
         max_hsps=1,
         num_threads=threads,
         out=os.path.join(self.tmp_dir, 'blast.tbl'))
     makeblastdb_cline()
     blastp_cline()
     blast_result = pd.read_table(os.path.join(self.tmp_dir, 'blast.tbl'),
                                  header=None,
                                  names=[
                                      'qacc', 'sacc', 'qlen', 'slen',
                                      'length', 'pident', 'evalue'
                                  ])
     blast_result = blast_result[
         (blast_result['pident'] > 50)
         & (blast_result['length'] / blast_result['slen'] > 0.5) &
         (blast_result['length'] / blast_result['qlen'] > 0.5)]
     blast_result.to_csv(os.path.join(self.tmp_dir, 'blast2.tbl'),
                         sep='\t',
                         index=False)
     seq_list = [
         _ for _ in SeqIO.parse(self.db, 'fasta')
         if _.id in blast_result['sacc'].to_list()
     ]
     SeqIO.write(seq_list, os.path.join(self.tmp_dir, 'subgenes.fasta'),
                 'fasta')
Code example #8
 def database_blast(self):
     database_cmd = NcbimakeblastdbCommandline(
         cmd=os.path.join(self.exec, 'makeblastdb'),
         dbtype='nucl',
         input_file=self.out_path)
     database_cmd()
     print('IdenDSS database created successfully!')
Code example #9
File: trna.py  Project: 777moneymaker/jasper
    def create(self, input_file: Path):
        """Function for making local blast database.

        This function creates database from tRNAs retrieved from tRNAscan-SE.

        Args:
            input_file (Path): Path to file containing DB sequences.
        Returns:
            str: Database output.
        Creates:
            (*.nhr, *.nin, *.nsq): Created database's files in LMBD format.
        Raises:
            SubprocessError: When makeblastdb returns error or when input file does not exist.
        """

        try:
            cmd = NcbimakeblastdbCommandline(input_file=str(input_file),
                                             dbtype="nucl",
                                             title=self.name,
                                             out=self.name)
            # Run once via subprocess so stdout/stderr are captured;
            # calling cmd() here as well would build the database twice.
            makeblastdb_output = subprocess.run(str(cmd),
                                                capture_output=True,
                                                shell=True)
            if makeblastdb_output.stderr:
                raise subprocess.SubprocessError(
                    f"Makeblastdb returned error: {makeblastdb_output.stderr.decode()}"
                )
        finally:
            # Remove the source FASTA whether or not makeblastdb succeeded.
            if input_file.exists():
                input_file.unlink()
        return makeblastdb_output.stdout.decode()
Code example #10
def blast():
    with open(concat_exons) as concatenated:
        concatenated_exons = SeqIO.to_dict(
            SeqIO.parse(concatenated, 'fasta', generic_dna))
    with open(f"{concat_exons.split('.')[0]}_names_corrected.fas",
              'w') as corrected:
        for key in concatenated_exons.keys():
            corrected.write(
                f">{key.split('-')[1]}-{key.split('-')[3]}\n{str(concatenated_exons[key].seq)}\n"
            )
    print('Building database for %s...' % concat_exons)
    NcbimakeblastdbCommandline(
        dbtype='nucl',
        input_file=f"{concat_exons.split('.')[0]}_names_corrected.fas",
        out=concat_exons,
        parse_seqids=True)()
    print('Done')
    print(f'Blasting {probes} against {concat_exons}')
    NcbiblastnCommandline(
        task=blast_task,
        query=probes,
        db=concat_exons,
        out=f'{probes}_against_{concat_exons}.txt',
        outfmt="6 qaccver saccver pident qcovhsp evalue bitscore",
        num_threads=4)()
    print('Done')
Code example #11
def blastdb(in_file, db_file):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='makeblastdb',
        dbtype='nucl',
        input_file=in_file,
        out=db_file
    )
    make_db_cmd()
Code example #12
def blastdb(species_id_path):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/makeblastdb',
        dbtype='nucl',
        dbtype='nucl',
        input_file=species_id_path,
        out=str(species_db_dir / species_id_path.stem))
    make_db_cmd()
Code example #13
def make_blastdb(seqs_file):
    """Make a BLAST database from a protein FASTA file.

    Args:
        seqs_file (str): protein sequence FASTA file path.
    """
    makeblastdb = NcbimakeblastdbCommandline(dbtype="prot",
                                             input_file=seqs_file)
    out, err = makeblastdb()
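A hypothetical invocation ('proteins.fasta' is a placeholder path). Because no `out` parameter is given, makeblastdb writes the database next to the input:

make_blastdb('proteins.fasta')
# expected side effect: proteins.fasta.phr / .pin / .psq appear alongside
# the input (newer v5 BLAST+ releases create additional files)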
Code example #14
def make_blast_database(in_filename: str, db_filename: str):
    """
    Make local BLAST database from given file.
    """
    NcbimakeblastdbCommandline(input_file=in_filename,
                               parse_seqids=True,
                               title='e_coli_genome',
                               dbtype='nucl',
                               out=db_filename)()
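A hypothetical call (both filenames are placeholders); downstream searches then point at the database prefix rather than at the FASTA:

make_blast_database('e_coli_genome.fasta', 'e_coli_db')
# e.g. NcbiblastnCommandline(query='reads.fasta', db='e_coli_db', outfmt=6)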
Code example #15
File: blast.py  Project: UtrechtUniversity/microbiome
def create_blast_db(fa_file_path=None, dbtype="nucl"):
    """Creates a new blast db

    As input takes the path of the collection of *.fa files.
    """
    print("Creating the blast DB...")

    create = NcbimakeblastdbCommandline(input_file=fa_file_path, dbtype=dbtype)

    create()

    print(f"Blast DB created at {fa_file_path}.")
Code example #16
def blast(query, database, dbtype, title, evalue, outfmt, out):
    # build the BLAST database
    cline = NcbimakeblastdbCommandline(dbtype=dbtype,
                                       input_file=database,
                                       title=title)
    cline()
    # run tblastn against that database
    tblastn_cline = NcbitblastnCommandline(query=query,
                                           db=database,
                                           evalue=evalue,
                                           outfmt=outfmt,
                                           out=out)
    tblastn_cline()
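With seven positional parameters, a worked call helps keep the order straight; every value below is a placeholder:

blast(query='proteins.fasta',    # protein queries for tblastn
      database='genome.fasta',   # nucleotide FASTA; also used as the DB name
      dbtype='nucl',
      title='my_genome_db',
      evalue=1e-5,
      outfmt=6,                  # tabular output
      out='tblastn_hits.tsv')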
Code example #17
File: plugin_utils.py  Project: Hua-CM/IdenSSR
 def blast_pair(self):
     print('Screen SSR start')
     # make a temporary directory for BLAST
     os.mkdir(self._tmpdir)
     sequences = SeqIO.to_dict(SeqIO.parse(self._seq_path, 'fasta'))
     lengths = {_id: len(_seq.seq) for _id, _seq in sequences.items()}
     ssr_info = pd.read_table(self._ssr_info)
     tmp_lst = []
     for _ in self._assembly.values():
         tmp_lst += _
     ssr_info = ssr_info[ssr_info['seqid'].isin(tmp_lst)]
     del tmp_lst
     if not self._circular:
         ssr_info = ssr_info[ssr_info.apply(
             lambda x:
             (x['start'] > 200) & (x['end'] < lengths[x['seqid']] - 200),
             axis=1)]
     # prepare sequences
     print('(1/3) prepare sequence for BLAST')
     seqlist = []
     for _idx, _item in ssr_info.iterrows():
         genome_seq = sequences[_item['seqid']]
         _sequence_id = str(genome_seq.id) + "_" + str(
             _item['start']) + "_" + str(_item['end'])
         _sequence_template = str(
             get_seq(genome_seq, _item['start'], _item['end']))
         seqlist.append('>' + _sequence_id)
         seqlist.append(_sequence_template)
     with open(os.path.join(self._tmpdir, 'query.fasta'), 'w') as f:
         f.write('\n'.join(seqlist))
         f.write('\n')
     # BLAST
     print('(2/3) BLAST start')
     database_cmd = NcbimakeblastdbCommandline(dbtype='nucl',
                                               input_file=os.path.join(
                                                   self._tmpdir,
                                                   'query.fasta'))
     blastn_cmd = NcbiblastnCommandline(
         query=os.path.join(self._tmpdir, 'query.fasta'),
         db=os.path.join(self._tmpdir, 'query.fasta'),
         dust='no',
         outfmt='\"6 qacc sacc length pident evalue\"',
         num_threads=self._threads,
         evalue='1e-3',
         out=os.path.join(self._tmpdir, 'blast_result.txt'),
         max_hsps=1)
     database_cmd()
     blastn_cmd()
     print('(2/3) BLAST Done')
Code example #18
    def run(self):
        num_cases = len(list(SeqIO.parse(self.fasta, 'fasta')))

        # Make Database
        clinedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                             dbtype='prot',
                                             input_file=self.fasta,
                                             input_type='fasta',
                                             out=self.fasta)
        clinedb()

        # Calculation
        clinec = NcbiblastpCommandline(
            cmd='blastp',
            query=self.fasta,
            db=self.fasta,
            evalue=10,
            outfmt='6 qseqid sseqid pident evalue bitscore score',
            max_target_seqs=num_cases,
            max_hsps=1,
            num_threads=self.cpucount,
            out='all_vs_all.tsv')
        clinec()

        # Data Processing
        data = pd.read_csv(
            'all_vs_all.tsv',
            delimiter='\t',
            names=['seq1', 'seq2', 'pident', 'evalue', 'bitscore', 'score'])
        no_dups = self.clean_duplicates(data)
        pivoted = no_dups.pivot(index='seq1',
                                columns='seq2',
                                values=self.type[self.matrix_type])

        for column in pivoted.columns:
            pos = pivoted.index.get_loc(column) + 1
            for index in pivoted.index[pos:]:
                if column != index:
                    pivoted.loc[index, column] = pivoted.loc[column, index]

        os.remove('all_vs_all.tsv')
        os.remove('{}.phr'.format(self.fasta))
        os.remove('{}.pin'.format(self.fasta))
        os.remove('{}.psq'.format(self.fasta))

        pivoted_round = pivoted.round(2)

        self.onfinished.emit(pivoted_round)
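The nested loop that mirrors the upper triangle into the lower one has a one-line pandas equivalent, assuming clean_duplicates() leaves the lower-triangle cells as NaN:

# copy score[i, j] into score[j, i] wherever only one direction was kept
pivoted = pivoted.combine_first(pivoted.T)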
Code example #19
def write_output(protein_records, cache_dir, args):
    """Write out the extract protein sequences to the specified output(s)
    
    :param protein_records: list of SeqRecords
    :param cache_dir: Path, cache directory
    :param args: cmd-line args parser
    
    Return nothing.
    """
    logger = logging.getLogger(__name__)

    if args.fasta_file:
        SeqIO.write(protein_records, args.fasta_file, "fasta")

    if args.fasta_dir:
        for record in protein_records:
            accession = record.id
            target_path = args.fasta_dir / f'{accession}.fasta'

            SeqIO.write([record], target_path, "fasta")

    if args.blastdb:
        fasta_name = args.blastdb / 'blastdb.fasta'
        SeqIO.write(protein_records, fasta_name, "fasta")

        cmd_makedb = NcbimakeblastdbCommandline(
            cmd='makeblastdb',
            dbtype='prot',
            input_file=fasta_name,
        )
        stdout, stderr = cmd_makedb()

        # check the command was successfully executed
        if len(stderr) != 0:
            logger.warning(
                f"Could not build non-CAZyme db.\nstdout={stdout}\nstderr={stderr}"
            )

    cache_path = cache_dir / 'extracted_sequences.txt'
    with open(cache_path, 'a') as fh:
        for record in protein_records:
            fh.write(f"{record.id}\n")

    return
Code example #20
def run_blast(query_file, species_id_path, species_out_path):
    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/makeblastdb',
        dbtype='nucl',
        input_file=species_id_path,
        out=str(species_out_path/"blastdb"/species_id_path.stem)
    )
    blast_cmd = NcbiblastnCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn',
        query=query_file,
        db=species_out_path/"blastdb"/species_id_path.stem,
        outfmt=11,
        out=species_out_path/"asn"/(species_id_path.stem+".asn")
        # perc_identity=95
    )
    blast_xml_cmd = NcbiblastformatterCommandline(
        archive=species_out_path/"asn"/(species_id_path.stem+".asn"),
        outfmt=5,
        out=species_out_path/"xml"/(species_id_path.stem+".xml"),
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.9.0+/bin/blast_formatter'.replace('2.9.0', '2.10.1'),
    )
    blast_txt_cmd = NcbiblastformatterCommandline(
        archive=species_out_path/"asn"/(species_id_path.stem+".asn"),
        outfmt=7,
        out=species_out_path/"txt"/(species_id_path.stem+".txt"),
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'
    )
    db_file=species_out_path/"blastdb"/(species_id_path.stem+".ndb")
    if not (species_out_path/"xml"/(species_id_path.stem+".xml")).exists():
        if not db_file.exists():
            make_db_cmd()
        try:
            blast_cmd()
        except ApplicationError:
            print(blast_cmd)
        try:
            blast_txt_cmd()
        except ApplicationError:
            print(blast_txt_cmd)
        try:
            blast_xml_cmd()
        except ApplicationError:
            print(blast_xml_cmd)
Code example #21
    def _forward_blast_iden(self, threads):
        all_proteins = SeqIO.to_dict(SeqIO.parse(self.ref, 'fasta'))
        if self.seed:
            seed_proteins = [all_proteins.get(_) for _ in self.seed]
        else:  # do not need accessions, the ref fasta file is the ref sequences
            seed_proteins = all_proteins
        SeqIO.write(seed_proteins, os.path.join(self.tmp_dir, 'seed.fasta'),
                    'fasta')
        del all_proteins, seed_proteins
        os.symlink(os.path.abspath(self.query),
                   os.path.join(self.tmp_dir, 'query.fasta'))
        makeblastdb_cline = NcbimakeblastdbCommandline(dbtype='prot',
                                                       input_file=os.path.join(
                                                           self.tmp_dir,
                                                           'query.fasta'))

        blastp_cline = NcbiblastpCommandline(
            query=os.path.join(self.tmp_dir, 'seed.fasta'),
            db=os.path.join(self.tmp_dir, 'query.fasta'),
            evalue='1e-3',
            outfmt="6 qacc sacc qlen slen length pident evalue",
            max_hsps=1,
            num_threads=threads,
            out=os.path.join(self.tmp_dir, 'blast.tbl'))
        makeblastdb_cline()
        blastp_cline()
        blast_result = pd.read_table(os.path.join(self.tmp_dir, 'blast.tbl'),
                                     header=None,
                                     names=[
                                         'qacc', 'sacc', 'qlen', 'slen',
                                         'length', 'pident', 'evalue'
                                     ])
        seq_list = [
            _ for _ in SeqIO.parse(self.query, 'fasta')
            if _.id in blast_result['sacc'].to_list()
        ]
        SeqIO.write(seq_list, os.path.join(self.tmp_dir, 'putative.fasta'),
                    'fasta')
Code example #22
File: plugin_utils.py  Project: Hua-CM/IdenSSR
 def blast_self(self):
     os.mkdir(self._tmpdir)
     # prepare fasta
     seq_list = []
     for _idx, _row in self._primer_info.iterrows():
         try:
             seq_list.append(
                 SeqRecord(Seq(_row['Forward']),
                           id=_row['ID'] + '_F',
                           name='',
                           description=''))
             seq_list.append(
                 SeqRecord(Seq(_row['Reverse']),
                           id=_row['ID'] + '_R',
                           name='',
                           description=''))
         except TypeError:
             continue
     SeqIO.write(seq_list, os.path.join(self._tmpdir, 'query.fasta'),
                 'fasta')
     # BLAST
     database_cmd = NcbimakeblastdbCommandline(dbtype='nucl',
                                               input_file=self._seq_path)
     blastn_cmd = NcbiblastnCommandline(
         query=os.path.join(self._tmpdir, 'query.fasta'),
         db=self._seq_path,
         task='blastn-short',
         outfmt=5,
         num_threads=self._threads,
         evalue='10',
         out=os.path.join(self._tmpdir, 'blast_result.xml'),
         max_hsps=1,
         max_target_seqs=2)
     try:
         database_cmd()
         blastn_cmd()
     except IOError:
         print('please check your file and/or its permissions')
Code example #23
def create_hit_tables(fasta_file, probe_exons, n_cpu, length_cover, log_file):
    """Running blast on every fasta file with contigs. Probe file is blasted against contigs. Blast results are saved in
    text file."""
    logger = create_logger(log_file)
    path = os.path.dirname(fasta_file)
    fasta_file: str = os.path.basename(fasta_file)
    sample: str = Path(fasta_file).stem
    NcbimakeblastdbCommandline(
        dbtype="nucl",
        input_file=os.path.join(path, fasta_file),
        out=os.path.join(path, sample),
        parse_seqids=True,
    )()
    logger.info(f"\t\tCreating hit table for {sample}. Running BLAST...")
    NcbiblastnCommandline(
        task="blastn",
        query=probe_exons,
        db=os.path.join(path, sample),
        out=os.path.join(path, f"reference_in_{sample}_contigs.txt"),
        qcov_hsp_perc=length_cover,
        outfmt="6 qaccver saccver pident qcovhsp evalue bitscore sstart send",
    )()
    logger.info(f"\t\tHit table for {sample} is ready")
Code example #24
fh_perc_id_out = open('perc_id.txt', 'a')
header = "\t".join(["OG_group", "ORF_id", "Gene_id", "Perc_id", "Closest_SDP"])
fh_perc_id_out.write(header + '\n')
count_aln_results = 0
count_species_recruits = dict()
for orf_file in orf_files:
    print('Processing orf-file: ', orf_file)
    filename_split = orf_file.split('_')
    OG = filename_split[0]
    core_aln_file = OG + '_aln_nuc.fasta'
    core_aln_file_fullname = core_aln_dir + '/' + core_aln_file
    core_ffn_file = OG + '.ffn'
    core_ffn_file_full = core_aln_dir + '/' + core_ffn_file
    #copy core ffn-file as temporary file to SDP-dir, makeblastdb
    copyfile(core_ffn_file_full, "temp.ffn")
    makeblastdb_cmd = NcbimakeblastdbCommandline(dbtype="nucl",
                                                 input_file="temp.ffn")
    makeblastdb_cmd()
    #Loop over orfs in current orf-file. Write each orf to temporary fasta-file, blast against the core-seqs and get species-affiliation for first blast-hit.
    for seq_record in SeqIO.parse(orf_file, "fasta"):
        SeqIO.write(seq_record, "temp_orf.ffn", "fasta")
        blast_result = get_best_blast_hit('temp_orf.ffn', 'temp.ffn')
        if blast_result is None:
            continue
        #Add  orf to core alignment with muscle and get max perc-id
        aln_result = add_orf_perc_id(core_aln_file_fullname, 'temp_orf.ffn')
        if (aln_result[2] != 0):
            orf_max_perc_id = round(aln_result[2], 2)
            orf_id = aln_result[0]
            best_hit_to_species_gene_id = aln_result[1]
            best_species = "other"
            if blast_result in species_dict:
                best_species = species_dict[blast_result]
Code example #25
            shutil.copyfileobj(f_in, f_out)


if not exists(result):
    if not exists(name_subject):
        print(name_subject, 'file does not exist in path, downloading...')
        file_hpv = wget.download(url_hpv)

    if not exists(name_query):
        print(name_query, 'file does not exist in path, downloading...')
        file_cov = wget.download(url_cov)

    unzip(name_subject)
    unzip(name_query)
    subject_cline = NcbimakeblastdbCommandline(cmd=makeblast,
                                               dbtype="prot",
                                               input_file=subject,
                                               out=subject_out)
    query_cline = NcbimakeblastdbCommandline(cmd=makeblast,
                                             dbtype="prot",
                                             input_file=query,
                                             out=query_out)
    print(subject_cline)
    s_stdout, s_stderr = subject_cline()
    q_stdout, q_stderr = query_cline()

    # cov2 = SeqIO.parse(gzip.open(name_query, mode), format=format)
    # hpv = SeqIO.parse(gzip.open(name_subject, mode), format=format)
    result_cline = NcbiblastpCommandline(cmd=blast,
                                         query=query,
                                         db=subject_out,
                                         out=result,
Code example #26
def validateFullLengthSequencesUsingBlast(referenceSequences=None,
                                          fullLengthSequences=None,
                                          outputDirectory=None,
                                          threadCount=1,
                                          batchSize=50,
                                          verbose=False,
                                          delimiter='\t',
                                          keepBlastFiles=False):
    # TODO: Blast is hopefully faster than pairwise alignments.
    #  But it's only doing local alignments.
    print('Validating ' + str(len(fullLengthSequences)) +
          ' sequences against ' + str(len(referenceSequences)) +
          ' Reference Sequences using Blast Alignments (threads=' +
          str(threadCount) + ')')
    # Start a thread pool
    queryBatches = []
    batchResults = []

    pool = multiprocessing.Pool(threadCount)
    before = currentMillis()

    # Create Blast Reference
    blastDirectory = join(outputDirectory, 'blast_results')
    if (not isdir(blastDirectory)):
        makedirs(blastDirectory)
    referenceFileName = join(blastDirectory, 'BlastReference.fasta')
    printSequences(alleleSequences=referenceSequences,
                   outputFilename=referenceFileName,
                   verbose=verbose)
    cline = NcbimakeblastdbCommandline(dbtype="nucl",
                                       input_file=referenceFileName)
    stdout, stderr = cline()
    if (verbose):
        print('MakeDB Commandline:\n' + str(cline))
        print('Output:' + str(stdout))
        print('Errors?:' + str(stderr))

    # Split Query Sequences into Batches
    if (verbose):
        print('Splitting ' + str(len(fullLengthSequences)) +
              ' sequences into batches of size ' + str(batchSize))
    newBatch = []
    for sequenceIndex, sequence in enumerate(fullLengthSequences):
        newBatch.append(sequence)
        if (len(newBatch) >= batchSize
                or sequenceIndex == len(fullLengthSequences) - 1):
            # Done with this batch. Write it to file
            batchFileName = join(
                blastDirectory,
                'Batch' + str(len(queryBatches)) + 'Sequences.fasta')
            printSequences(alleleSequences=newBatch,
                           outputFilename=batchFileName,
                           verbose=verbose)

            queryBatches.append(newBatch)
            newBatch = []
    if (verbose):
        print('Found ' + str(len(queryBatches)) + ' batches of size <= ' +
              str(batchSize))

    # For each Batch
    for batchIndex, batch in enumerate(queryBatches):
        batchFileName = join(blastDirectory,
                             'Batch' + str(batchIndex) + 'Sequences.fasta')
        # Start thread to run blast against references
        if (threadCount > 1):
            batchResults.append(
                pool.starmap_async(
                    findBestReferenceSequence,
                    [[referenceFileName, batchFileName, verbose]]))
        else:
            batchResults.append(
                findBestReferenceSequence(referenceFileName=referenceFileName,
                                          batchFileName=batchFileName,
                                          verbose=verbose))

    pool.close()
    pool.join()

    # Delete blast output files
    if (not keepBlastFiles):
        cleanupBlastOutputFiles(blastDirectory=blastDirectory,
                                referenceFileName=referenceFileName,
                                queryBatches=queryBatches)

    # Create output file
    sequenceValidationResultsFile = open(
        join(outputDirectory, 'ReferenceFinderValidationResults.csv'), 'w')
    sequenceValidationResultsFile.write('Query_Name' + delimiter +
                                        'Best_Reference' + delimiter +
                                        'Alignment_Score\n')

    # Each batch result should be a dictionary. Take those results and write them
    for batchResult in batchResults:
        if (threadCount > 1):
            # If it's multi threaded we need to "get" the value
            currentBatchResults = batchResult.get()[0]
        else:
            currentBatchResults = batchResult

        for queryAlleleName in currentBatchResults.keys():
            bestReferenceName, alignmentScore = currentBatchResults[
                queryAlleleName]
            sequenceValidationResultsFile.write(
                str(queryAlleleName) + delimiter + str(bestReferenceName) +
                delimiter + str(alignmentScore) + '\n')

    sequenceValidationResultsFile.close()

    if (verbose):
        after = currentMillis()
        print('Finding References ' + str(len(fullLengthSequences)) +
              ' sequences took ' + str((after - before)) + ' seconds.')
Code example #27
def main(argv):

    argsgiven = 0
    query = ''
    subject = ''
    build_DB = True
    usage = 'seq_uniq_seek.py -q <queryfile>.fasta -s <subjectfile>.fasta -o <outputfile> [-x skip database build] [-m mute]'
    verbal = True
    opts, args = getopt.getopt(argv, "xmhq:s:o:", ["subject=", "query=", "output=", "mute"])
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt == '-x':
            build_DB = False
        elif opt in ("-q", "--query"):
            query = arg
            argsgiven += 1
        elif opt in ("-m", "--mute"):
            verbal = False
            argsgiven += 1
        elif opt in ("-s", "--subject"):
            subject = arg
            argsgiven += 1
        elif opt in ("-o", "--output"):
            output = arg
            argsgiven += 1
    if (argsgiven < 3):
        print(usage)
        sys.exit(2)

    if (verbal):
        print(
            "\n ---- ==== SEEK UNIQ SEQ ==== ---- \nFinding sequences occuring in "
            + query + " that are not occuring in " + subject +
            " and saving in " + output + ".fasta\n")

    if (build_DB):
        if (verbal):
            print("Building blast database for subject file (" + subject + ")")
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            input_file=subject,
                                            dbtype='nucl',
                                            parse_seqids=True)
        makedb()
        if (verbal): print("Done.\n")
    else:
        if (verbal): print("Not building database. Hoping for the best")

    if (verbal):
        print("Blasting query (" + query + ") against subject database (" +
              subject + ")")

    if (verbal): print("Splitting query into multiple files to save memory.")

    shutil.rmtree("chunks", ignore_errors=True)
    os.mkdir("chunks")

    record_iter = SeqIO.parse(open(query), "fasta")
    for i, batch in enumerate(batch_iterator(record_iter, 10000)):
        filename = "chunks/chunk_%i.fasta" % (i + 1)
        with open(filename, "w") as handle:
            count = SeqIO.write(batch, handle, "fasta")

    if (verbal): print("Building query index dictionary")
    q_dict = SeqIO.index(query, "fasta")
    hits = []

    chunks = glob.glob('chunks/chunk*')
    for i, file in enumerate(chunks):
        now = datetime.now()
        dt_string = now.strftime("%d-%m_%H:%M:%S")
        print("[xenoseq_blast   " + dt_string +
              "] So anyway... I'm busy blasting... " +
              str(round(i / len(chunks) * 100, 2)) + "%")

        blastn_cline = NcbiblastnCommandline(cmd='blastn',
                                             query=file,
                                             db=subject,
                                             num_threads=8,
                                             evalue=1e-5,
                                             perc_identity=90,
                                             outfmt=5,
                                             out="reads_all_vs_all.xml")

        blastn_cline()

        # Bit below is from: https://biopython.org/wiki/Retrieve_nonmatching_blast_queries

        for record in NCBIXML.parse(open("reads_all_vs_all.xml")):
            for alignment in record.alignments:
                if (alignment.length > 100):
                    hits.append(record.query.split()[0])
        os.remove("reads_all_vs_all.xml")

    shutil.rmtree("chunks")

    if (verbal): print("Subtracting hits from query dict keys")
    misses = set(q_dict.keys()) - set(hits)
    orphans = [q_dict[name] for name in misses]
    if (verbal):
        print("%i out of %i records in query are unique" %
              (len(misses), len(q_dict)))
    if (verbal): print("Writing to file %s" % (output))
    SeqIO.write(orphans, output, 'fasta')
    if (verbal): print("Done. Hoping for the best.\n")
Code example #28
def determine_DGR_activity_from_metagenome(rawdatafile, reference_genomefile,
                                           VRs, TRs, output_folder,
                                           rawdatafile2):
    #Create temp_directory in output_folder to store information
    temp_folder = '%s/temp' % (output_folder)
    Path(temp_folder).mkdir(parents=True, exist_ok=True)

    #Keep track of files that need to be deleted at the end of the analysis
    temp_files = []
    temp_blast_db = []

    #Format the reference genome file for easier searching later
    formatted_ref_genomefile = '%s/formatted_ref_genome.fasta' % (temp_folder)
    temp_files.append(formatted_ref_genomefile)
    utils.format_ref_genome_file(reference_genomefile,
                                 formatted_ref_genomefile)

    #Get the name of the reference genome file and remove the file extension
    reference_genome_name = '.'.join(
        reference_genomefile.split('/')[-1].split('.')[:-1])
    reference_genome_name = reference_genome_name.replace('_contigs', '')

    #Get the name of the rawdata file and remove the file extension
    rawdata_name = rawdatafile.split('/')[-1].split('.')[0]

    #Create error list for later debugging:
    errors = []
    error_file = '%s/DGR_analysis_%s_errors.txt' % (output_folder,
                                                    rawdata_name)

    #Determine if rawdatafile is in .gz format
    if rawdatafile.split('.')[-1] == 'gz':
        print('Uncompressing raw data')
        os.system('cp %s %s/%s.fastq.gz' %
                  (rawdatafile, temp_folder, rawdata_name))
        os.system('unpigz -p 4 %s/%s.fastq.gz' % (temp_folder, rawdata_name))
        temp_files.append('%s/%s.fastq' % (temp_folder, rawdata_name))
        if rawdatafile2 is not None:
            os.system('cp %s %s/%s_2.fastq.gz' %
                      (rawdatafile2, temp_folder, rawdata_name))
            os.system('unpigz -p 4 %s/%s_2.fastq.gz' %
                      (temp_folder, rawdata_name))
            os.system('cat %s/%s_2.fastq >> %s/%s.fastq' %
                      (temp_folder, rawdata_name, temp_folder, rawdata_name))
            os.system('rm %s/%s_2.fastq' % (temp_folder, rawdata_name))
        rawdatafile = '%s/%s.fastq' % (temp_folder, rawdata_name)

    #If the rawdata file is in fastq, it needs to be converted to fasta for blast; delete that file at the end
    #Otherwise just use the fasta file supplied, which does not need to be deleted
    if utils.is_fastq_file(rawdatafile):
        print('Converting rawdata from FASTQ to FASTA')
        rawdata_fasta = '%s/%s.fa' % (temp_folder, rawdata_name)
        temp_files.append(rawdata_fasta)
        with open(rawdata_fasta, 'w') as f:
            data = SeqIO.parse(rawdatafile, 'fastq')
            i = 1
            for seq in data:
                f.write('>Sequence%i\n%s\n' % (i, str(seq.seq)))
                i += 1
    else:
        rawdata_fasta = rawdatafile

    vr_100_filename = '%s/VR-100bp.fasta' % (temp_folder)
    temp_files.append(vr_100_filename)
    #Create empty file
    with open(vr_100_filename, 'w') as vr_100:
        pass

    for VR_file, TR_file in zip(VRs, TRs):
        if Path(VR_file).stat().st_size > 0:
            VR_start, VR_end, VR_contig_num, VR_seq = utils.extract_sequence(
                VR_file, formatted_ref_genomefile)
            TR_start, TR_end, TR_contig_num, TR_seq = utils.extract_sequence(
                TR_file, formatted_ref_genomefile)

            if len(VR_seq) != len(TR_seq):
                raise ValueError('VR and TR lengths are not equal')

            unique_name = '%s-Contig%s_%s_%s' % (
                reference_genome_name, VR_contig_num, VR_start, VR_end)

            print('Creating VR area files for %s' % (unique_name))

            #Define VR area +/- 100 bp in order to map to VR region
            with open(vr_100_filename, 'a') as vr_100:
                with open(formatted_ref_genomefile, 'r') as ref_genome:
                    rg_parser = SeqIO.parse(ref_genome, 'fasta')
                    for contig in rg_parser:
                        if contig.name == 'Contig%i' % (VR_contig_num):
                            vr_area_start = VR_start - 100
                            if vr_area_start < 0:
                                vr_area_start = 0
                            vr_area_end = VR_end + 100
                            if vr_area_end > len(contig.seq):
                                vr_area_end = len(contig.seq)
                            vr_100.write(
                                '>VR_area_100-%s\n%s' %
                                (unique_name,
                                 str(contig.seq[vr_area_start:vr_area_end])))
                            break
        else:
            errors.append('%s in %s did not contain a VR' %
                          (VR_file, rawdata_name))

    #Store all sequences from raw data that may potentially match to the VR area +/- 100 bp
    sequences_matched_to_vr_100_area = '%s/%s-seqs_match_VR100.fasta' % (
        temp_folder, rawdata_name)
    vr_100_blast_database = '%s/vr100_blastdb' % (temp_folder)
    blastoutput_from_vr100_blast = '%s/blastoutput_vr100.txt' % (temp_folder)

    #Delete these files at the end during cleanup
    temp_files.append(sequences_matched_to_vr_100_area)
    temp_blast_db.append(vr_100_blast_database)
    temp_files.append(blastoutput_from_vr100_blast)

    #Create blast database with the VR+/-100 area
    cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                       input_file=vr_100_filename,
                                       out=vr_100_blast_database)
    cline()

    print(
        "Finding potential raw data sequences that match to the surrounding VR area"
    )

    #Setup blast output options
    #output_options = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'sseq', 'qseq']
    output_options = ['qseqid', 'length', 'qlen']
    blast_out_str = ' '.join(output_options)

    #Blast all rawdata to the VR+/-100 blast database
    cline = NcbiblastnCommandline(out=blastoutput_from_vr100_blast,
                                  db=vr_100_blast_database,
                                  query=rawdata_fasta,
                                  outfmt='6 %s' % (blast_out_str),
                                  word_size=8,
                                  reward=1,
                                  penalty=-1,
                                  evalue=1e-4,
                                  gapopen=6,
                                  gapextend=6,
                                  perc_identity=80,
                                  task='blastn',
                                  dust='no')
    cline()

    #Find sequences that had at least 80% of their sequence align (gets rid of partially aligned sequences)
    potential_seqs = []
    with open(blastoutput_from_vr100_blast, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for query_id, align_len, query_len in reader:
            #Check to see if at least 80% of the rawdata sequence aligned
            if int(align_len) >= (0.8 * int(query_len)):
                potential_seqs.append(query_id)

    if len(potential_seqs) == 0:
        utils.cleanup(temp_files, temp_blast_db)
        print('There were no sequences that matched to VR')
        return

    #Now read in the rawdata file and create a new data file containing only the sequences that aligned to the VR area +/- 100 bp
    #Output: sequences_matched_to_vr_100_area now contains all rawdata reads that potentially match to VR
    with open(sequences_matched_to_vr_100_area, 'w') as vr_100_writer:
        data = SeqIO.parse(rawdata_fasta, 'fasta')
        for sequence in data:
            if sequence.name == potential_seqs[0]:
                vr_100_writer.write('>%s\n%s\n' %
                                    (sequence.name, str(sequence.seq)))
                if len(potential_seqs) > 1:
                    potential_seqs.pop(0)
                else:
                    break

    print("Determining which candidate sequences match best to VR")

    #Go through the rawdata reads that potentially map to VR and determine if there is a better match somewhere else by using the entire ref genome
    #First create a new blast database with the entire reference genome
    entire_ref_genome_blast_database = '%s/ref_gen_blastdb' % (temp_folder)
    temp_blast_db.append(entire_ref_genome_blast_database)

    cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                       input_file=formatted_ref_genomefile,
                                       out=entire_ref_genome_blast_database)
    time.sleep(3)
    cline()

    #Next create files for output; need to parse XML files to determine whether the best match is somewhere else
    best_alignments_blastoutput = '%s/best_alignments_blastoutput.xml' % (
        temp_folder)
    temp_files.append(best_alignments_blastoutput)

    #Blast the candidate reads to the entire genome to find alignment, using a more stringent alignment.
    #If read doesn't match anywhere else or matches best to VR region, then will use the read for downstream analysis
    #Input file: sequences_matched_to_vr_100_area
    #To do: find the optimum search settings for more stringency
    cline = NcbiblastnCommandline(out=best_alignments_blastoutput,
                                  db=entire_ref_genome_blast_database,
                                  query=sequences_matched_to_vr_100_area,
                                  outfmt=5,
                                  word_size=20,
                                  reward=1,
                                  penalty=-2,
                                  evalue=1e-4,
                                  gapopen=6,
                                  gapextend=2,
                                  perc_identity=80)
    cline()

    #Now parse the XML file and find the best match (or no match)
    #Output includes all transcripts that match to VR (but some may also match to TR and thus need to be filtered)
    vr_tr_transcripts = '%s/vr_tr_transcripts.fasta' % (temp_folder)
    reads_that_matched_better_somewhere_else = '%s/reads_that_matched_better_somewhere_else.fasta' % (
        temp_folder)
    temp_files.append(vr_tr_transcripts)
    #temp_files.append(reads_that_matched_better_somewhere_else)

    for VR_file, TR_file in zip(VRs, TRs):
        VR_start, VR_end, VR_contig_num, VR_seq = utils.extract_sequence(
            VR_file, formatted_ref_genomefile)
        TR_start, TR_end, TR_contig_num, TR_seq = utils.extract_sequence(
            TR_file, formatted_ref_genomefile)

        unique_name = '%s-Contig%s_%s_%s' % (reference_genome_name,
                                             VR_contig_num, VR_start, VR_end)

        print('Processing blast output for %s' % (unique_name))

        vr_tr_names = []
        with open(best_alignments_blastoutput, 'r') as blastoutput:
            parser = NCBIXML.parse(blastoutput)
            for result in parser:
                #For each sequence, find the best hit
                if len(result.alignments) > 0:
                    lowest = 1
                    best = [0, 0]
                    for anum, alignment in enumerate(result.alignments):
                        for hnum, hsp in enumerate(alignment.hsps):
                            if hsp.expect < lowest:
                                lowest = hsp.expect
                                best = [anum, hnum]
                    alignment = result.alignments[best[0]]
                    hsp = alignment.hsps[best[1]]
                    if alignment.hit_def == 'Contig%i' % (VR_contig_num):
                        within = False
                        if hsp.sbjct_start >= VR_start and hsp.sbjct_start <= VR_end:
                            within = True
                        if hsp.sbjct_end >= VR_start and hsp.sbjct_end <= VR_end:
                            within = True
                        if hsp.sbjct_start <= VR_start and hsp.sbjct_end >= VR_end:
                            within = True
                        if within:
                            vr_tr_names.append(result.query)

        if len(vr_tr_names) == 0:
            utils.cleanup(temp_files, temp_blast_db)
            print('There were no sequences that matched to VR')
        else:
            with open(vr_tr_transcripts, 'w') as vr_tr_writer:
                sequences = SeqIO.parse(sequences_matched_to_vr_100_area,
                                        'fasta')
                for sequence in sequences:
                    if sequence.name == vr_tr_names[0]:
                        vr_tr_writer.write('>%s\n%s\n' %
                                           (sequence.name, str(sequence.seq)))
                        if len(vr_tr_names) > 1:
                            vr_tr_names.pop(0)
                        else:
                            break

            #Remove any potential TR sequences
            #Extract the TR area +/- 100 bp, then see if any of the tr_vr_sequences align perfectly to TR
            #To do: allow for 1-2 mismatches
            sequences = SeqIO.parse(formatted_ref_genomefile, 'fasta')
            for sequence in sequences:
                if sequence.name == 'Contig%i' % (TR_contig_num):
                    start = TR_start - 100
                    if start < 0:
                        start = 0
                    end = TR_end + 100
                    if end > len(sequence.seq):
                        end = len(sequence.seq)
                    tr_area = sequence.seq[start:end]
                    tr_area_rev = tr_area.reverse_complement()

            vr_sequences = '%s/VR_sequences-%s.fasta' % (output_folder,
                                                         unique_name)
            num_trs = 0
            with open(vr_sequences, 'w') as vr_writer:
                sequences = SeqIO.parse(vr_tr_transcripts, 'fasta')
                for sequence in sequences:
                    if sequence.seq in tr_area or sequence.seq in tr_area_rev:
                        num_trs += 1
                    else:
                        vr_writer.write('>%s\n%s\n' %
                                        (sequence.name, str(sequence.seq)))

            #Align VR sequences and orient the VR/TR pair
            aligned_VR_sequences = '%s/aligned_VR_sequences-%s.fa' % (
                output_folder, unique_name)
            vr_blast_database = '%s/vrblastdb' % (temp_folder)
            vr_blast_output = '%s/vr_aligning_blastout.xml' % (temp_folder)

            temp_blast_db.append(vr_blast_database)
            temp_files.append(vr_blast_output)

            cline = NcbimakeblastdbCommandline(dbtype='nucl',
                                               input_file=VR_file,
                                               out=vr_blast_database)
            cline()

            cline = NcbiblastnCommandline(out=vr_blast_output,
                                          db=vr_blast_database,
                                          query=vr_sequences,
                                          outfmt=5,
                                          word_size=8,
                                          reward=1,
                                          penalty=-1,
                                          evalue=1e-4,
                                          gapopen=2,
                                          gapextend=1,
                                          perc_identity=50)
            cline()

            amiss, tmiss = 0, 0
            for i in range(len(VR_seq)):
                if VR_seq[i] != TR_seq[i]:
                    if TR_seq[i] == 'A':
                        amiss += 1
                    elif TR_seq[i] == 'T':
                        tmiss += 1
            reverse = False
            if tmiss > amiss:
                reverse = True

            with open(vr_blast_output, 'r') as f:
                results = NCBIXML.parse(f)
                with open(aligned_VR_sequences, 'w') as aligned_VR_writer:
                    vr_oriented = VR_seq
                    if reverse:
                        vr_oriented = str(Seq(VR_seq).reverse_complement())
                    aligned_VR_writer.write('>%s\n%s\n' % ('VR', vr_oriented))
                    for result in results:
                        if len(result.alignments) > 0:
                            lowest = 1
                            best = [0, 0]
                            for anum, alignment in enumerate(
                                    result.alignments):
                                for hnum, hsp in enumerate(alignment.hsps):
                                    if hsp.expect < lowest:
                                        lowest = hsp.expect
                                        best = [anum, hnum]
                            alignment = result.alignments[best[0]]
                            hsp = alignment.hsps[best[1]]
                            seq_out = ''
                            if hsp.sbjct_start < hsp.sbjct_end:
                                start = hsp.sbjct_start - 1
                            else:
                                start = hsp.sbjct_end - 1
                            i = 0
                            for i in range(start):
                                seq_out += '-'
                            if hsp.sbjct_start < hsp.sbjct_end:
                                seq_out += hsp.query
                            else:
                                seq_out += str(
                                    Seq(hsp.query).reverse_complement())
                            end = len(VR_seq) - start - len(hsp.query)
                            i = 0
                            for i in range(end):
                                seq_out += '-'
                            if reverse:
                                seq_out = str(
                                    Seq(seq_out).reverse_complement())
                            aligned_VR_writer.write('>%s\n%s\n' %
                                                    (result.query, seq_out))

    utils.cleanup(temp_files, temp_blast_db)
    if len(errors) > 0:
        utils.print_errors(error_file, errors)
Code example #29
def generate_profiles(in_dataframe, out_path):
    """Rather complicated and quite honetly ugly looking function used
    for generating the profiles from a given set of sequences. Intended to be used internally.
    """
    out_path = Path(out_path)
    dataset = in_dataframe
    s = Sultan()

    print('Unpacking and generating Uniprot DB.')
    s.gunzip('-fk ../data/swiss-prot/uniprot_sprot.fasta.gz').run()
    cmd = NcbimakeblastdbCommandline(
        input_file='../data/swiss-prot/uniprot_sprot.fasta', dbtype='prot')
    cmd()
    if not (out_path / 'profile').exists():
        s.mkdir(out_path / 'profile').run()

    with TemporaryDirectory() as psi_temp:
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            with NamedTemporaryFile(mode='w') as blast_in:
                if isinstance(sample.name, tuple):
                    sample_id, chain = sample.name[0], sample.name[1]
                    out_name = f'{sample_id}_{chain}'
                    dump_path = out_path / 'full_test_summary.joblib'
                else:
                    sample_id = sample.name
                    out_name = sample_id
                    dump_path = out_path / 'jpred_summary.joblib'

                sequence, structure = sample[['Sequence', 'Structure']]
                structure = ' ' + structure
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                blast_in.seek(0)
                cmd = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db='../data/swiss-prot/uniprot_sprot.fasta',
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=f'{psi_temp}/{out_name}.pssm',
                    num_descriptions=10000,
                    num_alignments=10000,
                    #  out=f'{psi_temp}{out_name}.alns.blast',
                    num_threads=8)
                cmd()

                if not os.path.exists(
                        os.path.join(psi_temp, out_name + '.pssm')):
                    tqdm.write(
                        f'Unable to generate profile for {out_name}. No hits in the database.'
                    )
                    dataset.drop(index=sample.name, inplace=True)
                    continue
                with open(f'{psi_temp}/{out_name}.pssm', 'r') as pssm_file:
                    pssm_file.readline()
                    pssm_file.readline()
                    profile = []
                    offset = False
                    position = 0
                    for line in pssm_file:
                        line = line.rstrip()
                        if not line:
                            break
                        line = line.split()
                        line.append(structure[position])
                        position += 1
                        if not offset:
                            for i in range(2):
                                line.insert(0, '')
                                offset = True
                        profile.append(line)
                    profile = pd.DataFrame(profile)
                    profile.drop(
                        (profile.columns[col] for col in range(2, 22)),
                        axis=1,
                        inplace=True)
                    profile.drop((profile.columns[-3:-1]),
                                 axis=1,
                                 inplace=True)
                    profile.drop((profile.columns[0]), axis=1, inplace=True)
                    profile.columns = profile.iloc[0]
                    profile = profile[1:]
                    profile.rename(columns={profile.columns[0]: "Sequence"},
                                   inplace=True)
                    profile.rename(columns={profile.columns[-1]: "Structure"},
                                   inplace=True)
                    profile = profile[
                        ['Structure'] +
                        [col for col in profile.columns if col != 'Structure']]
                    profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(
                        float).divide(100)
                    profile.to_csv(out_path / 'profile' /
                                   (out_name + '.profile'),
                                   sep='\t',
                                   index=False)
    print(
        f'Dumping clean test to {dump_path}. Profiles are generated in {out_path}/profile'
    )
    dump(dataset, dump_path)
Code example #30
def iden_main(args):
    if args.circular:
        probe_generate = probe_generate_c
    else:
        probe_generate = probe_generate_l
    seq_dict = {_.id: _ for _ in SeqIO.parse(args.database, 'fasta')}
    _table = pd.read_table(args.meta, names=['group', 'sample', 'assembly'], dtype=str)
    # by group for generate probe
    _group_table = _table.groupby(['group'])['assembly'].apply(list).reset_index(name='assembly')
    # by sample for pre-BLAST
    _group_sample_table = _table.groupby(['group', 'sample'])['assembly'].apply(list).reset_index(name='assembly')
    for _idx, _row in _group_table.iterrows():
        os.mkdir(args.tmp)
        # pre-probe
        _super_probe = defaultdict(str)
        for _asm in _row['assembly']:
            for _k, _v in probe_generate(seq_dict[_asm], args.length).items():
                _super_probe[_k] = _v
        _probe_lines = ['>' + seq_id + '\n' + seq for seq, seq_id in _super_probe.items()]
        with open(os.path.join(args.tmp, 'pre_probe.fasta'), 'w') as f:
            f.write('\n'.join(_probe_lines))
        del _super_probe, _probe_lines
        # pre-blast
        ## make blast database
        _db_lines = ['>' + seq_id + '\n' + str(seq_dict[seq_id].seq) for seq_id in _row['assembly']]
        with open(os.path.join(args.tmp, 'pre_blast_db.fasta'), 'w') as f:
            f.write('\n'.join(_db_lines))
        del _db_lines
        _database_cmd = NcbimakeblastdbCommandline(
            cmd=os.path.join(args.blast, 'makeblastdb'),
            dbtype='nucl',
            input_file=os.path.join(args.tmp, 'pre_blast_db.fasta'))
        _database_cmd()
        ## blast
        _blastn_cmd = NcbiblastnCommandline(
            cmd=os.path.join(args.blast, 'blastn'),
            query=os.path.join(args.tmp, 'pre_probe.fasta'),
            db=os.path.join(args.tmp, 'pre_blast_db.fasta'),
            task='blastn-short',
            dust='no',
            word_size=min(round(args.length/2)-1, 11),
            outfmt='\"6 qacc sacc length pident evalue\"',
            num_threads=args.threads,
            evalue=10,
            out=os.path.join(args.tmp, 'pre_blast.txt'),
            max_hsps=1,
            max_target_seqs=len(_row['assembly']))
        _blastn_cmd()
        ## parse result
        _blast_df = pd.read_table(os.path.join(args.tmp, 'pre_blast.txt'),
                                  names=['query', 'subject', 'length', 'identity', 'evalue'])
        _blast_df = _blast_df[(_blast_df['length'].values == args.length) & (_blast_df['identity'].values == 100)]
        _check_list = _group_sample_table.loc[_group_sample_table['group'].values == _row['group'], 'assembly'].to_list()
        _probe_list = []
        for _sample in _check_list:
            _probe_list.append(set(_blast_df.loc[_blast_df['subject'].map(lambda x: x in _sample)]['query'].to_list()))
        _probe_result = list(reduce(lambda x,y: x & y, _probe_list))
        del _probe_list
        _tmp_dict = {_.id: _ for _ in SeqIO.parse(os.path.join(args.tmp, 'pre_probe.fasta'), 'fasta')}
        _probe_dict = {}
        for _id in _probe_result:
            _probe_dict.setdefault(_id, _tmp_dict[_id])
        SeqIO.write(list(_probe_dict.values()), os.path.join(args.tmp, 'probe.fasta'),'fasta')
        del _tmp_dict, _probe_dict
        ## BLAST
        ap = BLASTParse(os.path.join(args.tmp, 'probe.fasta'),
                        _row['assembly'],
                        args.database,
                        args.tmp,
                        args.length,
                        args.threads,
                        args.blast)
        ap.blast_cmd()
        _result_tb = ap.blast_parse()
        _result_tb['group'] = _row['group']
        _result_tb['assembly'] = _row['assembly'][0]
        _result_tb[['group', 'assembly', 'seq', 'position', 'GC']]. \
            to_csv(os.path.join(args.output, _row['group'] + '.txt'),
                   sep='\t',
                   index=False)
        del_dir(args.tmp)