コード例 #1
0
def divide_values(file, ref_scores):
    """Normalize every score in a BSR matrix by its gene's reference score.

    The first line of *file* is treated as a header and copied (tab-joined)
    to ``BSR_matrix_values.txt`` in the current working directory.  On each
    following line the first whitespace-delimited field is the gene name and
    the remaining fields are scores; every score is divided by
    ``ref_scores[name]``.  The normalized rows are written with two decimals
    (without the gene-name column).

    When a gene has no usable reference score (missing, non-numeric or
    zero) the whole row is set to 0 and the gene is reported once through
    ``logging.logPrint``.

    Parameters:
        file: path of the whitespace-delimited matrix to read.
        ref_scores: mapping of gene name -> reference score.

    Returns:
        A list with one list of floats per data row.

    Raises:
        TypeError: if a score field cannot be parsed as a number.
    """
    unmatched = []
    outdata = []
    # Plain "r": the "U" flag has been a no-op on Python 3 and was removed
    # in 3.11, where open(..., "rU") raises ValueError.
    with open(file, "r") as infile:
        with open("BSR_matrix_values.txt", "w") as outfile:
            header = infile.readline().split()
            outfile.write("\t".join(header) + "\n")
            for line in infile:
                all_fields = line.split()
                # Convert eagerly so that a malformed field is detected here
                # (a lazy map() would defer the failure past this handler).
                try:
                    scores = [float(item) for item in all_fields[1:]]
                except ValueError:
                    raise TypeError("abnormal number of fields observed")
                values = []
                if scores:
                    gene = all_fields[0]
                    try:
                        # The reference score is per row, so parse it once.
                        reference = float(ref_scores.get(gene))
                        values = [score / reference for score in scores]
                    except (TypeError, ValueError, ZeroDivisionError):
                        # Name mismatch or unusable reference: zero the row.
                        unmatched.append(gene)
                        values = [0.0] * len(scores)
                formatted = ["%.2f" % elem for elem in values]
                outfile.write("\t".join(formatted) + "\n")
                outdata.append(values)
    if unmatched:
        seen = set()
        unique = []
        for gene in unmatched:
            if gene not in seen:
                seen.add(gene)
                unique.append(gene)
        logging.logPrint("The following genes had no hits in datasets or are too short, values changed to 0, check names and output: %s" % "\n".join(unique))
    return outdata
コード例 #2
0
def translate_genes(genes,outfile,min_len):
    """Translate nucleotide FASTA records into peptides with BioPython.

    Every record in *genes* is trimmed at its 3' end to a whole number of
    codons, translated with genetic code table 11 up to the first stop
    codon, and written to *outfile* in FASTA format when the peptide has at
    least *min_len* residues.  Records that yield shorter peptides are
    reported through ``logging.logPrint``.

    Parameters:
        genes: path of the nucleotide FASTA file to read.
        outfile: path of the peptide FASTA file to write.
        min_len: minimum peptide length to keep (anything ``int()`` accepts).

    Returns:
        The first kept peptide as a string, or None when nothing was kept.

    Raises:
        TypeError: if *min_len* is not an integer or a record cannot be
            translated.
    """
    kept = []
    too_short = []
    # Plain "r": the "U" mode flag was removed in Python 3.11.
    with open(genes, "r") as infile:
        with open(outfile, "w") as output_handle:
            for record in SeqIO.parse(infile, "fasta"):
                try:
                    min_pep_len = int(min_len)
                    # Drop the 1-2 trailing bases that do not form a codon.
                    usable = len(record.seq) - (len(record.seq) % 3)
                    pep_seq = record.seq[:usable].translate(to_stop=True, table=11)
                except Exception:
                    raise TypeError("odd characters observed in sequence %s" % record.id)
                if len(pep_seq) >= min_pep_len:
                    output_handle.write(">" + record.id + "\n")
                    output_handle.write(str(pep_seq) + "\n")
                    kept.append(pep_seq)
                else:
                    too_short.append(record.id)
    # Report before returning; the original returned first, so this message
    # was only ever emitted when no peptide at all had been kept.
    if too_short:
        logging.logPrint("The following sequences were too short and will not be processed: %s" % "\n".join(too_short))
    if kept:
        return str(kept[0])
    return None
コード例 #3
0
ファイル: reliableDownloader.py プロジェクト: carze/vappio
def monitorDownload(pr, downloaderChan, baseDir, url, minRate):
    """
    Watch a running download and terminate it when it becomes too slow.

    Every SAMPLE_RATE seconds the growth of the files belonging to `url`
    under `baseDir` is turned into a rate (negative values are clamped to 0)
    and pushed onto a sliding window holding at most MAX_SAMPLE_SIZE samples.
    Once the window is full and its mean drops below `minRate`, SIGTERM is
    sent to the downloader process.

    Returns True when `pr.exitCode` is set (the process ended by itself and
    `downloaderChan.receive()` returned without raising), False when the
    process was killed for being too slow.
    """
    sizeSamples = []
    while True:
        baseSize = getSizeOfFiles(getDownloadFilenames(baseDir, url))
        time.sleep(SAMPLE_RATE)
        ##
        # If the program has exited, collect its result (receive() raises if
        # the downloader reported an error); otherwise take another sample
        # and see if we should terminate
        if pr.exitCode is not None:
            downloaderChan.receive()
            return True

        currentSize = getSizeOfFiles(getDownloadFilenames(baseDir, url)) - baseSize
        logging.debugPrint(lambda: "Download rate: %8d - %s" % (currentSize / SAMPLE_RATE, getUrlFilename(url)))
        size = currentSize / SAMPLE_RATE
        if size < 0:
            size = 0
        sizeSamples.append(size)
        if len(sizeSamples) > MAX_SAMPLE_SIZE:
            sizeSamples.pop(0)

        if len(sizeSamples) < MAX_SAMPLE_SIZE:
            continue

        averageRate = sum(sizeSamples) / len(sizeSamples)
        if averageRate < minRate:
            logging.logPrint(
                "Average Rate: %8d - %s - KILLING" % (averageRate, getUrlFilename(url))
            )
            os.kill(pr.pipe.pid, signal.SIGTERM)
            ##
            # Give it a second to finish up whatever it's doing
            time.sleep(2)
            try:
                downloaderChan.receive()
            except Exception:
                # Best effort: a killed download is expected to report an
                # error here.  Only Exception is swallowed so that
                # KeyboardInterrupt/SystemExit still propagate.
                pass
            return False
コード例 #4
0
def translate_genes(genes):
    """Translate nucleotide FASTA records into peptides with BioPython.

    Each record in *genes* is translated with genetic code table 11 up to
    the first stop codon.  Peptides of at least 30 residues are written to
    ``genes.pep`` (FASTA format, current working directory); the ids of the
    other records are reported through ``logging.logPrint``.

    Returns:
        The first kept peptide as a string, or an empty list when no
        peptide was kept (both kept for backward compatibility).

    Raises:
        TypeError: if a record cannot be translated.
    """
    output = []
    too_short = []
    # Context managers guarantee both files are closed; the original
    # returned before reaching its close() calls.
    with open(genes, "r") as infile:
        with open("genes.pep", "w") as output_handle:
            for record in SeqIO.parse(infile, "fasta"):
                try:
                    # Translate once instead of three times per record.
                    pep_seq = record.seq.translate(to_stop=True, table=11)
                except Exception:
                    raise TypeError("odd characters observed in sequence")
                if len(pep_seq) >= 30:
                    # write() instead of the Python-2-only "print >>" form.
                    output_handle.write(">" + record.id + "\n")
                    output_handle.write(str(pep_seq) + "\n")
                    output.append(pep_seq)
                else:
                    too_short.append(record.id)
    if too_short:
        logging.logPrint(
            "The following sequences were too short and will not be processed: %s"
            % "\n".join(too_short))
    if output:
        return str(output[0])
    return output
コード例 #5
0
ファイル: reliableDownloader.py プロジェクト: carze/vappio
def monitorDownload(pr, downloaderChan, baseDir, url, minRate):
    """
    Poll the on-disk size of a download and kill it if it is too slow.

    The loop sleeps SAMPLE_RATE seconds between two size measurements of the
    files for `url` in `baseDir`, converts the difference to a rate (never
    below 0) and keeps the most recent MAX_SAMPLE_SIZE rates.  When that
    window is full and its average is below `minRate`, the downloader
    process receives SIGTERM.

    Returns True if `pr.exitCode` was set and `downloaderChan.receive()`
    did not raise; returns False after the process has been killed.
    """
    sizeSamples = []
    while True:
        baseSize = getSizeOfFiles(getDownloadFilenames(baseDir, url))
        time.sleep(SAMPLE_RATE)
        ##
        # Once the program has exited, fetch its result from the channel;
        # otherwise take another sample size and see if we should terminate
        if pr.exitCode is not None:
            downloaderChan.receive()
            return True

        currentSize = getSizeOfFiles(getDownloadFilenames(baseDir, url)) - baseSize
        logging.debugPrint(lambda : 'Download rate: %8d - %s' % (currentSize/SAMPLE_RATE, getUrlFilename(url)))
        size = max(currentSize/SAMPLE_RATE, 0)
        sizeSamples.append(size)
        if len(sizeSamples) > MAX_SAMPLE_SIZE:
            sizeSamples.pop(0)

        windowFull = len(sizeSamples) >= MAX_SAMPLE_SIZE
        if windowFull:
            averageRate = sum(sizeSamples)/len(sizeSamples)
            if averageRate < minRate:
                logging.logPrint('Average Rate: %8d - %s - KILLING' % (averageRate, getUrlFilename(url)))
                os.kill(pr.pipe.pid, signal.SIGTERM)
                ##
                # Give it a second to finish up whatever it's doing
                time.sleep(2)
                try:
                    downloaderChan.receive()
                except Exception:
                    # Deliberately ignored: the killed downloader is expected
                    # to report a failure.  Catching Exception (not a bare
                    # except) keeps Ctrl-C and SystemExit working.
                    pass
                return False
コード例 #6
0
def translate_genes(genes,outfile,min_len):
    """Translate nucleotide FASTA records into peptides with BioPython.

    Records read from *genes* are cut back at the 3' end to a multiple of
    three bases, translated (table 11, stopping at the first stop codon)
    and, when the peptide is at least *min_len* residues long, written to
    *outfile* as FASTA.  Ids of records with shorter peptides are reported
    through ``logging.logPrint``.

    Parameters:
        genes: path of the nucleotide FASTA input.
        outfile: path of the peptide FASTA output.
        min_len: minimum peptide length to keep (converted with ``int()``).

    Returns:
        The first kept peptide as a string, or None when nothing was kept.

    Raises:
        TypeError: if *min_len* is not an integer or translation fails.
    """
    output = []
    too_short = []
    # "rU" is rejected by Python >= 3.11; plain text mode is equivalent on 3.x.
    with open(genes, "r") as infile:
        with open(outfile, "w") as output_handle:
            for record in SeqIO.parse(infile, "fasta"):
                try:
                    min_pep_len = int(min_len)
                    # len % 3 is 0, 1 or 2, so a single slice covers every
                    # case the original if/elif chain handled.
                    extra_bases = len(record.seq) % 3
                    coding = record.seq[:len(record.seq) - extra_bases]
                    pep_seq = coding.translate(to_stop=True, table=11)
                except Exception:
                    raise TypeError("odd characters observed in sequence %s" % record.id)
                if len(pep_seq) < min_pep_len:
                    too_short.append(record.id)
                    continue
                output_handle.write(">" + record.id + "\n")
                output_handle.write(str(pep_seq) + "\n")
                output.append(pep_seq)
    # Emit the report before returning so it is no longer skipped whenever
    # at least one peptide was kept.
    if too_short:
        logging.logPrint("The following sequences were too short and will not be processed: %s" % "\n".join(too_short))
    if output:
        return str(output[0])
    return None
コード例 #7
0
def divide_values(file, ref_scores):
    """Normalize every score in a BSR matrix by its gene's reference score.

    The first line of *file* is a header that is copied (tab-joined) to
    ``BSR_matrix_values.txt`` in the current working directory.  Each other
    line holds a gene name followed by scores; every score is divided by
    ``ref_scores[name]`` and the resulting row is written with two decimals
    (the gene-name column is not written).

    Rows whose gene has no usable reference score (missing, non-numeric or
    zero) are set to 0 and the gene is reported once via
    ``logging.logPrint``.

    Parameters:
        file: path of the whitespace-delimited matrix to read.
        ref_scores: mapping of gene name -> reference score.

    Returns:
        A list with one list of floats per data row.

    Raises:
        TypeError: if a score field cannot be parsed as a number.
    """
    unmatched = []
    outdata = []
    # Plain "r": open(..., "rU") raises ValueError on Python >= 3.11.
    with open(file, "r") as infile:
        with open("BSR_matrix_values.txt", "w") as outfile:
            header = infile.readline().split()
            outfile.write("\t".join(header) + "\n")
            for line in infile:
                all_fields = line.split()
                # Eager conversion: on Python 3 a lazy map() would raise
                # later, outside of this handler.
                try:
                    scores = [float(item) for item in all_fields[1:]]
                except ValueError:
                    raise TypeError("abnormal number of fields observed")
                values = []
                if scores:
                    gene = all_fields[0]
                    try:
                        # One lookup/parse per row instead of one per cell.
                        reference = float(ref_scores.get(gene))
                        values = [score / reference for score in scores]
                    except (TypeError, ValueError, ZeroDivisionError):
                        # Name mismatch or unusable reference: zero the row.
                        unmatched.append(gene)
                        values = [0.0] * len(scores)
                formatted = ["%.2f" % elem for elem in values]
                outfile.write("\t".join(formatted) + "\n")
                outdata.append(values)
    if unmatched:
        seen = set()
        unique = []
        for gene in unmatched:
            if gene not in seen:
                seen.add(gene)
                unique.append(gene)
        logging.logPrint("The following genes had no hits in datasets or are too short, values changed to 0, check names and output: %s" % "\n".join(unique))
    return outdata
コード例 #8
0
ファイル: reliableDownloader.py プロジェクト: carze/vappio
def runDownloader(chan):
    """
    Run one download job received on `chan` and report its outcome.

    `chan.receive()` yields a `(pr, rchan)` pair: the program runner to
    execute and the channel on which the result is reported.  On success
    `None` is sent on `rchan`; any Exception is logged and forwarded with
    `rchan.sendError`.
    """
    pr, rchan = chan.receive()
    try:
        commands.runProgramRunnerEx(pr)
        logging.debugPrint(lambda : 'Successfully completed download')
        rchan.send(None)
    # "except X as err" is valid on Python 2.6+ and 3; the old
    # "except X, err" form is a syntax error on Python 3.
    except Exception as err:
        logging.logPrint('Download failed for unknown reason: ' + str(err))
        rchan.sendError(err)
コード例 #9
0
ファイル: reliableDownloader.py プロジェクト: carze/vappio
def runDownloader(chan):
    """
    Execute the download described by the message waiting on `chan`.

    The message is a `(pr, rchan)` pair.  `pr` is handed to
    `commands.runProgramRunnerEx`; when that returns normally `None` is sent
    on `rchan`, otherwise the exception is logged and passed to
    `rchan.sendError`.
    """
    pr, rchan = chan.receive()
    try:
        commands.runProgramRunnerEx(pr)
        logging.debugPrint(lambda: "Successfully completed download")
        rchan.send(None)
    # Python 2.6+/3 compatible spelling of the old "except Exception, err".
    except Exception as err:
        logging.logPrint("Download failed for unknown reason: " + str(err))
        rchan.sendError(err)
コード例 #10
0
 def _perform_workflow(data):
     """Build the table for one sample and collect its name and values.

     `data` is a 2-item pair whose second item is the sample file; the
     results of make_table_dev are appended to the enclosing scope's
     `names` and `table_list`.
     """
     _, sample_file = data
     sample_name, sample_values = make_table_dev(sample_file, "F", clusters)
     names.append(sample_name)
     table_list.append(sample_values)
     if debug != "T":
         return
     logging.logPrint("sample %s processed" % sample_file)
コード例 #11
0
ファイル: util.py プロジェクト: tarah28/LS-BSR
 def _perform_workflow(data):
     """Process one (temp name, file) pair with make_table_dev.

     The returned name and values are appended to `names` and `table_list`
     from the enclosing scope; progress is logged when `debug` is "T".
     """
     sample_file = data[1] if len(data) == 2 else None
     tn, sample_file = data
     table_result = make_table_dev(sample_file, "F", clusters)
     sample_name, sample_values = table_result
     names.append(sample_name)
     table_list.append(sample_values)
     verbose = debug == "T"
     if verbose:
         logging.logPrint("sample %s processed" % sample_file)
コード例 #12
0
def _perform_workflow_nl(data):
    """Non-closure variant of the per-sample table worker.

    `data` carries everything the worker needs, by position:
    0: (temp name, sample file) pair, 1: clusters, 2: list collecting
    names, 3: list collecting table values, 4: debug flag ("T" to log).
    """
    _, sample_file = data[0]
    cluster_ids = data[1]
    collected_names = data[2]
    collected_tables = data[3]
    debug_flag = data[4]

    table_result = make_table_dev(sample_file, "F", cluster_ids)
    sample_name, sample_values = table_result
    collected_names.append(sample_name)
    collected_tables.append(sample_values)
    if debug_flag != "T":
        return
    logging.logPrint("sample %s processed" % sample_file)
コード例 #13
0
ファイル: util.py プロジェクト: BioinformaticsArchive/wgfast
def run_raxml(fasta_in, tree, out_class_file, insertion_method, parameters, model, suffix):
    """Insert sequences into an existing tree with RAxML (untested, system calls).

    Builds the ``raxmlHPC-SSE3`` command line, runs it with stdout/stderr
    captured in ``<suffix>.raxml.out`` / ``<suffix>.raxml.log``, then
    post-processes the ``RAxML_*.<suffix>`` files in the current directory.

    Parameters:
        fasta_in: alignment passed with ``-s``.
        tree: reference tree passed with ``-t``.
        out_class_file: file the classification likelihood weights are
            appended to.
        insertion_method: value passed with ``-f``.
        parameters: value passed with ``-R``; the literal "NULL" omits it.
        model: value passed with ``-m``; "ASC_GTRGAMMA" additionally
            enables ``--asc-corr=lewis``.
        suffix: run name passed with ``-n`` and used in all file names.

    Returns:
        *suffix*, unchanged.

    Raises:
        subprocess.CalledProcessError: if renaming the labelled tree fails.
    """
    # One argument list instead of four copies.  The trailing
    # '>', '/dev/null 2>&1' entries of the original were dropped: Popen is
    # invoked without a shell, so they were never a redirection and were
    # handed to RAxML as literal arguments.  Output is captured through the
    # file handles below.
    args = ['raxmlHPC-SSE3', '-f', '%s' % insertion_method,
            '-s', '%s' % fasta_in, '-m', '%s' % model, '-n', '%s' % suffix]
    if "NULL" != parameters:
        args.extend(['-R', parameters])
    args.extend(['-t', '%s' % tree])
    if "ASC_GTRGAMMA" == model:
        args.append('--asc-corr=lewis')
    args.append('--no-bfgs')

    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('%s.raxml.out' % suffix, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open raxml file')
        try:
            log_fh = open('%s.raxml.log' % suffix, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')

        log_isg.logPrint("inserting sequence into tree")
        if vcf_fh is None or log_fh is None:
            # Same outcome as before: without both files RAxML is not run.
            log_isg.logPrint("sequence(s) were not inserted into tree!!!!!")
        else:
            try:
                raxml_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
                raxml_run.wait()
                log_isg.logPrint("sequence(s) inserted into tree")
            except Exception:
                log_isg.logPrint("sequence(s) were not inserted into tree!!!!!")
    finally:
        # The handles were previously never closed.
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()

    # Raw string: "\[" is not a valid escape in a normal string literal.
    os.system(r"sed 's/\[[^]]*\]//g' RAxML_labelledTree.%s > %s.tree_including_unknowns_noedges.tree" % (suffix, suffix))
    subprocess.check_call("mv RAxML_labelledTree.%s %s_tree_including_unknowns_edges.tree" % (suffix, suffix) , shell=True)
    try:
        subprocess.check_call("cat RAxML_classificationLikelihoodWeights.%s >> %s" % (suffix, out_class_file), shell=True)
    except (subprocess.CalledProcessError, OSError):
        # Best effort: the weights file may not exist for this run.
        pass
    os.system("rm RAxML_*.%s" % suffix)
    return suffix
コード例 #14
0
ファイル: util.py プロジェクト: BioinformaticsArchive/wgfast
def raxml_calculate_base_tree(in_fasta, model, name):
    """Infer the base tree with RAxML (not tested, all system calls).

    Runs ``raxmlHPC-SSE3 -f d -p 12345`` on *in_fasta* with substitution
    model *model* and run name *name*; stdout goes to ``raxml.out`` and
    stderr to ``raxml.log`` in the current directory.

    If the output files cannot be opened or RAxML cannot be started, a
    message is printed and the interpreter exits via ``sys.exit()``.
    """
    # The original list ended with '>', '/dev/null 2>&1'.  Popen runs
    # without a shell, so those were passed to RAxML as literal arguments
    # instead of redirecting output; the file handles below do that job.
    args = ['raxmlHPC-SSE3', '-f', 'd', '-p', '12345',
            '-s', '%s' % in_fasta, '-m', '%s' % model, '-n', '%s' % name, "--no-bfgs"]
    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('raxml.out', 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open raxml file')
        try:
            log_fh = open('raxml.log', 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')
        try:
            if vcf_fh is None or log_fh is None:
                # Mirrors the original, where a failed open() made the run
                # impossible and ended in the handler below.
                raise IOError("RAxML output files could not be opened")
            raxml_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
            raxml_run.wait()
        except Exception:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print("could not infer base pruned tree")
            sys.exit()
    finally:
        # Close the handles (also when exiting); they used to be leaked.
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()
コード例 #15
0
ファイル: batch_job_runner.py プロジェクト: carze/vappio
def run(options):
    """
    Drive a batch of pipelines described by the command-line `options`.

    Loads the batch and machine configuration, builds the `State`, queues
    whatever work is still incomplete and, if anything was queued, yields
    the deferred that fires when the pipelines queue has drained.  Raises
    `JobFailed` when any batch state is missing a 'state' entry or is
    marked 'failed'.
    """
    templateKey = 'batch_pipeline.pipeline.PIPELINE_TEMPLATE'

    logging.logPrint('Starting')

    batchConfig = config.configFromStream(open(options.configFile), lazy=True)
    machineConf = config.configFromStream(open('/tmp/machine.conf'))

    # Intermediate values are computed in the same order in which the
    # State(...) arguments are evaluated.
    workflowConfig = options.workflowConfig
    batchStatesFile = options.batchStatesFile
    templateName = batchConfig(templateKey)
    detectedWrapper = pipeline_misc.determineWrapper(machineConf,
                                                     batchConfig(templateKey))
    wrapper = _validateWrapper(templateName, detectedWrapper)
    batchEntries = _interpretBatchFile(options.batchFile)
    innerConfig = _extractInnerPipelineConfig(batchConfig)
    wrapperName = batchConfig('pipeline.PIPELINE_WRAPPER_NAME')
    concurrentPrerun = int(batchConfig('batch.options.CONCURRENT_PRERUN'))
    concurrentPipelines = int(batchConfig('batch.options.CONCURRENT_PIPELINES'))
    concurrentPostrun = int(batchConfig('batch.options.CONCURRENT_POSTRUN'))

    state = State(workflowConfig, batchStatesFile, wrapper, batchEntries,
                  innerConfig, wrapperName, concurrentPrerun,
                  concurrentPipelines, concurrentPostrun)

    logging.logPrint('Queuing any incomplete work')
    queueCount = _queueIncompleteWork(state)
    logging.logPrint('Queued: %d' % queueCount)

    if state.pipelinesQueue.hasWork():
        yield defer_work_queue.waitForCompletion(state.pipelinesQueue)

    for batchState in state.batchStates.values():
        finishedOk = 'state' in batchState and batchState['state'] != 'failed'
        if not finishedOk:
            raise JobFailed()
コード例 #16
0
ファイル: batch_job_runner.py プロジェクト: carze/vappio
def run(options):
    """
    Run the batch described by `options` and fail if any entry failed.

    Reads the batch config (lazily) and /tmp/machine.conf, constructs the
    `State`, queues incomplete work, yields the completion deferred of the
    pipelines queue when it has work, and finally raises `JobFailed` if a
    batch state has no 'state' key or its 'state' is 'failed'.
    """
    logging.logPrint('Starting')

    batchConfig = config.configFromStream(open(options.configFile), lazy=True)
    machineConf = config.configFromStream(open('/tmp/machine.conf'))

    def _template():
        # Looked up twice, exactly as in the original call expression.
        return batchConfig('batch_pipeline.pipeline.PIPELINE_TEMPLATE')

    def _concurrency(optionName):
        return int(batchConfig('batch.options.' + optionName))

    stateArgs = [options.workflowConfig, options.batchStatesFile]
    stateArgs.append(
        _validateWrapper(_template(),
                         pipeline_misc.determineWrapper(machineConf, _template())))
    stateArgs.append(_interpretBatchFile(options.batchFile))
    stateArgs.append(_extractInnerPipelineConfig(batchConfig))
    stateArgs.append(batchConfig('pipeline.PIPELINE_WRAPPER_NAME'))
    stateArgs.append(_concurrency('CONCURRENT_PRERUN'))
    stateArgs.append(_concurrency('CONCURRENT_PIPELINES'))
    stateArgs.append(_concurrency('CONCURRENT_POSTRUN'))
    state = State(*stateArgs)

    logging.logPrint('Queuing any incomplete work')
    queueCount = _queueIncompleteWork(state)
    logging.logPrint('Queued: %d' % queueCount)

    pipelinesQueue = state.pipelinesQueue
    if pipelinesQueue.hasWork():
        yield defer_work_queue.waitForCompletion(state.pipelinesQueue)

    for batchState in state.batchStates.values():
        if 'state' not in batchState:
            raise JobFailed()
        if batchState['state'] == 'failed':
            raise JobFailed()
コード例 #17
0
ファイル: commands.py プロジェクト: carze/vappio
    def __call__(self):
        """
        Start self.cmd in a shell and describe how to consume its output.

        Returns:
        (onComplete, [(stdout, stdoutf), (stderr, stderrf)])

        onComplete is the cleanup to run once all streams are consumed; each
        pair couples one of the child's pipes with the function to call when
        data arrives on it.  The Popen object is also stored on self.pipe.
        """
        if self.log:
            logPrint(self.cmd)

        env = self.env
        if self.addEnv:
            # Start from a copy (of self.env when set, else of the current
            # process environment) because it is about to be modified.
            if self.env:
                baseEnv = dict(self.env)
            else:
                baseEnv = dict(os.environ)
            env = functional.updateDict(baseEnv, self.addEnv)

        pipe = subprocess.Popen(self.cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True,
                                env=env)
        self.pipe = pipe

        streamHandlers = [(pipe.stdout, self.stdoutf),
                          (pipe.stderr, self.stderrf)]
        return (self.onComplete, streamHandlers)
コード例 #18
0
ファイル: util.py プロジェクト: BioinformaticsArchive/wgfast
def run_gatk(reference, processors, name, gatk, tmp_dir):
    """Run the GATK UnifiedGenotyper on ``<name>_renamed_header.bam``.

    gatk controller, mbq used to be set to 17, but was recently changed -
    untested, system call.

    Parameters:
        reference: reference FASTA passed with ``-R``.
        processors: thread count passed with ``-nt``.
        name: sample name; input is ``<name>_renamed_header.bam``, stdout
            goes to ``<name>.vcf.out`` and stderr to ``<name>.vcf.log``.
        gatk: path of the GATK jar.
        tmp_dir: directory passed as ``-Djava.io.tmpdir``.

    Problems (files that cannot be opened, java that cannot be started) are
    reported through ``log_isg.logPrint``; nothing is raised or returned.
    """
    args = ['java', '-Djava.io.tmpdir=%s' % tmp_dir, '-jar', '%s' % gatk, '-T', 'UnifiedGenotyper',
            '-R', '%s' % reference, '-nt', '%s' % processors, '-S', 'silent',
            '-ploidy', '1', '-out_mode', 'EMIT_ALL_CONFIDENT_SITES',
            '-stand_call_conf', '30', '-stand_emit_conf', '30', '-I', '%s_renamed_header.bam' % name,
            '-rf', 'BadCigar']
    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('%s.vcf.out' % name, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open vcf file')
        try:
            log_fh = open('%s.vcf.log' % name, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')
        if vcf_fh is None or log_fh is None:
            # Same outcome as before: without both files GATK is not run.
            log_isg.logPrint("GATK encountered problems and did not run")
            return
        try:
            gatk_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
            gatk_run.wait()
        except Exception:
            log_isg.logPrint("GATK encountered problems and did not run")
    finally:
        # Close the handles so the VCF is flushed to disk before callers
        # read it; they were previously left open.
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()
コード例 #19
0
ファイル: lgt_wrapper.py プロジェクト: carze/vappio
def _log(batchState, msg):
    """Log `msg` prefixed with the batch number stored in `batchState`."""
    batchNum = batchState['batch_num']
    line = 'BATCH_NUM %d - %s' % (batchNum, msg)
    logging.logPrint(line)
コード例 #20
0
ファイル: clovr_wrapper.py プロジェクト: carze/vappio
def _log(batchState, msg):
    """Write `msg` to the log, tagged with batchState['batch_num']."""
    prefix = 'BATCH_NUM %d' % batchState['batch_num']
    logging.logPrint(prefix + ' - %s' % (msg,))
コード例 #21
0
ファイル: batch_job_runner.py プロジェクト: carze/vappio
 def updateBatchState(self):
     """Log how many batch states exist, then dump them to the states file.

     Returns whatever batch_state.dump returns.
     """
     entryCount = len(self.batchStates)
     logging.logPrint('Dumping states file with %d entries' % entryCount)
     dumped = batch_state.dump(self.batchStatesFile, self.batchStates)
     return dumped
コード例 #22
0
ファイル: ls_bsr.py プロジェクト: mgalardini/LS-BSR
def main(directory, id, filter, processors, genes, usearch, blast, penalty, reward, length,
         max_plog, min_hlog, f_plog, keep, debug_table):
    """Run the LS-BSR workflow on the ``*.fasta`` genomes in *directory*.

    Steps, in order:
      1. check that the external tools needed for the chosen aligner exist;
      2. create ``<directory>/joined`` and copy every genome into it;
      3. obtain the gene set: predicted with Prodigal and clustered with
         USEARCH when *genes* contains "null", otherwise taken from the
         ``.pep`` / ``.fasta`` file named by *genes*;
      4. align the genes against themselves (reference scores) and against
         every genome;
      5. build the BSR matrix, normalize it with ``divide_values`` and copy
         the results into the directory the program was started from.

    Parameters:
        directory: directory holding the ``*.fasta`` genomes.
        id: clustering identity handed to the USEARCH helpers.
        filter, penalty, reward: forwarded to the BLAST helpers.
        processors: worker count forwarded to the helpers.
        genes: "null" to predict genes, else the path of a gene file.
        usearch: path of the usearch executable.
        blast: aligner to use: "blastn", "tblastn" or "blat".
        length, max_plog, min_hlog: forwarded to ``find_dups``.
        f_plog: paralogs are filtered when this contains "T".
        keep: "T" keeps the ``joined`` working directory.
        debug_table: "old" or "new", selects the table-building code path.

    The function changes the working directory several times and calls
    ``sys.exit()`` on several error conditions.
    """
    start_dir = os.getcwd()
    ap=os.path.abspath("%s" % start_dir)
    dir_path=os.path.abspath("%s" % directory)
    logging.logPrint("Testing paths of dependencies")
    # `which` returns 0 when the executable is found on PATH.
    if blast=="blastn" or blast=="tblastn":
        ab = subprocess.call(['which', 'blastall'])
        if ab == 0:
            print "citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402"
        else:
            print "blastall isn't in your path, but needs to be!"
            sys.exit()
    # makedirs failing is taken to mean a previous run left its directory behind.
    try:
        os.makedirs('%s/joined' % dir_path)
    except:
        print "old run directory exists in your genomes directory (%s/joined).  Delete and run again" % dir_path
        sys.exit()
    # Copy every genome into the working directory as <name>.new.
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name=get_seq_name(infile)
        os.system("cp %s %s/joined/%s.new" % (infile,dir_path,name))
    if "null" in genes:
        # --- De novo path: predict genes, cluster them, then align. ---
        rc = subprocess.call(['which', 'prodigal'])
        if rc == 0:
            pass
        else:
            # NOTE(review): unlike the other dependency checks there is no
            # sys.exit() here, so the run continues without prodigal.
            print "prodigal is not in your path, but needs to be!"
        print "citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119"
        try:
            if os.path.exists(usearch):
                print "citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461"
        except:
            # NOTE(review): presumably reached when `usearch` is not a path
            # string; the sys.exc_clear() after the raise is unreachable.
            raise TypeError("-u usearch flag must be set for use with prodigal")
            sys.exc_clear()
        if blast=="blat":
            ac = subprocess.call(['which', 'blat'])
            if ac == 0:
                print "citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool.  Genome Research 12:656-664"
            else:
                print "You have requested blat, but it is not in your PATH"
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(dir_path, processors)
        logging.logPrint("Prodigal done")
        # Pool all predicted genes, then filter and rename them.
        os.system("cat *genes.seqs > all_gene_seqs.out")
        filter_scaffolds("all_gene_seqs.out")
        os.system("mv tmp.out all_gene_seqs.out")
        rename_fasta_header("all_gene_seqs.out", "all_sorted.txt")
        # Split the pooled file into 200000-line pieces for clustering.
        os.system("mkdir split_files")
        os.system("cp all_sorted.txt split_files/")
        os.system("rm all_sorted.txt")
        os.chdir("split_files/")
        os.system("split -l 200000 all_sorted.txt")
        logging.logPrint("clustering with USEARCH at an ID of %s" % id)
        sort_usearch(usearch)
        run_usearch(usearch, id)
        os.system("cat *.usearch.out > all_sorted.txt")
        os.system("mv all_sorted.txt %s/joined" % dir_path)
        os.chdir("%s/joined" % dir_path)
        uclust_cluster(usearch, id)
        logging.logPrint("USEARCH clustering finished")
        clusters = get_cluster_ids("consensus.fasta")
        subprocess.check_call("formatdb -i consensus.fasta -p F", shell=True)
        # Self-alignment of the consensus genes gives the reference scores.
        if "tblastn" == blast:
            translate_consensus("consensus.fasta")
            filter_seqs("tmp.pep")
            blast_against_self("consensus.fasta", "consensus.pep", "tmp_blast.out", filter, blast, penalty, reward, processors)
        elif "blastn" == blast:
            blast_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, blast, penalty, reward, processors)
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
        else:
            pass
        # Keep one line per query (unique on the first column).
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores=parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        # Align the consensus genes against every genome.
        if "tblastn" == blast:
            blast_against_each_genome(dir_path, processors, filter, "consensus.pep", blast, penalty, reward)
        elif "blastn" == blast:
            blast_against_each_genome(dir_path, processors, filter, "consensus.fasta", blast, penalty, reward)
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta",processors)
        else:
            pass
        # Only this de novo path calls find_dups.
        find_dups(ref_scores, length, max_plog, min_hlog)
    else:
        # --- Pre-compiled path: use the gene file supplied by the caller. ---
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files)==0:
            print "no usable reference genomes found!"
            sys.exit()
        else:
            pass
        gene_path=os.path.abspath("%s" % genes)
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s/joined/" % (gene_path,dir_path))
        os.chdir("%s/joined" % dir_path)
        if gene_path.endswith(".pep"):
            # Peptide input: blastp against itself, tblastn against genomes.
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("formatdb -i %s" % gene_path, shell=True)
            except:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self(gene_path, gene_path, "tmp_blast.out", filter, "blastp", penalty, reward, processors)
            subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores=parse_self_blast(open("self_blast.out", "U"))
            subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome(dir_path, processors, filter, gene_path, "tblastn", penalty, reward)
        elif gene_path.endswith(".fasta"):    
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                # Result is not used here; the following calls read "genes.pep".
                translate_genes(gene_path,)
                try:
                    subprocess.check_call("formatdb -i %s -p F" % gene_path, shell=True)
                except:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self(gene_path, "genes.pep", "tmp_blast.out", filter, blast, penalty, reward, processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome(dir_path, processors, filter, "genes.pep", blast, penalty, reward)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("formatdb -i %s -p F" % gene_path, shell=True)
                except:
                    logging.logPrint("BLAST not found")
                    sys.exit()
                blast_against_self(gene_path, gene_path, "tmp_blast.out", filter, blast, penalty, reward, processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome(dir_path, processors, filter, gene_path, blast, penalty, reward)
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path,gene_path,processors)
            else:
                # NOTE(review): with any other `blast` value ref_scores is
                # never assigned, so divide_values(...) below would raise
                # NameError.
                pass
        else:
            print "input file format not supported"
            sys.exit()
    if blast=="blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report()
    get_unique_lines()
    # Build the per-genome tables with either the old or the new code path.
    if debug_table == "old":
        make_table(processors, "F", clusters)
    elif debug_table == "new":
        curr_dir=os.getcwd()
        table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
        files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                                for idx, f in enumerate(table_files)]
        names=[]
        def _perform_workflow(data):
            # Worker: builds the table for one file and records its name(s)
            # in the enclosing `names` list.
            tn, f = data
            name=make_table_dev(f, "F", clusters)
            names.append(name)
        # `results` is not used afterwards; the call is made for its side effects.
        results = set(p_func.pmap(_perform_workflow,
                                  files_and_temp_names,
                                  num_workers=processors))
        nr_sorted=sorted(clusters)
        # ref.list is opened in append mode for every single write.
        open("ref.list", "a").write("\n")
        for x in nr_sorted:
            open("ref.list", "a").write("%s\n" % x)
        names_out = open("names.txt", "w")
        # Flatten the list of name lists collected by the workers.
        names_redux = [val for subl in names for val in subl]
        for x in names_redux: print >> names_out, "".join(x)
        names_out.close()
    else:
        print "incorrect debug option selected, exiting"
        sys.exit()
    # Assemble the raw matrix, normalize it and copy it to the start directory.
    subprocess.check_call("paste ref.list *.matrix > bsr_matrix", shell=True)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    else:
        pass
    # Best effort: not every listed file exists on every code path, so a
    # failing cp is ignored and its stderr is discarded.
    try:
        subprocess.check_call("cp names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s" % start_dir, shell=True, stderr=open(os.devnull, 'w'))
    except:
        sys.exc_clear()
    logging.logPrint("all Done")
    os.chdir("%s" % dir_path)
    # Remove the working directory unless the caller asked to keep it.
    if "T" == keep:
        pass
    else:
        os.system("rm -rf joined")
コード例 #23
0
ファイル: util.py プロジェクト: BioinformaticsArchive/wgfast
    def _perform_workflow(data):
        """Process one sample from raw reads to its temporary SNP matrix.

        ``data`` is an ``(idx, f)`` pair: idx is the sample name and f holds
        the sample's read files (``f[0]`` and, for paired-end data,
        ``f[1]``).  Nothing is done when ``<idx>.tmp.xyx.matrix`` already
        exists.  Otherwise the reads are optionally trimmed with
        Trimmomatic, aligned with run_bwa, converted to a read-group-tagged
        BAM, genotyped with run_gatk and turned into a filtered VCF and a
        temporary matrix.  Intermediate files that already exist are reused.

        Besides ``data`` the function relies on names from the enclosing
        scope (trim, trim_path, wgfast_path, reference, processors, picard,
        gatk, tmp_dir, doc, lock, ap, ref_coords, coverage, proportion,
        matrix).
        """
        idx, f = data
        # A finished matrix means this sample was processed in an earlier run.
        if os.path.isfile("%s.tmp.xyx.matrix" % idx):
            pass
        else:
            # More than one read file: treat the sample as paired-end.
            if len(f)>1:
                if "T" in trim:
                    """paired end sequences - Hardcoded the number of processors per job to 2"""
                    # NOTE(review): the two unpaired output names do not
                    # contain idx, so all samples write to the same files.
                    args=['java','-jar','%s' % trim_path,'PE', '-threads', '2',
                          '%s' % f[0], '%s' % f[1], '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
	                  '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz', 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
	                  'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    # Skip trimming when the trimmed reads already exist.
                    if os.path.isfile("%s.F.paired.fastq.gz" % idx):
                        pass
                    else:
                        # NOTE(review): stderr goes to the ".out" handle and
                        # stdout to the ".log" handle - confirm this is intended.
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint('problem enountered trying to run trimmomatic')
                else:
                    # No trimming: hard-link the raw reads under the names
                    # the trimmed reads would have had.
                    os.link(f[0], "%s.F.paired.fastq.gz" % idx)
                    os.link(f[1], "%s.R.paired.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                     run_bwa(reference, '%s.F.paired.fastq.gz' % idx, '%s.R.paired.fastq.gz' % idx, processors, idx)
            else:
                if "T" in trim:
                    """single end support"""
                    args=['java','-jar','%s' % trim_path,'SE', '-threads', '2',
                          '%s' % f[0], '%s.single.fastq.gz' % idx, 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
	                  'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    # Skip trimming when the trimmed reads already exist.
                    if os.path.isfile("%s.single.fastq.gz" % idx):
                        pass
                    else:
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint("problem encountered with trimmomatic")
                else:
                    os.link(f[0], "%s.single.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                    # "NULL" takes the place of the second read file.
                    run_bwa(reference, '%s.single.fastq.gz' % idx, "NULL", processors, idx)
            # Convert the alignment to a sorted, indexed BAM with read groups
            # unless that BAM already exists.
            if os.path.isfile("%s_renamed_header.bam" % idx):
                pass
            else:
                process_sam("%s.sam" % idx, idx)
                """inserts read group information, required by new versions of GATK"""
                os.system("java -jar %s INPUT=%s.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (picard,idx,idx,idx,idx,idx))
                os.system("samtools index %s_renamed_header.bam > /dev/null 2>&1" % idx)
            run_gatk(reference, processors, idx, gatk, tmp_dir)
            if "T" == doc:
                # The depth-of-coverage run is serialized across workers with `lock`.
                lock.acquire()
                os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx,idx))
                os.system("java -Djava.io.tmpdir=%s -jar %s -R %s/scratch/reference.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (tmp_dir,gatk,ap,idx,idx))
                lock.release()
                process_coverage(idx)
            else:
                pass
            # Filter the raw calls, then build this sample's temporary matrix.
            process_vcf("%s.vcf.out" % idx, ref_coords, coverage, proportion, idx)
            make_temp_matrix("%s.filtered.vcf" % idx, matrix, idx)
コード例 #24
0
def main(directory,id,filter,processors,genes,cluster_method,blast,length,
         max_plog,min_hlog,f_plog,keep,filter_peps,filter_scaffolds,prefix,temp_dir,min_pep_length,debug):
    """Build a BSR matrix for every ``*.fasta`` genome found in ``directory``.

    Two modes are supported, selected by ``genes``:

    * ``"null"`` in ``genes``: genes are predicted with Prodigal, clustered
      with the method named in ``cluster_method`` and the resulting
      ``consensus.fasta`` is aligned against itself and every genome.
    * otherwise ``genes`` is the path of a pre-compiled ``.pep`` or ``.fasta``
      gene set that is aligned against itself and every genome.

    Work happens in a temporary directory; results are copied/moved back to
    the directory the program was started from and renamed with ``prefix``
    (or a timestamp when ``prefix`` contains ``"NULL"``).

    Args:
        directory: directory that is globbed for ``*.fasta`` genomes.
        id: clustering identity handed to USEARCH / VSEARCH / cd-hit-est.
        filter: forwarded unchanged to the BLAST helper functions.
        processors: processor count forwarded to the helpers.
        genes: ``"null"`` for de novo prediction, else a ``.pep``/``.fasta`` path.
        cluster_method: ``usearch``, ``vsearch`` or ``cd-hit``; ``NULL`` aborts.
        blast: alignment method, one of ``blastn``, ``tblastn`` or ``blat``.
        length, max_plog, min_hlog: forwarded unchanged to ``find_dups_dev``.
        f_plog: when it contains ``"T"``, ``filter_paralogs`` is run on the matrix.
        keep: ``"T"`` keeps the temporary directory, anything else removes it.
        filter_peps: ``"T"`` runs ``filter_seqs`` on the translated peptides.
        filter_scaffolds: ``"T"`` runs the module-level ``filter_scaffolds``
            helper on the combined gene predictions.
        prefix: prefix for the output files; ``NULL`` selects a timestamp.
        temp_dir: working directory to create; ``NULL`` uses ``tempfile.mkdtemp``.
        min_pep_length: forwarded to ``translate_genes``.
        debug: forwarded to ``new_loop``.

    The function calls ``sys.exit()`` when a required program is missing or
    the input is unusable.
    """
    start_dir = os.getcwd()
    ap=os.path.abspath("%s" % start_dir)
    dir_path=os.path.abspath("%s" % directory)
    # The ``filter_scaffolds`` argument (a "T"/"F" flag) shadows the module-level
    # helper of the same name; calling the argument would raise
    # "TypeError: 'str' object is not callable", so fetch the helper explicitly.
    scaffold_filter = globals().get("filter_scaffolds")
    logging.logPrint("Testing paths of dependencies")
    if blast=="blastn" or blast=="tblastn":
        ab = subprocess.call(['which', 'blastn'])
        if ab == 0:
            print("citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402")
        else:
            print("blastn isn't in your path, but needs to be!")
            sys.exit()
    if "NULL" in temp_dir:
        fastadir = tempfile.mkdtemp()
    else:
        fastadir = os.path.abspath("%s" % temp_dir)
        if os.path.exists('%s' % temp_dir):
            print("old run directory exists in your genomes directory (%s).  Delete and run again" % temp_dir)
            sys.exit()
        else:
            os.makedirs('%s' % temp_dir)
    # Hard-link every genome into the working directory under a ".new" name
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name=get_seq_name(infile)
        os.link("%s" % infile, "%s/%s.new" % (fastadir,name))
    if "null" in genes:
        rc = subprocess.call(['which', 'prodigal'])
        if rc != 0:
            print("prodigal is not in your path, but needs to be!")
            sys.exit()
        print("citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119")
        if "usearch" in cluster_method:
            print("citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461")
        elif "cd-hit" in cluster_method:
            print("citation: Li, W., Godzik, A. 2006. Cd-hit: a fast program for clustering and comparing large sets of protein or nuceltodie sequences. Bioinformatics 22(13):1658-1659")
        elif "vsearch" in cluster_method:
            print("citation: Rognes, T., Flouri, T., Nichols, B., Qunice, C., Mahe, Frederic. 2016. VSEARCH: a versatile open source tool for metagenomics. PeerJ Preprints. DOI: https://doi.org/10.7287/peerj.preprints.2409v1")
        if blast=="blat":
            ac = subprocess.call(['which', 'blat'])
            if ac == 0:
                print("citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool.  Genome Research 12:656-664")
            else:
                print("You have requested blat, but it is not in your PATH")
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes_dev(fastadir, processors)
        logging.logPrint("Prodigal done")
        # This function produces locus tags
        genbank_hits = process_genbank_files(dir_path)
        if genbank_hits is None or len(genbank_hits) == 0:
            os.system("cat *genes.seqs > all_gene_seqs.out")
            if filter_scaffolds == "T":
                scaffold_filter("all_gene_seqs.out")
                os.system("mv tmp.out all_gene_seqs.out")
        else:
            logging.logPrint("Converting genbank files")
            # First combine all of the prodigal files into one file
            os.system("cat *genes.seqs > all_gene_seqs.out")
            if filter_scaffolds == "T":
                scaffold_filter("all_gene_seqs.out")
                os.system("mv tmp.out all_gene_seqs.out")
            # This combines the locus tags with the Prodigal prediction
            os.system("cat *locus_tags.fasta all_gene_seqs.out > tmp.out")
            os.system("mv tmp.out all_gene_seqs.out")
            # The GenBank files also need to be converted to FASTA files
            for hit in genbank_hits:
                reduced_hit = hit.replace(".gbk","")
                SeqIO.convert("%s/%s" % (dir_path, hit), "genbank", "%s.fasta.new" % reduced_hit, "fasta")
        if "NULL" in cluster_method:
            print("Clustering chosen, but no method selected...exiting")
            sys.exit()
        elif "usearch" in cluster_method:
            ac = subprocess.call(['which', 'usearch'])
            if ac == 0:
                os.system("mkdir split_files")
                os.system("cp all_gene_seqs.out split_files/all_sorted.txt")
                os.chdir("split_files/")
                logging.logPrint("Splitting FASTA file for use with USEARCH")
                split_files("all_sorted.txt")
                logging.logPrint("clustering with USEARCH at an ID of %s" % id)
                run_usearch_dev(id,4)
                os.system("cat *.usearch.out > all_sorted.txt")
                os.system("mv all_sorted.txt %s" % fastadir)
                os.chdir("%s" % fastadir)
                uclust_cluster(id)
                logging.logPrint("USEARCH clustering finished")
            else:
                print("usearch must be in your path as usearch...exiting")
                sys.exit()
        elif "vsearch" in cluster_method:
            ac = subprocess.call(['which', 'vsearch'])
            if ac == 0:
                logging.logPrint("clustering with VSEARCH at an ID of %s, using %s processors" % (id,processors))
                run_vsearch(id, processors)
                os.system("mv vsearch.out consensus.fasta")
                logging.logPrint("VSEARCH clustering finished")
            else:
                print("vsearch must be in your path as vsearch...exiting")
                sys.exit()
        elif "cd-hit" in cluster_method:
            ac = subprocess.call(['which', 'cd-hit-est'])
            if ac == 0:
                logging.logPrint("clustering with cd-hit at an ID of %s, using %s processors" % (id,processors))
                subprocess.check_call("cd-hit-est -i all_gene_seqs.out -o consensus.fasta -M 0 -T %s -c %s > /dev/null 2>&1" % (processors, id), shell=True)
            else:
                print("cd-hit must be in your path as cd-hit-est...exiting")
                sys.exit()
        # The helper reports "False" when the headers need to be renamed
        dup_ids = test_duplicate_header_ids("consensus.fasta")
        if dup_ids == "False":
            print("duplicate headers identified, renaming..")
            rename_fasta_header("consensus.fasta", "tmp.txt")
            os.system("mv tmp.txt consensus.fasta")
        # Align the consensus against itself to obtain the reference scores
        if "tblastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            translate_genes("consensus.fasta","tmp.pep",min_pep_length)
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta", "consensus.pep", "tmp_blast.out", processors, filter)
        elif "blastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            blast_against_self_blastn("blastn", "consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, processors)
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores=parse_self_blast(open("self_blast.out", "U"))
        # NOTE(review): this branch stores the unsorted tmp_blast.out as
        # ref.scores while the pre-compiled branch stores self_blast.out -- confirm.
        os.system("cp tmp_blast.out ref.scores")
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(processors, "consensus.pep", filter)
        elif "blastn" == blast:
            blast_against_each_genome_blastn(processors, filter, "consensus.fasta")
        elif "blat" == blast:
            blat_against_each_genome("consensus.fasta",processors)
    else:
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files)==0:
            print("no usable reference genomes found!")
            sys.exit()
        gene_path=os.path.abspath("%s" % genes)
        dup_ids = test_duplicate_header_ids(gene_path)
        if dup_ids == "False":
            print("duplicate headers identified, exiting..")
            sys.exit()
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s" % (gene_path,fastadir))
        os.chdir("%s" % fastadir)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype prot > /dev/null 2>&1" % gene_path, shell=True)
            except Exception:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self_tblastn("blastp", gene_path, gene_path, "tmp_blast.out", processors, filter)
            # NOTE(review): tmp_blast.out is removed here, yet the shared
            # post-processing at the end of this branch sorts it again; unless
            # blast_against_each_genome_tblastn recreates the file that second
            # sort fails -- confirm and drop one of the two copies.
            subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores=parse_self_blast(open("self_blast.out", "U"))
            os.system("cp self_blast.out ref.scores")
            subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(processors, gene_path, filter)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                translate_genes(gene_path,"genes.pep",min_pep_length)
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep", "tmp_blast.out", processors, filter)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(processors, "genes.pep", filter)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("Database not formatted correctly...exiting")
                    sys.exit()
                try:
                    blast_against_self_blastn("blastn", gene_path, gene_path, "tmp_blast.out", filter, processors)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
                logging.logPrint("starting BLAST")
                try:
                    blast_against_each_genome_blastn(processors, filter, gene_path)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(gene_path,processors)
        else:
            print("input file format not supported")
            sys.exit()
        # Shared post-processing of the self-alignment for the pre-compiled branch
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        os.system("cp self_blast.out ref.scores")
        ref_scores=parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
    find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors)
    if blast=="blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report("false")
    get_unique_lines()
    curr_dir=os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(table_files)]
    nr_sorted=sorted(clusters)
    # First matrix column: a blank header cell followed by the sorted cluster ids
    table_list = [[" "] + list(nr_sorted)]
    logging.logPrint("starting matrix building")
    new_names,new_table = new_loop(files_and_temp_names, processors, clusters, debug)
    new_table_list = table_list+new_table
    logging.logPrint("matrix built")
    # Closed before the ``paste`` call below so every row is flushed to disk
    with open("ref.list", "a") as ref_list:
        ref_list.write("\n")
        for x in nr_sorted:
            ref_list.write("%s\n" % x)
    names_redux = [val for subl in new_names for val in subl]
    with open("names.txt", "w") as names_out:
        for x in names_redux:
            names_out.write("".join(x)+"\n")
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    # Best-effort copy: some of these files only exist for certain run modes,
    # so a failing ``cp`` is expected and deliberately ignored.
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call("cp dup_matrix.txt names.txt consensus.pep duplicate_ids.txt consensus.fasta paralog_ids.txt %s" % ap, shell=True, stderr=devnull)
    except Exception:
        pass
    # Output files are tagged with the prefix, or with a timestamp when no
    # prefix was supplied.
    import datetime
    timestamp = datetime.datetime.now()
    rename = str(timestamp.year), str(timestamp.month), str(timestamp.day), str(timestamp.hour), str(timestamp.minute), str(timestamp.second)
    if "NULL" in prefix:
        run_tag = "".join(rename)
    else:
        run_tag = prefix
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s/%s_paralogs_filtered_bsr_matrix_values.txt" % (start_dir,run_tag))
    os.chdir("%s" % ap)
    for produced in ("dup_matrix.txt", "names.txt", "duplicate_ids.txt", "paralog_ids.txt"):
        os.system("mv %s %s_%s" % (produced, run_tag, produced))
    os.system("mv bsr_matrix_values.txt %s_bsr_matrix.txt" % run_tag)
    for optional in ("consensus.fasta", "consensus.pep"):
        if os.path.isfile(optional):
            os.system("mv %s %s_%s" % (optional, run_tag, optional))
    # Record the invocation so the run can be reproduced
    recorded_flags = (
        ("-d", directory),
        ("-i", id),
        ("-f", filter),
        ("-p", processors),
        ("-g", genes),
        ("-c", cluster_method),
        ("-b", blast),
        ("-l", length),
        ("-m", max_plog),
        ("-n", min_hlog),
        ("-t", f_plog),
        ("-k", keep),
        ("-s", filter_peps),
        ("-e", filter_scaffolds),
        ("-x", prefix),
    )
    with open("%s_run_parameters.txt" % run_tag, "w") as outfile:
        for flag, value in recorded_flags:
            outfile.write("%s %s \\\n" % (flag, value))
        outfile.write("-z %s\n" % debug)
        outfile.write("temp data stored here if kept: %s" % fastadir)
    logging.logPrint("all Done")
    if "T" != keep:
        os.system("rm -rf %s" % fastadir)
    os.chdir("%s" % ap)
コード例 #25
0
ファイル: batch_job_runner.py プロジェクト: carze/vappio
 def updateBatchState(self):
     """Log the number of batch states held, then dump them to the states file.

     Returns whatever ``batch_state.dump`` returns.
     """
     entryCount = len(self.batchStates)
     logging.logPrint('Dumping states file with %d entries' % entryCount)
     return batch_state.dump(self.batchStatesFile, self.batchStates)
コード例 #26
0
 try:
     os.makedirs('%s/joined' % dir_path)
 except OSError, e:
  	if e.errno != errno.EEXIST:
         raise
 for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
     name=get_seq_name(infile)
     os.system("cp %s %s/joined/%s.new" % (infile,dir_path,name))
 if "null" in genes:
     try:
         if os.path.exists(usearch):
             pass
     except:
         raise TypeError("-u usearch flag must be set for use with prodigal")
         sys.exc_clear()
     logging.logPrint("predicting genes with Prodigal")
     predict_genes(dir_path, processors)
     logging.logPrint("Prodigal done")
     os.system("cat *genes.seqs > all_gene_seqs.out")
     uclust_sort(usearch)
     rename_fasta_header("tmp_sorted.txt", "all_sorted.txt")
     uclust_cluster(usearch, id)
     translate_consensus("consensus.fasta")
     filter_seqs("tmp.pep")
     subprocess.check_call("formatdb -i consensus.fasta -p F", shell=True)
     blast_against_self("consensus.fasta", "consensus.pep", "tmp_blast.out", filter, blast, penalty, reward)
     subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
     ref_scores=parse_self_blast(open("self_blast.out", "U"))
     subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
     os.system("rm *new_genes.*")
     logging.logPrint("starting BLAST")
コード例 #27
0
def main(matrix,tree,reference,directory,parameters,processors,coverage,proportion,keep,subsample,subnums,doc,tmp_dir,insertion_method,fudge,only_subs,model,trim):
    """Start a WG-FAST run: check dependencies, echo the options, make scratch.

    The visible body does three things:

    1. verifies with ``which`` that ``raxmlHPC-SSE3``, ``samtools`` and ``bwa``
       are on the PATH, printing the matching citations and calling
       ``sys.exit()`` as soon as one of them is missing;
    2. prints every argument next to its command-line flag;
    3. creates ``<cwd>/scratch``, tolerating a directory that already exists.

    All arguments are only echoed here, except ``reference`` and ``directory``
    which are additionally resolved to absolute paths.

    Written with ``print(...)`` and ``except ... as`` so the same source is
    valid on both Python 2.6+ and Python 3.
    """
    ref_path=os.path.abspath("%s" % reference)
    dir_path=os.path.abspath("%s" % directory)
    # check for binary dependencies
    log_isg.logPrint('testing the paths of all dependencies')
    ap=os.path.abspath("%s" % os.getcwd())
    # (binary, message when missing, citations printed once it is found)
    required_tools = (
        ('raxmlHPC-SSE3',
         "RAxML must be in your path as raxmlHPC-SSE3",
         ("*citation: 'Stamatakis, A. RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies. Bioinformatics (2014).'",
          "*citation: 'Berger SA, Krompass D, Stamatakis A. Performance, accuracy, and Web server for evolutionary placement of short sequence reads under maximum likelihood. Syst Biol. 2011;60(3):291-302'")),
        ('samtools',
         "samtools must be in your path",
         ("*citation: 'Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, Genome Project Data Processing S. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009;25(16):2078-9'",)),
        ('bwa',
         "bwa must be in your path",
         ("*citation: 'Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXivorg. 2013(arXiv:1303.3997 [q-bio.GN])'",)),
    )
    for binary, missing_message, citations in required_tools:
        if subprocess.call(['which', binary]) != 0:
            print(missing_message)
            sys.exit()
        for citation in citations:
            print(citation)
    print("Patristic distances calculated with DendroPy")
    print("*citation: 'Sukumaran J, Holder MT. DendroPy: a Python library for phylogenetic computing. Bioinformatics. 2010;26(12):1569-71. Epub 2010/04/28. doi: 10.1093/bioinformatics/btq228. PubMed PMID: 20421198'")
    print("Also uses GATK for variant calling")
    print("*citation: 'McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome research. 2010;20(9):1297-303'")
    print("Uses trimmomatic for read trimming")
    print("*citation: Bolger A.M., Lohse M., Usadel B. Trimmomatic: A flexible trimmer for Illumina Sequence Data.  Bioinformatics. 2014.  Doi:10.1093/bioinformatics/btu170")
    print("Uses BioPython for FASTA parsing")
    # The first author's surname had been mangled to "C**k" in this citation
    print("*citation :Cock PJ, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, de Hoon MJ. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009;25(11):1422-3")
    print("")
    # done checking for dependencies
    log_isg.logPrint('WG-FAST pipeline starting')
    log_isg.logPrint("WG-FAST was invoked with the following parameters:")
    invocation = (
        ("-m", matrix),
        ("-t", tree),
        ("-r", reference),
        ("-d", directory),
        ("-x", parameters),
        ("-p", processors),
        ("-c", coverage),
        ("-o", proportion),
        ("-k", keep),
        ("-s", subsample),
        ("-n", subnums),
        ("-g", doc),
        ("-e", tmp_dir),
        ("-z", insertion_method),
        ("-f", fudge),
        ("-y", only_subs),
        ("-j", model),
        ("-i", trim),
    )
    # Every line but the last ends with a shell line-continuation backslash
    for flag, value in invocation[:-1]:
        print("%s %s \\" % (flag, value))
    print("%s %s" % invocation[-1])
    try:
        os.makedirs('%s/scratch' % ap)
    except OSError as e:
        # An already existing scratch directory is fine; anything else is not
        if e.errno != errno.EEXIST:
            raise
コード例 #28
0
ファイル: releaseCutS2.py プロジェクト: carze/vappio


    try:
        convertChannel.receive()
        vmWareDir = 'clovr-vmware.%s' % options('general.version')
        runSystemEx('mkdir -p ' + vmWareDir, log=True)
        runSystemEx('mv VMware_conversion/shared/converted_img.vmdk %s' % os.path.join(vmWareDir, 'clovr.9-04.x86-64.%s.vmdk' % options('general.version')))
        runSystemEx('mkdir -p %s %s' % (os.path.join(vmWareDir, 'keys'),
                                        os.path.join(vmWareDir, 'user_data')), log=True)
        runSystemEx('cp -rv /usr/local/projects/clovr/shared ' + vmWareDir, log=True)
        fout = open(os.path.join(vmWareDir, 'start_clovr.vmx'), 'w')
        clovrConf = config.configFromMap(dict(version=options('general.version')))
        for line in open('/usr/local/projects/clovr/start_clovr.vmx'):
            fout.write(config.replaceStr(line, clovrConf))
    except Exception, err:
        errorPrint('Converting image failed.  Error message:')
        errorPrint(str(err))

    try:
      amiId = bundleChannel.receive()
      logPrint('AMI: ' + amiId)
    except Exception, err:
      amiId = None
      errorPrint('Bundling AMI failed for some reason.  Error message:')
      errorPrint(str(err))


# Script entry point: the sequence returned by buildConfigN(OPTIONS) is
# unpacked into main()'s positional arguments.
if __name__ == '__main__':
    main(*buildConfigN(OPTIONS))
コード例 #29
0
ファイル: ls_bsr.py プロジェクト: JoshGutman/LS-BSR
def main(directory, id, filter, processors, genes, cluster_method, blast,
         length, max_plog, min_hlog, f_plog, keep, filter_peps,
         filter_scaffolds, prefix, temp_dir, debug):
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    logging.logPrint("Testing paths of dependencies")
    if blast == "blastn" or blast == "tblastn":
        ab = subprocess.call(['which', 'blastn'])
        if ab == 0:
            print "citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402"
        else:
            print "blastn isn't in your path, but needs to be!"
            sys.exit()
    if "NULL" in temp_dir:
        fastadir = tempfile.mkdtemp()
    else:
        fastadir = os.path.abspath("%s" % temp_dir)
        if os.path.exists('%s' % temp_dir):
            print "old run directory exists in your genomes directory (%s).  Delete and run again" % temp_dir
            sys.exit()
        else:
            os.makedirs('%s' % temp_dir)
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name = get_seq_name(infile)
        os.link("%s" % infile, "%s/%s.new" % (fastadir, name))
    if "null" in genes:
        rc = subprocess.call(['which', 'prodigal'])
        if rc == 0:
            pass
        else:
            print "prodigal is not in your path, but needs to be!"
            sys.exit()
        print "citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119"
        if "usearch" in cluster_method:
            print "citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461"
        elif "cd-hit" in cluster_method:
            print "citation: Li, W., Godzik, A. 2006. Cd-hit: a fast program for clustering and comparing large sets of protein or nuceltodie sequences. Bioinformatics 22(13):1658-1659"
        elif "vsearch" in cluster_method:
            print "citation: Rognes, T., Flouri, T., Nichols, B., Qunice, C., Mahe, Frederic. 2016. VSEARCH: a versatile open source tool for metagenomics. PeerJ Preprints. DOI: https://doi.org/10.7287/peerj.preprints.2409v1"
        if blast == "blat":
            ac = subprocess.call(['which', 'blat'])
            if ac == 0:
                print "citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool.  Genome Research 12:656-664"
            else:
                print "You have requested blat, but it is not in your PATH"
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(fastadir, processors)
        logging.logPrint("Prodigal done")
        """This function produces locus tags"""
        genbank_hits = process_genbank_files(dir_path)
        if genbank_hits == None or len(genbank_hits) == 0:
            os.system("cat *genes.seqs > all_gene_seqs.out")
            if filter_scaffolds == "T":
                filter_scaffolds("all_gene_seqs.out")
                os.system("mv tmp.out all_gene_seqs.out")
            else:
                pass
        else:
            logging.logPrint("Converting genbank files")
            """First combine all of the prodigal files into one file"""
            os.system("cat *genes.seqs > all_gene_seqs.out")
            if filter_scaffolds == "T":
                filter_scaffolds("all_gene_seqs.out")
                os.system("mv tmp.out all_gene_seqs.out")
            else:
                pass
            """This combines the locus tags with the Prodigal prediction"""
            os.system("cat *locus_tags.fasta all_gene_seqs.out > tmp.out")
            os.system("mv tmp.out all_gene_seqs.out")
            """I also need to convert the GenBank file to a FASTA file"""
            for hit in genbank_hits:
                reduced_hit = hit.replace(".gbk", "")
                SeqIO.convert("%s/%s" % (dir_path, hit), "genbank",
                              "%s.fasta.new" % reduced_hit, "fasta")
        if "NULL" in cluster_method:
            print "Clustering chosen, but no method selected...exiting"
            sys.exit()
        elif "usearch" in cluster_method:
            ac = subprocess.call(['which', 'usearch'])
            if ac == 0:
                os.system("mkdir split_files")
                os.system("cp all_gene_seqs.out split_files/all_sorted.txt")
                os.chdir("split_files/")
                logging.logPrint("Splitting FASTA file for use with USEARCH")
                split_files("all_sorted.txt")
                logging.logPrint("clustering with USEARCH at an ID of %s" % id)
                run_usearch(id)
                os.system("cat *.usearch.out > all_sorted.txt")
                os.system("mv all_sorted.txt %s" % fastadir)
                os.chdir("%s" % fastadir)
                uclust_cluster(id)
                logging.logPrint("USEARCH clustering finished")
            else:
                print "usearch must be in your path as usearch...exiting"
                sys.exit()
        elif "vsearch" in cluster_method:
            ac = subprocess.call(['which', 'vsearch'])
            if ac == 0:
                logging.logPrint(
                    "clustering with VSEARCH at an ID of %s, using %s processors"
                    % (id, processors))
                run_vsearch(id, processors)
                os.system("mv vsearch.out consensus.fasta")
                logging.logPrint("VSEARCH clustering finished")
            else:
                print "vsearch must be in your path as vsearch...exiting"
                sys.exit()
        elif "cd-hit" in cluster_method:
            ac = subprocess.call(['which', 'cd-hit-est'])
            if ac == 0:
                logging.logPrint(
                    "clustering with cd-hit at an ID of %s, using %s processors"
                    % (id, processors))
                subprocess.check_call(
                    "cd-hit-est -i all_gene_seqs.out -o consensus.fasta -M 0 -T %s -c %s > /dev/null 2>&1"
                    % (processors, id),
                    shell=True)
            else:
                print "cd-hit must be in your path as cd-hit-est...exiting"
                sys.exit()
        """need to check for dups here"""
        dup_ids = test_duplicate_header_ids("consensus.fasta")
        if dup_ids == "True":
            pass
        elif dup_ids == "False":
            print "duplicate headers identified, renaming.."
            rename_fasta_header("consensus.fasta", "tmp.txt")
            os.system("mv tmp.txt consensus.fasta")
        else:
            pass
        if "tblastn" == blast:
            subprocess.check_call(
                "makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1",
                shell=True)
            translate_consensus("consensus.fasta")
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta",
                                       "consensus.pep", "tmp_blast.out",
                                       processors, filter)
        elif "blastn" == blast:
            subprocess.check_call(
                "makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1",
                shell=True)
            blast_against_self_blastn("blastn", "consensus.fasta",
                                      "consensus.fasta", "tmp_blast.out",
                                      filter, processors)
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta",
                              "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        else:
            pass
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out",
                              shell=True)
        ref_scores = parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(dir_path, processors,
                                              "consensus.pep", filter)
        elif "blastn" == blast:
            blast_against_each_genome_blastn(dir_path, processors, filter,
                                             "consensus.fasta")
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta", processors)
        else:
            pass
    else:
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files) == 0:
            print "no usable reference genomes found!"
            sys.exit()
        else:
            pass
        gene_path = os.path.abspath("%s" % genes)
        dup_ids = test_duplicate_header_ids(gene_path)
        if dup_ids == "True":
            pass
        elif dup_ids == "False":
            print "duplicate headers identified, exiting.."
            sys.exit()
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s" % (gene_path, fastadir))
        os.chdir("%s" % fastadir)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call(
                    "makeblastdb -in %s -dbtype prot > /dev/null 2>&1" %
                    gene_path,
                    shell=True)
            except:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self_tblastn("blastp", gene_path, gene_path,
                                       "tmp_blast.out", processors, filter)
            subprocess.check_call(
                "sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores = parse_self_blast(open("self_blast.out", "U"))
            subprocess.check_call("rm tmp_blast.out self_blast.out",
                                  shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(dir_path, processors, gene_path,
                                              filter)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                translate_genes(gene_path)
                try:
                    subprocess.check_call(
                        "makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" %
                        gene_path,
                        shell=True)
                except:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep",
                                           "tmp_blast.out", processors, filter)
                subprocess.check_call(
                    "sort -u -k 1,1 tmp_blast.out > self_blast.out",
                    shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out",
                                      shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(dir_path, processors,
                                                  "genes.pep", filter)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call(
                        "makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" %
                        gene_path,
                        shell=True)
                except:
                    logging.logPrint(
                        "Database not formatted correctly...exiting")
                    sys.exit()
                try:
                    blast_against_self_blastn("blastn", gene_path, gene_path,
                                              "tmp_blast.out", filter,
                                              processors)
                except:
                    print "problem with blastn, exiting"
                    sys.exit()
                subprocess.check_call(
                    "sort -u -k 1,1 tmp_blast.out > self_blast.out",
                    shell=True)
                os.system("cp self_blast.out tmp.out")
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out",
                                      shell=True)
                logging.logPrint("starting BLAST")
                try:
                    blast_against_each_genome_blastn(dir_path, processors,
                                                     filter, gene_path)
                except:
                    print "problem with blastn, exiting"
                    sys.exit()
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out",
                                  processors)
                subprocess.check_call(
                    "sort -u -k 1,1 tmp_blast.out > self_blast.out",
                    shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out",
                                      shell=True)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path, gene_path, processors)
            else:
                pass
        else:
            print "input file format not supported"
            sys.exit()
    find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors)
    if blast == "blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report("false")
    get_unique_lines()
    curr_dir = os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(table_files)]
    names = []
    table_list = []
    nr_sorted = sorted(clusters)
    centroid_list = []
    centroid_list.append(" ")
    for x in nr_sorted:
        centroid_list.append(x)
    table_list.append(centroid_list)
    logging.logPrint("starting matrix building")
    new_names, new_table = new_loop(files_and_temp_names, processors, clusters,
                                    debug)
    new_table_list = table_list + new_table
    logging.logPrint("matrix built")
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    names_out = open("names.txt", "w")
    names_redux = [val for subl in new_names for val in subl]
    for x in names_redux:
        print >> names_out, "".join(x)
    names_out.close()
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call(
        "paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" %
        start_dir,
        shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir,
                        "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    else:
        pass
    try:
        subprocess.check_call(
            "cp dup_matrix.txt names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s"
            % ap,
            shell=True,
            stderr=open(os.devnull, 'w'))
    except:
        sys.exc_clear()
    """new code to rename files according to a prefix"""
    import datetime
    timestamp = datetime.datetime.now()
    rename = str(timestamp.year), str(timestamp.month), str(
        timestamp.day), str(timestamp.hour), str(timestamp.minute), str(
            timestamp.second)
    os.chdir("%s" % ap)
    if "NULL" in prefix:
        os.system("mv dup_matrix.txt %s_dup_matrix.txt" % "".join(rename))
        os.system("mv names.txt %s_names.txt" % "".join(rename))
        os.system("mv duplicate_ids.txt %s_duplicate_ids.txt" %
                  "".join(rename))
        os.system("mv paralog_ids.txt %s_paralog_ids.txt" % "".join(rename))
        os.system("mv bsr_matrix_values.txt %s_bsr_matrix.txt" %
                  "".join(rename))
        if os.path.isfile("consensus.fasta"):
            os.system("mv consensus.fasta %s_consensus.fasta" %
                      "".join(rename))
        if os.path.isfile("consensus.pep"):
            os.system("mv consensus.pep %s_consensus.pep" % "".join(rename))
    else:
        os.system("mv dup_matrix.txt %s_dup_matrix.txt" % prefix)
        os.system("mv names.txt %s_names.txt" % prefix)
        os.system("mv duplicate_ids.txt %s_duplicate_ids.txt" % prefix)
        os.system("mv paralog_ids.txt %s_paralog_ids.txt" % prefix)
        os.system("mv bsr_matrix_values.txt %s_bsr_matrix.txt" % prefix)
        if os.path.isfile("consensus.fasta"):
            os.system("mv consensus.fasta %s_consensus.fasta" % prefix)
        if os.path.isfile("consensus.pep"):
            os.system("mv consensus.pep %s_consensus.pep" % prefix)
    if "NULL" in prefix:
        outfile = open("%s_run_parameters.txt" % "".join(rename), "w")
    else:
        outfile = open("%s_run_parameters.txt" % prefix, "w")
    print >> outfile, "-d %s \\" % directory
    print >> outfile, "-i %s \\" % id
    print >> outfile, "-f %s \\" % filter
    print >> outfile, "-p %s \\" % processors
    print >> outfile, "-g %s \\" % genes
    print >> outfile, "-c %s \\" % cluster_method
    print >> outfile, "-b %s \\" % blast
    print >> outfile, "-l %s \\" % length
    print >> outfile, "-m %s \\" % max_plog
    print >> outfile, "-n %s \\" % min_hlog
    print >> outfile, "-t %s \\" % f_plog
    print >> outfile, "-k %s \\" % keep
    print >> outfile, "-s %s \\" % filter_peps
    print >> outfile, "-e %s \\" % filter_scaffolds
    print >> outfile, "-x %s \\" % prefix
    print >> outfile, "-z %s" % debug
    print >> outfile, "temp data stored here if kept: %s" % fastadir
    outfile.close()
    logging.logPrint("all Done")
    if "T" == keep:
        pass
    else:
        os.system("rm -rf %s" % fastadir)
    os.chdir("%s" % ap)
コード例 #30
0
ファイル: releaseCutS2.py プロジェクト: carze/vappio
        runSystemEx('mkdir -p ' + vmWareDir, log=True)
        runSystemEx(
            'mv VMware_conversion/shared/converted_img.vmdk %s' % os.path.join(
                vmWareDir,
                'clovr.9-04.x86-64.%s.vmdk' % options('general.version')))
        runSystemEx('mkdir -p %s %s' % (os.path.join(
            vmWareDir, 'keys'), os.path.join(vmWareDir, 'user_data')),
                    log=True)
        runSystemEx('cp -rv /usr/local/projects/clovr/shared ' + vmWareDir,
                    log=True)
        fout = open(os.path.join(vmWareDir, 'start_clovr.vmx'), 'w')
        clovrConf = config.configFromMap(
            dict(version=options('general.version')))
        for line in open('/usr/local/projects/clovr/start_clovr.vmx'):
            fout.write(config.replaceStr(line, clovrConf))
    except Exception, err:
        errorPrint('Converting image failed.  Error message:')
        errorPrint(str(err))

    try:
        amiId = bundleChannel.receive()
        logPrint('AMI: ' + amiId)
    except Exception, err:
        amiId = None
        errorPrint('Bundling AMI failed for some reason.  Error message:')
        errorPrint(str(err))


# Script entry point: when run directly (not imported), unpack whatever
# buildConfigN(OPTIONS) returns and pass it to main() as positional arguments.
if __name__ == '__main__':
    main(*buildConfigN(OPTIONS))
コード例 #31
0
def main(matrix, tree, reference, directory, parameters, processors, coverage,
         proportion, keep, subsample, subnums, doc, tmp_dir, insertion_method,
         fudge, only_subs, model, trim, gatk_method):
    """Begin a WG-FAST run: check dependencies, echo options, make scratch dir.

    The steps performed here are:

    1. Resolve ``reference`` and ``directory`` to absolute paths.
    2. Confirm with ``which`` that ``raxmlHPC-SSE3``, ``samtools`` and ``bwa``
       are on the PATH.  The first missing tool prints an explanatory message
       and terminates the program through ``sys.exit()``.
    3. Print the citations of the tools and libraries that are used.
    4. Print every option the program was invoked with, one per line, in the
       form of a continued command line (``-m <matrix> \\`` ...).
    5. Create ``<current working directory>/scratch``; an already existing
       directory is accepted, any other ``OSError`` is re-raised.

    All parameters are only formatted with ``%s`` here; ``reference`` and
    ``directory`` are additionally turned into absolute paths.
    """
    ref_path = os.path.abspath("%s" % reference)
    dir_path = os.path.abspath("%s" % directory)
    log_isg.logPrint('testing the paths of all dependencies')
    ap = os.path.abspath("%s" % os.getcwd())
    # (binary looked up with `which`, message if it is missing, citations
    # printed once the binary has been found)
    required_binaries = (
        ('raxmlHPC-SSE3',
         "RAxML must be in your path as raxmlHPC-SSE3",
         ("*citation: 'Stamatakis, A. RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies. Bioinformatics (2014).'",
          "*citation: 'Berger SA, Krompass D, Stamatakis A. Performance, accuracy, and Web server for evolutionary placement of short sequence reads under maximum likelihood. Syst Biol. 2011;60(3):291-302'")),
        ('samtools',
         "samtools must be in your path",
         ("*citation: 'Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, Genome Project Data Processing S. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009;25(16):2078-9'",)),
        ('bwa',
         "bwa must be in your path",
         ("*citation: 'Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXivorg. 2013(arXiv:1303.3997 [q-bio.GN])'",)),
    )
    for binary, missing_message, citations in required_binaries:
        # `which` returns 0 only when the executable is found on the PATH
        if subprocess.call(['which', binary]) != 0:
            print(missing_message)
            sys.exit()
        for citation in citations:
            print(citation)
    print("Patristic distances calculated with DendroPy")
    print("*citation: 'Sukumaran J, Holder MT. DendroPy: a Python library for phylogenetic computing. Bioinformatics. 2010;26(12):1569-71. Epub 2010/04/28. doi: 10.1093/bioinformatics/btq228. PubMed PMID: 20421198'")
    print("Also uses GATK for variant calling")
    print("*citation: 'McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome research. 2010;20(9):1297-303'")
    print("Uses trimmomatic for read trimming")
    print("*citation: Bolger A.M., Lohse M., Usadel B. Trimmomatic: A flexible trimmer for Illumina Sequence Data.  Bioinformatics. 2014.  Doi:10.1093/bioinformatics/btu170")
    print("Uses BioPython for FASTA parsing")
    # first author restored: the name had been mangled to "C**k" in this string
    print("*citation :Cock PJ, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, de Hoon MJ. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009;25(11):1422-3")
    print("")
    # done checking for dependencies
    log_isg.logPrint('WG-FAST pipeline starting')
    log_isg.logPrint("WG-FAST was invoked with the following parameters:")
    print("-m %s \\" % matrix)
    print("-t %s \\" % tree)
    print("-r %s \\" % reference)
    print("-d %s \\" % directory)
    print("-x %s \\" % parameters)
    print("-p %s \\" % processors)
    print("-c %s \\" % coverage)
    print("-o %s \\" % proportion)
    print("-k %s \\" % keep)
    print("-s %s \\" % subsample)
    print("-n %s \\" % subnums)
    print("-g %s \\" % doc)
    print("-e %s \\" % tmp_dir)
    print("-z %s \\" % insertion_method)
    print("-f %s \\" % fudge)
    print("-y %s \\" % only_subs)
    print("-j %s \\" % model)
    print("-i %s \\" % trim)
    print("-q %s" % gatk_method)
    try:
        os.makedirs('%s/scratch' % ap)
    except OSError as e:
        # a scratch directory left over from an earlier run is fine
        if e.errno != errno.EEXIST:
            raise
コード例 #32
0
ファイル: ls_bsr.py プロジェクト: pryratess/LS-BSR
def main(
    directory,
    id,
    filter,
    processors,
    genes,
    usearch,
    vsearch,
    blast,
    penalty,
    reward,
    length,
    max_plog,
    min_hlog,
    f_plog,
    keep,
    filter_peps,
    debug,
):
    """Run the LS-BSR pipeline from genome FASTA files to the BSR matrix.

    A working directory ``<directory>/joined`` is created (the program exits
    if it cannot be created, e.g. because one is left from an old run) and
    every ``*.fasta`` file of ``directory`` is hard-linked into it.

    Two modes exist, selected by ``genes``:

    * ``"null"`` contained in ``genes``: genes are predicted with Prodigal,
      clustered with USEARCH or VSEARCH into ``consensus.fasta`` and compared
      against themselves and against every genome with the ``blast`` method.
    * otherwise ``genes`` is the path of a ``.pep`` file (always compared with
      blastp/tblastn) or a ``.fasta`` file (compared with the ``blast``
      method).  Any other extension ends the program.

    Afterwards the per-genome results are merged into a matrix, each value is
    divided by the self-comparison score (``divide_values``) and the result is
    written to ``bsr_matrix_values.txt`` in the directory the program was
    started from.

    Parameters:
        directory: directory holding the genome ``*.fasta`` files.
        id: clustering identity handed to the USEARCH/VSEARCH helpers.
        filter: passed through to the blastn helper functions.
        processors: passed through to the helpers that run external tools.
        genes: ``"null"`` for de novo prediction, else a ``.pep``/``.fasta``
            path.
        usearch, vsearch: paths of the clustering programs; whichever exists
            is used, USEARCH taking precedence.
        blast: ``"tblastn"``, ``"blastn"`` or ``"blat"``.
        penalty, reward: passed through to the blastn helper functions.
        length, max_plog, min_hlog: passed to ``find_dups`` (de novo mode
            only).
        f_plog: paralogs are filtered when it contains ``"T"``.
        keep: ``"T"`` keeps the ``joined`` working directory.
        filter_peps: ``"T"`` runs ``filter_seqs`` on the translated consensus.
        debug: passed through to ``new_loop``.

    The function returns nothing; missing programs, missing genomes and
    unsupported inputs terminate the program through ``sys.exit()``.
    """
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    logging.logPrint("Testing paths of dependencies")
    if blast == "blastn" or blast == "tblastn":
        if subprocess.call(["which", "blastn"]) != 0:
            print("blastn isn't in your path, but needs to be!")
            sys.exit()
        print("citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402")
    try:
        os.makedirs("%s/joined" % dir_path)
    except OSError:
        print("old run directory exists in your genomes directory (%s/joined).  Delete and run again" % dir_path)
        sys.exit()
    for infile in glob.glob(os.path.join(dir_path, "*.fasta")):
        name = get_seq_name(infile)
        os.link("%s" % infile, "%s/joined/%s.new" % (dir_path, name))
    if "null" in genes:
        # ---- de novo mode: predict genes, cluster them, then align ----
        if subprocess.call(["which", "prodigal"]) != 0:
            print("prodigal is not in your path, but needs to be!")
            sys.exit()
        print("citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119")
        if os.path.exists(usearch):
            print("citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461")
        if blast == "blat":
            if subprocess.call(["which", "blat"]) != 0:
                print("You have requested blat, but it is not in your PATH")
                sys.exit()
            print("citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool.  Genome Research 12:656-664")
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(dir_path, processors)
        logging.logPrint("Prodigal done")
        os.system("cat *genes.seqs > all_gene_seqs.out")
        filter_scaffolds("all_gene_seqs.out")
        os.system("mv tmp.out all_gene_seqs.out")
        rename_fasta_header("all_gene_seqs.out", "all_sorted.txt")
        if os.path.exists(usearch) and os.path.exists(vsearch):
            print("usearch and vsearch both selected, only usearch will be used")
        if os.path.exists(usearch):
            os.system("mkdir split_files")
            os.system("cp all_sorted.txt split_files/")
            os.system("rm all_sorted.txt")
            os.chdir("split_files/")
            os.system("split -l 200000 all_sorted.txt")
            logging.logPrint("clustering with USEARCH at an ID of %s" % id)
            run_usearch(usearch, id)
            os.system("cat *.usearch.out > all_sorted.txt")
            os.system("mv all_sorted.txt %s/joined" % dir_path)
            os.chdir("%s/joined" % dir_path)
            uclust_cluster(usearch, id)
            logging.logPrint("USEARCH clustering finished")
        elif os.path.exists(vsearch):
            logging.logPrint("clustering with VSEARCH at an ID of %s" % id)
            run_vsearch(vsearch, id, processors)
            os.system("mv vsearch.out consensus.fasta")
            logging.logPrint("VSEARCH clustering finished")
        else:
            print("neither usearch or vsearch selected for use with Prodigal!, exiting.")
            sys.exit()
        # self-comparison of the consensus: yields the reference scores
        if "tblastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            translate_consensus("consensus.fasta")
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta", "consensus.pep", "tmp_blast.out", processors)
        elif "blastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            blast_against_self_blastn(
                "blastn", "consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, penalty, reward, processors
            )
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        else:
            # No self-comparison would be produced, so the sort below and the
            # later use of `clusters` could only fail; stop with a clear
            # message instead.
            print("unsupported alignment method (%s), choose from tblastn, blastn or blat" % blast)
            sys.exit()
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores = parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        # comparison of the consensus against every genome
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(dir_path, processors, "consensus.pep")
        elif "blastn" == blast:
            blast_against_each_genome_blastn(dir_path, processors, filter, "consensus.fasta", penalty, reward)
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta", processors)
        find_dups(ref_scores, length, max_plog, min_hlog)
    else:
        # ---- pre-compiled mode: the caller supplies the gene set ----
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if not files:
            print("no usable reference genomes found!")
            sys.exit()
        gene_path = os.path.abspath("%s" % genes)
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s/joined/" % (gene_path, dir_path))
        os.chdir("%s/joined" % dir_path)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype prot > /dev/null 2>&1" % gene_path, shell=True)
            except:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self_tblastn("blastp", gene_path, gene_path, "tmp_blast.out", processors)
            subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores = parse_self_blast(open("self_blast.out", "U"))
            subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(dir_path, processors, gene_path)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                translate_genes(gene_path)
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep", "tmp_blast.out", processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(dir_path, processors, "genes.pep")
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except:
                    logging.logPrint("Database not formatted correctly...exiting")
                    sys.exit()
                try:
                    blast_against_self_blastn(
                        "blastn", gene_path, gene_path, "tmp_blast.out", filter, penalty, reward, processors
                    )
                except:
                    print("problem with blastn, exiting")
                    sys.exit()
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_blastn(dir_path, processors, filter, gene_path, penalty, reward)
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path, gene_path, processors)
            else:
                # `ref_scores` would never be assigned and the matrix step
                # further down would fail with a NameError.
                print("unsupported alignment method (%s), choose from tblastn, blastn or blat" % blast)
                sys.exit()
        else:
            print("input file format not supported")
            sys.exit()
    if blast == "blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    # ---- merge the per-genome results into the BSR matrix ----
    parse_blast_report("false")
    get_unique_lines()
    curr_dir = os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(table_files)]
    nr_sorted = sorted(clusters)
    # first entry of the table: a blank corner cell followed by the sorted ids
    table_list = [[" "] + nr_sorted]
    logging.logPrint("starting matrix building")
    new_names, new_table = new_loop(files_and_temp_names, processors, clusters, debug)
    new_table_list = table_list + new_table
    logging.logPrint("matrix built")
    # One handle, closed before `paste` reads the file below; the leading
    # blank line lines up with the header row of BSR_matrix_values.txt.
    with open("ref.list", "a") as ref_list:
        ref_list.write("\n")
        for x in nr_sorted:
            ref_list.write("%s\n" % x)
    with open("names.txt", "w") as names_out:
        names_redux = [val for subl in new_names for val in subl]
        for x in names_redux:
            names_out.write("".join(x) + "\n")
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    # Best-effort copy: not every listed file exists in every mode, so a
    # failing `cp` is expected and deliberately ignored.
    try:
        with open(os.devnull, "w") as devnull:
            subprocess.check_call(
                "cp names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s" % start_dir,
                shell=True,
                stderr=devnull,
            )
    except (subprocess.CalledProcessError, EnvironmentError):
        pass
    logging.logPrint("all Done")
    os.chdir("%s" % dir_path)
    if "T" != keep:
        os.system("rm -rf joined")
    os.chdir("%s" % ap)
コード例 #33
0
     write_reduced_matrix(matrix)
 ref_name=get_seq_name(reference)
 if only_subs == "T":
     pass
 else:
     fileSets=read_file_sets(dir_path)
     ref_coords = get_all_snps(matrix)
     run_loop(fileSets, dir_path,"%s/scratch/reference.fasta" % ap , processors, GATK_PATH, ref_coords, coverage, proportion, matrix, ap,doc,tmp_dir,ADD_GROUPS,TRIM_PATH,WGFAST_PATH,trim)
 """will subsample based on the number of SNPs reported by the following function"""
 used_snps=find_used_snps()
 #Outnames is required for the sub-sampling routine, even with -y T
 outnames=grab_names()
 for name in outnames:
     for k,v in used_snps.iteritems():
         if name==k:
             log_isg.logPrint("number of callable positions in genome %s = %s" % (k,v))
 if only_subs == "T":
     try:
         os.system("rm RAxML*")
     except:
         pass
     pass
 else:
     create_merged_vcf()
     subprocess.check_call("paste temp.matrix merged.vcf > combined.matrix", shell=True)
     matrix_to_fasta("combined.matrix", "all.fasta")
     os.system("mv combined.matrix %s/nasp_matrix.with_unknowns.txt" % ap)
     """this fixes the SNP output to conform with RAxML"""
     os.system("sed 's/://g' all.fasta | sed 's/,//g' > out.fasta")
     if insertion_method == "ML":
         suffix = run_raxml("out.fasta", tree,"out.classification_results.txt", "V", parameters, model, "out")
コード例 #34
0
     pass
 else:
     fileSets = read_file_sets(dir_path)
     ref_coords = get_all_snps(matrix)
     run_loop(fileSets, dir_path, "%s/scratch/reference.fasta" % ap,
              processors, GATK_PATH, ref_coords, coverage, proportion,
              matrix, ap, doc, tmp_dir, ADD_GROUPS, TRIM_PATH, WGFAST_PATH,
              trim, gatk_method)
 """will subsample based on the number of SNPs reported by the following function"""
 used_snps = find_used_snps()
 #Outnames is required for the sub-sampling routine, even with -y T
 outnames = grab_names()
 for name in outnames:
     for k, v in used_snps.iteritems():
         if name == k:
             log_isg.logPrint(
                 "number of callable positions in genome %s = %s" % (k, v))
 if only_subs == "T":
     try:
         os.system("rm RAxML*")
     except:
         pass
     pass
 else:
     create_merged_vcf()
     subprocess.check_call("paste temp.matrix merged.vcf > combined.matrix",
                           shell=True)
     matrix_to_fasta("combined.matrix", "all.fasta")
     os.system("mv combined.matrix %s/nasp_matrix.with_unknowns.txt" % ap)
     """this fixes the SNP output to conform with RAxML"""
     os.system("sed 's/://g' all.fasta | sed 's/,//g' > out.fasta")
     if insertion_method == "ML":