Exemple #1
0
def classify(infile, outfile, num_procs, batch_size=50000):
    ''' Run the classification

    Reads the sequences of infile with readfq, scores them with a
    plasclass.plasclass classifier built with num_procs, and writes one
    "<sequence name><TAB><score>" line per sequence to outfile.

    Sequences are handed to the classifier in batches of batch_size so that
    only one batch of sequences is held in memory at a time.

    infile: path of the sequence file to classify
    outfile: path of the tab-separated output file (overwritten)
    num_procs: passed through to plasclass.plasclass
    batch_size: number of sequences classified per call (default 50000)
    '''
    c = plasclass.plasclass(num_procs)
    seq_names = []
    seqs = []

    def write_batch(o):
        # Nothing pending (e.g. the input held an exact multiple of
        # batch_size sequences, or no sequences at all): skip the
        # classifier call instead of handing it an empty list.
        if not seqs:
            return
        probs = c.classify(seqs)
        for seq_name, p in zip(seq_names, probs):
            o.write(seq_name + '\t' + str(p) + '\n')
        # Empty the buffers in place so the enclosing lists are reused.
        del seq_names[:]
        del seqs[:]

    # Both handles are managed by the with-statement so they are closed even
    # if reading, classification or writing raises.
    with open(infile) as fp, open(outfile, 'w') as o:
        for name, seq, _ in readfq(fp):
            seq_names.append(name)
            seqs.append(seq)
            if len(seqs) >= batch_size:
                write_batch(o)

        # last bunch of sequences:
        write_batch(o)
def parse_lines(fastg, ofile):
    ''' Write every other record of a fastg file to ofile as FASTA

    Records are read with readfq and counted from 1; even-numbered records
    are skipped (presumably the reverse-complement twin of the preceding
    record - TODO confirm against the fastg producer).

    The FASTA header is derived from the record name: the last character is
    dropped, ':' and ',' are replaced by spaces, and only the text before the
    first space is kept.

    fastg: path of the fastg file to read
    ofile: open, writable file-like object that receives the FASTA records
    '''
    # Context manager guarantees the input handle is closed on any exit path.
    with open(fastg, 'r') as fp:
        for count, (name, seq, _) in enumerate(readfq(fp), 1):
            if count % 2 == 0:
                continue
            name = re.sub('[:,]', " ", name[:-1]).split(" ")[0]
            ofile.write(">" + name + "\n" + seq + "\n")
def parse_lines(fastg, ofile):
    ''' Write the first record of each distinct node in a fastg file as FASTA

    Records are read with readfq. The node name is derived from the record
    name: the last character is dropped, ':' and ',' are replaced by spaces,
    only the text before the first space is kept, and one trailing "'" is
    removed. A record is written only the first time its node name is seen.

    fastg: path of the fastg file to read
    ofile: open, writable file-like object that receives the FASTA records
    '''
    seen = set()
    # Context manager guarantees the input handle is closed on any exit path.
    with open(fastg, 'r') as fp:
        for name, seq, _ in readfq(fp):
            name = re.sub('[:,]', " ", name[:-1]).split(" ")[0]
            # endswith (unlike name[-1]) is safe when the cleaned name is empty
            if name.endswith("'"):
                name = name[:-1]
            if name in seen:
                continue
            seen.add(name)
            ofile.write(">" + name + "\n" + seq + "\n")
Exemple #4
0
def _run_logged_command(cmd, logger, log_fp):
    ''' Log a shell command, run it with stdout/stderr sent to log_fp, flush log_fp

    Raises subprocess.CalledProcessError if the command exits non-zero.
    '''
    logger.info("Executing command: '{}'".format(cmd))
    subprocess.check_call(cmd,
                          stderr=subprocess.STDOUT,
                          stdout=log_fp,
                          shell=True)
    log_fp.flush()


def _redirect_std_fds(log_path, mode):
    ''' Point the stdout/stderr file descriptors at the file log_path

    The descriptors themselves are swapped (rather than rebinding sys.stdout)
    so that output written below the Python level is captured as well.

    Returns (log file object, saved stdout fd, saved stderr fd); pass them to
    _restore_std_fds to undo the redirection.
    '''
    stdfile = open(log_path, mode)
    # Flush Python-level buffers first so text printed before the redirection
    # does not end up in the log file.
    sys.stdout.flush()
    sys.stderr.flush()
    old_stdout = os.dup(sys.stdout.fileno())
    old_stderr = os.dup(sys.stderr.fileno())
    os.dup2(stdfile.fileno(), sys.stdout.fileno())
    os.dup2(stdfile.fileno(), sys.stderr.fileno())
    return stdfile, old_stdout, old_stderr


def _restore_std_fds(stdfile, old_stdout, old_stderr):
    ''' Undo _redirect_std_fds and release the saved descriptors and log file
    '''
    # Flush while still redirected so buffered text lands in the log file.
    sys.stdout.flush()
    sys.stderr.flush()
    os.dup2(old_stdout, sys.stdout.fileno())
    os.dup2(old_stderr, sys.stderr.fileno())
    os.close(old_stdout)
    os.close(old_stderr)
    stdfile.close()


def _read_record_names(path):
    ''' Return the set of record names that utils.readfq yields for the file at path
    '''
    with open(path, 'r') as fp:
        return set(name for name, _, _ in utils.readfq(fp))


def main():
    ''' Run the SCAPP workflow end to end

    Steps, driven by the options returned by parse_user_input():
      1. If no BAM file was given, map the reads to the graph nodes with BWA
         and build a filtered, sorted, indexed BAM file with samtools.
      2. If scores are used and no PlasClass/PlasFlow scores file was given,
         classify the graph with PlasClass; then transform the scores.
      3. If genes are used, search the graph for plasmid-specific genes.
      4. Run recycle.run_scapp on the annotated graph.
      5. If genes are used, filter the output cycles by gene hits.
      6. If scores are used, filter the output cycles by PlasClass score.
      7. If both are used, write the cycles supported by at least two of:
         gene hits, classification, self loops.
      8. Remove remaining intermediate files.

    Files are written under <output_dir>/intermediate_files and
    <output_dir>/logs; the confident cycles fasta is written to <output_dir>.
    '''
    # Get command line args
    args = parse_user_input()

    fastg = args.graph
    outdir = args.output_dir
    max_k = args.max_k

    num_procs = args.num_processes

    # for optional workflow steps
    bamfile = args.bam
    reads1 = args.reads1
    reads2 = args.reads2

    plasclass_file = args.plasclass
    plasflow_file = args.plasflow  # these are mutually exclusive

    # flags
    use_scores = args.use_scores in ['True', 'true']
    use_genes = args.use_gene_hits in ['True', 'true']

    # default threshold variables
    PARAMS.load_params_json()
    if args.max_CV:
        PARAMS.MAX_CV = args.max_CV
    if args.min_length:
        PARAMS.MIN_LENGTH = args.min_length
    if args.classification_thresh:
        PARAMS.CLASSIFICATION_THRESH = args.classification_thresh
    if args.gene_match_thresh:
        PARAMS.GENE_MATCH_THRESH = args.gene_match_thresh
    if args.selfloop_score_thresh:
        PARAMS.SELF_LOOP_SCORE_THRESH = args.selfloop_score_thresh
    if args.selfloop_mate_thresh:
        PARAMS.SELF_LOOP_MATE_THRESH = args.selfloop_mate_thresh
    if args.chromosome_score_thresh:
        PARAMS.CHROMOSOME_SCORE_THRESH = args.chromosome_score_thresh
    if args.chromosome_len_thresh:
        # was args.CHROMOSOME_LEN_THRESH, which is not the attribute tested above
        PARAMS.CHROMOSOME_LEN_THRESH = args.chromosome_len_thresh
    if args.plasmid_score_thresh:
        PARAMS.PLASMID_SCORE_THRESH = args.plasmid_score_thresh
    if args.plasmid_len_thresh:
        # was a bare expression, so the override was never applied
        PARAMS.PLASMID_LEN_THRESH = args.plasmid_len_thresh
    if args.good_cyc_dominated_thresh:
        PARAMS.GOOD_CYC_DOMINATED_THRESH = args.good_cyc_dominated_thresh

    parent_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path = os.path.join(parent_path, 'data')
    bin_path = os.path.join(parent_path, 'bin')

    int_dir = os.path.join(outdir, 'intermediate_files')
    if not os.path.exists(int_dir):
        os.makedirs(int_dir)
    logs_dir = os.path.join(outdir, 'logs')
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)

    # Get config variables
    try:
        config_path = os.path.join(bin_path, 'config.json')
        with open(config_path) as config_fp:
            config = json.load(config_fp)
        bwa_path = config['BWA_PATH']
        ncbi_path = config["NCBI_PATH"]
        samtools_path = config["SAMTOOLS_PATH"]
    except Exception:
        print("Error loading config variables. Please check config.json file")
        raise

    # Set up logging and write config and options to the log file
    logfile = os.path.join(logs_dir, "scapp.log")
    logging.basicConfig(filemode='w',
                        filename=logfile,
                        level=logging.INFO,
                        format='%(asctime)s: %(message)s',
                        datefmt='%d/%m/%Y %H:%M')
    logger = logging.getLogger("scapp_logger")

    logger.info("Beginning SCAPP workflow")
    logger.info(
        "Got parameters:\n\tInput graph: {}\n\tOutput directory: {} \n\tMaximum k value: {}\
    \n\t# processes: {}\n\tMaximum CV: {}\n\tMinimum Length: {}\n\tBamfile: {}\n\tReads file 1: {}\
    \n\tReads file 2: {}\n\tUse scores: {}\n\tUse genes: {}\n\tPath to BWA executables: {}\
    \n\tPath to NCBI executables: {}\n\tPath to samtools executables: {}\
    \n\tPlasClass scores file: {}".format(fastg, outdir, max_k, num_procs,
                                          PARAMS.MAX_CV, PARAMS.MIN_LENGTH,
                                          bamfile, reads1, reads2, use_scores,
                                          use_genes, bwa_path, ncbi_path,
                                          samtools_path, plasclass_file))

    # TODO: Support GFA format: script to convert gfa to fastg

    # Step 1: Map reads and create BAM file
    if bamfile is None:
        # first create fasta of contigs in only one direction
        time_start = time.time()
        logger.info('Creating fasta from fastg')
        nodes_fasta = os.path.join(int_dir, 'assembly_graph.nodes.fasta')
        make_fasta_from_fastg.parse_lines(fastg, nodes_fasta)

        print("Creating BAM file of read mappings with BWA")
        logger.info("Creating BAM file")

        bwa_file = os.path.join(bwa_path, 'bwa')
        samtools_file = os.path.join(samtools_path, 'samtools')

        reads_bam = os.path.join(int_dir, "reads_pe.bam")
        primary_bam = os.path.join(int_dir, "reads_pe_primary.bam")
        sorted_reads = os.path.join(int_dir, "reads_pe_primary.sort")
        sorted_bam = os.path.join(int_dir, "reads_pe_primary.sort.bam")

        # Don't clutter with print statements: tool output goes to a log file,
        # which is closed even if one of the commands fails.
        bwa_outfile_name = os.path.join(logs_dir, 'bwa_std.log')
        with open(bwa_outfile_name, 'w') as bwa_outfile:
            # BWA index
            cmd = bwa_file + " index " + nodes_fasta
            _run_logged_command(cmd, logger, bwa_outfile)

            # BWA mem and samtools view
            cmd =  bwa_file + " mem -t " + str(num_procs) + " " + nodes_fasta + " " \
                    + reads1 + " " + reads2 + " | " + \
                    samtools_file + "  view -buS -@ " + str(num_procs-1) + " - > " + reads_bam
            _run_logged_command(cmd, logger, bwa_outfile)

            # Filter reads with samtools: -F 0x0900 excludes alignments flagged
            # secondary (0x100) or supplementary (0x800)
            cmd = samtools_file + " view -bF 0x0900 -@ " + str(
                num_procs - 1) + " " + reads_bam + ' > ' + primary_bam
            _run_logged_command(cmd, logger, bwa_outfile)

            # Sort filtered reads with samtools
            # NOTE(review): "sort <in.bam> <out.prefix>" looks like the legacy
            # samtools invocation - confirm against the samtools version in use
            cmd = samtools_file + " sort " + primary_bam + ' ' + sorted_reads
            _run_logged_command(cmd, logger, bwa_outfile)

            # Index filtered reads with samtools
            cmd = samtools_file + " index " + sorted_bam
            _run_logged_command(cmd, logger, bwa_outfile)

        bamfile = sorted_bam

        # Remove the intermediate files
        os.remove(primary_bam)
        os.remove(reads_bam)
        for f in glob.glob(nodes_fasta + "*"):
            os.remove(f)

        time_end = time.time()
        logger.info(
            "{} seconds to create indexed sorted bam file".format(time_end -
                                                                  time_start))
    else:
        logger.info("Using file {} as the bamfile".format(bamfile))

    # Step 2: Classify contigs using PlasClass (default) and parse scores
    if use_scores and plasclass_file is None and plasflow_file is None:
        print("Getting scores of graph nodes")
        logger.info("Using PlasClass to obtain sequence scores")
        time_start = time.time()
        plasclass_file = os.path.join(int_dir, 'plasclass.out')
        sys_stdout = sys.stdout  # don't want to clutter with more print statements
        sys_stderr = sys.stderr
        plasclass_outfile = os.path.join(logs_dir, 'plasclass_std.log')
        with open(plasclass_outfile, 'w') as stdfile:
            sys.stdout = stdfile
            sys.stderr = stdfile
            try:
                classify_fastg.classify(fastg, plasclass_file, num_procs)
            finally:
                # always hand the real streams back, even if PlasClass fails
                sys.stdout = sys_stdout
                sys.stderr = sys_stderr
        time_end = time.time()
        logger.info(
            "{} seconds to classify the assembly graph".format(time_end -
                                                               time_start))

    scores_file = None
    if use_scores:
        # Parse and transform the scores
        logger.info("Transforming scores")
        time_start = time.time()
        scores_file = os.path.join(int_dir, 'assembly_graph.nodes.scores')
        if plasflow_file:
            parse_plasmid_scores.transformPlasFlow(plasflow_file, scores_file)
        else:
            parse_plasmid_scores.transformPlasClass(plasclass_file,
                                                    scores_file)
        time_end = time.time()
        logger.info("{} seconds to transform scores".format(time_end -
                                                            time_start))

    # Step 3: BLAST for plasmid-specific genes and parse BLAST output
    gene_hits_path = None
    blast_outfile = os.path.join(logs_dir, 'blast_std.log')
    genefiles_path = os.path.join(data_path, 'nt')
    protfiles_path = os.path.join(data_path, 'aa')
    if use_genes:
        print('Finding plasmid-specific genes with BLAST')
        time_start = time.time()
        logger.info('Finding plasmid-specific genes with BLAST')
        # don't want to clutter with more print statements
        # a bit more complicated for BLAST which prints from c
        saved_fds = _redirect_std_fds(blast_outfile, 'w')
        try:
            gene_hits_path = find_plasmid_gene_matches.find_plasmid_gene_matches( \
                                fastg, int_dir, genefiles_path, protfiles_path, None, \
                                ncbi_path, num_procs, PARAMS.GENE_MATCH_THRESH)
        except BaseException:
            # restore first so the message below reaches the real stdout
            _restore_std_fds(*saved_fds)
            print(
                "Error finding plasmid genes - check BLAST output file (blast_std.log)"
            )
            raise
        _restore_std_fds(*saved_fds)
        time_end = time.time()
        logger.info(
            "{} seconds to find plasmid-specific gene hits".format(time_end -
                                                                   time_start))

    # Step 4: Run SCAPP annotated-assembly-graph-based plasmid assembly
    print("Starting SCAPP plasmid finding")
    logger.info("Starting plasmid finding")
    time_start = time.time()
    recycle.run_scapp(fastg, int_dir, bamfile, num_procs, max_k, \
                    gene_hits_path, use_genes, scores_file, use_scores, \
                    PARAMS.MAX_CV, PARAMS.MIN_LENGTH)
    time_end = time.time()
    logger.info("{} seconds to run SCAPP plasmid finding".format(time_end -
                                                                 time_start))

    basename, _ = os.path.splitext(os.path.basename(fastg))
    fasta_ofile = os.path.join(int_dir, basename + '.cycs.fasta')
    self_loops_ofile = os.path.join(int_dir, basename + '.self_loops.fasta')

    # Steps 5, 6 and 8 all use this directory, so it is set up whenever either
    # filter is enabled (previously it only existed when use_genes was set,
    # which broke the use_scores-only workflow).
    hit_plasmids_dir = os.path.join(int_dir, "hit_cycs")
    if (use_genes or use_scores) and not os.path.exists(hit_plasmids_dir):
        os.mkdir(hit_plasmids_dir)

    # Step 5: Post-process filtering: BLAST output plasmids for plasmid-specific genes
    if use_genes:
        print("Filtering plasmids by plasmid-specific genes")
        logger.info("Filtering plasmids by plasmid-specific genes")
        time_start = time.time()
        # don't want to clutter with more print statements
        # a bit more complicated for BLAST which prints from c
        saved_fds = _redirect_std_fds(blast_outfile, 'a')
        try:
            hit_plasmids_fname = find_plasmid_gene_matches.find_plasmid_gene_matches( \
                                    fasta_ofile, hit_plasmids_dir, genefiles_path, \
                                    protfiles_path, None, ncbi_path, num_procs, PARAMS.GENE_MATCH_THRESH)
        except BaseException:
            # restore first so the message below reaches the real stdout
            _restore_std_fds(*saved_fds)
            print(
                "Error filtering by plasmid genes. Check BLAST output file (blast_std.log)"
            )
            raise
        _restore_std_fds(*saved_fds)

        gene_filtered_ofile = os.path.join(
            int_dir, basename + ".gene_filtered_cycs.fasta")
        create_hits_fasta.create_hits(fasta_ofile, hit_plasmids_fname,
                                      gene_filtered_ofile)
        time_end = time.time()
        logger.info(
            "{} seconds to filter plasmids using plasmid-specific gene hits".
            format(time_end - time_start))

    # Step 6: Post-process filtering: Classify gene filtered plasmids
    if use_scores:
        print("Getting scores of gene filtered plasmids")
        logger.info("Using PlasClass to obtain scores of cycles")
        time_start = time.time()
        plasclass_filtered_file = os.path.join(int_dir,
                                               'plasclass_filtered.out')
        plasmid_scores_file = os.path.join(int_dir, 'filtered_plasmids.scores')
        classified_plasmids_fname = os.path.join(hit_plasmids_dir,
                                                 "classified_cycs.out")
        classification_filtered_ofile = os.path.join(
            int_dir, basename + ".classified_cycs.fasta")

        sys_stdout = sys.stdout  # don't want to clutter with more print statements
        sys_stderr = sys.stderr
        plasclass_outfile = os.path.join(logs_dir, 'plasclass_std.log')
        with open(plasclass_outfile, 'a') as stdfile:
            sys.stdout = stdfile
            sys.stderr = stdfile
            try:
                classify_fastg.classify(fasta_ofile, plasclass_filtered_file,
                                        num_procs)
                logger.info("Transforming scores")
                parse_plasmid_scores.transformPlasClass(
                    plasclass_filtered_file, plasmid_scores_file)

                # keep the cycles whose score exceeds the classification threshold
                with open(plasmid_scores_file) as f, open(
                        classified_plasmids_fname, 'w') as o:
                    for line in f:
                        splt = line.strip().split()
                        if float(splt[1]) > PARAMS.CLASSIFICATION_THRESH:
                            o.write(splt[0] + '\n')
                create_hits_fasta.create_hits(fasta_ofile,
                                              classified_plasmids_fname,
                                              classification_filtered_ofile)
            finally:
                # always hand the real streams back, even on failure
                sys.stdout = sys_stdout
                sys.stderr = sys_stderr
        time_end = time.time()
        logger.info("{} seconds to filter cycles by PlasClass score".format(
            time_end - time_start))
        os.remove(plasmid_scores_file)
        os.remove(plasclass_filtered_file)

    # Step 7: Create set of confident plasmid predictions:
    # Confident plasmid predictions - 2 out of 3 of: gene hits, plasmid classification, self loops
    if use_scores and use_genes:
        time_start = time.time()
        gene_hit_set = _read_record_names(gene_filtered_ofile)
        classified_set = _read_record_names(classification_filtered_ofile)
        self_loop_set = _read_record_names(self_loops_ofile)

        classified_loops = classified_set & self_loop_set
        gene_hit_loops = gene_hit_set & self_loop_set
        classified_gene_hit = gene_hit_set & classified_set

        confident_plasmid_set = classified_loops | classified_gene_hit | gene_hit_loops

        confident_plasmids_fname = os.path.join(hit_plasmids_dir,
                                                "confident_cycs.out")
        with open(confident_plasmids_fname, 'w') as o:
            for cyc in confident_plasmid_set:
                o.write(cyc + '\n')

        confident_plasmid_ofile = os.path.join(
            outdir, basename + ".confident_cycs.fasta")
        create_hits_fasta.create_hits(fasta_ofile, confident_plasmids_fname,
                                      confident_plasmid_ofile)

        time_end = time.time()
        logger.info(
            "{} seconds to filter confident plasmids".format(time_end -
                                                             time_start))

    # Step 8: Clean up any remaining intermediate files
    if use_scores:
        os.remove(scores_file)
        shutil.rmtree(hit_plasmids_dir)