Ejemplo n.º 1
0
def runPilon(bam_file, reference_fasta, sample_name, parent_dir, pilon_options,
             cores):
    """Run the Pilon module on an existing alignment.

    Sets up a "Pilon" workspace under ``parent_dir``, calls
    ``Alignment.run_pilon`` and, once that returns, writes the ``PILON.txt``
    completion marker.

    Args:
        bam_file: Path to the input BAM; must be an existing file.
        reference_fasta: Reference FASTA forwarded to ``run_pilon``.
        sample_name: Sample name handed to the ``Alignment`` object.
        parent_dir: Directory receiving the workspace and the marker file.
            The marker path is built by plain concatenation, so the value is
            expected to end with a path separator.
        pilon_options: Forwarded to ``run_pilon`` as ``options``.
        cores: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If ``bam_file`` is empty or is not an existing file.
    """
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O``, which would silently disable this validation.
    if not bam_file or not os.path.isfile(bam_file):
        sys.stderr.write(
            "ERROR: BAM input was not provided. Please provide. Exiting now ...\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "Pilon")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Pilon.log')

    # create Alignment object and run Pilon on it
    AlignmentObj = Alignment(bam_file, sample_name, logObject)
    AlignmentObj.run_pilon(workspace, reference_fasta, options=pilon_options)

    # create successful completion file if steps completed!
    with open(parent_dir + "PILON.txt", 'w') as conf_file:
        conf_file.write("Pilon: Module Completed Succesfully!")
Ejemplo n.º 2
0
def runNanoSample(nanopore_fastq, sample_dir, sample_name, fastqfilter_options,
                  no_gzip):
    """Subsample Nanopore reads with fastqfilter.

    Sets up a "NanoSample" workspace under ``sample_dir``, calls
    ``Nanopore.run_fastqfilter`` and writes the ``NANOSAMPLE.txt`` completion
    marker.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ; must be an existing file.
        sample_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        sample_name: Sample name handed to the ``Nanopore`` object.
        fastqfilter_options: Option string; surrounding double quotes are
            stripped before it is forwarded as ``options``.
        no_gzip: When truthy, ``compress=False`` is passed to fastqfilter.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not nanopore_fastq or not os.path.isfile(nanopore_fastq):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )

    fastqfilter_options = fastqfilter_options.strip('"')

    # set up directory structure
    workspace = uF.setupDirectory(sample_dir, "NanoSample")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'NanoSample.log')

    # Initialize Nanopore Object
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)

    # Subsample Nanopore reads using fastqfilter by Bruce Walker
    NanoporeObj.run_fastqfilter(workspace,
                                options=fastqfilter_options,
                                compress=(not no_gzip))

    with open(sample_dir + "NANOSAMPLE.txt", 'w') as conf_file:
        conf_file.write("NanoSample: Module Completed Succesfully!")
Ejemplo n.º 3
0
def runMLST(assembly, sample_name, parent_dir, identifier):
    """Run MLST typing on an assembly.

    Sets up an "Assembly_MLST[_<identifier>]" workspace under ``parent_dir``,
    calls ``Assembly.run_mlst`` and writes the
    ``ASSEMBLY_MLST[_<identifier>].txt`` completion marker.

    Args:
        assembly: Path to the assembly; must be an existing file.
        sample_name: Sample name handed to the ``Assembly`` object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        identifier: Optional suffix appended (after an underscore) to the
            workspace and marker names when truthy.

    Raises:
        RuntimeError: If ``assembly`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not assembly or not os.path.isfile(assembly):
        sys.stderr.write("ERROR: Assembly does not seem to exist.\n")
        raise RuntimeError

    suffix = '_' + identifier if identifier else ''

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "Assembly_MLST" + suffix)

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Assembly_MLST.log')

    # Initialize Assembly Object and run MLST
    AssemblyObj = Assembly(assembly, sample_name, logObject)
    AssemblyObj.run_mlst(workspace)

    # create successful completion file if steps completed!
    with open(parent_dir + "ASSEMBLY_MLST" + suffix + ".txt", 'w') as conf_file:
        conf_file.write("Assembly MLST: Module Completed Succesfully!")
Ejemplo n.º 4
0
def runNanoQC(nanopore_fastq, nanopore_seqsum, nanopore_barcode, sample_dir,
              cores):
    """Run Nanopore read QC (NanoPlot, plus MinION QC when possible).

    Sets up a "NanoQC" workspace under ``sample_dir`` and runs
    ``npF.run_nanoplot_qc``. If a sequencing-summary file is available it is
    filtered with ``npF.filter_sequence_summary`` and the result is passed to
    ``npF.run_minion_qc``. Finally the ``NANOQC.txt`` marker is written.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ; must be an existing file.
        nanopore_seqsum: Optional path to a sequencing-summary file. The
            MinION QC step is skipped when it is empty or not a file.
        nanopore_barcode: Forwarded to ``filter_sequence_summary``.
        sample_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        cores: Forwarded to ``run_nanoplot_qc``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not nanopore_fastq or not os.path.isfile(nanopore_fastq):
        sys.stderr.write(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(sample_dir, "NanoQC")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'NanoQC.log')

    npF.run_nanoplot_qc(nanopore_fastq, workspace, logObject, cores=cores)

    # Guard against a missing (None/empty) summary: os.path.isfile(None)
    # raises TypeError on Python 3 instead of returning False.
    if nanopore_seqsum and os.path.isfile(nanopore_seqsum):
        sample_seqsum = npF.filter_sequence_summary(nanopore_seqsum,
                                                    nanopore_barcode,
                                                    workspace, logObject)
        npF.run_minion_qc(sample_seqsum, workspace, logObject)

    # create successful completion file if steps completed!
    with open(sample_dir + "NANOQC.txt", 'w') as conf_file:
        conf_file.write("NanoQC: Module Completed Succesfully!")
Ejemplo n.º 5
0
def runAssembly(fastq_frw, fastq_rev, sample_name, parent_dir, read_length,
                unicycler, cores):
    """Assemble paired-end reads with Unicycler or SPAdes.

    Sets up an "Assembly" workspace under ``parent_dir``, runs
    ``FastqPaired.run_unicycler`` when ``unicycler`` is truthy and
    ``FastqPaired.run_spades`` otherwise, then writes the ``ASSEMBLY.txt``
    completion marker.

    Args:
        fastq_frw: Path to the forward FASTQ; must be an existing file.
        fastq_rev: Path to the reverse FASTQ; must be an existing file.
        sample_name: Sample name handed to the ``FastqPaired`` object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        read_length: Forwarded to ``run_spades`` (unused for Unicycler).
        unicycler: Selects Unicycler instead of SPAdes when truthy.
        cores: Forwarded to the chosen assembler call.

    Raises:
        RuntimeError: If either FASTQ is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    # Both mates are required here, even though the message mentions a
    # single-file option.
    if not (fastq_frw and fastq_rev and os.path.isfile(fastq_frw)
            and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "Assembly")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Assembly.log')

    # create FastqPaired Object
    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    if unicycler:
        FastqPairedObj.run_unicycler(workspace, cores=cores)
    else:
        FastqPairedObj.run_spades(workspace,
                                  read_length=read_length,
                                  cores=cores)

    # create successful completion file if steps completed!
    with open(parent_dir + "ASSEMBLY.txt", 'w') as conf_file:
        conf_file.write("Assembly: Module Completed Succesfully!")
Ejemplo n.º 6
0
def runBayesHammer(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir):
    """Run read error correction on single-end or paired-end reads.

    Sets up a "BayesHammer" workspace under ``parent_dir`` and calls
    ``error_correction`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then writes the ``BAYESHAMMER.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        sample_name: Sample name handed to the FASTQ object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write("ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n")
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "BayesHammer")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'BayesHammer.log')

    if single_ok:
        # single-end error correction
        FastqObj = Fastq(fastq_sin, sample_name, logObject)
        FastqObj.error_correction(workspace)
    else:
        # paired-end error correction
        FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
        FastqPairedObj.error_correction(workspace)

    # create successful completion file if steps completed!
    with open(parent_dir + "BAYESHAMMER.txt", 'w') as conf_file:
        conf_file.write("BayesHammer: Module Completed Succesfully!")
Ejemplo n.º 7
0
def runStrainGST(fastq_sin, fastq_frw, fastq_rev, db, sample_name, parent_dir,
                 options_kmerize, options_straingst):
    """Kmerize reads and run StrainGST against a pangenome database.

    Sets up a "StrainGST" workspace under ``parent_dir``, calls ``kmerize``
    on a ``Fastq`` (single-end) or ``FastqPaired`` (paired-end) object, wraps
    the returned k-mer file in a ``Kmer`` object and calls ``run_straingst``.
    Finally writes the ``STRAINGST.txt`` completion marker. Single-end input
    takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        db: Path to the StrainGST database; must be an existing file.
        sample_name: Sample name handed to the FASTQ and ``Kmer`` objects.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        options_kmerize: Forwarded to ``kmerize`` as ``options``.
        options_straingst: Forwarded to ``run_straingst`` as ``options``.

    Raises:
        RuntimeError: If no valid FASTQ input was supplied, or if ``db`` is
            empty or not an existing file.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    if not db or not os.path.isfile(db):
        # Trailing newline added so the message does not run into whatever
        # is written to stderr next.
        sys.stderr.write(
            "ERROR: StrainGST pangenome database file is not available.\n")
        raise RuntimeError()

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "StrainGST")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'StrainGST.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        FastqObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        FastqObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # run strainge kmerize
    kmer_file = FastqObj.kmerize(workspace, options=options_kmerize)

    # create Kmer object
    KmerObj = Kmer(kmer_file, sample_name, logObject)

    # run straingst
    KmerObj.run_straingst(workspace, db, options=options_straingst)

    # produce kmer histogram - in progress - issues running.
    # KmerObj.create_histogram(workspace)

    # create successful completion file if steps completed!
    with open(parent_dir + "STRAINGST.txt", 'w') as conf_file:
        conf_file.write("StrainGST: Module Completed Succesfully!")
Ejemplo n.º 8
0
def runFastQC(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir, cores):
    """Validate FASTQ input and run FastQC on it.

    Sets up a "FastQC" workspace under ``parent_dir``, validates the reads
    via the FASTQ object's ``validate`` method, calls ``run_qc`` and writes
    the ``FASTQC.txt`` completion marker. Single-end input takes precedence
    when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        sample_name: Sample name handed to the FASTQ object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        cores: Forwarded to ``run_qc``.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
        SystemExit: With status 1 if ``validate`` reports invalid input.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "FastQC")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'FastQC.log')

    if single_ok:
        ### Perform single end QC analysis
        FastqObj = Fastq(fastq_sin, sample_name, logObject)

        # validate FASTQ file is indeed a FASTQ file
        if not FastqObj.validate():
            sys.stderr.write(
                "ERROR: FASTQ file %s seems to be in invalid format. Exiting now...\n"
                % FastqObj.fastq)
            sys.exit(1)

        # run FastQC; the return value is not needed here
        FastqObj.run_qc(workspace, cores=cores)
    else:
        ### Perform paired-end QC analysis
        FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name,
                                     logObject)

        # validate FASTQ files are indeed FASTQ files
        if not FastqPairedObj.validate():
            sys.stderr.write(
                "ERROR: At least one of the FASTQ files seems to be in an invalid format. Exiting now ...\n"
            )
            sys.exit(1)

        # run FastQC; the return value is not needed here
        FastqPairedObj.run_qc(workspace, cores=cores)

    # create successful completion file if steps completed!
    with open(parent_dir + "FASTQC.txt", 'w') as conf_file:
        conf_file.write("FastQC: Module Completed Succesfully!")
Ejemplo n.º 9
0
def runRefAlignment(fastq_sin, fastq_frw, fastq_rev, reference_fasta, sample_name, parent_dir, bwa_options, cores):
    """Align reads to a reference and post-process the alignment.

    Sets up a "ReferenceAlignment" workspace under ``parent_dir``, calls
    ``align_to_reference`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then on the resulting ``Alignment`` object calls, in
    order: ``compress_sam``, ``sort_bam``, ``index_bam``, ``mark_dups`` and
    ``index_bam`` again. Finally writes the ``REFALIGNMENT.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        reference_fasta: Path to the reference; must be an existing file.
        sample_name: Sample name handed to the FASTQ/Alignment objects.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        bwa_options: Forwarded to ``align_to_reference`` as ``options``.
        cores: Forwarded to ``align_to_reference``.

    Raises:
        RuntimeError: If no valid FASTQ input was supplied, or if
            ``reference_fasta`` is empty or not an existing file.
    """
    # Each path is tested for truthiness before os.path.isfile():
    # os.path.isfile(None) raises TypeError on Python 3, which previously
    # made valid paired-end input fail whenever fastq_sin was None.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write("ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n")
        raise RuntimeError

    # Only existence is checked here, despite the wording of the message.
    if not reference_fasta or not os.path.isfile(reference_fasta):
        sys.stderr.write("ERROR: Reference FASTA file does not have the correct format.\n")
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "ReferenceAlignment")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'ReferenceAlignment.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        ReadsObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        ReadsObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Align reads to reference genome
    sam_file = ReadsObj.align_to_reference(workspace, reference_fasta, options=bwa_options, cores=cores)

    # create Alignment object
    AlignmentObj = Alignment(sam_file, sample_name, logObject)

    # compress SAM to BAM
    AlignmentObj.compress_sam(workspace, clean=True)

    # sort BAM file
    AlignmentObj.sort_bam(workspace, clean=True)

    # index BAM file
    AlignmentObj.index_bam(workspace)

    # mark duplicates
    AlignmentObj.mark_dups(workspace, clean=True)

    # index again after duplicate marking
    AlignmentObj.index_bam(workspace)

    # create successful completion file if steps completed!
    with open(parent_dir + "REFALIGNMENT.txt", 'w') as conf_file:
        conf_file.write("Reference Alignment: Module Completed Succesfully!")
0
def runAMRP(fastq_frw, fastq_rev, sample_name, parent_dir, shortbred_markers,
            ariba_database, ariba_names):
    """Run antimicrobial-resistance prediction on paired-end reads.

    Sets up an "AMRP" workspace under ``parent_dir``, calls
    ``FastqPaired.ariba`` once per ARIBA database/name pair and
    ``FastqPaired.shortbred_amrp`` when ShortBRED markers are given, then
    writes the ``AMRP.txt`` completion marker.

    Args:
        fastq_frw: Path to the forward FASTQ; must be an existing file.
        fastq_rev: Path to the reverse FASTQ; must be an existing file.
        sample_name: Sample name handed to the ``FastqPaired`` object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        shortbred_markers: Forwarded to ``shortbred_amrp``; optional.
        ariba_database: Sequence of ARIBA databases; optional, but must be
            given together with ``ariba_names``.
        ariba_names: Sequence of names, matched positionally to
            ``ariba_database``. Extra trailing names are ignored.

    Raises:
        RuntimeError: If the FASTQ pair is invalid, if neither ShortBRED
            markers nor ARIBA databases are given, if only one of
            ``ariba_database``/``ariba_names`` is given, or if there are
            fewer names than databases.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    if not (fastq_frw and fastq_rev and os.path.isfile(fastq_frw)
            and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    if not (shortbred_markers or ariba_database):
        sys.stderr.write(
            "ERROR: Some issue occurred with provided databases/options. Please check the input and retry.\n"
        )
        raise RuntimeError

    # Databases and names must be given together, and every database needs a
    # name. The length check turns what used to be an IndexError partway
    # through the ARIBA runs into an up-front input error.
    if (bool(ariba_database) != bool(ariba_names)
            or (ariba_database
                and len(ariba_names) < len(ariba_database))):
        sys.stderr.write(
            "ERROR: ARIBA database provided without names or visa versa, either way please check the input and retry!.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "AMRP")

    # create logging object
    # NOTE(review): the log file is spelled 'AMPR' while the module is
    # 'AMRP'; kept as-is in case other tooling looks for this file name.
    logObject = uF.createLoggerObject(workspace + 'AMPR.log')

    # create FastqPaired Object
    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Run AMR Prediction analysis
    if ariba_database:
        for adb, adb_name in zip(ariba_database, ariba_names):
            FastqPairedObj.ariba(workspace, name=adb_name, ariba_db=adb)
    if shortbred_markers:
        FastqPairedObj.shortbred_amrp(workspace, shortbred_markers)

    # create successful completion file if steps completed!
    with open(parent_dir + "AMRP.txt", 'w') as conf_file:
        conf_file.write("AMRP: Module Completed Succesfully!")
Ejemplo n.º 11
0
def runNanoCanu(nanopore_fastq, sample_dir, sample_name, canu_options, memory,
                cores):
    """Assemble Nanopore reads with Canu.

    Sets up a "Canu_Assembly" workspace under ``sample_dir`` and calls
    ``Nanopore.run_canu``. When the input name ends in ``.gz`` a ``Fastq``
    object's ``create_new_instance(compress=False, change_reference=True)``
    is called first and Canu is run on ``FastqObj.fastq``; that file is
    removed afterwards. Finally writes ``CANU_ASSEMBLY.txt``.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ; must be an existing file.
        sample_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        sample_name: Sample name handed to the FASTQ/Nanopore objects.
        canu_options: Option string; surrounding double quotes are stripped
            before it is forwarded to ``run_canu`` as ``options``.
        memory: Forwarded to ``run_canu``.
        cores: Forwarded to ``run_canu``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not nanopore_fastq or not os.path.isfile(nanopore_fastq):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided properly. Please fix. Raising exception\n"
        )

    # The stripped value used to be stored in an unused variable, so Canu
    # received the still-quoted option string.
    canu_options = canu_options.strip('"')

    # set up directory structure
    workspace = uF.setupDirectory(sample_dir, "Canu_Assembly")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Canu_Assembly.log')

    canu_input = nanopore_fastq
    temp_fastq = None
    if nanopore_fastq.endswith('.gz'):
        # presumably this repoints FastqObj.fastq at an uncompressed copy
        # inside the workspace -- confirm against Fastq.create_new_instance
        FastqObj = Fastq(nanopore_fastq, sample_name, logObject)
        FastqObj.create_new_instance(workspace,
                                     compress=False,
                                     change_reference=True)
        canu_input = temp_fastq = FastqObj.fastq

    # Initialize Nanopore Object
    NanoporeObj = Nanopore(canu_input, sample_name, logObject)

    # Run Canu for assembly
    NanoporeObj.run_canu(workspace,
                         options=canu_options,
                         memory=memory,
                         cores=cores)

    # Clean up temporary FASTQ instance (a missing file is not an error,
    # matching the previous ``rm -f``)
    if temp_fastq and os.path.isfile(temp_fastq):
        os.remove(temp_fastq)

    with open(sample_dir + "CANU_ASSEMBLY.txt", 'w') as conf_file:
        conf_file.write("Canu Assembly: Module Completed Succesfully!")
Ejemplo n.º 12
0
def runSortMeRNA(fastq_sin, fastq_frw, fastq_rev, database_dir, sample_name,
                 parent_dir, cores, no_gzip):
    """Separate ribosomal from non-ribosomal RNA reads.

    Sets up a "SortMeRNA" workspace under ``parent_dir`` and calls
    ``filter_ribo_rna`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then writes the ``SORTMERNA.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        database_dir: Forwarded to ``filter_ribo_rna``.
        sample_name: Sample name handed to the FASTQ object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        cores: Forwarded to ``filter_ribo_rna``.
        no_gzip: When truthy, ``compress=False`` is passed on.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "SortMeRNA")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'SortMeRNA.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        ReadsObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        ReadsObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # split up ribosomal and non-ribosomal RNA data
    ReadsObj.filter_ribo_rna(workspace,
                             database_dir,
                             cores=cores,
                             compress=(not no_gzip))

    # create successful completion file if steps completed!
    with open(parent_dir + "SORTMERNA.txt", 'w') as conf_file:
        conf_file.write("SortMeRNA: Module Completed Succesfully!")
Ejemplo n.º 13
0
def runKneadData(fastq_sin, fastq_frw, fastq_rev, kneaddata_options,
                 sample_name, parent_dir, cores, no_gzip):
    """Run KneadData on single-end or paired-end reads.

    Sets up a "KneadData" workspace under ``parent_dir`` and calls
    ``run_kneaddata`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then writes the ``KNEADDATA.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        kneaddata_options: Forwarded to ``run_kneaddata`` as ``options``.
        sample_name: Sample name handed to the FASTQ object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        cores: Forwarded to ``run_kneaddata``.
        no_gzip: When truthy, ``compress=False`` is passed on.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "KneadData")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'KneadData.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        ReadsObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        ReadsObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # run KneadData on the reads
    ReadsObj.run_kneaddata(workspace,
                           options=kneaddata_options,
                           cores=cores,
                           compress=not (no_gzip))

    # create successful completion file if steps completed!
    with open(parent_dir + "KNEADDATA.txt", 'w') as conf_file:
        conf_file.write("KneadData: Module Completed Succesfully!")
Ejemplo n.º 14
0
def processGpDirectory(bam_location, sample_name, parent_dir, no_gzip):
    """Extract FASTQ reads from a BAM file or from a directory holding one.

    Sets up a "ProcessGPDirectory" workspace under ``parent_dir``. When
    ``bam_location`` is a directory, every regular file in it that is not a
    ``.pdf``, ``.bam`` or ``.bai`` is copied into the workspace (best
    effort), and the directory must contain exactly one ``.bam`` file, which
    becomes the input. ``Alignment.extract_fastqs`` is then called and the
    ``GPPROCESS.txt`` completion marker is written.

    Args:
        bam_location: Path to a BAM file or to a directory containing one.
        sample_name: Sample name handed to the ``Alignment`` object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        no_gzip: When truthy, ``compress=False`` is passed on.

    Raises:
        RuntimeError: If ``bam_location`` is neither a file nor a directory,
            or if the directory does not contain exactly one ``.bam`` file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not bam_location or not (os.path.isdir(bam_location)
                                or os.path.isfile(bam_location)):
        sys.stderr.write("ERROR: BAM/GP directory does not exist! Exiting now ...\n")
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "ProcessGPDirectory")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'ProcessGPDirectory.log')

    bam_location = os.path.abspath(bam_location)

    logObject.info('*' * 70)
    logObject.info("Beginning to convert BAM location %s to FASTQ(s)" % (bam_location + '/'))

    input_bam = bam_location
    if os.path.isdir(bam_location):
        gp_directory = bam_location + '/'
        entries = os.listdir(gp_directory)

        # copy over all metric/text files sitting next to the BAM
        for entry in entries:
            if entry.endswith(('.pdf', '.bam', '.bai')):
                continue
            if os.path.isdir(gp_directory + entry):
                continue
            try:
                shutil.copy(gp_directory + entry, workspace + entry)
            except (IOError, OSError) as err:
                # Copying is best effort, but the failure is recorded rather
                # than silently dropped.
                # assumes logObject is a standard logging.Logger -- confirm
                logObject.warning("Unable to copy %s to workspace: %s" % (gp_directory + entry, err))

        bam_files = [gp_directory + f for f in entries if f.endswith('.bam')]
        if len(bam_files) != 1:
            # logObject.error() was previously called without a message.
            logObject.error("Expected exactly one BAM file in %s but found %d." % (gp_directory, len(bam_files)))
            raise RuntimeError

        input_bam = bam_files[0]

    # create Alignment Object
    AlignmentObj = Alignment(input_bam, sample_name, logObject)

    # extract reads from BAM
    AlignmentObj.extract_fastqs(workspace, compress=(not no_gzip))

    # create successful completion file if steps completed!
    with open(parent_dir + "GPPROCESS.txt", 'w') as conf_file:
        conf_file.write("ProcessGPDirectory: Module Completed Succesfully!")
Ejemplo n.º 15
0
def runNanoUnicycler(nanopore_fastq, illumina_forward, illumina_reverse,
                     sample_dir, sample_name, unicycler_options, identifier,
                     cores):
    """Run a Unicycler assembly from Nanopore plus Illumina reads.

    Sets up a "Unicycler_Assembly[_<identifier>]" workspace under
    ``sample_dir``, calls ``Nanopore.run_unicycler`` and writes the
    ``UNICYCLER_ASSEMBLY[_<identifier>].txt`` completion marker.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ; must be an existing file.
        illumina_forward: Path to the forward Illumina FASTQ; must exist.
        illumina_reverse: Path to the reverse Illumina FASTQ; must exist.
        sample_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        sample_name: Sample name handed to the ``Nanopore`` object.
        unicycler_options: Option string; surrounding double quotes are
            stripped before it is forwarded as ``options``.
        identifier: Optional suffix appended (after an underscore) to the
            workspace and marker names when truthy.
        cores: Forwarded to ``run_unicycler``.

    Raises:
        RuntimeError: If the Nanopore FASTQ or either Illumina FASTQ is
            empty or not an existing file.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    if not nanopore_fastq or not os.path.isfile(nanopore_fastq):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided properly. Please fix. Raising exception\n"
        )

    # NOTE(review): the message calls the Illumina reads "optional", but both
    # files are required by this check -- confirm the intended behaviour.
    if not (illumina_forward and illumina_reverse
            and os.path.isfile(illumina_forward)
            and os.path.isfile(illumina_reverse)):
        raise RuntimeError(
            "ERROR: Optional Illumina FASTQ input(s) / assembly were not provided properly. Please fix. Raising exception\n"
        )

    unicycler_options = unicycler_options.strip('"')
    suffix = '_' + identifier if identifier else ''

    # set up directory structure
    workspace = uF.setupDirectory(sample_dir, "Unicycler_Assembly" + suffix)

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Unicycler_Assembly.log')

    # initialize Nanopore object
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)

    # Run Unicycler for assembly
    NanoporeObj.run_unicycler(illumina_forward,
                              illumina_reverse,
                              workspace,
                              options=unicycler_options,
                              cores=cores)

    with open(sample_dir + "UNICYCLER_ASSEMBLY" + suffix + ".txt", 'w') as conf_file:
        conf_file.write("Unicycler Assembly: Module Completed Succesfully!")
Ejemplo n.º 16
0
def runAssemblyAdapterRemoval(assembly, sample_name, parent_dir, run_guinan,
                              gaemr_options, guinan_options, size_filter,
                              identifier):
    """Format an assembly, optionally strip adapters, and size-filter contigs.

    Sets up an "Assembly_Adapter_Removal[_<identifier>]" workspace under
    ``parent_dir`` and, on an ``Assembly`` object, calls
    ``run_gaemr_formatter`` (in an "Assembly_Formatted/" sub-workspace),
    optionally ``detect_adapters`` followed by ``remove_adapters``, and then
    ``filter_contigs_by_size``. Finally writes the
    ``ASSEMBLY_ADAPTER_REMOVAL[_<identifier>].txt`` completion marker.

    Args:
        assembly: Path to the assembly; must be an existing file.
        sample_name: Sample name handed to the ``Assembly`` object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        run_guinan: When truthy, adapter detection and removal are run.
        gaemr_options: Forwarded to ``detect_adapters`` as ``options``.
        guinan_options: Forwarded to ``remove_adapters`` as ``options``.
        size_filter: Forwarded to ``filter_contigs_by_size``.
        identifier: Optional suffix appended (after an underscore) to the
            workspace and marker names when truthy.

    Raises:
        RuntimeError: If ``assembly`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not assembly or not os.path.isfile(assembly):
        sys.stderr.write("ERROR: Assembly does not seem to exist.\n")
        raise RuntimeError

    suffix = '_' + identifier if identifier else ''

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir,
                                  "Assembly_Adapter_Removal" + suffix)

    # create logging object
    logObject = uF.createLoggerObject(workspace +
                                      'Assembly_Adapter_Removal.log')

    # Initialize Assembly Object
    AssemblyObj = Assembly(assembly, sample_name, logObject)

    # Run GAEMR formatting program in its own sub-workspace
    workspace_a = uF.setupDirectory(workspace, "Assembly_Formatted/")
    AssemblyObj.run_gaemr_formatter(workspace_a, reference_change=True)

    if run_guinan:
        # Detect adapters with GAEMR; the result is the input to removal
        guinan_commands_file = AssemblyObj.detect_adapters(
            workspace, options=gaemr_options)

        # Run guinan suite to remove detected adapters from assembly
        AssemblyObj.remove_adapters(guinan_commands_file,
                                    workspace,
                                    options=guinan_options)

    # Run assembly filter by contig size
    AssemblyObj.filter_contigs_by_size(workspace, size_filter=size_filter)

    # create successful completion file if steps completed!
    marker_path = parent_dir + "ASSEMBLY_ADAPTER_REMOVAL" + suffix + ".txt"
    with open(marker_path, 'w') as conf_file:
        conf_file.write(
            "Assembly Adapter Removal: Module Completed Succesfully!")
Ejemplo n.º 17
0
def runCentrifuge(fastq_sin, fastq_frw, fastq_rev, centrifuge_index,
                  sample_name, parent_dir, cores):
    """Bin reads taxonomically with Centrifuge.

    Sets up a "Centrifuge" workspace under ``parent_dir`` and calls
    ``bin_taxonomically`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then writes the ``CENTRIFUGE.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        centrifuge_index: Forwarded to ``bin_taxonomically``.
        sample_name: Sample name handed to the FASTQ object.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        cores: Forwarded to ``bin_taxonomically``.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "Centrifuge")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Centrifuge.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        ReadsObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        ReadsObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # bin reads taxonomically using Centrifuge
    ReadsObj.bin_taxonomically(workspace, centrifuge_index, cores=cores)

    # create successful completion file if steps completed!
    with open(parent_dir + "CENTRIFUGE.txt", 'w') as conf_file:
        conf_file.write("Centrifuge: Module Completed Succesfully!")
Ejemplo n.º 18
0
def runSymlinkInput(fastq_sin, fastq_frw, fastq_rev, parent_dir, sample_name):
    """Symlink the input FASTQ file(s) into the sample workspace.

    Sets up a "Symlink_Input" workspace under ``parent_dir`` and calls
    ``create_symlink`` on a ``Fastq`` (single-end) or ``FastqPaired``
    (paired-end) object, then writes the ``SYMLINK_INPUT.txt`` marker.
    Single-end input takes precedence when both kinds are supplied.

    Args:
        fastq_sin: Path to a single-end FASTQ, or a falsy value.
        fastq_frw: Path to the forward FASTQ of a pair, or a falsy value.
        fastq_rev: Path to the reverse FASTQ of a pair, or a falsy value.
        parent_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        sample_name: Sample name handed to the FASTQ object.

    Raises:
        RuntimeError: If neither a single FASTQ file nor a complete pair of
            existing FASTQ files was supplied.
    """
    # Explicit checks rather than ``assert`` so they survive ``python -O``.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "Symlink_Input")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'Symlink.log')

    # create the matching FASTQ object (single-end wins over paired-end)
    if single_ok:
        ReadsObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        ReadsObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # create symlink(s) inside the workspace
    ReadsObj.create_symlink(workspace)

    # create successful completion file if steps completed!
    with open(parent_dir + "SYMLINK_INPUT.txt", 'w') as conf_file:
        conf_file.write("SymlinkInput: Module Completed Succesfully!")
Ejemplo n.º 19
0
def runNanoMerge(nanopore_fastq, sample_dir, sample_name, barcode, no_gzip):
    """Stage Nanopore reads as a single FASTQ in the sample workspace.

    Sets up a "NanoMerge" workspace under ``sample_dir``. A directory input
    is handed to ``npF.concat_fastqs``. A file input is copied to
    ``<workspace><sample_name>.fastq[.gz]`` and then gunzipped or gzipped if
    its compression does not match ``no_gzip``. Finally writes the
    ``NANOMERGE.txt`` completion marker.

    Args:
        nanopore_fastq: Path to a FASTQ file or to a directory of FASTQs.
        sample_dir: Directory receiving the workspace and marker file; joined
            by plain concatenation, so expected to end with a path separator.
        sample_name: Base name of the staged FASTQ.
        barcode: Forwarded to ``concat_fastqs``.
        no_gzip: When truthy the staged result is left/made uncompressed;
            otherwise it is left/made gzip-compressed.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is neither an existing file nor
            a directory, or if the gzip/gunzip step exits non-zero.
        IOError/OSError: If copying the input file into the workspace fails.
    """
    # Local imports: only this function needs these modules.
    import shutil
    import subprocess

    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if not nanopore_fastq or not (os.path.isfile(nanopore_fastq)
                                  or os.path.isdir(nanopore_fastq)):
        sys.stderr.write(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(sample_dir, "NanoMerge")

    # create logging object
    logObject = uF.createLoggerObject(workspace + 'NanoMerge.log')

    if os.path.isdir(nanopore_fastq):
        npF.concat_fastqs(nanopore_fastq,
                          workspace,
                          sample_name,
                          logObject,
                          barcode=barcode,
                          compress=(not no_gzip))
    else:
        is_gzipped = nanopore_fastq.endswith('.gz')
        staged_fastq = workspace + sample_name + (
            '.fastq.gz' if is_gzipped else '.fastq')

        # Copy without going through a shell so that paths containing spaces
        # or shell metacharacters are handled correctly.
        shutil.copy(nanopore_fastq, staged_fastq)

        # Convert only when the staged copy's compression differs from what
        # was requested.
        convert_cmd = None
        if is_gzipped and no_gzip:
            convert_cmd = ['gunzip', staged_fastq]
        elif not is_gzipped and not no_gzip:
            convert_cmd = ['gzip', staged_fastq]

        # A failed conversion must not be followed by the success marker.
        if convert_cmd is not None and subprocess.call(convert_cmd) != 0:
            failure = "Command failed: %s" % ' '.join(convert_cmd)
            logObject.error(failure)
            raise RuntimeError(failure)

    with open(sample_dir + "NANOMERGE.txt", 'w') as conf_file:
        conf_file.write("NanoMerge: Module Completed Succesfully!")
Ejemplo n.º 20
0
def Reporter(input, outdir, analysis_type, title):
    """Build a summary report for one or more result repositories.

    The function validates ``input``, prepares ``outdir`` and a log file,
    resolves the repositories to scan with ``identify_searching_spaces`` and
    reads metadata and Centrifuge data with
    ``extract_metadata_and_centrifuge``. What happens next depends on
    ``analysis_type``:

    * ``'basic'``: runs MultiQC, builds the high-level statistics, determines
      outlier samples and writes the Excel spreadsheet.
    * ``'nano-assembly'``: parses the nano-assembly structure and writes the
      ONT high-level statistics.

    Any other ``analysis_type`` performs no further analysis.

    Args:
        input: Path to a file or directory describing what to report on.
        outdir: Results directory; created when it does not exist.
        analysis_type: ``'basic'`` or ``'nano-assembly'``.
        title: Title forwarded to ``run_MultiQC``.

    Raises:
        RuntimeError: If ``input`` is neither an existing file nor directory.
    """
    # The message announces an exit, so actually stop here instead of
    # carrying on with an unusable input.
    if not input or not (os.path.isfile(input) or os.path.isdir(input)):
        sys.stderr.write("Input was neither a file nor directory. Exiting now ...")
        raise RuntimeError

    # create results directory
    outdir = os.path.abspath(outdir) + '/'
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    else:
        sys.stderr.write('Warning: Results directory already exists! Press control-C multiple times repeatedly and panic!\n... or just let the data be overwritten.\n')

    # create logging object
    log_file = outdir + 'Reporter.log'
    logObject = uF.createLoggerObject(log_file)

    # get the sheppard QC directory locations or parent locations provided
    repo_dir = identify_searching_spaces(input, logObject)

    # read meta data and centrifuge results
    meta_columns, metadata, centrifugedata = extract_metadata_and_centrifuge(repo_dir, outdir, logObject)

    if analysis_type == 'basic':
        # run MultiQC and extract some data from the summary text files
        mqc_data, mqc_columns = run_MultiQC(repo_dir, title, outdir, logObject)

        # create high level statistics file for visualization in R Markdown or seeQc Shiny Application
        general_stats_data, general_stats_categories = create_high_level_stats(
            metadata, mqc_data, mqc_columns, meta_columns, centrifugedata, outdir, logObject)

        # determine outliers using MAD approach
        outlier_samps = determine_outliers(general_stats_data, logObject)

        # create Pandas data frame and write to excel spreadsheet.
        generate_excel_spreadsheet(outlier_samps, general_stats_categories, general_stats_data, outdir, logObject)

    elif analysis_type == 'nano-assembly':
        # parse seQuoia repos for GAEMR metrics
        sample_assembly_metrics, sample_GAEMR_htmls, data_headers = parse_nano_assembly_structure(repo_dir, logObject)

        # create high level statistics file for visualization in Shiny Application
        create_high_level_stats_ont(metadata, meta_columns, sample_assembly_metrics, sample_GAEMR_htmls, data_headers, outdir, logObject)
Ejemplo n.º 21
0
def runNanoTrim(nanopore_fastq, sample_dir, sample_name, no_gzip):
    """Run adapter trimming on Nanopore reads in the ``NanoTrim`` workspace.

    Builds a ``Nanopore`` object for the input and calls its ``run_nanotrim``
    method, then writes the checkpoint file ``NANOTRIM.txt`` into
    ``sample_dir``.

    Args:
        nanopore_fastq: Path to a FASTQ file or to a directory.
        sample_dir: Sample directory. It is used as a plain string prefix for
            the checkpoint file, so it must end with a path separator.
        sample_name: Sample identifier passed to the ``Nanopore`` object.
        no_gzip: When true, ``compress=False`` is passed to ``run_nanotrim``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or does not exist.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not nanopore_fastq or not (os.path.isfile(nanopore_fastq)
                                  or os.path.isdir(nanopore_fastq)):
        sys.stderr.write("ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n")
        raise RuntimeError

    # set up directory structure
    workspace_name = "NanoTrim"
    workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'NanoTrim.log'
    logObject = uF.createLoggerObject(log_file)

    # Initialize Nanopore Object
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)

    # Trim any adapters with PoreChop
    NanoporeObj.run_nanotrim(workspace, compress=(not no_gzip))

    # create successful completion file if steps completed!
    with open(sample_dir + "NANOTRIM.txt", 'w') as conf_file:
        conf_file.write("NanoTrim: Module Completed Succesfully!")
Ejemplo n.º 22
0
def runTrimmomatic(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir, trimmomatic_options, cores, no_gzip):
    """Quality-trim single-end or paired-end reads in the ``QualityTrim`` workspace.

    A usable single-end file takes precedence; otherwise the forward/reverse
    pair is used. The work is delegated to the ``quality_trim`` method of a
    ``Fastq`` or ``FastqPaired`` object. On success the checkpoint file
    ``QUALITYTRIM.txt`` is written into ``parent_dir``.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a false value.
        fastq_frw: Path to the forward FASTQ file of a pair, or a false value.
        fastq_rev: Path to the reverse FASTQ file of a pair, or a false value.
        sample_name: Sample identifier passed to the FASTQ object.
        parent_dir: Sample directory. It is used as a plain string prefix for
            the checkpoint file, so it must end with a path separator.
        trimmomatic_options: Option string; surrounding double quotes are
            stripped. ``None`` is treated as an empty string.
        cores: Forwarded to ``quality_trim``.
        no_gzip: When true, ``compress=False`` is passed to ``quality_trim``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    has_single = bool(fastq_sin) and os.path.isfile(fastq_sin)
    has_pair = (bool(fastq_frw and fastq_rev) and os.path.isfile(fastq_frw)
                and os.path.isfile(fastq_rev))

    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (has_single or has_pair):
        sys.stderr.write("ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n")
        raise RuntimeError

    trimmomatic_options = (trimmomatic_options or '').strip('"')

    # set up directory structure
    workspace_name = "QualityTrim"
    workspace = uF.setupDirectory(parent_dir, workspace_name)

    # create logging object
    log_file = workspace + 'QualityTrim.log'
    logObject = uF.createLoggerObject(log_file)

    if has_single:
        # single-end quality trimming
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        # paired-end quality trimming (validation guarantees the pair exists)
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    reads_obj.quality_trim(workspace, options=trimmomatic_options, cores=cores, compress=(not no_gzip))

    # create successful completion file if steps completed!
    with open(parent_dir + "QUALITYTRIM.txt", 'w') as conf_file:
        conf_file.write("QualityTrim: Module Completed Succesfully!")
Ejemplo n.º 23
0
def runMLST(fastq_frw, fastq_rev, sample_name, parent_dir, ariba_db_dir):
    """Run ARIBA-based MLST typing on paired-end reads in the ``MLST`` workspace.

    Builds a ``FastqPaired`` object and calls its ``ariba`` method with
    ``name='ariba_mlst'``. On success the checkpoint file ``MLST.txt`` is
    written into ``parent_dir``.

    Args:
        fastq_frw: Path to the forward FASTQ file.
        fastq_rev: Path to the reverse FASTQ file.
        sample_name: Sample identifier passed to the ``FastqPaired`` object.
        parent_dir: Sample directory. It is used as a plain string prefix for
            the checkpoint file, so it must end with a path separator.
        ariba_db_dir: Directory holding the ARIBA database.

    Raises:
        RuntimeError: If either FASTQ file is missing or ``ariba_db_dir`` is
            not an existing directory.
    """
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (fastq_frw and fastq_rev and os.path.isfile(fastq_frw)
            and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    if not ariba_db_dir or not os.path.isdir(ariba_db_dir):
        sys.stderr.write(
            "ERROR: Some issue occurred with provided databases/options. Please check the input and retry.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace_name = "MLST"
    workspace = uF.setupDirectory(parent_dir, workspace_name)

    # create logging object
    log_file = workspace + 'MLST.log'
    logObject = uF.createLoggerObject(log_file)

    # create FastqPaired Object
    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Run ARIBA analysis to detect STs in raw reads
    FastqPairedObj.ariba(workspace, name='ariba_mlst', ariba_db=ariba_db_dir)

    # create successful completion file if steps completed!
    with open(parent_dir + "MLST.txt", 'w') as conf_file:
        conf_file.write("MLST: Module Completed Succesfully!")
Ejemplo n.º 24
0
def runSubsample(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir,
                 reads, bases, no_gzip):
    """Subsample or downsample reads in the ``Subsample`` workspace.

    A usable single-end file takes precedence; otherwise the forward/reverse
    pair is used. ``reads`` selects ``subsample``; otherwise ``bases`` selects
    ``downsample``; when neither is given the reads are subsampled to the
    100K default announced in the log message. On success the checkpoint file
    ``SUBSAMPLE.txt`` is written into ``parent_dir``.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a false value.
        fastq_frw: Path to the forward FASTQ file of a pair, or a false value.
        fastq_rev: Path to the reverse FASTQ file of a pair, or a false value.
        sample_name: Sample identifier passed to the FASTQ object.
        parent_dir: Sample directory. It is used as a plain string prefix for
            the checkpoint file, so it must end with a path separator.
        reads: Number of reads to keep, or a false value.
        bases: Number of bases to keep, or a false value.
        no_gzip: When true, ``compress=False`` is passed to the sampler.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    default_reads = 100000

    has_single = bool(fastq_sin) and os.path.isfile(fastq_sin)
    has_pair = (bool(fastq_frw and fastq_rev) and os.path.isfile(fastq_frw)
                and os.path.isfile(fastq_rev))

    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (has_single or has_pair):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace_name = "Subsample"
    workspace = uF.setupDirectory(parent_dir, workspace_name)

    # create logging object
    log_file = workspace + 'Subsample.log'
    logObject = uF.createLoggerObject(log_file)

    if has_single:
        # single-end input
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
        fallback_message = "No subsampling quantity specified defaulting to 100K reads being subsampled!"
    else:
        # paired-end input (validation guarantees the pair exists)
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name,
                                logObject)
        fallback_message = "No subsampling quantity specified defaulting to 100K "

    compress = not no_gzip
    if reads:
        reads_obj.subsample(workspace, reads=reads, compress=compress)
    elif bases:
        reads_obj.downsample(workspace, bases=bases, compress=compress)
    else:
        logObject.error(fallback_message)
        # Previously the falsy ``reads`` value was forwarded here, so the
        # announced default was never applied.
        # NOTE(review): assumes ``subsample`` accepts an integer read count - confirm.
        reads_obj.subsample(workspace, reads=default_reads, compress=compress)

    # create successful completion file if steps completed!
    with open(parent_dir + "SUBSAMPLE.txt", 'w') as conf_file:
        conf_file.write("Subsample: Module Completed Succesfully!")
def reorganize(sample_dir, reorganize, sample_name):
    """Optionally tidy a finished sample directory, then mark it complete.

    When ``reorganize`` is true:

    1. For each assembly run (``full-np``, ``sub-np``, ``canu``) the
       ``GAEMR_<run>`` and ``Assembly_MLST_<run>`` directories are moved to a
       run-specific location under ``Assembly_Results/``.
    2. Every other sub-directory of ``sample_dir`` is moved into
       ``Intermediate_Subdirectories/``.
    3. Every ``*.txt`` file directly in ``sample_dir`` is moved into
       ``Checkpoint_Files/``.
    4. ``REORGANIZATION.txt`` is written.

    ``COMPLETION.txt`` is always written last.

    Args:
        sample_dir: Existing sample directory.
        reorganize: Whether to perform the reorganization steps.
        sample_name: Sample identifier, used only for logging.

    Raises:
        RuntimeError: If ``sample_dir`` does not exist, or if moving the
            intermediate directories or checkpoint files fails.
    """
    # Local import: the file's top-level import block is not part of this unit.
    import subprocess

    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not sample_dir or not os.path.isdir(sample_dir):
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError

    sample_dir = os.path.abspath(sample_dir) + '/'

    def _move(source, destination):
        """Move *source* to *destination* with ``mv``; return True on success."""
        # An argument list (no shell) keeps paths with spaces intact, and the
        # exit status lets callers notice failures that os.system() hid.
        return subprocess.call(['mv', source, destination]) == 0

    if reorganize:
        # set up directory structure
        workspace_name = "Assembly_Results/"
        workspace = uF.setupDirectory(sample_dir,
                                      workspace_name,
                                      panic_if_exists=False)

        # create logging object
        log_file = workspace + 'Reorganization.log'
        logObject = uF.createLoggerObject(log_file)

        logObject.info("Creating easy upload formats for sample %s",
                       sample_name)
        logObject.info("-" * 80)

        runs_and_names = [
            ('full-np', 'Unicycler_All-ONT'),
            ('sub-np', 'Unicycler_Subsampled-ONT'),
            ('canu', 'Canu_Pure-ONT'),
        ]
        # (label used in the info message, directory prefix in sample_dir)
        result_folders = [('GAEMR', 'GAEMR_'), ('MLST', 'Assembly_MLST_')]

        for run, run_name in runs_and_names:
            new_location = os.path.abspath(workspace + run_name)

            # De Novo Assembly + QC storage first, then MLST results. Both go
            # to the same target: the second ``mv`` lands inside the
            # directory created by the first.
            for folder_label, dir_prefix in result_folders:
                logObject.info(
                    'Moving %s folder for run %s to results directory.' %
                    (folder_label, run))
                logObject.info('-' * 80)

                original_dir = os.path.abspath(sample_dir + dir_prefix +
                                               run) + '/'
                # Best effort: a missing or unmovable folder is only a warning.
                if not (os.path.isdir(original_dir)
                        and _move(original_dir, new_location)):
                    logObject.warning(
                        'Unable to move %s directory for run %s to results directory.'
                        % (dir_prefix.rstrip('_'), run_name))

        logObject.info('*' * 80)

        intermediate_workspace_name = 'Intermediate_Subdirectories/'
        intermediate_workspace = uF.setupDirectory(
            sample_dir, intermediate_workspace_name)

        logObject.info(
            "Moving intermediate subdirectories of workflow to directory %s" %
            intermediate_workspace)

        try:
            for sub in os.listdir(sample_dir):
                sub_dir = os.path.abspath(sample_dir + sub) + '/'
                if not os.path.isdir(sub_dir) or sub == 'Assembly_Results':
                    continue
                # The destination also lives in sample_dir; never try to move
                # it into itself.
                if os.path.abspath(sub_dir) == os.path.abspath(
                        intermediate_workspace):
                    continue
                if not _move(sub_dir, intermediate_workspace):
                    raise OSError('mv failed for %s' % sub_dir)
        except Exception:
            logObject.error(
                "Something went wrong when moving intermediate directories.")
            raise RuntimeError()

        logObject.info('*' * 80)

        checkpoint_workspace_name = 'Checkpoint_Files/'
        checkpoint_workspace = uF.setupDirectory(sample_dir,
                                                 checkpoint_workspace_name)

        logObject.info("Moving checkpoint files of workflow to directory %s" %
                       checkpoint_workspace)

        try:
            for f in os.listdir(sample_dir):
                checkpoint_file = os.path.abspath(sample_dir + f)
                if os.path.isfile(checkpoint_file) and f.endswith('.txt'):
                    if not _move(checkpoint_file, checkpoint_workspace):
                        raise OSError('mv failed for %s' % checkpoint_file)
        except Exception:
            logObject.error(
                "Something went wrong when moving checkpoint files.")
            raise RuntimeError()

        uF.closeLoggerObject(logObject)

        # create successful reorganization file if steps completed!
        with open(sample_dir + "REORGANIZATION.txt", 'w') as conf_file:
            conf_file.write("Reorganization was Completed Successfully!")

    # create successful completion file if steps completed!
    with open(sample_dir + "COMPLETION.txt", 'w') as conf_file:
        conf_file.write("Completion: Module Completed Successfully!")
Ejemplo n.º 26
0
def reorganize(sample_dir):
    """Collect per-sample workflow outputs under ``<sample_dir>/LSARP_Results/``.

    Each section below is best effort: if its inputs are missing or a step
    fails, an error is logged and the next section still runs.

    * FastQC: every ``fastqc_data.txt`` found in the ``FastQC/*.zip`` archives
      is parsed with ``Fadapa`` and written as one tab-separated table per
      FastQC module, with ``sample`` and ``read`` columns prepended.
    * Centrifuge: the Centrifuge report and the Kraken-style report are
      joined on taxonomy name into ``centrifuge_report.table.txt``.
    * AMRP: each ``AMRP/<sub>/report.tsv`` is copied to ``AMRP_Searches/``.
    * MLST: ``MLST/ariba_mlst/mlst_report.tsv`` is copied to ``MLST/``.
    * Assembly: ``assembly.fasta`` (or ``scaffolds.fasta``) is copied to
      ``Assembly/<sample>.genome.fa``.
    * Assembly QC: ``GAEMR/QC/`` is copied recursively to ``Assembly_QC/``.
    * Pilon: ``Pilon/results/`` is copied recursively to
      ``Reference_Assembly_and_Variant_Calling/`` and its files are gzipped.
    * StrainGST: ``<sample>.straingst_result.tsv`` is copied to ``StrainGST/``.

    Finally the checkpoint file ``LSARP.txt`` is written into ``sample_dir``.

    Args:
        sample_dir: Existing sample directory; its base name is used as the
            sample identifier.

    Raises:
        RuntimeError: If ``sample_dir`` does not exist.
    """
    # Local import: the file's top-level import block is not part of this unit.
    import subprocess

    def _require(condition, message):
        """Raise RuntimeError unless *condition* holds (an assert that survives -O)."""
        if not condition:
            raise RuntimeError(message)

    def _run(command):
        """Run *command* (an argument list, no shell); raise on non-zero exit."""
        if subprocess.call(command) != 0:
            raise RuntimeError('Command failed: %s' % ' '.join(command))

    if not sample_dir or not os.path.isdir(sample_dir):
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError

    sample_dir = os.path.abspath(sample_dir) + '/'

    # set up directory structure
    workspace_name = "LSARP_Results/"
    workspace = sample_dir + workspace_name
    if not os.path.isdir(workspace):
        workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'LSARP_Table_Creation.log'
    logObject = uF.createLoggerObject(log_file)

    # sample_dir ends with '/', so its own name is the second-to-last piece.
    sample = sample_dir.split('/')[-2]
    logObject.info("Creating easy upload formats for sample %s", sample)
    logObject.info("-" * 80)

    # FASTQC Tables

    logObject.info('Creating FastQC Data Tables.')
    logObject.info('-' * 80)

    FastQC_results = 'FastQC/'
    FastQC_results_workspace = workspace + FastQC_results
    fastqc_modules = [
        'Per base sequence quality', 'Per tile sequence quality',
        'Per sequence quality scores', 'Per base sequence content',
        'Per sequence GC content', 'Per base N content',
        'Sequence Length Distribution', 'Sequence Duplication Levels',
        'Overrepresented sequences', 'Adapter Content'
    ]
    fastqc_dir = sample_dir + 'FastQC/'

    try:
        # Sorted so rows from several archives land in a stable order.
        fastqc_zipped_data_dirs = sorted(
            fastqc_dir + zd for zd in os.listdir(fastqc_dir)
            if zd.endswith('.zip'))
        _require(fastqc_zipped_data_dirs, 'no FastQC archives found')
        _require(all(os.path.isfile(zd) for zd in fastqc_zipped_data_dirs),
                 'FastQC archive is not a regular file')
        if not os.path.isdir(FastQC_results_workspace):
            FastQC_results_workspace = uF.setupDirectory(
                workspace, FastQC_results)
    except Exception:
        logObject.error(
            'No FastQC results available or path is unable to be determined!')
    else:
        tmp_path = FastQC_results_workspace + 'tmp.txt'
        # Tables already started in this call. Later archives append to them;
        # reopening with 'w' would let one read's rows overwrite the other's.
        started_tables = set()
        for zd in fastqc_zipped_data_dirs:
            try:
                # Archive names look like <sample>_<read>_fastqc.zip.
                read_label = os.path.basename(zd).split(
                    sample + '_')[1].split('_fastqc.zip')[0].split('.')[0]
            except IndexError:
                logObject.warning(
                    'Unable to determine read label for FastQC archive %s; skipping.'
                    % zd)
                continue
            with zipfile.ZipFile(zd) as z:
                for filename in z.namelist():
                    if filename.split('/')[-1] != 'fastqc_data.txt':
                        continue
                    # Fadapa is given a path, so extract the member to a
                    # scratch file first.
                    with z.open(filename) as fh, open(tmp_path,
                                                      'wb') as tmp_out:
                        for line in fh:
                            tmp_out.write(line)
                    fadapa = Fadapa(tmp_path)
                    for module in fastqc_modules:
                        table_path = (FastQC_results_workspace +
                                      '_'.join(module.split()) + '.table.txt')
                        try:
                            cleaned_module_data = fadapa.clean_data(module)
                            if not cleaned_module_data:
                                continue
                            appending = table_path in started_tables
                            with open(table_path,
                                      'a' if appending else 'w') as table_handle:
                                for i, split_line in enumerate(
                                        cleaned_module_data):
                                    if i == 0:
                                        # Header row: written only once per table.
                                        if appending:
                                            continue
                                        row = ['sample', 'read'] + split_line
                                    else:
                                        row = [sample, read_label] + split_line
                                    table_handle.write('\t'.join(row) + '\n')
                            started_tables.add(table_path)
                        except Exception:
                            # Best effort per module, but leave a trace
                            # instead of swallowing the failure silently.
                            logObject.warning(
                                'Unable to create FastQC table for module "%s" from %s.'
                                % (module, zd))
                    if os.path.isfile(tmp_path):
                        os.remove(tmp_path)
    logObject.info('*' * 80)

    # Centrifuge Tables

    logObject.info('Creating Centrifuge Data Tables.')
    logObject.info('-' * 80)

    Centrifuge_results = 'Centrifuge/'
    Centrifuge_results_workspace = workspace + Centrifuge_results

    centrifuge_report_file = (sample_dir + 'Centrifuge/' + sample +
                              '_centrifuge_report.tsv')
    kraken_report_file = (sample_dir + 'Centrifuge/' + sample +
                          '_centrifuge_kraken_report.txt')

    try:
        _require(
            os.path.isfile(centrifuge_report_file)
            and os.path.isfile(kraken_report_file),
            'Centrifuge report files are missing')
        if not os.path.isdir(Centrifuge_results_workspace):
            Centrifuge_results_workspace = uF.setupDirectory(
                workspace, Centrifuge_results)

        # taxonomy name -> [taxID, taxRank, genomeSize, numReads,
        #                   numUniqueReads, abundance]
        centrifuge_report_data = {}
        with open(centrifuge_report_file) as report_handle:
            for i, line in enumerate(report_handle):
                if i == 0:
                    continue  # header line
                fields = line.rstrip('\n').split('\t')
                _require(len(fields) == 7,
                         'unexpected Centrifuge report line')
                centrifuge_report_data[fields[0]] = fields[1:]

        header = [
            'sample', 'taxonomy_name', 'taxonomy_level', 'taxonomy_rank',
            'taxonomy_id', 'genome_size', 'centrifuge_abundance',
            'percentage_of_fragments_recursively_covered',
            'number_of_fragments_recursively_included',
            'number_of_fragments_direct'
        ]
        centrifuge_report_table_file = (Centrifuge_results_workspace +
                                        'centrifuge_report.table.txt')
        with open(centrifuge_report_table_file, 'w') as table_handle, \
                open(kraken_report_file) as kraken_handle:
            table_handle.write('\t'.join(header) + '\n')
            for line in kraken_handle:
                fields = line.rstrip('\n').split()
                _require(len(fields) >= 5, 'unexpected Kraken report line')
                prop, frag_recurse, frag_direct, tax_level = fields[:4]
                tax = ' '.join(fields[5:]).strip()
                # Taxa absent from the Centrifuge report get 'NA' placeholders.
                taxID, taxRank, genomeSize, _, _, abundance = \
                    centrifuge_report_data.get(tax, ['NA'] * 6)
                table_handle.write('\t'.join([
                    sample, tax, tax_level, taxRank, taxID, genomeSize,
                    abundance, prop, frag_recurse, frag_direct
                ]) + '\n')
    except Exception:
        logObject.error('No Centrifuge results available!')

    logObject.info('*' * 80)

    # AMRP Tables

    logObject.info('Moving Results from ARIBA and ShortBRED AMR Searches.')
    logObject.info('-' * 80)

    AMRP_results = 'AMRP_Searches/'
    AMRP_results_workspace = workspace + AMRP_results

    try:
        AMRP_dir = sample_dir + 'AMRP/'
        _require(os.path.isdir(AMRP_dir), 'AMRP directory is missing')
        # Check the results workspace itself. The source directory is an
        # absolute path and must not be appended to the workspace.
        if not os.path.isdir(AMRP_results_workspace):
            AMRP_results_workspace = uF.setupDirectory(workspace, AMRP_results)
        for sd in os.listdir(AMRP_dir):
            ariba_report = AMRP_dir + sd + '/' + 'report.tsv'
            if os.path.isfile(ariba_report):
                ariba_result = (AMRP_results_workspace + sample + '_' + sd +
                                '_ariba_results.txt')
                _run(['cp', ariba_report, ariba_result])
    except Exception:
        logObject.error('Unable to create AMR prediction data tables.')

    logObject.info('*' * 80)

    # MLST Tables

    logObject.info('Creating MLST Data Tables.')
    logObject.info('-' * 80)

    MLST_results = 'MLST/'
    MLST_results_workspace = workspace + MLST_results

    try:
        MLST_dir = sample_dir + 'MLST/'
        MLST_result_file = MLST_dir + 'ariba_mlst/mlst_report.tsv'
        _require(os.path.isfile(MLST_result_file), 'MLST report is missing')

        if not os.path.isdir(MLST_results_workspace):
            MLST_results_workspace = uF.setupDirectory(workspace, MLST_results)
        _run(['cp', MLST_result_file, MLST_results_workspace])
    except Exception:
        logObject.error('Unable to create MLST call data tables.')

    logObject.info('*' * 80)

    # De Novo Assembly Storage

    logObject.info('Moving de novo assembly to results directory.')
    logObject.info('-' * 80)

    Assembly_results = 'Assembly/'
    Assembly_results_workspace = workspace + Assembly_results

    try:
        Assembly_dir = sample_dir + 'Assembly/'
        Assembly_original_location = Assembly_dir + 'assembly.fasta'
        if not os.path.isfile(Assembly_original_location):
            Assembly_original_location = Assembly_dir + 'scaffolds.fasta'
        _require(os.path.isfile(Assembly_original_location),
                 'assembly FASTA is missing')
        if not os.path.isdir(Assembly_results_workspace):
            Assembly_results_workspace = uF.setupDirectory(
                workspace, Assembly_results)
        Assembly_new_location = (Assembly_results_workspace + sample +
                                 '.genome.fa')
        _run(['cp', Assembly_original_location, Assembly_new_location])
    except Exception:
        logObject.error('Unable to move assembly to results directory.')

    logObject.info('*' * 80)

    # Assembly QC Storage

    logObject.info('Moving GAEMR assembly QC to results directory.')
    logObject.info('-' * 80)

    try:
        Assembly_QC_new_location = workspace + 'Assembly_QC/'
        Assembly_QC_original_dir = sample_dir + 'GAEMR/QC/'
        _require(os.path.isdir(Assembly_QC_original_dir),
                 'GAEMR QC directory is missing')
        _run(['cp', '-r', Assembly_QC_original_dir, Assembly_QC_new_location])
    except Exception:
        logObject.error(
            'Unable to move GAEMR assembly QC to results directory.')

    logObject.info('*' * 80)

    # Pilon Results Storage

    logObject.info('Moving Pilon output to results directory.')
    logObject.info('-' * 80)

    try:
        Pilon_new_dir = workspace + 'Reference_Assembly_and_Variant_Calling/'
        Pilon_original_dir = sample_dir + 'Pilon/results/'
        _require(os.path.isdir(Pilon_original_dir),
                 'Pilon results directory is missing')
        _run(['cp', '-r', Pilon_original_dir, Pilon_new_dir])
        # Same selection as the shell glob '<dir>*' minus the entries gzip
        # would refuse anyway (directories, already-compressed files).
        pilon_outputs = [
            Pilon_new_dir + entry
            for entry in sorted(os.listdir(Pilon_new_dir))
            if not entry.startswith('.') and not entry.endswith('.gz')
            and os.path.isfile(Pilon_new_dir + entry)
        ]
        if pilon_outputs:
            _run(['gzip'] + pilon_outputs)
    except Exception:
        logObject.error('Unable to move Pilon output to results directory.')

    logObject.info('*' * 80)

    # StrainGST Results Storage

    logObject.info('Moving StrainGST output to results directory.')
    logObject.info('-' * 80)

    try:
        Straingst_result_file = (sample_dir + 'StrainGST/' + sample +
                                 '.straingst_result.tsv')
        _require(os.path.isfile(Straingst_result_file),
                 'StrainGST result is missing')
        Straingst_new_dir = 'StrainGST/'
        Straingst_results_workspace = workspace + Straingst_new_dir
        if not os.path.isdir(Straingst_results_workspace):
            Straingst_results_workspace = uF.setupDirectory(
                workspace, Straingst_new_dir)
        _run(['cp', Straingst_result_file, Straingst_results_workspace])
    except Exception:
        logObject.error(
            'Unable to move StrainGST output to results directory.')

    logObject.info('*' * 80)

    uF.closeLoggerObject(logObject)

    # create successful completion file if steps completed!
    with open(sample_dir + "LSARP.txt", 'w') as conf_file:
        conf_file.write("LSARP Table Creation: Module Completed Succesfully!")
Ejemplo n.º 27
0
def runAdapterTrim(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir,
                   trimgalore_options, cutadapt_options):
    """Trim adapters from single-end or paired-end reads in ``AdapterTrim``.

    A usable single-end file takes precedence; otherwise the forward/reverse
    pair is used. If ``cutadapt_options`` is non-empty the
    ``cutadapt_adapter_trim`` method is called; otherwise
    ``trim_galore_adapter_trim`` is called with ``trimgalore_options``. On
    success the checkpoint file ``ADAPTERTRIM.txt`` is written into
    ``parent_dir``.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a false value.
        fastq_frw: Path to the forward FASTQ file of a pair, or a false value.
        fastq_rev: Path to the reverse FASTQ file of a pair, or a false value.
        sample_name: Sample identifier passed to the FASTQ object.
        parent_dir: Sample directory. It is used as a plain string prefix for
            the checkpoint file, so it must end with a path separator.
        trimgalore_options: Option string for Trim Galore; surrounding double
            quotes are stripped. ``None`` is treated as an empty string.
        cutadapt_options: Option string for cutadapt; handled the same way.

    Raises:
        RuntimeError: If no usable FASTQ input exists, or if both option
            strings are non-empty.
    """
    has_single = bool(fastq_sin) and os.path.isfile(fastq_sin)
    has_pair = (bool(fastq_frw and fastq_rev) and os.path.isfile(fastq_frw)
                and os.path.isfile(fastq_rev))

    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (has_single or has_pair):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    trimgalore_options = (trimgalore_options or '').strip('"')
    cutadapt_options = (cutadapt_options or '').strip('"')

    if trimgalore_options and cutadapt_options:
        sys.stderr.write(
            "ERROR: Both filtering options with cutadapt and trim galore provided. Can only use one adapter trimmer. Exiting now ...\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace_name = "AdapterTrim"
    workspace = uF.setupDirectory(parent_dir, workspace_name)

    # create logging object
    log_file = workspace + 'AdapterTrim.log'
    logObject = uF.createLoggerObject(log_file)

    if has_single:
        # single-end input
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        # paired-end input (validation guarantees the pair exists)
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name,
                                logObject)

    # cutadapt is used only when explicitly configured; Trim Galore is the
    # default trimmer.
    if cutadapt_options:
        reads_obj.cutadapt_adapter_trim(workspace, options=cutadapt_options)
    else:
        reads_obj.trim_galore_adapter_trim(workspace,
                                           options=trimgalore_options)

    # create successful completion file if steps completed!
    with open(parent_dir + "ADAPTERTRIM.txt", 'w') as conf_file:
        conf_file.write("AdapterTrim: Module Completed Succesfully!")
Ejemplo n.º 28
0
def cleanup(suffix_file, sample_dir):
    """Delete files under ``sample_dir`` whose suffix is listed for purging.

    Each non-blank line of the suffix file has the form
    ``<group> <suffix> [<suffix> ...]``. A file is purged when one of its
    dot-suffixes (for example ``.gz`` or ``.fastq.gz``) is listed for a group
    and that group is either ``all`` or matches, case-insensitively, the name
    of the directory holding the file. Files whose path contains ``/Input/``
    or ``/KneadData/`` are never purged. Actions are logged to
    ``<sample_dir>/CLEANUP.txt``.

    Args:
        suffix_file: Path to the suffix file; a false value selects
            ``DEFAULT_SUFFIX_PURGE_FILE``.
        sample_dir: Existing sample directory to clean.

    Raises:
        RuntimeError: If ``sample_dir`` does not exist, or ``suffix_file`` is
            given but is not an existing file.
    """
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not sample_dir or not os.path.isdir(sample_dir):
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError
    if suffix_file and not os.path.isfile(suffix_file):
        sys.stderr.write(
            "ERROR: Input file for purging filetypes doesn't exist. Exiting now ...\n"
        )
        raise RuntimeError

    sample_dir = os.path.abspath(sample_dir) + '/'

    if not suffix_file:
        suffix_file = DEFAULT_SUFFIX_PURGE_FILE

    # create logging object
    log_file = sample_dir + 'CLEANUP.txt'
    logObject = uF.createLoggerObject(log_file)

    # sample_dir ends with '/', so its own name is the second-to-last piece
    # (index -3 would be the parent directory).
    sample = sample_dir.split('/')[-2]
    logObject.info("Cleanup initiating for sample %s", sample)
    logObject.info("-" * 80)

    # lower-cased group name -> set of suffixes to purge
    purging_filetypes = {}
    # Largest number of dot-separated pieces in any listed suffix; bounds how
    # many trailing pieces of a file name need to be examined.
    dot_depth = 0
    with open(suffix_file) as osf:
        for line in osf:
            ls = line.split()
            if not ls:
                continue  # tolerate blank lines
            grouping = ls[0].lower()
            for suffix in ls[1:]:
                purging_filetypes.setdefault(grouping, set()).add(suffix)
                dot_depth = max(dot_depth, len(suffix.split('.')))

    for subdir, dirs, files in os.walk(sample_dir):
        # Last path component, not a whitespace split of the whole path.
        subdir_name = os.path.basename(os.path.abspath(subdir)).lower()
        for file_name in files:
            full_file_path = os.path.join(subdir, file_name)
            if '/Input/' in full_file_path or '/KneadData/' in full_file_path:
                continue

            # Every trailing dot-suffix of the name, up to dot_depth pieces:
            # 'a.fastq.gz' -> {'.gz', '.fastq.gz'}
            pieces = file_name.split('.')
            usable_depth = min(dot_depth, len(pieces) - 1)
            file_ends = set('.' + '.'.join(pieces[-k:])
                            for k in range(1, usable_depth + 1))

            for grouping, suffixes in purging_filetypes.items():
                if file_ends & suffixes and grouping in ('all', subdir_name):
                    logObject.info('purging file %s' % full_file_path)
                    try:
                        # os.remove avoids the shell, so paths containing
                        # spaces cannot be mis-split into other targets.
                        os.remove(full_file_path)
                    except OSError:
                        logObject.warning('Unable to purge file %s' %
                                          full_file_path)
                    break  # the file is handled; skip the remaining groups

    uF.closeLoggerObject(logObject)
Ejemplo n.º 29
0
def seQuoiaSheppard(meta, illumina_data, illumina_data_format, nanopore, workflow, outdir, poolsize, cluster, project, config, sample_config):
    """Drive a batch seQuoia run: validate inputs, prepare per-sample jobs and run them in a process pool.

    The function always terminates the interpreter via ``sys.exit``: status 0 after
    the pool finishes, status 1 on any validation or submission failure.

    Args:
        meta: Path to the meta-specifying input file; must be an existing file.
        illumina_data: Path to a file or directory describing Illumina data (optional).
        illumina_data_format: One of 'illumina-paired', 'illumina-single',
            'gp-directory' or 'bam'. Required whenever ``illumina_data`` is given.
        nanopore: Path to a file describing Nanopore data (optional).
        workflow: Path to the workflow file; it is copied into ``outdir``.
        outdir: Results directory. Created if missing, reused if it already exists.
        poolsize: Upper bound on the number of worker processes.
        cluster: Passed through to ``clearAnyZombieJobs`` and ``prepare_for_submission``.
        project: Passed through to ``prepare_for_submission``.
        config: Optional config file, mutually exclusive with ``sample_config``.
        sample_config: Optional sample-specific config file.
    """
    # Explicit checks instead of `assert`, which is stripped when running under -O.
    if not meta or not os.path.isfile(meta):
        sys.stderr.write("ERROR: Unable to locate input meta-specifying data file. Now exiting ...\n")
        sys.exit(1)

    meta_file = os.path.abspath(meta)
    outdir = os.path.abspath(outdir) + '/'

    # set up batch analysis directory structure
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    else:
        sys.stderr.write('-*-'*20 + '\n')
        sys.stderr.write('Warning: Results directory already exists! Will delete failed steps for samples, and pick up what needs to still be run!\n')
        sys.stderr.write('-*-'*20 + '\n')

    # create logging object
    log_file = outdir + 'seQc.log'
    logObject = uF.createLoggerObject(log_file)

    # clear any currently running zombie jobs associated with the seQuoia output repository
    # and capture version information for external software
    clearAnyZombieJobs(outdir, cluster, logObject, wait=30)
    capture_external_versioning(outdir, logObject)

    logObject.info("Starting batch seQuoia analysis for meta-input file at: %s" % meta_file)
    sys.stdout.write("Starting batch seQuoia analysis for meta-input file at:\n%s\n" % meta_file)

    # Parse configuration file
    config_params_dict = {}
    try:
        if not os.path.isfile(workflow):
            raise IOError("workflow file not found: %s" % workflow)
        os.system('cp %s %s' % (workflow, outdir))
        if config and sample_config:
            logObject.error("Please provide only a config or sample_config file. The simultaneous use of both is not supported. Exiting now ...")
            sys.exit(1)
        elif config:
            if not os.path.isfile(config):
                raise IOError("config file not found: %s" % config)
            logObject.info("The following workflow is being run with config parameters: %s, making copy for future reference." % workflow)
            logObject.info("Config parameters provided in %s. Also copied to seQc repo directory." % config)
            os.system('cp %s %s' % (config, outdir))
            config_params_dict = read_config_file(config, workflow, logObject)
        elif sample_config:
            if not os.path.isfile(sample_config):
                raise IOError("sample config file not found: %s" % sample_config)
            logObject.info("The following workflow is being run with sample specfic config parameters: %s, making copy for future reference." % workflow)
            logObject.info("Config parameters provided in %s. Also copied to seQc repo directory." % sample_config)
            os.system('cp %s %s' % (sample_config, outdir))
            config_params_dict = read_sample_config_file(sample_config, workflow, logObject)
        else:
            logObject.info("The following workflow is being run with default parameters: %s, making copy for future reference." % workflow)
    except Exception:
        # `except Exception` lets the SystemExit raised above pass through
        # untouched instead of being re-reported with an unrelated message.
        logObject.error("Problem locating workflow or reading configurations file. Exiting now ...")
        sys.exit(1)

    # Parse Illumina input sequencing data
    illumina_present = False
    nanopore_present = False
    il_data = None
    np_data = None
    if illumina_data or illumina_data_format:
        # the datatype must be a valid specification and the path must exist
        valid_illumina_datatypes = set(['illumina-paired', 'illumina-single', 'gp-directory', 'bam'])
        if (illumina_data_format not in valid_illumina_datatypes or not illumina_data
                or not (os.path.isfile(illumina_data) or os.path.isdir(illumina_data))):
            logObject.error("The datatype of the short read sequencing data is not a valid entry or illumina data path provided did not correspond to a file / directory. Please fix and retry. Exiting now ...\n")
            sys.exit(1)
        il_data = os.path.abspath(illumina_data)
        if os.path.isdir(il_data):
            il_data += '/'
        illumina_present = True
        logObject.info("Short read Illumina data specified in %s format.\nLocation of this data is provided/listed at: %s" % (illumina_data_format, il_data))
        sys.stdout.write("Short read Illumina data specified in %s format.\nLocation of this data is provided/listed at:\n%s\n" % (illumina_data_format, il_data))

    # Parse Oxford Nanopore input sequencing data.
    if nanopore:
        if not os.path.isfile(nanopore):
            logObject.error("Nanopore data is not provided in expected format. Exiting now ...")
            sys.exit(1)
        np_data = os.path.abspath(nanopore)
        nanopore_present = True
        logObject.info("Nanopore data provided and can be found listed at: %s" % np_data)
        sys.stdout.write("Nanopore data provided and can be found listed at:\n%s\n" % np_data)

    # read input data file and store meta information for each strain.
    metadata_parsed = read_and_store_metadata(meta_file, logObject)

    # match samples specified in input file to illumina sequencing data files.
    ildata_parsed = None
    if illumina_present:
        ildata_parsed = process_illumina_data(il_data, illumina_data_format, metadata_parsed, nanopore_present, logObject)

    # match samples specified in input file to nanopore sequencing data files.
    npdata_parsed = None
    if nanopore_present:
        npdata_parsed = process_nanopore_data(np_data, metadata_parsed, logObject)

    # prepare for submission of workflows with multiprocessing pool
    sample_data, sample_runs = prepare_for_submission(cluster, project, outdir, ildata_parsed, npdata_parsed, metadata_parsed, workflow, config_params_dict)

    # catalog multiple runs for same sample with different parameters, if needed.
    sample_to_runs_file = outdir + 'sample_to_runs.txt'
    with open(sample_to_runs_file, 'w') as sample_to_runs_handle:
        for sample, runs in sample_runs.items():
            if len(runs) > 1:
                for run in runs:
                    sample_to_runs_handle.write(sample + '\t' + run[0] + '\t' + run[1] + '\n')
    # drop the catalog when nothing meaningful was written to it
    if os.path.getsize(sample_to_runs_file) < 10:
        os.remove(sample_to_runs_file)

    # create pool and submit data to pool worker
    logObject.info("Starting pool submission!!!")
    sys.stdout.write("Starting pool submission!!!\n")

    # never ask for more workers than samples, but Pool() requires at least one
    poolsize = max(1, min(poolsize, len(sample_data)))

    p = multiprocessing.Pool(poolsize)

    # The map is issued exactly once, inside the handler, so that an interrupt
    # during the run still triggers cleanup of submitted jobs.
    try:
        p.map(workflow_process, sample_data)
    except (KeyboardInterrupt, Exception) as err:
        clearAnyZombieJobs(outdir, cluster, logObject, wait=0)
        if isinstance(err, KeyboardInterrupt):
            logObject.error("User prompted KeyboardInterrupt! Running jobs successfully killed, Exiting!")
        else:
            logObject.error("Pool submission failed with error: %s. Running jobs killed, Exiting!" % err)
        p.close()
        sys.exit(1)
    else:
        p.close()

    # seQcSheppard exiting!
    logObject.info("Done!")
    sys.stdout.write("\nDone!\n")
    sys.exit(0)
Ejemplo n.º 30
0
def _mean_fastq_read_length(fastq_path, default, max_line_index=None):
    """Return the mean stripped length of the sequence lines in a FASTQ file.

    The second line of every four-line record (0-based line index ``i`` with
    ``i % 4 == 1``) is treated as the sequence line. Files whose name ends in
    ``.gz`` are read through ``gzip``. Reading stops once the line index exceeds
    ``max_line_index`` (when given). ``default`` is returned if no sequence
    line was seen.
    """
    opener = gzip.open if fastq_path.endswith(".gz") else open
    total = 0
    count = 0
    with opener(fastq_path, 'rt') as ofr:
        for i, line in enumerate(ofr):
            if max_line_index is not None and i > max_line_index:
                break
            if i % 4 == 1:
                total += len(line.strip())
                count += 1
    if count == 0:
        return default
    return total / float(count)


def _parse_insert_size(metrics_file, default):
    """Return the insert size recorded in a Picard-style insert size metrics file.

    The value is taken from the sixth tab-separated field of the first line
    following the header line that starts with ``MEDIAN_INSERT_SIZE``.
    ``default`` is returned when the header or a parsable value is missing.
    """
    header_observed = False
    with open(metrics_file) as oisf:
        for line in oisf:
            fields = line.strip().split('\t')
            if fields[0].startswith("MEDIAN_INSERT_SIZE"):
                header_observed = True
                continue
            if header_observed:
                try:
                    return int(float(fields[5]))
                except (IndexError, ValueError):
                    return default
    return default


def runAssemblyQC(assembly, sample_name, sample_dir, format_options,
                  qc_options, cores, illumina_frw, illumina_rev,
                  picard_insert_file, nanopore_fastq, gaemr_ont, identifier):
    """Format an assembly for GAEMR, build its read list file and run GAEMR QC.

    Args:
        assembly: Path to the assembly file; must exist.
        sample_name: Sample name handed to the Assembly object.
        sample_dir: Sample directory path used as a string prefix (callers are
            expected to pass it with a trailing '/').
        format_options: Options string for the GAEMR formatter; surrounding
            double quotes are stripped.
        qc_options: Options string for GAEMR QC; surrounding double quotes are
            stripped.
        cores: Core count forwarded to the QC step.
        illumina_frw: Optional forward Illumina FASTQ (plain or .gz).
        illumina_rev: Optional reverse Illumina FASTQ.
        picard_insert_file: Optional insert size metrics file.
        nanopore_fastq: Optional Nanopore FASTQ; must exist when given.
        gaemr_ont: When truthy together with ``nanopore_fastq``, the long reads
            are listed and the ONT QC variant is run.
        identifier: Optional suffix for the workspace and completion file names.

    Raises:
        RuntimeError: If the assembly is missing, or a Nanopore FASTQ path was
            given but does not exist.
    """
    # Explicit checks instead of `assert`, which is stripped when running under -O.
    if not assembly or not os.path.isfile(assembly):
        sys.stderr.write("ERROR: Assembly does not exist. Raising exception\n")
        raise RuntimeError

    if nanopore_fastq and not os.path.isfile(nanopore_fastq):
        sys.stderr.write(
            "ERROR: Nanopore FASTQ provided but the path does not exist. Raising exception\n"
        )
        raise RuntimeError

    format_options = format_options.strip('"')
    qc_options = qc_options.strip('"')

    # set up directory structure
    workspace_name = "GAEMR"
    if identifier:
        workspace_name += '_' + identifier
    workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'GAEMR.log'
    logObject = uF.createLoggerObject(log_file)

    # Format Assembly for GAEMR QC Analysis
    workspace_a = uF.setupDirectory(workspace, "Formatting")
    AssemblyObj = AssemblyAnalyzer.Assembly(assembly, sample_name, logObject)
    AssemblyObj.run_gaemr_formatter(workspace_a,
                                    options=format_options,
                                    reference_change=True)

    # Fallback values used whenever they cannot be estimated from the inputs.
    frw_read = None
    rev_read = None
    illumina_avg_read_length = 250
    illumina_avg_insert_length = 400
    # Read lengths are estimated from FASTQ lines with index <= 40000 only.
    illumina_line_limit = 40000

    if illumina_frw and illumina_rev and os.path.isfile(
            illumina_frw) and os.path.isfile(illumina_rev):
        frw_read = illumina_frw
        rev_read = illumina_rev
        illumina_avg_read_length = _mean_fastq_read_length(
            frw_read, illumina_avg_read_length, illumina_line_limit)

    elif os.path.isdir(sample_dir + 'Subsample'):
        subsample_dir = sample_dir + 'Subsample/'
        # sorted so the chosen file does not depend on directory listing order
        fastqs = sorted(x for x in os.listdir(subsample_dir)
                        if x.endswith(('.fastq.gz', '.fastq')))
        frw_hits = [subsample_dir + x for x in fastqs if '_R1.' in x]
        rev_hits = [subsample_dir + x for x in fastqs if '_R2.' in x]
        if frw_hits and rev_hits:
            frw_read = frw_hits[0]
            rev_read = rev_hits[0]
            illumina_avg_read_length = _mean_fastq_read_length(
                frw_read, illumina_avg_read_length, illumina_line_limit)
        else:
            logObject.info(
                "No paired _R1/_R2 FASTQ files found in %s; no Illumina reads will be listed for QC."
                % subsample_dir)

    if picard_insert_file and os.path.isfile(picard_insert_file):
        illumina_avg_insert_length = _parse_insert_size(
            picard_insert_file, illumina_avg_insert_length)

    elif os.path.isdir(sample_dir + 'ProcessGPDirectory'):
        insert_stats_file_query = [
            sample_dir + 'ProcessGPDirectory/' + x
            for x in os.listdir(sample_dir + 'ProcessGPDirectory')
            if x.endswith('.insert_size_metrics')
        ]
        # only trust the metrics when exactly one candidate file exists
        if len(insert_stats_file_query) == 1:
            illumina_avg_insert_length = _parse_insert_size(
                insert_stats_file_query[0], illumina_avg_insert_length)

    use_nanopore = bool(nanopore_fastq and gaemr_ont)
    nanopore_avg_read_length = 2000
    if use_nanopore:
        nanopore_avg_read_length = _mean_fastq_read_length(
            nanopore_fastq, nanopore_avg_read_length)
    # for unpaired long reads the insert size is reported as the read length
    nanopore_avg_insert_size = nanopore_avg_read_length

    # Generate read list file for QC
    read_list = workspace + 'read_list.txt'
    with open(read_list, 'w') as outf:
        outf.write("#name,lib_type,mean_read_length,dir,insert_size,files\n")
        if frw_read and rev_read:
            outf.write('Fragments,fragment,%d,fr,%d,%s,%s\n' %
                       (illumina_avg_read_length, illumina_avg_insert_length,
                        frw_read, rev_read))
        if use_nanopore:
            outf.write('Long,unpaired,%d,,%d,%s\n' %
                       (nanopore_avg_read_length, nanopore_avg_insert_size,
                        nanopore_fastq))

    # Run GAEMR QC
    workspace_b = uF.setupDirectory(workspace, "QC")
    if use_nanopore:
        AssemblyObj.run_gaemr_qc_ont(read_list,
                                     workspace_b,
                                     options=qc_options,
                                     cores=cores)
    else:
        AssemblyObj.run_gaemr_qc(read_list,
                                 workspace_b,
                                 options=qc_options,
                                 cores=cores)

    # create successful completion file if steps completed!
    conf_file_name = sample_dir + "GAEMR"
    if identifier:
        conf_file_name += '_' + identifier
    conf_file_name += ".txt"
    with open(conf_file_name, 'w') as conf_file:
        conf_file.write("GAEMR: Module Completed Succesfully!")