def countOrfs(infile, outfile):
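    '''Count reads mapping to each ORF with featureCounts, staging the
    decompressed GTF in scratch space and gzipping the output.'''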
    feat = "gene_id"
    #featureCounts writes plain text, so drop the .gz suffix and gzip at the end
    outfile = outfile.replace(".gz", "")
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    mapping = "sample_mappings.dir/{}/{}.mapped.bam".format(
        filemap.samplename, filemap.samplename)
    #get pairedness: -f 1 counts reads with the paired SAM flag set
    paired = True
    num_paired = subprocess.check_output(
        ["samtools", "view", "-c", "-f", "1", mapping]
    ).decode(sys.stdout.encoding)
    if int(num_paired.strip()) == 0:
        paired = False
    #generate counts per orf across all samples
    job_threads = int(PARAMS["featureCounts_threads"])
    job_memory = str(PARAMS["featureCounts_memory"]) + "G"
    scratchgtf = os.path.join(PARAMS["featureCounts_tmp"], filename + ".gtf")
    statementlist = []
    statementlist.append("zcat {} > {}".format(filemap.shortgtfpath,
                                               scratchgtf))
    statementlist.append(
        PipelineEnumerate.countFeatures(feat, scratchgtf, paired, outfile,
                                        mapping, PARAMS))
    statementlist.append("gzip {}".format(outfile))
    statementlist.append("rm -rf {}".format(scratchgtf))
    statement = " && ".join(statementlist)
    P.run(statement)
def makeBowtieDbs(infile, outfile):
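    '''Build a large (.bt2l) bowtie2 index from each sample's contigs.'''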
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    #call to bowtie2-build
    job_memory = str(PARAMS["BowtieDB_memory"]) + "G"
    job_threads = int(PARAMS["BowtieDB_threads"])
    statement = PipelineEnumerate.buildBowtieDB(filemap.contigpath,
                                                outfile.replace(".1.bt2l", ""),
                                                PARAMS)
    P.run(statement)
def mapSamples(infile, outfile):
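    '''Map each sample's reads back to its own contigs with bowtie2 and
    write the alignments as a BAM file.'''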
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    #get the mapping DB
    bowtiedb = "contig_databases.dir/{}.contigs.bowtie".format(
        filemap.samplename)
    job_threads = int(PARAMS["Bowtie_threads"])
    job_memory = str(PARAMS["Bowtie_memory"]) + "G"
    seqdat = PipelineAssembly.SequencingData(infile)
    bowtie = PipelineFilter.Bowtie2(seqdat, outfile, PARAMS, bowtiedb)
    #reset the working directory in the bowtie object, as all of its input files sit in a single directory
    bowtie.indir = ""
    statementlist = []
    #strip comments from read names (trimming can add comments that break read pairing)
    #skip this only when resuming after a failure at the bowtie step (names are already cleaned)
    if PARAMS["Bowtie_skip_file_prep"] != "true":
        statementlist.append(bowtie.cleanNames())
    #directory for output
    statementlist.append("mkdir -p {}".format(os.path.dirname(outfile)))
    #call to bowtie
    statementlist.append(bowtie.build())
    #convert sam to bam
    statementlist.append("samtools view -bS {} > {}".format(
        outfile.replace(".bam", ".sam"), outfile))
    #remove the sam file
    statementlist.append("rm {}".format(outfile.replace(".bam", ".sam")))
    statement = " && ".join(statementlist)
    P.run(statement)
def countOrfs(infile, outfile):
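    '''Count reads mapping to each ORF in each sample using featureCounts.'''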
    feat = "gene_id"
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    mapping = "sample_mappings.dir/{}/{}.mapped.bam".format(
        filemap.samplename, filemap.samplename)
    #get pairedness: -f 1 counts reads with the paired SAM flag set
    paired = True
    num_paired = subprocess.check_output(
        ["samtools", "view", "-c", "-f", "1", mapping]
    ).decode(sys.stdout.encoding)
    if int(num_paired.strip()) == 0:
        paired = False
    #generate counts per orf across all samples
    job_threads = int(PARAMS["featureCounts_threads"])
    job_memory = str(PARAMS["featureCounts_memory"]) + "G"
    statement = PipelineEnumerate.countFeatures(feat, filemap.shortgtfpath,
                                                paired, outfile, mapping,
                                                PARAMS)
    P.run(statement)
def countPairedFeatures(infile, outfile):
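    '''Aggregate ORF counts to higher-level annotation features using the
    full GTF and the feature pairs given in FEATUREPAIRS.'''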
    filename = re.search(r"orf_counts\.dir/(\S+)\.tsv", infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    #generate counts for other features from ORF counts and full GTF
    job_threads = int(PARAMS["featureCounts_threads_otherfeats"])
    job_memory = str(PARAMS["featureCounts_memory_otherfeats"]) + "G"
    #path to countFeatPairs.py in the scripts/ directory of the repository
    #(this file lives in the pipelines/ directory alongside it)
    scriptpath = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "scripts", "countFeatPairs.py")
    statement = ("python {} --orf_counts {} --feature_pairs {} --gtf {} "
                 "--outdir annotation_counts.dir/ --logfile {}").format(
                     scriptpath, infile, ",".join(FEATUREPAIRS),
                     filemap.gtfpath, outfile)
    #also aggregate the TPM-normalised counts if enabled
    if PARAMS["General_tpm"] == "true":
        statement += (" && python {} --orf_counts {} --feature_pairs {} "
                      "--gtf {} --outdir annotation_counts.dir/ "
                      "--logfile {}").format(
                          scriptpath, infile.replace(".tsv", ".tpm.tsv"),
                          ",".join(FEATUREPAIRS), filemap.gtfpath,
                          outfile.replace(".log", ".tpm.log"))
    P.run(statement)