def countOrfs(infile, outfile):
    """Count reads per ORF with featureCounts, staging the GTF in scratch.

    Decompresses the sample's short GTF into scratch space, runs
    featureCounts against the sample's mapped BAM, gzips the counts
    table and removes the scratch GTF.

    :param infile: sequence file whose name encodes the sample prefix
    :param outfile: counts table path; a trailing ".gz" is stripped here
        and re-added by the final gzip step
    """
    feat = "gene_id"
    outfile = outfile.replace(".gz", "")
    # Recover the sample prefix by stripping a known sequence-file
    # extension.  Dots in the extensions are escaped so "." cannot
    # match an arbitrary character (the original pattern left them bare).
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    mapping = "sample_mappings.dir/{}/{}.mapped.bam".format(
        filemap.samplename, filemap.samplename)
    # Determine pairedness: count alignments with the "read paired"
    # SAM flag (0x1) set; zero such reads means single-end data.
    num_paired = subprocess.check_output(
        ["samtools", "view", "-c", "-f", "1", mapping]).decode(
            sys.stdout.encoding)
    paired = int(num_paired.strip("\n")) != 0
    # job_threads/job_memory are read by P.run from the caller's locals
    # (CGAT-core convention) — do not remove although they look unused.
    job_threads = int(PARAMS["featureCounts_threads"])
    job_memory = str(PARAMS["featureCounts_memory"]) + "G"
    scratchgtf = PARAMS["featureCounts_tmp"] + filename + ".gtf"
    statementlist = [
        # stage the gzipped GTF into scratch for featureCounts
        "zcat {} > {}".format(filemap.shortgtfpath, scratchgtf),
        PipelineEnumerate.countFeatures(
            feat, scratchgtf, paired, outfile, mapping, PARAMS),
        "gzip {}".format(outfile),
        "rm -rf {}".format(scratchgtf),
    ]
    statement = " && ".join(statementlist)
    P.run(statement)
def makeBowtieDbs(infile, outfile):
    """Build a bowtie2 database from a sample's contigs.

    :param infile: sequence file whose name encodes the sample prefix
    :param outfile: a ".1.bt2l" index file; the suffix is stripped to
        obtain the database basename passed to bowtie2-build
    """
    # Recover the sample prefix; extension dots are escaped so "."
    # cannot match an arbitrary character (original pattern left them bare).
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    # job_memory/job_threads are read by P.run from the caller's locals
    # (CGAT-core convention) — do not remove although they look unused.
    job_memory = str(PARAMS["BowtieDB_memory"]) + "G"
    job_threads = int(PARAMS["BowtieDB_threads"])
    # call to bowtie2-build
    statement = PipelineEnumerate.buildBowtieDB(
        filemap.contigpath, outfile.replace(".1.bt2l", ""), PARAMS)
    P.run(statement)
def mapSamples(infile, outfile):
    """Map a sample's reads back to its own contigs with bowtie2.

    Optionally cleans read names first (trimming can add comments that
    break pair matching), runs bowtie2, converts the SAM to BAM, and
    removes the intermediate SAM.

    :param infile: sequence file whose name encodes the sample prefix
    :param outfile: output BAM path (the bowtie step writes the matching
        ".sam", which is converted then deleted)
    """
    # Recover the sample prefix; extension dots are escaped so "."
    # cannot match an arbitrary character (original pattern left them bare).
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    # the mapping DB built by makeBowtieDbs
    bowtiedb = "contig_databases.dir/{}.contigs.bowtie".format(
        filemap.samplename)
    # job_threads/job_memory are read by P.run from the caller's locals
    # (CGAT-core convention) — do not remove although they look unused.
    job_threads = int(PARAMS["Bowtie_threads"])
    job_memory = str(PARAMS["Bowtie_memory"]) + "G"
    seqdat = PipelineAssembly.SequencingData(infile)
    bowtie = PipelineFilter.Bowtie2(seqdat, outfile, PARAMS, bowtiedb)
    # reset the working directory in the bowtie object as it is
    # running on files in one directory
    bowtie.indir = ""
    statementlist = []
    # remove all comments from read names (trimming can add comments
    # making non-matching pairs); skip only if a previous run already
    # did this and failed later at the bowtie step
    if PARAMS["Bowtie_skip_file_prep"] != "true":
        statementlist.append(bowtie.cleanNames())
    # ensure the output directory exists
    statementlist.append("mkdir -p {}".format(os.path.dirname(outfile)))
    # call to bowtie
    statementlist.append(bowtie.build())
    # convert sam to bam, then remove the intermediate sam
    samfile = outfile.replace(".bam", ".sam")
    statementlist.append("samtools view -bS {} > {}".format(samfile, outfile))
    statementlist.append("rm {}".format(samfile))
    statement = " && ".join(statementlist)
    P.run(statement)
def countOrfs(infile, outfile):
    """Count reads per ORF with featureCounts against the short GTF.

    Detects whether the sample's BAM is paired-end, then runs
    featureCounts directly on the sample's short GTF.

    :param infile: sequence file whose name encodes the sample prefix
    :param outfile: path for the featureCounts output table
    """
    feat = "gene_id"
    # Recover the sample prefix; extension dots are escaped so "."
    # cannot match an arbitrary character (original pattern left them bare).
    filename = re.match(
        r"(\S+)\.(fasta$|fasta\.gz|fasta\.1\.gz|fasta\.1|fna$|fna\.gz|fna\.1\.gz|fna\.1|fa$|fa\.gz|fa\.1\.gz|fa\.1|fastq$|fastq\.gz|fastq\.1\.gz|fastq\.1)",
        infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    mapping = "sample_mappings.dir/{}/{}.mapped.bam".format(
        filemap.samplename, filemap.samplename)
    # Determine pairedness: count alignments with the "read paired"
    # SAM flag (0x1) set; zero such reads means single-end data.
    num_paired = subprocess.check_output(
        ["samtools", "view", "-c", "-f", "1", mapping]).decode(
            sys.stdout.encoding)
    paired = int(num_paired.strip("\n")) != 0
    # job_threads/job_memory are read by P.run from the caller's locals
    # (CGAT-core convention) — do not remove although they look unused.
    job_threads = int(PARAMS["featureCounts_threads"])
    job_memory = str(PARAMS["featureCounts_memory"]) + "G"
    statement = PipelineEnumerate.countFeatures(
        feat, filemap.shortgtfpath, paired, outfile, mapping, PARAMS)
    P.run(statement)
def countPairedFeatures(infile, outfile):
    """Aggregate ORF counts into counts for paired annotation features.

    Runs countFeatPairs.py on the raw ORF counts, and again on the TPM
    counts when General_tpm is enabled.

    :param infile: per-ORF counts table under orf_counts.dir/
    :param outfile: log file path for the counting script
    """
    # raw string + escaped dots: "\S" in a plain string is a deprecated
    # escape, and unescaped "." would match any character
    filename = re.search(r"orf_counts\.dir/(\S+)\.tsv", infile).group(1)
    filemap = PipelineEnumerate.enumerateMapper(filename, PARAMS)
    # job_threads/job_memory are read by P.run from the caller's locals
    # (CGAT-core convention) — do not remove although they look unused.
    job_threads = int(PARAMS["featureCounts_threads_otherfeats"])
    job_memory = str(PARAMS["featureCounts_memory_otherfeats"]) + "G"
    # Locate the package root above this pipelines/ directory.
    # BUGFIX: the original used rstrip("pipelines"), which strips the
    # character *set* {p,i,e,l,n,s} and can over-strip paths whose
    # trailing characters happen to fall in that set; strip the exact
    # suffix instead.
    scriptdir = os.path.dirname(__file__)
    if scriptdir.endswith("pipelines"):
        scriptdir = scriptdir[:-len("pipelines")]
    cmd = ("python {}scripts/countFeatPairs.py --orf_counts {} "
           "--feature_pairs {} --gtf {} --outdir annotation_counts.dir/ "
           "--logfile {}")
    # counts for other features from ORF counts and the full GTF
    statement = cmd.format(scriptdir, infile, ",".join(FEATUREPAIRS),
                           filemap.gtfpath, outfile)
    # also count the tpm counts if enabled
    if PARAMS["General_tpm"] == "true":
        statement += " && " + cmd.format(
            scriptdir, infile.replace(".tsv", ".tpm.tsv"),
            ",".join(FEATUREPAIRS), filemap.gtfpath,
            outfile.replace(".log", ".tpm.log"))
    P.run(statement)