def runSoapdenovo(infile, outfile):
    '''run soapdenovo.'''
    job_options = "-l mem_free=30G"
    statement = PipelineMetagenomeAssembly.SoapDenovo2().build(infile)
    P.run()
def countReadsWithinWindows(bedfile, windowfile, outfile,
                            counting_method="midpoint"):
    '''count reads given in *bedfile* within intervals in *windowfile*.

    Both files need to be :term:`bed` formatted.

    Counting is done using bedtools. The counting method can be
    'midpoint' or 'nucleotide'.
    '''
    job_options = "-l mem_free=4G"

    if counting_method == "midpoint":
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''
    elif counting_method == "nucleotide":
        f = ""
    else:
        raise ValueError("unknown counting method: %s" % counting_method)

    statement = '''
    zcat %(bedfile)s
    %(f)s
    | coverageBed -a stdin -b %(windowfile)s -split
    | sort -k1,1 -k2,2n
    | gzip
    > %(outfile)s
    '''
    P.run()
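# Worked example of the "midpoint" transformation above (an illustrative
# sketch, not used by the pipeline): the awk filter collapses each BED
# interval to a 1 bp window at its midpoint before counting, so each read
# is assigned to exactly one window.
def _bed_midpoint_example(contig, start, end):
    '''return the 1 bp midpoint interval of (contig, start, end).

    Mirrors the awk expression a = $2 + ($3 - $2) / 2, e.g.
    ("chr1", 100, 200) -> ("chr1", 150, 151).
    '''
    mid = start + (end - start) // 2
    return contig, mid, mid + 1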
def collectMEMEResults(tmpdir, target_path, outfile):
    '''collect output from a MEME run in tmpdir and copy all over to
    target_path.

    Convert images output by MEME (.eps files) to .png files.
    '''

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    for epsfile in epsfiles:
        b, ext = os.path.splitext(epsfile)
        pngfile = b + ".png"
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()
def runSpades(infile, outfile):
    '''run spades on each track.'''
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Spades().build(infile)
    P.run()
def computeOverlapCoding(infile, outfile):
    '''compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers
    to the ENSEMBL gene id and transcript_id that it overlaps with.
    Markers not overlapping an ENSEMBL gene id are removed.
    '''
    to_cluster = True
    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene
    --apply=ensembl.diff.genes_ovl
    < %(infile)s > %(tmpfilename)s
    '''
    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed)
    --decorator=counts
    --filename-data=%(tmpfilename)s
    --skip-empty
    --is-gtf
    --log=%(outfile)s.log
    < %(genome)s.fasta > %(outfile)s'''
    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.'''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s
        | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s -
        >> %(outfile)s
        '''
        P.run()
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    .. note::
       In paired-end data sets each mate will be counted. Thus the
       actual read counts are approximately twice the fragment counts.
    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=transcripts
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.'''

    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=genes
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
    | python %(scriptsdir)s/csv2db.py
    --index=track
    --table=%(tablename)s
    > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
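# Illustrative layout of the merged table produced above (a sketch with
# made-up values): the header of the first stats file is prefixed with a
# "track" column and every subsequent data row is prefixed with its
# track name, e.g.
#
#   track    CATEGORY  TOTAL_READS  PF_READS  ...
#   sampleA  UNPAIRED  1000000      998000    ...
#   sampleB  UNPAIRED  1200000      1195000   ...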
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
    | python %(scriptsdir)s/csv2db.py
    --index=track
    --table=%(tablename)s
    > %(outfile)s
    '''
    P.run()
def buildUniformityOfCoverage(infiles, outfile):
    '''build matrix of coverage over contigs.'''

    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"

    # filter for mapped reads
    statement = '''cat %(bam)s
    | python %(scriptsdir)s/bam2bam.py --filter=mapped --log=/dev/null
    > %(tmp_bam)s;
    samtools index %(tmp_bam)s'''
    P.run()

    # initialise to avoid a NameError if no lengths file matches the track
    length_file = None
    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf

    statement = '''cat %(length_file)s
    | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}'
    > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py
    --only-interval
    %(tmp_bam)s %(tmp_bed)s
    --log=%(outfile)s.log
    --output-filename-pattern=%(track)s.%%s'''
    P.run()

    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
def buildBAMStats(infile, outfile):
    '''count number of reads mapped, duplicates, etc.'''
    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]
    statement = '''python %(scriptsdir)s/bam2stats.py
    --force
    --output-filename-pattern=%(outfile)s.%%s
    < %(infile)s
    > %(outfile)s'''
    P.run()
def buildDownstreamFlankBed(infile, outfile):
    '''build interval downstream of gene start for each entry in bed file.'''
    window = PARAMS["geneset_flank"]
    faidx = PARAMS["faidx"]
    statement = '''flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s
    | python %(scriptsdir)s/bed2bed.py
    --method=filter-genome
    --genome-file=%(genome_dir)s/%(genome)s
    --log %(outfile)s.log
    > %(outfile)s'''
    P.run()
def ExtendRegion(infile, outfile):
    '''extend bed intervals by 1kb on either side.'''
    statement = """gunzip < %(infile)s
    | slopBed -i stdin -g %(faidx)s -b 1000
    | gzip
    > %(outfile)s """
    P.run()
def getNoncodingGeneset(infile, outfile):
    '''assume that all transcripts that do not overlap with the ensembl
    coding geneset are noncoding.'''
    ensembl_transcripts = PARAMS["ensembl_transcripts"]
    statement = """cat %(infile)s
    | intersectBed -a stdin -b %(ensembl_transcripts)s -v -s
    > %(outfile)s;
    echo "transcripts without ensembl coding overlap: " > %(outfile)s.count;
    cat %(outfile)s | wc -l >> %(outfile)s.count;"""
    P.run()
def addMissingNoncodingTranscripts(infile, outfile):
    '''add ensembl noncoding transcripts that are missing from the infile.'''
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]
    statement = """intersectBed -a %(ensembl_noncoding)s -b %(infile)s -v -s -f 1 -r
    > transcripts/missing_ensembl_noncoding_transcripts.gtf;
    cat %(infile)s transcripts/missing_ensembl_noncoding_transcripts.gtf
    | sort -k1,1 -k4,4n
    > %(outfile)s;"""
    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    '''receives a MAF file containing pairwise alignments and a bed12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.'''
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
def loadEffects(infile, outfile):
    '''load transcript effects into tables.'''

    root = infile[:-len(".effects.gz")]

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
    --from-zipped
    --index=transcript_id
    --table=%(root)s_effects
    < %(infile)s > %(outfile)s
    '''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation"):
        statement = '''
        gunzip < %(infile)s.%(suffix)s.gz
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
        --allow-empty
        --index=transcript_id
        --table=%(root)s_effects_%(suffix)s
        --ignore-column=seq_na
        --ignore-column=seq_aa
        >> %(outfile)s
        '''
        P.run()
def buildFilteredLncRNAGeneSet(infile, outfile):
    '''
    Depending on filtering_remove_single_exon, will:
    i) remove all single-exon transcripts from all lncRNA models
       (transcripts)
    ii) remove lncRNA loci that only contain single-exon transcripts
       (loci)
    iii) leave all single-exon and multi-exon loci in outfile (None)
    '''

    if not PARAMS["filtering_remove_single_exon"]:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = ("cp %(infile)s %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "loci":
        E.info("Warning: removing loci with only single-exon transcripts")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "transcripts":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unrecognised parameter %s"
                         % PARAMS["filtering_remove_single_exon"])
    P.run()
def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications.'''

    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 " -t %(tablename)s"
                 " --log=%(outfile)s.log"
                 " --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
def makeSegments(infile, outfile):
    '''compute size distributions of the intervals and of merged genes.'''

    to_cluster = True

    statement = '''gunzip < %(infile)s
    | %(scriptsdir)s/gff_sort pos
    | python %(scriptsdir)s/gff2histogram.py
    --method=values
    --output-filename-pattern="%(outfile)s.%%s"
    --force
    --log=%(outfile)s.log
    > %(outfile)s
    '''
    P.run()

    statement = '''gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py --sort=position+gene
    | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gff2histogram.py
    --method=values
    --force
    --output-filename-pattern="%(outfile)s_genes.%%s"
    --log=%(outfile)s.log
    >> %(outfile)s'''
    P.run()
def loadRepeatInformation(infiles, outfile):
    '''load repeat coverage per contig into the database.'''

    to_cluster = True
    table = outfile[:-len(".load")]
    repeatsfile, indexfile = infiles
    tmpfilename = P.getTempFilename(".")

    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}'
    < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
    gunzip < %(repeatsfile)s
    | python %(scriptsdir)s/gff2bed.py -v 0
    | coverageBed -a stdin -b %(tmpfilename)s
    | awk 'BEGIN {printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n");} {print;}'
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --table=%(table)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    # close to flush contents before use in the statement below
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
    --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
def buildPicardAlignStats(infile, outfile):
    '''gather BAM file alignment statistics using Picard.'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%%(samtools_genome)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT ''' % locals()
    P.run()
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s
                    WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def buildAnnotations(infiles, outfile):
    '''annotate transcripts by location (intergenic, intronic, ...).'''

    infile, annotation = infiles

    statement = '''gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log
      --max-files=60
    "python %(scriptsdir)s/gtf2table.py
    --counter=position
    --counter=classifier
    --section=exons
    --section=introns
    --counter=length
    --counter=splice
    --counter=composition-na
    --counter=splice-comparison
    --log=%(outfile)s.log
    --filename-format=gff
    --filename-gff=%(annotation)s
    --genome-file=%(genome_dir)s/%(genome)s"
    | gzip
    > %(outfile)s
    '''
    P.run()
def buildContigSummary(infiles, outfile):
    '''merge the contig summary statistics.'''

    stats = collections.defaultdict(list)
    for filepath in infiles:
        dirname = os.path.dirname(filepath)
        stats[dirname].append(os.path.basename(filepath))

    N = PARAMS["scaffold_n"]

    # connect to database
    dbh = connect()
    cc = dbh.cursor()

    for dirname in stats.keys():
        outfname = os.path.join(dirname, "contig.summary.tsv")
        outf = open(outfname, "w")
        outf.write("track\tnscaffolds\tscaffold_length\tN%i"
                   "\tmean_length\tmedian_length\tmax_length\n" % N)
        for infile in stats[dirname]:
            track = P.snip(
                infile.split(dirname.split(".dir")[0])[1][1:],
                ".summary.load")
            table = P.toTable(infile)
            data = cc.execute("""SELECT nscaffolds, scaffold_length, N50,
                                 mean_length, median_length, max_length
                                 FROM %s""" % table).fetchone()
            outf.write("\t".join(
                map(str, [track,
                          data[0], data[1], data[2],
                          data[3], data[4], data[5]])) + "\n")
        outf.close()
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if
    they have an ensembl id - sort by transcript.
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
            | python %(scriptsdir)s/gtf2gtf.py --sort=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s
            | python %(scriptsdir)s/gtf2gtf.py
            --renumber-genes=%(gene_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py
            --renumber-transcripts=%(transcript_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py --sort=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()
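# Illustrative naming produced by the renumbering branch above (a
# sketch): for outfile "previous.gtf.gz" the patterns become
#
#   gene_pattern       = "GENprevious"
#   transcript_pattern = "TRANprevious"
#
# so gtf2gtf renumbers models as GENprevious1, GENprevious2, ... and
# TRANprevious1, TRANprevious2, ..., which cannot collide with the
# cufflinks identifiers generated in this analysis.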
def loadFilteringSummary(infile, outfile):
    '''load filtering summary.'''
    P.load(infile, outfile)


def update_report():
    '''update report.'''
    E.info("updating documentation")
    P.run_report(clean=False)


def build_report():
    '''build report from scratch.'''
    E.info("starting documentation build process from scratch")
    P.run_report(clean=True)
def buildPicardAlignStats(infile, outfile):
    '''gather BAM file alignment statistics using Picard.'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%%(samtools_genome)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT ''' % locals()
    P.run()
def sortByPosition(infile, outfile):
    '''sort BAM file by genomic position.'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run()
import os
import pysam
import numpy
import gzip
import fileinput
import CGATPipelines.PipelineTracks as PipelineTracks
import CGATPipelines.PipelineMapping as PipelineMapping

USECLUSTER = True

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
import CGAT.Pipeline as P
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

bowtie_options = {
    'n0m1': "-n 0 -a --best --strata -m 1 -3 1",
    'n1m1': "-n 1 -a --best --strata -m 1 -3 1",
    'n2m1': "-n 2 -a --best --strata -m 1 -3 1",
    'n3m1': "-n 3 -a --best --strata -m 1 -3 1",
    'n0m2': "-n 0 -a --best --strata -m 2 -3 1",
    'n1m2': "-n 1 -a --best --strata -m 2 -3 1",
    'n2m2': "-n 2 -a --best --strata -m 2 -3 1",
    'n3m2': "-n 3 -a --best --strata -m 2 -3 1",
    'n0m3': "-n 0 -a --best --strata -m 3 -3 1",
    'n1m3': "-n 1 -a --best --strata -m 3 -3 1",
    'n2m3': "-n 2 -a --best --strata -m 3 -3 1",
    'n3m3': "-n 3 -a --best --strata -m 3 -3 1",
    'n0m4': "-n 0 -a --best --strata -m 4 -3 1",
    'n1m4': "-n 1 -a --best --strata -m 4 -3 1",
    'n2m4': "-n 2 -a --best --strata -m 4 -3 1",
    'n3m4': "-n 3 -a --best --strata -m 4 -3 1",
    'n0m5': "-n 0 -a --best --strata -m 5 -3 1",
    'n1m5': "-n 1 -a --best --strata -m 5 -3 1",
    'n2m5': "-n 2 -a --best --strata -m 5 -3 1",
    'n3m5': "-n 3 -a --best --strata -m 5 -3 1",
    'v0m1': "-v 0 -a --best --strata -m 1 -3 1",
    'v1m1': "-v 1 -a --best --strata -m 1 -3 1",
    'v2m1': "-v 2 -a --best --strata -m 1 -3 1",
    'v3m1': "-v 3 -a --best --strata -m 1 -3 1",
    'v0m2': "-v 0 -a --best --strata -m 2 -3 1",
    'v1m2': "-v 1 -a --best --strata -m 2 -3 1",
    'v2m2': "-v 2 -a --best --strata -m 2 -3 1",
    'v3m2': "-v 3 -a --best --strata -m 2 -3 1",
    'v0m3': "-v 0 -a --best --strata -m 3 -3 1",
    'v1m3': "-v 1 -a --best --strata -m 3 -3 1",
    'v2m3': "-v 2 -a --best --strata -m 3 -3 1",
    'v3m3': "-v 3 -a --best --strata -m 3 -3 1",
    'v0m4': "-v 0 -a --best --strata -m 4 -3 1",
    'v1m4': "-v 1 -a --best --strata -m 4 -3 1",
    'v2m4': "-v 2 -a --best --strata -m 4 -3 1",
    'v3m4': "-v 3 -a --best --strata -m 4 -3 1",
    'v0m5': "-v 0 -a --best --strata -m 5 -3 1",
    'v1m5': "-v 1 -a --best --strata -m 5 -3 1",
    'v2m5': "-v 2 -a --best --strata -m 5 -3 1",
    'v3m5': "-v 3 -a --best --strata -m 5 -3 1"}

###################################################################
###################################################################
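# The bowtie_options table above is fully regular: keys run over the
# mismatch modes (-n or -v, 0-3 mismatches) and -m thresholds (1-5).
# An equivalent, more compact construction (a sketch; behaviour is
# unchanged if substituted for the literal dict):
#
# bowtie_options = dict(
#     ("%s%im%i" % (mode, n, m),
#      "-%s %i -a --best --strata -m %i -3 1" % (mode, n, m))
#     for mode in "nv"
#     for n in range(0, 4)
#     for m in range(1, 6))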
def replaceBaseWithN(infile, outfile):
    '''replace the specified base with N.'''
    to_cluster = True
    statement = '''python %(scriptsdir)s/fastq2N.py -i %(infile)s %(replace_options)s'''
    P.run()
def processReads(infiles, outfile):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn("combining reads can not be combined with other"
               " processing for paired ended reads")
        if not infile2:
            raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = \
            PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len,
                                              frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment length"
                   " options will be ignored")
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""
        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn("have not specified --read-len, --frag-len,"
                   " --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""
        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                "combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn("--max-overlap will override the specified read"
                       " and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals()
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash %(min_overlap)s
                             %(max_overlap)s
                             %(max_mismatch_density)s
                             %(phred_offset)s
                             %(fragment_options)s
                             --output-prefix=%(track)s
                             %(threads)s
                             --compress
                             %(infile)s %(infile2)s
                             >> %(outfile)s.log
                    '''
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in [
                ".notCombined_1.fastq.gz",
                ".notCombined_2.fastq.gz",
                ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s;
                           rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn("sampling can not be combined with other processing"
               " for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py
        --sample=%(sample_proportion)f
        --pair=%(infile2)s
        --outfile-pair=%(outfile2)s
        --log=%(outfile)s_sample.log
        | gzip > %(outfile)s
        '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = ['''
        cutadapt
        %(adaptors)s
        --overlap=%(contamination_min_overlap_length)i
        --format=fastq
        %(contamination_options)s
        <( zcat < %(infile)s )
        2>> %(outfile)s_contaminants.log
        ''']
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True

    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append('fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append('python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log')

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
        --method=reconcile
        --output-pattern=%(track)s.fastq.%%s.gz
        %(tmpfile1)s %(tmpfile2)s
        > %(outfile)s_reconcile.log"""
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
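# Illustrative composition of the filter chain in processReads (a
# sketch): each enabled processing step appends one shell command to
# `s` and the fragments are joined into a single pipe. With artifact
# filtering and quality trimming enabled for a single-end file, the
# resulting statement is roughly:
#
#   zcat in.fastq.gz
#   | fastx_artifacts_filter -Q 33 -v ... 2>> out.fastq.gz_artifacts.log
#   | fastq_quality_trimmer -Q 33 -v ... 2>> out.fastq.gz_trim.log
#   | gzip > out.fastq.gz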
def publish():
    '''publish files.'''
    P.publish_report()


def loadAllProcessingSummary(infile, outfile):
    P.load(infile, outfile)


def loadCountKeggAssociations(infile, outfile):
    '''load counts of KO associations.'''
    P.load(infile, outfile, "--header=pathway,p_annotated_reads")


def loadFastqcSummary(infile, outfile):
    P.load(infile, outfile, options="--index=track")


def loadCountContributingReads(infile, outfile):
    '''load contributing read counts.'''
    P.load(infile, outfile)


def loadCountKeggGenes(infile, outfile):
    '''load counts of KO associations.'''
    P.load(infile, outfile, "--header=KO,p_annotated_reads")


def loadMetaphlanRelativeAbundances(infile, outfile):
    '''load the metaphlan relative abundances.'''
    P.load(infile, outfile)


def loadKeggTable(infile, outfile):
    '''load KEGG table.'''
    P.load(infile, outfile)


def countReads(infile, outfile):
    '''count number of reads in input files.'''
    to_cluster = True
    m = PipelineMapping.Counter()
    statement = m.build((infile,), outfile)
    P.run()


def loadCountLcaTaxa(infile, outfile):
    '''load taxa level counts.'''
    P.load(infile, outfile)


def convertToGTF(infile, outfile):
    '''convert bed to gtf.'''
    statement = """gunzip < %(infile)s
    | python %(scriptsdir)s/bed2gff.py --as-gtf --log=%(outfile)s.log
    > %(outfile)s """
    P.run()


def loadMetaphlanReadmaps(infile, outfile):
    '''load the metaphlan read maps.'''
    P.load(infile, outfile)


def convertStrandedTranscriptsToBed(infile, outfile):
    '''convert GTF to compressed BED file.'''
    track = P.snip(os.path.basename(infile), ".gtf")
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/gff2bed.py --is-gtf --log=%(outfile)s.log
    | sort -k1,1 -k2,2n
    | gzip > %(outfile)s'''
    P.run()
import glob
import CGAT.Metaphlan as Metaphlan
import CGATPipelines.PipelineMapping as PipelineMapping
import CGATPipelines.PipelineMappingQC as PipelineMappingQC
import pysam
import CGAT.Fastq as Fastq

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# load options from the config file
import CGAT.Pipeline as P
P.getParameters(["pipeline.ini"])

PARAMS = P.PARAMS

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import PipelineTracks

# collect fastq.gz tracks
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("*.fastq.gz"), "(\S+).fastq.gz") +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), "(\S+).fastq.1.gz")
def convertGffToGtf(infile, outfile):
    '''convert gff to gtf by adding transcript_id/gene_id attributes.'''
    track = P.snip(os.path.basename(infile), ".gff")
    statement = '''cat %(infile)s | awk 'OFS="\\t" {print $1,$2,$3,$4,$5,$6,$7,$8,"transcript_id \\""$9"\\"; gene_id \\""$9"\\";"}' > %(outfile)s'''
    P.run()
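# Illustrative transformation performed by the awk call above (a
# sketch): column 9 of the gff file is reused as both transcript_id
# and gene_id, e.g.
#
#   in : chr1  src  exon  100  200  .  +  .  ID42
#   out: chr1  src  exon  100  200  .  +  .  transcript_id "ID42"; gene_id "ID42";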
def renameTranscripts(infile, outfile):
    '''systematically rename transcripts.'''
    statement = '''cat %(infile)s | awk 'OFS="\\t" {print $1,$2,$3,$4,$5,$6,$7,$8,"transcript_id \\"rnaseq_es_novel_transcript_"NR"\\"; gene_id \\"rnaseq_es_novel_gene_"NR"\\"; "}' > %(outfile)s;'''
    P.run()
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPs across all species are aggregated into a single file to
    avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the SNPs would
    have been lost. Hence I map to ensembl protein identifiers. Note
    that the sequence file is then to be submitted to POLYPHEN as well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers to
    the SNP within a peptide sequence while the locus_id refers to
    the genomic location. If there are alternative transcripts
    overlapping a SNP, the same SNP will get two snp_ids, but the
    same locus_id. As the peptide background might be different for
    the same SNP depending on the transcript, its effect needs to be
    predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(cc.execute(
        "SELECT transcript_id, protein_id FROM annotations.transcript_info "
        "WHERE protein_id IS NOT NULL").fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write("snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\t"
                   "locus_id\tcontig\tpos\tphase\n")

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:
        table = P.toTable(infile)
        track = table[:-len("_effects")]
        print statement % locals()
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for (transcript_id, cds_start, cds_end, orig_codons,
             variant_codons, orig_na, variant_na, contig, pos) in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]

            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" % (
                    snp_id,
                    pid,
                    peptide_pos,
                    orig_codons,
                    variant_codons))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write(
                "snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" % (
                    snp_id,
                    track,
                    transcript_id,
                    pid,
                    peptide_pos - 1,
                    locus_ids[locus_key],
                    contig,
                    pos,
                    phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" %
           (table, len(found), len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))
    if notfound:
        E.warn("%i transcripts had SNPs that were ignored because "
               "there was no uniprot accession" % len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp;
    mv %(outfile)s.tmp %(outfile)s'''
    P.run()
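# Illustrative example of the snp_id/locus_id bookkeeping above (a
# sketch with hypothetical identifiers): two transcripts of different
# proteins overlapping the same genomic SNP receive distinct snp_ids
# (the peptide context differs) but share one locus_id (the genomic
# position is the same):
#
#   protein_id   peptide_pos  variant   snp_id         locus_id
#   ENSP..00001  145          GAT->GGT  snp0000000000  loc0000000007
#   ENSP..00002  89           GAT->GGT  snp0000000001  loc0000000007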
def getGtfStrandedTranscripts(infile, outfile):
    '''join exons to get transcripts from GTF file.'''
    track = P.snip(os.path.basename(infile), ".gtf")
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py --join-exons --log=%(outfile)s.log
    | sort -k1,1 -k4,4n
    > %(outfile)s'''
    P.run()
import os
import CGAT.Stats as Stats
import pysam

# only update R if called as pipeline
# otherwise - failure with sphinx
from rpy2.robjects import r as R
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGAT.Pipeline as P
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])
PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
import CGATPipelines.PipelineTracks as PipelineTracks
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes and deleterious SNPs
    within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''
    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
    SELECT i.gene_id,
           COUNT(DISTINCT map.locus_id) as nsnps,
           COUNT(DISTINCT case t.prediction
                 when 'possiblydamaging' then map.locus_id
                 when 'probablydamaging' then map.locus_id
                 else NULL end) AS ndeleterious,
           MAX(s.length)
    FROM %(table)s as t,
         %(tablename_map)s as map,
         annotations.protein_stats as s,
         annotations.transcript_info as i
    WHERE map.snp_id = t.snp_id AND
          i.transcript_id = map.transcript_id AND
          s.protein_id = map.protein_id
    GROUP BY i.gene_id
    ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
    FROM annotations.transcript_info AS i, annotations.protein_stats AS s
    WHERE s.protein_id = i.protein_id
    GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue")) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus
        # sf = 1 - P( x <= X ) = P( x > X )
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, \
            com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join([str(int(x < fdr))
                        for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % int(nsnps),
                              "%i" % int(ndel),
                              "%6.4f" % del_p,
                              "%6.4g" % del_pvalue,
                              "%6.4g" % del_qvalue,
                              "%6.4f" % len_p,
                              "%6.4g" % len_pvalue,
                              "%6.4g" % len_qvalue,
                              "%6.4f" % com_p,
                              "%6.4g" % com_pvalue,
                              "%6.4g" % com_qvalue)) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((gene_id,
                              code,
                              "%i" % int(round(length)),
                              "%i" % 0,
                              "%i" % 0,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1,
                              "%6.4f" % 0,
                              "%6.4g" % 1,
                              "%6.4g" % 1)) + "\n")

    outf.close()
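# Illustrative check of the enrichment test used in analysePolyphen (a
# sketch): the P-value for observing at least ndel deleterious SNPs
# among nsnps SNPs under a background rate p is the binomial survival
# function at ndel - 1, since sf(x) = P(X > x) = P(X >= x + 1):
#
# >>> import scipy.stats
# >>> scipy.stats.binom.sf(4, 10, 0.2)   # P(X >= 5) for n=10, p=0.2
# 0.0328  (approximately)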
def filterAndMergeGTF(infile, outfile, remove_genes, merge=False):
    '''filter gtf file infile with gene ids in remove_genes and write
    to outfile.

    If *merge* is set, the resultant transcript models are merged by
    overlap.

    A summary file "<outfile>.summary.tsv.gz" contains the number of
    transcripts that failed various filters.

    A file "<outfile>.removed.tsv.gz" contains the filters that a
    transcript failed.
    '''

    counter = E.Counter()

    # write summary table
    outf = IOTools.openFile(outfile + ".removed.tsv.gz", "w")
    outf.write("gene_id\tnoverlap\tsection\n")
    for gene_id, r in remove_genes.iteritems():
        for s in r:
            counter[s] += 1
        outf.write("%s\t%i\t%s\n" % (gene_id, len(r), ",".join(r)))
    outf.close()

    # filter gtf file
    tmpfile = P.getTempFile(".")
    inf = GTF.iterator(IOTools.openFile(infile))

    genes_input, genes_output = set(), set()

    for gtf in inf:
        genes_input.add(gtf.gene_id)
        if gtf.gene_id in remove_genes:
            continue
        genes_output.add(gtf.gene_id)
        tmpfile.write("%s\n" % str(gtf))

    tmpfile.close()
    tmpfilename = tmpfile.name

    outf = IOTools.openFile(outfile + ".summary.tsv.gz", "w")
    outf.write("category\ttranscripts\n")
    for x, y in counter.iteritems():
        outf.write("%s\t%i\n" % (x, y))
    outf.write("input\t%i\n" % len(genes_input))
    outf.write("output\t%i\n" % len(genes_output))
    outf.write("removed\t%i\n" % (len(genes_input) - len(genes_output)))
    outf.close()

    # close-by exons need to be merged, otherwise
    # cuffdiff fails for those on "." strand
    if merge:
        statement = '''
        %(scriptsdir)s/gff_sort pos < %(tmpfilename)s
        | python %(scriptsdir)s/gtf2gtf.py
        --unset-genes="NONC%%06i"
        --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
        --merge-genes
        --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
        --merge-exons
        --merge-exons-distance=5
        --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
        --renumber-genes="NONC%%06i"
        --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
        --renumber-transcripts="NONC%%06i"
        --log=%(outfile)s.log
        | %(scriptsdir)s/gff_sort genepos
        | gzip > %(outfile)s
        '''
    else:
        statement = '''
        %(scriptsdir)s/gff_sort pos < %(tmpfilename)s
        | gzip > %(outfile)s
        '''

    P.run()
    os.unlink(tmpfilename)
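# Illustrative call of filterAndMergeGTF (a sketch with hypothetical
# gene ids and filter names): remove_genes maps each gene_id to the
# collection of filters it failed, which is what the summary and
# "removed" tables are written from:
#
# remove_genes = {"NONC000017": set(("single_exon",)),
#                 "NONC000042": set(("length", "repeat_overlap"))}
# filterAndMergeGTF("lncrna.gtf.gz", "filtered.gtf.gz",
#                   remove_genes, merge=True)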
def buildSharedSNPMatrix(infiles, outfiles):
    '''build matrix of shared coding nonsynonymous SNPs.

    Counts are per locus id.

    Percent identities are only within coding segregating loci and
    thus do not reflect the real divergence.
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    segregating_sites = cc.execute(
        'SELECT COUNT(DISTINCT locus_id) FROM polyphen_map').fetchone()[0]

    statement = '''SELECT DISTINCT locus_id, track
    FROM polyphen_map ORDER BY locus_id'''
    cc.execute(statement)

    matrix = collections.defaultdict(int)
    for k, vals in itertools.groupby(cc, key=lambda x: x[0]):
        tracks = [x[1] for x in list(vals)]
        for t1 in tracks:
            matrix[(t1, t1)] += 1
        if len(tracks) > 1:
            for t1, t2 in itertools.combinations(tracks, 2):
                matrix[(t1, t2)] += 1
                matrix[(t2, t1)] += 1

    all_tracks = set([x[0] for x in matrix.keys()] +
                     [x[1] for x in matrix.keys()])

    # output matrix with shared SNPs.
    outf = open(outfiles[0], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            outf.write("\t%i" % matrix[(track1, track2)])
        outf.write("\n")
    outf.close()

    # output matrix with shared segregating sites as
    # distance matrix
    outf = open(outfiles[1], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            if track1 == track2:
                outf.write("\t%i" % 0)
            else:
                outf.write("\t%i" %
                           (segregating_sites - matrix[(track1, track2)]))
        outf.write("\n")
    outf.close()

    # output matrix as percent identity matrix.
    # percent identity is given as
    #   (segregating_sites - nsites_differing) / segregating_sites
    # where the number of sites at which two strains differ is
    #   nsites_differing = matrix[i,i] + matrix[j,j] - 2 * matrix[i,j]
    outf = open(outfiles[2], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    pids = {}
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            a = segregating_sites - \
                (matrix[(track1, track1)] + matrix[(track2, track2)] -
                 2 * matrix[(track1, track2)])
            pid = 100.0 * a / segregating_sites
            outf.write("\t%6.4f" % pid)
            pids[(track1, track2)] = pid
        outf.write("\n")
    outf.close()

    # distance matrix
    outf = open(outfiles[3], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            val = 100.0 - pids[(track1, track2)]
            outf.write("\t%6.4f" % val)
        outf.write("\n")
    outf.close()

    outfile_distance, outfile_tree = outfiles[3], outfiles[4]

    # build tree
    statement = '''python %(scriptsdir)s/matrix2matrix.py
    --output-format=phylip
    < %(outfile_distance)s
    | python %(scriptsdir)s/matrix2tree.py
    --method=nj
    > %(outfile_tree)s
    '''
    P.run()
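# Worked example of the percent identity computed above (illustrative):
# with segregating_sites = 100, matrix[(i, i)] = 40, matrix[(j, j)] = 35
# and matrix[(i, j)] = 20, the two strains differ at
# 40 + 35 - 2 * 20 = 35 sites, giving
#   pid = 100.0 * (100 - 35) / 100 = 65.0
# and a distance of 100.0 - 65.0 = 35.0 in the final matrix.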