def dtwWrapper(data, rows, columns, k):
    '''wrapper function for dynamic time warping.
    Includes use of exponential adaptive tuning function
    with temporal correlation if k > 0.
    '''

    # not explicitly called, but needs to be in R environment
    DTW = importr("dtw")

    # create a data frame of zeros of size number of ids x number of ids
    # fill it with the calculated distance metric for each pairwise comparison
    df_ = pd.DataFrame(index=rows, columns=columns)
    df_ = df_.fillna(0.0).astype(np.float64)

    # fill the array with dtw-distance values
    pandas2ri.activate()

    for i in rows:
        E.info("DTW %s" % i)
        for j in columns:
            series1 = data.loc[i].values.tolist()
            series2 = data.loc[j].values.tolist()
            DTW_value = (R.dtw(series1, series2)).rx('distance')[0][0]
            cort_value = temporalCorrelate(series1, series2)
            tuned_value = adaptiveTune(cort_value, k)
            time_dist = DTW_value * tuned_value
            # use .loc for both assignments to avoid chained indexing
            df_.loc[i, j] = float(time_dist)
            df_.loc[j, i] = float(time_dist)

    return df_
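# A minimal usage sketch for dtwWrapper (hypothetical, not part of the
# pipeline): it assumes rpy2 and the R "dtw" package are installed and that
# temporalCorrelate/adaptiveTune from this module are available. The gene
# identifiers and expression values below are invented for illustration.
def _example_dtwWrapper():
    import numpy as np
    import pandas as pd

    genes = ["geneA", "geneB", "geneC"]
    data = pd.DataFrame(np.random.rand(3, 6), index=genes)

    # per the docstring above, k=0 means no exponential adaptive tuning
    dist = dtwWrapper(data, rows=genes, columns=genes, k=0)
    print(dist)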
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking via intermediate targets.'''

    to_cluster = True
    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
def executewait(dbhandle, statement, error, retry=False, wait=5, args=()):
    """execute sql statement.

    Retry on error, if retry is True.
    Returns a cursor object.
    """
    cc = dbhandle.cursor()
    i = 20
    while i > 0:
        try:
            cc.execute(statement, args)
            return cc
        except sqlite3.OperationalError as e:
            msg = str(e)
            E.warn("import failed: msg=%s, statement=\n  %s" %
                   (msg, statement))
            # TODO: check for database locked msg
            if not retry:
                raise e
            if not re.search("locked", msg):
                raise e
            time.sleep(wait)
            i -= 1
            continue
    raise sqlite3.OperationalError("Database locked and too many retries")
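# Hypothetical usage sketch for executewait: retry an INSERT while the
# SQLite database is locked by another writer. Assumes the sqlite3, re and
# time imports used above; the table and values are invented.
def _example_executewait():
    import sqlite3

    dbhandle = sqlite3.connect("csvdb")  # invented database name
    dbhandle.execute(
        "CREATE TABLE IF NOT EXISTS stats (track TEXT, value REAL)")

    executewait(dbhandle,
                "INSERT INTO stats VALUES (?, ?)",
                error=None,
                retry=True,
                wait=5,
                args=("sample1", 0.5))
    dbhandle.commit()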
def correlateDistanceMetric(data, rows, columns, method, lag=0):
    '''wrapper for correlation coefficients as distance metrics
    for time-series clustering.

    Use either temporal correlation (analogous to template matching)
    or normalised cross correlation.
    '''

    # create blank (all 0's) dataframe to fill with correlation values
    df_ = pd.DataFrame(index=rows, columns=columns)
    df_ = df_.fillna(0.0)

    if method == "cross-correlate":
        for i in rows:
            E.info("cross-correlation %s" % i)
            for j in columns:
                series1 = data.loc[i].values.tolist()
                series2 = data.loc[j].values.tolist()
                corr = crossCorrelate(series1, series2, lag=lag)
                df_.loc[i, j] = 1.0 - abs(corr)
                df_.loc[j, i] = 1.0 - abs(corr)

    elif method == "temporal-correlate":
        for i in rows:
            E.info("temporal correlation %s" % i)
            for j in columns:
                series1 = data.loc[i].tolist()
                series2 = data.loc[j].tolist()
                corr = temporalCorrelate(series1, series2)
                df_.loc[i, j] = 1.0 - abs(corr)
                df_.loc[j, i] = 1.0 - abs(corr)

    return df_
def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    genomefile = PARAMS["%s_genome" % track]

    to_cluster = True

    for infile in infiles:

        E.info("adding %s" % infile)

        statement = '''gunzip < %(infile)s
             | cgat maf2psl
                  --query=%(track)s
                  --target=%(maf_master)s
                  --log=%(outfile)s.log
             | cgat psl2psl
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genomefile)s
                  --target-psl-file=%(genome)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s '''
    P.run()
def segmentWithCpG(infile, options):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments = []

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        if cur_record is None:
            break
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title)
        last = None
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    return segments
def findClusters(df, distance, size, tracks_map, groups):
    '''define clusters of genomic loci depending on thresholds for
    distance between loci and minimum number of members per cluster.

    This was written with CpGs in mind but will work with any data frame
    containing "position" and "contig" columns.
    '''

    positions = df['position'].tolist()
    contigs = df['contig'].tolist()
    current_pos = 0
    cluster_ix = []
    current_contig = ""
    cluster_dfs = {}
    n = 0
    for ix in range(0, len(positions)):
        next_pos = positions[ix]
        next_contig = contigs[ix]
        if (((next_pos < current_pos + distance) &
             (next_contig == current_contig))):
            cluster_ix.append(ix)
        else:
            if len(cluster_ix) >= size:
                start, end = (cluster_ix[0], cluster_ix[-1] + 1)
                cluster_dfs[n] = df.iloc[start:end]
                n += 1
            cluster_ix = []
            current_pos = next_pos
            current_contig = next_contig

    E.info("found %i clusters" % n)
    return cluster_dfs
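# Hypothetical usage sketch for findClusters: cluster CpG positions that lie
# within 100 bp of the locus that opened the run, keeping clusters with at
# least three members. Assumes pandas and the E logging module are imported;
# the coordinates are invented. Note that, as written above, the locus that
# anchors a run is not itself appended to the cluster.
def _example_findClusters():
    import pandas as pd

    cpgs = pd.DataFrame({
        "contig": ["chr1"] * 6,
        "position": [100, 150, 160, 190, 5000, 5050]})

    clusters = findClusters(cpgs, distance=100, size=3,
                            tracks_map=None, groups=None)
    for n, cluster in clusters.items():
        # expected: 0 [150, 160, 190]
        print(n, cluster["position"].tolist())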
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
def mergeExpectedAndObservedGenomeCoverage(infiles, outfile):
    '''merge the expected and actual estimates of genome coverage'''

    expected = open(infiles[0])
    expected_header = expected.readline()
    observed = open(infiles[1])
    observed_header = observed.readline()

    expected_data = {}
    E.info("reading expected coverage over genomes")
    for line in expected.readlines():
        data = line[:-1].split("\t")
        gi, coverage = data[0], data[1]
        expected_data[gi] = coverage

    outf = open(outfile, "w")
    E.info("writing results")
    outf.write("track\tgi\tspecies\tobserved\texpected\n")
    for line in observed.readlines():
        data = line[:-1].split("\t")
        track, gi, species, coverage = (
            data[0], data[1], "_".join(data[2].split("_")[5:7]), data[3])
        outf.write("%s\t%s\t%s\t%s\t%s\n" %
                   (track, gi, species, coverage, expected_data[gi]))
    outf.close()
def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        E.debug("# merging %s" % fn)
        infile = IOTools.openFile(fn, "r")

        if options.output_header:
            self.parseHeader(infile, outfile, options)

        for l in infile:
            nfields = l.count("\t")

            if l[0] == "#":
                options.stdlog.write(l)
            elif self.nfields is not None and nfields != self.nfields:
                # validate number of fields in row, raise warning
                # for those not matching and skip.
                E.warn(
                    "# line %s has unexpected number of fields: %i != %i" %
                    (l[:-1], nfields, self.nfields))
            else:
                if self.mFieldIndex is not None:
                    data = l[:-1].split("\t")
                    try:
                        data[self.mFieldIndex] = self.mMapper(
                            fi, data[self.mFieldIndex])
                    except IndexError:
                        raise IndexError(
                            "can not find field %i in %s" %
                            (self.mFieldIndex, l))
                    l = "\t".join(data) + "\n"

                outfile.write(l)

        infile.close()
def sanitizeGenome(iterator, contigs):
    """truncate bed intervals that extend beyond contigs.

    removes empty intervals (start == end).

    throws an error if start > end.
    """

    ninput, noutput = 0, 0
    ntruncated_contig, nskipped_contig, nskipped_empty = 0, 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: changing >= to > in if statement: next line sets
        # bed.end = contigs[bed.contig], so this shouldn't count
        # as a truncation.
        if bed.end > contigs[bed.contig]:
            bed.end = contigs[bed.contig]
            ntruncated_contig += 1
        if bed.start < 0:
            bed.start = 0
            ntruncated_contig += 1
        if bed.start == bed.end:
            nskipped_empty += 1
            continue
        elif bed.start > bed.end:
            raise ValueError("invalid interval: start > end for %s" % str(bed))

        noutput += 1
        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, ntruncated=%i, nskipped_empty=%i" %
           (ninput, noutput, nskipped_contig, ntruncated_contig, nskipped_empty))
def extendInterval(iterator, contigs, distance):
    """extend bed intervals by `distance` on either side.

    `contigs` is a dictionary of contig sizes. Intervals on unknown
    contigs or outside contig bounds are skipped; extended intervals
    are clipped to contig boundaries.
    """

    ninput, noutput, nskipped = 0, 0, 0
    for bed in iterator:
        ninput += 1

        if bed.contig not in contigs:
            nskipped += 1
            continue
        if bed.start < 0 or bed.end < 0:
            nskipped += 1
            continue
        if bed.end > contigs[bed.contig]:
            nskipped += 1
            continue

        newstart = bed.start - distance
        newend = bed.end + distance

        if newstart < 0:
            newstart = 0

        if newend > contigs[bed.contig]:
            newend = contigs[bed.contig]

        bed.start = newstart
        bed.end = newend

        noutput += 1
        yield bed

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
def readTable(filename, options):
    '''read table and filter.'''

    if os.path.exists(filename):
        lines = IOTools.openFile(filename, "r").readlines()
    else:
        lines = []

    # extract table by regular expression
    if options.regex_start:
        rx = re.compile(options.regex_start)
        for n, line in enumerate(lines):
            if rx.search(line):
                E.info("reading table from line %i/%i" % (n, len(lines)))
                lines = lines[n:]
                break
        else:
            E.info("start regex not found - no table")
            lines = []

    if options.regex_end:
        rx = re.compile(options.regex_end)
        for n, line in enumerate(lines):
            if rx.search(line):
                break
        lines = lines[:n]

    # remove comments and empty lines
    lines = [x for x in lines if not x.startswith("#") and x.strip()]

    return lines
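# Hypothetical usage sketch for readTable: extract the table that follows a
# "## summary" marker and ends at a "## end" marker from a mixed log file.
# Only options.regex_start and options.regex_end are consulted, so a plain
# namespace is enough; the file name and markers are invented.
def _example_readTable():
    import argparse

    options = argparse.Namespace(regex_start="^## summary",
                                 regex_end="^## end")
    for line in readTable("benchmark.log", options):
        print(line.rstrip("\n"))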
def filterGenome(iterator, contigs):
    """remove bed intervals that are outside of contigs.

    contigs is a dictionary of contig sizes."""

    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range, nskipped_endzero = 0, 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: add filtering for filtering <0 co-ordinates
        if bed.start < 0 or bed.end < 0:
            nskipped_range += 1
            continue
        # should this not be just >, as co-ordinates are half-closed, so
        # if end = contigs[bed.contig], then interval ends on last base?
        if bed.end > contigs[bed.contig]:
            nskipped_range += 1
            continue
        if bed.end == 0:
            nskipped_endzero += 1
            continue
        noutput += 1
        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, nskipped_range=%i, nskipped_endzero=%i" %
           (ninput, noutput, nskipped_contig, nskipped_range, nskipped_endzero))
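# Hypothetical usage sketch chaining the bed generators above: clean a bed
# file against contig sizes taken from a samtools .fai index. Assumes the
# CGAT Bed, IOTools and E modules are imported; the file names are invented.
def _example_filterGenome():
    contigs = {}
    with open("genome.fa.fai") as fai:
        for line in fai:
            fields = line.split("\t")
            contigs[fields[0]] = int(fields[1])

    with IOTools.openFile("intervals.bed.gz") as inf:
        beds = Bed.iterator(inf)
        for bed in filterGenome(sanitizeGenome(beds, contigs), contigs):
            print(str(bed))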
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''

    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        outfile = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outfile.write("\t".join(map(str, data)) + "\n")
        outfile.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        P.touch(outfile)
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
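# Hypothetical usage sketch for segmentWithCpG: locate CpG dinucleotides in a
# two-contig toy fasta. Assumes the CGAT FastaIterator module accepts any
# file-like handle; the sequences are invented.
def _example_segmentWithCpG():
    from io import StringIO

    toy_fasta = StringIO(">chr1 test\nAACGTTCG\n>chr2\nGGCGA\n")
    segments, sizes = segmentWithCpG(toy_fasta, with_contig_sizes=True)

    # segments are (contig, start, end, score) in 0-based half-open coordinates
    # expected: [('chr1', 2, 4, 1.0), ('chr1', 6, 8, 1.0), ('chr2', 2, 4, 1.0)]
    print(segments)
    print(sizes)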
def buildRawGenomeAlignment(infiles, outfile):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    for infile in infiles:
        # skip maf files without Hsap on top.
        if "other" in infile or "supercontig" in infile:
            continue

        E.info("adding %s" % infile)

        genome_query, genome_target = getGenomes()

        statement = '''gunzip < %(infile)s
             | python %(scriptsdir)s/maf2psl.py
                  --query=%(maf_name_query)s
                  --target=%(maf_name_target)s
                  --log=%(outfile)s.log
             | python %(scriptsdir)s/psl2psl.py
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genome_query)s
                  --target-psl-file=%(genome_target)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard'''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
        --method=set-sequence --output-sam
    | CollectMultipleMetrics
        INPUT=/dev/stdin
        REFERENCE_SEQUENCE=%(genome_file)s
        ASSUME_SORTED=true
        OUTPUT=%(outfile)s
        VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
def parseHeader(self, infile, outfile, options):
    """parse header in infile."""

    # skip comments until header
    while 1:
        l = infile.readline()
        if not l or l[0] != "#":
            break
        options.stdlog.write(l)

    # Print only the first header and check if
    # all the headers are the same.
    if self.mHeader:
        if self.mHeader != l:
            raise ValueError(
                "inconsistent header in file %s\ngot=%s\nexpected=%s" %
                (infile, l, self.mHeader))
    else:
        outfile.write(l)
        self.mHeader = l

        if self.mFieldIndex is None and self.mFieldName:
            try:
                self.mFieldIndex = self.mHeader.split("\t").index(
                    self.mFieldName)
            except ValueError:
                E.warn("no mapping, can not find field %s in %s" %
                       (self.mFieldName, self.mHeader))
                self.mFieldName = None

            E.debug("substituting field: %s, %s" %
                    (self.mFieldName, self.mFieldIndex))
def count(self, filename1, filename2):
    """count overlap between two gtf files."""

    E.info("counting started for %s versus %s" % (filename1, filename2))

    idx2 = self.buildIndex(filename2)

    (self.mGenes1, self.mGenesOverlapping1,
     self.mExons1, self.mExonsOverlapping1,
     self.mBases1, self.mBasesOverlapping1) = self._count(filename1, idx2)

    self.mGenesUnique1 = self.mGenes1 - self.mGenesOverlapping1
    self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
    self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

    idx1 = self.buildIndex(filename1)

    (self.mGenes2, self.mGenesOverlapping2,
     self.mExons2, self.mExonsOverlapping2,
     self.mBases2, self.mBasesOverlapping2) = self._count(filename2, idx1)

    self.mGenesUnique2 = self.mGenes2 - self.mGenesOverlapping2
    self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
    self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf
    file using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 " -tPrefix=%(target_genome)s."
                 " -qPrefix=%(query_genome)s."
                 " %(tmpf1)s"
                 " %(target_contigs)s"
                 " %(query_contigs)s"
                 " %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def pslSelectQuery(options):

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    value, field = options.select.split("-")

    if field == "nmatches":
        f = lambda x: x.mNMatches
    elif field == "nmismatches":
        f = lambda x: x.mNMisMatches

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)

        if value == "most":
            options.stdout.write("%s\n" % str(data[-1]))
        elif value == "least":
            options.stdout.write("%s\n" % str(data[0]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def iterator_rename_query(infile, options):

    ninput, noutput, nerrors = 0, 0, 0

    map_old2new = {}
    x = 1
    while 1:

        match = infile.next()

        if not match:
            break
        ninput += 1

        if match.mQueryId not in map_old2new or options.unique:
            new = options.id_format % x
            map_old2new[match.mQueryId] = new
            x += 1
        else:
            new = map_old2new[match.mQueryId]
        match.mQueryId = new
        yield match

    if options.output_filename_map:
        outfile = open(options.output_filename_map, "w")
        outfile.write("%s\t%s\n" % ("old", "new"))
        for old, new in map_old2new.iteritems():
            outfile.write("%s\t%s\n" % (old, new))
        outfile.close()

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))
def _copy(src, dest):
    if os.path.exists(dest):
        shutil.rmtree(dest)
    if not os.path.exists(src):
        E.warn("%s does not exist - skipped" % src)
        return
    shutil.copytree(os.path.abspath(src), dest)
def pslAddSequence(query_fasta, sbjct_fasta, options):

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        match = iterator.next()

        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom, match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        E.debug("# merging %s" % fn)
        infile = IOTools.openFile(fn, "r")

        if options.output_header:
            self.parseHeader(infile, outfile, options)

        for l in infile:
            if l[0] == "#":
                options.stdlog.write(l)
            else:
                if self.mFieldIndex is not None:
                    data = l[:-1].split("\t")
                    try:
                        data[self.mFieldIndex] = self.mMapper(
                            fi, data[self.mFieldIndex])
                    except IndexError:
                        raise IndexError(
                            "can not find field %i in %s" %
                            (self.mFieldIndex, l))
                    l = "\t".join(data) + "\n"

                outfile.write(l)

        infile.close()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) # do sth ninput, nskipped, noutput = 0, 0, 0 psl = None def chain_iterator(infile): lines = [] for line in options.stdin: if line.startswith("#"): continue if line.strip() == "": continue if line.startswith("chain"): if lines: yield lines lines = [] lines.append(line) yield lines for lines in chain_iterator(options.stdin): ninput += 1 psl = Blat.Match() (_, _, psl.mSbjctId, target_length, target_strand, target_start, target_end, psl.mQueryId, query_length, query_strand, query_start, query_end, alignment_id) = lines[0][:-1].split() (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength, psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \ [int(x) for x in (query_start, query_end, query_length, target_start, target_end, target_length)] map_query2target = alignlib_lite.py_makeAlignmentBlocks() qstart, tstart = psl.mQueryStart, psl.mSbjctStart for line in lines[1:-1]: size, dt, dq = [int(x) for x in line[:-1].split()] map_query2target.addDiagonal(qstart, qstart + size, tstart - qstart) qstart += size + dq tstart += size + dt size = int(lines[-1][:-1]) map_query2target.addDiagonal(qstart, qstart + size, tstart - qstart) psl.fromMap(map_query2target) # sort out strand # target_strand is always positive assert(target_strand == "+") # if query strand is negative if query_strand == "-": # invert both query and target psl.switchTargetStrand() # manually invert the query coordinates psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - \ psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom options.stdout.write("%s\n" % psl) noutput += 1 E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) # write footer and output benchmark information. E.Stop()
def iterate_guess(infile, max_tries=10000, guess=None):
    '''iterate over contents of fastq file.

    Guess quality format by looking at the first `max_tries` entries and
    then subsequently setting the quality score format for each entry.

    Arguments
    ---------
    infile : File
       File or file-like object to iterate over
    max_tries : int
       Number of records to examine for guessing the quality score format.
    guess : string
       Default format. This format will be chosen if the quality score
       format is ambiguous. The method checks if the `guess` is compatible
       with the records read so far.

    Yields
    ------
    fastq
       An object of type :class:`Record`.

    Raises
    ------
    ValueError
       If the ranges of the fastq records are not compatible,
       are incompatible with guess or are ambiguous.
    '''
    quals = set(RANGES.keys())
    cache = []
    myiter = iterate(infile)
    lengths = []
    for c, record in enumerate(myiter):
        quals.intersection_update(set(record.guessFormat()))
        if len(quals) == 0:
            raise ValueError("could not guess format - ranges incompatible.")
        if len(quals) == 1:
            break
        cache.append(record)
        lengths.append(len(record.seq))
        if c > max_tries:
            break

    if len(quals) == 1:
        ref_format = list(quals)[0]
    elif guess in quals:
        E.warn("multiple input formats possible: %s. Continuing with %s" %
               (", ".join(quals), guess))
        ref_format = guess
    elif quals.issubset(set(["solexa", "phred64"])):
        # guessFormat will call phred64 reads as phred64 AND solexa
        # if both still remain after max_tries, assume phred64
        ref_format = "phred64"
    else:
        raise ValueError("could not guess format - could be one of %s." %
                         str(quals))

    for r in cache:
        r.format = ref_format
        yield r

    for r in myiter:
        r.format = ref_format
        yield r
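# Hypothetical usage sketch for iterate_guess: stream a fastq file of unknown
# quality encoding, falling back to sanger if the scores stay ambiguous.
# Assumes this generator lives next to the module's own iterate() and RANGES
# definitions (as in CGAT's Fastq module); the file name is invented.
def _example_iterate_guess():
    import gzip

    with gzip.open("reads.fastq.gz", "rt") as inf:
        for record in iterate_guess(inf, max_tries=1000, guess="sanger"):
            # record.format has been fixed to the guessed encoding
            print(record.format)
            print(str(record))
            break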
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-s", "--source", dest="source_directory", type="string", default=False, help="The directory in which data" "files are held [%default]") parser.add_option("-d", "--dest", dest="dest_directory", type="string", default=False, help="The directory in which links" "are created [%default]") parser.set_defaults(source_directory=None, dest_directory=".") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) # read a map of input files to links with sanity checks map_filename2link = {} links = set() for line in options.stdin: if line.startswith("#"): continue # ignore header if line.startswith("source"): continue filename, link = line[:-1].split()[:2] if filename in map_filename2link: raise ValueError("duplicate filename '%s' " % filename) if link in links: raise ValueError("duplicate link '%s' " % link) map_filename2link[filename] = link links.add(link) counter = E.Counter() counter.input = len(map_filename2link) def _createLink(src, dest, counter): src = os.path.abspath(src) dest = os.path.abspath(os.path.join(options.dest_directory, dest)) if os.path.exists(dest): E.warn("existing symlink %s" % dest) counter.link_exists += 1 elif not os.path.exists(src): counter.file_not_found += 1 E.warn("did not find %s" % src) else: try: os.symlink(src, dest) counter.success += 1 except OSError: pass if not options.source_directory: # no source directory given, filenames must have complete path for filename, link in map_filename2link.items(): _createLink(filename, link, counter) else: # walk through directory hierchy and create links # for files matching filenames in map_filename2link found = set() for dirName, subdirList, fileList in os.walk(options.source_directory): for f in fileList: if f in map_filename2link: if f in found: E.warn("found multiple files with " "the same name %s" % f) else: _createLink(os.path.join(dirName, f), map_filename2link[f], counter) found.add(f) else: E.info("Filename %s not in map" % f) notfound = set(map_filename2link.keys()).difference(found) counter.notfound = len(notfound) if notfound: E.warn("did not find %i files: %s" % (len(notfound), str(notfound))) E.info(counter) # write footer and output benchmark information E.Stop()
def publish():
    '''publish report and data.'''
    E.info("publishing report")
    P.publish_report()


def update_report():
    '''update report.'''
    E.info("updating report")
    P.run_report(clean=False)


def build_report():
    '''build report from scratch.'''
    E.info("starting report build process from scratch")
    P.run_report(clean=True)
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version", usage=globals()["__doc__"]) parser.add_option( "-m", "--method", dest="methods", type="choice", action="append", choices=("translate", "translate-to-stop", "truncate-at-stop", "back-translate", "mark-codons", "apply-map", "build-map", "pseudo-codons", "filter", "interleaved-codons", "map-codons", "remove-gaps", "mask-seg", "mask-bias", "mask-codons", "mask-incomplete-codons", "mask-stops", "mask-soft", "remove-stops", "upper", "lower", "reverse-complement", "sample", "shuffle"), help="method to apply to sequences.") parser.add_option("-p", "--parameters", dest="parameters", type="string", help="parameter stack for methods that require one " "[default=%default].") parser.add_option("-x", "--ignore-errors", dest="ignore_errors", action="store_true", help="ignore errors [default = %default].") parser.add_option("--sample-proportion", dest="sample_proportion", type="float", help="sample proportion [default = %default].") parser.add_option("--exclude-pattern", dest="exclude_pattern", type="string", help="exclude all sequences with ids matching pattern " "[default = %default].") parser.add_option("--include-pattern", dest="include_pattern", type="string", help="include only sequences with ids matching pattern " "[default = %default].") parser.add_option("--filter-method", dest="filter_methods", type="string", action="append", help="filtering methods to apply " "[default = %default].") parser.add_option( "-t", "--sequence-type", dest="type", type="choice", choices=("aa", "na"), help="sequence type (aa or na) [%default]. This option determines " "which characters to use for masking [default = %default].") parser.add_option( "-l", "--template-identifier", dest="template_identifier", type="string", help="template for numerical identifier [default = %default] " "for the operation --build-map. 
A %i is replaced by the position " "of the sequence in the file.") parser.set_defaults( methods=[], parameters="", type="na", aa_mask_chars="xX", aa_mask_char="x", na_mask_chars="nN", na_mask_char="n", gap_chars="-.", gap_char="-", template_identifier="ID%06i", ignore_errors=False, exclude_pattern=None, include_pattern=None, sample_proportion=None, filter_methods=[], ) (options, args) = E.Start(parser) options.parameters = options.parameters.split(",") rx_include, rx_exclude = None, None if options.include_pattern: rx_include = re.compile(options.include_pattern) if options.exclude_pattern: rx_exclude = re.compile(options.exclude_pattern) iterator = FastaIterator.FastaIterator(options.stdin) nseq = 0 map_seq2nid = {} if "apply-map" in options.methods: map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r")) del options.parameters[0] if options.type == "na": mask_chars = options.na_mask_chars mask_char = options.na_mask_char else: mask_chars = options.aa_mask_chars mask_char = options.aa_mask_char if "map-codons" in options.methods: map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r")) del options.parameters[0] if "mask-soft" in options.methods: f = options.parameters[0] del options.parameters[0] hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r")) if "mask-codons" in options.methods or "back-translate" in options.methods: # open a second stream to read sequences from f = options.parameters[0] del options.parameters[0] other_iterator = FastaIterator.FastaIterator(open(f, "r")) ninput, noutput, nerrors, nskipped = 0, 0, 0, 0 if "sample" in options.methods: if not options.sample_proportion: raise ValueError("specify a sample proportion") sample_proportion = options.sample_proportion else: sample_proportion = None filter_min_sequence_length = None filter_max_sequence_length = None filter_id_list = None for f in options.filter_methods: if f.startswith("min-length"): filter_min_sequence_length = int(f.split("=")[1]) elif f.startswith("max-length"): filter_max_sequence_length = int(f.split("=")[1]) elif f.startswith("id-file"): filter_id_list = [ line[:-1] for line in IOTools.openFile(f.split("=")[1]) ] def raiseIfNotCodon(l, title): '''raise ValueError if sequence length l is not divisible by 3''' if l % 3 != 0: raise ValueError("length of sequence %s not divisible by 3" % (title)) while 1: try: cur_record = iterator.next() except StopIteration: break if cur_record is None: break nseq += 1 ninput += 1 sequence = re.sub(" ", "", cur_record.sequence) l = len(sequence) if rx_include and not rx_include.search(cur_record.title): nskipped += 1 continue if rx_exclude and rx_exclude.search(cur_record.title): nskipped += 1 continue if sample_proportion: if random.random() > sample_proportion: continue if not (filter_id_list is None or cur_record.title in filter_id_list): nskipped += 1 continue for method in options.methods: if method == "translate": # translate such that gaps are preserved seq = [] ls = len(re.sub('[%s]' % options.gap_chars, sequence, "")) if ls % 3 != 0: msg = "length of sequence %s (%i) not divisible by 3" % ( cur_record.title, ls) nerrors += 1 if options.ignore_errors: E.warn(msg) continue else: raise ValueError(msg) for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: aa = Genomics.MapCodon2AA(codon) seq.append(aa) sequence = "".join(seq) elif method == "back-translate": # translate from an amino acid alignment to codon alignment seq = [] try: other_record = other_iterator.next() except StopIteration: raise "run out of sequences." 
if cur_record.title != other_record.title: raise "sequence titles don't match: %s %s" % ( cur_record.title, other_record.title) other_sequence = re.sub("[ %s]" % options.gap_chars, "", other_record.sequence) if len(other_sequence) % 3 != 0: raise ValueError( "length of sequence %s not divisible by 3" % (other_record.title)) r = re.sub("[%s]" % options.gap_chars, "", sequence) if len(other_sequence) != len(r) * 3: raise ValueError( "length of sequences do not match: %i vs %i" % (len(other_sequence), len(r))) x = 0 for aa in sequence: if aa in options.gap_chars: c = options.gap_char * 3 else: c = other_sequence[x:x + 3] x += 3 seq.append(c) sequence = "".join(seq) elif method == "pseudo-codons": raiseIfNotCodon(l, cur_record.title) seq = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: aa = Genomics.MapCodon2AA(codon) seq.append(aa) sequence = " ".join(seq) elif method == "reverse-complement": sequence = string.translate( sequence, string.maketrans("ACGTacgt", "TGCAtgca"))[::-1] elif method in ("mask-stops", "remove-stops"): c = [] codon = [] new_sequence = [] if method == "mask-stops": char = options.na_mask_char elif method == "remove-stops": char = options.gap_char for x in sequence: if x not in options.gap_chars: codon.append(x.upper()) c.append(x) if len(codon) == 3: codon = "".join(codon).upper() # mask all non-gaps if Genomics.IsStopCodon(codon): for x in c: if x in options.gap_chars: new_sequence.append(x) else: new_sequence.append(char) else: new_sequence += c c = [] codon = [] new_sequence += c sequence = "".join(new_sequence) elif method == "mask-soft": # Get next hard masked record and extract sequence and length try: cur_hm_record = hard_masked_iterator.next() except StopIteration: break hm_sequence = re.sub(" ", "", cur_hm_record.sequence) lhm = len(hm_sequence) new_sequence = [] # Check lengths of unmasked and soft masked sequences the same if l != lhm: raise ValueError( "length of unmasked and hard masked sequences not " "identical for record %s" % (cur_record.title)) # Check if hard masked seq contains repeat (N), if so replace N # with lowercase sequence from unmasked version if sequence == hm_sequence: pass else: for x, y in itertools.izip_longest(sequence, hm_sequence): if y == "N": new_sequence += x.lower() else: new_sequence += x.upper() sequence = "".join(new_sequence) elif method == "map-codons": raiseIfNotCodon(l, cur_record.title) seq = [] for codon in (sequence[x:x + 3].upper() for x in xrange(0, l, 3)): if codon not in map_codon2code: aa = "X" else: aa = map_codon2code[codon] seq.append(aa) sequence = "".join(seq) elif method == "interleaved-codons": raiseIfNotCodon(l, cur_record.title) seq = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: aa = Genomics.MapCodon2AA(codon) seq.append("%s:%s" % (aa, codon)) sequence = " ".join(seq) elif method == "translate-to-stop": seq = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: if Genomics.IsStopCodon(codon): break aa = Genomics.MapCodon2AA(codon) seq.append(aa) sequence = "".join(seq) elif method == "truncate-at-stop": seq = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: if Genomics.IsStopCodon(codon): break seq.append(codon) sequence = "".join(seq) elif method == "remove-gaps": seq = [] for s in sequence: if s in options.gap_chars: continue seq.append(s) sequence = "".join(seq) elif method == "upper": sequence = sequence.upper() elif method == "lower": sequence = sequence.lower() elif method == "mark-codons": raiseIfNotCodon(l, cur_record.title) seq = [] sequence = " 
".join( [sequence[x:x + 3] for x in range(0, l, 3)]) elif method == "apply-map": id = re.match("^(\S+)", cur_record.title).groups()[0] if id in map_seq2nid: rest = cur_record.title[len(id):] cur_record.title = map_seq2nid[id] + rest elif method == "build-map": # build a map of identifiers id = re.match("^(\S+)", cur_record.title).groups()[0] new_id = options.template_identifier % nseq if id in map_seq2nid: raise "duplicate fasta entries - can't map those: %s" % id map_seq2nid[id] = new_id cur_record.title = new_id elif method == "mask-bias": masker = Masker.MaskerBias() sequence = masker(sequence) elif method == "mask-seg": masker = Masker.MaskerSeg() sequence = masker(sequence) elif method == "shuffle": s = list(sequence) random.shuffle(s) sequence = "".join(s) elif method == "mask-incomplete-codons": seq = list(sequence) for x in range(0, l, 3): nm = len(filter(lambda x: x in mask_chars, seq[x:x + 3])) if 0 < nm < 3: seq[x:x + 3] = [mask_char] * 3 sequence = "".join(seq) elif method == "mask-codons": # mask codons based on amino acids given as reference # sequences. other_record = other_iterator.next() if other_record is None: raise ValueError("run out of sequences.") if cur_record.title != other_record.title: raise ValueError("sequence titles don't match: %s %s" % (cur_record.title, other_record.title)) other_sequence = re.sub(" ", "", other_record.sequence) if len(other_sequence) * 3 != len(sequence): raise ValueError( "sequences for %s don't have matching lengths %i - %i" % (cur_record.title, len(other_sequence) * 3, len(sequence))) seq = list(sequence) c = 0 for x in other_sequence: if x in options.aa_mask_chars: if x.isupper(): seq[c:c + 3] = [options.na_mask_char.upper()] * 3 else: seq[c:c + 3] = [options.na_mask_char.lower()] * 3 c += 3 sequence = "".join(seq) l = len(sequence) if filter_min_sequence_length is not None and \ l < filter_min_sequence_length: nskipped += 1 if filter_max_sequence_length is not None and \ l > filter_max_sequence_length: nskipped += 1 continue options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence)) noutput += 1 if "build-map" in options.methods: p = options.parameters[0] if p: outfile = open(p, "w") else: outfile = options.stdout outfile.write("old\tnew\n") for old_id, new_id in map_seq2nid.items(): outfile.write("%s\t%s\n" % (old_id, new_id)) if p: outfile.close() E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" % (ninput, noutput, nskipped, nerrors)) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=( "normalize-by-min-diagonal", "normalize-by-column", "log", "ln", "negzero2value", "set-diagonal", "subtract-matrix", "mix-matrix", "normalize-by-matrix", "normalize-by-column-max", "normalize-by-row-max", "normalize-by-column-min", "normalize-by-row-min", "normalize-by-column-median", "normalize-by-row-median", "normalize-by-column-mean", "normalize-by-row-mean", "normalize-by-column-total", "normalize-by-row-total", "correspondence-analysis", "normalize-by-value", "add-value", "sort-rows", "sort-columns", "transpose", "upper-bound", "lower-bound", "subtract-first-col", "multiply-by-value", "divide-by-value", "mask-rows", "mask-columns", "mask-rows-and-columns", "symmetrize-mean", "symmetrize-max", "symmetrize-min", ), help="""method to use [default=%default]""") parser.add_option("-s", "--scale", dest="scale", type="float", help="factor to scale matrix by [default=%default].") parser.add_option("-f", "--format", dest="format", type="string", help="output number format [default=%default].") parser.add_option("--rows-tsv-file", dest="filename_rows", type="string", help="filename with rows to mask [default=%default].") parser.add_option("--columns-tsv-file", dest="filename_columns", type="string", help="filename with columns to mask [default=%default].") parser.add_option("-p", "--parameters", dest="parameters", type="string", help="Parameters for various functions.") parser.add_option("-t", "--header-names", dest="headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--no-headers", dest="headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("-a", "--value", dest="value", type="float", help="value to use for various algorithms.") parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("full", "sparse", "phylip"), help="""input format for matrix.""") parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("full", "sparse", "phylip"), help="""output format for matrix.""") parser.add_option( "--missing-value", dest="missing", type="float", help= "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]." 
) parser.set_defaults( methods=[], scale=1.0, headers=True, format="%6.4f", output_format="full", input_format="full", value=0.0, parameters="", write_separators=True, filename_rows=None, filename_columns=None, missing=None, ) (options, args) = E.Start(parser) options.parameters = options.parameters.split(",") lines = [x for x in sys.stdin.readlines() if x[0] != "#"] if len(lines) == 0: raise IOError("no input") chunks = [x for x in range(len(lines)) if lines[x][0] == ">"] if not chunks: options.write_separators = False chunks = [-1] chunks.append(len(lines)) if options.filename_rows: row_names, n = IOTools.ReadList(open(options.filename_rows, "r")) if options.filename_columns: column_names, n = IOTools.ReadList(open(options.filename_columns, "r")) for chunk in range(len(chunks) - 1): try: raw_matrix, row_headers, col_headers = MatlabTools.readMatrix( StringIO("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])), format=options.input_format, headers=options.headers, missing=options.missing) except ValueError as msg: E.warn("matrix could not be read: %s" % msg) continue nrows, ncols = raw_matrix.shape E.debug("read matrix: %i x %i, %i row titles, %i colum titles" % (nrows, ncols, len(row_headers), len(col_headers))) parameter = 0 for method in options.methods: matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape) if method in ("normalize-by-matrix", "subtract-matrix", "mix-matrix", "add-matrix"): other_matrix, other_row_headers, other_col_headers = MatlabTools.ReadMatrix( open(options.parameters[parameter], "r"), headers=options.headers) other_nrows, other_ncols = other_matrix.shape if options.loglevel >= 2: options.stdlog.write( "# read second matrix from %s: %i x %i, %i row titles, %i colum titles.\n" % (options.parameters[parameter], other_nrows, other_ncols, len(other_row_headers), len(other_col_headers))) parameter += 1 elif method == "normalize-by-min-diagonal": for x in range(nrows): for y in range(ncols): m = min(raw_matrix[x, x], raw_matrix[y, y]) if m > 0: matrix[x, y] = raw_matrix[x, y] / m elif method == "normalize-by-column": if nrows != ncols: raise ValueError("only supported for symmeric matrices") for x in range(nrows): for y in range(ncols): if raw_matrix[y, y] > 0: matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y] elif method == "normalize-by-value": matrix = raw_matrix / float(options.parameters[parameter]) parameter += 1 elif method == "normalize-by-row": if nrows != ncols: raise ValueError("only supported for symmeric matrices") for x in range(nrows): for y in range(ncols): if raw_matrix[y, y] > 0: matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x] elif method == "subtract-first-col": for x in range(nrows): for y in range(ncols): matrix[x, y] -= raw_matrix[x, 0] elif method.startswith("normalize-by-column"): if method.endswith("max"): f = max elif method.endswith("min"): f = min elif method.endswith("median"): f = scipy.median elif method.endswith("mean"): f = scipy.mean elif method.endswith("total"): f = sum for y in range(ncols): m = f(matrix[:, y]) if m != 0: for x in range(nrows): matrix[x, y] = matrix[x, y] / m elif method.startswith("normalize-by-row"): if method.endswith("max"): f = max elif method.endswith("min"): f = min elif method.endswith("median"): f = scipy.median elif method.endswith("mean"): f = scipy.mean elif method.endswith("total"): f = sum for x in range(nrows): m = f(matrix[x, :]) if m != 0: for y in range(ncols): matrix[x, y] = raw_matrix[x, y] / m elif method == "negzero2value": # set zero/negative values to a value for x in 
range(nrows): for y in range(ncols): if matrix[x, y] <= 0: matrix[x, y] = options.value elif method == "minmax": # set zero/negative values to a value for x in range(nrows): for y in range(ncols): matrix[x, y], matrix[y, x] = \ min(matrix[x, y], matrix[y, x]), \ max(matrix[x, y], matrix[y, x]) elif method == "log": # apply log to all values. for x in range(nrows): for y in range(ncols): if matrix[x, y] > 0: matrix[x, y] = math.log10(matrix[x, y]) elif method == "ln": for x in range(nrows): for y in range(ncols): if matrix[x, y] > 0: matrix[x, y] = math.log(matrix[x, y]) elif method == "transpose": matrix = numpy.transpose(matrix) row_headers, col_headers = col_headers, row_headers nrows, ncols = ncols, nrows elif method == "mul": matrix = numpy.dot(matrix, numpy.transpose(matrix)) col_headers = row_headers elif method == "multiply-by-value": matrix *= options.value elif method == "divide-by-value": matrix /= options.value elif method == "add-value": matrix += options.value elif method == "angle": # write angles between col vectors v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0)) matrix = numpy.dot(numpy.transpose(matrix), matrix) row_headers = col_headers nrows = ncols for x in range(nrows): for y in range(ncols): matrix[x, y] /= v1[x] * v1[y] elif method == "euclid": # convert to euclidean distance matrix matrix = numpy.zeros((ncols, ncols), numpy.float) for c1 in range(0, ncols - 1): for c2 in range(c1 + 1, ncols): for r in range(0, nrows): d = raw_matrix[r][c1] - raw_matrix[r][c2] matrix[c1, c2] += (d * d) matrix[c2, c1] = matrix[c1, c2] matrix = numpy.sqrt(matrix) row_headers = col_headers nrows = ncols elif method.startswith("symmetrize"): f = method.split("-")[1] if f == "max": f = max elif f == "min": f = min elif f == "mean": f = lambda x, y: float(x + y) / 2 if nrows != ncols: raise ValueError( "symmetrize only available for symmetric matrices") if row_headers != col_headers: raise ValueError( "symmetrize not available for permuted matrices") for x in range(nrows): for y in range(ncols): matrix[x, y] = matrix[y, x] = f(matrix[x, y], matrix[y, x]) elif method == "sub": matrix = options.value - matrix elif method in ("lower-bound", "upper-bound"): boundary = float(options.parameters[parameter]) new_value = float(options.parameters[parameter + 1]) parameter += 2 if method == "upper-bound": for x in range(nrows): for y in range(ncols): if matrix[x, y] > boundary: matrix[x, y] = new_value else: for x in range(nrows): for y in range(ncols): if matrix[x, y] < boundary: matrix[x, y] = new_value elif method == "subtract-matrix": matrix = matrix - other_matrix elif method == "add-matrix": matrix = matrix + other_matrix elif method == "normalize-by-matrix": # set 0s to 1 in the other matrix for x in range(nrows): for y in range(ncols): if other_matrix[x, y] == 0: other_matrix[x, y] = 1.0 matrix = matrix / other_matrix elif method == "mix-matrix": for x in range(len(other_row_headers) - 1): for y in range(x + 1, len(other_col_headers)): matrix[x, y] = other_matrix[x, y] elif method == "set-diagonal": value = float(options.parameters[parameter]) for x in range(min(nrows, ncols)): matrix[x, x] = value parameter += 1 elif method == "transpose": matrix = numpy.transpose(raw_matrix) row_headers, col_headers = col_headers, row_headers elif method == "correspondence-analysis": row_indices, col_indices = CorrespondenceAnalysis.GetIndices( raw_matrix) map_row_new2old = numpy.argsort(row_indices) map_col_new2old = numpy.argsort(col_indices) matrix, row_headers, col_headers = 
CorrespondenceAnalysis.GetPermutatedMatrix( raw_matrix, map_row_new2old, map_col_new2old, row_headers=row_headers, col_headers=col_headers) elif method == "mask-rows": r = set(row_names) for x in range(len(row_headers)): if row_headers[x] in r: matrix[x, :] = options.value elif method == "mask-columns": r = set(column_names) for x in range(len(col_headers)): if col_headers[x] in r: matrix[:, x] = options.value elif method == "mask-rows-and-columns": r = set(row_names) c = set(column_names) for x in range(len(row_headers)): for y in range(len(col_headers)): if row_headers[x] in r and col_headers[y] in c: matrix[x, y] = options.value raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape) else: # for simple re-formatting jobs matrix = raw_matrix if options.write_separators: options.stdout.write(lines[chunks[chunk]]) MatlabTools.writeMatrix(sys.stdout, matrix, value_format=options.format, format=options.output_format, row_headers=row_headers, col_headers=col_headers) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: psl2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option( "--filter-query", dest="filename_filter_query", type="string", help= "filename with intervals in the query to filter (in gff format) [default=%default]." ) parser.add_option( "--filter-target", dest="filename_filter_target", type="string", help= "filename with intervals in the target to filter (in gff format) [default=%default]." ) parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=("map", "merge", "add-sequence", "complement", "select-query", "test", "filter-keep", "filter-remove", "rename-query", "sanitize", "filter-fasta", "remove-overlapping-query", "remove-overlapping-target"), help="""action to perform [default=%default].""") parser.add_option("--select", dest="select", type="choice", choices=("most-nmatches", "least-nmatches", "most-nmismatches", "least-nmismatches"), help="entry to select [default=%default].") parser.add_option("--header", dest="header", type="choice", choices=("none", "table", "full"), help="output psl header [default=%default].") parser.add_option("--format", dest="format", type="choice", choices=("gff", "gtf"), help="format of intervals [default=%default].") parser.add_option("--filename-queries", dest="filename_queries", type="string", help="fasta filename with queries.") parser.add_option("--filename-target", dest="filename_sbjcts", type="string", help="fasta filename with sbjct [default=%default].") parser.add_option( "--id-format", dest="id_format", type="string", help= "format of new identifiers for the rename function [default=%default]." ) parser.add_option( "--unique", dest="unique", action="store_true", help= "in the rename function, make each match unique [default=%default].") parser.add_option( "--output-filename-map", dest="output_filename_map", type="string", help= "filename with map of old to new labels for rename function [default=%default]." ) parser.add_option( "--complement-min-length", dest="complement_min_length", type="int", help="minimum length for complemented blocks [default=%default].") parser.add_option( "--complement-border", dest="complement_border", type="int", help= "number of residues to exclude before alignment at either end [default=%default]." ) parser.add_option( "--complement-aligner", dest="complement_aligner", type="choice", choices=("clustal", "dba", "dialign", "dialign-lgs"), help="aligner for complemented segments [default=%default].") parser.add_option( "--threshold-merge-distance", dest="threshold_merge_distance", type="int", help= "distance in nucleotides at which two adjacent reads shall be merged even if they are not overlapping [%default]." 
) parser.add_option( "--test", dest="test", type="int", help= "for debugging purposes - stop after x iterations [default=%default].") parser.set_defaults(filename_filter_target=None, filename_filter_query=None, filename_queries=None, filename_sbjcts=None, threshold_merge_distance=0, report_step=100000, min_aligned=100, methods=[], format="gff", select="most-nmatches", id_format="%06i", unique=False, output_filename_map=None, header=None, test=None) (options, args) = E.Start(parser, add_pipe_options=True) if options.filename_queries: query_fasta = IndexedFasta.IndexedFasta(options.filename_queries) else: query_fasta = None if options.filename_sbjcts: sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts) else: sbjct_fasta = None if "add-sequence" in options.methods and (sbjct_fasta is None or query_fasta is None): raise ValueError( "please supply both indexed query and target/genome sequence data." ) iterator = Blat.iterator(options.stdin) if options.header is not None or options.header != "none": if options.header == "table": options.stdout.write("\t".join(Blat.FIELDS) + "\n") elif options.header == "full": options.stdout.write(Blat.HEADER + "\n") for method in options.methods: if "map" == method: pslMap(options) break elif "filter-keep" == method: pslFilter(options, keep=True) break elif "filter-remove" == method: pslFilter(options, keep=False) break elif "merge" == method: pslMerge(options) break elif "add-sequence" == method: pslAddSequence(query_fasta, sbjct_fasta, options) break elif "complement" == method: pslComplement(query_fasta, sbjct_fasta, options) break elif "select-query" == method: pslSelectQuery(options) break elif "test" == method: iterator = Blat.iterator_test(iterator, options.report_step) elif "rename-query" == method: iterator = iterator_rename_query(iterator, options) elif "sanitize" == method: iterator = iterator_sanitize(iterator, query_fasta, sbjct_fasta, options) elif "filter-fasta" == method: iterator = iterator_filter_fasta(iterator, query_fasta, sbjct_fasta, options) elif "remove-overlapping-query" == method: iterator = iterator_filter_overlapping_query(iterator, options) elif "remove-overlapping-target" == method: iterator = iterator_filter_overlapping_target(iterator, options) for psl in iterator: options.stdout.write("%s\n" % str(psl)) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option( "--input-filename-queries", dest="input_filename_queries", type="string", help= "fasta filename with queries - required for polyA analysis [%default]." ) parser.add_option("--polyA", dest="polyA", action="store_true", help="detect polyA tails [%default].") parser.add_option( "-p", "--output-filename-pattern", dest="output_filename_pattern", type="string", help= "OUTPUT filename with histogram information on aggregate coverages [%default]." ) parser.add_option( "--output-filename-empty", dest="output_filename_empty", type="string", help= "OUTPUT filename with queries for which all matches have been discarded [%default]." ) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("map", "psl"), help="output format to choose [%default].") parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true", help="input is zipped.") parser.add_option("--threshold-min-pid", dest="threshold_min_pid", type="float", help="minimum thresholds for pid [%default].") parser.add_option( "--threshold-min-matches", dest="threshold_min_matches", type="int", help="minimum threshold for number of matching residues [%default].") parser.add_option( "--threshold-max-error-rate", dest="threshold_max_error_rate", type="float", help="maximum threshold for error of aligned part [%default].") parser.add_option( "--threshold-good-query-coverage", dest="threshold_good_query_coverage", type="float", help= "minimum query coverage for segments to be counted as good [%default]." ) parser.add_option( "--threshold-min-query-coverage", dest="threshold_min_query_coverage", type="float", help="minimum query coverage for segments to be accepted [%default].") parser.add_option( "--threshold-max-query-gapchars", dest="threshold_max_query_gapchars", type="int", help="maximum number of gap characters in query[%default].") parser.add_option("--threshold-max-query-gaps", dest="threshold_max_query_gaps", type="int", help="maximum number of gaps in query[%default].") parser.add_option( "--threshold-max-sbjct-gapchars", dest="threshold_max_sbjct_gapchars", type="int", help="maximum number of gap characters in sbjct[%default].") parser.add_option("--keep-unique-matches", dest="keep_unique_matches", action="store_true", help="ignore filters for unique matches [%default].") parser.add_option( "--keep-all-best", dest="keep_all_best", action="store_true", help= "when sorting matches, keep all matches within the collection threshold [%default]." ) parser.add_option( "--best-per-sbjct", dest="best_per_sbjct", action="store_true", help= "keep only the best entry per sbjct (for transcript mapping) [%default]." 
) parser.add_option("--threshold-max-sbjct-gaps", dest="threshold_max_sbjct_gaps", type="int", help="maximum number of gaps in sbjct[%default].") parser.add_option("--test", dest="test", type="int", help="test - stop after # rows of parsing[%default].") parser.add_option( "-m", "--matching-mode", dest="matching_mode", type="choice", choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage", "best-pid", "best-covpid", "best-query-covpid", "best-sbjct-covpid", "best-min-covpid", "best-query-min-covpid", "best-sbjct-min-covpid", "unique", "all"), help="determines how to selecte the best match [%default].") parser.add_option( "--filename-filter-sbjct", dest="filename_filter_sbjct", type="string", help= "gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default]." ) parser.add_option( "--keep-forbidden", dest="keep_forbidden", action="store_true", help= "if set, keep only matches that overlap the regions supplied with --filename-filter-sbjct [%default]." ) parser.add_option( "--query-forward-coordinates", dest="query_forward_coordinates", action="store_true", help= "use forward coordinates for query, strand will refer to sbjct [%default]." ) parser.add_option( "--ignore-all-random", dest="ignore_all_random", action="store_true", help= "if there are multiple best matches, ignore all those to chrUn and _random [%default]." ) parser.add_option( "--collection-threshold", dest="collection_threshold", type="float", help= "threshold for collecting matches, percent of best score [%default].") parser.add_option( "--collection-distance", dest="collection_distance", type="float", help= "threshold for collecting matches, difference to best score [%default]." ) parser.set_defaults( input_filename_domains=None, input_filename_queries=None, threshold_good_query_coverage=90.0, threshold_min_pid=30.0, threshold_min_matches=0, threshold_max_error_rate=None, output_filename_pattern="%s", keep_unique_matches=False, output_format="map", print_matched=["full", "partial", "good"], from_zipped=False, combine_overlaps=True, min_length_domain=30, threshold_min_query_coverage=50, min_length_singletons=30, new_family_id=10000000, add_singletons=False, matching_mode="best-coverage", best_per_sbjct=False, threshold_max_query_gapchars=None, threshold_max_query_gaps=None, threshold_max_sbjct_gapchars=None, threshold_max_sbjct_gaps=None, filename_filter_sbjct=None, keep_forbidden=False, keep_all_best=False, test=None, query_forward_coordinates=False, output_filename_empty=None, collection_threshold=1.0, collection_distance=0, polyA=False, # max residues missing from non polyA end polyA_max_unaligned=3, # min residues in tail polyA_min_unaligned=10, # min percent residues that are A/T in tail polyA_min_percent=70.0, ## ignore duplicate matches if they are on Un or _random ignore_all_random=False, ) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) == 1: if options.from_zipped or args[0][-3:] == ".gz": import gzip infile = gzip.open(args[0], "r") else: infile = open(args[0], "r") else: infile = sys.stdin if options.input_filename_queries: queries_fasta = IndexedFasta.IndexedFasta( options.input_filename_queries) else: queries_fasta = None if options.filename_filter_sbjct: try: import bx.intervals.io import bx.intervals.intersection except ImportError: raise "filtering for intervals requires the bx tools." 
intervals = GTF.readGFFFromFileAsIntervals( open(options.filename_filter_sbjct, "r")) intersectors = {} for contig, values in intervals.items(): intersector = bx.intervals.intersection.Intersecter() for start, end in values: intersector.add_interval(bx.intervals.Interval(start, end)) intersectors[contig] = intersector if options.loglevel >= 1: options.stdlog.write("# read %i intervals for %i contigs.\n" %\ (sum( [ len(x) for x in intervals.values() ] ), len( intersectors ) )) else: intersectors = None ################################################ ################################################ ################################################ ## processing of a chunk (matches of same query) ################################################ ninput, noutput, nskipped = 0, 0, 0 ## number of sequences with full/partial/good matches nfull_matches, npartial_matches, ngood_matches = 0, 0, 0 ## number of sequences which are fully/good/partially matched ## i.e., after combining all aligned regions nfully_matched, npartially_matched, nwell_matched = 0, 0, 0 nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0 nremoved_gaps, nremoved_nmatches = 0, 0 nremoved_regions = 0 nqueries_removed_region = 0 aggregate_coverages = [] mapped_coverages = [] fully_matched = [] well_matched = [] partially_matched = [] new_family_id = options.new_family_id if options.output_filename_empty: outfile_empty = open(options.output_filename_empty, "w") outfile_empty.write("read_id\tcomment\n") else: outfile_empty = None if options.polyA: options.outfile_polyA = open(options.output_filename_pattern % "polyA", "w") options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n") def processChunk(query_id, matches): """process a set of matches from query_id""" global ninput, noutput, nskipped global nfull_matches, npartial_matches, ngood_matches global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches global nremoved_regions, nqueries_removed_region global outfile_empty ninput += 1 full_matches = [] good_matches = [] partial_matches = [] x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0 nmatches = len(matches) new_matches = [] # absolute filters applicable to non-fragmentory matches for match in matches: if match.mPid < options.threshold_min_pid: nremoved_pid += 1 continue if match.mNMatches < options.threshold_min_matches: nremoved_nmatches += 1 continue if options.threshold_max_error_rate: r = 100.0 * math.power(options.threshold_max_error_rate, match.mNMatches + match.mNMismatches) if match.mPid < r: nremoved_pid += 1 x_nremoved_pid += 1 continue new_matches.append(match) matches = new_matches # filter matches if len(matches) == 0: if outfile_empty: outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\ (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) ) nskipped += 1 return if options.keep_unique_matches and len(matches) == 1: pass else: new_matches = [] for match in matches: if match.mQueryCoverage < options.threshold_min_query_coverage: nremoved_query_coverage += 1 x_nquery_coverage += 1 continue if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if 
options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue new_matches.append(match) matches = new_matches if len(matches) == 0: if outfile_empty: outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\ (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) ) nskipped += 1 return ## Remove queries matching to a forbidden region. This section ## will remove the full query if any of its matches matches in a ## forbidden region. keep = True for match in matches: if intersectors and match.mSbjctId in intersectors: found = intersectors[match.mSbjctId].find( match.mSbjctFrom, match.mSbjctTo) if found and not options.keep_forbidden or ( found and not options.keep_forbidden): nremoved_regions += 1 keep = False continue if not keep: nqueries_removed_region += 1 if outfile_empty: outfile_empty.write("%s\toverlap with forbidden region\n" % query_id) return ## check for full length matches for match in matches: if match.mQueryCoverage >= 99.9: full_matches.append(match) if match.mQueryCoverage > options.threshold_good_query_coverage: good_matches.append(match) else: partial_matches.append(match) if full_matches: nfull_matches += 1 elif good_matches: ngood_matches += 1 elif partial_matches: npartial_matches += 1 ## compute coverage of sequence with matches intervals = [] for match in full_matches + good_matches + partial_matches: intervals.append((match.mQueryFrom, match.mQueryTo)) rest = Intervals.complement(intervals, 0, match.mQueryLength) query_coverage = 100.0 * (match.mQueryLength - sum( map(lambda x: x[1] - x[0], rest))) / match.mQueryLength if query_coverage >= 99.9: fully_matched.append(query_id) elif query_coverage > options.threshold_good_query_coverage: well_matched.append(query_id) else: partially_matched.append(query_id) aggregate_coverages.append(query_coverage) ## select matches to output matches, msg = selectMatches(query_id, matches, options, queries_fasta) if len(matches) > 0: for match in matches: if options.query_forward_coordinates: match.convertCoordinates() if options.output_format == "map": options.stdout.write( "%s\n" %\ "\t".join( map(str, ( match.mQueryId, match.mSbjctId, match.strand, "%5.2f" % match.mQueryCoverage, "%5.2f" % match.mSbjctCoverage, "%5.2f" % match.mPid, match.mQueryLength, match.mSbjctLength, match.mQueryFrom, match.mQueryTo, match.mSbjctFrom, match.mSbjctTo, ",".join( map(str,match.mBlockSizes) ), ",".join( map(str,match.mQueryBlockStarts)), ",".join( map(str,match.mSbjctBlockStarts)), )))) elif options.output_format == "psl": options.stdout.write(str(match) + "\n") noutput += 1 else: if outfile_empty: outfile_empty.write("%s\tno matches selected: %s\n" % (query_id, msg)) nempty += 1 if options.output_format == "map": options.stdout.write("\t".join( ("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage", "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks", "qstarts", "sstarts")) + "\n") elif options.output_format == "psl": options.stdout.write(Blat.Match().getHeader() + "\n") ################################################ ################################################ ################################################ ## main loop ################################################ 
nfully_covered = None matches = [] last_query_id = None is_complete = True ninput_lines = 0 skip = 0 iterator = Blat.BlatIterator(infile) while 1: try: match = iterator.next() except Blat.ParsingError: iterator = Blat.BlatIterator(infile) continue if match == None: break ninput_lines += 1 if options.test and ninput_lines > options.test: break if match.mQueryId != last_query_id: if last_query_id: processChunk(last_query_id, matches) matches = [] last_query_id = match.mQueryId matches.append(match) processChunk(last_query_id, matches) printHistogram(aggregate_coverages, "aggregate", options) printHistogram(mapped_coverages, "mapped", options) if "full" in options.print_matched: printMatched(fully_matched, "full", options) if "good" in options.print_matched: printMatched(well_matched, "good", options) if "partial" in options.print_matched: printMatched(partially_matched, "partial", options) if options.loglevel >= 1: options.stdlog.write("# alignments: ninput=%i, is_complete=%s\n" % (ninput_lines, str(is_complete))) options.stdlog.write("# queries: ninput=%i, noutput=%i\n" % (ninput, noutput)) options.stdlog.write( "# individual coverage: full=%i, good=%i, partial=%i\n" % (nfull_matches, ngood_matches, npartial_matches)) options.stdlog.write( "# aggregate coverage: full=%i, good=%i, partial=%i\n" % (len(fully_matched), len(well_matched), len(partially_matched))) options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %\ (nskipped+nqueries_removed_region+nempty, nskipped, nqueries_removed_region, nempty)) options.stdlog.write( "# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" % (nremoved_pid, nremoved_query_coverage, nremoved_gaps, nremoved_regions, nremoved_nmatches)) E.Stop()
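# A compact sketch (not taken from psl2map.py) of the chunking pattern in the main
# loop above: matches arrive grouped by query id, are collected per query and handed
# to processChunk(). itertools.groupby expresses the same idea without manual state.
import itertools

def iterate_chunks(matches, process):
    """group an iterable of Blat matches by query id and process each group."""
    for query_id, chunk in itertools.groupby(matches, key=lambda m: m.mQueryId):
        process(query_id, list(chunk))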
def pslMerge(options): """merge psl alignments. """ iterator = Blat.BlatIterator(sys.stdin) ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0 last_query = None last_target = None last_strand = None def process(matches): new = matches[0].copy() map_query2target = alignlib_lite.py_makeAlignmentBlocks() graph = networkx.DiGraph() graph.add_nodes_from(xrange(len(matches) + 2)) matches.sort(key=lambda x: x.mQueryFrom) if Genomics.IsPositiveStrand(matches[0].strand): f = lambda x, y: x.mSbjctTo < y.mSbjctFrom else: f = lambda x, y: x.mSbjctFrom > y.mSbjctTo for x in range(0, len(matches)): xx = matches[x] if options.loglevel >= 6: options.stdlog.write("# graph: %2i %s\n" % (x, str(xx))) for y in range(x + 1, len(matches)): yy = matches[y] d = min(xx.mQueryTo, yy.mQueryTo) - \ max(xx.mQueryFrom, yy.mQueryFrom) if d > 0 or not f(xx, yy): continue else: graph.add_edge(x, y, {'weight': -d}) source = len(matches) target = len(matches) + 1 for x in range(len(matches)): xx = matches[x] graph.add_edge(source, x, {'weight': xx.mQueryFrom}) graph.add_edge(x, target, {'weight': xx.mQueryLength - xx.mQueryTo}) if options.loglevel >= 6: networkx.write_edgelist(graph, options.stdlog) path = networkx.dijkstra_path(graph, source, target) if options.loglevel >= 6: options.stdlog.write("# path: %s\n" % (str(path))) new_matches = [matches[x] for x in path[1:-1]] if len(matches) != len(new_matches): E.warn( "query=%s, target=%s, strand=%s: removed overlapping/out-of-order segments: before=%i, after=%i" % (matches[0].mQueryId, matches[0].mSbjctId, matches[0].strand, len(matches), len(new_matches))) matches = new_matches for match in matches: m = match.getMapQuery2Target() alignlib_lite.py_addAlignment2Alignment(map_query2target, m) new.fromMap(map_query2target, use_strand=True) options.stdout.write(str(new) + "\n") options.stdout.flush() return 1 while 1: match = iterator.next() if not match: break ninput += 1 if options.test and ninput >= options.test: break if options.loglevel >= 10: options.stdlog.write("# input: %s\n" % (str(match))) if ninput % options.report_step == 0: E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput)) if match.mQueryId != last_query or match.strand != last_strand or match.mSbjctId != last_target: if last_query: noutput += process(matches) matches = [] last_query, last_target, last_strand = match.mQueryId, match.mSbjctId, match.strand matches.append(match) if last_query: noutput += process(matches) E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" % (ninput, noutput, nskipped, ndiscarded))
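# A self-contained toy version of the segment-chaining idea in pslMerge()'s process():
# compatible segments become nodes in a DAG, edge weights count unaligned query bases,
# and the shortest source-to-sink path selects a consistent, non-overlapping chain.
# Segments are assumed sorted by query start, as in the code above.
import networkx

def chain_segments(segments, query_length):
    """segments: list of (query_from, query_to); returns indices of the chosen chain."""
    graph = networkx.DiGraph()
    n = len(segments)
    source, sink = n, n + 1
    for i, (qfrom, qto) in enumerate(segments):
        graph.add_edge(source, i, weight=qfrom)              # query bases skipped before segment i
        graph.add_edge(i, sink, weight=query_length - qto)   # query bases skipped after segment i
        for j in range(i + 1, n):
            jfrom, jto = segments[j]
            if jfrom >= qto:                                  # no overlap, correct order
                graph.add_edge(i, j, weight=jfrom - qto)      # gap between consecutive segments
    path = networkx.dijkstra_path(graph, source, sink)
    return path[1:-1]

# chain_segments([(0, 100), (90, 150), (150, 300)], 300) -> [0, 2]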
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $", usage=globals()["__doc__"]) parser.add_option("-e", "--exclusive", dest="exclusive", action="store_true", help="Intervals reported will be merged across the positive set" " and do not overlap any interval in any of the other sets" " [default=%default].") parser.add_option("-p", "--pattern-id", dest="pattern_id", type="string", help="pattern to convert a filename to an id [default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("merged-combinations", "unmerged-combinations"), help = "method to perform [default=%default]") parser.set_defaults( pattern_id="(.*).bed.gz", exclusive=False, method="merged-combinations", ) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) if len(args) < 2: raise ValueError("at least two arguments required") tags, bedfiles = [], [] for infile in args: bedfiles.append(pysam.Tabixfile(infile, "r")) tags.append(re.search(options.pattern_id, infile).groups()[0]) indices = range(len(bedfiles)) is_exclusive = options.exclusive if options.method == "merged-combinations": if is_exclusive: start = 1 else: start = 2 options.stdout.write("combination\twithout\tcounts\n") for ncombinants in range(start, len(bedfiles) + 1): for combination in itertools.combinations(indices, ncombinants): other = [x for x in indices if x not in combination] tag = ":".join([tags[x] for x in combination]) E.debug("combination %s started" % tag) E.debug("other: %s" % ":".join([tags[x] for x in other])) other_bed = [bedfiles[x] for x in other] outf = IOTools.openFile( E.getOutputFile(tag), "w", create_dir=True) c = E.Counter() for contig, start, end in combineMergedIntervals([bedfiles[x] for x in combination]): c.found += 1 if is_exclusive and isContainedInOne(contig, start, end, other_bed): c.removed += 1 continue c.output += 1 outf.write("%s\t%i\t%i\n" % (contig, start, end)) outf.close() E.info("combination %s finished: %s" % (tag, c)) options.stdout.write("%s\t%s\t%i\n" % ( ":".join([tags[x] for x in combination]), ":".join([tags[x] for x in other]), c.output)) elif options.method == "unmerged-combinations": options.stdout.write("track\tcombination\twithout\tcounts\n") for foreground in indices: start = 0 background = [x for x in indices if x != foreground] for ncombinants in range(0, len(background) + 1): for combination in itertools.combinations(background, ncombinants): other = [x for x in background if x not in combination] combination_bed = [bedfiles[x] for x in combination] other_bed = [bedfiles[x] for x in other] tag = ":".join([tags[foreground]] + [tags[x] for x in combination]) E.debug("fg=%i, combination=%s, other=%s" % (foreground, combination, other)) E.debug("combination %s started" % tag) E.debug("other: %s" % ":".join([tags[x] for x in other])) outf = IOTools.openFile( E.getOutputFile(tag), "w", create_dir=True) c = E.Counter() for bed in combineUnmergedIntervals( bedfiles[foreground], combination_bed): c.found += 1 if is_exclusive and isContainedInOne(bed.contig, bed.start, bed.end, other_bed): c.removed += 1 continue c.output += 1 outf.write("%s\n" % str(bed)) outf.close() E.info("combination %s finished: %s" % (tag, c)) options.stdout.write("%s\t%s\t%s\t%i\n" % ( tags[foreground], 
":".join([tags[x] for x in combination]), ":".join([tags[x] for x in other]), c.output)) E.Stop()
continue except StopIteration: break if not x: break ninput += 1 if query_fasta: x.mQueryId = fq(x.mQueryId) if sbjct_fasta: x.mSbjctId = ft(x.mSbjctId) if ninput % options.report_step == 0: E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput)) yield x E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors)) def iterator_filter_fasta(infile, query_fasta, sbjct_fasta, options): ninput, noutput, nerrors = 0, 0, 0 qmissing, smissing = collections.defaultdict(int), collections.defaultdict( int), while 1: try:
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-f", "--change-format", dest="change_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer'), help="guess quality score format and set quality scores to format [default=%default].") parser.add_option("--guess-format", dest="guess_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer'), help="quality score format to assume if ambiguous [default=%default].") parser.add_option("--sample", dest="sample", type="float", help="sample a proportion of reads [default=%default].") parser.add_option("--pair", dest="pair", type="string", help="if data is paired, filename with second pair. " "Implemented for sampling [default=%default].") parser.add_option("--outfile-pair", dest="outfile_pair", type="string", help="if data is paired, filename for second pair. " "Implemented for sampling [default=%default].") parser.add_option("--uniq", dest="uniq", action="store_true", help="remove duplicate reads (by name) [default=%default].") parser.add_option("--apply", dest="apply", type="string", help="apply a filter to fastq file (taking only reads in filename) [default=%default].") parser.add_option("--trim3", dest="trim3", type="int", help="trim # bases from 3' end [default=%default].") parser.add_option("--sort", dest="sort", action="store_true", help="sort fastq by sequence id [default=%default].") parser.add_option("--seed", dest="seed", type="int", help="seed for random number generator [default=%default].") parser.add_option("--renumber-ids", dest="renumber_ids", type="string", help="rename reads in file by pattern [default=%default]") parser.set_defaults( change_format=None, guess_format=None, sample=None, trim3=None, pair=None, apply=None, uniq=False, outfile_pair=None, sort=None, seed=None, renumber_ids=None) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv) c = E.Counter() if options.change_format: for record in Fastq.iterate_convert(options.stdin, format=options.change_format, guess=options.guess_format): c.input += 1 options.stdout.write("%s\n" % record) c.output += 1 elif options.sample: sample_threshold = min(1.0, options.sample) random.seed(options.seed) if options.pair: if not options.outfile_pair: raise ValueError( "please specify output filename for second pair (--outfile-pair)") outfile1 = options.stdout outfile2 = IOTools.openFile(options.outfile_pair, "w") for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))): c.input += 1 if random.random() <= sample_threshold: c.output += 1 outfile1.write("%s\n" % record1) outfile2.write("%s\n" % record2) for record in Fastq.iterate(options.stdin): c.input += 1 if random.random() <= sample_threshold: c.output += 1 options.stdout.write("%s\n" % record) elif options.apply: ids = set(IOTools.readList(IOTools.openFile(options.apply))) for record in Fastq.iterate(options.stdin): c.input += 1 if re.sub(" .*", "", record.identifier).strip() in ids: c.output += 1 options.stdout.write("%s\n" % record) elif options.trim3: trim3 = options.trim3 for record in Fastq.iterate(options.stdin): c.input += 1 record.trim(trim3) options.stdout.write("%s\n" % record) c.output += 1 elif options.uniq: keys = set() for record in Fastq.iterate(options.stdin): c.input += 1 if record.identifier in keys: continue else: keys.add(record.identifier) options.stdout.write("%s\n" % record) c.output += 1 # Need to change this to incorporate both pairs elif options.sort: if not options.pair: # This is quicker for a single fastq file statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'" os.system(statement) else: if not options.outfile_pair: raise ValueError( "please specify output filename for second pair (--outfile-pair)") E.warn( "consider sorting individual fastq files - this is memory intensive") entries1 = {} entries2 = {} for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))): entries1[ record1.identifier[:-2]] = (record1.seq, record1.quals) entries2[ record2.identifier[:-2]] = (record2.seq, record2.quals) outfile1 = options.stdout outfile2 = IOTools.openFile(options.outfile_pair, "w") assert len(set(entries1.keys()).intersection(set(entries2.keys()))) == len(entries1), """paired files do not contain the same reads need to reconcile files""" for entry in sorted(entries1): outfile1.write("@%s/1\n%s\n+\n%s\n" % (entry, entries1[entry][0], entries1[entry][1])) outfile2.write("@%s/2\n%s\n+\n%s\n" % (entry, entries2[entry][0], entries2[entry][1])) elif options.renumber_ids: id_count = 1 for record in Fastq.iterate(options.stdin): record.identifier = options.renumber_ids % id_count id_count += 1 options.stdout.write("@%s\n%s\n+\n%s\n" % (record.identifier, record.seq, record.quals)) # write footer and output benchmark information. E.info("%s" % str(c)) E.Stop()
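# A minimal sketch of the paired sampling logic in the --sample branch above: one
# random draw per read pair keeps both mates in sync. `pairs` is assumed to be an
# iterable of (record1, record2) tuples, e.g. itertools.izip of two Fastq.iterate()
# streams as in the script.
import random

def sample_pairs(pairs, out1, out2, proportion, seed=None):
    random.seed(seed)
    for record1, record2 in pairs:
        if random.random() <= proportion:   # a single draw decides for both mates
            out1.write("%s\n" % record1)
            out2.write("%s\n" % record2)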
def pslMap(options): """thread psl alignments using intervals. """ if options.format == "gtf": use_copy = False else: use_copy = True ninput, noutput, ndiscarded, nskipped, nskipped_small_queries = 0, 0, 0, 0, 0 min_length = options.min_aligned for match, qx, tx in iterator_psl_intervals(options): map_query2target = match.getMapQuery2Target() ninput += 1 # if no filter on qx or tx, use full segment if qx is None: qx = [(match.mQueryFrom, match.mQueryTo, 0)] elif tx is None: tx = [(match.mSbjctFrom, match.mSbjctTo, 0)] # if no overlap: return if not qx or not tx: nskipped += 1 continue for query in qx: qstart, qend, qval = query # skip elements that are too small if qend - qstart < min_length: E.debug("query too small - skipped at %s:%i-%i" % (match.mQueryId, qstart, qend)) nskipped_small_queries += 1 continue E.debug("working on query %s:%i-%i" % (match.mQueryId, qstart, qend)) mqstart, mqend = (map_query2target.mapRowToCol( qstart, alignlib_lite.py_RIGHT), map_query2target.mapRowToCol( qend, alignlib_lite.py_LEFT)) if match.strand == "-": qstart, qend = match.mQueryLength - \ qend, match.mQueryLength - qstart for target in tx: tstart, tend, tval = target if tstart >= mqend or tend <= mqstart: continue if tend - tstart < min_length: continue new = alignlib_lite.py_makeAlignmentBlocks() if use_copy: # do copy with range filter if options.loglevel >= 3: mtstart, mtend = map_query2target.mapColToRow( tstart), map_query2target.mapColToRow(tend) E.debug( "query: %i-%i (len=%i)-> %i-%i(len=%i); target: %i-%i (len=%i)-> %i-%i (len=%i)" % (qstart, qend, qend - qstart, mqstart, mqend, mqend - mqstart, tstart, tend, tend - tstart, mtstart, mtend, mtend - mtstart)) alignlib_lite.py_copyAlignment(new, map_query2target, qstart, qend, tstart, tend) else: # do copy with alignment filter map_query = qval if map_query: tmp = alignlib_lite.py_makeAlignmentBlocks() alignlib_lite.py_copyAlignment(tmp, map_query2target, map_query, alignlib_lite.py_RR) if options.loglevel >= 5: options.stdlog.write( "######## mapping query ###########\n") options.stdlog.write("# %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_query2target))) options.stdlog.write("# %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_query))) options.stdlog.write("# %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions(tmp)) ) else: tmp = map_query2target map_target = tval if map_target: new = alignlib_lite.py_makeAlignmentBlocks() alignlib_lite.py_copyAlignment(new, tmp, map_target, alignlib_lite.py_CR) if options.loglevel >= 5: options.stdlog.write( "######## mapping target ###########\n") options.stdlog.write("# before: %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions(tmp)) ) options.stdlog.write("# map : %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_target))) options.stdlog.write("# after : %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions(new)) ) else: new = tmp if options.loglevel >= 4: E.debug("putative match with intervals: %s and %s: %i-%i" % (str(query), str(target), qstart, qend)) if options.loglevel >= 5: E.debug("input : %s" % str( alignlib_lite.py_AlignmentFormatEmissions( map_query2target))) E.debug("final : %s" % str( alignlib_lite.py_AlignmentFormatEmissions(new))) if new.getLength() > 0: n = match.copy() n.fromMap(new, use_strand=True) E.info("match : %s" % (str(n))) if new.getNumAligned() > options.min_aligned: n = match.copy() n.fromMap(new, use_strand=True) options.stdout.write(str(n) + "\n") noutput += 1 else: ndiscarded += 1 E.info( "map: ninput=%i, noutput=%i, nskipped=%i, 
ndiscarded=%i, nsmall_queries=%i" % (ninput, noutput, nskipped, ndiscarded, nskipped_small_queries))
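# A plain-Python toy (no alignlib) of what pslMap() does when it threads a query
# interval through an alignment: walk the aligned blocks and translate the overlapping
# part of each block into target coordinates.
def map_interval(block_sizes, query_starts, target_starts, qstart, qend):
    """return a list of (target_from, target_to) pieces covering query[qstart:qend]."""
    pieces = []
    for size, qs, ts in zip(block_sizes, query_starts, target_starts):
        lo, hi = max(qstart, qs), min(qend, qs + size)
        if lo < hi:                              # the block overlaps the requested interval
            offset = lo - qs
            pieces.append((ts + offset, ts + offset + (hi - lo)))
    return pieces

# map_interval([10, 10], [0, 20], [100, 200], 5, 25) -> [(105, 110), (200, 205)]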
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: tree_diff.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-1", "--filename-tree1", dest="filename_tree1", type="string", help="filename with first tree(s).") parser.add_option("-2", "--filename-tree2", dest="filename_tree2", type="string", help="filename with second tree(s).") parser.add_option("-o", "--outgroup", dest="outgroup", type="string", help="reroot with outgroup before processing.") parser.set_defaults(filename_tree1=None, filename_tree2=None, outgroup=None) (options, args) = E.Start(parser, add_pipe_options=True) if (len(args) == 2): options.filename_tree1, options.filename_tree2 = args if not options.filename_tree1 or not options.filename_tree2: raise ValueError("please specify two trees.") ## take first trees nexus = TreeTools.Newick2Nexus(open(options.filename_tree1, "r")) trees1 = nexus.trees if options.loglevel >= 1: options.stdlog.write("# read %i trees from %s.\n" % (len(trees1), options.filename_tree1)) ## take first trees nexus = TreeTools.Newick2Nexus(open(options.filename_tree2, "r")) trees2 = nexus.trees if options.loglevel >= 1: options.stdlog.write("# read %i trees from %s.\n" % (len(trees2), options.filename_tree2)) ntotal, nsame, ndiff = 0, 0, 0 if options.outgroup: for tree in trees1: tree.root_with_outgroup(options.outgroup) for tree in trees2: tree.root_with_outgroup(options.outgroup) for x in range(len(trees1)): for y in range(len(trees2)): if options.loglevel >= 2: print trees1[x] print trees2[y] if trees1[x].is_identical(trees2[y]): code = "=" nsame += 1 else: code = "<>" ndiff += 1 options.stdout.write("%s\t%i\t%i\n" % (code, x, y)) ntotal += 1 options.stdlog.write("# n1=%i, n2=%i, ntotal=%i, nsame=%i, ndiff=%i\n" % (len(trees1), len(trees2), ntotal, nsame, ndiff)) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("plain", "fasta", "clustal", "stockholm", "phylip"), help="input format of multiple alignment") parser.add_option( "-s", "--sites", dest="sites", type="string", help="sites to use [default=%default].", ) parser.add_option( "-f", "--file", dest="filename", type="string", help="filename of multiple alignment (- for stdin) [default=%default].", metavar="FILE") parser.add_option("-o", "--format", dest="format", type="string", help="format [default=%default].", metavar="format") parser.add_option( "-d", "--distance", dest="distance", type="choice", choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81", "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT", "PMB", "PAM", "Kimura", "CategoriesModel"), help="method to use for distance calculation [default=%default].") parser.add_option("--method", dest="method", type="choice", choices=("phylip", "baseml", "own", "xrate"), help="program to use for rate calculation.") parser.add_option("--output-format", dest="output_format", type="choice", choices=("list", "tree"), help="output format.") parser.add_option( "-m", "--min-sites", dest="min_sites", type="int", help="minimum number of sites for output[default=%default].", ) parser.add_option( "-a", "--alphabet", dest="alphabet", type="choice", choices=("aa", "na", "auto"), help="alphabet to use.", ) parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string", help="filename with tree information.") parser.add_option("--set-alpha", dest="alpha", type="float", help="initial alpha value.") parser.add_option("--fix-alpha", dest="fix_alpha", action="store_true", help="do not estimate alpha.") parser.add_option("--set-kappa", dest="kappa", type="float", help="initial kappa value.") parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true", help="do not estimate kappa.") parser.add_option("--dump", dest="dump", action="store_true", help="dump output.") parser.add_option("--test", dest="test", action="store_true", help="test run - does not clean up.") parser.add_option("--pairwise", dest="pairwise", action="store_true", help="force pairwise comparison.") parser.add_option( "--set-clean-data", dest="clean_data", type="choice", choices=("0", "1"), help= "PAML should cleanup data: 0=only gaps within pair are removed, 1=columns in the mali with gaps are removed." 
) parser.add_option( "--with-counts", dest="with_counts", action="store_true", help= "output counts of aligned positions, transitions and transversions.") parser.add_option("-w", "--write", dest="write", type="choice", action="append", choices=("input", "trained", "all"), help="output sections to write for xrate.") parser.add_option("--output-pattern", dest="output_pattern", type="string", help="output pattern for output files.") parser.add_option("--xrate-min-increment", dest="xrate_min_increment", type=float, help="minimum increment to stop iteration in xrate.") parser.set_defaults( \ input_format = "fasta", filename_tree = None, with_counts = False, sites = "d4", distance = "T92", min_sites = 1, filename = "-", alphabet="auto", format= "%6.4f", method="phylip", kappa = None, fix_kappa = False, alpha = None, fix_alpha = False, dump = False, clean_data = None, output_format = "list", iteration="all-vs-all", pairwise=False, report_step = 1000, output_pattern = "%s.eg", write = [], test_xrate = False, xrate_min_increment = None, is_codons = False, ) (options, args) = E.Start(parser) if options.filename != "-": infile = open(options.filename, "r") else: infile = sys.stdin ## read multiple alignment if options.pairwise: ## read sequences, but not as a multiple alignment. This permits multiple names. mali = Mali.SequenceCollection() options.iteration = "pairwise" else: mali = Mali.Mali() mali.readFromFile(infile, format=options.input_format) ids = mali.getIdentifiers() if options.alphabet == "auto": s = "".join(map(lambda x: x.mString, mali.values())).lower() ss = re.sub("[acgtxn]", "", s) if float(len(ss)) < (len(s) * 0.1): options.alphabet = "na" if mali.getNumColumns() % 3 == 0: options.is_codons = True else: options.alphabet = "aa" if options.loglevel >= 1: options.stdlog.write("# autodetected alphabet: %s\n" % options.alphabet) if options.filename != "-": infile.close() npairs = 0 nskipped_length = 0 nskipped_distance = 0 pairs = [] if options.iteration == "all-vs-all": for x in range(len(ids) - 1): for y in range(x + 1, len(ids)): pairs.append((x, y)) elif options.iteration == "first-vs-all": for y in range(1, len(ids)): pairs.append((0, y)) elif options.iteration == "pairwise": if len(ids) % 2 != 0: raise "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len( ids) for x in range(0, len(ids), 2): pairs.append((x, x + 1)) if options.alphabet == "na": if options.method == "baseml": runBaseML(mali, pairs, options) elif options.method == "phylip" and options.distance in ("F84", "K80", "JC69", "LogDet"): runDNADIST(mali, pairs, options) elif options.method == "xrate": runXrate(mali, pairs, options) else: if options.is_codons: h = Genomics.SequencePairInfoCodons().getHeader() else: h = Genomics.SequencePairInfo().getHeader() options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h)) for x, y in pairs: id_x = ids[x] npairs += 1 id_y = ids[y] info = Genomics.CalculatePairIndices( mali[id_x], mali[id_y], with_codons=options.is_codons) if options.distance in ("T92", "JC69"): if options.sites == "d4": seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x], mali[id_y], position=3, degeneracy=4) if len(seq1) < options.min_sites: nskipped_length += 1 continue else: raise "unknown sites %s" % options.sites if options.distance == "T92": distance, variance = CalculateDistanceT92(info) elif options.distance == "JC69": distance, variance = CalculateDistanceJC69(info) elif options.distance == "PID": distance, variance = CalculateDistancePID( mali[id_x], mali[id_y]) elif 
options.distance == "POVL": distance, variance = CalculateDistancePOVL( mali[id_x], mali[id_y]) if distance >= 0: options.stdout.write("\t".join( map(str, (id_x, id_y, options.format % distance, options.format % variance, info))) + "\n") else: nskipped_distance += 1 elif options.alphabet == "aa": if options.distance in ("JTT", "PMB", "PAM", "Kimura", "CategoriesModel"): # use phylip for these phylip = WrapperPhylip.Phylip() phylip.setProgram("protdist") phylip.setMali(mali) phylip_options = [] if options.distance == "PMG": phylip_options += ["D"] * 1 elif options.distance == "PAM": phylip_options += ["D"] * 2 elif options.distance == "Kimura": phylip_options += ["D"] * 3 elif options.distance == "CategoriesModel": phylip_options += ["D"] * 4 phylip_options.append("Y") phylip.setOptions(phylip_options) result = phylip.run() writePhylipResult(result, options) else: options.stdout.write("id1\tid2\tdist\tvar\n") ## iterate over all pairs of sequences for x, y in pairs: id_x = ids[x] npairs += 1 id_y = ids[y] if options.distance == "PID": distance, variance = CalculateDistancePID( mali[id_x], mali[id_y]) elif options.distance == "POVL": ## percentage overlap distance, variance = CalculateDistancePOVL( mali[id_x], mali[id_y]) if distance >= 0: options.stdout.write("\t".join( (id_x, id_y, options.format % distance, options.format % variance)) + "\n") else: nskipped_distance += 1 if options.loglevel >= 1: options.stdlog.write( "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n" % (len(ids), npairs, nskipped_length, nskipped_distance)) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", help="method to use [kl=kullback-leibler]", choices=("kl", )) parser.add_option("-n", "--no-normalize", dest="normalize", action="store_false", help="do not normalize data") parser.add_option("-p", "--pseudocounts", dest="pseudocounts", type="int", help="pseudocounts to add.") parser.add_option("-f", "--number-format", dest="number_format", type="string", help="number format.") parser.set_defaults(method="kl", columns="all", headers=True, xrange=None, pseudocounts=1, normalize=True, number_format="%6.4f") (options, args) = E.Start(parser, add_pipe_options=True) if options.xrange: options.xrange = map(float, options.xrange.split(",")) data, legend = IOTools.readTable(sys.stdin, numeric_type=numpy.float32, take=options.columns, headers=options.headers, truncate=options.xrange) nrows, ncols = data.shape # first: normalize rows for y in range(1, ncols): for x in range(nrows): data[x, y] = data[x, y] + float(options.pseudocounts) if options.normalize: t = numpy.sum(data[:, y]) for x in range(nrows): data[x, y] = data[x, y] / t for x in range(1, len(legend) - 1): for y in range(x + 1, len(legend)): if options.method == "kl": d1 = 0.0 d2 = 0.0 for bin in range(nrows): p = data[bin, x] q = data[bin, y] d1 += p * math.log(p / q) d2 += q * math.log(q / p) options.stdout.write( "%s\t%s\t%s\n" % (legend[x], legend[y], options.number_format % d1)) options.stdout.write( "%s\t%s\t%s\n" % (legend[y], legend[x], options.number_format % d2)) E.Stop()
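# The Kullback-Leibler divergence computed column-against-column above, written as a
# small numpy helper; both inputs are assumed to be normalised distributions with
# pseudocounts already added, so no entry is zero.
import numpy

def kl_divergence(p, q):
    """D(p||q) = sum_i p_i * log(p_i / q_i)"""
    p = numpy.asarray(p, dtype=float)
    q = numpy.asarray(q, dtype=float)
    return float(numpy.sum(p * numpy.log(p / q)))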
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = optparse.OptionParser( version= "%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $", usage=globals()["__doc__"]) parser.add_option( "-m", "--species-map", dest="species_map", type="string", help="text file specifying the mapping between contig and genome") parser.add_option( "-g", "--genome-dir", dest="genome_dir", type="string", help="specify directory where genome / genomes are stored") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) # read in contig lengths into dictionary E.info("reading contigs file") c_contigs = 0 contigs_lengths = {} for fasta in FastaIterator.iterate(options.stdin): c_contigs += 1 # titles of fasta records must be single strings with no special # characters contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence) E.info("read %i contigs" % c_contigs) # read in mapping between spcies and contigs species_map = {} for line in open(options.species_map).readlines(): data = line[:-1].split("\t") contig, species = data[0], data[1] species_map[contig] = species # read genomes into memory # NB this may need optimisin if using large # genomes or many genomes E.info("reading genomes from %s" % options.genome_dir) # The directory must ONLY contain genome files!! genomes_sequences = {} c_genomes = 0 for genome_file in glob.glob(os.path.join(options.genome_dir, "*")): c_genomes += 1 for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)): genomes_sequences[fasta.title] = fasta.sequence E.info("read %i genomes from %s" % (c_genomes, options.genome_dir)) # iterate over the contigs and sample from the respective genome E.info("iterating over contigs") c_contigs_output = 0 for contig, length in contigs_lengths.iteritems(): if contig not in species_map: E.warn("contig %s not in species map file" % contig) else: c_contigs_output += 1 genome = species_map[contig] genome_length = len(genomes_sequences[genome]) # get the start position from which to sample start = random.randint(1, genome_length) try: end = start + length - 1 except ValueError: print "end of sampled contig extends beyond length of genome" sampled_seq = genomes_sequences[genome][start:end] options.stdout.write( ">%s_random\n%s\n" % (contig + "_%s" % species_map[contig], sampled_seq)) E.info("written %i contigs" % c_contigs_output) # write footer and output benchmark information. E.Stop()
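# A hedged sketch of sampling a random window of a given length from a genome
# sequence. Unlike the loop above it clamps the start position so the window always
# fits; note that a slice running past the end of a Python string is silently
# truncated rather than raising ValueError.
import random

def sample_window(sequence, length):
    """return a random substring of `length` bases (or the whole sequence if shorter)."""
    if length >= len(sequence):
        return sequence
    start = random.randint(0, len(sequence) - length)
    return sequence[start:start + length]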
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--gtf-file", dest="filename_gtf", type="string", help="filename with gene models in gtf format [%default]") parser.add_option( "-m", "--filename-mismapped", dest="filename_mismapped", type="string", help="output bam file for mismapped reads [%default]") parser.add_option( "-j", "--junctions-bed-file", dest="filename_junctions", type="string", help="bam file with reads mapped across junctions [%default]") parser.add_option( "-r", "--filename-regions", dest="filename_regions", type="string", help="filename with regions to remove in bed format [%default]") parser.add_option( "-t", "--transcripts-gtf-file", dest="filename_transcriptome", type="string", help="bam file with reads mapped against transcripts [%default]") parser.add_option( "-p", "--map-tsv-file", dest="filename_map", type="string", help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) [%default]") parser.add_option( "-s", "--filename-stats", dest="filename_stats", type="string", help="filename to output stats to [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option( "-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option( "-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.add_option("--output-sam", dest="output_sam", action="store_true", help="output in sam format [%default]") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.Samfile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = IOTools.readMap( IOTools.openFile(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.iteritems()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(IOTools.openFile(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.openFile(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.Samfile(options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.Samfile("-", "wh", template=genome_samfile) else: output_samfile = pysam.Samfile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.Samfile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.Samfile(options.filename_junctions, "rb") else: junctions_samfile = None c = _bams2bam.filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = IOTools.openFile(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: psl2fasta.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("--filename-query", dest="filename_query", type="string", help="fasta filename with queries.") parser.add_option("--filename-target", dest="filename_target", type="string", help="fasta filename with target.") parser.add_option( "-m", "--method", dest="method", type="choice", choices=("full", "pileup-query", "pileup-target", "gapless"), help="method to use for constructing the alignment [%default].") parser.add_option( "--forward-query", dest="forward_query", action="store_true", help= "reverse-complement sequences such that query is always on forward strand [%default]" ) parser.add_option("--target-prefix", dest="target_prefix", type="string", help="prefix to use for target [%default].") parser.add_option("--query-prefix", dest="query_prefix", type="string", help="prefix to use for query [%default].") parser.add_option("--id", dest="id", type="choice", choices=("numeric", "query"), help="choose type of identifier to use [%default]") parser.set_defaults( filename_query=None, filename_target=None, method="full", output_format_id="%06i", target_prefix="", query_prefix="", forward_query=False, ) (options, args) = E.Start(parser) if options.filename_query: query = IndexedFasta.IndexedFasta(options.filename_query) if options.filename_target: target = IndexedFasta.IndexedFasta(options.filename_target) if options.method == "full": getAlignment = getAlignmentFull id = 0 for match in Blat.iterator(options.stdin): if options.loglevel >= 2: options.stdout.write("# %s\n" % str(match)) m = match.getMapQuery2Target() m.moveAlignment(-min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts)) q = query.getSequence(match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo) t = target.getSequence(match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo) query_ali, sbjct_ali = getAlignment(m, q, t, options) if match.strand == "-" and options.forward_query: query_ali = Genomics.complement(query_ali) sbjct_ali = Genomics.complement(sbjct_ali) options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \ (options.query_prefix, options.output_format_id % id, match.mQueryId, match.mQueryFrom, match.mQueryTo, query_ali, options.target_prefix, options.output_format_id % id, match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo, sbjct_ali ) ) id += 1 E.Stop()
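# A self-contained toy version of building a gapped alignment from PSL blocks, in the
# spirit of the getAlignmentFull used above (which is defined elsewhere in the
# script). Offsets are assumed relative to the extracted query/target sequences,
# i.e. after the moveAlignment() call in main().
def blocks_to_alignment(blocks, query, target):
    """blocks: list of (query_offset, target_offset, size); returns (query_ali, target_ali)."""
    qa, ta = [], []
    last_q = last_t = 0
    for q_off, t_off, size in blocks:
        # unaligned query residues are set against gaps in the target and vice versa
        qa.append(query[last_q:q_off] + "-" * (t_off - last_t))
        ta.append("-" * (q_off - last_q) + target[last_t:t_off])
        qa.append(query[q_off:q_off + size])
        ta.append(target[t_off:t_off + size])
        last_q, last_t = q_off + size, t_off + size
    return "".join(qa), "".join(ta)

# blocks_to_alignment([(0, 0, 3), (5, 3, 2)], "AAAGGCC", "AAACC")
#   -> ("AAAGGCC", "AAA--CC")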
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: gff2view.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("-t", "--target", dest="target", type="choice", choices=("ucsc", "gbrowser"), help="target location to open [%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("--is-gtf", dest="is_gtf", action="store_true", help="input is gtf.") parser.add_option("-f", "--add-flank", dest="flank", type="int", help="add # nucleotides for each region.") parser.add_option("-z", "--zoom", dest="zoom", type="float", help="zoom out (# > 1) or in (# < 1).") parser.add_option("-c", "--chunk-size", dest="chunk_size", type="int", help="number of tabs to display in one go.") parser.add_option("--ucsc-assembly", dest="ucsc_assembly", type="string", help="ucsc assembly.") parser.add_option("--ucsc-user-tracks", dest="ucsc_user_tracks", type="string", help="ucsc user tracks.") parser.add_option("--gbrowser-assembly", dest="gbrowser_assembly", type="string", help="gbrowser assembly.") parser.add_option("--randomize", dest="randomize", action="store_true", help="randomize input [%default]") parser.set_defaults( ucsc_assembly="ponAbe2", ucsc_url="http://genome.ucsc.edu/cgi-bin/hgTracks", gbrowser_assembly="Songbird", gbrowser_url="http://genserv.anat.ox.ac.uk/cgi-bin/devel/gbrowse", genome_file=None, ucsc_custom_annotation="http://wwwfgu.anat.ox.ac.uk/~andreas/ucsc_tracks/%s", ucsc_user_tracks=None, flank=None, zoom=None, chunk_size=50, is_gtf=False, target="ucsc", randomize=False, joined=False, ) (options, args) = E.Start(parser) if len(args) != 1: print USAGE raise "please specify the gff file to open." 
if options.is_gtf: entry_iterator = GTF.iterator chunk_iterator = GTF.flat_gene_iterator else: entry_iterator = GTF.iterator if options.joined: chunk_iterator = GTF.joined_iterator else: chunk_iterator = GTF.chunk_iterator if len(args) == "0" or args[0] == "-": iterator = chunk_iterator(entry_iterator(sys.stdin)) else: iterator = chunk_iterator(entry_iterator(open(args[0], "r"))) nopened = 0 # b = webbrowser.get( "konqueror" ) b = webbrowser.get("firefox") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) else: fasta = None if options.ucsc_user_tracks: annotations = "hgt.customText=%s" % ( options.ucsc_custom_annotation % options.ucsc_user_tracks) else: annotations = None for chunk in iterator: start = min([x.start for x in chunk]) end = max([x.end for x in chunk]) if options.flank: start -= options.flank end += options.flank if options.zoom: s = end - start d = options.zoom * s - s start -= d end += d start = max(0, start) contig = chunk[0].contig if fasta: contig = fasta.getToken(contig) end = min(end, fasta.getLength(contig)) if len(contig) < 3: contig = "chr%s" % contig if options.target == "ucsc": url_options = ["db=%s" % options.ucsc_assembly, "position=%s:%i-%i" % (contig, start, end)] if annotations: url_options.append(annotations) url = "%s?%s" % (options.ucsc_url, "&".join(url_options)) elif options.target == "gbrowser": url = "%s/%s?name=%s:%i..%i" % (options.gbrowser_url, options.gbrowser_assembly, contig, start, end) print "# opening browser window for:" print "#", url if nopened % options.chunk_size == 0: if nopened != 0: x = raw_input( 'showing %i - hit return to continue:' % options.chunk_size) b.open_new(url) first = False else: b.open_new_tab(url) nopened += 1 E.Stop()
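# A small sketch of the window arithmetic in the loop above: the chunk's extent is
# optionally padded by a fixed flank and/or widened by a zoom factor (each side is
# padded by the full size difference, matching the script), then clamped at zero.
def expand_window(start, end, flank=None, zoom=None):
    if flank:
        start, end = start - flank, end + flank
    if zoom:
        size = end - start
        d = zoom * size - size
        start, end = start - d, end + d
    return max(0, int(start)), int(end)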
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $", usage=globals()["__doc__"]) parser.add_option("-m", "--master", dest="master", type="string", help="master sequence.") parser.add_option("-p", "--master-pattern", dest="master_pattern", type="string", help="master pattern.") parser.add_option("--master-species", dest="master_species", type="string", help="species to use as master sequences.") parser.add_option("-t", "--translate", dest="filename_translation", type="string", help="filename on where to store translated sequences.") parser.add_option("-e", "--exons", dest="filename_exons", type="string", help="filename on where to exon information.") parser.add_option("-c", "--mark-codons", dest="mark_codons", action="store_true", help="mark codons.") parser.add_option( "-i", "--ignore-case", dest="ignore_case", action="store_true", help="ignore case (otherwise: lowercase are unaligned chars).") parser.add_option("--remove-stops", dest="remove_stops", action="store_true", help="remove stop codons.") parser.add_option("--mask-stops", dest="mask_stops", action="store_true", help="mask stop codons.") parser.add_option("--mask-char", dest="mask_char", type="string", help="masking character to use.") parser.add_option("-f", "--remove-frameshifts", dest="remove_frameshifts", action="store_true", help="remove columns corresponding to frameshifts.") parser.add_option( "--mask-master", dest="mask_master", action="store_true", help= "columns in master to be removed are masked to keep residue numbering." ) parser.add_option( "-s", "--split-exons", dest="split_exons", action="store_true", help="split columns aligned to different exons in the same gene.") parser.add_option("-a", "--target", dest="target", type="choice", choices=("paml", ), help="perform cleaning up for certain targets.") parser.set_defaults( gap_char="-", mask_char="n", gap_chars="-.", separator="|", master=None, master_species=None, filename_translation=None, filename_exons=None, master_pattern=None, remove_stops=False, mark_codons=False, mask_unaligned=False, split_exons=False, remove_frameshifts=False, min_segment_length=5, ignore_case=False, mask_stops=False, target=None, mask_master=False, ) (options, args) = E.Start(parser) if options.target == "paml": options.mask_stops = True options.mask_char = "n" options.remove_frameshifts = True if options.loglevel >= 1: options.stdlog.write( "# setting output to paml : removing frameshifts, masking stops with '%s'.\n" % (options.mask_char)) # 1. 
read multiple alignment in fasta format mali = Mali.Mali() mali.readFromFile(sys.stdin) if options.loglevel >= 1: options.stdlog.write("# read mali with %i entries.\n" % len(mali)) if len(mali) == 0: raise "empty multiple alignment" identifiers = mali.getIdentifiers() masters = [] if options.master: masters = options.master.split(",") elif options.master_pattern: for id in identifiers: if re.search(options.master_pattern, id): masters.append(id) elif options.master_species: for id in identifiers: if options.master_species == id.split(options.separator)[0]: masters.append(id) else: masters.append(identifiers[0]) if options.loglevel >= 2: options.stdlog.write("# master sequences are: %s\n" % str(masters)) options.stdlog.flush() if options.filename_exons: exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"), filter=set(identifiers), from_zero=True) if options.loglevel >= 2: options.stdlog.write("# read exons %i sequences.\n" % len(exons)) else: exons = {} ########################################################################## ########################################################################## ########################################################################## # translate characters to upper/lower case according to exon info. ########################################################################## if exons: for id in identifiers: if id in exons: mali.getSequence(id).mString = AddExonInformation( mali[id], exons[id], mask_char=options.mask_char) elif options.ignore_case: # convert all to uppercase mali.upper() ########################################################################## ########################################################################## ########################################################################## # untangle misaligned exons ########################################################################## if exons and options.split_exons: # first split with masters if len(masters) > 0: SplitExons(mali, exons, masters=masters, options=options) if options.loglevel >= 4: mali.writeToFile(open("log_mali1", "w"), format="fasta") SplitExons(mali, exons, options) ########################################################################## ########################################################################## ########################################################################## # remove frameshifts ########################################################################## if options.remove_frameshifts: out_of_frame_columns = [] if len(masters) == 1: frame_columns = GetFrameColumns(mali, masters[0], gap_chars=options.gap_chars) else: columns = [] for id in masters: columns += GetFrameColumns(mali, id, gap_chars=options.gap_chars) if len(columns) == 0: columns += GetFrameColumns(mali, identifiers[0], gap_chars=options.gap_chars) # sort all columns by tuple. The "shortest" codon will be first: (1,2,3) before (1,2,100), # and (1,2,100) before (1,3,4). 
columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2]))) # select codons frame_columns = [] last_codon = columns[0] for codon in columns[1:]: # skip identical codons if codon == last_codon: continue # take first (shortest) codon in case of identical first # residue if codon[0] == last_codon[0]: continue # if not overlapping, keep if codon[0] > last_codon[2]: frame_columns.append(last_codon) else: out_of_frame_columns += last_codon # if overlapping, but out of register: skip last_codon = codon frame_columns.append(last_codon) # build set of skipped columns frame_set = set() for column in frame_columns: for c in column: frame_set.add(c) # columns that contain a master sequence that is out of # frame out_of_frame_set = set(out_of_frame_columns) out_of_frame_set = out_of_frame_set.difference(frame_set) if options.loglevel >= 1: options.stdlog.write("# found %i/%i columns in frame\n" % (len(frame_columns) * 3, mali.getWidth())) if options.loglevel >= 5: options.stdlog.write("# frame columns: %i\n" % (len(frame_columns))) x = 0 for column in frame_columns: options.stdlog.write("# %i\t%s\n" % (x, ",".join(map(str, column)))) x += 1 if options.loglevel >= 5: options.stdlog.write( "# Out-of frame columns with residue of masters: %i\n" % (len(out_of_frame_set))) options.stdlog.write("# %s" % ",".join(map(str, out_of_frame_columns))) mask_chars = (string.upper(options.mask_char), string.lower(options.mask_char)) to_delete = [] ignore_case = exons or options.ignore_case for id in identifiers: ngaps, nmasked = 0, 0 sequence = mali.getSequence(id).mString if options.loglevel >= 7: options.stdlog.write( "# processing sequence %s of length %i with gaps\n" % (id, len(sequence))) # treat masters differently if they are only to be masked, not # pruned. # simple mask all characters that are to skipped fragments = [] nstops, ncodons, naligned = 0, 0, 0 codon = [] chars = [] is_master = id in masters for x in range(len(sequence)): c = sequence[x] # delete columns that do not align to # a master. 
if x not in frame_set and x not in out_of_frame_set: continue chars.append(c) if c not in options.gap_chars: codon.append(c) if len(codon) % 3 == 0: codon = "".join(codon) codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options) if codon_is_aligned: naligned += 1 to_mask = False if codon_is_all_gaps: ngaps += len(chars) elif codon_is_ok: ncodons += 1 if string.upper(codon) in ("TAG", "TAA", "TGA"): nstops += 1 to_mask = True else: to_mask = True nmasked += 1 if to_mask: for i in range(len(chars)): if chars[i] not in options.gap_chars: chars[i] = options.mask_char fragments.append("".join(chars)) chars = [] codon = [] # mask incomplete codons at the end if chars: for i in range(len(chars)): if chars[i] not in options.gap_chars: chars[i] = options.mask_char fragments.append("".join(chars)) # else: # for a,b,c in frame_columns: ## codon = sequence[a] + sequence[b] + sequence[c] ## codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options ) ## if codon_is_aligned: naligned += 1 # if codon_is_all_gaps: ## fragments.append( options.gap_char * 3 ) ## ngaps += 1 # elif codon_is_ok: ## ncodons += 1 # if string.upper(codon) in ("TAG", "TAA", "TGA"): # if options.remove_stops: ## fragments.append( options.gap_char * 3 ) # elif options.mask_stops: ## fragments.append( options.mask_char * 3 ) # else: ## fragments.append( codon ) ## nstops += 1 # else: ## fragments.append( codon ) # else: ## fragments.append( options.gap_char * 3 ) ## nmasked += 1 # if options.loglevel >= 7: # options.stdlog.write("# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" % (id, # a,b,c, # codon, # str(codon_is_ok), # str(codon_is_aligned) )) s = string.join(fragments, "") if options.loglevel >= 1: options.stdlog.write( "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n" % (id, len(fragments), naligned, ncodons, nstops, ngaps, nmasked)) options.stdlog.flush() # postpone deletion in order to not # confuse the iteration of ids if naligned == 0: options.stdlog.write( "# sequence: %s removed because there are no aligned nucleotides.\n" % id) to_delete.append(id) elif ncodons == 0: options.stdlog.write( "# sequence: %s removed because there are no aligned codons.\n" % id) to_delete.append(id) else: mali.setSequence(id, string.join(fragments, "")) for id in to_delete: del mali[id] for id in identifiers: if options.mark_codons: a = mali[id] f = lambda x: a[x:x + 3] s = string.join([f(x) for x in range(0, len(a), 3)], " ") else: s = mali[id] options.stdout.write(">%s\n%s\n" % (id, s)) if options.filename_translation: outfile = open(options.filename_translation, "w") for id in mali.keys(): outfile.write(">%s\n%s\n" % (id, Genomics.TranslateDNA2Protein(mali[id]))) outfile.close() E.Stop()
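# Illustrative sketch (not from the original script): the greedy selection of
# in-frame codon columns above reduces to sorting the (first, second, third)
# column triples, dropping duplicates and codons that share a first column,
# keeping non-overlapping codons and recording the remainder as out of frame.
# The function name and the example triples below are made up.
def select_frame_columns(columns):
    if not columns:
        return [], set()
    columns = sorted(columns, key=lambda c: (c[0], c[2]))
    frame, out_of_frame = [], []
    last = columns[0]
    for codon in columns[1:]:
        if codon == last or codon[0] == last[0]:
            continue                      # duplicate, or same first residue
        if codon[0] > last[2]:
            frame.append(last)            # no overlap: keep the previous codon
        else:
            out_of_frame.extend(last)     # overlapping, out of register
        last = codon
    frame.append(last)
    frame_set = set(c for triple in frame for c in triple)
    return frame, set(out_of_frame) - frame_set

# select_frame_columns([(0, 1, 2), (0, 1, 50), (3, 4, 5)])
# -> ([(0, 1, 2), (3, 4, 5)], set())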
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-b", "--bam-file", dest="bam_file", type="string", help="supply input bam file name") parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string", help="supply input gtf file name") parser.add_option("-o", "--outfile", dest="outfile", type="string", help="supply output file name") parser.add_option( "-G", "--reference-GTF", dest="reference_gtf", type="string", help= "supply reference gtf for context of reads not contributing to transcripts" ) ## add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) ###################################################### ###################################################### # for all alignments ###################################################### ###################################################### # open outfile and prepare headers outf = open(options.outfile, "w") outf.write("\t".join([ "total alignments", "aligments in transcripts", "percent alignments in transcripts", "total spliced alignments", "spliced alignments in transcripts", "percent spliced alignments in transcripts" ]) + "\n") # calculate coverage over transcript file - NB split reads contribute twice to the transcript # use BedTool object pybedbamfile = pybedtools.BedTool(options.bam_file) # count alignments E.info("counting total number of alignments and spliced alignments") total_alignments = 0 spliced_alignments = 0 for alignment in pybedbamfile: cigar = alignment[5] if cigar.find("N") != -1: # N signifies split read total_alignments += 1 spliced_alignments += 1 else: total_alignments += 1 # merge the gtf file to avoid double counting of exons in different transcripts - converts to a bed file gtffile = pybedtools.BedTool(options.gtf_file).merge() E.info("computing coverage of aligments in %s over intervals in %s" % (options.bam_file, options.gtf_file)) cover = pybedbamfile.coverage(gtffile) # make sure that the exons aren't being counted twice - shouldn't be because of merge E.info("counting reads contributing to transcripts") c = 0 for entry in cover: coverage = int(entry[3]) if coverage > 0: c += coverage # sum the coverage across exons from all transcripts coverage_in_transcripts = c ###################################################### ###################################################### # for spliced alignments ###################################################### ###################################################### # count total number of spliced alignments # requires that the CIGAR string 'N' is present # uses pysam to write out a bam file of the spliced reads only allreads = pysam.Samfile(options.bam_file) spliced_bamname = P.snip(options.bam_file, ".bam") + "_spliced_reads.bam" # open file for outputting spliced alignments splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads) # cigar string in pysam for spliced alignment is (3, int) spliced = collections.defaultdict(list) for read in allreads: for cigar_tag in read.cigar: if cigar_tag[0] == 3: spliced[read].append(cigar_tag) # write out spliced alignments for read in spliced.keys(): splicedreads.write(read) splicedreads.close() allreads.close() # index splice reads bam file pysam.sort(spliced_bamname, P.snip(spliced_bamname, 
".bam")) pysam.index(spliced_bamname) # read in the spliced reads as a BedTool object splicedbam = pybedtools.BedTool(spliced_bamname) # perform coverage of spliced reads over intervals - will be twice as many as there should be # due to counting both exons overlapping spliced_coverage = splicedbam.coverage(gtffile) # avoid double counting exons E.info("counting spliced reads contributing to transcripts") spliced_exons = {} c = 0 for entry in spliced_coverage: coverage = int(entry[3]) if coverage > 0: c += coverage spliced_coverage_in_transcripts = c # NOTE: the counting of spliced alignments is not accurate spliced_coverage_in_transcripts = float( spliced_coverage_in_transcripts) / 2 ########################### ## write out the results ## ########################### outf.write(str(int(total_alignments)) + "\t") # remove half of the coverage assigned to spliced reads coverage_in_transcripts = (coverage_in_transcripts) - ( spliced_coverage_in_transcripts) outf.write( str( int(coverage_in_transcripts) - int(spliced_coverage_in_transcripts)) + "\t") outf.write( str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t") # write out spliced counts outf.write(str(int(spliced_alignments)) + "\t") outf.write(str(int(spliced_coverage_in_transcripts)) + "\t") outf.write( str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100))) outf.close() ############################ # contextualise those that # don't fall in transcripts ############################ if options.reference_gtf: context_summary = open( P.snip(options.bam_file, ".bam") + ".excluded.context", "w") context_summary.write("\t".join(["Feature", "number"]) + "\n") # write out the read info as well context_file = open( P.snip(options.bam_file, ".bam") + ".excluded", "w") context_dict = collections.defaultdict(int) # intersect bam - write non-overlapping with transcripts - intersect with reference - write out context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect( pybedtools.BedTool(options.reference_gtf), wb=True) for entry in context: feature = entry[8] context_dict[feature] += 1 context_file.write("\t".join([e for e in entry]) + "\n") for feature, value in context_dict.iteritems(): context_summary.write("\t".join([feature, str(value)]) + "\n") context_file.close() context_summary.close() ## write footer and output benchmark information. E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = optparse.OptionParser( version= "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-f", "--format", dest="format", type="choice", choices=("gff", "fasta", "aa"), help="supply help") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) if options.format == "gff": os.system("grep -v '#' | grep -v '^$'") elif options.format == "aa": pattern = "Protein" elif options.format == "fasta": pattern = "DNA" # assume that each sequence line does not just contain the # amino acids A, C, T or G - THIS IS NOT OPTIMAL # This is a list of the amino acids that are not in either "DNA" or "A", "C", "T", "G" # This assumes that each sequence (line) will contain at least one of these amino_acids = [ "M", "R", "Q", "E", "H", "I", "L", "K", "F", "P", "S", "W", "Y", "V" ] result = [] name = None for line in options.stdin.readlines(): if not line.startswith("##") or line.find("date") != -1 or line.find( "gff") != -1 or line.find("source") != -1: continue data = line[2:-1] if data.startswith("%s" % pattern): name = data if result: options.stdout.write(">%s\n%s\n" % (prot_name, "".join(result))) result = [] else: if pattern == "Protein": if "".join(map(str, [data.find(x) != -1 for x in amino_acids ])).find("True") != -1 and data.find( "end") == -1 and data.find( "%s" % pattern): result.append(data) prot_name = name elif pattern == "DNA": if name: if "".join( map(str, [data.find(x) != -1 for x in amino_acids ])).find("True") == -1 and data.find( "end") == -1 and data.find( "%s" % pattern): result.append(data) prot_name = name if result: options.stdout.write(">%s\n%s\n" % (prot_name, "".join(result))) # write footer and output benchmark information. E.Stop()
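# Illustrative sketch (not from the original script): the protein/DNA test
# above builds a string of "True"/"False" tokens and searches it, but the
# intent is simply "does this sequence line contain any amino-acid letter
# that cannot occur in a nucleotide sequence". A clearer stdlib-only check,
# using the same 14-letter alphabet as the script (function name assumed):
NON_NUCLEOTIDE = set("MRQEHILKFPSWYV")

def looks_like_protein(line):
    # True if the line contains residues outside the nucleotide alphabet
    return bool(NON_NUCLEOTIDE & set(line.upper()))

# looks_like_protein("ACGTACGT") -> False
# looks_like_protein("MKTLLLTL") -> True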
elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-o", "--file-output"): param_filename_output = a # 1. read multiple alignment in fasta format mali, identifiers = MaliIO.readFasta(sys.stdin) if param_loglevel >= 1: print "# read mali with %i entries." % len(identifiers) print E.GetHeader() print E.GetParams() # 1. remove gaps in multiple alignment mali = MaliIO.removeGaps(mali) if param_master: frame_columns = GetFrameColumns(mali, param_master) elif param_master_pattern: columns = [] for id in identifiers: if re.search(param_master_pattern, id): columns += GetFrameColumns(mali, id) if len(columns) == 0:
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: mali2summary.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("plain", "fasta", "clustal", "stockholm"), help="input format of multiple alignment") parser.add_option( "-a", "--alphabet", dest="alphabet", type="choice", choices=("aa", "na"), help="alphabet to use [default=%default].", ) parser.add_option( "-p", "--pattern-mali", dest="pattern_mali", type="string", help="filename pattern for input multiple alignment files.") parser.set_defaults( input_format="fasta", output_format="fasta", mask_chars="nN", gap_chars="-.", alphabet="na", pattern_mali=None, ) (options, args) = E.Start(parser) if options.pattern_mali: prefix_header = "prefix\t" prefix_row = "\t" else: prefix_header = "" prefix_row = "" options.stdout.write( "%sncol_mean\tpcol_mean\tncol_median\tpcol_median\tnrow_mean\tprow_mean\tnrow_median\tprow_median\n" % (prefix_header, )) ninput, nskipped, noutput, nempty = 0, 0, 0, 0 if options.pattern_mali: ids, errors = IOTools.ReadList(sys.stdin) E.debug("read %i identifiers.\n" % len(ids)) nsubstitutions = len(re.findall("%s", options.pattern_mali)) for id in ids: filename = options.pattern_mali % tuple([id] * nsubstitutions) ninput += 1 if not os.path.exists(filename): nskipped += 1 continue ## read multiple alignment in various formats mali = Mali.Mali() mali.readFromFile(open(filename, "r"), format=options.input_format) if mali.isEmpty(): nempty += 1 continue E.debug("read mali with %i entries from %s.\n" % (len(mali), filename)) if analyzeMali(mali, options, prefix_row="%s\t" % id): noutput += 1 else: ## read multiple alignment in various formats mali = Mali.Mali() mali.readFromFile(sys.stdin, format=options.input_format) ninput += 1 if mali.isEmpty(): nempty += 1 else: E.debug("read mali with %i entries." % (len(mali))) if analyzeMali(mali, options, prefix_row=""): noutput += 1 E.info("ninput=%i, noutput=%i, nskipped=%i, nempty=%i." % (ninput, noutput, nskipped, nempty)) E.Stop()
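# Illustrative sketch (not from the original script): pattern_mali may contain
# the identifier more than once, so the number of "%s" placeholders is counted
# and the id repeated to match. The pattern and identifier below are made up.
import re

pattern = "alignments/%s/%s.fasta"          # hypothetical pattern_mali value
identifier = "gene0001"
nsubstitutions = len(re.findall("%s", pattern))
filename = pattern % tuple([identifier] * nsubstitutions)
# filename -> "alignments/gene0001/gene0001.fasta"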
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true", help="output as gtf.") parser.add_option( "-f", "--id-format", dest="id_format", type="string", help="format for numeric identifier if --as-gtf is set and " "no name in bed file [%default].") parser.set_defaults(as_gtf=False, id_format="%08i", test=None) (options, args) = E.Start(parser, add_pipe_options=True) as_gtf = options.as_gtf id_format = options.id_format if as_gtf: gff = GTF.Entry() else: gff = GTF.Entry() gff.source = "bed" gff.feature = "exon" ninput, noutput, nskipped = 0, 0, 0 id = 0 for bed in Bed.iterator(options.stdin): ninput += 1 gff.contig = bed.contig gff.start = bed.start gff.end = bed.end if bed.fields and len(bed.fields) >= 3: gff.strand = bed.fields[2] else: gff.strand = "." if bed.fields and len(bed.fields) >= 2: gff.score = bed.fields[1] if as_gtf: if bed.fields: gff.gene_id = bed.fields[0] gff.transcript_id = bed.fields[0] else: id += 1 gff.gene_id = id_format % id gff.transcript_id = id_format % id else: if bed.fields: gff.source = bed.fields[0] options.stdout.write(str(gff) + "\n") noutput += 1 E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) E.Stop()
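# Illustrative sketch (not from the original script): stripped of the option
# handling, the conversion is a fixed mapping from BED columns onto the nine
# GFF/GTF fields. The original delegates formatting to GTF.Entry; this
# standalone helper writes the columns directly and shifts the start by one
# because BED is 0-based half-open while GFF is 1-based inclusive. Function
# name and the example line are assumptions.
def bed_to_gff_line(bed_line, source="bed", feature="exon"):
    fields = bed_line.rstrip("\n").split("\t")
    contig, start, end = fields[0], int(fields[1]), int(fields[2])
    name = fields[3] if len(fields) > 3 else "."
    score = fields[4] if len(fields) > 4 else "."
    strand = fields[5] if len(fields) > 5 else "."
    attributes = 'gene_id "%s"; transcript_id "%s";' % (name, name)
    return "\t".join([contig, source, feature, str(start + 1), str(end),
                      score, strand, ".", attributes])

# bed_to_gff_line("chr1\t100\t200\tmyregion\t0\t+")
# -> 'chr1\tbed\texon\t101\t200\t0\t+\t.\tgene_id "myregion"; transcript_id "myregion";'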
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: split_genome.py 2781 2009-09-10 11:33:14Z andreas $" ) parser.add_option("-c", "--chunk-size", dest="chunk_size", help="size of chunks in nucleotides.", type="int") parser.add_option("-o", "--filename-pattern-output", dest="filename_pattern_output", help="filename for output (should contain one '%i').", type="string") parser.set_defaults( chunk_size=200000, filename_pattern_output="%i.fasta", width=100, ) (options, args) = E.Start(parser, add_pipe_options=True) nchunk = 0 chunksize = 0 pos = 0 fragments = [] outfile = None for line in sys.stdin: is_header = line[0] == ">" if is_header or chunksize > options.chunk_size: if outfile: rest = Print(outfile, fragments, options) chunksize = len(rest) pos -= chunksize fragments = [rest] outfile.close() else: fragments = [] chunksize = 0 nchunk += 1 outfile = IOTools.openFile( options.filename_pattern_output % nchunk, "w") if is_header: description = line[1:-1] id = re.split("\s", description)[0] pos = 0 outfile.write(">%s|%i|%i %s\n" % (id, nchunk, pos, description)) if is_header: continue s = re.sub("\s", "", line[:-1]) l = len(s) pos += l chunksize += l fragments.append(s) if outfile: rest = Print(outfile, fragments, options) outfile.close()
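# Illustrative sketch (not from the original script): the splitting logic
# above accumulates sequence until the chunk size is exceeded, starts a new
# output file, and records the offset of each chunk within its record in the
# FASTA header. A simplified generator under those assumptions, without the
# file handling or the carried-over remainder (names are made up):
def iter_chunks(lines, chunk_size=200000):
    # yields (identifier, offset, sequence) pieces of roughly chunk_size bases
    identifier, offset, parts, size = None, 0, [], 0
    for line in lines:
        line = line.strip()
        if line.startswith(">"):
            if identifier is not None and parts:
                yield identifier, offset, "".join(parts)
            identifier, offset, parts, size = line[1:].split()[0], 0, [], 0
        else:
            parts.append(line)
            size += len(line)
            if size > chunk_size:
                yield identifier, offset, "".join(parts)
                offset += size
                parts, size = [], 0
    if identifier is not None and parts:
        yield identifier, offset, "".join(parts)

# for identifier, offset, seq in iter_chunks(open("genome.fasta")):
#     each piece would be written to its own file with a ">%s|%i" % (identifier, offset) header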