def importCEL(infile, outfiles):
    '''import CEL files.'''

    map_cel = IOTools.readMap(open(infile, "r"), has_header=True)
    indir = PARAMS["datadir"]

    for old, new in map_cel.iteritems():
        oldname = os.path.join(indir, old + ".CEL")
        newname = os.path.join(".", new + ".CEL")
        if not os.path.exists(oldname):
            raise IOError("input file %s does not exist" % oldname)
        if os.path.exists(newname):
            continue
        os.symlink(os.path.abspath(oldname), os.path.abspath(newname))
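# IOTools.readMap above turns a two-column table into a dictionary.  A
# hedged, self-contained approximation of that behaviour (the real
# implementation may differ in details such as comment handling):
def read_map_sketch(lines, has_header=False):
    """Build {first column: second column} from tab-separated lines."""
    result = {}
    for i, line in enumerate(lines):
        if has_header and i == 0:
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            continue
        result[fields[0]] = fields[1]
    return result

# e.g. a samples map for importCEL (filenames hypothetical):
#
#   original        sample
#   GSM123456       treatment_R1
#   GSM123457       control_R1
#
# which makes importCEL symlink <datadir>/GSM123456.CEL to
# ./treatment_R1.CEL, skipping links that already exist.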
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''

    tablename = P.toTable(outfile)
    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = IOTools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    inf = gzip.open(infile_data, "r")
    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"
        tmpf.write(line)
    tmpf.close()

    tmpname = tmpf.name
    header = map_header["ID_REF"]

    statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
    --index=%(header)s \
    --table=%(tablename)s \
    < %(tmpname)s > %(outfile)s
    '''

    P.run()

    os.unlink(tmpname)
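# Hedged illustration of the lines the loop above consumes from a GEO
# series matrix file (values are made up): "!"-prefixed metadata lines
# and blank lines are dropped, quotes are stripped, and the ID_REF
# header row is renamed through map_header before csv2db.py loads the
# table:
#
#   !Series_title "example series"          -> skipped
#   "ID_REF"  "GSM000001"  "GSM000002"      -> probeset  sample1  sample2
#   "1007_s_at"  7.32  7.11                 -> 1007_s_at  7.32  7.11
#
# (assuming infile_map translates GSM000001 -> sample1 and so on; tab
# separators are shown as spaces here for readability)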
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p", "--peptides-fasta-file", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds-gtf-file", "--cdnas", dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with map of peptide identifiers to "
                      "cdna identifiers [%default].")

    parser.add_option("--output-identifier", dest="output_identifier",
                      type="choice", choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f", "--output-format", dest="output_format",
                      type="choice", choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")
    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))
    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        peptide_identifier = re.split("\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            options.stdout.write("\t".join(map(str, (
                identifier,
                alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                len(cur_record.sequence),
                len(c)))) + "\n")

        elif options.output_format == "fasta":
            map_p2c.switchRowCol()
            alignatum = alignlib_lite.py_makeAlignatum(c)
            alignatum.mapOnAlignment(map_p2c, len(p) * 3)
            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))
            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
def test_cmdline():
    '''test style of scripts
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.Start

    # read the first two columns
    map_option2action = IOTools.readMap(
        IOTools.openFile(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #     'scripts/check_db.py',
    #     'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name

        # check if script contains getopt
        with IOTools.openFile(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_,
                       "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_,
                   "module could not be imported: %s\n" % script_name)
            continue

        E.Start = LocalStart

        try:
            module.main(argv=["--help"])
        except AttributeError:
            yield (fail_,
                   "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_,
                   "script does not use E.Start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seems to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
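# Hedged sketch of the FILENAME_OPTIONLIST table read above (rows are
# illustrative): readMap(..., columns=(0, 1), has_header=True) skips the
# header row and keeps only the first two columns, mapping a long option
# name to the optparse action expected for it:
#
#   option           action
#   output-format    store
#   force-output     store_true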
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $",
        usage=globals()["__doc__"])

    dbhandle = Database.Database()

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g", "--genes", dest="filename_genes", type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b", "--background", dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m", "--minimum-counts", dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes [default=%default].")

    parser.add_option("-o", "--sort-order", dest="sort_order", type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology", dest="ontology", type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="significance threshold [>1.0 = all ]. If --fdr "
                      "is set, this refers to the fdr, otherwise it is a "
                      "cutoff for p-values.")

    parser.add_option("--filename-dump", dest="filename_dump", type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option("--filename-gene2name", dest="filename_gene2name",
                      type="string",
                      help="optional filename mapping gene identifiers to "
                      "gene names [default=%default].")

    parser.add_option("--filename-ontology", dest="filename_ontology",
                      type="string",
                      help="filename with ontology in OBO format "
                      "[default=%default].")

    parser.add_option("--filename-input", dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample", type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option("--filename-output-pattern",
                      "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="pattern with output filename pattern "
                      "(should contain: %(go)s and %(section)s ) "
                      "[default=%default]")

    parser.add_option("--fdr", dest="fdr", action="store_true",
                      help="calculate and filter by FDR [default=%default].")

    parser.add_option("--go2goslim", dest="go2goslim", action="store_true",
                      help="convert go assignments in STDIN to goslim "
                      "assignments and write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern", dest="gene_pattern", type="string",
                      help="pattern to transform identifiers to GO gene "
                      "names [default=%default].")

    parser.add_option("--filename-map-slims", dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option("--get-genes", dest="get_genes", type="string",
                      help="list all genes with a certain GOID "
                      "[default=%default].")

    parser.add_option("--strict", dest="strict", action="store_true",
                      help="require all genes in foreground to be part of "
                      "background. If not set, genes in foreground will be "
                      "added to the background [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method",
                      type="choice",
                      choices=("empirical", "storey", "BH"),
                      help="method to perform multiple testing correction "
                      "by controlling the fdr [default=%default].")

    parser.add_option("--pairwise", dest="compute_pairwise",
                      action="store_true",
                      help="compute pairwise enrichment for multiple gene "
                      "lists [default=%default].")

    # parser.add_option( "--qvalue-lambda", dest="qvalue_lambda",
    #                    type="float",
    #                    help="fdr computation: lambda [default=%default]." )
    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method",
    #                    type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    #                    help="fdr computation: method for estimating pi0 "
    #                    "[default=%default]." )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    if options.go2goslim:
        convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies for the dump
        if not options.ontology:
            options.ontology = ["biol_process",
                                "mol_function",
                                "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle.Connect(options)

        outfile = IOTools.openFile(options.filename_dump, "w",
                                   create_dir=True)
        DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        gene2name = None

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GOInfo)

        # substitute go2infos
        go2infos = collections.defaultdict(_g)
        for go in ontology.values():
            go2infos[go.mNameSpace][go.mId] = GOInfo(
                go.mId, go_type=go.mNameSpace, description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for foreground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:
        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = gene2gos.keys()

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforeground",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in options.ontology:

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], \
                go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([x for x, y in counts_per_category.iteritems()
                             if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped to "
                   "%i categories (%i maps)" % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in genelists.iteritems():

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % \
                    (len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))
                    background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset?
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, \
            #     "%i genes in foreground set without GO annotation: %s" % \
            #     (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in go_slims.values():
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in go_slims.items():
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in gene2go.items():
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in fg:
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in bg:
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in ng:
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='foreground',
                                  set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='background',
                                  set=genelist_name)

            # background is a plain list here: write the whole list, not
            # its first element
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = go_results.mResults.items()

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = computeFDRs(go_results,
                                                    foreground,
                                                    background,
                                                    options,
                                                    test_ontology,
                                                    gene2go,
                                                    go2info)
                for x, v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(lambda x, y: cmp(x[1].mQValue, y[1].mQValue))
            elif options.sort_order == "ratio":
                pairs.sort(lambda x, y: cmp(x[1].mRatio, y[1].mRatio))
            elif options.sort_order == "pvalue":
                pairs.sort(lambda x, y: cmp(x[1].mPValue, y[1].mPValue))

            #############################################################
            # output the full result
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='overall',
                                  set=genelist_name)

            outputResults(outfile, pairs, go2info, options,
                          fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # filter significant results and output
            filtered_pairs = selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len(
                [x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='results',
                                  set=genelist_name)

            outputResults(outfile, filtered_pairs, go2info, options,
                          fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                CountGO(gene2go)

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='parameters',
                                  set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write("genes_in_fg_with_assignment\t%i\tgenes in "
                          "foreground with GO assignments\n" %
                          (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" %
                          nbackground)
            outfile.write("genes_in_bg_with_assignment\t%i\tgenes in "
                          "background with GO assignments\n" %
                          (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write("associations_in_bg\t%i\tassociations in "
                          "background\n" %
                          go_results.mBackgroundCountsTotal)
            outfile.write("percent_genes_in_fg_with_association\t%s\t"
                          "percent genes in sample with GO assignments\n" %
                          (IOTools.prettyPercent(
                              len(go_results.mSampleGenes),
                              len(foreground), "%5.2f")))
            outfile.write("percent_genes_in_bg_with_associations\t%s\t"
                          "percent genes in background with GO "
                          "assignments\n" %
                          (IOTools.prettyPercent(
                              len(go_results.mBackgroundGenes),
                              nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write("significant_up\t%i\tsignificant up-regulated "
                          "results reported\n" % nselected_up)
            outfile.write("significant_down\t%i\tsignificant down-regulated "
                          "results reported\n" % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.prettyPercent(len(go_results.mSampleGenes),
                                      len(foreground), "%5.2f"),
                IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                      nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            # output the fg patterns
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='withgenes',
                                  set=genelist_name)

            outputResults(outfile, pairs, go2info, options,
                          fdrs=fdrs,
                          samples=samples,
                          gene2go=gene2go,
                          foreground=foreground,
                          gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files

            # significant results
            outputMultipleGeneListResults(all_significant_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='significant')

            # all results
            outputMultipleGeneListResults(all_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='all')

            if options.compute_pairwise:
                pairwiseGOEnrichment(all_results,
                                     all_genelists_with_results,
                                     test_ontology,
                                     go2info,
                                     options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g", "--genes-tsv-file", dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b", "--background-tsv-file",
                      dest="filename_background", type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m", "--min-counts", dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o", "--sort-order", dest="sort_order", type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology", dest="ontology", type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump", dest="filename_dump", type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name", type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input", dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample", type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr", dest="fdr", action="store_true",
                      help="calculate and filter by FDR [default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern", dest="gene_pattern", type="string",
                      help="pattern to transform identifiers to GO gene "
                      "names [default=%default].")

    parser.add_option("--filename-map-slims", dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes with a certain GOID [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                    help="fdr computation: lambda [default=%default]." )
    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method",
    #                    type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    #                    help="fdr computation: method for estimating pi0 "
    #                    "[default=%default]." )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies for the dump
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = connectToEnsembl(options)

        outfile = IOTools.openFile(options.filename_dump, "w",
                                   create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        # substitute go2infos
        go2infos = collections.defaultdict(_g)
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId, go_type=go.mNameSpace, description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for foreground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:
        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforeground",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], \
                go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle = connectToEnsembl(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = \
            GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([x for x, y in counts_per_category.items()
                             if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" %
                   (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))
                    background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset?
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, \
            #     "%i genes in foreground set without GO annotation: %s" % \
            #     (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims,
                                         ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
                for x, v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs,
                                                         options)

            nselected = len(filtered_pairs)
            nselected_up = len(
                [x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile, filtered_pairs, go2info, options,
                             fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write("genes_in_fg_with_assignment\t%i\tgenes in "
                          "foreground with GO assignments\n" %
                          (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" %
                          nbackground)
            outfile.write("genes_in_bg_with_assignment\t%i\tgenes in "
                          "background with GO assignments\n" %
                          (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write("associations_in_bg\t%i\tassociations in "
                          "background\n" %
                          go_results.mBackgroundCountsTotal)
            outfile.write("percent_genes_in_fg_with_association\t%s\t"
                          "percent genes in sample with GO assignments\n" %
                          (IOTools.prettyPercent(
                              len(go_results.mSampleGenes),
                              len(foreground), "%5.2f")))
            outfile.write("percent_genes_in_bg_with_associations\t%s\t"
                          "percent genes in background with GO "
                          "assignments\n" %
                          (IOTools.prettyPercent(
                              len(go_results.mBackgroundGenes),
                              nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write("significant_up\t%i\tsignificant up-regulated "
                          "results reported\n" % nselected_up)
            outfile.write("significant_down\t%i\tsignificant down-regulated "
                          "results reported\n" % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.prettyPercent(len(go_results.mSampleGenes),
                                      len(foreground), "%5.2f"),
                IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                      nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files

            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-s", "--suffix", dest="suffixes", action="append", type="string", help="mapping of suffix [%default]") parser.add_option("-m", "--map", dest="filename_map", type="string", help="filename containing a mapping of filenames " "to destinations [%default]") parser.add_option("-u", "--no-sort", dest="do_sort", action="store_false", help="do not sort filenames before grouping " "[%default]") parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true", help="dry run - do not merge " "[%default]") parser.set_defaults(filename_map=None, do_sort=True, dry_run=False, suffixes=[]) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) if options.filename_map: map_regex2dest = IOTools.readMap(IOTools.openFile( options.filename_map)) map_regex2dest = dict([(re.compile(x), y) for x, y in map_regex2dest.items()]) map_suffix2dest = {} for suffix in options.suffixes: src, dest = suffix.split("=") map_suffix2dest[src.strip()] = dest.strip() filenames = args if options.do_sort: filenames.sort() dest2src = collections.defaultdict(list) for filename in filenames: dests = [] for regex, dest in map_regex2dest.items(): if regex.search(filename): dests.append(dest) if len(dests) == 0: raise ValueError("no destination found for %s" % filename) elif len(dests) > 1: raise ValueError("multiple destinations found for %s: %s " % (filename, dests)) dest = dests[0] # implement suffix mapping, note that # suffixes can extend beyond an extension for suffix, new_suffix in map_suffix2dest.items(): if filename.endswith(suffix): if suffix in map_suffix2dest: dest = dest + map_suffix2dest[suffix] break dest2src[dest].append(filename) for dest, srcs in sorted(dest2src.iteritems()): E.info("merging: %s <- %s" % (dest, srcs)) if options.dry_run: continue E.run('cat %s > %s' % (" ".join(srcs), dest)) # write footer and output benchmark information. E.Stop()
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-s", "--filename-strand", dest="filename_strand",
                      type="string",
                      help="set strand information according to file "
                      "[default=%default].")

    parser.set_defaults(as_gtf=False,
                        filename_strand=None,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # a GTF.Entry serves as the record container for both output modes
    gff = GTF.Entry()
    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():
            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
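# Hedged example of a --filename-strand file (query ids are made up): a
# headerless two-column table read with IOTools.readMap that forces the
# reported strand for selected query ids; ids not listed keep the
# strand of their BLAT match:
#
#   query0001    +
#   query0002    -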
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to be sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information. "
                      "The input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcripts. "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.). "
                      "The input needs to be sorted by gene "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon "
                      "genes are skipped. "
                      "The input needs to be sorted by gene "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id", dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between transcript_id to protein_id "
                      "[default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d", "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns", dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene. "
                      "The features are labeled as 'intron'. "
                      "The input needs to be sorted by gene "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes, "
                      "'longest-transcript': output the longest "
                      "transcript per gene, "
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene "
                      "[default=%default].")

    parser.add_option("-r", "--rename", dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter "
                      "[default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v) "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set [default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length", type="int",
                      help="minimum length for introns (for "
                      "--exons2introns) [default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option("--intron-border",
                      dest="intron_border",
                      type="int",
                      help="number of residues to exclude at intron at "
                      "either end (--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') "
                      "when using --transcripts2genes "
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file. "
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# "
                      "are removed. This is necessary to remove duplicate "
                      "entries that are next to each other in the sort "
                      "order [%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(
            open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), \
                max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))
gffs])) if gffs[0].contig != last[0].contig or \ gffs[0].strand != last[0].strand or \ d > 0: yield to_join to_join = [] last = gffs to_join.append(gffs) yield to_join raise StopIteration for chunks in iterate_chunks(gffs): ninput += 1 if len(chunks) > 1: gene_id = "merged_%s" % chunks[0][0].gene_id transcript_id = "merged_%s" % chunks[0][0].transcript_id info = ",".join([x[0].gene_id for x in chunks]) else: gene_id = chunks[0][0].gene_id transcript_id = chunks[0][0].transcript_id info = None intervals = [] for c in chunks: intervals += [(x.start, x.end) for x in c] intervals = Intervals.combine(intervals) # take single strand strand = chunks[0][0].strand for start, end in intervals: y = GTF.Entry() y.fromGTF(chunks[0][0], gene_id, transcript_id) y.start = start y.end = end y.strand = strand if info: y.addAttribute("merged", info) options.stdout.write("%s\n" % str(y)) nfeatures += 1 noutput += 1 elif options.renumber_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 if gtf.gene_id not in map_old2new: map_old2new[gtf.gene_id] = options.renumber_genes % ( len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[gtf.gene_id]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.unset_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = gtf.transcript_id if key not in map_old2new: map_old2new[key] = options.unset_genes % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.renumber_transcripts: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = (gtf.gene_id, gtf.transcript_id) if key not in map_old2new: map_old2new[key] = options.renumber_transcripts % ( len(map_old2new) + 1) gtf.setAttribute("transcript_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.transcripts2genes: transcripts = set() genes = set() reset_strand = options.reset_strand for gtfs in GTF.iterator_transcripts2genes( GTF.iterator(options.stdin)): ninput += 1 for gtf in gtfs: if reset_strand: gtf.strand = "." 
options.stdout.write("%s\n" % str(gtf)) transcripts.add(gtf.transcript_id) genes.add(gtf.gene_id) nfeatures += 1 noutput += 1 E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes))) elif options.rename: map_old2new = IOTools.readMap(open(options.filename_filter, "r")) if options.rename == "transcript": is_gene_id = False elif options.rename == "gene": is_gene_id = True for gff in GTF.iterator(options.stdin): ninput += 1 if is_gene_id: if gff.gene_id in map_old2new: gff.setAttribute("gene_id", map_old2new[gff.gene_id]) else: E.debug("removing missing gene_id %s" % gff.gene_id) ndiscarded += 1 continue else: if gff.transcript_id in map_old2new: gff.setAttribute( "transcript_id", map_old2new[gff.transcript_id]) else: E.debug("removing missing transcript_id %s" % gff.transcript_id) ndiscarded += 1 continue noutput += 1 options.stdout.write("%s\n" % str(gff)) elif options.filter: keep_genes = set() if options.filter == "longest-gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) coords = [] gffs = [] for gff in iterator: gff.sort(key=lambda x: x.start) coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id)) gffs.append(gff) coords.sort() last_contig = None max_end = 0 longest_gene_id = None longest_length = None for contig, start, end, gene_id in coords: ninput += 1 if contig != last_contig or start >= max_end: if longest_gene_id: keep_genes.add(longest_gene_id) longest_gene_id = gene_id longest_length = end - start max_end = end else: if end - start > longest_length: longest_length, longest_gene_id = end - start, gene_id last_contig = contig max_end = max(max_end, end) keep_genes.add(longest_gene_id) invert = options.invert_filter for gff in gffs: keep = gff[0].gene_id in keep_genes if (keep and not invert) or (not keep and invert): noutput += 1 for g in gff: nfeatures += 1 options.stdout.write("%s\n" % g) else: ndiscarded += 1 elif options.filter in ("longest-transcript", "representative-transcript"): iterator = GTF.gene_iterator(GTF.iterator(options.stdin)) def selectLongestTranscript(gene): r = [] for transcript in gene: transcript.sort(key=lambda x: x.start) length = transcript[-1].end - transcript[0].start r.append((length, transcript)) r.sort() return r[-1][1] def selectRepresentativeTranscript(gene): '''select a representative transcript. The representative transcript represent the largest number of exons over all transcripts. 
''' all_exons = [] for transcript in gene: all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"]) exon_counts = {} for key, exons in itertools.groupby(all_exons): exon_counts[key] = len(list(exons)) transcript_counts = [] for transcript in gene: count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"]) transcript_counts.append((count, transcript)) transcript_counts.sort() return transcript_counts[-1][1] if options.filter == "longest-transcript": _select = selectLongestTranscript elif options.filter == "representative-transcript": _select = selectRepresentativeTranscript for gene in iterator: ninput += 1 transcript = _select(gene) noutput += 1 for g in transcript: nfeatures += 1 options.stdout.write("%s\n" % g) elif options.filter in ("gene", "transcript"): if options.filename_filter: ids, nerrors = IOTools.ReadList( open(options.filename_filter, "r")) E.info("read %i ids" % len(ids)) ids = set(ids) by_gene = options.filter == "gene" by_transcript = options.filter == "transcript" invert = options.invert_filter reset_strand = options.reset_strand for gff in GTF.iterator(options.stdin): ninput += 1 keep = False if by_gene: keep = gff.gene_id in ids if by_transcript: keep = gff.transcript_id in ids if (invert and keep) or (not invert and not keep): continue if reset_strand: gff.strand = "." options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.sample_size: if options.filter == "gene": iterator = GTF.flat_gene_iterator( GTF.iterator(options.stdin)) elif options.filter == "transcript": iterator = GTF.transcript_iterator( GTF.iterator(options.stdin)) if options.min_exons_length: iterator = GTF.iterator_min_feature_length( iterator, min_length=options.min_exons_length, feature="exon") data = [x for x in iterator] ninput = len(data) if len(data) > options.sample_size: data = random.sample(data, options.sample_size) for d in data: noutput += 1 for dd in d: nfeatures += 1 options.stdout.write(str(dd) + "\n") else: assert False, "please supply either a filename " "with ids to filter with (--apply) or a sample-size." 
elif options.exons2introns: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") input_ranges = Intervals.combine(cds_ranges + exon_ranges) if len(input_ranges) > 1: last = input_ranges[0][1] output_ranges = [] for start, end in input_ranges[1:]: output_ranges.append((last, start)) last = end if options.intron_border: b = options.intron_border output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges] if options.intron_min_length: l = options.intron_min_length output_ranges = [ x for x in output_ranges if x[1] - x[0] > l] for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "intron" entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 else: ndiscarded += 1 elif options.set_score2distance: for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(gffs[0].strand) all_start, all_end = min([x.start for x in gffs]), max( [x.end for x in gffs]) if strand != ".": t = 0 if strand == "-": gffs.reverse() for gff in gffs: gff.score = t t += gff.end - gff.start if strand == "-": gffs.reverse() for gff in gffs: options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.remove_overlapping: index = GTF.readAndIndex( GTF.iterator(IOTools.openFile(options.remove_overlapping, "r"))) for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 found = False for e in gffs: if index.contains(e.contig, e.start, e.end): found = True break if found: ndiscarded += 1 else: noutput += 1 for e in gffs: nfeatures += 1 options.stdout.write("%s\n" % str(e)) elif options.intersect_transcripts: for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 r = [] for g in gffs: if options.with_utr: ranges = GTF.asRanges(g, "exon") else: ranges = GTF.asRanges(g, "CDS") r.append(ranges) result = r[0] for x in r[1:]: result = Intervals.intersect(result, x) entry = GTF.Entry() entry.copy(gffs[0][0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "exon" for start, end in result: entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 elif options.rename_duplicates: gene_ids = list() transcript_ids = list() gtfs = list() for gtf in GTF.iterator(options.stdin): gtfs.append(gtf) if gtf.feature == "CDS": gene_ids.append(gtf.gene_id) transcript_ids.append(gtf.transcript_id) dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1] dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1] E.info("Number of duplicated gene_ids: %i" % len(dup_gene)) E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript)) gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene)))) transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript)))) for gtf in gtfs: if gtf.feature == "CDS": if gtf.gene_id in dup_gene: gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1 gtf.setAttribute('gene_id', gtf.gene_id + "." + str(gene_dict[gtf.gene_id])) if gtf.transcript_id in dup_transcript: transcript_dict[gtf.transcript_id] = \ transcript_dict[gtf.transcript_id] + 1 gtf.setAttribute('transcript_id', gtf.transcript_id + "." 
+ str(transcript_dict[gtf.transcript_id])) options.stdout.write("%s\n" % gtf) else: for gffs in GTF.flat_gene_iterator( GTF.iterator(options.stdin), strict=options.strict): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") # sanity checks strands = set([x.strand for x in gffs]) contigs = set([x.contig for x in gffs]) if len(strands) > 1: raise ValueError("can not merge gene '%s' on multiple strands: %s" % ( gffs[0].gene_id, str(strands))) if len(contigs) > 1: raise ValueError("can not merge gene '%s' on multiple contigs: %s" % ( gffs[0].gene_id, str(contigs))) strand = Genomics.convertStrand(gffs[0].strand) if cds_ranges and options.with_utr: cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1] midpoint = (cds_end - cds_start) / 2 + cds_start utr_ranges = [] for start, end in Intervals.truncate(exon_ranges, cds_ranges): if end - start > 3: if strand == ".": feature = "UTR" elif strand == "+": if start < midpoint: feature = "UTR5" else: feature = "UTR3" elif strand == "-": if start < midpoint: feature = "UTR3" else: feature = "UTR5" utr_ranges.append((feature, start, end)) output_feature = "CDS" output_ranges = cds_ranges else: output_feature = "exon" output_ranges = exon_ranges utr_ranges = [] result = [] if options.merge_exons: # need to combine per feature - skip # utr_ranges = Intervals.combineAtDistance( # utr_ranges, # options.merge_exons_distance) output_ranges = Intervals.combineAtDistance( output_ranges, options.merge_exons_distance) for feature, start, end in utr_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.feature = feature entry.transcript_id = "merged" entry.start = start entry.end = end result.append(entry) for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = output_feature entry.start = start entry.end = end result.append(entry) elif options.merge_transcripts: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][0] entry.end = output_ranges[-1][1] result.append(entry) elif options.merge_introns: if len(output_ranges) >= 2: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][1] entry.end = output_ranges[-1][0] result.append(entry) else: ndiscarded += 1 continue result.sort(key=lambda x: x.start) for x in result: options.stdout.write("%s\n" % str(x)) nfeatures += 1 noutput += 1 E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded)) E.Stop()
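The --filter=longest-gene branch above keeps the longest gene from each cluster of overlapping genes in a single sweep over (contig, start)-sorted coordinates. A minimal standalone sketch of that sweep, using plain tuples instead of GTF entries (illustration only, not part of the script):

def longest_genes(coords):
    # coords: list of (contig, start, end, gene_id); sorted in place
    coords.sort()
    keep = set()
    last_contig, max_end = None, 0
    longest_id, longest_len = None, None
    for contig, start, end, gene_id in coords:
        if contig != last_contig or start >= max_end:
            # a new non-overlapping cluster starts: commit the previous winner
            if longest_id:
                keep.add(longest_id)
            longest_id, longest_len = gene_id, end - start
            max_end = end
        elif end - start > longest_len:
            longest_len, longest_id = end - start, gene_id
        last_contig = contig
        max_end = max(max_end, end)
    if longest_id:
        keep.add(longest_id)
    return keep

# longest_genes([("chr1", 0, 100, "g1"), ("chr1", 50, 400, "g2")]) -> {"g2"}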
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--gtf-file", dest="filename_gtf", type="string", help="filename with gene models in gtf format [%default]") parser.add_option( "-m", "--filename-mismapped", dest="filename_mismapped", type="string", help="output bam file for mismapped reads [%default]") parser.add_option( "-j", "--junctions-bed-file", dest="filename_junctions", type="string", help="bam file with reads mapped across junctions [%default]") parser.add_option( "-r", "--filename-regions", dest="filename_regions", type="string", help="filename with regions to remove in bed format [%default]") parser.add_option( "-t", "--transcripts-gtf-file", dest="filename_transcriptome", type="string", help="bam file with reads mapped against transcripts [%default]") parser.add_option( "-p", "--map-tsv-file", dest="filename_map", type="string", help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) [%default]") parser.add_option( "-s", "--filename-stats", dest="filename_stats", type="string", help="filename to output stats to [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option( "-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option( "-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.add_option("--output-sam", dest="output_sam", action="store_true", help="output in sam format [%default]") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = IOTools.readMap( IOTools.openFile(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.items()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(IOTools.openFile(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.openFile(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile) else: output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.AlignmentFile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.AlignmentFile(options.filename_junctions, "rb") else: junctions_samfile = None c = _bams2bam.filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = IOTools.openFile(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.Stop()
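The id_map handling above reads the tsv one way (transcript number to transcript name) and then inverts it, because the lookup needs to go from gene-set transcript names back to the numeric ids used in the transcriptome bam. A small illustration of the pattern (hypothetical ids; assumes the mapping is one-to-one, otherwise entries are silently collapsed):

# as read from the tsv: number -> name
id_map = {"1": "ENST0001", "2": "ENST0002"}
# inverted: name -> number, the direction needed for the lookup
id_map = dict((name, number) for number, name in id_map.items())
assert id_map["ENST0001"] == "1"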
def main(argv=None): if not argv: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--merge-exons", dest="merge_exons", action="store_true", help="merge overlapping exons of all transcripts " "within a gene. " "The merged exons will be output. " "Input needs to be sorted by gene [default=%default].") parser.add_option("-t", "--merge-transcripts", dest="merge_transcripts", action="store_true", help="merge all transcripts within a gene. " "The entry will span the whole gene " "(exons and introns). " "The transcript does not include the UTR unless " "--with-utr is set. [default=%default].") parser.add_option("--merge-genes", dest="merge_genes", action="store_true", help="merge overlapping genes if their exons overlap. " "A gene with a single transcript containing all exons " "of the overlapping transcripts will be output. " "This operation ignores strand information. " "The input needs to be sorted by transcript " "[default=%default].") parser.add_option("--merge-exons-distance", dest="merge_exons_distance", type="int", help="distance in nucleotides between " "exons to be merged [default=%default].") parser.add_option("-j", "--join-exons", dest="join_exons", action="store_true", help="join all exons per transcript. " "A new transcript will be " "output that spans a whole transcript. " "Input needs to be sorted by transcript " "[default=%default].") parser.add_option("--unset-genes", dest="unset_genes", type="string", help="unset gene identifiers, keeping " "transcripts intact. " "New gene identifiers are set to the " "pattern given. For example, " "'--unset-genes=%06i' [default=%default].") parser.add_option("--sort", dest="sort", type="choice", choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"), help="sort input data [default=%default].") parser.add_option("-u", "--with-utr", dest="with_utr", action="store_true", help="include utr in merged transcripts " "[default=%default].") parser.add_option("--intersect-transcripts", dest="intersect_transcripts", action="store_true", help="intersect all transcripts within a gene. " "The entry will only span those bases " "that are covered by all transcripts. " "The transcript does not include the UTR unless " "--with-utr is set. This method " "will remove all other features (stop_codon, etc.) " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-i", "--merge-introns", dest="merge_introns", action="store_true", help="merge and output all introns within a " "gene. The output will contain " "all intronic regions within a gene. Single exon genes " "are skipped. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-g", "--set-transcript-to-gene", "--set-transcript2gene", dest="set_transcript2gene", action="store_true", help="set the transcript_id to the " "gene_id [default=%default].") parser.add_option("--set-protein-to-transcript", dest="set_protein2transcript", action="store_true", help="set the protein_id to the " "transcript_id [default=%default].") parser.add_option("--add-protein-id", dest="add_protein_id", type="string", help="add a protein_id for each transcript_id. 
" "The argument is a filename containing a mapping " "between " "transcript_id to protein_id [default=%default].") parser.add_option("-G", "--set-gene-to-transcript", "--set-gene2transcript", dest="set_gene2transcript", action="store_true", help="set the gene_id to the " "transcript_id [default=%default].") parser.add_option("-d", "--set-score2distance", dest="set_score2distance", action="store_true", help="set the score field for each feature to the " "distance to " "transcription start site [default=%default].") parser.add_option("--exons2introns", dest="exons2introns", action="store_true", help="for each gene build an 'intronic' transcript " "containing the union of all intronic regions " "of all transcripts in a gene." "The features are labeled as 'intron'." "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-f", "--filter", dest="filter", type="choice", choices=("gene", "transcript", "longest-gene", "longest-transcript", "representative-transcript"), help="apply a filter to the input file. Available " "filters are: " "'gene': filter by gene_id, " "'transcript': filter by transcript_id, " "'longest-gene': output the longest gene for " "overlapping genes ," "'longest-transcript': output the longest " "transcript per gene," "'representative-transcript': output the " "representative transcript per gene. " "The representative transcript is the transcript " "that shares most exons with " "the other transcripts in a gene. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-r", "--rename", dest="rename", type="choice", choices=("gene", "transcript"), help="rename genes or transcripts with a map " "given by the option `--apply`. " "Those that can not be renamed are removed " "[default=%default].") parser.add_option("--renumber-genes", dest="renumber_genes", type="string", help="renumber genes according to the given pattern. " "[default=%default].") parser.add_option("--renumber-transcripts", dest="renumber_transcripts", type="string", help="renumber transcripts according to the " "given pattern. " "[default=%default].") parser.add_option("-a", "--apply", dest="filename_filter", type="string", metavar="tsv", help="filename of ids to map/filter [default=%default].") parser.add_option("--invert-filter", dest="invert_filter", action="store_true", help="when using --filter, invert selection " "(like grep -v). 
" "[default=%default].") parser.add_option("--sample-size", dest="sample_size", type="int", help="extract a random sample of size # if the option " "'--filter' is set[default=%default].") parser.add_option("--intron-min-length", dest="intron_min_length", type="int", help="minimum length for introns (for --exons2introns) " "[default=%default].") parser.add_option("--min-exons-length", dest="min_exons_length", type="int", help="minimum length for gene (sum of exons) " "(--sample-size) [default=%default].") parser.add_option( "--intron-border", dest="intron_border", type="int", help="number of residues to exclude at intron at either end " "(--exons2introns) [default=%default].") parser.add_option("--transcripts2genes", dest="transcripts2genes", action="store_true", help="cluster overlapping transcripts into genes.") parser.add_option("--reset-strand", dest="reset_strand", action="store_true", help="remove strandedness of features (set to '.') when " "using --transcripts2genes" "[default=%default].") parser.add_option("--remove-overlapping", dest="remove_overlapping", type="string", metavar="gff", help="remove all transcripts that overlap intervals " "in a gff-formatted file." "The comparison ignores strand " "[default=%default].") parser.add_option("--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[default=%default]") parser.add_option("--remove-duplicates", dest="remove_duplicates", type="choice", choices=("gene", "transcript", "ucsc", "coordinates"), help="remove duplicates by gene/transcript. " "If ``ucsc`` is chosen, transcripts ending on _dup# are " "removed. This is necessary to remove duplicate entries " "that are next to each other in the sort order " "[%default]") parser.add_option("--rename-duplicates", dest="rename_duplicates", action="store_true", help="rename duplicate gene_ids and transcript_ids by " "addition of a numerical suffix") parser.set_defaults( sort=None, merge_exons=False, join_exons=False, merge_exons_distance=0, merge_transcripts=False, set_score2distance=False, set_gene2transcript=False, set_transcript2gene=False, set_protein2transcript=False, add_protein_id=None, filename_filter=None, filter=None, exons2introns=None, merge_genes=False, intron_border=None, intron_min_length=None, sample_size=0, min_exons_length=0, transripts2genes=False, reset_strand=False, with_utr=False, invert_filter=False, remove_duplicates=None, remove_overlapping=None, renumber_genes=None, unset_genes=None, renumber_transcripts=None, strict=True, intersect_transcripts=False, rename_duplicates=False, ) (options, args) = E.Start(parser, argv=argv) ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0 if options.set_transcript2gene: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("transcript_id", gff.gene_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.remove_duplicates: counts = collections.defaultdict(int) if options.remove_duplicates == "ucsc": store = [] remove = set() f = lambda x: x[0].transcript_id gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) outf = lambda x: "\n".join([str(y) for y in x]) for entry in gffs: ninput += 1 store.append(entry) id = f(entry) if "_dup" in id: remove.add(re.sub("_dup\d+", "", id)) remove.add(id) for entry in store: id = f(entry) if id not in remove: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s" % (id)) else: if options.remove_duplicates == "gene": gffs = 
GTF.gene_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0][0].gene_id outf = lambda x: "\n".join( ["\n".join([str(y) for y in xx]) for xx in x]) elif options.remove_duplicates == "transcript": gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0].transcript_id outf = lambda x: "\n".join([str(y) for y in x]) elif options.remove_duplicates == "coordinates": gffs = GTF.chunk_iterator(GTF.iterator(options.stdin)) f = lambda x: x[0].contig + "_" + \ str(x[0].start) + "-" + str(x[0].end) outf = lambda x: "\n".join([str(y) for y in x]) store = [] for entry in gffs: ninput += 1 store.append(entry) id = f(entry) counts[id] += 1 # Assumes GTF file sorted by contig then start last_id = "" if options.remove_duplicates == "coordinates": for entry in store: id = f(entry) if id == last_id: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) else: options.stdout.write(outf(entry) + "\n") noutput += 1 last_id = id else: for entry in store: id = f(entry) if counts[id] == 1: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) elif options.sort: for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort): ninput += 1 options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_gene2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("gene_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_protein2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("protein_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.add_protein_id: transcript2protein = IOTools.readMap(open(options.add_protein_id, "r")) missing = set() for gff in GTF.iterator(options.stdin): ninput += 1 if gff.transcript_id not in transcript2protein: if gff.transcript_id not in missing: E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id) missing.add(gff.transcript_id) ndiscarded += 1 continue gff.setAttribute("protein_id", transcript2protein[gff.transcript_id]) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 E.info("transcripts removed due to missing protein ids: %i" % len(missing)) elif options.join_exons: for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(exons[0].strand) contig = exons[0].contig transid = exons[0].transcript_id geneid = exons[0].gene_id biotype = exons[0].source all_start, all_end = min([x.start for x in exons ]), max([x.end for x in exons]) y = GTF.Entry() y.contig = contig y.source = biotype y.feature = "transcript" y.start = all_start y.end = all_end y.strand = strand y.transcript_id = transid y.gene_id = geneid options.stdout.write("%s\n" % str(y)) elif options.merge_genes: # merges overlapping genes # gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator( GTF.iterator(options.stdin)), sort_by="contig-strand-start") def iterate_chunks(gff_chunks): last = gff_chunks.next() to_join = [last] for gffs in gff_chunks: d = gffs[0].start - last[-1].end if gffs[0].contig == last[0].contig and \ gffs[0].strand == last[0].strand: assert gffs[0].start >= last[0].start, \ ("input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \ (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs])) 
if gffs[0].contig != last[0].contig or \ gffs[0].strand != last[0].strand or \ d > 0: yield to_join to_join = [] last = gffs to_join.append(gffs) yield to_join raise StopIteration for chunks in iterate_chunks(gffs): ninput += 1 if len(chunks) > 1: gene_id = "merged_%s" % chunks[0][0].gene_id transcript_id = "merged_%s" % chunks[0][0].transcript_id info = ",".join([x[0].gene_id for x in chunks]) else: gene_id = chunks[0][0].gene_id transcript_id = chunks[0][0].transcript_id info = None intervals = [] for c in chunks: intervals += [(x.start, x.end) for x in c] intervals = Intervals.combine(intervals) # take single strand strand = chunks[0][0].strand for start, end in intervals: y = GTF.Entry() y.fromGTF(chunks[0][0], gene_id, transcript_id) y.start = start y.end = end y.strand = strand if info: y.addAttribute("merged", info) options.stdout.write("%s\n" % str(y)) nfeatures += 1 noutput += 1 elif options.renumber_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 if gtf.gene_id not in map_old2new: map_old2new[gtf.gene_id] = options.renumber_genes % ( len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[gtf.gene_id]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.unset_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = gtf.transcript_id if key not in map_old2new: map_old2new[key] = options.unset_genes % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.renumber_transcripts: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = (gtf.gene_id, gtf.transcript_id) if key not in map_old2new: map_old2new[key] = options.renumber_transcripts % ( len(map_old2new) + 1) gtf.setAttribute("transcript_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.transcripts2genes: transcripts = set() genes = set() reset_strand = options.reset_strand for gtfs in GTF.iterator_transcripts2genes(GTF.iterator( options.stdin)): ninput += 1 for gtf in gtfs: if reset_strand: gtf.strand = "." 
options.stdout.write("%s\n" % str(gtf)) transcripts.add(gtf.transcript_id) genes.add(gtf.gene_id) nfeatures += 1 noutput += 1 E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes))) elif options.rename: map_old2new = IOTools.readMap(open(options.filename_filter, "r")) if options.rename == "transcript": is_gene_id = False elif options.rename == "gene": is_gene_id = True for gff in GTF.iterator(options.stdin): ninput += 1 if is_gene_id: if gff.gene_id in map_old2new: gff.setAttribute("gene_id", map_old2new[gff.gene_id]) else: E.debug("removing missing gene_id %s" % gff.gene_id) ndiscarded += 1 continue else: if gff.transcript_id in map_old2new: gff.setAttribute("transcript_id", map_old2new[gff.transcript_id]) else: E.debug("removing missing transcript_id %s" % gff.transcript_id) ndiscarded += 1 continue noutput += 1 options.stdout.write("%s\n" % str(gff)) elif options.filter: keep_genes = set() if options.filter == "longest-gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) coords = [] gffs = [] for gff in iterator: gff.sort(key=lambda x: x.start) coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id)) gffs.append(gff) coords.sort() last_contig = None max_end = 0 longest_gene_id = None longest_length = None for contig, start, end, gene_id in coords: ninput += 1 if contig != last_contig or start >= max_end: if longest_gene_id: keep_genes.add(longest_gene_id) longest_gene_id = gene_id longest_length = end - start max_end = end else: if end - start > longest_length: longest_length, longest_gene_id = end - start, gene_id last_contig = contig max_end = max(max_end, end) keep_genes.add(longest_gene_id) invert = options.invert_filter for gff in gffs: keep = gff[0].gene_id in keep_genes if (keep and not invert) or (not keep and invert): noutput += 1 for g in gff: nfeatures += 1 options.stdout.write("%s\n" % g) else: ndiscarded += 1 elif options.filter in ("longest-transcript", "representative-transcript"): iterator = GTF.gene_iterator(GTF.iterator(options.stdin)) def selectLongestTranscript(gene): r = [] for transcript in gene: transcript.sort(key=lambda x: x.start) length = transcript[-1].end - transcript[0].start r.append((length, transcript)) r.sort() return r[-1][1] def selectRepresentativeTranscript(gene): '''select a representative transcript. The representative transcript represent the largest number of exons over all transcripts. ''' all_exons = [] for transcript in gene: all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"]) exon_counts = {} for key, exons in itertools.groupby(all_exons): exon_counts[key] = len(list(exons)) transcript_counts = [] for transcript in gene: count = sum([ exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon" ]) transcript_counts.append((count, transcript)) transcript_counts.sort() return transcript_counts[-1][1] if options.filter == "longest-transcript": _select = selectLongestTranscript elif options.filter == "representative-transcript": _select = selectRepresentativeTranscript for gene in iterator: ninput += 1 # sort in order to make reproducible which # gene is chosen. 
transcript = _select(sorted(gene)) noutput += 1 for g in transcript: nfeatures += 1 options.stdout.write("%s\n" % g) elif options.filter in ("gene", "transcript"): if options.filename_filter: ids, nerrors = IOTools.ReadList( open(options.filename_filter, "r")) E.info("read %i ids" % len(ids)) ids = set(ids) by_gene = options.filter == "gene" by_transcript = options.filter == "transcript" invert = options.invert_filter reset_strand = options.reset_strand for gff in GTF.iterator(options.stdin): ninput += 1 keep = False if by_gene: keep = gff.gene_id in ids if by_transcript: keep = gff.transcript_id in ids if (invert and keep) or (not invert and not keep): continue if reset_strand: gff.strand = "." options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.sample_size: if options.filter == "gene": iterator = GTF.flat_gene_iterator( GTF.iterator(options.stdin)) elif options.filter == "transcript": iterator = GTF.transcript_iterator( GTF.iterator(options.stdin)) if options.min_exons_length: iterator = GTF.iterator_min_feature_length( iterator, min_length=options.min_exons_length, feature="exon") data = [x for x in iterator] ninput = len(data) if len(data) > options.sample_size: data = random.sample(data, options.sample_size) for d in data: noutput += 1 for dd in d: nfeatures += 1 options.stdout.write(str(dd) + "\n") else: assert False, "please supply either a filename " "with ids to filter with (--apply) or a sample-size." elif options.exons2introns: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") input_ranges = Intervals.combine(cds_ranges + exon_ranges) if len(input_ranges) > 1: last = input_ranges[0][1] output_ranges = [] for start, end in input_ranges[1:]: output_ranges.append((last, start)) last = end if options.intron_border: b = options.intron_border output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges] if options.intron_min_length: l = options.intron_min_length output_ranges = [ x for x in output_ranges if x[1] - x[0] > l ] for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "intron" entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 else: ndiscarded += 1 elif options.set_score2distance: for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(gffs[0].strand) all_start, all_end = min([x.start for x in gffs ]), max([x.end for x in gffs]) if strand != ".": t = 0 if strand == "-": gffs.reverse() for gff in gffs: gff.score = t t += gff.end - gff.start if strand == "-": gffs.reverse() for gff in gffs: options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.remove_overlapping: index = GTF.readAndIndex( GTF.iterator(IOTools.openFile(options.remove_overlapping, "r"))) for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 found = False for e in gffs: if index.contains(e.contig, e.start, e.end): found = True break if found: ndiscarded += 1 else: noutput += 1 for e in gffs: nfeatures += 1 options.stdout.write("%s\n" % str(e)) elif options.intersect_transcripts: for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 r = [] for g in gffs: if options.with_utr: ranges = GTF.asRanges(g, "exon") else: ranges = GTF.asRanges(g, "CDS") r.append(ranges) result = r[0] for x in r[1:]: 
result = Intervals.intersect(result, x) entry = GTF.Entry() entry.copy(gffs[0][0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "exon" for start, end in result: entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 elif options.rename_duplicates: gene_ids = list() transcript_ids = list() gtfs = list() for gtf in GTF.iterator(options.stdin): gtfs.append(gtf) if gtf.feature == "CDS": gene_ids.append(gtf.gene_id) transcript_ids.append(gtf.transcript_id) dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1] dup_transcript = [ item for item in set(transcript_ids) if transcript_ids.count(item) > 1 ] E.info("Number of duplicated gene_ids: %i" % len(dup_gene)) E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript)) gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene)))) transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript)))) for gtf in gtfs: if gtf.feature == "CDS": if gtf.gene_id in dup_gene: gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1 gtf.setAttribute( 'gene_id', gtf.gene_id + "." + str(gene_dict[gtf.gene_id])) if gtf.transcript_id in dup_transcript: transcript_dict[gtf.transcript_id] = \ transcript_dict[gtf.transcript_id] + 1 gtf.setAttribute( 'transcript_id', gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id])) options.stdout.write("%s\n" % gtf) else: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") # sanity checks strands = set([x.strand for x in gffs]) contigs = set([x.contig for x in gffs]) if len(strands) > 1: raise ValueError( "can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands))) if len(contigs) > 1: raise ValueError( "can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs))) strand = Genomics.convertStrand(gffs[0].strand) if cds_ranges and options.with_utr: cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1] midpoint = (cds_end - cds_start) / 2 + cds_start utr_ranges = [] for start, end in Intervals.truncate(exon_ranges, cds_ranges): if end - start > 3: if strand == ".": feature = "UTR" elif strand == "+": if start < midpoint: feature = "UTR5" else: feature = "UTR3" elif strand == "-": if start < midpoint: feature = "UTR3" else: feature = "UTR5" utr_ranges.append((feature, start, end)) output_feature = "CDS" output_ranges = cds_ranges else: output_feature = "exon" output_ranges = exon_ranges utr_ranges = [] result = [] if options.merge_exons: # need to combine per feature - skip # utr_ranges = Intervals.combineAtDistance( # utr_ranges, # options.merge_exons_distance) output_ranges = Intervals.combineAtDistance( output_ranges, options.merge_exons_distance) for feature, start, end in utr_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.feature = feature entry.transcript_id = "merged" entry.start = start entry.end = end result.append(entry) for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = output_feature entry.start = start entry.end = end result.append(entry) elif options.merge_transcripts: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][0] entry.end = output_ranges[-1][1] result.append(entry) elif options.merge_introns: if len(output_ranges) >= 2: 
entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][1] entry.end = output_ranges[-1][0] result.append(entry) else: ndiscarded += 1 continue result.sort(key=lambda x: x.start) for x in result: options.stdout.write("%s\n" % str(x)) nfeatures += 1 noutput += 1 E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded)) E.Stop()
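Both copies of this script delegate the --merge-exons step to Intervals.combineAtDistance, whose implementation is not shown in this file. A sketch of the assumed behaviour (merge sorted intervals whose gap is at most `distance`), illustration only:

def combine_at_distance(intervals, distance=0):
    # intervals: iterable of (start, end) tuples, half-open coordinates
    merged = []
    for start, end in sorted(intervals):
        if merged and start - merged[-1][1] <= distance:
            # close enough to the previous interval: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# combine_at_distance([(0, 10), (12, 20)], distance=5) -> [(0, 20)]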
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: peptides2cds.py 2890 2010-04-07 08:58:54Z andreas $") parser.add_option("-p", "--peptides", dest="filename_peptides", type="string", help="filename with peptide sequences [%default]." ) parser.add_option("-c", "--cds", "--cdnas", dest="filename_cdna", type="string", help="filename with cdna sequences [%default]." ) parser.add_option("-m", "--map", dest="filename_map", type="string", help="filename with map of peptide identifiers to cdna identifiers [%default]." ) parser.add_option( "--output-identifier", dest="output_identifier", type="choice", choices=("cdna", "peptide"), help="output identifier to use [%default]." ) parser.add_option("-f", "--output-format=", dest="output_format", type="choice", choices=("alignment", "fasta"), help="output format.") parser.set_defaults( peptides=None, filename_cdna = None, output_format="alignment", filename_map = None, stop_codons = ("TAG", "TAA", "TGA"), output_identifier = "peptide", ) (options, args) = E.Start( parser, add_pipe_options = True ) if not options.filename_cdna: raise ValueError("please supply filename with cds sequences.") if options.filename_peptides: infile = open(options.filename_peptides, "r") E.info("reading from %s" % options.filename_peptides) else: E.info("reading from stdin") infile = sys.stdin if options.filename_map: E.info( "reading map" ) map_peptide2cds = IOTools.readMap( IOTools.openFile( options.filename_map, "r" ) ) E.info( "read map for %i identifiers" % len(map_peptide2cds) ) else: map_peptide2cds = {} E.info( "reading cds sequences" ) cds_sequences = Genomics.ReadPeptideSequences( IOTools.openFile(options.filename_cdna, "r") ) E.info( "read %i cds sequences" % len(cds_sequences)) ninput, noutput = 0, 0 nskipped, nnosequence = 0, 0 # iterate over peptide sequences iterator = FastaIterator.FastaIterator( infile ) use_cds_id = options.output_identifier == "cds" for cur_record in iterator: ninput += 1 peptide_identifier = re.split("\s+", cur_record.title)[0] cds_identifier = map_peptide2cds.get( peptide_identifier, peptide_identifier ) if cds_identifier not in cds_sequences: nnosequence += 1 continue p = cur_record.sequence c = cds_sequences[cds_identifier] E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" % (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c))) try: map_p2c = getMapPeptide2Cds( p, c, options ) except ValueError: nskipped += 1 continue if use_cds_id: identifier = cds_identifier else: identifier = peptide_identifier if options.output_format =="alignment": options.stdout.write("\t".join( map(str, (identifier, alignlib_lite.py_AlignmentFormatEmissions( map_p2c ), len(cur_record.sequence), len(cds_sequences[identifier])) ) )+"\n") elif options.output_format == "fasta": map_p2c.switchRowCol() alignatum = alignlib_lite.py_makeAlignatum( c ) alignatum.mapOnAlignment( map_p2c, len(p) * 3 ) s = alignatum.getString() if len(s) != len(p) * 3: raise ValueError ("incomplete aligned string for %s: %s, cds=%s" % (cur_record.title, s, c )) options.stdout.write( ">%s\n%s\n" % (identifier, s )) noutput += 1 sys.stdout.flush() E.info( "ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" % (ninput, noutput, nnosequence, nskipped) ) E.Stop()
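The fasta branch above checks that the aligned cds string is exactly three times as long as the aligned peptide, since every peptide column (residue or gap) corresponds to one codon. A toy illustration of that invariant with hypothetical sequences:

peptide = "ME-K"          # '-' is an alignment gap
cds = "ATGGAG---AAA"      # one codon (or gap triplet) per peptide column
assert len(cds) == 3 * len(peptide)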
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--gtf-file", dest="filename_gtf", type="string", help="filename with gene models in gtf format [%default]") parser.add_option( "-m", "--filename-mismapped", dest="filename_mismapped", type="string", help="output bam file for mismapped reads [%default]") parser.add_option( "-j", "--junctions-bed-file", dest="filename_junctions", type="string", help="bam file with reads mapped across junctions [%default]") parser.add_option( "-r", "--filename-regions", dest="filename_regions", type="string", help="filename with regions to remove in bed format [%default]") parser.add_option( "-t", "--transcripts-gtf-file", dest="filename_transcriptome", type="string", help="bam file with reads mapped against transcripts [%default]") parser.add_option( "-p", "--map-tsv-file", dest="filename_map", type="string", help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) [%default]") parser.add_option( "-s", "--filename-stats", dest="filename_stats", type="string", help="filename to output stats to [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option( "-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option( "-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.add_option("--output-sam", dest="output_sam", action="store_true", help="output in sam format [%default]") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.Samfile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = IOTools.readMap( IOTools.openFile(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.iteritems()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(IOTools.openFile(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.openFile(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.Samfile(options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.Samfile("-", "wh", template=genome_samfile) else: output_samfile = pysam.Samfile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.Samfile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.Samfile(options.filename_junctions, "rb") else: junctions_samfile = None c = _bams2bam.filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = IOTools.openFile(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.Stop()
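Both versions of this script drop a transcript as soon as any of its exons falls into an indexed removal region. IndexedGenome.Simple is not defined in this file; a naive stand-in with the same add()/contains() semantics, for illustration only (a real index would use an interval tree rather than a linear scan):

class SimpleRegionIndex(object):
    # toy replacement for IndexedGenome.Simple
    def __init__(self):
        self.regions = {}
    def add(self, contig, start, end):
        self.regions.setdefault(contig, []).append((start, end))
    def contains(self, contig, start, end):
        # half-open intervals: report any overlap with an indexed region
        return any(s < end and start < e
                   for (s, e) in self.regions.get(contig, ()))

index = SimpleRegionIndex()
index.add("chr1", 100, 200)
assert index.contains("chr1", 150, 160)
assert not index.contains("chr2", 150, 160)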
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--merge-exons-distance",
        dest="merge_exons_distance",
        type="int",
        help="distance in nucleotides between "
        "exons to be merged [%default].")

    parser.add_option(
        "--pattern-identifier", dest="pattern", type="string",
        help="pattern to use for renaming genes/transcripts. "
        "The pattern should contain a %i, for example "
        "--pattern-identifier=ENSG%010i [%default].")

    parser.add_option(
        "--sort-order",
        dest="sort_order",
        type="choice",
        choices=("gene", "gene+transcript", "transcript", "position",
                 "contig+gene", "position+gene", "gene+position"),
        help="sort input data [%default].")

    parser.add_option(
        "-u", "--with-utr",
        dest="with_utr",
        action="store_true",
        help="include utr in merged transcripts "
        "[%default].")

    parser.add_option(
        "--filter-method", dest="filter_method",
        type="choice",
        choices=("gene", "transcript", "longest-gene", "longest-transcript",
                 "representative-transcript", "proteincoding", "lincrna",
                 "processed-pseudogene"),
        help="Filter method to apply. Available filters are: "
        "'gene': filter by gene_id given in ``--map-tsv-file``, "
        "'transcript': filter by transcript_id given in ``--map-tsv-file``, "
        "'longest-gene': output the longest gene for overlapping genes, "
        "'longest-transcript': output the longest transcript per gene, "
        "'representative-transcript': output the representative transcript "
        "per gene. The representative transcript is the transcript "
        "that shares most exons with other transcripts in a gene. "
        "The input needs to be sorted by gene. "
        "'proteincoding': only output protein coding features. "
        "'lincrna': only output lincRNA features. "
        "'processed-pseudogene': only output processed pseudogene features. "
        "[%default].")

    parser.add_option(
        "-a", "--map-tsv-file", dest="filename_filter",
        type="string",
        metavar="tsv",
        help="filename of ids to map/filter [%default].")

    parser.add_option(
        "--gff-file", dest="filename_gff", type="string",
        metavar="GFF",
        help="second filename of features (see --remove-overlapping) "
        "[%default]")

    parser.add_option(
        "--invert-filter",
        dest="invert_filter",
        action="store_true",
        help="when using --filter, invert selection "
        "(like grep -v). "
        "[%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="int",
        help="extract a random sample of size # if the option "
        "'--method=filter --filter-method' is set "
        "[%default].")

    parser.add_option(
        "--intron-min-length",
        dest="intron_min_length", type="int",
        help="minimum length for introns (for --method=exons2introns) "
        "[%default].")

    parser.add_option(
        "--min-exons-length",
        dest="min_exons_length",
        type="int",
        help="minimum length for gene (sum of exons) "
        "(see --sample-size) [%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(for --method=exons2introns) [%default].")

    parser.add_option(
        "--ignore-strand",
        dest="ignore_strand",
        action="store_true",
        help="remove strandedness of features (set to '.') when "
        "using ``transcripts2genes`` or ``filter`` "
        "[%default].")

    parser.add_option(
        "--permit-duplicates", dest="strict",
        action="store_false",
        help="permit duplicate genes. "
        "[%default]")

    parser.add_option(
        "--duplicate-feature",
        dest="duplicate_feature",
        type="choice",
        choices=("gene", "transcript", "both", "ucsc", "coordinates"),
        help="remove duplicates by gene/transcript. "
        "If ``ucsc`` is chosen, transcripts ending on _dup# are "
        "removed. This is necessary to remove duplicate entries "
        "that are next to each other in the sort order "
        "[%default]")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        action="append",
        choices=(
            "add-protein-id",
            "exons2introns",
            "filter",
            "find-retained-introns",
            "genes-to-unique-chunks",
            "intersect-transcripts",
            "join-exons",
            "merge-exons",
            "merge-transcripts",
            "merge-genes",
            "merge-introns",
            "remove-overlapping",
            "remove-duplicates",
            "rename-genes",
            "rename-transcripts",
            "rename-duplicates",
            "renumber-genes",
            "renumber-transcripts",
            "set-transcript-to-gene",
            "set-gene-to-transcript",
            "set-protein-to-transcript",
            "set-score-to-distance",
            "set-gene_biotype-to-source",
            "sort",
            "transcripts2genes",
            "unset-genes"),
        help="Method to apply [%default]. "
        "Please only select one.")

    parser.set_defaults(
        sort_order="gene",
        filter_method="gene",
        pattern="%i",
        merge_exons_distance=0,
        filename_filter=None,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        ignore_strand=False,
        with_utr=False,
        invert_filter=False,
        duplicate_feature=None,
        strict=True,
        method=None)

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.method is None:
        raise ValueError("please specify a --method")

    if len(options.method) > 1:
        raise ValueError("multiple --method arguments specified")
    else:
        options.method = options.method[0]

    if options.method == "set-transcript-to-gene":

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.method == "set-gene_biotype-to-source":

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if "gene_biotype" not in gff:
                gff.setAttribute("gene_biotype", gff.source)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.method == "remove-duplicates":

        counts = collections.defaultdict(int)

        if options.duplicate_feature == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub(r"_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.duplicate_feature == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])

            elif options.duplicate_feature == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])

            elif options.duplicate_feature == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []
            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.duplicate_feature == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id
            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif "sort" == options.method:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort_order):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-gene-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-protein-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "add-protein-id" == options.method:

        transcript2protein = IOTools.readMap(
            IOTools.openFile(options.filename_filter, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif "join-exons" == options.method:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), \
                max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif "merge-genes" == options.method:
        # merges overlapping genes
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                        gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand
                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.method == "renumber-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.pattern % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "unset-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "renumber-transcripts":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "transcripts2genes":

        transcripts = set()
        genes = set()
        ignore_strand = options.ignore_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if ignore_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.method in ("rename-genes", "rename-transcripts"):

        map_old2new = IOTools.readMap(
            IOTools.openFile(options.filename_filter, "r"))

        if options.method == "rename-transcripts":
            is_gene_id = False
        elif options.method == "rename-genes":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue
            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.method == "filter":

        keep_genes = set()
        if options.filter_method == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)

            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1

        elif options.filter_method in ("longest-transcript",
                                       "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                """select a representative transcript.

                The representative transcript represents the largest
                number of exons over all transcripts, i.e. the transcript
                that shares most exons with the other transcripts in a
                gene.
                """
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript
                                      if x.feature == "exon"])
                # itertools.groupby only groups consecutive identical
                # items, so sort the exon coordinates before counting.
                all_exons.sort()
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript
                                 if x.feature == "exon"])
                    # add transcript id to sort to provide a stable
                    # segmentation.
                    transcript_counts.append((count,
                                              transcript[0].transcript_id,
                                              transcript))
                transcript_counts.sort()
                return transcript_counts[-1][-1]

            if options.filter_method == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter_method == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter_method in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    IOTools.openFile(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter_method == "gene"
                by_transcript = options.filter_method == "transcript"
                invert = options.invert_filter

                ignore_strand = options.ignore_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if ignore_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter_method == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter_method == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, ("please supply either a filename "
                               "with ids to filter with (--map-tsv-file) "
                               "or a sample-size.")

        elif options.filter_method in ("proteincoding", "lincrna",
                                       "processed-pseudogene"):
            # extract entries by transcript/gene biotype.
            # This filter uses a test on the source field (ENSEMBL pre v78)
            # and a regular expression on the attributes (ENSEMBL >= v78).
            tag = {"proteincoding": "protein_coding",
                   "processed-pseudogene": "processed_pseudogene",
                   "lincrna": "lincRNA"}[options.filter_method]
            rx = re.compile('"%s"' % tag)
            if not options.invert_filter:
                f = lambda x: x.source == tag or rx.search(x.attributes)
            else:
                f = lambda x: x.source != tag and not rx.search(x.attributes)

            for gff in GTF.iterator(options.stdin):
                ninput += 1
                if f(gff):
                    options.stdout.write(str(gff) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    elif options.method == "exons2introns":

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    min_length = options.intron_min_length
                    output_ranges = [x for x in output_ranges
                                     if x[1] - x[0] > min_length]

                for start, end in output_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.method == "set-score-to-distance":

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), \
                max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start
                if strand == "-":
                    gffs.reverse()

            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.method == "remove-overlapping":

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.filename_gff, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.method == "intersect-transcripts":

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif "rename-duplicates" == options.method:
        # note: this will only rename entries with "CDS" in feature column

        assert options.duplicate_feature in ["gene", "transcript", "both"], \
            ("for renaming duplicates, --duplicate-feature must be set to "
             "one of 'gene', 'transcript' or 'both'")

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids)
                    if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" %
               len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if options.duplicate_feature in ["both", "gene"]:
                    if gtf.gene_id in dup_gene:
                        gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                        gtf.setAttribute(
                            "gene_id",
                            gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if options.duplicate_feature in ["both", "transcript"]:
                    if gtf.transcript_id in dup_transcript:
                        transcript_dict[gtf.transcript_id] = \
                            transcript_dict[gtf.transcript_id] + 1
                        gtf.setAttribute(
                            "transcript_id",
                            gtf.transcript_id + "." +
                            str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    elif options.method in ("merge-exons",
                            "merge-introns",
                            "merge-transcripts"):

        for gffs in GTF.flat_gene_iterator(
                GTF.iterator(options.stdin),
                strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges,
                                                     cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            try:
                biotypes = [x["gene_biotype"] for x in gffs]
                biotype = ":".join(set(biotypes))
            except (KeyError, AttributeError):
                biotype = None

            if options.method == "merge-exons":
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                #     utr_ranges,
                #     options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.method == "merge-transcripts":

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                if biotype:
                    entry.addAttribute("gene_biotype", biotype)
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.method == "merge-introns":

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    elif options.method == "find-retained-introns":

        for gene in GTF.gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found_any = False
            for intron in find_retained_introns(gene):
                found_any = True
                options.stdout.write("%s\n" % str(intron))
                nfeatures += 1
            if found_any:
                noutput += 1

    elif options.method == "genes-to-unique-chunks":

        for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            for exon in gene_to_blocks(gene):
                options.stdout.write("%s\n" % str(exon))
                nfeatures += 1
            noutput += 1

    else:
        raise ValueError("unknown method '%s'" % options.method)

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
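# A minimal, self-contained sketch of the interval merging that
# ``--method=merge-exons`` relies on. ``Intervals.combineAtDistance`` is
# assumed to merge (start, end) tuples that overlap or lie within
# ``merge_exons_distance`` of each other; the helper below is an
# illustration only, not the library implementation.
def _combine_at_distance(intervals, distance=0):
    """merge (start, end) intervals closer than *distance*."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start - merged[-1][1] <= distance:
            # extend the previous interval instead of opening a new one
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# _combine_at_distance([(0, 10), (12, 20), (30, 40)], distance=5)
# -> [(0, 20), (30, 40)]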
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-s", "--filename-strand", dest="filename_strand", type="string",
        help="set strand information according to file [default=%default].")

    parser.set_defaults(as_gtf=False, filename_strand=None, test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # the same template entry is used for both gtf and gff output
    gff = GTF.Entry()
    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        # make query identifiers unique by appending a running number
        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():
            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
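# Entry point plus a sketch of the --filename-strand input. Consistent with
# the readMap usage elsewhere, the file is assumed to be a two-column,
# tab-separated table mapping query identifiers to strands (the identifiers
# below are hypothetical):
#
#   query1  +
#   query2  -
if __name__ == "__main__":
    sys.exit(main())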
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--suffix", dest="suffixes", action="append",
                      type="string",
                      help="mapping of suffix [%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename containing a mapping of filenames "
                      "to destinations [%default]")

    parser.add_option("-u", "--no-sort", dest="do_sort", action="store_false",
                      help="do not sort filenames before grouping "
                      "[%default]")

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="dry run - do not merge "
                      "[%default]")

    parser.set_defaults(
        filename_map=None,
        do_sort=True,
        dry_run=False,
        suffixes=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    map_regex2dest = {}
    if options.filename_map:
        map_regex2dest = IOTools.readMap(
            IOTools.openFile(options.filename_map))
        map_regex2dest = dict([(re.compile(x), y)
                               for x, y in list(map_regex2dest.items())])

    map_suffix2dest = {}
    for suffix in options.suffixes:
        src, dest = suffix.split("=")
        map_suffix2dest[src.strip()] = dest.strip()

    filenames = args
    if options.do_sort:
        filenames.sort()

    dest2src = collections.defaultdict(list)
    for filename in filenames:
        dests = []
        for regex, dest in list(map_regex2dest.items()):
            if regex.search(filename):
                dests.append(dest)
        if len(dests) == 0:
            raise ValueError("no destination found for %s" % filename)
        elif len(dests) > 1:
            raise ValueError(
                "multiple destinations found for %s: %s " %
                (filename, dests))

        dest = dests[0]
        # implement suffix mapping, note that
        # suffixes can extend beyond an extension
        for suffix, new_suffix in list(map_suffix2dest.items()):
            if filename.endswith(suffix):
                dest = dest + new_suffix
                break

        dest2src[dest].append(filename)

    for dest, srcs in sorted(dest2src.items()):
        E.info("merging: %s <- %s" % (dest, srcs))

        if options.dry_run:
            continue

        E.run('cat %s > %s' % (" ".join(srcs), dest))

    # write footer and output benchmark information.
    E.Stop()
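# Entry point plus a hedged sketch of the expected inputs. The --map file is
# assumed to be a two-column, tab-separated table mapping a regular
# expression to a destination file, and --suffix takes ``old=new`` pairs
# appended to the matched destination; the script name, file names and
# patterns below are hypothetical placeholders.
#
#   map.tsv:
#       sample1_.*\.fastq\.gz    merged_sample1.fastq.gz
#
#   python merge_files.py --map=map.tsv \
#       --suffix=".fastq.1.gz=_R1.fastq.gz" *.fastq.gz
if __name__ == "__main__":
    sys.exit(main(sys.argv))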