Example #1
def importCEL(infile, outfiles):
    """import CEL files."""

    map_cel = IOTools.readMap(open(infile, "r"), has_header=True)

    indir = PARAMS["datadir"]

    for old, new in map_cel.iteritems():
        oldname = os.path.join(indir, old + ".CEL")
        newname = os.path.join(".", new + ".CEL")

        if not os.path.exists(oldname):
            raise IOError("input file %s does not exist" % oldname)
        if os.path.exists(newname):
            continue

        os.symlink(os.path.abspath(oldname), os.path.abspath(newname))
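Every example on this page centres on IOTools.readMap, which turns the first two columns of a tab-separated file into a dictionary. As a rough sketch of the behaviour importCEL relies on (the real CGAT implementation also supports column selection and value conversion, so treat the details here as assumptions):

def read_map(lines, has_header=False):
    """Build a dict from the first two tab-separated columns of each line.

    Minimal stand-in for IOTools.readMap, assuming two-column input.
    """
    result = {}
    for i, line in enumerate(lines):
        if has_header and i == 0:
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) >= 2:
            result[fields[0]] = fields[1]
    return result

# as used above: map_cel = read_map(open("map.tsv"), has_header=True)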
Example #4
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''
    tablename = P.toTable(outfile)

    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = IOTools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    inf = gzip.open(infile_data, "r")

    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"

        tmpf.write(line)

    tmpf.close()
    tmpname = tmpf.name

    header = map_header["ID_REF"]
    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --index=%(header)s \
              --table=%(tablename)s \
    < %(tmpname)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpname)
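The statement/P.run() pair above follows the CGAT pipeline convention: P.run() picks up the string named statement and expands its %(name)s placeholders from the caller's variables (and the pipeline's PARAMS) before executing it through the shell. A plain-Python sketch of that mechanism, with explicit keyword arguments instead of the automatic variable lookup (names here are illustrative, not the P.run() API):

import subprocess

def run_statement(statement, **context):
    """Expand %(name)s placeholders and run the command through the shell.

    Sketch only: the real P.run() also pulls in local variables and
    PARAMS automatically and can submit jobs to a cluster.
    """
    cmd = statement % context
    subprocess.check_call(cmd, shell=True)

# run_statement("python %(scriptsdir)s/csv2db.py --table=%(tablename)s"
#               " < %(tmpname)s > %(outfile)s",
#               scriptsdir="scripts", tablename="series",
#               tmpname="tmp.tsv", outfile="series.load")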
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p",
                      "--peptides-fasta-file",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c",
                      "--cds-gtf-file",
                      "--cdnas",
                      dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help="filename with map of peptide identifiers to cdna "
        "identifiers [%default].")

    parser.add_option("--output-identifier",
                      dest="output_identifier",
                      type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f",
                      "--output-format=",
                      dest="output_format",
                      type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # note: the valid choices are "cdna" and "peptide"; comparing
    # against "cds" would never match
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            options.stdout.write("\t".join(
                map(str, (identifier,
                          alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                          len(cur_record.sequence),
                          len(cds_sequences[cds_identifier])))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
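The core of this script is the peptide-to-CDS coordinate mapping computed by Peptides2Cds.getMapPeptide2Cds. The underlying transform is simple: residue i of the ungapped peptide covers bases 3*i to 3*i+3 of the coding sequence. A toy illustration of that arithmetic (the real function additionally validates the translation and copes with frameshifts; this is a sketch, not its implementation):

def peptide2cds_coordinates(peptide):
    """Yield (peptide_pos, cds_start, cds_end) for each non-gap residue."""
    cds_pos = 0
    for pep_pos, residue in enumerate(peptide):
        if residue == "-":
            continue  # gaps consume no CDS bases
        yield pep_pos, cds_pos, cds_pos + 3
        cds_pos += 3

# list(peptide2cds_coordinates("MK-L"))
# -> [(0, 0, 3), (1, 3, 6), (3, 6, 9)]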
Example #6
def test_cmdline():
    '''test style of scripts
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.Start

    # read the first two columns
    map_option2action = IOTools.readMap(IOTools.openFile(FILENAME_OPTIONLIST),
                                        columns=(0, 1),
                                        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #    'scripts/check_db.py',
    #    'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name
        # check if script contains getopt
        with IOTools.openFile(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_, "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_, "module could not be imported: %s\n" % script_name)
            continue
        E.Start = LocalStart

        try:
            module.main(argv=["--help"])
        except AttributeError:
            yield (fail_, "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_, "script does not use E.Start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seems to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
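test_cmdline is a nose-style test generator: every yield (func, arg1, ...) is collected by the nose runner as a separate test case, which is how a single function fans out into one check per script and per option. A minimal self-contained example of the pattern (check_option here is a hypothetical stand-in for the real checker):

def check_option(optstring, script, map_option2action):
    # one assertion per yielded case; nose reports each independently
    assert optstring in map_option2action, \
        "unknown option --%s in %s" % (optstring, script)

def test_options():
    known = {"help": "store_true", "verbose": "count"}
    for optstring in ("help", "verbose"):
        yield check_option, optstring, "example.py", known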
Example #7
File: runGO.py Project: yangjl/cgat
def main():

    parser = E.OptionParser( version = "%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $", usage = globals()["__doc__"])

    dbhandle = Database.Database()
    
    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default]." )

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories [default=%default].")

    parser.add_option( "-g", "--genes", dest="filename_genes", type="string",
                       help="filename with genes to analyse [default=%default]." )

    parser.add_option( "-b", "--background", dest="filename_background", type="string",
                       help="filename with background genes to analyse [default=%default]." )

    parser.add_option( "-m", "--minimum-counts", dest="minimum_counts", type="int",
                       help="minimum count - ignore all categories that have fewer than # number of genes"
                            " [default=%default]." )

    parser.add_option( "-o", "--sort-order", dest="sort_order", type="choice",
                       choices=("fdr", "pvalue", "ratio" ),
                       help="output sort order [default=%default]." )

    parser.add_option( "--ontology", dest="ontology", type="string", action="append",
                       help="go ontologies to analyze. Ontologies are tested separately."
                       " [default=%default]." )

    parser.add_option( "-t", "--threshold", dest="threshold", type="float",
                       help="significance threshold [>1.0 = all ]. If --fdr is set, this refers to the fdr, otherwise it is a cutoff for p-values." )

    parser.add_option ("--filename-dump", dest="filename_dump", type="string",
                       help="dump GO category assignments into a flatfile [default=%default]." )

    parser.add_option ("--filename-gene2name", dest="filename_gene2name", type="string",
                       help="optional filename mapping gene identifiers to gene names [default=%default]." )

    parser.add_option ("--filename-ontology", dest="filename_ontology", type="string",
                       help="filename with ontology in OBO format [default=%default]." )

    parser.add_option ( "--filename-input", dest="filename_input", type="string",
                       help="read GO category assignments from a flatfile [default=%default]." )

    parser.add_option ( "--sample-size", dest="sample", type="int",
                        help="do sampling (with # samples) [default=%default]." )

    parser.add_option ( "--filename-output-pattern", "--output-filename-pattern", 
                        dest = "output_filename_pattern", type="string",
                        help="pattern with output filename pattern (should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option ( "--fdr", dest="fdr", action="store_true",
                        help="calculate and filter by FDR [ReadGene2GOFromFiledefault=%default]." )
    
    parser.add_option ( "--go2goslim", dest="go2goslim", action="store_true",
                        help="convert go assignments in STDIN to goslim assignments and write to STDOUT [default=%default]." )

    parser.add_option ( "--gene-pattern", dest = "gene_pattern", type="string",
                        help="pattern to transform identifiers to GO gene names [default=%default].")
    
    parser.add_option( "--filename-map-slims", dest="filename_map_slims", type="string",
                       help="write mapping between GO categories and GOSlims [default=%default].")

    parser.add_option( "--get-genes", dest="get_genes", type="string",
                       help="list all genes in the with a certain GOID [default=%default]." )

    parser.add_option( "--strict", dest="strict", action="store_true",
                       help="require all genes in foreground to be part of background. "
                       "If not set, genes in foreground will be added to the background [default=%default]." )

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices = ( "empirical", "storey", "BH" ),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )


    parser.add_option( "--pairwise", dest="compute_pairwise", action="store_true",
                       help="compute pairwise enrichment for multiple gene lists. "
                       "[default=%default]." )

    # parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    #                    help="fdr computation: method for estimating pi0 [default=%default]."  )
    
    parser.set_defaults( species = None,
                         filename_genes = "-",
                         filename_background = None,
                         filename_slims = None,
                         minimum_counts = 0,
                         ontology = [],
                         filename_dump = None,
                         sample = 0,
                         fdr = False,
                         output_filename_pattern = None,
                         threshold = 0.05,
                         filename_map_slims = None,
                         gene_pattern = None,
                         sort_order = "ratio",
                         get_genes = None,
                         strict = False,
                         qvalue_method = "empirical",
                         pairs_min_observed_counts = 3,
                         compute_pairwise = False,
                         filename_gene2name = None
                         )

    (options, args) = E.Start( parser, add_mysql_options = True )

    if options.go2goslim:
        convertGo2Goslim( options )
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn( "fdr will be computed without sampling" )
        
    #############################################################
    ## dump GO
    if options.filename_dump:
        # set default ontologies
        if not options.ontology:
            options.ontology = ["biol_process", "mol_function", "cell_location"] 

        E.info( "dumping GO categories to %s" % (options.filename_dump) )

        dbhandle.Connect( options )
            
        outfile = IOTools.openFile( options.filename_dump, "w", create_dir = True )
        DumpGOFromDatabase( outfile,
                            dbhandle,
                            options )
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    ## read GO categories from file
    if options.filename_input:
        E.info( "reading association of categories and genes from %s" % (options.filename_input) )
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = ReadGene2GOFromFile( infile )
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" % options.filename_gene2name)
        infile = IOTools.openFile( options.filename_gene2name)
        gene2name = IOTools.readMap( infile, has_header = True )
        infile.close()
        E.info("read %i gene names for %i gene identifiers" % (len(set(gene2name.values())),
                                                               len(gene2name)))
    else:
        gene2name = None

    #############################################################
    ## read GO ontology from file
    if options.filename_ontology:
        E.info( "reading ontology from %s" % (options.filename_ontology) )
        
        infile = IOTools.openFile(options.filename_ontology)
        ontology = readOntology( infile )
        infile.close()
        
        def _g():
            return collections.defaultdict( GOInfo )
        go2infos = collections.defaultdict( _g )

        ## substitute go2infos
        for go in ontology.values():
            go2infos[go.mNameSpace][go.mId] = GOInfo( go.mId,
                                                      go_type = go.mNameSpace,
                                                      description = go.mName )

    #############################################################
    ## get foreground gene list
    input_foreground, genelists = ReadGeneLists( options.filename_genes, 
                                                 gene_pattern = options.gene_pattern )

    E.info( "read %i genes for forground in %i gene lists" % (len(input_foreground), len(genelists)) )

    #############################################################
    ## get background
    if options.filename_background:
        
        # nick - bug fix: background is the first tuple element from ReadGeneLists
        input_background = ReadGeneLists( options.filename_background, 
                                          gene_pattern = options.gene_pattern )[0]
        E.info( "read %i genes for background" % len(input_background) )
    else:
        input_background = None

    #############################################################
    ## sort out which ontologies to test
    if not options.ontology: 
        if options.filename_input:
            options.ontology = gene2gos.keys()

    E.info( "found %i ontologies: %s" % (len(options.ontology), options.ontology))

    summary = []
    summary.append( "\t".join( (
                "genelist",
                "ontology",
                "significant",
                "threshold",
                "ngenes",
                "ncategories",
                "nmaps",
                "nforegound",
                "nforeground_mapped",
                "nbackground",
                "nbackground_mapped",
                "nsample_counts",
                "nbackground_counts",
                "psample_assignments",
                "pbackground_assignments") ) + "\n" )


    #############################################################
    ## get go categories for genes
    for test_ontology in options.ontology:

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info( "working on ontology %s" % test_ontology )
        #############################################################
        ## get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info( "reading data from database ..." )

            dbhandle.Connect( options )
            gene2go, go2info = ReadGene2GOFromDatabase( dbhandle,
                                                        test_ontology,
                                                        options.database, options.species )

            E.info( "finished" )

        if len(go2info) == 0:
            E.warn( "could not find information for terms - could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )        
        E.info( "assignments found: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps) )

        if options.minimum_counts > 0:
            to_remove = set([ x for x,y in counts_per_category.iteritems() if y < options.minimum_counts ])
            E.info("removing %i categories with less than %i genes" % (len(to_remove), options.minimum_counts ) )
            removeCategories( gene2go, to_remove )

            ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )        
            E.info( "assignments after filtering: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps) )

        for genelist_name, foreground in genelists.iteritems():

            msgs = []
            E.info("processing %s with %i genes" % (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            ## build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn( "%i genes in foreground that are not in background - added to background of %i" %\
                                (len(missing), len(background)) )

                background.extend(missing)

            E.info( "(unfiltered) foreground=%i, background=%i" % (len(foreground), len(background)))

            #############################################################
            ## sanity checks:            
            ## are all of the foreground genes in the dataset
            ## missing = set(genes).difference( set(gene2go.keys()) )
            ## assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################            
            ## read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GetGOSlims( IOTools.openFile(options.filename_slims, "r") )

                if options.loglevel >=1:
                    v = set()
                    for x in go_slims.values():
                        for xx in x: v.add(xx)
                    options.stdlog.write( "# read go slims from %s: go=%i, slim=%i\n" %\
                                              ( options.filename_slims,
                                                len(go_slims), 
                                                len( v ) ))



                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile=IOTools.openFile(options.filename_map_slims, "w" )

                    outfile.write( "GO\tGOSlim\n" )
                    for go, go_slim in go_slims.items():
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = MapGO2Slims( gene2go, go_slims, ontology = ontology )

                if options.loglevel >=1:
                    ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )
                    options.stdlog.write( "# after go slim filtering: %i genes mapped to %i categories (%i maps)\n" % (ngenes, ncategories, nmaps) )

            #############################################################
            ## Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in gene2go.items():
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append( gene )
                            elif gene in background:
                                bg.append( gene )
                            else:
                                ng.append( gene )

                ## skip to next GO class
                if not (bg or ng): continue

                options.stdout.write( "# genes in GO category %s\n" % options.get_genes )
                options.stdout.write( "gene\tset\n" )
                for x in fg: options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in bg: options.stdout.write("%s\t%s\n" % ("bg", x))           
                for x in ng: options.stdout.write("%s\t%s\n" % ("ng", x))                       

                E.info( "nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng) ))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'foreground',
                                   set = genelist_name )

            outfile.write ("gene_id\n%s\n" % ("\n".join( sorted( foreground) ) ) )
            if options.output_filename_pattern:
                outfile.close()

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'background',
                                   set = genelist_name )

            outfile.write ("gene_id\n%s\n" % ("\n".join( sorted( background[0]) ) ) )
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            ## do the analysis
            go_results = AnalyseGO( gene2go, foreground, background )

            if len(go_results.mSampleGenes) == 0:
                E.warn( "%s: no genes with GO categories - analysis aborted" % genelist_name)
                continue

            pairs = go_results.mResults.items()

            #############################################################
            ## calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method  = computeFDRs( go_results, 
                                                      foreground,
                                                      background,
                                                      options, 
                                                      test_ontology,
                                                      gene2go,
                                                      go2info)
                for x,v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None
                
            msgs.append( "fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort( lambda x, y: cmp(x[1].mQValue, y[1].mQValue))           
            elif options.sort_order == "ratio":
                pairs.sort( lambda x, y: cmp(x[1].mRatio, y[1].mRatio))
            elif options.sort_order == "pvalue":
                pairs.sort( lambda x, y: cmp(x[1].mPValue, y[1].mPValue))

            #############################################################
            #############################################################
            #############################################################
            ## output the full result
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'overall',
                                   set = genelist_name )

            outputResults( outfile, pairs, go2info, options, fdrs = fdrs, samples = samples )

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = selectSignificantResults( pairs, fdrs, options )

            nselected = len(filtered_pairs)
            nselected_up = len( [x for x in filtered_pairs if x[1].mRatio > 1 ] )
            nselected_down = len( [x for x in filtered_pairs if x[1].mRatio < 1 ] )
            
            assert nselected_up + nselected_down == nselected

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'results',
                                   set = genelist_name )

            outputResults( outfile, 
                           filtered_pairs, 
                           go2info, 
                           options,
                           fdrs = fdrs, 
                           samples = samples )
            
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append( pairs )
            all_significant_results.append( filtered_pairs )
            all_genelists_with_results.append( genelist_name )

            #############################################################
            #############################################################
            #############################################################
            ## output parameters
            ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go )

            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'parameters',
                                   set = genelist_name )

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write( "# input go mappings for gene list '%s' and category '%s'\n" % (genelist_name, test_ontology ))
            outfile.write( "parameter\tvalue\tdescription\n" )
            outfile.write( "mapped_genes\t%i\tmapped genes\n" % ngenes )
            outfile.write( "mapped_categories\t%i\tmapped categories\n" % ncategories )
            outfile.write( "mappings\t%i\tmappings\n" % nmaps )
            outfile.write( "genes_in_fg\t%i\tgenes in foreground\n" % len(foreground) )
            outfile.write( "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" % (len(go_results.mSampleGenes)) )
            outfile.write( "genes_in_bg\t%i\tinput background\n" % nbackground )
            outfile.write( "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (len(go_results.mBackgroundGenes)) )
            outfile.write( "associations_in_fg\t%i\tassociations in sample\n"     % go_results.mSampleCountsTotal )
            outfile.write( "associations_in_bg\t%i\tassociations in background\n" % go_results.mBackgroundCountsTotal )
            outfile.write( "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % (IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" )))
            outfile.write( "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" % (IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" )))
            outfile.write( "significant\t%i\tsignificant results reported\n" % nselected )
            outfile.write( "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up )
            outfile.write( "significant_down\t%i\tsignificant up-regulated results reported\n" % nselected_down )
            outfile.write( "threshold\t%6.4f\tsignificance threshold\n" % options.threshold )        

            if options.output_filename_pattern:
                outfile.close()

            summary.append( "\t".join( map(str, ( \
                                                genelist_name,
                                                  test_ontology,
                                                  nselected,
                                                  options.threshold,
                                                  ngenes,
                                                  ncategories,
                                                  nmaps,
                                                  len(foreground),
                                                  len(go_results.mSampleGenes),
                                                  nbackground,
                                                  len(go_results.mBackgroundGenes),
                                                  go_results.mSampleCountsTotal,
                                                  go_results.mBackgroundCountsTotal,
                                                  IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" ),
                                                  IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" ),
                                                  ",".join( msgs) ) ) ) + "\n" )

            #############################################################
            #############################################################
            #############################################################
            ## output the fg patterns
            outfile = getFileName( options, 
                                   go = test_ontology,
                                   section = 'withgenes',
                                   set = genelist_name )

            outputResults( outfile, pairs, go2info, options, 
                           fdrs = fdrs, 
                           samples = samples,
                           gene2go = gene2go,
                           foreground = foreground,
                           gene2name = gene2name )
            
            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ######################################################################
            ######################################################################
            ######################################################################
            ## output various summary files
            ## significant results
            outputMultipleGeneListResults( all_significant_results, 
                                           all_genelists_with_results, 
                                           test_ontology, 
                                           go2info,
                                           options,
                                           section = 'significant')

            ## all results
            outputMultipleGeneListResults( all_results, 
                                           all_genelists_with_results, 
                                           test_ontology, 
                                           go2info,
                                           options,
                                           section = 'all')

            
            if options.compute_pairwise:
                pairwiseGOEnrichment( all_results,
                                      all_genelists_with_results,
                                      test_ontology,
                                      go2info,
                                      options )

    outfile_summary = options.stdout
    outfile_summary.write( "".join( summary) )

    E.Stop()
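For context on what AnalyseGO is testing (its implementation is not shown here): GO enrichment asks whether a category is over-represented in the foreground relative to a hypergeometric draw from the background. A sketch of that test using scipy; whether runGO.py uses exactly this parameterisation is an assumption:

from scipy.stats import hypergeom

def enrichment_pvalue(fg_in_category, fg_size, bg_in_category, bg_size):
    """P(drawing >= fg_in_category category genes in a sample of fg_size)."""
    # survival function at k-1 gives P(X >= k)
    return hypergeom.sf(fg_in_category - 1, bg_size, bg_in_category, fg_size)

# e.g. 8 of 50 foreground genes fall in a category covering
# 40 of 1000 background genes:
# enrichment_pvalue(8, 50, 40, 1000)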
Example #8
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i",
                      "--slims",
                      dest="filename_slims",
                      type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g",
                      "--genes-tsv-file",
                      dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b",
                      "--background-tsv-file",
                      dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m",
                      "--min-counts",
                      dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o",
                      "--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology",
                      dest="ontology",
                      type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump",
                      dest="filename_dump",
                      type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file",
        dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology",
        dest="filename_ontology",
        type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input",
                      dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample",
                      type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr",
                      dest="fdr",
                      action="store_true",
                      help="calculate and filter by FDR default=%default].")

    parser.add_option(
        "--go2goslim",
        dest="go2goslim",
        action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern",
                      dest="gene_pattern",
                      type="string",
                      help="pattern to transform identifiers to GO gene names "
                      "[default=%default].")

    parser.add_option("--filename-map-slims",
                      dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes",
        dest="get_genes",
        type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict",
        dest="strict",
        action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q",
        "--fdr-method",
        dest="qvalue_method",
        type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise",
        dest="compute_pairwise",
        action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"
            ]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = connectToEnsembl(options)

        outfile = IOTools.openFile(options.filename_dump, "w", create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(go.mId,
                                                        go_type=go.mNameSpace,
                                                        description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background, gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join(
        ("genelist", "ontology", "significant", "threshold", "ngenes",
         "ncategories", "nmaps", "nforegound", "nforeground_mapped",
         "nbackground", "nbackground_mapped", "nsample_counts",
         "nbackground_counts", "psample_assignments",
         "pbackground_assignments", "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            # assumption: reuse the Ensembl connection helper from the
            # dump branch above; dbhandle is not otherwise defined here
            dbhandle = connectToEnsembl(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.items()
                if y < options.minimum_counts
            ])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(options.filename_map_slims,
                                                   "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % ("bg", x))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % ("ng", x))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results, foreground,
                                                       background, options,
                                                       test_ontology, gene2go,
                                                       go2info)
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n"
                % (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n"
                % (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mSampleGenes),
                                         len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes in background with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                         nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n"
                % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n"
                % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(
                map(str, (genelist_name, test_ontology, nselected,
                          options.threshold, ngenes, ncategories, nmaps,
                          len(foreground), len(go_results.mSampleGenes),
                          nbackground, len(go_results.mBackgroundGenes),
                          go_results.mSampleCountsTotal,
                          go_results.mBackgroundCountsTotal,
                          IOTools.prettyPercent(len(go_results.mSampleGenes),
                                                len(foreground), "%5.2f"),
                          IOTools.prettyPercent(
                              len(go_results.mBackgroundGenes), nbackground,
                              "%5.2f"), ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology, go2info, options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
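Note on the FDR step above: GO.computeFDRs is part of the CGAT codebase and its exact procedure depends on the options given. As a rough standalone illustration of the kind of per-category q-value stored in mQValue, here is a Benjamini-Hochberg sketch; the function name and the input dict are made up for this example.

def bh_qvalues(pvalues):
    """Benjamini-Hochberg q-values for a dict of {go_id: pvalue}."""
    items = sorted(pvalues.items(), key=lambda x: x[1])
    n = len(items)
    qvalues, running_min = {}, 1.0
    # walk from the largest p-value down, enforcing monotonicity
    for rank, (go_id, p) in reversed(list(enumerate(items, start=1))):
        running_min = min(running_min, p * n / rank)
        qvalues[go_id] = running_min
    return qvalues

print(bh_qvalues({"GO:0008150": 0.01, "GO:0003674": 0.04, "GO:0005575": 0.03}))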
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--suffix",
                      dest="suffixes",
                      action="append",
                      type="string",
                      help="mapping of suffix [%default]")

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="filename containing a mapping of filenames "
                      "to destinations [%default]")

    parser.add_option("-u",
                      "--no-sort",
                      dest="do_sort",
                      action="store_false",
                      help="do not sort filenames before grouping "
                      "[%default]")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run - do not merge "
                      "[%default]")

    parser.set_defaults(filename_map=None,
                        do_sort=True,
                        dry_run=False,
                        suffixes=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.filename_map:
        map_regex2dest = IOTools.readMap(IOTools.openFile(
            options.filename_map))
        map_regex2dest = dict([(re.compile(x), y)
                               for x, y in map_regex2dest.items()])
    else:
        # without a map file, no regex destinations are defined
        map_regex2dest = {}

    map_suffix2dest = {}
    for suffix in options.suffixes:
        src, dest = suffix.split("=")
        map_suffix2dest[src.strip()] = dest.strip()

    filenames = args

    if options.do_sort:
        filenames.sort()

    dest2src = collections.defaultdict(list)
    for filename in filenames:
        dests = []
        for regex, dest in map_regex2dest.items():
            if regex.search(filename):
                dests.append(dest)
        if len(dests) == 0:
            raise ValueError("no destination found for %s" % filename)
        elif len(dests) > 1:
            raise ValueError("multiple destinations found for %s: %s " %
                             (filename, dests))

        dest = dests[0]
        # implement suffix mapping; note that suffixes
        # can extend beyond an extension
        for suffix, new_suffix in map_suffix2dest.items():
            if filename.endswith(suffix):
                dest = dest + new_suffix
                break

        dest2src[dest].append(filename)

    for dest, srcs in sorted(dest2src.items()):
        E.info("merging: %s <- %s" % (dest, srcs))
        if options.dry_run:
            continue
        E.run('cat %s > %s' % (" ".join(srcs), dest))

    # write footer and output benchmark information.
    E.Stop()
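The heart of this example is the regex-to-destination grouping. A minimal standalone sketch of the same idiom, with patterns and filenames invented for illustration:

import collections
import re

# hypothetical map of regex patterns to merged destinations
map_regex2dest = {re.compile(p): d for p, d in
                  [("tumou?r", "tumour.tsv"), ("control", "control.tsv")]}

dest2src = collections.defaultdict(list)
for filename in ["tumor_1.tsv", "tumour_2.tsv", "control_1.tsv"]:
    dests = [d for rx, d in map_regex2dest.items() if rx.search(filename)]
    if len(dests) != 1:
        raise ValueError("need exactly one destination for %s: %s" %
                         (filename, dests))
    dest2src[dests[0]].append(filename)

for dest, srcs in sorted(dest2src.items()):
    print("%s <- %s" % (dest, srcs))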
Example #10
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-s", "--filename-strand", dest="filename_strand", type="string",
                      help="set strand information according to file [default=%DEFAULT].")

    parser.set_defaults(as_gtf=False,
                        filename_strand=None,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # both output modes emit GTF.Entry records; only attribute handling differs
    gff = GTF.Entry()

    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():

            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
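Blat.BlatIterator and GTF.Entry are CGAT classes. As a rough sketch of what the block loop emits, here is the same coordinate logic with plain tuples; the match record is invented, PSL coordinates are 0-based, and the GFF output is 1-based and end-inclusive:

# hypothetical PSL match: (query_id, target_id, strand, blocks),
# each block being (query_start, target_start, size) in 0-based coords
match = ("read1", "chr1", "+", [(0, 1000, 50), (50, 1200, 80)])

query_id, target_id, strand, blocks = match
for qstart, tstart, size in blocks:
    # one exon line per aligned block
    print("\t".join(map(str, (
        target_id, "psl", "exon", tstart + 1, tstart + size,
        ".", strand, ".", 'gene_id "%s";' % query_id))))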
Example #11
    (options, args) = E.Start( parser, add_pipe_options = True )

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)
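The snippet above is truncated just before the peptide loop. Rather than guess its continuation, here is a standalone sketch of the id-mapping pattern it sets up, with invented ids and sequences:

# rename peptide ids to cds ids via a two-column map,
# keeping unmapped ids unchanged
map_peptide2cds = {"pep1": "cds1", "pep2": "cds2"}  # made-up ids

for title, sequence in [("pep1", "MKV"), ("pepX", "MLL")]:
    cds_id = map_peptide2cds.get(title, title)
    print(">%s\n%s" % (cds_id, sequence))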
Example #12
def test_cmdline():
    '''test style of scripts
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.Start

    # read the first two columns
    map_option2action = IOTools.readMap(
        IOTools.openFile(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #    'scripts/check_db.py',
    #    'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name
        # check if script contains getopt
        with IOTools.openFile(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_,
                       "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_,
                   "module could not be imported: %s\n" % script_name)
            continue
        E.Start = LocalStart

        try:
            module.main(argv=["--help"])
        except AttributeError:
            yield (fail_,
                   "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_,
                   "script does not use E.Start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield(check_option, optstring, os.path.abspath(f),
                  map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seems to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
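IOTools.readMap with columns=(0, 1) and has_header=True reads the first two tab-separated columns into a dict, skipping the header line. A minimal stand-in, assuming that behaviour (the file contents are invented):

import io

def read_map(lines, columns=(0, 1), has_header=False):
    """Minimal stand-in for IOTools.readMap: map one column to another."""
    result = {}
    for i, line in enumerate(lines):
        if has_header and i == 0:
            continue
        fields = line.rstrip("\n").split("\t")
        result[fields[columns[0]]] = fields[columns[1]]
    return result

data = io.StringIO("option\taction\n--help\tok\n--map\tfile\n")
print(read_map(data, columns=(0, 1), has_header=True))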
Example #13
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes", action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene",
                               "gene+transcript",
                               "transcript",
                               "position",
                               "contig+gene",
                               "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r", "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes", type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter", type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length", type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option("--intron-border",
                      dest="intron_border",
                      type="int",
                      help="number of residues to exclude at intron at either end "
                      "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub(r"_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(
                        ("removing transcript '%s' due to "
                         "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute(
                "protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max(
                [x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):
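            # walk chunks sorted by contig, strand and start, joining
            # consecutive chunks that touch or overlap (d <= 0) on the
            # same contig and strand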

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute(
                        "transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the one sharing the
                largest number of exons with the other transcripts
                in the gene.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript if x.feature == "exon"])
                exon_counts = {}
                # groupby only groups consecutive items - sort first
                for key, exons in itertools.groupby(sorted(all_exons)):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript if x.feature == "exon"])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                transcript = _select(gene)
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                raise ValueError(
                    "please supply either a filename with ids to "
                    "filter with (--apply) or a sample-size.")

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    min_length = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges
                        if x[1] - x[0] > min_length]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max(
                [x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute('gene_id',
                                     gtf.gene_id + "." +
                                     str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute('transcript_id',
                                     gtf.transcript_id + "." +
                                     str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(
                GTF.iterator(options.stdin),
                strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("cannot merge gene '%s' on multiple strands: %s" % (
                    gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("cannot merge gene '%s' on multiple contigs: %s" % (
                    gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start
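                # heuristic below: UTR pieces upstream of the CDS midpoint
                # count as 5' on the + strand and as 3' on the - strand;
                # pieces of 3 bases or less (a stop codon) are dropped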

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
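
The UTR-labelling logic above compares each non-CDS interval against the
midpoint of the CDS and swaps the 5'/3' labels on the reverse strand. A
minimal standalone sketch of that rule, using plain tuples instead of GTF
entries (the function name is illustrative, not part of the script):

def classify_utr(start, end, cds_start, cds_end, strand):
    """Label a non-CDS exonic interval as UTR, UTR5 or UTR3."""
    midpoint = (cds_end - cds_start) // 2 + cds_start
    if strand == ".":
        return "UTR"
    before_midpoint = start < midpoint
    if strand == "+":
        return "UTR5" if before_midpoint else "UTR3"
    return "UTR3" if before_midpoint else "UTR5"

# classify_utr(0, 50, 100, 500, "+") -> "UTR5"
# classify_utr(0, 50, 100, 500, "-") -> "UTR3"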
Example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
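        # the map file goes from transcript numbers to transcript names;
        # invert it so that names from the gtf can be translated to the
        # numbers used in the transcriptome bam file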
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile(
            "-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile(
            "-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(
            options.filename_mismapped, "wb",
            template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(
            options.filename_junctions, "rb")
    else:
        junctions_samfile = None

    c = _bams2bam.filter(genome_samfile,
                         output_samfile,
                         output_mismapped,
                         transcripts_samfile,
                         junctions_samfile,
                         transcripts,
                         regions=regions_to_remove,
                         unique=options.unique,
                         remove_contigs=options.remove_contigs,
                         colour_mismatches=options.colour_mismatches,
                         ignore_mismatches=options.ignore_mismatches,
                         ignore_transcripts=transcripts_samfile is None,
                         ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.openFile(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.Stop()
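
A hypothetical invocation of the script above (the script and file names are
assumptions; the options are the ones defined by the parser):

    python bams2bam.py --gtf-file=geneset.gtf \
        --transcripts-gtf-file=transcriptome.bam \
        --junctions-bed-file=junctions.bam \
        --filename-stats=stats.tsv hits.bam > filtered.bam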
Example #15
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t",
                      "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes",
                      dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j",
                      "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes",
                      dest="unset_genes",
                      type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u",
                      "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i",
                      "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g",
                      "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G",
                      "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r",
                      "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes",
                      dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a",
                      "--apply",
                      dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample_size",
                      type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping",
                      dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates",
                      dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates",
                      dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates",
                      dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)
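                # e.g. seeing "uc001abc_dup1" schedules both the duplicate
                # and its original "uc001abc" for removal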

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start = min([x.start for x in exons])
            all_end = max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join
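        # chunks on the same contig and strand are accumulated while they
        # touch or overlap (d <= 0) and emitted as one group, e.g. genes
        # at (0, 100) and (90, 200) are joined, (0, 100) and (150, 200)
        # are not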

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1
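        # e.g. --renumber-transcripts=TRAN%06i yields TRAN000001,
        # TRAN000002, ... in order of first appearance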

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(
                options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]),
                               max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
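            # e.g. for overlapping genes A at (0, 1000) and B at (500, 700),
            # only the longer gene A is kept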
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the transcript that
                shares the largest number of exons with the other
                transcripts of the gene.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript
                                      if x.feature == "exon"])
                exon_counts = {}
                # itertools.groupby only groups adjacent items, so sort
                # the exons before counting identical intervals
                for key, exons in itertools.groupby(sorted(all_exons)):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([
                        exon_counts[(x.start, x.end)] for x in transcript
                        if x.feature == "exon"
                    ])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]
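            # e.g. with transcripts A = [(0, 100), (200, 300)] and
            # B = [(0, 100)], exon (0, 100) occurs twice, so A scores
            # 2 + 1 = 3 against 2 for B and is chosen as representative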

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    min_length = options.intron_min_length
                    output_ranges = [x for x in output_ranges
                                     if x[1] - x[0] > min_length]
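                # e.g. with --intron-border=5 an intron (100, 200) is
                # trimmed to (105, 195); --intron-min-length then drops
                # trimmed introns that are too short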

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start = min([x.start for x in gffs])
            all_end = max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [
            item for item in set(transcript_ids)
            if transcript_ids.count(item) > 1
        ]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute(
                        'gene_id',
                        gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute(
                        'transcript_id', gtf.transcript_id + "." +
                        str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
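
A hypothetical invocation of the gene-set manipulation script above (the
script name gtf2gtf.py is an assumption; the options are the ones defined by
the parser, and input is read from stdin):

    python gtf2gtf.py --merge-exons --with-utr < in.gtf > merged.gtf
    python gtf2gtf.py --filter=longest-transcript < in.gtf > longest.gtf

Example #16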
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: peptides2cds.py 2890 2010-04-07 08:58:54Z andreas $")

    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds", "--cdnas", dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with map of peptide identifiers to "
                      "cdna identifiers [%default].")

    parser.add_option("--output-identifier", dest="output_identifier",
                      type="choice", choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f", "--output-format", dest="output_format",
                      type="choice", choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    use_cds_id = options.output_identifier == "cds"

    for cur_record in iterator:

        ninput += 1
        
        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p),
                 len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            # note: look up the cds sequence under cds_identifier; the
            # output identifier may be the peptide name, which is not
            # a key of cds_sequences
            options.stdout.write("\t".join(map(str, (
                identifier,
                alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                len(cur_record.sequence),
                len(cds_sequences[cds_identifier])))) + "\n")
            
        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info( "ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" % (ninput, noutput, nnosequence, nskipped) )
        
    E.Stop()
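
A hypothetical invocation of peptides2cds.py (the script name comes from the
version string above; file names are assumptions, and peptides are read from
stdin):

    python peptides2cds.py --cdnas=cds.fasta --map=peptides2cds.tsv \
        --output-format=fasta < peptides.fasta > aligned.fasta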
Example #17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.Samfile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.iteritems()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.Samfile(options.filename_transcriptome,
                                            "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.Samfile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.Samfile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.Samfile(options.filename_mismapped,
                                         "wb",
                                         template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.Samfile(options.filename_junctions,
                                          "rb")
    else:
        junctions_samfile = None

    c = _bams2bam.filter(genome_samfile,
                         output_samfile,
                         output_mismapped,
                         transcripts_samfile,
                         junctions_samfile,
                         transcripts,
                         regions=regions_to_remove,
                         unique=options.unique,
                         remove_contigs=options.remove_contigs,
                         colour_mismatches=options.colour_mismatches,
                         ignore_mismatches=options.ignore_mismatches,
                         ignore_transcripts=transcripts_samfile is None,
                         ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.openFile(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.Stop()
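
A minimal driver for the script above. The module name bams2bam and the
file paths are assumptions for illustration; the option strings come from
the parser definitions in the example:

import bams2bam  # hypothetical module name for the script above

bams2bam.main(argv=[
    "bams2bam.py",
    "--gtf-file=geneset.gtf.gz",               # assumed geneset file
    "--transcripts-gtf-file=transcripts.bam",  # reads mapped to transcripts
    "--junctions-bed-file=junctions.bam",      # reads mapped across junctions
    "--force-output",
    "genome.bam",                              # the single positional bam file
])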
Example #18
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option(
        "--merge-exons-distance",
        dest="merge_exons_distance",
        type="int",
        help="distance in nucleotides between " "exons to be merged [%default].",
    )

    parser.add_option(
        "--pattern-identifier",
        dest="pattern",
        type="string",
        help="pattern to use for renaming genes/transcripts. "
        "The pattern should contain a %i, for example "
        "--pattern-identifier=ENSG%010i [%default].",
    )

    parser.add_option(
        "--sort-order",
        dest="sort_order",
        type="choice",
        choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"),
        help="sort input data [%default].",
    )

    parser.add_option(
        "-u",
        "--with-utr",
        dest="with_utr",
        action="store_true",
        help="include utr in merged transcripts " "[%default].",
    )

    parser.add_option(
        "--filter-method",
        dest="filter_method",
        type="choice",
        choices=(
            "gene",
            "transcript",
            "longest-gene",
            "longest-transcript",
            "representative-transcript",
            "proteincoding",
            "lincrna",
            "processed-pseudogene",
        ),
        help="Filter method to apply. Available filters are: "
        "'gene': filter by gene_id given in ``--map-tsv-file``, "
        "'transcript': filter by transcript_id given in ``--map-tsv-file``, "
        "'longest-gene': output the longest gene for overlapping genes, "
        "'longest-transcript': output the longest transcript per gene, "
        "'representative-transcript': output the representative transcript "
        "per gene. The representative transcript is the transcript "
        "that shares most exons with other transcripts in a gene. "
        "The input needs to be sorted by gene. "
        "'proteincoding': only output protein coding features. "
        "'lincrna': only output lincRNA features. "
        "'processed-pseudogene': only output processed pseudogene "
        "features [%default].",
    )

    parser.add_option(
        "-a",
        "--map-tsv-file",
        dest="filename_filter",
        type="string",
        metavar="tsv",
        help="filename of ids to map/filter [%default].",
    )

    parser.add_option(
        "--gff-file",
        dest="filename_gff",
        type="string",
        metavar="GFF",
        help="second filename of features (for --method=remove-overlapping) " "[%default]",
    )

    parser.add_option(
        "--invert-filter",
        dest="invert_filter",
        action="store_true",
        help="when using --method=filter, invert selection " "(like grep -v). " "[%default].",
    )

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="int",
        help="extract a random sample of size # if '--method=filter' "
        "is set and no --map-tsv-file is given "
        "[%default].",
    )

    parser.add_option(
        "--intron-min-length",
        dest="intron_min_length",
        type="int",
        help="minimum length for introns (for --method=exons2introns) " "[%default].",
    )

    parser.add_option(
        "--min-exons-length",
        dest="min_exons_length",
        type="int",
        help="minimum length for gene (sum of exons) " "(for --sample-size) [%default].",
    )

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at either end of an intron " "(for --method=exons2introns) [%default].",
    )

    parser.add_option(
        "--ignore-strand",
        dest="ignore_strand",
        action="store_true",
        help="remove strandedness of features (set to '.') when "
        "using ``transcripts2genes`` or ``filter`` "
        "[%default].",
    )

    parser.add_option(
        "--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[%default]"
    )

    parser.add_option(
        "--duplicate-feature",
        dest="duplicate_feature",
        type="choice",
        choices=("gene", "transcript", "both", "ucsc", "coordinates"),
        help="remove duplicates by gene/transcript. "
        "If ``ucsc`` is chosen, transcripts ending on _dup# are "
        "removed. This is necessary to remove duplicate entries "
        "that are next to each other in the sort order "
        "[%default]",
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        action="append",
        choices=(
            "add-protein-id",
            "exons2introns",
            "filter",
            "find-retained-introns",
            "genes-to-unique-chunks",
            "intersect-transcripts",
            "join-exons",
            "merge-exons",
            "merge-transcripts",
            "merge-genes",
            "merge-introns",
            "remove-overlapping",
            "remove-duplicates",
            "rename-genes",
            "rename-transcripts",
            "rename-duplicates",
            "renumber-genes",
            "renumber-transcripts",
            "set-transcript-to-gene",
            "set-gene-to-transcript",
            "set-protein-to-transcript",
            "set-score-to-distance",
            "set-gene_biotype-to-source",
            "sort",
            "transcripts2genes",
            "unset-genes",
        ),
        help="Method to apply [%default]. " "Please only select one.",
    )

    parser.set_defaults(
        sort_order="gene",
        filter_method="gene",
        pattern="%i",
        merge_exons_distance=0,
        filename_filter=None,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        ignore_strand=False,
        with_utr=False,
        invert_filter=False,
        duplicate_feature=None,
        strict=True,
        method=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.method is None:
        raise ValueError("please specify a --method")

    if len(options.method) > 1:
        raise ValueError("multiple --method arguments specified")
    else:
        options.method = options.method[0]

    if options.method == "set-transcript-to-gene":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "set-gene_biotype-to-source":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            if "gene_biotype" not in gff:
                gff.setAttribute("gene_biotype", gff.source)

            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "remove-duplicates":

        counts = collections.defaultdict(int)

        if options.duplicate_feature == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub(r"_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.duplicate_feature == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(["\n".join([str(y) for y in xx]) for xx in x])
            elif options.duplicate_feature == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.duplicate_feature == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.duplicate_feature == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))

    elif "sort" == options.method:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort_order):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-gene-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif "set-protein-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "add-protein-id" == options.method:

        transcript2protein = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" % len(missing))

    elif "join-exons" == options.method:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif "merge-genes" == options.method:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)), sort_by="contig-strand-start"
        )

        def iterate_chunks(gff_chunks):
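            # group consecutive gene chunks that touch or overlap on the
            # same contig and strand; a new contig, a new strand or a
            # positive gap (d > 0) closes the current group.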

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, (
                        "input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n"
                    ) % (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or gffs[0].strand != last[0].strand or d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            # falling off the end of the generator raises StopIteration
            # implicitly; raising it explicitly is redundant.
            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.method == "renumber-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "unset-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "renumber-transcripts":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "transcripts2genes":

        transcripts = set()
        genes = set()
        ignore_strand = options.ignore_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if ignore_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes)))

    elif options.method in ("rename-genes", "rename-transcripts"):

        map_old2new = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        if options.method == "rename-transcripts":
            is_gene_id = False
        elif options.method == "rename-genes":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" % gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.method == "filter":

        keep_genes = set()
        if options.filter_method == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()
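            # sweep over the genes sorted by contig and start: within a
            # run of overlapping genes keep only the longest one; a gap or
            # a change of contig flushes the current candidate.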

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter_method in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                """select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                """
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"])
                exon_counts = {}
                # groupby only merges adjacent entries, so sort the exon
                # list first to count identical exons correctly
                all_exons.sort()
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"])
                    # add the transcript id to the sort key to make
                    # the selection stable.
                    transcript_counts.append((count, transcript[0].transcript_id, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][-1]

            if options.filter_method == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter_method == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter_method in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(IOTools.openFile(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter_method == "gene"
                by_transcript = options.filter_method == "transcript"
                invert = options.invert_filter

                ignore_strand = options.ignore_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if ignore_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter_method == "gene":
                    iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
                elif options.filter_method == "transcript":
                    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator, min_length=options.min_exons_length, feature="exon"
                    )

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                raise ValueError("please supply either a filename with "
                                 "ids to filter on (--map-tsv-file) "
                                 "or a sample size (--sample-size).")

        elif options.filter_method in ("proteincoding", "lincrna", "processed-pseudogene"):
            # extract entries by transcript/gene biotype.
            # This filter uses a test on the source field (ENSEMBL pre v78)
            # a regular expression on the attributes (ENSEMBL >= v78).
            tag = {
                "proteincoding": "protein_coding",
                "processed-pseudogene": "processed_pseudogene",
                "lincrna": "lincRNA",
            }[options.filter_method]
            rx = re.compile('"%s"' % tag)
            if not options.invert_filter:
                f = lambda x: x.source == tag or rx.search(x.attributes)
            else:
                f = lambda x: x.source != tag and not rx.search(x.attributes)

            for gff in GTF.iterator(options.stdin):
                ninput += 1
                if f(gff):
                    options.stdout.write(str(gff) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    elif options.method == "exons2introns":

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [x for x in output_ranges if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.method == "set-score-to-distance":

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.method == "remove-overlapping":

        index = GTF.readAndIndex(GTF.iterator(IOTools.openFile(options.filename_gff, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.method == "intersect-transcripts":

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif "rename-duplicates" == options.method:
        # note: this will only rename entries with "CDS" in feature column

        assert options.duplicate_feature in ["gene", "transcript", "both"], (
            "for renaming duplicates, --duplicate-feature must be set to "
            "one of 'gene', 'transcript' or 'both'"
        )

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if options.duplicate_feature in ["both", "gene"]:
                    if gtf.gene_id in dup_gene:
                        gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                        gtf.setAttribute("gene_id", gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if options.duplicate_feature in ["both", "transcript"]:
                    if gtf.transcript_id in dup_transcript:
                        transcript_dict[gtf.transcript_id] = transcript_dict[gtf.transcript_id] + 1
                        gtf.setAttribute(
                            "transcript_id", gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id])
                        )

            options.stdout.write("%s\n" % gtf)

    elif options.method in ("merge-exons", "merge-introns", "merge-transcripts"):
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
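                # Intervals.truncate yields the exon segments outside the
                # CDS; classify each as 5' or 3' UTR by comparing its
                # start to the CDS midpoint, taking strand into account.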
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            try:
                biotypes = [x["gene_biotype"] for x in gffs]
                biotype = ":".join(set(biotypes))
            except (KeyError, AttributeError):
                biotype = None

            if options.method == "merge-exons":
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.method == "merge-transcripts":

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                if biotype:
                    entry.addAttribute("gene_biotype", biotype)
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.method == "merge-introns":

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    elif options.method == "find-retained-introns":

        for gene in GTF.gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found_any = False
            for intron in find_retained_introns(gene):
                found_any = True
                options.stdout.write("%s\n" % str(intron))
                nfeatures += 1
            if found_any:
                noutput += 1

    elif options.method == "genes-to-unique-chunks":

        for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            for exon in gene_to_blocks(gene):
                options.stdout.write("%s\n" % str(exon))
                nfeatures += 1
            noutput += 1

    else:
        raise ValueError("unknown method '%s'" % options.method)

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
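
The renumbering methods above (renumber-genes, renumber-transcripts,
unset-genes) share one idiom: assign a new identifier from
--pattern-identifier in order of first appearance. A stand-alone sketch of
that idiom, using the ENSG%010i pattern mentioned in the option help:

pattern = "ENSG%010i"
map_old2new = {}
for gene_id in ["geneB", "geneA", "geneB", "geneC"]:
    if gene_id not in map_old2new:
        map_old2new[gene_id] = pattern % (len(map_old2new) + 1)

print(sorted(map_old2new.items()))
# [('geneA', 'ENSG0000000002'), ('geneB', 'ENSG0000000001'),
#  ('geneC', 'ENSG0000000003')]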
Example #19
0
def main():

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-s",
        "--filename-strand",
        dest="filename_strand",
        type="string",
        help="set strand information according to file [%default].")

    parser.set_defaults(as_gtf=False, filename_strand=None, test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # both output formats use the same entry class
    gff = GTF.Entry()

    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():

            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
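
The ids bookkeeping above renames repeated query ids so that the emitted
gene_id values stay unique: the first occurrence keeps its name, later
occurrences receive a ':<n>' suffix. A stand-alone sketch of that scheme:

ids = {}

def unique_id(query_id):
    if query_id not in ids:
        ids[query_id] = 1
        return query_id
    new_id = "%s:%i" % (query_id, ids[query_id])
    ids[query_id] += 1
    return new_id

print([unique_id(x) for x in ["readA", "readA", "readB", "readA"]])
# ['readA', 'readA:1', 'readB', 'readA:2']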
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--suffix", dest="suffixes",
                      action="append",
                      type="string",
                      help="mapping of suffix [%default]")

    parser.add_option("-m", "--map", dest="filename_map",
                      type="string",
                      help="filename containing a mapping of filenames "
                      "to destinations [%default]")

    parser.add_option("-u", "--no-sort", dest="do_sort",
                      action="store_false",
                      help="do not sort filenames before grouping "
                      "[%default]")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="dry run - do not merge "
                      "[%default]")

    parser.set_defaults(
        filename_map=None,
        do_sort=True,
        dry_run=False,
        suffixes=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.filename_map:
        map_regex2dest = IOTools.readMap(
            IOTools.openFile(options.filename_map))
        map_regex2dest = dict([(re.compile(x), y) for x, y in
                               list(map_regex2dest.items())])
    else:
        # without a map file there are no destinations to match against
        map_regex2dest = {}

    map_suffix2dest = {}
    for suffix in options.suffixes:
        src, dest = suffix.split("=")
        map_suffix2dest[src.strip()] = dest.strip()

    filenames = args

    if options.do_sort:
        filenames.sort()

    dest2src = collections.defaultdict(list)
    for filename in filenames:
        dests = []
        for regex, dest in list(map_regex2dest.items()):
            if regex.search(filename):
                dests.append(dest)
        if len(dests) == 0:
            raise ValueError("no destination found for %s" % filename)
        elif len(dests) > 1:
            raise ValueError(
                "multiple destinations found for %s: %s " %
                (filename, dests))

        dest = dests[0]
        # implement suffix mapping, note that
        # suffixes can extend beyond an extension
        for suffix, new_suffix in list(map_suffix2dest.items()):
            if filename.endswith(suffix):
                dest = dest + new_suffix
                break

        dest2src[dest].append(filename)

    for dest, srcs in sorted(dest2src.items()):
        E.info("merging: %s <- %s" % (dest, srcs))
        if options.dry_run:
            continue
        E.run('cat %s > %s' % (" ".join(srcs),
                               dest))

    # write footer and output benchmark information.
    E.Stop()
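
The grouping logic above requires every input file to match exactly one
regular expression from the map file. A stand-alone sketch with
hypothetical filenames and destinations:

import collections
import re

map_regex2dest = {re.compile("sampleA"): "merged_A.txt",
                  re.compile("sampleB"): "merged_B.txt"}

dest2src = collections.defaultdict(list)
for filename in ["sampleA.1.txt", "sampleA.2.txt", "sampleB.1.txt"]:
    dests = [dest for regex, dest in map_regex2dest.items()
             if regex.search(filename)]
    if len(dests) != 1:
        raise ValueError("need exactly one destination for %s" % filename)
    dest2src[dests[0]].append(filename)

print(sorted(dest2src.items()))
# [('merged_A.txt', ['sampleA.1.txt', 'sampleA.2.txt']),
#  ('merged_B.txt', ['sampleB.1.txt'])]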