def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:
        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd, shell=True,
                                  stdin=sys.stdin, stdout=sys.stdout,
                                  cwd=os.getcwd(), close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info("working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0,
                    options.chunksize, options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests),
                                                tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:
            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests,
                                              options.stdout,
                                              options)

        # deal with logfiles: combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or \
            re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:
            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:
                _, filetype = os.path.splitext(filename)

                name = None
                index = None
                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png", ):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(input_filenames), filename))

                outfile = IOTools.open_file(
                    options.output_pattern % filename, "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests),
            len(started_requests) - len(failed_requests),
            len(failed_requests),
            niterations))

    E.stop()
def AnalyseGO(gene2go, genes, genes_background=None, do_probabilities=True):
    """analyse go ids.

    gene2go: map of genes to go ids
    genes: sample set of genes
    genes_background: background set of genes (default: all)
    """
    if genes_background is None:
        genes_background = list(gene2go.keys())

    result = GOResults()

    # get background frequencies
    (background_counts_total, background_counts, background_genes) = \
        GetGOFrequencies(gene2go, genes_background)

    result.mBackgroundCountsTotal = background_counts_total
    result.mBackgroundNumCategories = len(background_counts)
    result.mBackgroundGenes = background_genes

    # get sample frequencies
    (sample_counts_total, sample_counts, sample_genes) = \
        GetGOFrequencies(gene2go, genes)

    result.mNumGenes = len(genes)

    result.mSampleCountsTotal = sample_counts_total
    result.mSampleNumCategories = len(sample_counts)
    result.mSampleGenes = sample_genes

    # test for over or underrepresented categories in the slims
    # report results for all go categories in the background
    # so that also categories completely absent in the foreground (sample)
    # are considered.
    for go_id in list(background_counts.keys()):
        result_go = GOResult(go_id)

        # use gene counts
        result_go.mSampleCountsCategory = sample_counts.get(go_id, 0)
        result_go.mSampleCountsTotal = len(sample_genes)
        result_go.mBackgroundCountsTotal = len(background_genes)
        result_go.mBackgroundCountsCategory = background_counts[go_id]

        E.debug("processing %s: genes in foreground=%i, "
                "genes in background=%i, sample_counts=%i, "
                "background_counts=%i" %
                (go_id,
                 len(sample_genes),
                 len(background_genes),
                 sample_counts.get(go_id, 0),
                 background_counts.get(go_id, 0),
                 ))

        if do_probabilities:
            try:
                result_go.UpdateProbabilities()
            except AssertionError as msg:
                print(msg)
                print("# error while calculating probabilities for %s" %
                      go_id)
                print("# genes in sample", sample_genes)
                print("# counts in sample: %i out of %i total" %
                      (result_go.mSampleCountsCategory,
                       result_go.mSampleCountsTotal))
                print("# counts in background %i out of %i total" %
                      (result_go.mBackgroundCountsCategory,
                       result_go.mBackgroundCountsTotal))
                for x in list(sample_genes.keys()):
                    for y in gene2go[x]:
                        print(x, str(y))
                sys.exit(0)

        result.mResults[go_id] = result_go

    return result
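def _example_analyse_go():
    """Minimal usage sketch for AnalyseGO (not part of the original module).

    Toy data only; it assumes GetGOFrequencies accepts a plain
    gene -> [GO id] mapping. Attribute names follow the code above.
    """
    gene2go = {"geneA": ["GO:0000001"],
               "geneB": ["GO:0000001", "GO:0000002"],
               "geneC": ["GO:0000002"]}
    # sample two genes against the default background of all three
    result = AnalyseGO(gene2go, genes=["geneA", "geneB"])
    for go_id, r in sorted(result.mResults.items()):
        print(go_id, r.mSampleCountsCategory, r.mBackgroundCountsCategory)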
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-p", "--params", dest="params", type="string",
                      help="comma separated list of additional "
                      "parameter strings")

    parser.add_option("-m", "--module", dest="module", type="string",
                      help="the full path to the module file", default=None)

    parser.add_option("-i", "--input", dest="input_filenames", type="string",
                      action="append", help="input filename")

    parser.add_option("-o", "--output-section", dest="output_filenames",
                      type="string", action="append", help="output filename")

    parser.add_option("-f", "--function", dest="function", type="string",
                      help="the module function", default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.Start(parser)

    # Check that both a module and a function have been specified
    if not options.module or not options.function:
        raise ValueError("both a module and a function must be specified")

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)
    module = importlib.import_module(module_base_name)

    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            str(msg) + ": unknown function, available functions are: %s" %
            ",".join([x for x in dir(module) if not x.startswith("_")]))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "expecting infile+outfile+params or infile+outfile or params")

    E.Stop()
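# Hedged usage sketch (hypothetical module, function and file names): call a
# function my_func(infiles, outfiles, params) defined in /path/to/mymodule.py
# through this wrapper. Flag names follow the option definitions above.
#
#   python run_function.py --module=/path/to/mymodule.py --function=my_func \
#       --input=a.tsv --input=b.tsv --output-section=out.tsv --params=1,alpha
#
# A single --input (or --output-section) is passed to the function as a plain
# string rather than a one-element list, per the "single file case" handling
# above.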
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is set to None in the context of the
          calling function.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created
    and sent to the cluster. If ``statement`` is defined, a single
    job script is created and sent to the cluster. Additionally,
    if ``job_array`` is defined, the single statement will be
    submitted as an array job.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number of
          sessions available. If there are too many sessions, or
          sessions become unavailable after failed jobs, use
          ``qconf -secl`` to list sessions and ``qconf -kec #``
          to delete sessions.

       2. Memory: 1G of free memory can be requested using the
          job_memory variable: ``job_memory = "1G"``. If there are
          error messages like "no available queue", then the problem
          could be that a particular complex attribute has not been
          defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub
          directly still works.
    """
    # combine options using correct preference
    options = dict(list(PARAMS.items()))
    options.update(list(getCallerLocals().items()))
    options.update(list(kwargs.items()))

    # insert legacy synonyms
    options['without_cluster'] = options.get('without_cluster')
    getParallelEnvironment(options)

    # enforce highest priority for cluster options in command-line
    if "cli_cluster_memory_default" in PARAMS:
        options["cluster_memory_default"] = \
            PARAMS["cli_cluster_memory_default"]
    if "cli_cluster_memory_resource" in PARAMS:
        options["cluster_memory_resource"] = \
            PARAMS["cli_cluster_memory_resource"]
    if "cli_cluster_num_jobs" in PARAMS:
        options["cluster_num_jobs"] = PARAMS["cli_cluster_num_jobs"]
    if "cli_cluster_options" in PARAMS:
        options["cluster_options"] = PARAMS["cli_cluster_options"]
    if "cli_cluster_parallel_environment" in PARAMS:
        options["cluster_parallel_environment"] = \
            PARAMS["cli_cluster_parallel_environment"]
    if "cli_cluster_priority" in PARAMS:
        options["cluster_priority"] = PARAMS["cli_cluster_priority"]
    if "cli_cluster_queue" in PARAMS:
        options["cluster_queue"] = PARAMS["cli_cluster_queue"]
    if "cli_cluster_queue_manager" in PARAMS:
        options["cluster_queue_manager"] = PARAMS["cli_cluster_queue_manager"]

    # if the command-line has not been used,
    # get information from the legacy job_options
    if options["cluster_options"] == "":
        options["cluster_options"] = options.get("job_options",
                                                 options["cluster_options"])

    # get the memory requirement for the job
    job_memory = getJobMemory(options, PARAMS)

    # get the queue manager
    queue_manager = PARAMS["cluster_queue_manager"]

    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    job_name = re.sub(
        "[:]", "_",
        os.path.basename(options.get("outfile", "ruffus")))

    def _writeJobScript(statement, job_memory, job_name, shellfile):
        # disabled - problems with quoting
        # tmpfile.write('''echo 'statement=%s' >> %s\n''' %
        #               (shellquote(statement), shellfile))

        # module list outputs to stderr, so merge stderr and stdout
        script = '''#!/bin/bash -e \n
echo "%(job_name)s : START -> ${0}" >> %(shellfile)s
set | sed 's/^/%(job_name)s : /' &>> %(shellfile)s
set +o errexit
module list 2>&1 | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
set -o errexit
hostname | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
cat /proc/meminfo | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
echo "%(job_name)s : END -> ${0}" >> %(shellfile)s
''' % locals()

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        script += "ulimit -v %i\n" % IOTools.human2bytes(job_memory)

        script += expandStatement(statement,
                                  ignore_pipe_errors=ignore_pipe_errors)
        script += "\n"

        job_path = getTempFilename(dir=PARAMS["workingdir"])

        with open(job_path, "w") as script_file:
            script_file.write(script)

        return job_path

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_ids, filenames = [], []

            for statement in statement_list:
                E.info("running statement:\n%s" % statement)

                job_path = _writeJobScript(statement, job_memory,
                                           job_name, shellfile)

                jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

                job_id = session.runJob(jt)

                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))
                E.debug("job has been submitted with job_id %s" % str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))
            session.synchronize(job_ids,
                                drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids, statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.info("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_path = _writeJobScript(statement, job_memory,
                                       job_name, shellfile)
            jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run the job(s) locally
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        for statement in statement_list:
            E.info("running statement:\n%s" % statement)

            # process substitution <() and >() does not
            # work through subprocess directly. Thus, the statement
            # needs to be wrapped in /bin/bash -c '...' in order for
            # bash to interpret the substitution correctly.
            if "<(" in statement or ">(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(
                expandStatement(
                    statement,
                    ignore_pipe_errors=ignore_pipe_errors),
                cwd=PARAMS["workingdir"],
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            if process.returncode != 0 and not ignore_errors:
                raise OSError(
                    "---------------------------------------\n"
                    "Child was terminated by signal %i: \n"
                    "The stderr was: \n%s\n%s\n"
                    "-----------------------------------------" %
                    (-process.returncode, stderr, statement))
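# Hedged sketch of the calling convention run() relies on (the task function
# and file names are hypothetical, and a configured pipeline with PARAMS and
# a DRMAA session is assumed): run() reads ``statement``/``statements`` and
# job_* settings from the caller's locals via getCallerLocals(), and
# buildStatement() fills in the remaining %(var)s placeholders.
def _example_task(infile="in.tsv", outfile="out.tsv.gz"):
    job_memory = "1G"  # picked up from locals when the job is built
    statement = "gzip -c %(infile)s > %(outfile)s"
    run()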
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "add-flank",
                          "add-upstream-flank",
                          "add-downstream-flank",
                          "crop",
                          "crop-unique",
                          "complement-groups",
                          "combine-groups",
                          "filter-range",
                          "join-features",
                          "merge-features",
                          "sanitize",
                          "to-forward-coordinates",
                          "to-forward-strand",
                          "rename-chr"),
                      help="method to apply [%default]")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c", "--contigs-tsv-file",
                      dest="input_filename_contigs", type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--crop-gff-file", dest="filename_crop_gff",
                      type="string",
                      help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to'.")

    parser.add_option("--sanitize-method", dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing", dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report", dest="assembly_report", type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-hasids", dest="assembly_report_hasIDs",
        type="int",
        help="flag (0/1) indicating whether the assembly report contains "
        "ids that allow mapping of ensembl to ucsc contigs when running "
        "method sanitize [%default].")

    parser.add_option(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids "
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids "
        "[%default].")

    parser.add_option(
        "--assembly-extras", dest="assembly_extras", type="str",
        help="additional mismatches between gtf and fasta to fix when "
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.add_option(
        "--rename-chr-file", dest="rename_chr_file", type="string",
        help="mapping table between old and new chromosome names. "
        "TAB separated 2-column file.")

    parser.set_defaults(
        input_filename_contigs=False,
        filename_crop_gff=None,
        input_filename_agp=False,
        genome_file=None,
        rename_chr_file=None,
        add_up_flank=None,
        add_down_flank=None,
        complement_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        min_distance=0,
        max_distance=0,
        min_features=1,
        max_features=0,
        extension_upstream=1000,
        extension_downstream=1000,
        sanitize_method="ucsc",
        flank_method="add",
        output_format="%06i",
        skip_missing=False,
        is_gtf=False,
        group_field=None,
        contig_pattern=None,
        assembly_report=None,
        assembly_report_hasIDs=1,
        assembly_report_ensemblcol=4,
        assembly_report_ucsccol=9,
        assembly_extras=None)

    (options, args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.open_file(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.rename_chr_file:
        chr_map = {}
        with open(options.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("empty mapping dictionary")

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            # df.ix has been removed from pandas; .loc is the equivalent
            # label-based indexer here.
            df.loc[df[1] == "assembled-molecule", ensemblcol] = \
                df.loc[df[1] == "assembled-molecule", 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(
                    "when using an assembly report, please specify the "
                    "sanitize method as either 'ucsc' or 'ensembl' to "
                    "set the direction of conversion")
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if options.method in ("to-forward-coordinates", "to-forward-strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.open_file(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # non-capturing group for the separator so that exactly
                # two groups (start, end) are returned
                start, end = re.match(
                    r"(\d+)(?:\.\.|,|-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict.keys():
                id = assembly_dict[id]
            # if not in the dict, the contig name is forced into the
            # desired convention; this is helpful for user-modified gff
            # files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [re.compile(x)
                             for x in options.contig_pattern.split(",")]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are "
                   "out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    elif options.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply a mapping file")
        for gff in renameChromosomes(gffs, chr_map):
            options.stdout.write(str(gff) + "\n")

    else:
        # the remaining methods invert coordinates and/or strand;
        # the method names here match the --method choices declared above
        for gff in gffs:
            if options.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.stop()
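# Hedged usage sketches (file names are hypothetical; option names follow
# the parser above):
#
#   # add 1kb of flanking sequence to every gene model in a gtf file
#   python gff2gff.py --method=add-flank --extension-upstream=1000 \
#       --extension-downstream=1000 --genome-file=genome --is-gtf \
#       < in.gtf > out.gtf
#
#   # rename chromosomes via a TAB-separated two-column mapping table
#   python gff2gff.py --method=rename-chr --rename-chr-file=map.tsv \
#       < in.gff > out.gff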
for glob_expression, template, dest in dirs:

    if not os.path.exists(dest):
        os.mkdir(dest)

    files = glob.glob(os.path.abspath(glob_expression))

    for filename in files:
        dirname, name = os.path.split(filename)
        prefix = name[:-3]

        # if os.path.exists(os.path.join(dirname, "_%s.pyx" % prefix)):
        #     E.warn("ignoring pyximport file _%s.pyx" % prefix)
        #     continue

        filename = os.path.join(os.path.abspath(dest), "%s.rst" % prefix)
        if os.path.exists(filename):
            nskipped += 1
            continue

        E.debug("adding %s" % filename)
        outfile = open(filename, "w")
        outfile.write(template % locals())
        outfile.close()
        ncreated += 1

E.info("ncreated=%i, nskipped=%i" % (ncreated, nskipped))
E.Stop()
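# The loop above expects ``dirs`` to contain triples of (glob expression,
# rst template, destination directory), where the template is filled from
# the loop's locals such as %(prefix)s. A hypothetical example:
#
# dirs = [("../scripts/*.py",
#          ".. automodule:: %(prefix)s\n",
#          "scripts")]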
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: quality2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--quality-threshold", dest="quality_threshold", type="int",
        help="quality threshold for masking positions [default=%default]")

    parser.add_option(
        "--random", dest="random", action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.add_option(
        "--map-tsv-file", dest="filename_map", type="string",
        help="filename in psl format mapping entries in multiple alignment "
        "to the genome [default=%default]")

    parser.add_option(
        "-q", "--quality-file", dest="quality_file", type="string",
        help="filename with genomic base quality information "
        "[default=%default].")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.start(parser)

    ##################################################
    # read map
    ##################################################
    infile = IOTools.open_file(options.filename_map)
    map_genes2genome = {}
    for match in Blat.iterator(infile):
        assert match.mQueryId not in map_genes2genome, \
            "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue

        ninput += 1

        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query.
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        try:
            quality_scores = quality.getSequence(
                match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        except ValueError as msg:
            nmissed += 1
            E.warn("could not retrieve quality scores for %s:%i-%i: %s" %
                   (match.mSbjctId, match.mSbjctFrom, match.mSbjctTo, msg))
            continue

        # print str(alignlib_lite.py_AlignmentFormatEmissions(
        #     map_gene2genome))
        # print str(alignlib_lite.py_AlignmentFormatEmissions(
        #     map_gene2mali))
        # print quality_scores

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome,
            alignlib_lite.py_RR)
        # print str(alignlib_lite.py_AlignmentFormatEmissions(
        #     map_mali2genome))

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        # negative strand
        to_mask = []

        # reverse position
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue

            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp

                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, "
                    "strand=%s, pos=%i, quality=%i" %
                    (cluster_id, p, c, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(fp), quality_scores[y]))

                if options.frame > 1:
                    start = (p // options.frame) * options.frame
                    to_mask.extend(list(range(start, start + options.frame)))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.stop()
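# Hedged usage sketch (hypothetical file names): mask low-quality alignment
# columns given a psl map of genes to the genome and an indexed file of
# genomic base qualities.
#
#   python quality2masks.py --map-tsv-file=genes2genome.psl \
#       --quality-file=quality --quality-threshold=40 < mali.tsv > masks.tsv
#
# stdin rows are TAB-separated (cluster_id, gene_id, alignment); the output
# lists one masked start/end interval per row, as written above.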