def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file", type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information.")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar="bed",
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format", type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features", type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts'.")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate.")

    parser.add_argument(
        "-c", "--counter", dest="counters", type=str, action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", "neighbours", "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input.")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output.")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=("all", "ignore", "weight"),
                        help="how to treat multi-mapping reads in bam-files. "
                        "Requires the NH flag to be set by the mapper.")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="use barcodes to count unique UMIs. "
                        "UMIs are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When set, unique counts are returned. "
                        "Currently only compatible with read-counts.")

    parser.add_argument("--sample-probability", dest="sample_probability",
                        type=float,
                        help="probability with which any given read or "
                        "read pair in a bam file is counted. "
                        "Currently only compatible with read-counts.")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters.")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file.")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a lower "
                        "quality score will be ignored.")

    parser.set_defaults(
        genome_file=None,
        reporter="genes",
        with_values=True,
        sections=[],
        counters=[],
        filename_gff=[],
        filename_format=None,
        gff_features=[],
        gff_sources=[],
        add_gtf_source=False,
        proximal_distance=10000,
        bam_files=None,
        multi_mapping='all',
        library_type='fr-unstranded',
        prefixes=[],
        minimum_mapping_quality=0,
        use_barcodes=False,
        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    args = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    # instantiate one counter object per requested counter (and per section,
    # source and feature where applicable)
    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if quality is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args, prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(
                    bam_files, options=args, prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args, prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args, prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(
                    bigwig_file, options=args, prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args, prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args, prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args, prefix=prefix))
        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))
        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args, prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args, prefix=prefix))

    # select the iterator and output key according to the reporting level
    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    # write header and apply all counters to each gene/transcript in the
    # input GTF stream
    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        # skip the record only if every counter flagged it
        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")
        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
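
# Hypothetical entry point (an assumption, following the usual convention for
# scripts of this kind): the counter script is driven from the command line
# and reads its GTF input from stdin, e.g.
#   zcat genes.gtf.gz | python <script>.py --genome-file=<genome> --counter=length
# The script name and file names above are placeholders.
if __name__ == "__main__":
    sys.exit(main(sys.argv))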
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice", choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms. This is a comma-separated list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Benchmarking options")
    group.add_option("-b", "--benchmark", dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations", type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size", type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify", dest="verify", type="string",
                     help="verify against other database "
                     "[default=%default].")
    group.add_option("--verify-iterations", dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurrences "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, "Compression options")
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c", "--compression", dest="compression",
                     type="choice", choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability "
                     "on the system "
                     "[default=%%default]." % ", ".join(compression_choices))
    group.add_option("--random-access-points", dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")
    group.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.start(parser)

    # build a contig synonym lookup of the form {primary: [aliases]}
    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    # replace the translator name with the corresponding translator object
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        # extract a single region from an existing database and write it
        # as fasta
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        # time random fragment retrieval from an existing database
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from cgat import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%f\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        # cross-check two databases against each other in both directions
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        # compress the index of an existing database
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # default action: build a new database from the given fasta files
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in list(synonyms.items()):
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.stop()