def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply threshold to a wig file writing a bed-formatted file as
    output.'''

    c = E.Counter()

    for contig, size in list(
            fasta.getContigSizes(with_synonyms=False).items()):
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if (d > 0 or value < threshold):
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                last_start = start
            last_end = end

        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

        c.output += 1

    E.info(str(c))
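# Usage sketch for applyThreshold(), assuming the helpers used above
# (iotools, IndexedFasta, block_iterator) are importable from this module;
# all file names are hypothetical. The generator yields (contig, start, end)
# tuples that map directly onto BED3 lines.
def write_threshold_bed(wig_filename, genome_filename, bed_filename,
                        threshold):
    fasta = IndexedFasta.IndexedFasta(genome_filename)
    with iotools.open_file(wig_filename) as infile, \
            iotools.open_file(bed_filename, "w") as outf:
        for contig, start, end in applyThreshold(infile, fasta, threshold):
            outf.write("%s\t%i\t%i\n" % (contig, start, end))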
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)
        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [
            ("tool", map_tool_to_runner),
            ("metric", map_metric_to_runner),
            ("split", map_split_to_runner),
            ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()
        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1
            comments = "; ".join(comments)
            table.append((section, task, version, comments))
        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", total_counter))
    E.stop()
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m", "--method", dest="methods", action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if iotools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if iotools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i", "--input-fastq-file", dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m", "--method", dest="methods", action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    # in dry-run mode no logfile is opened and nothing is zapped
    outfile = None
    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    if outfile is not None:
        outfile.close()

    return c
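# Usage sketch for clean(): the function iterates over concrete file names,
# so glob expansion is done by the caller; the glob pattern and log file
# name below are hypothetical.
import glob

def zap_bam_exports():
    return clean(glob.glob("export/*.bam"), logfile="zap.log")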
def pair_iterator(test_vcf, truth_vcf, contig):

    counter = E.Counter()

    test_iter = test_vcf.fetch(contig)
    truth_iter = truth_vcf.fetch(contig)

    try:
        test_record = next(test_iter)
        truth_record = next(truth_iter)
        while 1:
            if test_record.pos < truth_record.pos:
                test_record = next(test_iter)
                continue
            elif test_record.pos > truth_record.pos:
                truth_record = next(truth_iter)
                continue
            elif len(test_record.alts) > 1:
                counter.skip_multiallelic_test += 1
                test_record = next(test_iter)
                continue
            elif len(truth_record.alts) > 1:
                counter.skip_multiallelic_truth += 1
                truth_record = next(truth_iter)
                continue
            elif test_record.alts != truth_record.alts:
                counter.skip_genotype_difference += 1
                test_record = next(test_iter)
                truth_record = next(truth_iter)
                continue

            if test_record.ref != truth_record.ref:
                # todo: deal with indels
                raise ValueError(
                    "mismatching reference bases at position "
                    "{}:{}".format(test_record.chrom, test_record.pos))

            yield test_record, truth_record
            test_record = next(test_iter)
            truth_record = next(truth_iter)

    except StopIteration:
        pass

    E.debug(str(counter))
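# Usage sketch for pair_iterator(): walk position- and allele-matched
# test/truth records contig by contig. Both VCFs must be indexed so that
# fetch() works; the file names are hypothetical.
def iterate_matched_records(test_path="test.vcf.gz",
                            truth_path="truth.vcf.gz"):
    test_vcf = pysam.VariantFile(test_path)
    truth_vcf = pysam.VariantFile(truth_path)
    for contig in test_vcf.header.contigs:
        for test_record, truth_record in pair_iterator(
                test_vcf, truth_vcf, contig):
            yield test_record, truth_record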
def read_and_randomize_rows(infile, args):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if args.has_headers:
        keep_header = 1
    else:
        keep_header = 0

    for x in range(keep_header):
        c.header += 1
        args.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    args.stdout.write("".join(lines))
    c.lines_output = len(lines)

    E.info(c)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--keep-header", dest="keep_header", type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()

    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = iotools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(iotools.openFile(infile),
                              dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(
            r"(.*):(\d+)-(\d+)", row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join(
            (contig, start, end, str(c.input), row["lfold"])) + "\n")

    outf.close()
    E.info("%s" % str(c))
def read_vcf_positions_into_dataframe(filename, filters=None):

    vcf_in = pysam.VariantFile(filename)

    if filters is None:
        filters = []

    pass_filter = False
    snp_filter = False
    for f in filters:
        if f == "PASS":
            pass_filter = True
        elif f == "SNP":
            snp_filter = True

    records = []
    c = E.Counter()
    for record in vcf_in:
        c.input += 1
        f = record.filter.keys()
        if pass_filter and "PASS" not in f and "." not in f:
            c.removed_pass_filter += 1
            continue
        if snp_filter:
            is_snp = (len(record.ref) == 1 and
                      len(record.alts) == 1 and
                      len(record.alts[0]) == 1)
            if not is_snp:
                c.removed_snp_filter += 1
                continue
        c.output += 1
        records.append((record.chrom, record.pos))

    df = pandas.DataFrame.from_records(records, columns=["chrom", "pos"])
    E.info("{}: {}".format(filename, c))
    return df
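# Usage sketch: restrict two call sets to PASS SNPs and intersect their
# positions. The file names are hypothetical; pandas is assumed to be
# imported as above.
def shared_snp_positions(filename_a="a.vcf.gz", filename_b="b.vcf.gz"):
    df_a = read_vcf_positions_into_dataframe(
        filename_a, filters=["PASS", "SNP"])
    df_b = read_vcf_positions_into_dataframe(
        filename_b, filters=["PASS", "SNP"])
    return df_a.merge(df_b, on=["chrom", "pos"], how="inner")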
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s
        http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''
        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info")
        .fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments")
        .fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"}

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = iotools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for line in iotools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write("%s\t%s\t%s\t%s\t%s\n" % (
                aspect2name[data.aspect],
                gene_id,
                data.goid,
                map_goid2description.get(data.goid, ""),
                data.evidence))
            c.output += 1
        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))

    outf.close()
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
        Output filename
    """

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with iotools.open_file(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with iotools.open_file(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = \
                line[:-1].split("\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = iotools.open_file(outfile, "w")

    for gene_id, in_goids in gene2goids.items():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join(
                (goid2type.get(goid, ""), gene_id, goid,
                 goid2description.get(goid, ""), "NA")) + "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-t", "--template-bam-file",
                        dest="filename_genome_bam", type=str,
                        help="input bam file for header information ")

    parser.add_argument("-s", "--contigs-tsv-file", dest="filename_contigs",
                        type=str,
                        help="filename with contig sizes ")

    parser.add_argument("-o", "--colour", dest="colour_mismatches",
                        action="store_true",
                        help="mismatches will use colour differences "
                        "(CM tag) ")

    parser.add_argument("-i", "--ignore-mismatches", dest="ignore_mismatches",
                        action="store_true",
                        help="ignore mismatches ")

    parser.add_argument("-c", "--remove-contigs", dest="remove_contigs",
                        type=str,
                        help="','-separated list of contigs to remove ")

    parser.add_argument("-f", "--force-output", dest="force",
                        action="store_true",
                        help="force overwriting of existing files ")

    parser.add_argument("-u", "--unique", dest="unique",
                        action="store_true",
                        help="remove reads not matching uniquely ")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if args.filename_genome_bam:
        genomefile = pysam.AlignmentFile(args.filename_genome_bam, "rb")
    elif args.filename_contigs:
        contigs = iotools.ReadMap(iotools.open_file(args.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-", "wb", template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if args.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = \
            sname.split("|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = \
            list(map(int, (first_exon_start, first_exon_end,
                           last_exon_start, last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file", type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format", type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features", type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c", "--counter", dest="counters", type=str, action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability", dest="sample_probability",
                        type=float,
                        help="Specify the probability of whether any "
                        "given read or read pair in a bam file is counted. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(
        genome_file=None,
        reporter="genes",
        with_values=True,
        sections=[],
        counters=[],
        filename_gff=[],
        filename_format=None,
        gff_features=[],
        gff_sources=[],
        add_gtf_source=False,
        proximal_distance=10000,
        bam_files=None,
        multi_mapping='all',
        library_type='fr-unstranded',
        prefixes=[],
        minimum_mapping_quality=0,
        use_barcodes=False,
        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    args = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if fasta is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(
                    bam_files, options=args, prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(
                    bigwig_file, options=args, prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic '
                                 'sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c in ("overlap",
                   "overlap-stranded",
                   "overlap-transcripts",
                   "proximity",
                   "proximity-exclusive",
                   "proximity-lengthmatched",
                   "neighbours",
                   "territories",
                   "distance",
                   "distance-genes",
                   "distance-tss",
                   "binding-pattern",
                   "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = \
                    GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))
        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=args.filename_gff,
                                             fasta=fasta,
                                             options=args,
                                             prefix=prefix))
        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

        def extract_query(x):
            return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            missaligned = len([x for x, y in unique_cmp
                               if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
                    overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          missaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")

        if is_mapped:
            status = "mapped"
        else:
            status = "mismapped"

        counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="filename with reference sequence in fasta format ")

    parser.add_argument(
        "--input-filename-bam", dest="input_filename_bam", type=str,
        help="filename with aligned reads ")

    parser.add_argument(
        "--method", dest="methods", type=str, action="append",
        choices=["add-strelka-genotype", "lift-over"],
        help="methods to apply ")

    parser.add_argument(
        "--input-filename-chain", dest="input_filename_chain", type=str,
        help="filename with alignment chain for lift-over ")

    parser.add_argument(
        "--normal-sample-regex", dest="normal_sample_regex", type=str,
        help="regular expression to apply to header to identify normal "
        "sample id ")

    parser.add_argument(
        "--output-filename-unmapped", dest="output_filename_unmapped",
        type=str,
        help="filename with variants that could not be lifted over ")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (args, unknown) = E.start(parser, argv=argv,
                              add_output_options=True, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_vcf = unknown[0]

    vcf_in = pysam.VariantFile(args.input_filename_vcf)

    if "lift-over" in args.methods:
        if args.input_filename_chain is None:
            raise ValueError(
                "--method=lift-over requires --input-filename-chain")
        if not os.path.exists(args.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                args.input_filename_chain))
        E.info("reading chain from {}".format(args.input_filename_chain))
        with iotools.open_file(args.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if args.input_filename_fasta:
        fasta = pysam.FastaFile(args.input_filename_fasta)
    else:
        fasta = None

    if args.input_filename_bam:
        bam = pysam.AlignmentFile(args.input_filename_bam)
    else:
        bam = None

    outf = args.stdout

    c = E.Counter()

    if "add-strelka-genotype" in args.methods:
        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by cgatcore vcf2vcf.">')

        header = "\n".join(header)

        if args.normal_sample_regex:
            normal_sample = re.search(
                r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {record.alts[0]: "1",
                                  record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample,
                                    header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in args.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length) for contig, length in sorted(
                expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                args.input_filename_chain,
                args.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} chain "
                            "file says {}, but fasta file says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if args.output_filename_unmapped:
            outfile_unmapped = iotools.open_file(
                args.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_multimapping_position\t{}".format(
                            str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(
                        y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write(
                            "{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue
            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # the new reference base matches the old ALT, so swap
                # REF (column 4) and ALT (column 5)
                fields[3] = alt_base
                fields[4] = record.ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join(
                        (gt.translate(trans_genotypes), rest))
                # remove reference only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write(
                            "reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(
                ",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(
                ",".join(sorted(unmapped_contigs))))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" % (record.identifier, nfailed, nns,
                                  str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.
    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c
        FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
           protAcc, chrom, strand, cdsStart, cdsEnd,
           exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
          AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = iotools.open_file(outfile, "w")

    cc = dbhandle.execute(statement)

    SQLResult = collections.namedtuple(
        'Result',
        '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)

            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
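# Usage sketch for getRefSeqFromUCSC(). It assumes a handle whose execute()
# returns an object with fetchall(), e.g. an sqlalchemy 1.x engine
# (Engine.execute was removed in sqlalchemy 2.0) pointed at the public
# UCSC MySQL server; the output file name and database are examples.
import sqlalchemy

def build_refseq_gtf(outfile="refseq.gtf.gz"):
    dbhandle = sqlalchemy.create_engine(
        "mysql+pymysql://genome@genome-mysql.soe.ucsc.edu/hg38")
    getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=True)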
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o", "--min-overlap", dest="min_overlap", type=int,
                        help="minimum overlap")

    parser.add_argument(
        "-w", "--pattern-window", dest="pattern_window", type=str,
        help="regular expression to extract window coordinates from "
        "test id ")

    parser.add_argument("-i", "--invert", dest="invert", action="store_true",
                        help="invert direction of fold change ")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    outfiles = iotools.FilePool(args.output_filename_pattern)

    if args.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():
        rx_window = re.compile(args.pattern_window)
        # filter any of the DESeq/EdgeR message that end up at the top of the
        # output file
        for data in iotools.iterate(args.stdin):
            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = list(map(int, (start, end)))

            yield DATA._make(
                (data.test_id, contig, start, end,
                 data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std),
                 data.control_name,
                 float(data.control_mean),
                 float(data.control_std),
                 float(data.pvalue),
                 float(data.qvalue),
                 float(data.l2fold),
                 float(data.fold),
                 int(data.significant),
                 data.status,
                 0))

    def grouper(data, distance=10):
        last = next(data, None)
        if last is None:
            return
        entries = [last]

        while 1:
            d = next(data, None)
            if d is None:
                break

            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or
                    (d.start - last.end > distance) or
                    (d.status != last.status) or
                    (d.significant != last.significant) or
                    (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    args.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=args.min_overlap):
        group_id += 1

        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (str(group_id), g.contig, start, end,
             g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]),
             g.control_name,
             sum([x.control_mean for x in group]) / n,
             max([x.control_std for x in group]),
             max([x.pvalue for x in group]),
             max([x.qvalue for x in group]),
             l2fold,
             fold,
             g.significant,
             g.status,
             int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name,
                    "%s\t%i\t%i\t%i\t%f\n" % (
                        g.contig, g.start, g.end,
                        group_id,
                        sum([x.treatment_mean for x in group]) / n))
            else:
                outfiles.write(
                    g.control_name,
                    "%s\t%i\t%i\t%i\t%f\n" % (
                        g.contig, g.start, g.end,
                        group_id,
                        sum([x.control_mean for x in group]) / n))

        args.stdout.write("\t".join(map(str, outdata)) + "\n")
        counter.output += 1

    # create empty files
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only "
                      "covers the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            length = bed.end - bed.start

            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s", "--session", dest="session", type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d", "--snapshot-dir", dest="snapshotdir", type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f", "--format", dest="format", type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o", "--host", dest="host", type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p", "--port", dest="port", type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e", "--extend", dest="extend", type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x", "--expand", dest="expand", type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only", dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("-n", "--name", dest="name", type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
        new_instance=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin,
                                        parser=pysam.asBed()):
            c.input += 1

            # IGV can not deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
            if args.expand:
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
def process_daisy(options):

    filter_n = "filter-N" in options.methods
    filter_ont = "filter-ONT" in options.methods

    if "filter-identifier" in options.methods:
        if options.input_filter_tsv is None:
            raise ValueError(
                "please set --input-filter-tsv for method filter-identifier")
        with iotools.open_file(options.input_filter_tsv) as inf:
            filter_identifier = set(
                [x.split()[0].strip() for x in inf.readlines()])
    else:
        filter_identifier = False

    if options.output_removed_tsv:
        outf_removed_tsv = iotools.open_file(options.output_removed_tsv, "w")
    else:
        outf_removed_tsv = None

    if options.output_removed_fastq:
        outf_removed_fastq = iotools.open_file(options.output_removed_fastq,
                                               "w")
    else:
        outf_removed_fastq = None

    if options.set_prefix:
        prefix = "{}".format(options.set_prefix)
    else:
        prefix = None

    quality_offset = options.quality_offset

    counter = E.Counter()

    with pysam.FastxFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            remove = False

            if filter_n:
                chars = collections.Counter(read.sequence)
                if ("N" in chars and
                        100.0 * chars["N"] / len(read.sequence) >
                        options.max_percent_N):
                    remove = True
                    counter.filter_n += 1

            if filter_identifier:
                if read.name not in filter_identifier:
                    counter.filter_identifier += 1
                    remove = True

            if filter_ont:
                quals = read.get_quality_array()
                n = len(quals)
                if (n < options.min_sequence_length or
                        float(sum(quals)) / n < options.min_average_quality):
                    counter.remove_ont += 1
                    remove = True

            if remove:
                counter.removed += 1
                if outf_removed_tsv:
                    outf_removed_tsv.write(read.name + "\n")
                if outf_removed_fastq:
                    outf_removed_fastq.write(str(read) + "\n")
                continue

            if prefix:
                read.name = prefix + read.name[2:]

            if quality_offset:
                quals = numpy.array(read.get_quality_array())
                quals += quality_offset
                quals[quals < 0] = 0
                quals += 33
                # pysam fastq is read-only, so fudge it:
                # Note: not outputting description
                read = "@{}\n{}\n+\n{}".format(
                    read.name,
                    read.sequence,
                    "".join([chr(x) for x in quals]))

            counter.output += 1
            options.stdout.write(str(read) + "\n")

    if outf_removed_tsv:
        outf_removed_tsv.close()

    if outf_removed_fastq:
        outf_removed_fastq.close()

    if options.output_stats_tsv:
        with iotools.open_file(options.output_stats_tsv, "w") as outf:
            outf.write(counter.asTable(as_rows=False) + "\n")

    return counter
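# A minimal, dependency-free sketch of the quality re-encoding step above:
# shift Phred scores by an offset, clamp at zero, then render them as an
# ASCII (Phred+33) quality string. The function name is illustrative.
def _shift_qualities(quals, offset):
    """quals: iterable of integer Phred scores; returns a Phred+33 string."""
    return "".join(chr(max(0, q + offset) + 33) for q in quals)

# _shift_qualities([30, 2, 40], -5) -> ":!D"
# (25 -> ':', -3 clamps to 0 -> '!', 35 -> 'D')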
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-filter-tsv", dest="input_filter_tsv",
                      type="string",
                      help="list with identifiers to remove. "
                      "[%default]")

    parser.add_option("--set-prefix", dest="set_prefix", type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum alignment length [%default]")

    parser.add_option("--method", dest="methods", action="append",
                      choices=("shift-region", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        input_maf_file=None,
        input_filter_tsv=None,
        set_prefix=None,
        min_length=0,
        methods=[],
    )

    (options, args) = E.start(parser, argv)

    if options.input_filter_tsv:
        with iotools.open_file(options.input_filter_tsv) as inf:
            skip_id = set([x[:-1] for x in inf])
    else:
        skip_id = False

    counter = E.Counter()

    if options.set_prefix:
        prefix = "s {}".format(options.set_prefix)
    else:
        prefix = None

    for block in iterate_maf_blocks(options.stdin):
        counter.blocks_input += 1

        if skip_id:
            if block[2].startswith("s "):
                id = re.match(r"s (\S+)", block[2]).groups()[0]
                if id in skip_id:
                    counter.blocks_skipped_id += 1
                    continue

        if options.min_length:
            if block[2].startswith("s "):
                id, pos, length = re.match(
                    r"s (\S+)\s+(\d+)\s+(\d+)", block[2]).groups()
                if int(length) <= options.min_length:
                    counter.blocks_skipped_length += 1
                    continue

        if prefix:
            block[2] = prefix + block[2][4:]

        if block[2].startswith("s "):
            header, ali1, ali2, qual = parse_block(block)

            if "shift-region" in options.methods:
                rows = []
                contig, start, end = parse_region_string(ali1.src)
                ali1 = ali1._replace(src=contig, start=start + ali1.start)
                rows.append(list(map(str, ali1)))
                rows.append(list(map(str, ali2)))
                if qual:
                    rows.append(list(map(str, qual)))

                lines = [header]
                lines.append(format_tabular(rows, "llrrrrl"))
                lines.append("\n")
                block = lines

        counter.blocks_output += 1
        options.stdout.write("".join(block))

    E.info(counter)
    E.stop()
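# parse_region_string() is called above but not shown here. A plausible
# sketch of what it does, assuming region identifiers of the form
# "contig:start-end" (the exact format in the source data is an assumption):
def parse_region_string_sketch(src):
    contig, coords = src.split(":")
    start, end = coords.split("-")
    return contig, int(start), int(end)

# parse_region_string_sketch("chr1:1000-2000") -> ("chr1", 1000, 2000)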
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-s", "--summarise", dest="summarise", type="choice", choices=("level-counts", "taxa-counts", "individual"), help="summarise the taxa counts - no. phyla etc") parser.add_option("--output-map", dest="output_map", action="store_true", help="ouput map of taxonomy") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if options.output_map: found = [] options.stdout.write("""Domain\t \ kingdom\t \ phylum\t \ class\t \ order\t \ family\t \ genus\t \ species\n""") # only output the mapping file - do not continue # summarise regardless of the specified options for lca in LCA.iterate(options.stdin): # if bacteria or archaea the kingdom will # be the domain if lca.domain == "Bacteria" or lca.domain == "Archaea": kingdom = lca.domain else: kingdom = lca.kingdom hierarchy = [ lca.domain, kingdom, lca.phylum, lca._class, lca.order, lca.family, lca.genus, lca.species ] if hierarchy in found: continue else: found.append(hierarchy) options.stdout.write("\t".join(hierarchy) + "\n") return if options.summarise == "level-counts": level_counts = collections.defaultdict(set) total = 0 nreads_domain = 0 nreads_kingdom = 0 nreads_kingdom_plus = 0 nreads_phylum = 0 nreads_phylum_plus = 0 nreads_class = 0 nreads_class_plus = 0 nreads_order = 0 nreads_order_plus = 0 nreads_family = 0 nreads_family_plus = 0 nreads_genus = 0 nreads_genus_plus = 0 nreads_species = 0 nreads_species_plus = 0 nreads_subspecies = 0 nreads_subspecies_plus = 0 c = E.Counter() for lca in LCA.iterate(options.stdin): total += 1 if lca.domain != "NA": nreads_domain += 1 level_counts["domain"].add(lca.domain) else: c.kingdom_unmapped += 1 if lca.kingdom != "NA": nreads_kingdom += 1 level_counts["kingdom"].add(lca.kingdom) else: c.kingdom_unmapped += 1 if lca.kingdom_plus != "NA": nreads_kingdom_plus += 1 level_counts["kingdom+"].add(lca.kingdom_plus) else: c.kingdom_plus_unmapped += 1 if lca.phylum != "NA": nreads_phylum += 1 level_counts["phylum"].add(lca.phylum) else: c.phylum_unmapped += 1 if lca.phylum_plus != "NA": nreads_phylum_plus += 1 level_counts["phylum+"].add(lca.phylum_plus) else: c.phylum_plus_unmapped += 1 if lca._class != "NA": nreads_class += 1 level_counts["class"].add(lca._class) else: c.class_unmapped += 1 if lca._class_plus != "NA": nreads_class_plus += 1 level_counts["class+"].add(lca._class_plus) else: c.class_plus_unmapped += 1 if lca.order != "NA": nreads_order += 1 level_counts["order"].add(lca.order) else: c.order_unmapped += 1 if lca.order_plus != "NA": nreads_order_plus += 1 level_counts["order+"].add(lca.order_plus) else: c.order_plus_unmapped += 1 if lca.family != "NA": nreads_family += 1 level_counts["family"].add(lca.family) else: c.family_unmapped += 1 if lca.family != "NA": nreads_family_plus == 1 level_counts["family+"].add(lca.family_plus) else: c.family_plus_unmapped += 1 if lca.genus != "NA": nreads_genus += 1 level_counts["genus"].add(lca.genus) else: c.genus_unmapped += 1 if lca.genus_plus != "NA": nreads_genus_plus == 1 level_counts["genus+"].add(lca.genus_plus) else: c.genus_plus_unmapped += 1 if lca.species != "NA": nreads_species += 1 level_counts["species"].add(lca.species) else: c.species_unmapped += 1 if lca.species_plus != "NA": nreads_species_plus += 1 
level_counts["species+"].add(lca.species_plus) else: c.species_plus_unmapped += 1 # removed subspecies mapping for the time # being # if lca.subspecies != "NA": # nreads_subspecies += 1 # level_counts["subspecies"].add(lca.subspecies) # else: # c.subspecies_unmapped += 1 # if lca.subspecies_plus != "NA": # nreads_subspecies_plus += 1 # level_counts["subspecies+"].add(lca.subspecies_plus) # else: # c.subspecies_plus_unmapped += 1 options.stdout.write("\t".join([ "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+", "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+", "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom", "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass", "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily", "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies", "nseqspecies+" ]) + "\n") options.stdout.write("\t".join( map(str, [ len(level_counts["domain"]), len(level_counts["kingdom"]), len(level_counts["kingdom+"]), len(level_counts["phylum"]), len(level_counts["phylum+"]), len(level_counts["class"]), len(level_counts["class+"]), len(level_counts["order"]), len(level_counts["order+"]), len(level_counts["family"]), len(level_counts["family+"]), len(level_counts["genus"]), len(level_counts["genus+"]), len(level_counts["species"]), len(level_counts["species+"]), nreads_domain, nreads_kingdom, nreads_phylum, nreads_phylum_plus, nreads_class, nreads_class_plus, nreads_order, nreads_order_plus, nreads_family, nreads_family_plus, nreads_genus, nreads_genus_plus, nreads_species, nreads_species_plus ])) + "\n") elif options.summarise == "taxa-counts": unmapped = collections.defaultdict(int) total = 0 taxa_counts = { "domain": collections.defaultdict(int), "kingdom": collections.defaultdict(int), "kingdom+": collections.defaultdict(int), "phylum": collections.defaultdict(int), "phylum+": collections.defaultdict(int), "class": collections.defaultdict(int), "class+": collections.defaultdict(int), "order": collections.defaultdict(int), "order+": collections.defaultdict(int), "family": collections.defaultdict(int), "family+": collections.defaultdict(int), "genus": collections.defaultdict(int), "genus+": collections.defaultdict(int), "species": collections.defaultdict(int), "species+": collections.defaultdict(int) } c = E.Counter() for lca in LCA.iterate(options.stdin): total += 1 if lca.domain != "NA": taxa_counts["domain"][lca.domain] += 1 else: c.kingdom_unmapped += 1 unmapped["domain"] += 1 if lca.kingdom != "NA": taxa_counts["kingdom"][lca.kingdom] += 1 else: c.kingdom_unmapped += 1 unmapped["kingdom"] += 1 if lca.kingdom_plus != "NA": taxa_counts["kingdom+"][lca.kingdom_plus] += 1 else: c.kingdom_plus_unmapped += 1 unmapped["kingdom+"] += 1 if lca.phylum != "NA": taxa_counts["phylum"][lca.phylum] += 1 else: c.phylum_unmapped += 1 unmapped["phylum"] += 1 if lca.phylum_plus != "NA": taxa_counts["phylum+"][lca.phylum_plus] += 1 else: c.phylum_plus_unmapped += 1 unmapped["phylum+"] += 1 if lca._class != "NA": taxa_counts["class"][lca._class] += 1 else: c.class_unmapped += 1 unmapped["class"] += 1 if lca._class_plus != "NA": taxa_counts["class+"][lca._class_plus] += 1 else: c.class_plus_unmapped += 1 unmapped["class+"] += 1 if lca.order != "NA": taxa_counts["order"][lca.order] += 1 else: c.order_unmapped += 1 unmapped["order"] += 1 if lca.order_plus != "NA": taxa_counts["order+"][lca.order_plus] += 1 else: c.order_plus_unmapped += 1 unmapped["order+"] += 1 if lca.family != "NA": taxa_counts["family"][lca.family] += 1 else: c.family_unmapped += 1 
unmapped["family"] += 1 if lca.family_plus != "NA": taxa_counts["family+"][lca.family_plus] += 1 else: c.family_plus_unmapped += 1 unmapped["family+"] += 1 if lca.genus != "NA": taxa_counts["genus"][lca.genus] += 1 else: c.genus_unmapped += 1 unmapped["genus"] += 1 if lca.genus_plus != "NA": taxa_counts["genus+"][lca.genus_plus] += 1 else: c.genus_plus_unmapped += 1 unmapped["genus+"] += 1 if lca.species != "NA": taxa_counts["species"][lca.species] += 1 else: c.species_unmapped += 1 unmapped["species"] += 1 if lca.species_plus != "NA": taxa_counts["species+"][lca.species_plus] += 1 else: c.species_plus_unmapped += 1 unmapped["species+"] += 1 options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n") for level, taxa_count in sorted(taxa_counts.items()): total_level = total - unmapped[level] for taxa, count in sorted(taxa_count.items()): options.stdout.write("\t".join([ level, taxa, str(count), "{:.8}".format(float(count) / total_level), "{:.8}". format(float(count) / (float(total_level) / 1000000)) ]) + "\n") E.info(c) elif options.summarise == "individual": # each read is output with its respective # taxon assignments options.stdout.write("\t".join([ "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+", "class", "class+", "order", "order+", "family", "family+", "genus", "genus+", "species", "species+" ]) + "\n") for lca in LCA.iterate(options.stdin): options.stdout.write("\t".join([ lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus, lca.phylum, lca.phylum_plus, lca._class, lca._class_plus, lca.order, lca.order_plus, lca.family, lca.family_plus, lca.genus, lca.genus_plus, lca.species, lca.species_plus ]) + "\n") # write footer and output benchmark information. E.stop()
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # Go to list for py3 compatibility, patch
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig],
                        intervals,
                        is_positive,
                        code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]

            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig],
                        UTR5,
                        is_positive,
                        "u")

            addIntrons(annotations[contig],
                       UTR5,
                       is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig],
                        UTR3,
                        is_positive,
                        "v")

            addIntrons(annotations[contig],
                       UTR3,
                       is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig],
                       cds,
                       is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPs across all species are aggregated into a single
    file to avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the SNPs
    would have been lost. Hence I map to ensembl protein
    identifiers. Note that the sequence file is then to be
    submitted to POLYPHEN as well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers
    to the SNP within a peptide sequence while the locus_id refers
    to the genomic location. If there are alternative transcripts
    overlapping a SNP, the same SNP will get two snp_ids, but the
    same locus_id. As the peptide background might be different
    for the same SNP depending on the transcript, its effect
    needs to be predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(
        cc.execute("SELECT transcript_id, protein_id "
                   "FROM annotations.transcript_info "
                   "WHERE protein_id IS NOT NULL").fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write("snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\t"
                   "locus_id\tcontig\tpos\tphase\n")

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:

        table = P.toTable(infile)
        track = table[:-len("_effects")]
        E.debug(statement % locals())
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for (transcript_id, cds_start, cds_end, orig_codons,
             variant_codons, orig_na, variant_na, contig, pos) in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]

            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" %
                           (snp_id,
                            pid,
                            peptide_pos,
                            orig_codons,
                            variant_codons,
                            ))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write("snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" %
                           (snp_id,
                            track,
                            transcript_id,
                            pid,
                            peptide_pos - 1,
                            locus_ids[locus_key],
                            contig,
                            pos,
                            phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" %
           (table, len(found), len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))

    if notfound:
        E.warn("%i transcripts had SNPs that were ignored "
               "because there was no Ensembl protein identifier" %
               len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp; mv %(outfile)s.tmp %(outfile)s'''
    P.run()
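# The phase loop above finds which base within a codon differs between the
# reference and variant alleles. As a standalone sketch (name illustrative):
def _codon_phase(orig_na, variant_na):
    """Return the first offset (0-2) at which two codons differ;
    mirrors the loop above, which leaves phase at 2 if the codons match."""
    for phase in range(0, 3):
        if orig_na[phase].lower() != variant_na[phase].lower():
            break
    return phase

# _codon_phase("ATG", "ACG") -> 1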
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.set_defaults() # add common options (-h/--help, ...) and parse command line (args, unknown) = E.start(parser, argv=argv, add_output_options=True, unknowns=True) # do sth if len(unknown) == 1: fastqfile1 = unknown[0] fastqfile2 = args.output_filename_pattern % "2" elif len(unknown) == 2: fastqfile1, fastqfile2 = unknown else: fastqfile1 = args.output_filename_pattern % "1" fastqfile2 = args.output_filename_pattern % "2" # only output compressed data if not fastqfile1.endswith(".gz"): fastqfile1 += ".gz" if not fastqfile2.endswith(".gz"): fastqfile2 += ".gz" if args.stdin != sys.stdin: samfile = pysam.AlignmentFile(args.stdin.name, "rb") else: samfile = pysam.AlignmentFile("-", "rb") tmpdir = tempfile.mkdtemp() outtemp1 = os.path.join(tmpdir, "pair1.gz") outtemp2 = os.path.join(tmpdir, "pair2.gz") outstream1 = iotools.open_file(outtemp1, "w") outstream2 = iotools.open_file(outtemp2, "w") E.info('writing fastq files to temporary directory %s' % tmpdir) found1, found2 = set(), set() read1_qlen, read2_qlen = 0, 0 c = E.Counter() for read in samfile.fetch(until_eof=True): c.input += 1 if not read.is_paired: outstream1.write("\t".join((read.qname, read.seq, read.qual)) + "\n") found1.add(read.qname) if not read1_qlen: read1_qlen = read.qlen c.unpaired += 1 elif read.is_read1: outstream1.write("\t".join((read.qname, read.seq, read.qual)) + "\n") found1.add(read.qname) if not read1_qlen: read1_qlen = read.qlen c.output1 += 1 elif read.is_read2: if read.qname not in found2: outstream2.write("\t".join((read.qname, read.seq, read.qual)) + "\n") found2.add(read.qname) if not read2_qlen: read2_qlen = read.qlen c.output2 += 1 if c.unpaired == 0 and c.output1 == 0 and c.output2 == 0: E.warn("no reads were found") return sort_statement = '''gunzip < %s | sort -k1,1 | awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}' | gzip > %s''' if c.output1 == 0 and c.output2 == 0: # single end data: outstream1.close() outstream2.close() E.info("sorting fastq files") E.run(sort_statement % (outtemp1, fastqfile1)) else: # paired end data for qname in found2.difference(found1): outstream1.write("\t".join((qname, "N" * read1_qlen, "B" * read1_qlen)) + "\n") c.extra1 += 1 for qname in found1.difference(found2): outstream2.write("\t".join((qname, "N" * read2_qlen, "B" * read2_qlen)) + "\n") c.extra2 += 1 E.info("%s" % str(c)) outstream1.close() outstream2.close() E.info("sorting fastq files") E.run(sort_statement % (outtemp1, fastqfile1)) E.run(sort_statement % (outtemp2, fastqfile2)) shutil.rmtree(tmpdir) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("bedgraph", "wiggle", "bigbed", "bigwig", "bed"), help="output format [default=%default]") parser.add_option("-s", "--shift-size", dest="shift", type="int", help="shift reads by a certain amount (ChIP-Seq) " "[%default]") parser.add_option("-e", "--extend", dest="extend", type="int", help="extend reads by a certain amount " "(ChIP-Seq) [%default]") parser.add_option("-p", "--wiggle-span", dest="span", type="int", help="span of a window in wiggle tracks " "[%default]") parser.add_option("-m", "--merge-pairs", dest="merge_pairs", action="store_true", help="merge paired-ended reads into a single " "bed interval [default=%default].") parser.add_option("--scale-base", dest="scale_base", type="float", help="number of reads/pairs to scale bigwig file to. " "The default is to scale to 1M reads " "[default=%default]") parser.add_option("--scale-method", dest="scale_method", type="choice", choices=( "none", "reads", ), help="scale bigwig output. 'reads' will normalize by " "the total number reads in the bam file that are used " "to construct the bigwig file. If --merge-pairs is used " "the number of pairs output will be used for " "normalization. 'none' will not scale the bigwig file" "[default=%default]") parser.add_option("--max-insert-size", dest="max_insert_size", type="int", help="only merge if insert size less that " "# bases. 0 turns of this filter " "[default=%default].") parser.add_option("--min-insert-size", dest="min_insert_size", type="int", help="only merge paired-end reads if they are " "at least # bases apart. " "0 turns of this filter. [default=%default]") parser.set_defaults( samfile=None, output_format="wiggle", shift=0, extend=0, span=1, merge_pairs=None, min_insert_size=0, max_insert_size=0, scale_method='none', scale_base=1000000, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) >= 1: options.samfile = args[0] if len(args) == 2: options.output_filename_pattern = args[1] if not options.samfile: raise ValueError("please provide a bam file") # Read BAM file using Pysam samfile = pysam.AlignmentFile(options.samfile, "rb") # Create temporary files / folders tmpdir = tempfile.mkdtemp() E.debug("temporary files are in %s" % tmpdir) tmpfile_wig = os.path.join(tmpdir, "wig") tmpfile_sizes = os.path.join(tmpdir, "sizes") # Create dictionary of contig sizes contig_sizes = dict(list(zip(samfile.references, samfile.lengths))) # write contig sizes outfile_size = iotools.open_file(tmpfile_sizes, "w") for contig, size in sorted(contig_sizes.items()): outfile_size.write("%s\t%s\n" % (contig, size)) outfile_size.close() # Shift and extend only available for bigwig format if options.shift or options.extend: if options.output_format != "bigwig": raise ValueError( "shift and extend only available for bigwig output") # Output filename required for bigwig / bigbed computation if options.output_format == "bigwig": if not options.output_filename_pattern: raise ValueError( "please specify an output file for bigwig computation.") # Define executable to use for binary conversion if options.output_format == "bigwig": executable_name = "wigToBigWig" else: raise ValueError("unknown output format `%s`" % options.output_format) # check required executable file is in the path executable = iotools.which(executable_name) if not executable: raise OSError("could not find %s in path." % executable_name) # Open outout file outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to %s" % tmpfile_wig) else: outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to stdout") # Set up output write functions if options.output_format in ("wiggle", "bigwig"): # wiggle is one-based, so add 1, also step-size is 1, so need # to output all bases if options.span == 1: outf = lambda outfile, contig, start, end, val: \ outfile.write( "".join(["%i\t%i\n" % (x, val) for x in range(start + 1, end + 1)])) else: outf = SpanWriter(options.span) elif options.output_format == "bedgraph": # bed is 0-based, open-closed outf = lambda outfile, contig, start, end, val: \ outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val)) # initialise counters ninput, nskipped, ncontigs = 0, 0, 0 # set output file name output_filename_pattern = options.output_filename_pattern if output_filename_pattern: output_filename = os.path.abspath(output_filename_pattern) # shift and extend or merge pairs. Output temporay bed file if options.shift > 0 or options.extend > 0 or options.merge_pairs: # Workflow 1: convert to bed intervals and use bedtools # genomecov to build a coverage file. 
# Convert to bigwig with UCSC tools bedGraph2BigWig if options.merge_pairs: # merge pairs using bam2bed E.info("merging pairs to temporary file") counter = merge_pairs(samfile, outfile, min_insert_size=options.min_insert_size, max_insert_size=options.max_insert_size, bed_format=3) E.info("merging results: {}".format(counter)) if counter.output == 0: raise ValueError("no pairs output after merging") else: # create bed file with shifted/extended tags shift, extend = options.shift, options.extend shift_extend = shift + extend counter = E.Counter() for contig in samfile.references: E.debug("output for %s" % contig) lcontig = contig_sizes[contig] for read in samfile.fetch(contig): pos = read.pos if read.is_reverse: start = max(0, read.pos + read.alen - shift_extend) else: start = max(0, read.pos + shift) # intervals extending beyond contig are removed if start >= lcontig: continue end = min(lcontig, start + extend) outfile.write("%s\t%i\t%i\n" % (contig, start, end)) counter.output += 1 outfile.close() if options.scale_method == "reads": scale_factor = float(options.scale_base) / counter.output E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" % (options.scale_method, counter.output, scale_factor)) scale = "-scale %f" % scale_factor else: scale = "" # Convert bed file to coverage file (bedgraph) tmpfile_bed = os.path.join(tmpdir, "bed") E.info("computing coverage") # calculate coverage - format is bedgraph statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals() E.run(statement) # Convert bedgraph to bigwig E.info("converting to bigwig") tmpfile_sorted = os.path.join(tmpdir, "sorted") statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;" "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s " "%(output_filename_pattern)s" % locals()) E.run(statement) else: # Workflow 2: use pysam column iterator to build a # wig file. Then convert to bigwig of bedgraph file # with UCSC tools. def column_iter(iterator): start = None end = 0 n = None for t in iterator: if t.pos - end > 1 or n != t.n: if start is not None: yield start, end, n start = t.pos end = t.pos n = t.n end = t.pos yield start, end, n if options.scale_method != "none": raise NotImplementedError( "scaling not implemented for pileup method") # Bedgraph track definition if options.output_format == "bedgraph": outfile.write("track type=bedGraph\n") for contig in samfile.references: # if contig != "chrX": continue E.debug("output for %s" % contig) lcontig = contig_sizes[contig] # Write wiggle header if options.output_format in ("wiggle", "bigwig"): outfile.write("variableStep chrom=%s span=%i\n" % (contig, options.span)) # Generate pileup per contig using pysam and iterate over columns for start, end, val in column_iter(samfile.pileup(contig)): # patch: there was a problem with bam files and reads # overextending at the end. These are usually Ns, but # need to check as otherwise wigToBigWig fails. 
if lcontig <= end: E.warn("read extending beyond contig: %s: %i > %i" % (contig, end, lcontig)) end = lcontig if start >= end: continue if val > 0: outf(outfile, contig, start, end, val) ncontigs += 1 # Close output file if type(outf) == type(SpanWriter): outf.flush(outfile) else: outfile.flush() E.info("finished output") # Report counters E.info("ninput=%i, ncontigs=%i, nskipped=%i" % (ninput, ncontigs, nskipped)) # Convert to binary formats if options.output_format == "bigwig": outfile.close() E.info("starting %s conversion" % executable) try: retcode = subprocess.call(" ".join( (executable, tmpfile_wig, tmpfile_sizes, output_filename_pattern)), shell=True) if retcode != 0: E.warn("%s terminated with signal: %i" % (executable, -retcode)) return -retcode except OSError as msg: E.warn("Error while executing bigwig: %s" % msg) return 1 E.info("finished bigwig conversion") else: with open(tmpfile_wig) as inf: sys.stdout.write(inf.read()) # Cleanup temp files shutil.rmtree(tmpdir) E.stop()
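# column_iter above merges consecutive pileup columns with equal depth into
# runs of (start, end, depth). A toy illustration with a stand-in column
# type (the names here are illustrative, not pysam's):
import collections

Column = collections.namedtuple("Column", "pos n")

def merge_columns(columns):
    start, end, n = None, 0, None
    for t in columns:
        if t.pos - end > 1 or n != t.n:
            if start is not None:
                yield start, end, n
            start, end, n = t.pos, t.pos, t.n
        end = t.pos
    yield start, end, n

# cols = [Column(5, 2), Column(6, 2), Column(7, 3), Column(20, 1)]
# list(merge_columns(cols)) -> [(5, 6, 2), (7, 7, 3), (20, 20, 1)]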