def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))
    outs = IOTools.openFile(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.openFile(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))
    outs.close()
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
            IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
def buildForegroundSets(infiles, outfile):
    '''
    build multiset of genes that are differentially expressed
    based on cluster assignments
    '''
    clusters, probe2gene_file = infiles

    # read probe 2 gene map
    probe2gene = {}
    probe2gene_file = IOTools.openFile(probe2gene_file)
    for line in probe2gene_file.readlines():
        data = line[:-1].split("\t")
        probe, gene = [x.replace('"', '') for x in data]
        probe2gene[probe] = gene

    # read probe 2 cluster map
    probe2cluster = {}
    clusters = IOTools.openFile(clusters)
    clusters.readline()
    for line in clusters.readlines():
        data = line[:-1].split("\t")
        probe, cluster = data
        probe2cluster[probe] = cluster

    # output genes in each cluster
    for c in set(probe2cluster.values()):
        outname = "pathways.dir/C%s.foreground" % c
        outf = IOTools.openFile(outname, "w")
        for probe, cluster in probe2cluster.iteritems():
            if cluster == c:
                outf.write("%s\n" % probe2gene[probe])
            else:
                continue
        outf.close()
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identified in previous studies
    (eBio_ids) for the mutated genes in the vcfs'''

    genes = set()

    n = 0
    for vcf in vcfs:
        if n > 0:
            break
        else:
            n += 1
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.update((entry.split("=")[1],))

    eBio_ids = IOTools.openFile(eBio_ids, "r")

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    for line in eBio_ids:
        tissue, study, table = line.strip().split("\t")

        for gene in genes:
            url = ("http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                   "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                   "gene_list=%(gene)s" % locals())
            print url
            df = pd.io.parsers.read_csv(url, comment="#", sep="\t",
                                        header=False, index_col=0)

            # check dataframe contains data!
            if df.shape[0] != 0:
                tissue_counts[tissue][gene]["total"] += df.shape[1] - 2
                tissue_counts[tissue][gene]["mutations"] += int(df.count(1)) - 1

    out = IOTools.openFile(outfile, "w")

    tissues = tissue_counts.keys()
    out.write("gene\t%s\n" % "\t".join(
        ["%s_frequency" % x.replace(" ", "_") for x in tissues]))

    for gene in genes:
        freq_values = []
        for tissue in tissues:
            total = tissue_counts[tissue][gene]["total"]
            mutations = tissue_counts[tissue][gene]["mutations"]
            print "total: ", total, "mutations: ", mutations
            freq_values.append(np.divide(float(mutations), total))

        out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))

    out.close()
def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))

    out.close()
    gfile.close()
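# Illustrative usage sketch for filterGTF (not part of the original code):
# the filenames and attribute/value pair are hypothetical; the filter string
# follows the "column=value1+value2" syntax parsed above.
def example_filter_protein_coding(gtf="geneset.gtf.gz",
                                  tempout="geneset_filtered.gtf"):
    # keep only entries whose gene_biotype attribute is protein_coding
    filterGTF(gtf, "gene_biotype=protein_coding", tempout)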
def buildFastQCSummaryStatus(infiles, outfile):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")

    first = True
    for infile in infiles:
        track = P.snip(infile, ".fastqc")

        filename = os.path.join(
            PARAMS["exportdir"], "fastqc", track + "*_fastqc",
            "fastqc_data.txt")

        for fn in glob.glob(filename):
            prefix = os.path.basename(os.path.dirname(fn))
            results = []

            names, stats = [], []
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats.append(status)
                names.append(name)

            if first:
                outf.write("track\tfilename\t%s\n" % "\t".join(names))
                first = False

            outf.write("%s\t%s\t%s\n" % (track, os.path.dirname(fn),
                                         "\t".join(stats)))
    outf.close()
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename
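# Minimal usage sketch for chunk_iterator_lines (not part of the original
# code); the input path, output directory and chunk size are hypothetical.
def example_split_by_lines(infile="input.tsv.gz", prefix="chunks.dir"):
    # collect the chunk filenames written into prefix/, 10000 data lines
    # each, propagating the header line into every chunk
    with IOTools.openFile(infile) as inf:
        return list(chunk_iterator_lines(inf, [10000], prefix,
                                         use_header=True))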
def summarizeAllProcessing(infiles, outfile):
    '''summarize processing information.'''

    outf = IOTools.openFile(outfile, "w")
    data = []
    for infile in infiles:
        inf = IOTools.openFile(infile)
        for line in inf:
            track, step, pair, ninput, noutput = line[:-1].split("\t")
            if track == "track":
                continue
            data.append((track, step, pair, ninput, noutput))

    # sort by track, pair, input
    data.sort(key=lambda x: (x[0], x[2], -int(x[3])))
    first = True
    for key, v in itertools.groupby(data, lambda x: (x[0], x[2])):
        vals = list(v)
        track, pair = key
        ninput = int(vals[0][3])
        outputs = [int(x[4]) for x in vals]
        if first:
            outf.write("track\tpair\tninput\t%s\t%s\t%s\t%s\n" %
                       ("\t".join([x[1] for x in vals]),
                        "noutput",
                        "\t".join(["percent_%s" % x[1] for x in vals]),
                        "percent_output"))
            first = False
        outf.write("%s\t%s\t%i\t%s\t%i\t%s\t%s\n" %
                   (track, pair, ninput,
                    "\t".join(map(str, outputs)),
                    outputs[-1],
                    "\t".join(["%5.2f" % (100.0 * x / ninput) for x in outputs]),
                    "%5.2f" % (100.0 * outputs[-1] / ninput)))
    outf.close()
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")

    names = set()
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir,
                                track + "*_fastqc",
                                "fastqc_data.txt")

        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats[name] = status

            results.append((track, fn, stats))
            names.update(stats.keys())

    names = list(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))

    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" % (track, os.path.dirname(fn),
                                     "\t".join(stats[x] for x in names)))
    outf.close()
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue

        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file")
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in itertools.izip_longest(Fastq.iterate(fastq1),
                                         Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are files of equal length?")
            except PairedReadError, e:
                raise PairedReadError(e), None, sys.exc_info()[2]
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",
                f.title.replace(" ", "_").replace("/", "_") + ".input").replace('"', ''),
                "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" %
                               (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
def copy(src, dst, name):

    # remove "template" and the pipeline type from file/directory
    # names.
    fn_dest = os.path.join(
        destination_dir,
        dst,
        rx_type.sub("", rx_file.sub(name, src)))

    fn_src = os.path.join(srcdir,
                          "pipeline_template_data", src)

    E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
            (fn_src, fn_dest, src, dst))

    if os.path.exists(fn_dest) and not options.force:
        raise OSError(
            "file %s already exists - not overwriting." % fn_dest)

    if fn_src.endswith(".png"):
        shutil.copyfile(fn_src, fn_dest)
    else:
        with IOTools.openFile(fn_dest, "w") as outfile:
            with IOTools.openFile(fn_src) as infile:
                for line in infile:
                    outfile.write(rx_reportdir.sub(reportdir,
                                                   rx_template.sub(name, line)))
def mergeAdaptorFasta(infiles, outfile):
    '''
    Merge fasta files of adapter contamination,
    include reverse complement, remove duplicate sequences
    '''

    fasta_dict = {}
    for each in infiles:
        with IOTools.openFile(each, "r") as infle:
            for line in infle:
                if line[0] == '>':
                    adapt = line.lstrip(">").rstrip("\n")
                    fasta_dict[adapt] = set()
                    fasta_dict[adapt + "_R"] = set()
                else:
                    seq = line.rstrip("\n")
                    rev_seq = reverseComplement(seq)
                    fasta_dict[adapt].add(seq)
                    fasta_dict[adapt + "_R"].add(rev_seq)

    # if there are no adapters to remove break the pipeline here
    if not len(fasta_dict):
        raise AttributeError("There are no overrepresented sequences in "
                             "these fastq files. Please turn off this "
                             "feature and re-run the pipeline")
    else:
        pass

    with IOTools.openFile(outfile, "w") as outfle:
        for key, value in fasta_dict.items():
            outfle.write(">%s\n%s\n" % (key, list(value)[0]))
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(map(str, (
                tss.name, tss.strand,
                genome_start, genome_end,
                relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def CleanVariantTables(genes, variants, cols, outfile):
    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)

    vp1 = copy.copy(variants[['CHROM', 'POS', 'QUAL', 'ID', 'REF1', 'ALT',
                              'GT']])
    alleles = vp1['REF1'].str.cat(
        vp1['ALT'].str.strip(), sep=",").str.split(",")
    vp1['GT'] = vp1['GT'].str.replace(".", "0")
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values

    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1

    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(['REF1', 'ALT', 'GT'], 1)
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [line.strip()
                 for line in IOTools.openFile(genes[0]).readlines()]:
        cp = []
        with IOTools.openFile(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))

    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(1, 1)
    df.columns = ['gene', 'CHROM', 'POS']

    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
def FilterFreqCols(infile, thresh, fcols):
    '''
    Returns a set of line indices indicating lines where either of the
    alleles called have a frequency of less than thresh in all of the
    columns specified in fcols.
    No information - assigned allele frequency of -1.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = IOTools.openFile(infile).readline().strip().split("\t")

    # store allele frequency columns
    AFdict = dict()
    # store low frequency indices
    nD = dict()

    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with IOTools.openFile(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # "." or "NA" use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in database the column shows "."
                    # but the site may still have been called as multi
                    # allelic - use -1 for all frequencies in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                    AFS.append((AF1, AF2))
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        match = iterator.next()
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId,
                                        match.mQueryFrom,
                                        match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId,
                                         match.mSbjctFrom,
                                         match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
def getProbeFragments(probe_bed, digest_bed, outfile, lookup_out):

    # First find the length of the restriction enzyme cut, required to
    # obtain the start and end coordinates from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True

    length_RE_cut = 0
    last_bed = None

    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):

        if(first_iteration):
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if(bed_digest.contig == last_bed.contig):
                length_RE_cut = bed_digest.start - last_bed.end
                break

        last_bed = bed_digest

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
            IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]

            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            bed.start = frag.start - length_RE_cut + 1
            bed.end = frag.end + length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
        Output filename
    """

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with IOTools.openFile(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.openFile(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split("\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.openFile(outfile, "w")
    for gene_id, in_goids in gene2goids.iteritems():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join(
                (goid2type.get(goid, ""), gene_id, goid,
                 goid2description.get(goid, ""), "NA")) + "\n")
            c.associations += 1
    outf.close()

    E.info("%s" % str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="quality score format to assume if ambiguous "
                      "[default=%default].")

    parser.add_option("--pattern", dest="pattern", type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.openFile(options.pattern % "qual", "w")

    if options.change_format:
        iter = Fastq.iterate_convert(options.stdin,
                                     format=options.change_format,
                                     guess=options.guess_format)
    else:
        iter = Fastq.iterate(options.stdin)

    for record in iter:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def outputAllWindows(infile, outfile):
    '''output all Windows as a bed file with the l2fold change
    as a score.
    '''
    outf = IOTools.openFile(outfile, "w")
    for line in IOTools.iterate(IOTools.openFile(infile)):
        outf.write("\t".join(
            (line.contig, line.start, line.end,
             "%6.4f" % float(line.l2fold))) + "\n")
    outf.close()
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name

    statement = '''sort -k1,1 -k2,2n < %(tmpfname)s | uniq
                   | gzip > %(outfile_bed)s'''
    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.iteritems():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only "
                      "the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue
        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
def calculateSplicingIndex(bamfile, gtffile, outfile):

    bamfile = pysam.AlignmentFile(bamfile)

    counts = E.Counter()

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(gtffile))):

        introns = GTF.toIntronIntervals(transcript)
        E.debug("Gene %s (%s), Transcript: %s, %i introns" %
                (transcript[0].gene_id,
                 transcript[0].contig,
                 transcript[0].transcript_id,
                 len(introns)))

        for intron in introns:
            reads = bamfile.fetch(
                reference=transcript[0].contig,
                start=intron[0], end=intron[1])

            for read in reads:
                if 'N' in read.cigarstring:
                    blocks = read.get_blocks()
                    starts, ends = zip(*blocks)
                    if intron[0] in ends and intron[1] in starts:
                        counts["Exon_Exon"] += 1
                    else:
                        counts["spliced_uncounted"] += 1
                elif (read.reference_start <= intron[0] - 3
                      and read.reference_end >= intron[0] + 3):
                    if transcript[0].strand == "+":
                        counts["Exon_Intron"] += 1
                    else:
                        counts["Intron_Exon"] += 1
                elif (read.reference_start <= intron[1] - 3
                      and read.reference_end >= intron[1] + 3):
                    if transcript[0].strand == "+":
                        counts["Intron_Exon"] += 1
                    else:
                        counts["Exon_Intron"] += 1
                else:
                    counts["unspliced_uncounted"] += 1

    E.debug("Done, counts are: " + str(counts))

    header = ["Exon_Exon",
              "Exon_Intron",
              "Intron_Exon",
              "spliced_uncounted",
              "unspliced_uncounted"]

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")
        outf.write("\t".join(map(str, [counts[col] for col in header])) + "\n")
def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if not contig == "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    assert len(args) == 3, "expected three command line arguments"

    samfile = pysam.Samfile(args[0], "rb")
    outstream1 = IOTools.openFile(args[1], "w")
    outstream2 = IOTools.openFile(args[2], "w")

    c = E.Counter()
    for read in samfile.fetch():
        c.input += 1
        if read.mate_is_unmapped:
            c.is_unmapped += 1
            if read.is_read1:
                seq1, seq2 = read.seq, "N" * read.qlen
                qual1, qual2 = read.qual, "B" * read.qlen
            else:
                seq1, seq2 = "N" * read.qlen, read.seq
                qual1, qual2 = "B" * read.qlen, read.qual
        else:
            if read.is_read2:
                continue
            try:
                mate = samfile.mate(read)
                c.found += 1
            except ValueError, msg:
                mate = None
                c.failed += 1

            if mate:
                seq1, seq2 = read.seq, mate.seq
                qual1, qual2 = read.qual, mate.qual
            else:
                seq1, seq2 = read.seq, "N" * read.qlen
                qual1, qual2 = read.qual, "B" * read.qlen

        c.output += 1
        outstream1.write("@%s\n%s\n+\n%s\n" % (read.qname, seq1, qual1))
        outstream2.write("@%s\n%s\n+\n%s\n" % (read.qname, seq2, qual2))
def getPathwayGenes(infiles, outfile):
    '''
    get genes that are associated with diff pathways
    '''
    pathway_file, clusters_file, geneset_file, probe2gene_file = \
        [infiles[0]] + infiles[1]
    pathways = IOTools.openFile(pathway_file)

    # get which cluster it was
    cluster = os.path.basename(pathway_file)[1]

    # get genes associated with cluster
    probes = [x[:-1].split("\t")[0]
              for x in IOTools.openFile(clusters_file)
              if x[:-1].split("\t")[1] == cluster]

    # header
    pathways.readline()

    # get pathways
    sig_pathways = [x.split("\t")[11]
                    for x in pathways.readlines()
                    if x.split("\t")[0] == "+"]

    # probe2gene map
    probe2gene_file = IOTools.openFile(probe2gene_file)
    probe2gene = {}
    for line in probe2gene_file:
        data = line[:-1].split("\t")
        probe, gene = data
        probe = probe.replace('"', '')
        gene = gene.replace('"', '')
        if probe in probes:
            probe2gene[probe] = gene
        else:
            continue

    # pathway2genes
    pathways_geneset = IOTools.openFile(geneset_file)
    pathway2genes = collections.defaultdict(set)
    for line in pathways_geneset.readlines():
        data = line[:-1].split("\t")
        pathway, gene = data[2], data[1]
        if pathway in sig_pathways:
            if gene in probe2gene.values():
                pathway2genes[pathway].add(gene)
            else:
                continue
        else:
            continue

    # output
    outf = open(outfile, "w")
    outf.write("pathway\tgene\n")
    for pathway, genes in pathway2genes.iteritems():
        for gene in genes:
            outf.write("%s\t%s\n" % (pathway, gene))
    outf.close()
def getBarcodeCG(table, outfile):
    '''
    Annotate barcode use statistics with %GC
    '''
    statement = " SELECT * FROM %(table)s" % locals()

    umi_stats = PUtils.fetch_DataFrame(statement)

    def _GC(x):
        # fraction of G and C bases in the sequence
        return float(x.count("G") + x.count("C")) / len(x)

    barcode_gc = umi_stats.Barcode.apply(_GC)
    sample_gc = umi_stats.Sample.apply(_GC)
    umi_gc = umi_stats.UMI.apply(_GC)

    gc_stats = pandas.DataFrame({"Barcode": umi_stats.Barcode,
                                 "barcode_gc": barcode_gc,
                                 "sample_gc": sample_gc,
                                 "umi_gc": umi_gc})

    gc_stats.to_csv(IOTools.openFile(outfile, "w"), sep="\t", index=False)
def __init__(self, filename, *args, **kwargs):

    assert filename is not None, \
        "please supply filename for CounterOverlap"

    Counter.__init__(self, *args, **kwargs)

    self.filename = filename

    E.info("reading intervals from %s" % self.filename)

    self.index = Bed.readAndIndex(IOTools.openFile(self.filename, "r"),
                                  per_track=True)

    E.info("read intervals for %s tracks" % len(self.index))

    self.tracks = list(self.index.keys())
    self.headers = []
    for track in self.tracks:
        self.headers.extend(["%s_nover" % track, "%s_bases" % track])
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(infile))):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    inf = temp.name
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   --header=transcript_id,gene_id,class
                   < %(inf)s > %(outfile)s'''
    P.run()
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
def collectFastQCSections(infiles, section):
    '''iterate over all fastqc files and extract a particular section.'''
    results = []
    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        filename = os.path.join(
            PARAMS["exportdir"], "fastqc", track + "*_fastqc",
            "fastqc_data.txt")
        for fn in glob.glob(filename):
            prefix = os.path.basename(os.path.dirname(fn))
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                if name == section:
                    results.append((track, status, header, data))

    return results
def getPeakShiftFromZinba(infile):
    '''get peak shift for filename infile (.zinba output file).

    returns None if no shift found
    '''

    shift = None

    # search for
    # $offset
    # [1] 125
    with IOTools.openFile(infile, "r") as ins:
        lines = ins.readlines()
        for i, line in enumerate(lines):
            if line.startswith("$offset"):
                shift = int(lines[i + 1].split()[1])
                break

    return shift
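# Illustrative wrapper around getPeakShiftFromZinba (not part of the original
# code); the filename is hypothetical.
def example_report_zinba_shift(infile="sample.zinba"):
    shift = getPeakShiftFromZinba(infile)
    if shift is None:
        E.warn("no peak shift found in %s" % infile)
    return shift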
def buildMatrixFromTables(infiles, column, column_header=0,
                          dtype=numpy.float, default=None):
    '''build a matrix from a column called *column* in a series of input files.

    The column given by *column_header* is taken as the name of the row.

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''

    lists = []
    for infile in infiles:
        data = pandas.read_table(IOTools.openFile(infile))
        lists.append(list(zip(list(data[column_header]),
                              list(data[column]))))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
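# Illustrative call to buildMatrixFromTables (not part of the original code);
# the filenames and column names are hypothetical placeholders.
def example_build_fpkm_matrix(infiles=("a.tsv.gz", "b.tsv.gz")):
    # one column of FPKM values per input file, rows named by gene_id
    return buildMatrixFromTables(infiles,
                                 column="FPKM",
                                 column_header="gene_id")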
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)

    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=open(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
def buildExperimentReadQuality(infiles, outfile, datadir):
    """build per-experiment read quality summary.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    outfile : list
        Output filename in :term:`tsv` format.
    datadir : string
        Location of actual Fastqc output to be parsed.
    """

    data = collectFastQCSections(infiles,
                                 "Per sequence quality scores",
                                 datadir)
    first = True

    if len(data) == 0:
        raise ValueError("received no data")

    for track, status, header, rows in data:
        T = track.replace("-", "_").replace(".", "_")
        rows = [list(map(float, x.split("\t"))) for x in rows]
        header = header.split("\t")
        if first:
            first = False
            df_out = pd.DataFrame(rows)
            df_out.columns = header
            df_out.rename(columns={"Count": T}, inplace=True)
        else:
            df = pd.DataFrame(rows)
            df.columns = header
            df.rename(columns={"Count": T}, inplace=True)
            df_out = df_out.merge(df, how="outer", on="Quality", sort=True)

    # SLV: is this really required?
    # df_out = pd.DataFrame(df_out.sum(axis=1))

    df_out.to_csv(IOTools.openFile(outfile, "w"), sep="\t", index=False)
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''
    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(["no_transcripts",
                          "no_genes",
                          "no_exons_per_transcript",
                          "no_exons_per_gene",
                          "no_single_exon_transcripts",
                          "no_multi_exon_transcripts",
                          "no_single_exon_genes",
                          "no_multi_exon_genes"]) + "\n")
    outf.write("\t".join(map(str, [
        PipelineLncRNA.CounterTranscripts(tmpf).count(),
        PipelineLncRNA.CounterGenes(tmpf).count(),
        PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
        PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
        PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
        PipelineLncRNA.CounterMultiExonGenes(tmpf).count()])))

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
def readLarge(self, fname=None):
    """ read a potentially huge file.
    """
    try:
        # get stderr, allowing for case where it's very large
        tmp = IOTools.openFile(fname, 'rb')
        s = ''
        buffsize = 1048576
        try:
            while True:
                more = tmp.read(buffsize)
                if len(more) > 0:
                    s += more
                else:
                    break
        except OverflowError:
            pass
        tmp.close()
    except Exception as e:
        stop_err('Read Large Exception : %s' % str(e))
    return s
def readChunk(lines, chunk):
    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.getTempFile(".")
    try:
        motif, part = re.match(":: motif = (\S+) - (\S+) ::",
                               lines[chunks[chunk]]).groups()
    except AttributeError:
        raise ValueError(
            "parsing error in line '%s'" % lines[chunks[chunk]])

    E.info("reading %s - %s" % (motif, part))

    tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
    tmpfile2.close()

    mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

    os.unlink(tmpfile2.name)

    return motif, part, mast
def buildPathwayGenes(infiles, outfile):
    '''
    build sets of genes that are differentially expressed
    and fall into the pathways identified in the top 10 lists
    '''
    goresult, gene2pathways = infiles[0], infiles[1]
    if "_up" in os.path.basename(goresult):
        contrast = os.path.basename(goresult).replace("_up", ",").split(",")[0]
    if "_down" in os.path.basename(goresult):
        contrast = os.path.basename(goresult).replace("_down", ",").split(",")[0]

    dbh = connect()
    result_table = contrast

    # catch where there is no sig
    if len(IOTools.openFile(goresult).readlines()) == 1:
        P.touch(outfile)
    else:
        PipelineIlmnArray.buildPathwayGenes(goresult,
                                            gene2pathways,
                                            contrast,
                                            dbh,
                                            result_table,
                                            outfile)
def preprocessIdba(infile, outfile):
    '''
    preprocess pooled reads for IDBA
    '''
    # check for second read in the pair
    if infile.endswith(".fastq.gz"):
        E.info("converting fastq file to fasta file")
        outf = open(outfile, "w")
        for fastq in Fastq.iterate(IOTools.openFile(infile)):
            outf.write("%s\n%s\n" % (">" + fastq.identifier, fastq.seq))
        outf.close()
    elif infile.endswith(".1.gz"):
        read2 = P.snip(infile, ".1.gz") + ".2.gz"
        assert os.path.exists(read2), "file does not exist %s" % read2

        statement = '''python %(scriptsdir)s/fastqs2fasta.py
                       -a %(infile)s
                       -b %(read2)s
                       --log=%(infile)s.log
                       > %(outfile)s'''
        P.run()
def fetchGenomeAssemblyAndAnnotations(infile, outfiles):
    '''Download all assemblies using rsync, as per ncbi recommendation'''
    outdir = P.snip(infile, '.txt') + '.dir'
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.mkdir(outdir)

    for genome in IOTools.openFile(infile):
        genome_path = genome.split().pop()
        genome_path = re.sub('ftp', 'rsync', genome_path)
        gff_path = P.snip(genome_path, '.fna.gz') + '.gff.gz'

        statement = (" rsync --copy-links --quiet"
                     " %(genome_path)s"
                     " %(outdir)s &&"
                     " rsync --copy-links --quiet"
                     " %(gff_path)s"
                     " %(outdir)s")
        to_cluster = False
        P.run()
def __call__(self, track, slice=None):
    fn = os.path.join(
        DATADIR,
        "peakshapes.dir",
        "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" % locals())

    if not os.path.exists(fn):
        return

    matrix, rownames, colnames = IOTools.readMatrix(IOTools.openFile(fn))
    nrows = len(rownames)

    if nrows < 2:
        return

    if nrows > 1000:
        take = numpy.array(numpy.floor(numpy.arange(0, nrows,
                                                    nrows / 1000)),
                           dtype=int)
        rownames = [rownames[x] for x in take]
        matrix = matrix[take]

    return odict((('matrix', matrix),
                  ('rows', rownames),
                  ('columns', colnames)))
def buildFDRStats(infile, outfile, method):
    '''compute number of windows called at different FDR.
    '''

    data = pandas.read_csv(IOTools.openFile(infile), sep="\t", index_col=0)

    assert data['treatment_name'][0] == data['treatment_name'][-1]
    assert data['control_name'][0] == data['control_name'][-1]

    treatment_name, control_name = data['treatment_name'][0], data[
        'control_name'][0]

    key = (treatment_name, control_name)
    fdrs = (0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)

    for fdr in fdrs:
        print fdr
        take = data['qvalue'] <= fdr
        significant = sum(take)
        print significant
def getPeakShiftFromMacs(infile):
    '''get peak shift for filename infile (.macs output file).

    returns None if no shift found'''

    shift = None
    with IOTools.openFile(infile, "r") as ins:
        rx = re.compile("#2 predicted fragment length is (\d+) bps")
        r2 = re.compile("#2 Use (\d+) as shiftsize, \d+ as fragment length")
        for line in ins:
            x = rx.search(line)
            if x:
                shift = int(x.groups()[0])
                break
            x = r2.search(line)
            if x:
                shift = int(x.groups()[0])
                E.warn("shift size was set automatically - see MACS logfiles")
                break

    return shift
def __call__(self, track):

    length = {}
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
        length[transcript[0].transcript_id] = sum(
            [gtf.end - gtf.start for gtf in transcript])

    score = {}
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    for data in cc.execute(
            "SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
        score[data[0]] = data[1]

    result = {"length": [], "score": []}
    for transcript, value in length.items():
        result["length"].append(np.log10(length[transcript]))
        result["score"].append(score[transcript])

    return result
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return dict([(y, (x, z)) for x, y, z in zip(
        table[fields.index("go_type")],
        table[fields.index("go_id")],
        table[fields.index("description")])])
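# Illustrative lookup in the mapping returned by getGODescriptions (not part
# of the original code); the filename and GO identifier are hypothetical.
def example_describe_go_term(infile="go_assignments.tsv.gz",
                             goid="GO:0008150"):
    go2info = getGODescriptions(infile)
    go_type, description = go2info.get(goid, ("", ""))
    return "%s\t%s\t%s" % (goid, go_type, description)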
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
def unstowSetDict(self, infile):
    '''
    Regenerates a dictionary using a file generated by stowSetDict.
    Reads a flat file where column one is dictionary keys and column two
    a comma delimited list of values for that key and generates a
    dictionary of sets.
    e.g. A    a,b,c
    would become {A: set("a", "b", "c")}
    '''
    D = dict()
    i = 0
    with IOTools.openFile(infile) as inf:
        for line in inf:
            if i != 0:
                line = line.strip().split("\t")
                if line[0] not in D and len(line) > 1:
                    D[line[0]] = set()
                    for part in line[1].split(","):
                        D[line[0]].add(part)
            i += 1
    return D
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, \
            "cannot find genome with id gi|%s in genomes directory" % gi
        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
def __call__(self, track, slice=None):

    classes = ["antisense",
               "antisense_upstream",
               "antisense_downstream",
               "sense_upstream",
               "sense_downstream",
               "intergenic",
               "sense_intronic",
               "antisense_intronic"]

    coding_set = {}
    for gtf in GTF.iterator(
            IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
        coding_set[gtf.transcript_id] = gtf.source

    result = {"noncoding": {}, "coding": collections.defaultdict(int)}
    total_nc = float(self.getValue(
        "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
    for c in classes:
        result["noncoding"][c] = (float(self.getValue(
            """SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b
               WHERE a.class = '%s'
               AND b.C_NC = 'noncoding'
               AND a.transcript_id = b.transcript_id""" %
            (track, c))) / total_nc) * 100

    total_c = len(list(coding_set.keys()))
    for c in classes:
        ids = self.getValues(
            "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
        for i in ids:
            if i in list(coding_set.keys()):
                if coding_set[i] == c:
                    result["coding"][c] += 1

    for x, y in result["coding"].items():
        result["coding"][x] = (float(y) / total_c) * 100

    return result
def _count(self, filename, idx):
    '''count filename against idx.'''

    overlapping_genes = set()
    genes = set()

    # iterate over exons
    infile = IOTools.openFile(filename, "r")
    it = Bed.bed_iterator(infile)

    nexons, nexons_overlapping = 0, 0
    nbases, nbases_overlapping = 0, 0
    for this in it:
        nexons += 1
        nbases += this.end - this.start

        try:
            intervals = list(idx[this.contig].find(max(0, this.start),
                                                   this.end))
        except KeyError:
            continue
        except Exception as msg:
            raise Exception(
                "error while processing %s, msg=%s" % (filename, msg))
        if len(intervals) == 0:
            continue

        nexons_overlapping += 1
        start, end = this.start, this.end
        counts = numpy.zeros(end - start, numpy.int)
        for other_start, other_end, other_value in intervals:
            for x in range(max(start, other_start) - start,
                           min(end, other_end) - start):
                counts[x] += 1
        nbases_overlapping += sum([1 for x in counts if x > 0])

    infile.close()

    return nexons, nexons_overlapping, nbases, nbases_overlapping
def collectCpGIslands(infile, outfile):
    '''select CpG islands from the UCSC database and write them to
    *outfile* in tab-separated format
    (chrom, chromStart, chromEnd, name, obsExp).
    '''

    dbhandle = PipelineGeneset.connectToUCSC()

    # CpG islands are stored in a single ``cpgIslandExt`` table,
    # so a single query is sufficient.
    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name, obsExp
             FROM %(table)s""" % locals()
    E.debug("executing sql statement: %s" % sql)
    cc.execute(sql)
    outf = IOTools.openFile(outfile, "w")
    for data in cc.fetchall():
        outf.write("\t".join(map(str, data)) + "\n")
    outf.close()
def loadFastqc(infile, outfile):
    '''load FASTQC stats.'''

    track = P.snip(infile, ".fastqc")

    filename = os.path.join(PARAMS["exportdir"], "fastqc",
                            track + "*_fastqc", "fastqc_data.txt")

    for fn in glob.glob(filename):
        prefix = os.path.basename(os.path.dirname(fn))
        results = []

        for name, status, header, data in FastqcSectionIterator(
                IOTools.openFile(fn)):
            # do not collect basic stats, see loadFastQCSummary
            if name == "Basic Statistics":
                continue

            parser = CSV2DB.buildParser()
            (options, args) = parser.parse_args([])
            options.tablename = prefix + "_" + re.sub(" ", "_", name)
            options.allow_empty = True

            inf = cStringIO.StringIO("\n".join([header] + data) + "\n")
            CSV2DB.run(inf, options)
            results.append((name, status))

        # load status table
        parser = CSV2DB.buildParser()
        (options, args) = parser.parse_args([])
        options.tablename = prefix + "_status"
        options.allow_empty = True

        inf = cStringIO.StringIO(
            "\n".join(["name\tstatus"] +
                      ["\t".join(x) for x in results]) + "\n")
        CSV2DB.run(inf, options)

    P.touch(outfile)
def readOntFile(self):
    '''
    Reads an obo formatted file.
    Generates a dictionary - self.TermsToOnt
    Keys are terms, values are all immediate parents of that term - the
    "is_a" argument in the obo file.
    '''
    TermsToOnt = dict()
    Tname = None
    isas = set()
    with IOTools.openFile(self.options['ont']) as infile:
        for line in infile:
            line = line.strip()
            if line.startswith("id"):
                if Tname is not None:
                    TermsToOnt[Tname] = isas
                    isas = set()
                Tname = line.split(": ")[1]
            elif line.startswith("is_a"):
                isas.add(":".join(line.split(": ")[1:]).split(" ! ")[0])

    TermsToOnt[Tname] = isas
    return TermsToOnt
def GetAndTranslateAllGenes(outfile):
    '''
    This step is required.
    1. All Entrez gene IDs are downloaded from entrez gene.
    2. Corresponding ensembl gene, ensembl transcript and ensembl protein
       IDs are downloaded from mygene.info
    3. Corresponding gene symbols are downloaded from mygene.info
    4. These are loaded into the database
    5. A list of all gene Entrez IDs is stored as 'allgenes.tsv'

    Tables:
    ensemblg2entrez$geneid - ensemblg to entrez ID
    ensemblg2ensemblt$other - ensemblg to ensembl transcript
    ensemblg2ensemblp$other - ensemblg to ensembl protein
    ensemblg2symbol_xxx$geneid - ensemblg to symbol in species xxx
    '''
    GeneAnnot = PipelineGeneInfo.EntrezGeneAnnotation(PARAMS['db_name'],
                                                      PARAMS['entrez_email'])
    genelist = GeneAnnot.download_all(PARAMS['entrez_host'])

    # Generate an EnsemblAnnotation object
    Ens = PipelineGeneInfo.EnsemblAnnotation(PARAMS['my_gene_info_source'],
                                             PARAMS['db_name'])
    # Get Ensembl annotations
    PipelineGeneInfo.runall(Ens, genelist, ['ensembl'], submit=True)

    # Generate a SymbolAnnotation object
    Sym = PipelineGeneInfo.SymbolAnnotation(PARAMS['my_gene_info_source'],
                                            PARAMS['db_name'],
                                            PARAMS['entrez_host'],
                                            PARAMS['entrez_sciname'])
    # Get Symbol Annotations
    PipelineGeneInfo.runall(Sym, genelist, ['symbol'], submit=True)

    # Make output gene list
    outf = IOTools.openFile(outfile, "w")
    for gene in genelist:
        outf.write("%s\n" % gene)
    outf.close()
def buildOptimalPrimerSet(infiles, outfile):
    '''
    build a set of optimal primer pairs across sequences
    '''
    outf = IOTools.openFile(outfile, "w")
    outf.write("name\tforward_seq\tforward_gc (%)\tforward_tm\t"
               "forward_length (bp)\treverse_seq\treverse_gc (%)\t"
               "reverse_tm\treverse_length (bp)\tfragment_length (bp)\n")
    for infile in infiles:
        primerset = PrimerSet()
        name = primerset.readName(infile)
        size = primerset.readSize(infile)
        forward = primerset.readForward(infile)
        reverse = primerset.readReverse(infile)
        primerset = primerset.parse(name, size, *forward + reverse)
        outf.write("\t".join([primerset.name,
                              primerset.forwardseq,
                              primerset.forwardgc,
                              primerset.forwardtm,
                              primerset.forwardlength,
                              primerset.reverseseq,
                              primerset.reversegc,
                              primerset.reversetm,
                              primerset.reverselength,
                              primerset.size]) + "\n")
    outf.close()