def ReadTSV(filename):
    """Load an SNV table from *filename*.

    The reader class is chosen from the case-insensitive file extension:
    ``csv``, ``tsv``, ``xls``, ``xlsx`` or ``txt``.  For ``txt`` files the
    required header names are handed to the reader via ``headers=``.

    Returns:
        A 3-tuple ``(header_lines, chrom, snvdata)``:
        ``header_lines`` is a one-element list holding the tab-joined
        headers, ``chrom`` is the set of first-column (CHROM) values seen,
        and ``snvdata`` is a list of rows, each a list of values in header
        order.

    Raises:
        RuntimeError: if the extension is missing or not supported, if one
            of CHROM/POS/REF/ALT is absent from the headers, or if those
            four are not the leading columns in exactly that order.
    """
    snvheaders = ["CHROM", "POS", "REF", "ALT"]

    # rpartition always yields three parts, so a name without any '.' ends
    # up in the "unexpected extension" branch below instead of failing with
    # an unpacking ValueError.
    _, dot, extn = filename.rpartition('.')
    extn = extn.lower() if dot else ''

    # Keep this as an if/elif chain: each reader name is only looked up on
    # the branch that is actually taken.
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    headers = snvs.headers()
    for h in snvheaders:
        if h not in headers:
            raise RuntimeError(
                "Required header: %s missing from SNV file %s" % (h, filename))

    # Validate input with an exception rather than `assert`, which is
    # stripped when Python runs with -O.
    if headers[:4] != snvheaders:
        raise RuntimeError(
            "Required headers %s must be the first columns of SNV file %s"
            % (" ".join(snvheaders), filename))

    chrom = set()
    snvdata = []
    for r in snvs:
        ri = [r.get(h) for h in headers]
        chrom.add(ri[0])  # first column is CHROM, guaranteed by the check above
        snvdata.append(ri)
    return ["\t".join(headers)], chrom, snvdata
# Fragment (whitespace-collapsed; cut off in the middle of the final `if`).
# Starts a "Read SNV data" progress stage sized by len(opt.snvs), then for
# each filename in opt.snvs lower-cases the extension and picks a reader:
# csv, vcf, tsv, xls, xlsx or txt (txt is given headers=snvheaders). Any other
# extension raises RuntimeError. The required-header check that follows is
# truncated, so its body is not visible here.
# NOTE(review): snvheaders is built with filter(); under Python 3 that is a
# one-shot iterator rather than a list -- confirm the target interpreter.
# NOTE(review): in this collapsed form everything after the first `#` on the
# line is comment text; presumably the two commented-out assignments were
# separate lines originally -- verify against the real file.
progress.stage("Read SNV data", len(opt.snvs)) snvheaders = filter(None, """ CHROM POS REF ALT """.split()) snvdata = {} # extrasnvheaders = [] # usedsnvheaders = set() snvchroms = defaultdict(set) for filename in opt.snvs: base, extn = filename.rsplit('.', 1) extn = extn.lower() if extn == 'csv': snvs = CSVFileTable(filename=filename) elif extn == 'vcf': snvs = VCFFile(filename=filename) elif extn == 'tsv': snvs = TSVFileTable(filename=filename) elif extn == 'xls': snvs = XLSFileTable(filename=filename) elif extn == 'xlsx': snvs = XLSXFileTable(filename=filename) elif extn == 'txt': snvs = TXTFileTable(filename=filename, headers=snvheaders) else: raise RuntimeError("Unexpected SNV file extension: %s" % filename) for h in snvheaders: if h not in snvs.headers():
# Fragment (whitespace-collapsed; cut off in the middle of the final `if`).
# Imports the table/file readers from `dataset`, starts a "Read SNV data"
# progress stage sized by len(opt.snvs), and initialises snvdata (dict),
# extrasnvheaders (list) and usedsnvheaders (set). For each filename in
# opt.snvs it lower-cases the extension and picks a reader: csv, vcf, tsv,
# xls, xlsx or txt (txt is given headers=snvheaders); any other extension
# raises RuntimeError. The required-header check that follows is truncated.
# NOTE(review): `base` is assigned from rsplit but not used in the visible
# part; a filename without '.' would fail the two-name unpacking.
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable, BEDFile, VCFFile progress.stage("Read SNV data", len(opt.snvs)) snvheaders = [_f for _f in """ CHROM POS REF ALT """.split() if _f] snvdata = {} extrasnvheaders = [] usedsnvheaders = set() for filename in opt.snvs: base, extn = filename.rsplit('.', 1) extn = extn.lower() if extn == 'csv': snvs = CSVFileTable(filename=filename) elif extn == 'vcf': snvs = VCFFile(filename=filename) elif extn == 'tsv': snvs = TSVFileTable(filename=filename) elif extn == 'xls': snvs = XLSFileTable(filename=filename) elif extn == 'xlsx': snvs = XLSXFileTable(filename=filename) elif extn == 'txt': snvs = TXTFileTable(filename=filename, headers=snvheaders) else: raise RuntimeError("Unexpected SNV file extension: %s" % filename) for h in snvheaders: if h not in snvs.headers():
# Fragment (whitespace-collapsed). Builds `sumkeys`, the list of count column
# names split out of the triple-quoted string, creates countdata as a
# defaultdict(dict), and starts the "Read SNP/Junction counts" progress stage.
# For each filename in opt.counts it separates extension, directory and base
# name, then picks a reader by lower-cased extension (csv, tsv, xls, xlsx);
# any other extension raises RuntimeError. The first file's headers are
# remembered in countheaders and later files are asserted to match them;
# 'CHROM' and 'POS' are asserted to be present.
# NOTE(review): the nesting of the trailing asserts cannot be recovered from
# this collapsed line -- confirm against the real file. These checks use
# `assert`, so they do not run under `python -O`.
sumkeys = [ _f for _f in map( str.strip, """ SNPJuncIntronCount SNPJuncNoIntronCount NoSNPJuncIntronCount NoSNPJuncNoIntronCount SNPMateCount NoSNPMateCount SNPCount NoSNPCount MatesCount NotMatesCount IntronCount NoIntronCount SpanningReads RemovedDuplicateReads SNPLociReads""" .split()) if _f ] countdata = defaultdict(dict) progress.stage("Read SNP/Junction counts") from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable countheaders = None for filename in opt.counts: base, extn = filename.rsplit('.', 1) path, base = os.path.split(base) extn = extn.lower() if extn == 'csv': counts = CSVFileTable(filename=filename) elif extn == 'tsv': counts = TSVFileTable(filename=filename) elif extn == 'xls': counts = XLSFileTable(filename=filename) elif extn == 'xlsx': counts = XLSXFileTable(filename=filename) else: raise RuntimeError("Unexpected count file extension: %s" % filename) if countheaders == None: countheaders = counts.headers() else: assert countheaders == counts.headers() assert 'CHROM' in countheaders assert 'POS' in countheaders
# Fragment (whitespace-collapsed Python 2 script; it uses the `print`
# statement). Reads the CSV named by sys.argv[1]; for each row it splits
# `term_xref` on '|' into `source:id` cross-references, skipping items that
# have no ':'. For source "glycomotif" (case-insensitive), ids that do not
# match ^GGM\.\d{6}$ are reported as "Bad motif id" and skipped; the others
# are recorded in motif2gd[id] as (glycan_dictionary_accession, entry) pairs.
# It then iterates wiki pages with prefix 'GGM.', fetching each page and the
# entries recorded for its id.
# NOTE(review): `allmid` is assigned but never used in the visible code.
# NOTE(review): the fragment appears to stop mid-loop; what is done with `m`
# and `entries` is not visible here.
#!/bin/env python27 import sys, traceback, re from collections import defaultdict from getwiki import GlycoMotifWiki, GlyGenMotif w = GlycoMotifWiki() motif2gd = defaultdict(set) from dataset import CSVFileTable for r in CSVFileTable(sys.argv[1]): entry = r['term (main_entry)'].strip() xrefs = r['term_xref'] gdacc = r['glycan_dictionary_accession'] allmid = set() for xr in filter(None, xrefs.split('|')): try: src, mid = xr.split(':', 1) except ValueError: continue if src.lower() == "glycomotif": if not re.search(r'^GGM\.\d{6}$', mid): print "Bad motif id: %s (%s)" % (mid, entry) continue motif2gd[mid].add((gdacc, entry)) for mid in w.site.allpages(prefix='GGM.', generator=False): # print mid m = w.get(mid) entries = motif2gd[mid]
# Fragment (whitespace-collapsed; cut off in the middle of the final `if`).
# Imports the table/file readers from `dataset`, starts a "Read SNV data"
# progress stage sized by len(opt.snvs), and initialises snvdata (dict) and
# snvchroms (defaultdict of sets). For each filename in opt.snvs it
# lower-cases the extension and picks a reader: csv, vcf, tsv, xls, xlsx or
# txt (txt is given headers=snvheaders); any other extension raises
# RuntimeError. The required-header check that follows is truncated.
# NOTE(review): `base` is assigned from rsplit but not used in the visible
# part; a filename without '.' would fail the two-name unpacking.
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable, BEDFile, VCFFile progress.stage("Read SNV data", len(opt.snvs)) snvheaders = [_f for _f in """ CHROM POS REF ALT """.split() if _f] snvdata = {} snvchroms = defaultdict(set) for filename in opt.snvs: base, extn = filename.rsplit('.', 1) extn = extn.lower() if extn == 'csv': snvs = CSVFileTable(filename=filename) elif extn == 'vcf': snvs = VCFFile(filename=filename) elif extn == 'tsv': snvs = TSVFileTable(filename=filename) elif extn == 'xls': snvs = XLSFileTable(filename=filename) elif extn == 'xlsx': snvs = XLSXFileTable(filename=filename) elif extn == 'txt': snvs = TXTFileTable(filename=filename, headers=snvheaders) else: raise RuntimeError("Unexpected SNV file extension: %s" % filename) for h in snvheaders: if h not in snvs.headers():
# Fragment (whitespace-collapsed). Sets opt.quiet to True when opt.output is
# falsy and creates the ProgressText reporter from opt.quiet. It then builds
# `sumkeys` from the count column names in the triple-quoted string, creates
# countdata as a defaultdict(dict), and starts the "Read SNP/Junction counts"
# stage. For each filename in opt.counts it separates extension, directory
# and base name, then picks a reader by lower-cased extension (csv, tsv, xls,
# xlsx); any other extension raises RuntimeError. The first file's headers
# are remembered in countheaders and later files are asserted to match them;
# 'CHROM' and 'POS' are asserted to be present.
# NOTE(review): sumkeys is built with filter(); under Python 3 that is a
# one-shot iterator rather than a list -- confirm the target interpreter.
# NOTE(review): the nesting of the trailing asserts (and of the `progress =`
# assignment relative to the first `if`) cannot be recovered from this
# collapsed line -- confirm against the real file.
if not opt.output: opt.quiet = True progress = ProgressText(quiet=opt.quiet) sumkeys = filter(None, map(str.strip, """ SNPJuncIntronCount SNPJuncNoIntronCount NoSNPJuncIntronCount NoSNPJuncNoIntronCount SNPMateCount NoSNPMateCount SNPCount NoSNPCount MatesCount NotMatesCount IntronCount NoIntronCount SpanningReads RemovedDuplicateReads SNPLociReads""".split())) countdata = defaultdict(dict) progress.stage("Read SNP/Junction counts") from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable countheaders = None for filename in opt.counts: base, extn = filename.rsplit('.', 1) path, base = os.path.split(base) extn = extn.lower() if extn == 'csv': counts = CSVFileTable(filename=filename) elif extn == 'tsv': counts = TSVFileTable(filename=filename) elif extn == 'xls': counts = XLSFileTable(filename=filename) elif extn == 'xlsx': counts = XLSXFileTable(filename=filename) else: raise RuntimeError("Unexpected count file extension: %s" % filename) if countheaders == None: countheaders = counts.headers() else: assert countheaders == counts.headers() assert 'CHROM' in countheaders assert 'POS' in countheaders