Exemple #1
0
def ReadTSV(filename):
    """Load an SNV table from ``filename``.

    The reader class is chosen from the case-insensitive file extension
    (``csv``, ``tsv``, ``xls``, ``xlsx`` or ``txt``).  For ``txt`` files the
    required column names are handed to the reader as its headers.

    Args:
        filename: Path of the SNV file; the text after the last ``.`` is
            taken as the extension.

    Returns:
        A 3-tuple ``(header_lines, chrom, snvdata)`` where ``header_lines``
        is a one-element list holding the tab-joined headers, ``chrom`` is
        the set of values seen in the first (CHROM) column, and ``snvdata``
        is a list with one list of column values per row, in header order.

    Raises:
        RuntimeError: If the extension is missing or unsupported, if a
            required header is absent, or if CHROM, POS, REF, ALT are not
            the first four headers in that order.
    """
    snvheaders = ["CHROM", "POS", "REF", "ALT"]

    # rpartition copes with a name that contains no '.': the separator comes
    # back empty, the extension is treated as missing, and the explicit
    # "unexpected extension" error below is raised instead of an unpacking
    # ValueError.
    _, dot, extn = filename.rpartition('.')
    extn = extn.lower() if dot else ''
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    # Read the headers once; they are needed for validation, for every row
    # and for the returned header line.
    headers = snvs.headers()
    for h in snvheaders:
        if h not in headers:
            raise RuntimeError("Required header: %s missing from SNV file %s" %
                               (h, filename))

    # Explicit check rather than ``assert`` so it still runs under ``-O``.
    if headers[:4] != snvheaders:
        raise RuntimeError(
            "Required headers %s must be the first columns of SNV file %s" %
            (" ".join(snvheaders), filename))

    chrom = set()
    snvdata = []
    for r in snvs:
        ri = [r.get(h) for h in headers]
        chrom.add(ri[0])  # first column is CHROM, as verified above
        snvdata.append(ri)

    return ["\t".join(headers)], chrom, snvdata
Exemple #2
0
progress.stage("Read SNV data", len(opt.snvs))
snvheaders = filter(None, """
CHROM POS REF ALT
""".split())

snvdata = {}
# extrasnvheaders = []
# usedsnvheaders = set()
snvchroms = defaultdict(set)
for filename in opt.snvs:

    base, extn = filename.rsplit('.', 1)
    extn = extn.lower()
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'vcf':
        snvs = VCFFile(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    for h in snvheaders:
        if h not in snvs.headers():
Exemple #3
0
progress.stage("Read SNV data", len(opt.snvs))
snvheaders = filter(None, """
CHROM POS REF ALT
""".split())

snvdata = {}
# extrasnvheaders = []
# usedsnvheaders = set()
snvchroms = defaultdict(set)
for filename in opt.snvs:

    base, extn = filename.rsplit('.', 1)
    extn = extn.lower()
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'vcf':
        snvs = VCFFile(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    for h in snvheaders:
        if h not in snvs.headers():
Exemple #4
0
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable, BEDFile, VCFFile

# Report the start of the SNV-reading stage, passing the number of SNV files.
progress.stage("Read SNV data", len(opt.snvs))
# Column names every SNV file is checked for below; also passed to
# TXTFileTable as the headers of plain-text files.
snvheaders = [_f for _f in """
CHROM POS REF ALT
""".split() if _f]

# Accumulators initialised here; they are not filled in the visible code.
snvdata = {}
extrasnvheaders = []
usedsnvheaders = set()
for filename in opt.snvs:

    # Choose the reader class from the lower-cased text after the last '.'.
    # NOTE(review): a filename without any '.' makes this unpacking raise
    # ValueError rather than the RuntimeError below.
    base, extn = filename.rsplit('.', 1)
    extn = extn.lower()
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'vcf':
        snvs = VCFFile(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    # Look for each required column among this file's headers.
    for h in snvheaders:
        if h not in snvs.headers():
Exemple #5
0
# Whitespace-separated key names.  str.split() with no argument already
# strips each name and drops empty fields, so no extra filtering is needed.
sumkeys = """
SNPJuncIntronCount SNPJuncNoIntronCount NoSNPJuncIntronCount NoSNPJuncNoIntronCount SNPMateCount NoSNPMateCount SNPCount NoSNPCount MatesCount NotMatesCount IntronCount NoIntronCount SpanningReads RemovedDuplicateReads SNPLociReads""".split()
countdata = defaultdict(dict)
progress.stage("Read SNP/Junction counts")
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable
# Headers of the first count file; every later file must match them.
countheaders = None
for filename in opt.counts:
    # Split off the extension, then reduce `base` to the bare file name
    # (directory part goes to `path`).
    base, extn = filename.rsplit('.', 1)
    path, base = os.path.split(base)
    extn = extn.lower()
    if extn == 'csv':
        counts = CSVFileTable(filename=filename)
    elif extn == 'tsv':
        counts = TSVFileTable(filename=filename)
    elif extn == 'xls':
        counts = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        counts = XLSXFileTable(filename=filename)
    else:
        raise RuntimeError("Unexpected count file extension: %s" % filename)

    # Validate with explicit errors rather than ``assert`` so that the checks
    # on user-supplied files still run under ``python -O``.
    if countheaders is None:
        countheaders = counts.headers()
    elif countheaders != counts.headers():
        raise RuntimeError(
            "Count file %s does not have the same headers as the first count file"
            % filename)
    if 'CHROM' not in countheaders:
        raise RuntimeError("Required header: CHROM missing from count file %s" %
                           filename)
    if 'POS' not in countheaders:
        raise RuntimeError("Required header: POS missing from count file %s" %
                           filename)
Exemple #6
0
#!/bin/env python27

import sys, traceback, re
from collections import defaultdict

from getwiki import GlycoMotifWiki, GlyGenMotif
# Wiki handle used by the page loop at the bottom to list and fetch pages.
w = GlycoMotifWiki()

# Maps a motif id of the form GGM.NNNNNN to the set of
# (glycan_dictionary_accession, term) pairs whose xrefs mention it.
motif2gd = defaultdict(set)

from dataset import CSVFileTable
# Rows come from the CSV table named by the first command-line argument.
for r in CSVFileTable(sys.argv[1]):
    entry = r['term (main_entry)'].strip()
    xrefs = r['term_xref']
    gdacc = r['glycan_dictionary_accession']
    # NOTE(review): allmid is not used anywhere in the visible code.
    allmid = set()
    # term_xref holds '|'-separated "source:id" items; empty items are dropped.
    for xr in filter(None, xrefs.split('|')):
        try:
            src, mid = xr.split(':', 1)
        except ValueError:
            # No ':' in this item, so it is not a "source:id" pair; skip it.
            continue
        if src.lower() == "glycomotif":
            # Only ids shaped exactly like GGM. plus six digits are accepted;
            # anything else is reported and ignored.
            if not re.search(r'^GGM\.\d{6}$', mid):
                print "Bad motif id: %s (%s)" % (mid, entry)
                continue
            motif2gd[mid].add((gdacc, entry))

# Visit every wiki page whose name starts with 'GGM.'.
for mid in w.site.allpages(prefix='GGM.', generator=False):
    # print mid
    m = w.get(mid)
    # motif2gd is a defaultdict, so a page with no recorded cross-references
    # yields (and stores) an empty set here.
    entries = motif2gd[mid]
Exemple #7
0
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable, BEDFile, VCFFile

# Report the start of the SNV-reading stage, passing the number of SNV files.
progress.stage("Read SNV data", len(opt.snvs))
# Column names every SNV file is checked for below; also passed to
# TXTFileTable as the headers of plain-text files.
snvheaders = [_f for _f in """
CHROM POS REF ALT
""".split() if _f]

# Accumulators initialised here; they are not filled in the visible code.
snvdata = {}
snvchroms = defaultdict(set)
for filename in opt.snvs:

    # Choose the reader class from the lower-cased text after the last '.'.
    # NOTE(review): a filename without any '.' makes this unpacking raise
    # ValueError rather than the RuntimeError below.
    base, extn = filename.rsplit('.', 1)
    extn = extn.lower()
    if extn == 'csv':
        snvs = CSVFileTable(filename=filename)
    elif extn == 'vcf':
        snvs = VCFFile(filename=filename)
    elif extn == 'tsv':
        snvs = TSVFileTable(filename=filename)
    elif extn == 'xls':
        snvs = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        snvs = XLSXFileTable(filename=filename)
    elif extn == 'txt':
        snvs = TXTFileTable(filename=filename, headers=snvheaders)
    else:
        raise RuntimeError("Unexpected SNV file extension: %s" % filename)

    # Look for each required column among this file's headers.
    for h in snvheaders:
        if h not in snvs.headers():
Exemple #8
0
# When opt.output is unset or empty, force quiet mode for progress reporting
# (presumably because results then go to standard output -- confirm).
if not opt.output:
    opt.quiet = True
progress = ProgressText(quiet=opt.quiet)

# Whitespace-separated key names, built as a real list.  str.split() with no
# argument already strips each name and drops empty fields, and a lazy
# filter()/map() object (Python 3) could only be iterated once.
sumkeys = """
SNPJuncIntronCount SNPJuncNoIntronCount NoSNPJuncIntronCount NoSNPJuncNoIntronCount SNPMateCount NoSNPMateCount SNPCount NoSNPCount MatesCount NotMatesCount IntronCount NoIntronCount SpanningReads RemovedDuplicateReads SNPLociReads""".split()
countdata = defaultdict(dict)
progress.stage("Read SNP/Junction counts")
from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable
# Headers of the first count file; every later file must match them.
countheaders = None
for filename in opt.counts:
    # Split off the extension, then reduce `base` to the bare file name
    # (directory part goes to `path`).
    base, extn = filename.rsplit('.', 1)
    path, base = os.path.split(base)
    extn = extn.lower()
    if extn == 'csv':
        counts = CSVFileTable(filename=filename)
    elif extn == 'tsv':
        counts = TSVFileTable(filename=filename)
    elif extn == 'xls':
        counts = XLSFileTable(filename=filename)
    elif extn == 'xlsx':
        counts = XLSXFileTable(filename=filename)
    else:
        raise RuntimeError("Unexpected count file extension: %s" % filename)

    # Validate with explicit errors rather than ``assert`` so that the checks
    # on user-supplied files still run under ``python -O``.
    if countheaders is None:
        countheaders = counts.headers()
    elif countheaders != counts.headers():
        raise RuntimeError(
            "Count file %s does not have the same headers as the first count file"
            % filename)
    if 'CHROM' not in countheaders:
        raise RuntimeError("Required header: CHROM missing from count file %s" %
                           filename)
    if 'POS' not in countheaders:
        raise RuntimeError("Required header: POS missing from count file %s" %
                           filename)