Esempio n. 1
0
def read_tfgenes(tgfile):
    """Read TF-gene pairs from a header-less TSV file.

    The first column of every row is taken as the TF and the second
    column as the gene.

    Args:
        tgfile: Path to the TF-gene pair file.

    Returns:
        A dict mapping each gene to the set of TFs paired with it.
    """
    logger.info('Reading TF-gene pairs in %s ...', tgfile)
    reader = TsvReader(tgfile, cnames=False)
    ret = {}  # gene => set of tfs
    try:
        for row in reader:
            ret.setdefault(row[1], set()).add(row[0])
    finally:
        # release the file handle even if a malformed row aborts the loop
        reader.close()
    return ret
Esempio n. 2
0
def getAlleleCount(bamfile, snpfile, outfile):
	"""Count the alleles at the given SNP sites of a BAM file.

	Runs the `bamrc` command on `bamfile` for the sites listed in `snpfile`
	(its raw output goes to `outfile + '.tmp'`), then rewrites that output
	to `outfile` with the columns:
	Chrm, pos, A, C, G, T, Total, refCount, mutCount.

	The raw output and the SNP file are walked in parallel, so both have to
	list the sites in the same order. SNPs without a row in the raw output
	are skipped.

	Args:
		bamfile: Path to the BAM file.
		snpfile: Path to the header-less SNP file. Column 1 is the
			chromosome, column 3 the (end) position, columns 7 and 8 the
			reference and the mutant allele.
		outfile: Path of the allele count file to write.
	"""
	brcparams   = Box()
	brcparams.f = ref
	brcparams.w = 0
	brcparams.l = snpfile

	brcparams[''] = bamfile
	tmpfile = outfile + '.tmp'
	cmd = '{bamrc} {args} > {outfile!r}'.format(
		bamrc = bamrc, args = cmdargs(brcparams, equal = ' '), outfile = tmpfile)
	runcmd(cmd)

	# reformat the raw output to the desired format
	# raw columns: chrom, pos, ref base, depth, '=' stats, A stats, C stats,
	# G stats, T stats, N stats; each stats field looks like
	# 'C:14:...' (base:count:...)
	reader = TsvReader(tmpfile, cnames = False)
	snper  = TsvReader(snpfile, cnames = False)
	writer = TsvWriter(outfile)
	writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

	try:
		for r in reader:
			# advance through the SNP list until the SNP of this row shows up
			while True:
				try:
					snp = next(snper)
				except StopIteration:
					# no SNPs left, nothing to match this row against
					break
				# use the end position, in case it's 0-based
				if snp[0] != r[0] or snp[2] != r[1]:
					# this SNP has no row in the raw output, try the next SNP
					continue
				counts = dict(
					A = r[5].split(':', 2)[1],
					C = r[6].split(':', 2)[1],
					G = r[7].split(':', 2)[1],
					T = r[8].split(':', 2)[1]
				)
				rec    = TsvRecord()
				rec.Chrm  = r[0]
				rec.pos   = r[1]
				rec.Total = r[3]
				rec.A = counts['A']
				rec.C = counts['C']
				rec.G = counts['G']
				rec.T = counts['T']
				# if reference allele is unknown, assuming all are ref alleles
				rec.refCount = counts.get(snp[6].upper(), r[3])
				# if mut allele is unknown, assuming no mutations happened
				rec.mutCount = counts.get(snp[7].upper(), 0)
				writer.write(rec)
				# matched: move on to the next row of the raw output
				break
	finally:
		# always release the file handles, also when a row is malformed
		reader.close()
		snper.close()
		writer.close()
Esempio n. 3
0
    def __init__(self, sfile):
        """Load the sample information file `sfile` into memory.

        A leading 'ROWNAMES' header is renamed to 'Sample'. After loading,
        the instance exposes `samcol` (name of the first column), `data`
        (all rows), `nrow`, `ncol`, `colnames` and `rownames` (the value of
        the first column of every row).

        Raises:
            SampleInfoException: If none of the columns Sample, Patient,
                Group or Batch is present in the header.
        """
        reader = TsvReader(sfile)
        first = reader.cnames[0]
        if first == 'ROWNAMES':
            # normalize the header before the rows are read
            first = 'Sample'
            reader.cnames[0] = first
        self.samcol = first

        self.data = reader.dump()
        self.nrow = len(self.data)
        self.ncol = len(reader.cnames)
        self.colnames = reader.cnames
        self.rownames = [record[self.samcol] for record in self.data]

        # only complain when not a single known column shows up
        known = set(['Sample', 'Patient', 'Group', 'Batch'])
        if known.isdisjoint(self.colnames):
            raise SampleInfoException('Unexpected column names: %s.' %
                                      str(self.colnames))
Esempio n. 4
0
    def _read(self, sifile):
        """Read the sample information file `sifile`.

        Stores the header in `self.cnames` (an empty header name is renamed
        to "Sample") and all rows in `self.mat`.

        Raises:
            SampleInfoException: If the file has no header, or if a header
                name is not one of "", Sample, Patient, Group, Batch.
        """
        reader = TsvReader(sifile)
        self.cnames = reader.cnames
        if not self.cnames:
            raise SampleInfoException(
                'Headers for sample information file is required.')

        standard_cnames = ["", "Sample", "Patient", "Group", "Batch"]
        for cname in self.cnames:
            if cname in standard_cnames:
                continue
            raise SampleInfoException(
                'Headers should be a subset of {!r}'.format(
                    ', '.join(standard_cnames)))

        # an unnamed column is treated as the sample column
        if "" in self.cnames:
            unnamed = self.cnames.index("")
            self.cnames[unnamed] = "Sample"

        self.mat = reader.dump()
Esempio n. 5
0
def assertFileEqual(self,
                    first,
                    second,
                    filetype=None,
                    firstInopts=None,
                    secondInopts=None,
                    msg=None):
    """Fail unless the files `first` and `second` have the same content.

    Text files are compared row by row through TsvReader, anything else is
    compared byte by byte.

    Args:
        first: Path to the first file.
        second: Path to the second file.
        filetype: 'text' or 'nontext'; detected with `istext` for each file
            when not given.
        firstInopts: Optional keyword arguments for the TsvReader of `first`.
        secondInopts: Optional keyword arguments for the TsvReader of `second`.
        msg: Optional message passed on to `self._formatMessage`.
    """
    if self.maxDiff is not None:
        self.maxDiff = max(self.maxDiff or 5000, 5000)

    filetype1 = filetype or ('text' if istext(first) else 'nontext')
    filetype2 = filetype or ('text' if istext(second) else 'nontext')
    if filetype1 != filetype2:
        standardMsg = 'Files different, because file1 is {0} but file2 is {1}'.format(
            filetype1, filetype2)
        self.fail(self._formatMessage(msg, standardMsg))
    elif filetype1 == 'text':  # and filetype2 == 'text':
        reader1 = TsvReader(first, **(firstInopts or {}))
        reader2 = TsvReader(second, **(secondInopts or {}))
        rindex = 0
        for r1 in reader1:
            rindex += 1
            try:
                r2 = next(reader2)
            except StopIteration:
                standardMsg = 'File1 and file2 are different.\nFile1: {2}\nFile2: {3}\nRow {0} of file1 is: {1}, but nothing at row {0} of file2.'.format(
                    rindex, r1, first, second)
                self.fail(self._formatMessage(msg, standardMsg))
            if r1 != r2:
                standardMsg = 'File1 and file2 are different.\nFile1: {3}\nFile2: {4}\nRow {0} of file1: {1}\nRow {0} of file2: {2}'.format(
                    rindex, r1, r2, first, second)
                self.fail(self._formatMessage(msg, standardMsg))
        # file1 is exhausted: file2 has to be exhausted as well, otherwise
        # file1 is merely a prefix of file2 and the files are not equal
        try:
            r2 = next(reader2)
        except StopIteration:
            pass
        else:
            standardMsg = 'File1 and file2 are different.\nFile1: {2}\nFile2: {3}\nRow {0} of file2 is: {1}, but nothing at row {0} of file1.'.format(
                rindex + 1, r2, first, second)
            self.fail(self._formatMessage(msg, standardMsg))
    else:  # filetype1 == 'nontext' and filetype2 == 'nontext': # binary
        import filecmp
        if not filecmp.cmp(first, second, shallow=False):
            standardMsg = 'Binary files are different:\n{}\n{}'.format(
                first, second)
            self.fail(self._formatMessage(msg, standardMsg))
Esempio n. 6
0
# NOTE(security): headCallback arrives as source text and is eval'ed, which
# executes arbitrary code. Presumably it only ever comes from the trusted
# pipeline configuration - confirm before feeding external input in here.
if inopts1.headCallback:
	inopts1.headCallback = eval(inopts1.headCallback)
if inopts2.headCallback:
	inopts2.headCallback = eval(inopts2.headCallback)
inopts1.attach = False
inopts2.attach = False

# 'rnames' is consumed here and must not be passed on to TsvReader
rnames1 = inopts1.get('rnames', True)
rnames2 = inopts2.get('rnames', True)
if 'rnames' in inopts1:
	del inopts1['rnames']
if 'rnames' in inopts2:
	del inopts2['rnames']

indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# with row names, the first header item names the row-name column
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]
# columns shared by both files, in the order they appear in file 1
# (a bare set intersection would yield an arbitrary, run-dependent order)
paired  = sorted(set(cnames1) & set(cnames2), key = cnames1.index)
# separate copies, so changing one list never affects the other
cnames1 = list(paired)
cnames2 = list(paired)

if rnames1:
	cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
	cnames2 = [indata2.meta[0]] + cnames2

# positions of the selected columns in each input file
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]
Esempio n. 7
0
bedtools = {{ args.bedtools | quote}}

# register the executable, then rebind the name to the command wrapper
shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g']   = gsize
params['i']   = infile

# at least one flank length is required: l (left), r (right) or b (both).
# The former test `not 'l' and not 'r' and not 'b' in params` negated the
# non-empty string 'l' first, so it was always False and never raised.
if not any(key in params for key in ('l', 'r', 'b')):
	raise ValueError('You have to define a length to flank (args.params.l, args.params.r or args.params.b)')

if args.extend:
	# l and r fall back to b (both sides), and to 0 when b is missing too
	left   = params.get('l', params.get('b', 0))
	right  = params.get('r', params.get('b', 0))
	stdns  = params.get('s', False)
	reader = TsvReader(infile, cnames = False)
	writer = TsvWriter(outfile)
	for r in reader:
		# with strandedness on, the sides are swapped for features that
		# are not on the '+' strand
		if stdns and r[5] != '+':
			left2, right2 = right, left
		else:
			left2, right2 = left, right
		if not params.pct:
			# absolute lengths
			r[1] -= left2
			r[2] += right2
		else:
			# lengths are given as fractions of the feature length
			length   = r[2] - r[1]
			r[1] -= round(length * left2)
			r[2] += round(length * right2)
		writer.write(r)
else:
Esempio n. 8
0
covfile   = {{i.covfile | quote}}
genes     = {{i.genes.split(',') | repr}}
tfs       = {{i.tfs.split(';') | repr}}
outdata   = {{o.outdata | quote}}
outgroup  = {{o.outgroup | quote}}
outcase   = {{o.outcase | quote}}
# the i-th gene goes with the i-th (comma-separated) group of TFs
genetfs   = dict((g, tfs[i].split(',')) for i, g in enumerate(genes))

# get gene, snp pairs from the intersection file, which looks like:
# chr1	12463073	12463074	AADACL4	0	+	chr1	12463073	12463074	chr1_12463073_rs7547740_A_G	0	+
# chr1	12480504	12480505	AADACL4	0	+	chr1	12480504	12480505	chr1_12480504_rs6660365_T_C	0	+
# chr1	12496021	12496022	AADACL4	0	+	chr1	12496021	12496022	chr1_12496021_rs6541023_T_C	0	+
# i.e. the gene is in column 4 and the SNP in column 10
mutgenes   = defaultdict(list)  # snp => genes
intereader = TsvReader(interfile)
genes = set()
for r in intereader:
	if r[3] in genetfs:
		mutgenes[r[9]].append(r[3])
		genes.add(r[3])
intereader.close()

# shrink the sets to the genes that actually have a SNP
genetfs = dict((g, genetfs[g]) for g in genes)
tfs     = list(set(tf for gtfs in genetfs.values() for tf in gtfs))

# nothing, write empty files
if not (mutgenes and genes):
	open(outdata, 'w').close()
Esempio n. 9
0
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

# Input file layout (read without a header), one SNP-gene pair per row:
# snp gene
# SNP1 Gene10 # sorted by gene
infile = {{i.infile | quote}}
snpfile = {{o.snpfile | quote}}
genefile = {{o.genefile | quote}}
snppergene = {{args.snppergene | repr}}
nchr = {{args.nchr | repr}}
seed = {{args.seed | repr}}
# distances between genes
dist = {{args.dist | repr}}

# seed the random number generator with the configured seed
random.seed(seed)

# collect the unique SNPs and genes; the file is read twice, once per column
# (dump(0) / dump(1) presumably return the 1st / 2nd column - confirm
# against TsvReader)
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# draw one random integer from range(ngenes * snppergene) for every snp;
# sampling is with replacement, so different snps may share a value
# NOTE(review): allsnps is a set, so which snp receives which value depends
# on the set's iteration order, not only on the seed - confirm that this is
# acceptable for reproducibility
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene),
                                             k=nsnps)))

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)

# number of genes per chromosome, rounded up
geneperchr = math.ceil(float(ngenes) / float(nchr))
Esempio n. 10
0
# number of output columns: 3 or 6 give the short records, any other value
# gives the 8-column record (see rowFactory)
ncol    = {{args.ncol}}
# how the name column is built; rowFactory checks for 'neat' and 'full'
nameopt = {{args.name | quote}}

def rowFactory(row):
	"""Convert one row of the genotype matrix into a BED-like record.

	The row name (first item) has to be either `chrom_pos_ref_alt` or
	`chrom_pos_name_ref_alt`. The name column is chosen by `nameopt`:
	- 4 items: `chrom_pos` if nameopt is 'neat', else the full row name
	- 5 items: the embedded name (`chrom_pos` if it is 'NOVEL'), or the
	  full row name if nameopt is 'full'

	Args:
		row: The row as a list, row name first, genotypes after it.

	Returns:
		A list with 3 columns (ncol == 3), 6 columns (ncol == 6) or 8
		columns (anything else); the last two of the 8 columns are
		`ref,alt` and the comma-joined genotypes.

	Raises:
		ValueError: If the row name does not have 4 or 5 `_`-separated items.
	"""
	rowname = row[0]
	rparts  = rowname.split('_')
	if len(rparts) == 4:
		(chrom, pos, ref, alt) = rparts
		name = chrom + '_' + pos if nameopt == 'neat' else rowname
	elif len(rparts) == 5:
		(chrom, pos, name, ref, alt) = rparts
		if name == 'NOVEL':
			name = chrom + '_' + pos
		if nameopt == 'full':
			name = rowname
	else:
		raise ValueError('Malformat genotype matrix, expect 4 or 5 items in row names.')
	# the end position is start + 1
	end = int(pos) + 1
	if ncol == 3:
		return [chrom, pos, end]
	if ncol == 6:
		return [chrom, pos, end, name, 0, '+']
	# the genotypes are the remaining items of `row`; this used to read the
	# undefined name `rows` and raised a NameError
	return [chrom, pos, end, name, 0, '+', ref + ',' + alt, ','.join(row[1:])]

# rowFactory is handed to the reader as its row callback, so the records
# written below are presumably the converted rows - confirm against TsvReader
reader = TsvReader(infile, cnames = True, attach = False, row = rowFactory)
writer = TsvWriter(outfile)
# copy every record from the reader to the output file
for r in reader:
	writer.write(r)
writer.close()


Esempio n. 11
0
		func=set(['ncRNA']),	
		locType=u'exact',	
		weight=1L,	
		exceptions=set([]),	
		submitterCount=26,	
		submitters='1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL,BUSHMAN,COMPLETE_GENOMICS,DDI,ENSEMBL,EVA-GONL,EVA_DECODE,EVA_GENOME_DK,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,GMI,HAMMER_LAB,HGSV,HUMANGENOME_JCVI,ILLUMINA-UK,JMKIDD_LAB,PJP,SSAHASNP,SSMP,TISHKOFF,WEILL_CORNELL_DGM,',	
		alleleFreqCount=2,	
		alleles='C,T,',	
		alleleNs='2634.000000,2374.000000,',	
		alleleFreqs='0.525958,0.474042,',	
		bitfields=set(['maf-5-all-pops', 'maf-5-some-pop'])	
	)	
	'''

    # snps
    reader = TsvReader(snpfile, cnames=False)
    snplist = list(set(r[snpcol] for r in reader))
    reader.close()

    from cruzdb import Genome
    g = Genome(genome)
    outfiletmp = outfile + '.tmp'
    writer = TsvWriter(outfiletmp)
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            allfreqs = dict(zip(r.alleles.split(','),
                                r.alleleFreqs.split(',')))
Esempio n. 12
0
from bioprocs.gene import pPromoters
from bioprocs.bed import pBedIntersect
from bioprocs.tsv import pTsvMerge
from bioprocs.stats import pChow
from bioprocs.vcfnext import pGTMat2Bed
from bioprocs.utils.parallel import distributeList
from bioprocs.utils.tsvio2 import TsvReader

if __name__ == '__main__':
    params = params.parse()

    logger.logger.info('Reading tfhits ...')

    # get tf-gene pairs: gene => set of TFs
    genes = defaultdict(set)
    reader = TsvReader(params.tfhits, cnames=False)
    allgenes = set()
    npairs = 0
    # a .bed file keeps the pair as "TF::gene" in its 4th column, any other
    # file has the TF and the gene in its first two columns
    isbed = params.tfhits.endswith('.bed')
    for r in reader:
        tf, gene = r[3].split('::') if isbed else r[:2]
        genes[gene].add(tf)
        allgenes.add(gene)
        npairs += 1
    reader.close()
Esempio n. 13
0
tffile   = {{i.tffile | quote}}
sfile    = {{i.sfile | quote}}
outfile  = {{o.outfile | quote}}
outdir   = {{o.outdir | quote}}
tool     = {{args.tool | quote}}
meme     = {{args.meme | quote}}
params   = {{args.params | repr}}
tfmotifs = {{args.tfmotifs | quote}}
pval     = {{args.pval | repr}}
ucsclink = {{args.ucsclink | quote}}
nthread  = {{args.nthread | repr}}

# get all motifs
logger.info('Loading motif names ...')
reader = TsvReader(tffile, cnames = False)
# peek at the first row to learn the number of columns, then start over
ncol   = len(next(reader))
reader.rewind()
# motif => name; a single-column file uses the motif itself as the name
motifns = dict((r[0], r[0 if ncol == 1 else 1]) for r in reader)
logger.info('%s motif names read.', len(motifns))

# match motifs: keep only those that exist in the motif database
logger.info('Matching motifs in database ...')
mnames = [m.name for m in MemeReader(tfmotifs)]
motifs = dict((k, v) for k, v in motifns.items() if k in mnames)
logger.info('%s motifs loaded', len(motifs))

if tool == 'meme':
Esempio n. 14
0
		from operator import mul
		try:
			reduce
		except NameError:
			from functools import reduce
		return reduce(mul, pvals, 1) ** (1.0/float(len(pvals)))
	else:
		raise ValueError('Method %s not supported yet.' % method)

def numpval(pval):
	"""Coerce a p-value read from the input into a float.

	Values that cannot be converted, such as None or a non-numeric string
	like 'NA', are mapped to 1.0.

	Args:
		pval: The raw p-value (number, string or None).

	Returns:
		The p-value as a float, or 1.0 if it is not numeric.
	"""
	try:
		return float(pval)
	except (TypeError, ValueError):
		# float(None) raises TypeError, float('NA') raises ValueError
		return 1.0

# Aggregate the p-values of consecutive rows that belong to the same SNP.
# The SNP is the part of the Case column before the first '.'; rows of one
# SNP have to be adjacent in the input for this to work.
reader    = TsvReader(infile)
writer    = TsvWriter(outfile)
prevsnp   = None  # SNP of the group being collected
prevpvals = []    # p-values collected for prevsnp so far
for r in reader:
	snp = r.Case.split('.')[0]
	if snp != prevsnp:
		# a new SNP starts: write out the finished group first
		# (nothing to write before the very first row)
		if prevsnp:
			writer.write([
				prevsnp,
				aggregate(prevpvals, method)
			])
		prevsnp   = snp
		prevpvals = [numpval(r.Pval)]
	else:
		# same SNP as the previous row, keep collecting
		prevpvals.append(numpval(r.Pval))
Esempio n. 15
0
infile   = {{i.infile | quote}}
outfile  = {{o.outfile | quote}}
region   = {{args.region | repr}}
notfound = {{args.notfound | quote}}
inopts   = {{args.inopts | repr}}
outopts  = {{args.outopts | repr}}
refgene  = {{args.refgene | quote}}
genecol  = {{args.inopts.genecol | repr}}
ocnames  = {{args.outopts.cnames | repr}}
# genecol and cnames are kept in their own variables above; remove them so
# that the remaining options can be passed to TsvReader/TsvWriter as-is
del inopts['genecol']
del outopts['cnames']
# the downstream length defaults to the upstream length
if region.down is None:
	region.down = region.up

# get all genes' TSS and strand
# The refgene file is split on double quotes: r[0] is everything before the
# first quote (the tab-separated leading fields, of which the first 7 are
# kept) and r[1] is the first quoted value, used as the gene key
# (presumably the gene_id attribute of a GTF file - confirm).
reader = TsvReader(refgene, cnames = False, delimit = '"')
genes  = {r[1]:r[0].split("\t")[:7] for r in reader}

reader = TsvReader(infile, **inopts)
writer = TsvWriter(outfile, **outopts)
if ocnames:
	writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
	writer.writeHead()
for r in reader:
	gene = r[genecol]
	if gene not in genes:
		msg = 'Gene does not exist: {}'.format(gene)
		if notfound == 'error':
			raise ValueError(msg)
		else:
			# log the formatted message; this used to log the literal
			# text '{msg}' instead of the gene name
			log2pyppl(msg, 'warning')
Esempio n. 16
0
infile = {{i.infile | quote}}
metafile = {{i.metafile | quote}}
outdir = {{o.outdir | quote}}
plink = {{args.plink | quote}}
keeptxt = {{args.keeptxt | repr}}
chrmaps = {{args.chrmaps | repr}}
# both output files share the same prefix inside outdir
prefix = path.join(outdir, {{i.infile | fn2 | quote}})
tpedfile = prefix + ".tped"
tfamfile = prefix + ".tfam"

# column names could be:
# FID, IID, PID, MID, Sex, Pheno
if metafile:
    logger.info('Reading metafile ...')
    # every row becomes an (IID, row) pair, so metadata maps IID => row
    metadata = dict(
        TsvReader(metafile, cnames=True, row=lambda r: tuple(
            (r.IID, r))).dump())
else:
    metadata = None

logger.info('Reading genotype matrix ...')
# snp1 gt1s1 gt1s2 ...
inreader = TsvReader(infile, cnames=True)
# the first header item names the snp column, the rest are the samples
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
#tfamWriter.writeHead(callback = lambda meta: '#' + '\t'.join(meta))
# without metadata (no metafile, or an empty one), write a default record
# per sample: the sample name as FID and IID, '0' for PID and MID,
# 'other' for Sex and '-9' for Pheno
if not metadata:
    for s in samples:
        tfamWriter.write([s, s, '0', '0', 'other', '-9'])
Esempio n. 17
0
outfile = {{ o.outfile | quote}}
outdir  = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

# register the executable and fill in the options of `kallisto quant`
shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
# the two fastq files (presumably passed as positional arguments - confirm
# against shell.Shell)
params._ = [fq1, fq2]

# rebind the name to the command wrapper and run the quantification
kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# rewrite the abundance file written into outdir as a two-column count file
imfile        = path.join(outdir, 'abundance.tsv')
reader        = TsvReader(imfile)
writer        = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()

for r in reader:
	# keep only the part of the target id before the first '::'
	r.target_id = r.target_id.split('::')[0]
	try:
		# round the estimated counts to integers
		r.est_counts = int(round(float(r.est_counts)))
	except (TypeError, ValueError):
		# float() raises ValueError (not TypeError) for a non-numeric
		# string, which used to abort the whole run
		r.est_counts = 0
	writer.write(r)
writer.close()
Esempio n. 18
0
dbs      = {{args.libs | alwaysList | repr}}
plot     = {{args.plot | repr}}
nthread  = {{args.nthread | repr}}
Rscript  = {{args.Rscript | repr}}
cutoff   = {{args.cutoff | repr}}
devpars  = {{args.devpars | repr}}
pathview = {{args.pathview | repr}}

shell.TOOLS.Rscript = Rscript
# expand the short names of the cutoff column: p => Pval, q => AdjPval
if isinstance(cutoff, dict):
	if cutoff['by'] == 'p':
		cutoff['by'] = 'Pval'
	if cutoff['by'] == 'q':
		cutoff['by'] = 'AdjPval'

# the gene list is the gene column of the input file
reader = TsvReader(infile, **inopts)
genes  = [r[genecol] for r in reader]

en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript)
en.addList(genes, description = path.basename(infile))

para = Parallel(nthread = nthread)
# runs Rscript with the two given arguments
runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run()
# one result file (and optionally one plot) per library
for db in dbs:
	outfile = path.join(outdir, prefix + '.' + db + '.txt')
	en.enrich(db)
	en.export(outfile, top = 100)
	if plot:
		plotfile = path.join(outdir, prefix + '.' + db + '.png')
		en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height)
	# pathview is only considered for libraries with 'KEGG' in their name
	if pathview and 'KEGG' in db:
Esempio n. 19
0
from gff import Gff
from bioprocs.utils.tsvio2 import TsvWriter, TsvReader, TsvRecord
from bioprocs.utils import logger

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
notfound = {{args.notfound | quote}}
genecol = {{args.genecol or 0 | repr}}
inopts = {{args.inopts | repr}}
refgene = {{args.refgene | quote}}

# the annotation is mandatory, stop early when it is missing
if not path.isfile(refgene):
    raise OSError('Refgene file does not exists: {}'.format(refgene))

# get genes: every gene of the gene column starts out flagged False
# (presumably switched once the gene is found in the annotation - confirm)
genes = TsvReader(infile, **inopts).dump(genecol)
genes = dict.fromkeys(genes, False)
writer = TsvWriter(outfile)
writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND']

# walk through the annotation and keep the records of the requested genes
gff = Gff(refgene)
for g in gff:
    attrs = g['attributes']
    if attrs['gene_id'] not in genes:
        continue
    r = TsvRecord()
    r.CHR, r.START, r.END = g['seqid'], g['start'], g['end']
    r.SCORE = g['score']
    r.STRAND = g['strand']
Esempio n. 20
0
            try:
                writer.write(reader.next())
            except StopIteration:
                break
        writer.close()
        thparams = params.copy()
        thparams[""] = [qfile, mfile2]
        thparams.thresh = qval
        thparams.oc = ocdir
        cmdps.append((tomtom, cmdargs(thparams, dash='-', equal=' ')))
    reader.close()
    Parallel(nthread, raiseExc=True).run('{} {}', cmdps)

    # Merge the tomtom.txt of every output directory into one file.
    def _tomtom_reader(resultdir):
        # lines starting with '##' are comments; the column names are the
        # header line without its first character, split on tabs
        return TsvReader(
            path.join(resultdir, 'tomtom.txt'),
            comment='##',
            cnames=lambda header: header[1:].strip().split("\t"))

    writer = TsvWriter(outfile)
    # the header is taken from the first result only
    reader = _tomtom_reader(ocdirs[0])
    writer.cnames = reader.cnames
    writer.writeHead(lambda cnames: "#" + "\t".join(cnames))
    reader.close()
    # then the records of all results are appended
    for ocdir in ocdirs:
        reader = _tomtom_reader(ocdir)
        for r in reader:
            writer.write(r)
        reader.close()
    writer.close()
else:
    params[""] = [mfile1, mfile2]
Esempio n. 21
0
				# go to next snp
				break
			else:
				# go to next r
				continue
	writer.close()

if nthread == 1:
	# single thread: count the tumor and the normal sample one after another
	getAlleleCount(tumbam, affysnps, tumsnp)
	getAlleleCount(normbam, affysnps, normsnp)
else:
	# try to split the affysnps into N files and distribute the jobs to nthreads
	# get number of lines of affysnps file
	total  = wc_l(affysnps)
	# number of lines for each split file
	# (distribute presumably returns one count per thread - confirm)
	dists  = distribute(total, nthread)
	reader = TsvReader(affysnps, cnames = False)
	# dir to save the split file and result file
	thdir  = path.join(outdir, 'bamrc.nthreads')
	if not path.exists(thdir):
		makedirs(thdir)
	
	# base name of the snp file up to its first dot
	asbname = path.basename(affysnps).split('.')[0]
	for i, dist in enumerate(dists):
		# the i-th split file receives the next `dist` rows of the snp file
		writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
			bname = asbname, i = i
		)))
		for _ in range(dist):
			writer.write(next(reader))
		writer.close()
	
	para   = Parallel(nthread, raiseExc = True)