Esempi in Python per TsvReader, esempi in Python per bioprocs.utils.tsvio2.TsvReader

Esempio n. 1

0

Mostra file

File: sorttfgenes.py Progetto: pwwang/ceQTL

def read_tfgenes(tgfile):
    """Read TF-gene pairs and group the TFs by gene.

    The file is read without a header line; for every row the first
    column is collected under the key found in the second column.

    Args:
        tgfile: Path to the TF-gene pair file.

    Returns:
        dict: Maps each gene (second column) to the set of TFs
        (first column) listed with it.
    """
    logger.info('Reading TF-gene pairs in %s ...', tgfile)
    reader = TsvReader(tgfile, cnames=False)
    ret = {}  # gene => tf
    try:
        for row in reader:
            ret.setdefault(row[1], set()).add(row[0])
    finally:
        # release the reader even when a malformed row raises
        reader.close()
    return ret

Esempio n. 2

0

Mostra file

def getAlleleCount(bamfile, snpfile, outfile):
	"""Count alleles at the sites of `snpfile` in `bamfile` and write a table.

	The `bamrc` command is run with `snpfile` as its site list and its raw
	output is redirected to `outfile + '.tmp'`. That output is then walked
	together with `snpfile` and every matching site is written to `outfile`
	with the columns Chrm, pos, A, C, G, T, Total, refCount and mutCount.
	"""
	rawfile = outfile + '.tmp'

	opts     = Box()
	opts.f   = ref
	opts.w   = 0
	opts.l   = snpfile
	# the empty key carries the bam file
	opts[''] = bamfile
	runcmd('{bamrc} {args} > {outfile!r}'.format(
		bamrc = bamrc, args = cmdargs(opts, equal = ' '), outfile = rawfile))

	# reformat the raw output; one raw line looks like
	# chr1 564773 C 14 =:0:... A:0:... C:14:... G:0:... T:0:... N:0:...
	rawreader  = TsvReader(rawfile, cnames = False)
	sitereader = TsvReader(snpfile, cnames = False)
	writer     = TsvWriter(outfile)
	writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

	for r in rawreader:
		while True:
			try:
				snp = next(sitereader)
			except StopIteration:
				# the site list is used up, nothing left to pair with this row
				break
			# use the end position, in case it's 0-based
			if snp[0] != r[0] or snp[2] != r[1]:
				# this site is not the one of the current row, pull the next site
				continue

			# columns 5..8 hold the A, C, G and T fields; the count is the 2nd ':' item
			counts = {
				base: r[col].split(':', 2)[1]
				for col, base in enumerate('ACGT', 5)
			}
			rec       = TsvRecord()
			rec.Chrm  = r[0]
			rec.pos   = r[1]
			rec.Total = r[3]
			rec.A = counts['A']
			rec.C = counts['C']
			rec.G = counts['G']
			rec.T = counts['T']
			# if reference allele is unknown, assuming all are ref alleles
			rec.refCount = counts.get(snp[6].upper(), r[3])
			# if mut allele is unknown, assuming no mutations happened
			rec.mutCount = counts.get(snp[7].upper(), 0)
			writer.write(rec)
			# row handled, move on to the next raw row
			break
	writer.close()

Esempio n. 3

0

Mostra file

File: sampleinfo.py Progetto: LeaveYeah/bioprocs

    def __init__(self, sfile):
        """Load the sample information table from `sfile`.

        The first column is used as the sample column; a first header of
        'ROWNAMES' is renamed to 'Sample'. The rows, the table dimensions,
        the column names and the row names are stored on the instance.

        Raises:
            SampleInfoException: If none of 'Sample', 'Patient', 'Group'
                or 'Batch' appears among the column names.
        """
        reader = TsvReader(sfile)
        samcol = reader.cnames[0]
        if samcol == 'ROWNAMES':
            samcol = 'Sample'
            reader.cnames[0] = samcol
        self.samcol = samcol

        self.data = reader.dump()
        self.colnames = reader.cnames
        self.ncol = len(reader.cnames)
        self.nrow = len(self.data)
        self.rownames = [record[samcol] for record in self.data]

        # at least one of the known columns has to be present
        known = {'Sample', 'Patient', 'Group', 'Batch'}
        if known.isdisjoint(self.colnames):
            raise SampleInfoException(
                'Unexpected column names: %s.' % str(self.colnames))

Esempio n. 4

0

Mostra file

File: sampleinfo.py Progetto: LeaveYeah/bioprocs

    def _read(self, sifile):
        """Read the sample information file `sifile` into `self.mat`.

        The header is stored in `self.cnames`; an empty header name is
        renamed to "Sample" before the rows are dumped.

        Raises:
            SampleInfoException: If the file has no header, or if a header
                name is not one of "", "Sample", "Patient", "Group", "Batch".
        """
        allowed = ["", "Sample", "Patient", "Group", "Batch"]
        reader = TsvReader(sifile)

        self.cnames = reader.cnames
        if not self.cnames:
            message = 'Headers for sample information file is required.'
            raise SampleInfoException(message)

        if not all(cname in allowed for cname in self.cnames):
            message = 'Headers should be a subset of {!r}'.format(
                ', '.join(allowed))
            raise SampleInfoException(message)

        # an unnamed column is taken as the sample column
        if "" in self.cnames:
            blank_at = self.cnames.index("")
            self.cnames[blank_at] = "Sample"

        self.mat = reader.dump()

Esempio n. 5

0

Mostra file

def assertFileEqual(self,
                    first,
                    second,
                    filetype=None,
                    firstInopts=None,
                    secondInopts=None,
                    msg=None):
    """Fail unless the files `first` and `second` have the same content.

    Text files are compared row by row through `TsvReader`; anything else
    is compared byte by byte with `filecmp.cmp(..., shallow=False)`.

    Args:
        first: Path of the first file.
        second: Path of the second file.
        filetype: When truthy, used as the type of both files ('text'
            selects the row comparison, any other value the binary one).
            When falsy, each file is classified with `istext`.
        firstInopts: Optional keyword arguments for the `TsvReader` of
            `first`.
        secondInopts: Optional keyword arguments for the `TsvReader` of
            `second`.
        msg: Optional message handed to `self._formatMessage`.
    """
    if self.maxDiff is not None:
        self.maxDiff = max(self.maxDiff or 5000, 5000)

    filetype1 = filetype or ('text' if istext(first) else 'nontext')
    filetype2 = filetype or ('text' if istext(second) else 'nontext')
    if filetype1 != filetype2:
        standardMsg = 'Files different, because file1 is {0} but file2 is {1}'.format(
            filetype1, filetype2)
        self.fail(self._formatMessage(msg, standardMsg))
    elif filetype1 == 'text':  # and filetype2 == 'text':
        reader1 = TsvReader(first, **
                            firstInopts) if firstInopts else TsvReader(first)
        reader2 = TsvReader(
            second, **secondInopts) if secondInopts else TsvReader(second)
        try:
            rindex = 0
            for r1 in reader1:
                rindex += 1
                try:
                    r2 = next(reader2)
                except StopIteration:
                    standardMsg = 'File1 and file2 are different.\nFile1: {2}\nFile2: {3}\nRow {0} of file1 is: {1}, but nothing at row {0} of file2.'.format(
                        rindex, r1, first, second)
                    self.fail(self._formatMessage(msg, standardMsg))
                if r1 != r2:
                    standardMsg = 'File1 and file2 are different.\nFile1: {3}\nFile2: {4}\nRow {0} of file1: {1}\nRow {0} of file2: {2}'.format(
                        rindex, r1, r2, first, second)
                    self.fail(self._formatMessage(msg, standardMsg))
            # file1 is exhausted: any row left in file2 means file1 is only
            # a prefix of file2, which is a difference as well
            try:
                extra = next(reader2)
            except StopIteration:
                pass
            else:
                standardMsg = 'File1 and file2 are different.\nFile1: {2}\nFile2: {3}\nRow {0} of file2 is: {1}, but nothing at row {0} of file1.'.format(
                    rindex + 1, extra, first, second)
                self.fail(self._formatMessage(msg, standardMsg))
        finally:
            # self.fail() raises, so release both readers on every path
            reader1.close()
            reader2.close()
    else:  # filetype1 == 'nontext' and filetype2 == 'nontext': # binary
        import filecmp
        if not filecmp.cmp(first, second, shallow=False):
            standardMsg = 'Binary files are different:\n{}\n{}'.format(
                first, second)
            self.fail(self._formatMessage(msg, standardMsg))

Esempio n. 6

0

Mostra file

# Work out the columns shared by the two input files and where they sit in each file.

# NOTE(review): headCallback arrives as a source string and is turned into an
# object with eval(); this executes arbitrary code, so the option must only
# come from a trusted configuration -- confirm against the caller.
if inopts1.headCallback:
	inopts1.headCallback = eval(inopts1.headCallback)
if inopts2.headCallback:
	inopts2.headCallback = eval(inopts2.headCallback)
inopts1.attach = False
inopts2.attach = False

# 'rnames' (default True) is consumed here and removed so that it is not
# passed on to TsvReader as a keyword argument
rnames1 = inopts1.get('rnames', True)
rnames2 = inopts2.get('rnames', True)
if 'rnames' in inopts1:
	del inopts1['rnames']
if 'rnames' in inopts2:
	del inopts2['rnames']

indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# when row names are present the first meta entry is left out of the pairing
# (presumably it is the row-name column -- verify against TsvReader.meta)
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]
# NOTE(review): a set intersection has no defined order, so the order of the
# paired columns is not tied to either file
paired  = list(set(cnames1) & set(cnames2))
# both names point at the same list here; the `+` below builds new lists,
# so `paired` itself is never modified
cnames1 = cnames2 = paired

# put each file's own first meta entry back in front
if rnames1:
	cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
	cnames2 = [indata2.meta[0]] + cnames2

# positions of the selected columns within each file's meta
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]

Esempio n. 7

0

Mostra file

File: pBedFlank.py Progetto: LeaveYeah/bioprocs

bedtools = {{ args.bedtools | quote}}

shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g']   = gsize
params['i']   = infile

# At least one flank length (l, r or b) has to be given. Each key is tested for
# membership explicitly: `not 'l'` on a bare non-empty string is always False,
# which would make the whole condition False and the check unreachable.
if not any(key in params for key in ('l', 'r', 'b')):
	raise ValueError('You have to define a length to flank (args.params.l, args.params.r or args.params.b)')

if args.extend:
	left   = params.get('l', params.get('b', 0))
	right  = params.get('r', params.get('b', 0))
	stdns  = params.get('s', False)
	reader = TsvReader(infile, cnames = False)
	writer = TsvWriter(outfile)
	for r in reader:
		if not stdns or r[5] == '+':
			left2, right2 = left, right
		else:
			left2, right2 = right, left
		if params.pct:
			length   = r[2] - r[1]
			r[1] -= round(length * left2)
			r[2] += round(length * right2)
		else:
			r[1] -= left2
			r[2] += right2
		writer.write(r)
else:

Esempio n. 8

0

Mostra file

File: ceQTL-pToChow.py Progetto: pwwang/ceQTL

covfile   = {{i.covfile | quote}}
genes     = {{i.genes.split(',') | repr}}
tfs       = {{i.tfs.split(';') | repr}}
outdata   = {{o.outdata | quote}}
outgroup  = {{o.outgroup | quote}}
outcase   = {{o.outcase | quote}}
genetfs   = { g:tfs[i].split(',') for i, g in enumerate(genes) }

# get gene, snp pairs
"""
chr1	12463073	12463074	AADACL4	0	+	chr1	12463073	12463074	chr1_12463073_rs7547740_A_G	0	+
chr1	12480504	12480505	AADACL4	0	+	chr1	12480504	12480505	chr1_12480504_rs6660365_T_C	0	+
chr1	12496021	12496022	AADACL4	0	+	chr1	12496021	12496022	chr1_12496021_rs6541023_T_C	0	+
"""
mutgenes   = defaultdict(lambda: [])
intereader = TsvReader(interfile)
genes = set()
for r in intereader:
	if not r[3] in genetfs:
		continue
	mutgenes[r[9]].append(r[3])
	genes.add(r[3])
intereader.close()

# shrink the sets
genetfs = {g: genetfs[g] for g in genes}
tfs     = list({tf for gtfs in genetfs.values() for tf in gtfs})

# nothing, write empty files
if not mutgenes or not genes:
	open(outdata, 'w').close()

Esempio n. 9

0

Mostra file

File: pSGPair2Beds.py Progetto: pwwang/ceQTL

from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

# snp gene
# SNP1 Gene10 # sorted by gene
infile = {{i.infile | quote}}
snpfile = {{o.snpfile | quote}}
genefile = {{o.genefile | quote}}
snppergene = {{args.snppergene | repr}}
nchr = {{args.nchr | repr}}
seed = {{args.seed | repr}}
# distances between genes
dist = {{args.dist | repr}}

random.seed(seed)

reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# assign a probability to each snp
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene),
                                             k=nsnps)))

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)

geneperchr = math.ceil(float(ngenes) / float(nchr))

Esempio n. 10

0

Mostra file

File: pGTMat2Bed.py Progetto: LeaveYeah/bioprocs

ncol    = {{args.ncol}}
nameopt = {{args.name | quote}}

def rowFactory(row):
	"""Turn one genotype-matrix row into a BED-like record.

	The row name `row[0]` is split on '_' and must have either 4 items
	(chrom, pos, ref, alt) or 5 items (chrom, pos, name, ref, alt).

	The record name depends on the module-level `nameopt`:
	  - 4 items: `chrom_pos` when `nameopt` is 'neat', else the full row name.
	  - 5 items: the embedded name, with 'NOVEL' replaced by `chrom_pos`;
	    the full row name when `nameopt` is 'full'.

	The module-level `ncol` selects the output width: 3 columns, 6 columns,
	or (any other value) 8 columns with `ref,alt` and the comma-joined
	remaining fields of the row appended.

	Raises:
		ValueError: If the row name does not have 4 or 5 items.
	"""
	rparts = row[0].split('_')
	if len(rparts) == 4:
		(chrom, pos, ref, alt) = rparts
		name = chrom + '_' + pos if nameopt == 'neat' else row[0]
	elif len(rparts) == 5:
		(chrom, pos, name, ref, alt) = rparts
		if name == 'NOVEL':
			name = chrom + '_' + pos
		if nameopt == 'full':
			name = row[0]
	else:
		raise ValueError('Malformat genotype matrix, expect 4 or 5 items in row names.')

	end = int(pos) + 1
	if ncol == 3:
		return [chrom, pos, end]
	if ncol == 6:
		return [chrom, pos, end, name, 0, '+']
	# the genotypes are the fields of this row after the row name
	return [chrom, pos, end, name, 0, '+', ref + ',' + alt, ','.join(row[1:])]

reader = TsvReader(infile, cnames = True, attach = False, row = rowFactory)
writer = TsvWriter(outfile)
for r in reader:
	writer.write(r)
writer.close()

Esempio n. 11

0

Mostra file

		func=set(['ncRNA']),	
		locType=u'exact',	
		weight=1L,	
		exceptions=set([]),	
		submitterCount=26,	
		submitters='1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL,BUSHMAN,COMPLETE_GENOMICS,DDI,ENSEMBL,EVA-GONL,EVA_DECODE,EVA_GENOME_DK,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,GMI,HAMMER_LAB,HGSV,HUMANGENOME_JCVI,ILLUMINA-UK,JMKIDD_LAB,PJP,SSAHASNP,SSMP,TISHKOFF,WEILL_CORNELL_DGM,',	
		alleleFreqCount=2,	
		alleles='C,T,',	
		alleleNs='2634.000000,2374.000000,',	
		alleleFreqs='0.525958,0.474042,',	
		bitfields=set(['maf-5-all-pops', 'maf-5-some-pop'])	
	)	
	'''

    # snps
    reader = TsvReader(snpfile, cnames=False)
    snplist = list(set(r[snpcol] for r in reader))
    reader.close()

    from cruzdb import Genome
    g = Genome(genome)
    outfiletmp = outfile + '.tmp'
    writer = TsvWriter(outfiletmp)
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            allfreqs = dict(zip(r.alleles.split(','),
                                r.alleleFreqs.split(',')))

Esempio n. 12

0

Mostra file

from bioprocs.gene import pPromoters
from bioprocs.bed import pBedIntersect
from bioprocs.tsv import pTsvMerge
from bioprocs.stats import pChow
from bioprocs.vcfnext import pGTMat2Bed
from bioprocs.utils.parallel import distributeList
from bioprocs.utils.tsvio2 import TsvReader

if __name__ == '__main__':
    params = params.parse()

    logger.logger.info('Reading tfhits ...')

    # get tf-gene pairs
    genes = defaultdict(lambda: set())
    reader = TsvReader(params.tfhits, cnames=False)
    allgenes = set()
    npairs = 0
    if params.tfhits.endswith('.bed'):
        for r in reader:
            tf, gene = r[3].split('::')
            genes[gene].add(tf)
            allgenes.add(gene)
            npairs += 1
    else:
        for r in reader:
            tf, gene = r[:2]
            genes[gene].add(tf)
            allgenes.add(gene)
            npairs += 1
    reader.close()

Esempio n. 13

0

Mostra file

tffile   = {{i.tffile | quote}}
sfile    = {{i.sfile | quote}}
outfile  = {{o.outfile | quote}}
outdir   = {{o.outdir | quote}}
tool     = {{args.tool | quote}}
meme     = {{args.meme | quote}}
params   = {{args.params | repr}}
tfmotifs = {{args.tfmotifs | quote}}
pval     = {{args.pval | repr}}
ucsclink = {{args.ucsclink | quote}}
nthread  = {{args.nthread | repr}}

# get all motifs
logger.info('Loading motif names ...')
reader = TsvReader(tffile, cnames = False)
ncol   = len(next(reader))
reader.rewind()
if ncol == 1:
	motifns = {r[0]:r[0] for r in reader}
else:
	motifns = {r[0]:r[1] for r in reader}
logger.info('%s motif names read.', len(motifns))

# match motifs
logger.info('Matching motifs in database ...')
mnames = [m.name for m in MemeReader(tfmotifs)]
motifs = {k:v for k, v in motifns.items() if k in mnames}
logger.info('%s motifs loaded', len(motifs))

if tool == 'meme':

Esempio n. 14

0

Mostra file

		from operator import mul
		try:
			reduce
		except NameError:
			from functools import reduce
		return reduce(mul, pvals, 1) ** (1.0/float(len(pvals)))
	else:
		raise ValueError('Method %s not supported yet.' % method)

def numpval(pval):
	"""Return `pval` as a float, or 1.0 when it is not a number.

	Values `float()` cannot convert -- either by type (e.g. None) or by
	content (e.g. 'NA' or an empty string) -- fall back to 1.0.
	"""
	try:
		return float(pval)
	except (TypeError, ValueError):
		# float() raises ValueError, not TypeError, for non-numeric strings
		return 1.0

reader    = TsvReader(infile)
writer    = TsvWriter(outfile)
prevsnp   = None
prevpvals = []
for r in reader:
	snp = r.Case.split('.')[0]
	if snp != prevsnp:
		if prevsnp:
			writer.write([
				prevsnp,
				aggregate(prevpvals, method)
			])
		prevsnp   = snp
		prevpvals = [numpval(r.Pval)]
	else:
		prevpvals.append(numpval(r.Pval))

Esempio n. 15

0

Mostra file

File: pPromoters.py Progetto: LeaveYeah/bioprocs

infile   = {{i.infile | quote}}
outfile  = {{o.outfile | quote}}
region   = {{args.region | repr}}
notfound = {{args.notfound | quote}}
inopts   = {{args.inopts | repr}}
outopts  = {{args.outopts | repr}}
refgene  = {{args.refgene | quote}}
genecol  = {{args.inopts.genecol | repr}}
ocnames  = {{args.outopts.cnames | repr}}
del inopts['genecol']
del outopts['cnames']
if region.down is None:
	region.down = region.up

# get all genes' TSS and strand
reader = TsvReader(refgene, cnames = False, delimit = '"')
genes  = {r[1]:r[0].split("\t")[:7] for r in reader}

reader = TsvReader(infile, **inopts)
writer = TsvWriter(outfile, **outopts)
if ocnames:
	writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
	writer.writeHead()
for r in reader:
	gene = r[genecol]
	if gene not in genes:
		msg = 'Gene does not exist: {}'.format(gene)
		if notfound == 'error':
			raise ValueError(msg)
		else:
			log2pyppl('Gene does not exist: {msg}', 'warning')

Esempio n. 16

0

Mostra file

File: pGTMat2Plink.py Progetto: LeaveYeah/bioprocs

infile = {{i.infile | quote}}
metafile = {{i.metafile | quote}}
outdir = {{o.outdir | quote}}
plink = {{args.plink | quote}}
keeptxt = {{args.keeptxt | repr}}
chrmaps = {{args.chrmaps | repr}}
prefix = path.join(outdir, {{i.infile | fn2 | quote}})
tpedfile = prefix + ".tped"
tfamfile = prefix + ".tfam"

# column names could be:
# FID, IID, PID, MID, Sex, Pheno
if metafile:
    logger.info('Reading metafile ...')
    metadata = dict(
        TsvReader(metafile, cnames=True, row=lambda r: tuple(
            (r.IID, r))).dump())
else:
    metadata = None

logger.info('Reading genotype matrix ...')
# snp1 gt1s1 gt1s2 ...
inreader = TsvReader(infile, cnames=True)
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
#tfamWriter.writeHead(callback = lambda meta: '#' + '\t'.join(meta))
if not metadata:
    for s in samples:
        tfamWriter.write([s, s, '0', '0', 'other', '-9'])

Esempio n. 17

0

Mostra file

outfile = {{ o.outfile | quote}}
outdir  = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
params._ = [fq1, fq2]

kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

imfile        = path.join(outdir, 'abundance.tsv')
reader        = TsvReader(imfile)
writer        = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()

for r in reader:
	r.target_id = r.target_id.split('::')[0]
	try:
		r.est_counts = int(round(float(r.est_counts)))
	except TypeError:
		r.est_counts = 0
	writer.write(r)
writer.close()

Esempio n. 18

0

Mostra file

dbs      = {{args.libs | alwaysList | repr}}
plot     = {{args.plot | repr}}
nthread  = {{args.nthread | repr}}
Rscript  = {{args.Rscript | repr}}
cutoff   = {{args.cutoff | repr}}
devpars  = {{args.devpars | repr}}
pathview = {{args.pathview | repr}}

shell.TOOLS.Rscript = Rscript
if isinstance(cutoff, dict):
	if cutoff['by'] == 'p':
		cutoff['by'] = 'Pval'
	if cutoff['by'] == 'q':
		cutoff['by'] = 'AdjPval'

reader = TsvReader(infile, **inopts)
genes  = [r[genecol] for r in reader]

en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript)
en.addList(genes, description = path.basename(infile))

para = Parallel(nthread = nthread)
runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run()
for db in dbs:
	outfile = path.join(outdir, prefix + '.' + db + '.txt')
	en.enrich(db)
	en.export(outfile, top = 100)
	if plot:
		plotfile = path.join(outdir, prefix + '.' + db + '.png')
		en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height)
	if pathview and 'KEGG' in db:

Esempio n. 19

0

Mostra file

from gff import Gff
from bioprocs.utils.tsvio2 import TsvWriter, TsvReader, TsvRecord
from bioprocs.utils import logger

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
notfound = {{args.notfound | quote}}
genecol = {{args.genecol or 0 | repr}}
inopts = {{args.inopts | repr}}
refgene = {{args.refgene | quote}}

if not path.isfile(refgene):
    raise OSError('Refgene file does not exists: {}'.format(refgene))

# get genes
genes = TsvReader(infile, **inopts).dump(genecol)
genes = dict(zip(genes, [False] * len(genes)))
writer = TsvWriter(outfile)
writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND']

gff = Gff(refgene)
for g in gff:
    attrs = g['attributes']
    if attrs['gene_id'] not in genes:
        continue
    r = TsvRecord()
    r.CHR = g['seqid']
    r.START = g['start']
    r.END = g['end']
    r.SCORE = g['score']
    r.STRAND = g['strand']

Esempio n. 20

0

Mostra file

            try:
                writer.write(reader.next())
            except StopIteration:
                break
        writer.close()
        thparams = params.copy()
        thparams[""] = [qfile, mfile2]
        thparams.thresh = qval
        thparams.oc = ocdir
        cmdps.append((tomtom, cmdargs(thparams, dash='-', equal=' ')))
    reader.close()
    Parallel(nthread, raiseExc=True).run('{} {}', cmdps)

    writer = TsvWriter(outfile)
    reader = TsvReader(path.join(ocdirs[0], 'tomtom.txt'),
                       comment='##',
                       cnames=lambda header: header[1:].strip().split("\t"))
    writer.cnames = reader.cnames
    writer.writeHead(lambda cnames: "#" + "\t".join(cnames))
    reader.close()
    for ocdir in ocdirs:
        reader = TsvReader(
            path.join(ocdir, 'tomtom.txt'),
            comment='##',
            cnames=lambda header: header[1:].strip().split("\t"))
        for r in reader:
            writer.write(r)
        reader.close()
    writer.close()
else:
    params[""] = [mfile1, mfile2]

Esempio n. 21

0

Mostra file

				# go to next snp
				break
			else:
				# go to next r
				continue
	writer.close()

if nthread == 1:
	getAlleleCount(tumbam, affysnps, tumsnp)
	getAlleleCount(normbam, affysnps, normsnp)
else:
	# try to split the affysnps into N files and distribute the jobs to nthreads
	# get number of lines of affysnps file
	total  = wc_l(affysnps)
	dists  = distribute(total, nthread)
	reader = TsvReader(affysnps, cnames = False)
	# dir to save the split file and result file
	thdir  = path.join(outdir, 'bamrc.nthreads')
	if not path.exists(thdir):
		makedirs(thdir)
	
	asbname = path.basename(affysnps).split('.')[0]
	for i, dist in enumerate(dists):
		writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
			bname = asbname, i = i
		)))
		for _ in range(dist):
			writer.write(next(reader))
		writer.close()
	
	para   = Parallel(nthread, raiseExc = True)