コード例 #1
0
def bed6_parse(fn):
    """Iterate over the records of a six-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line is stripped of
    trailing whitespace and split on tabs at most five times, so everything
    after the fifth tab stays together in the last element.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, start, end, name, score, strandother]`` exactly as
        produced by ``split`` (no type conversion).

    Raises:
        ValueError: If a data line has fewer than six tab-separated fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            fields = line.rstrip().split("\t", 5)
            chrom, start, end, name, score, strandother = fields
            yield [chrom, start, end, name, score, strandother]
    finally:
        f.close()
コード例 #2
0
def gff3_parse(fn):
    """Iterate over the records of a nine-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly nine tab-separated fields.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, source, seqtype, start, end, score, strand, phase,
        attributes]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a data line does not have exactly nine fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, source, seqtype, start, end,
             score, strand, phase, attributes) = line.rstrip("\n").split("\t")
            yield [chrom, source, seqtype, start, end,
                   score, strand, phase, attributes]
    finally:
        f.close()
コード例 #3
0
def refgene_parse(fn):
    """Iterate over the records of a sixteen-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line is stripped of
    trailing whitespace and must split into exactly sixteen fields.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
        exon_num, exonstarts, exonends, uniq_id, symbol, kown1, kown2,
        exon_status]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a data line does not have exactly sixteen fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
             exon_num, exonstarts, exonends, uniq_id, symbol,
             kown1, kown2, exon_status) = line.rstrip().split("\t")
            yield [num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
                   exon_num, exonstarts, exonends, uniq_id, symbol,
                   kown1, kown2, exon_status]
    finally:
        f.close()
コード例 #4
0
def bwt_parse(fn):
    """Iterate over the records of an eight-column tab-separated file.

    Every line (no comment skipping is done here) has its trailing newline
    removed and must split into exactly eight tab-separated fields.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[query_id, strand, subject_id, pos, seq, qual, score,
        mismatch]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a line does not have exactly eight fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            (query_id, strand, subject_id, pos,
             seq, qual, score, mismatch) = line.rstrip("\n").split("\t")
            yield [query_id, strand, subject_id, pos,
                   seq, qual, score, mismatch]
    finally:
        f.close()
コード例 #5
0
def variant_snpindel_pop(total_fn):
    """Iterate over the records of a thirteen-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly thirteen fields; the last
    field is read but not returned.

    Args:
        total_fn: Path passed to ``compress.gz_file(total_fn, "r")``.  Which
            formats that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, position1, position2, ref, alt, qual,
        group_test_pvalue, depth_ref, depth_alt, depth_ref_samples,
        depth_alt_samples, genotype]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly thirteen fields.
    """
    f = compress.gz_file(total_fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            # The 13th column is required to be present but is discarded.
            (chrom, position1, position2, ref, alt, qual, group_test_pvalue,
             depth_ref, depth_alt, depth_ref_samples, depth_alt_samples,
             genotype, _ignored) = line.rstrip("\n").split("\t")
            yield [chrom, position1, position2, ref, alt, qual,
                   group_test_pvalue, depth_ref, depth_alt,
                   depth_ref_samples, depth_alt_samples, genotype]
    finally:
        f.close()
コード例 #6
0
def gtf_parse(fn, add="chr"):
    """Iterate over the records of a nine-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly nine fields.  The first
    field has a leading ``"chr"`` prefix removed (if present) and ``add``
    prepended, so the default normalises names to a single "chr" prefix and
    ``add=""`` removes the prefix.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.
        add: String prepended to the prefix-less chromosome name.

    Yields:
        list: ``[chrom, rnatype, region_type, start, end, score, strand,
        codon, commnet]`` where ``chrom`` is the renamed first field and the
        rest are raw split results.

    Raises:
        ValueError: If a data line does not have exactly nine fields.
    """
    prefix = "chr"
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, rnatype, region_type, start, end,
             score, strand, codon, commnet) = line.rstrip("\n").split("\t")
            # str.lstrip("chr") removes any run of the characters c/h/r, not
            # the literal prefix (it would turn "hs37d5" into "s37d5"), so
            # strip the prefix explicitly.
            if chrom.startswith(prefix):
                chrom = chrom[len(prefix):]
            yield [add + chrom, rnatype, region_type, start, end,
                   score, strand, codon, commnet]
    finally:
        f.close()
コード例 #7
0
def arf_read(arffn):
    """Iterate over the records of a thirteen-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly thirteen fields.

    Args:
        arffn: Path passed to ``compress.gz_file(arffn, "r")``.  Which
            formats that project helper accepts is not visible here.

    Yields:
        list: ``[rname, rleng, rstart, rend, rseq, gname, gleng, gstart,
        gend, gseq, gstrand, nmismatch, mathclabel]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly thirteen fields.
    """
    f = compress.gz_file(arffn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (rname, rleng, rstart, rend, rseq,
             gname, gleng, gstart, gend, gseq,
             gstrand, nmismatch, mathclabel) = line.rstrip("\n").split("\t")
            yield [rname, rleng, rstart, rend, rseq,
                   gname, gleng, gstart, gend, gseq,
                   gstrand, nmismatch, mathclabel]
    finally:
        f.close()
コード例 #8
0
def sigfile_parse(fn):
    """Iterate over the records of a five-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly five fields.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[anno1, anno2, fc, rawp, fdr]`` as raw split results (no
        numeric conversion is done).

    Raises:
        ValueError: If a data line does not have exactly five fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            anno1, anno2, fc, rawp, fdr = line.rstrip("\n").split("\t")
            yield [anno1, anno2, fc, rawp, fdr]
    finally:
        f.close()
コード例 #9
0
def sigfile_parse(fn):
    """Yield the five tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[anno1, anno2, fc, rawp, fdr]`` as raw split results (no
        numeric conversion is done).

    Raises:
        ValueError: If a data line does not have exactly five fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            anno1, anno2, fc, rawp, fdr = line.rstrip("\n").split("\t")
            yield [anno1, anno2, fc, rawp, fdr]
    finally:
        f.close()
コード例 #10
0
def soap_aln_parse(fn):
    """Iterate over the records of a twelve-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and must split into exactly twelve fields.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[seqid, seqread, qual, mcounts, PEtag, length, strand,
        chrom, sitestart1, mismatch, cigar, match]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly twelve fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (seqid, seqread, qual, mcounts, PEtag, length, strand, chrom,
             sitestart1, mismatch, cigar, match) = line.rstrip("\n").split("\t")
            yield [seqid, seqread, qual, mcounts, PEtag, length, strand,
                   chrom, sitestart1, mismatch, cigar, match]
    finally:
        f.close()
コード例 #11
0
def bed6_parse(fn):
    """Yield the six leading tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines are
    stripped of trailing whitespace and split on at most five tabs, so any
    columns beyond the sixth remain joined inside the last element.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, start, end, name, score, strandother]`` exactly as
        produced by ``split`` (no type conversion).

    Raises:
        ValueError: If a data line has fewer than six tab-separated fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            fields = line.rstrip().split("\t", 5)
            chrom, start, end, name, score, strandother = fields
            yield [chrom, start, end, name, score, strandother]
    finally:
        f.close()
コード例 #12
0
def fileread(fn):
    """Open ``fn`` for reading, choosing the opener from its extension.

    ``*.gz`` paths go to ``compress.gz_file(fn, "r")`` and ``*.bz2`` paths to
    ``compress.bz2file(fn)`` (both project helpers).  Any other existing file
    is opened with the built-in ``open`` in text mode; previously this case
    fell through with the result variable unbound and crashed with
    ``UnboundLocalError``.

    Args:
        fn: Path of the file to open.

    Returns:
        The opened file object.

    Raises:
        SystemExit: With status 1, after writing a message to stderr, when
            ``fn`` is not an existing regular file.
    """
    if not os.path.isfile(fn):
        sys.stderr.write("[Error] '%s' is not a file\n" % fn)
        # sys.exit rather than the bare ``exit`` helper, which is only
        # injected by the ``site`` module and may be absent.
        sys.exit(1)
    if fn.endswith(".gz"):
        return compress.gz_file(fn, "r")
    if fn.endswith(".bz2"):
        return compress.bz2file(fn)
    return open(fn, "r")
コード例 #13
0
def fileread(fn):
    """Return an open read handle for ``fn`` based on its file extension.

    Dispatch:
        * ``.gz``  -> ``compress.gz_file(fn, "r")`` (project helper)
        * ``.bz2`` -> ``compress.bz2file(fn)`` (project helper)
        * anything else -> built-in ``open(fn, "r")``

    The last case used to leave the result variable unbound, so the function
    raised ``UnboundLocalError`` for every uncompressed file.

    Args:
        fn: Path of the file to open.

    Returns:
        The opened file object.

    Raises:
        SystemExit: With status 1, after writing a message to stderr, when
            ``fn`` is not an existing regular file.
    """
    if not os.path.isfile(fn):
        sys.stderr.write("[Error] '%s' is not a file\n" % fn)
        # The bare ``exit`` helper comes from the ``site`` module and is not
        # guaranteed to exist; sys.exit raises the same SystemExit.
        sys.exit(1)
    if fn.endswith(".gz"):
        return compress.gz_file(fn, "r")
    if fn.endswith(".bz2"):
        return compress.bz2file(fn)
    return open(fn, "r")
コード例 #14
0
def gff3_parse(fn):
    """Yield the nine tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, source, seqtype, start, end, score, strand, phase,
        attributes]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a data line does not have exactly nine fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, source, seqtype, start, end,
             score, strand, phase, attributes) = line.rstrip("\n").split("\t")
            yield [
                chrom, source, seqtype, start, end, score, strand, phase,
                attributes
            ]
    finally:
        f.close()
コード例 #15
0
def gtf_parse(fn, add="chr"):
    """Yield the nine tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.  The first field
    is rewritten: a leading ``"chr"`` is dropped when present and ``add`` is
    put in front of what remains.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.
        add: String prepended to the prefix-less chromosome name; the
            default keeps/creates a "chr" prefix, ``""`` removes it.

    Yields:
        list: ``[chrom, rnatype, region_type, start, end, score, strand,
        codon, commnet]`` where ``chrom`` is the rewritten first field and
        the rest are raw split results.

    Raises:
        ValueError: If a data line does not have exactly nine fields.
    """
    prefix = "chr"
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, rnatype, region_type, start, end,
             score, strand, codon, commnet) = line.rstrip("\n").split("\t")
            # lstrip("chr") treats its argument as a character set and would
            # also eat leading c/h/r letters of names without the prefix
            # (for example "random" -> "andom"); remove the literal prefix.
            if chrom.startswith(prefix):
                chrom = chrom[len(prefix):]
            yield [
                add + chrom, rnatype, region_type, start, end, score,
                strand, codon, commnet
            ]
    finally:
        f.close()
コード例 #16
0
def soap_aln_parse(fn):
    """Yield the twelve tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[seqid, seqread, qual, mcounts, PEtag, length, strand,
        chrom, sitestart1, mismatch, cigar, match]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly twelve fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (seqid, seqread, qual, mcounts, PEtag, length, strand, chrom,
             sitestart1, mismatch, cigar, match) = line.rstrip("\n").split("\t")
            yield [
                seqid, seqread, qual, mcounts, PEtag, length, strand, chrom,
                sitestart1, mismatch, cigar, match
            ]
    finally:
        f.close()
コード例 #17
0
def blast6_parse(fn):
    """Iterate over the records of a twelve-column tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and is split on tabs; lines that do not yield exactly
    twelve fields are reported on stderr and skipped.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[query_id, subject_id, identity, alignment_length,
        mismatches, gap_opens, qstart, qend, sstart, send, evalue,
        bitscore]`` as raw split results (no type conversion).
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            try:
                (query_id, subject_id, identity, alignment_length,
                 mismatches, gap_opens, qstart, qend, sstart, send,
                 evalue, bitscore) = line.rstrip("\n").split("\t")
            except ValueError:
                # Only a wrong field count is expected here; a bare except
                # would also swallow KeyboardInterrupt/SystemExit.
                sys.stderr.write("[WARN] blast can not parse '%s'" % line)
                continue
            yield [query_id, subject_id, identity, alignment_length,
                   mismatches, gap_opens, qstart, qend, sstart, send,
                   evalue, bitscore]
    finally:
        f.close()
コード例 #18
0
def arf_read(arffn):
    """Yield the thirteen tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.

    Args:
        arffn: Path passed to ``compress.gz_file(arffn, "r")``.  Which
            formats that project helper accepts is not visible here.

    Yields:
        list: ``[rname, rleng, rstart, rend, rseq, gname, gleng, gstart,
        gend, gseq, gstrand, nmismatch, mathclabel]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly thirteen fields.
    """
    f = compress.gz_file(arffn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (rname, rleng, rstart, rend, rseq,
             gname, gleng, gstart, gend, gseq,
             gstrand, nmismatch, mathclabel) = line.rstrip("\n").split("\t")
            yield [
                rname, rleng, rstart, rend, rseq, gname, gleng, gstart, gend,
                gseq, gstrand, nmismatch, mathclabel
            ]
    finally:
        f.close()
コード例 #19
0
def miRNA_target_parse(fn):
    """Iterate over selected columns of a tab-separated file.

    Lines starting with "#" are skipped.  Every other line has its trailing
    newline removed and is split on tabs; the first three and the last three
    columns are returned, whatever lies in between is ignored.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[microRNAid, detalmciroRNA, target_Genes, UTR, pairing,
        miseq]`` -- columns 1-3 followed by the third-last, second-last and
        last column, as raw split results.

    Raises:
        ValueError: If a data line has fewer than three fields.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or a malformed line raises during unpacking.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            arr = line.rstrip("\n").split("\t")
            microRNAid, detalmciroRNA, target_Genes = arr[0:3]
            UTR, pairing, miseq = arr[-3], arr[-2], arr[-1]
            yield [microRNAid, detalmciroRNA, target_Genes,
                   UTR, pairing, miseq]
    finally:
        f.close()
コード例 #20
0
def miRNA_target_parse(fn):
    """Yield the first three and last three columns of each data line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs; columns between
    the third and the third-last are not returned.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[microRNAid, detalmciroRNA, target_Genes, UTR, pairing,
        miseq]`` as raw split results.

    Raises:
        ValueError: If a data line has fewer than three fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            arr = line.rstrip("\n").split("\t")
            microRNAid, detalmciroRNA, target_Genes = arr[0:3]
            UTR, pairing, miseq = arr[-3], arr[-2], arr[-1]
            yield [microRNAid, detalmciroRNA, target_Genes,
                   UTR, pairing, miseq]
    finally:
        f.close()
コード例 #21
0
def fasta_read(fn):
    """Yield every record that ``SeqIO.parse(handle, "fasta")`` produces.

    Example of a printed record, kept from the original documentation:

        ID: gi|2765658|emb|Z78533.1|CIZ78533
        Name: gi|2765658|emb|Z78533.1|CIZ78533
        Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
        Number of features: 0
        Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...GGG', SingleLetterAlphabet())

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        The record objects returned by ``SeqIO.parse``, unchanged.
    """
    f = compress.gz_file(fn, "r")
    # try/finally: release the handle even if the consumer stops iterating
    # early or the parser raises on malformed input.
    try:
        for seq in SeqIO.parse(f, "fasta"):
            yield seq
    finally:
        f.close()
コード例 #22
0
def fasta_read(fn):
    """Stream the records of a FASTA file via ``SeqIO.parse``.

    Example of a printed record, kept from the original documentation:

        ID: gi|2765658|emb|Z78533.1|CIZ78533
        Name: gi|2765658|emb|Z78533.1|CIZ78533
        Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
        Number of features: 0
        Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...GGG', SingleLetterAlphabet())

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        The record objects returned by ``SeqIO.parse(handle, "fasta")``,
        unchanged.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a parser error.
    try:
        for seq in SeqIO.parse(f, "fasta"):
            yield seq
    finally:
        f.close()
コード例 #23
0
def refgene_parse(fn):
    """Yield the sixteen tab-separated fields of every non-comment line.

    A line is treated as a comment when it starts with "#".  Data lines are
    stripped of all trailing whitespace before being split on tabs.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
        exon_num, exonstarts, exonends, uniq_id, symbol, kown1, kown2,
        exon_status]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a data line does not have exactly sixteen fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
             exon_num, exonstarts, exonends, uniq_id, symbol,
             kown1, kown2, exon_status) = line.rstrip().split("\t")
            yield [
                num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
                exon_num, exonstarts, exonends, uniq_id, symbol, kown1, kown2,
                exon_status
            ]
    finally:
        f.close()
コード例 #24
0
def variant_snpindel_pop(total_fn):
    """Yield the first twelve of thirteen tab-separated fields per data line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs; the thirteenth
    column must exist but is dropped from the result.

    Args:
        total_fn: Path passed to ``compress.gz_file(total_fn, "r")``.  Which
            formats that project helper accepts is not visible here.

    Yields:
        list: ``[chrom, position1, position2, ref, alt, qual,
        group_test_pvalue, depth_ref, depth_alt, depth_ref_samples,
        depth_alt_samples, genotype]`` as raw split results.

    Raises:
        ValueError: If a data line does not have exactly thirteen fields.
    """
    f = compress.gz_file(total_fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            # The 13th column is required to be present but is discarded.
            (chrom, position1, position2, ref, alt, qual, group_test_pvalue,
             depth_ref, depth_alt, depth_ref_samples, depth_alt_samples,
             genotype, _ignored) = line.rstrip("\n").split("\t")
            yield [
                chrom, position1, position2, ref, alt, qual,
                group_test_pvalue, depth_ref, depth_alt, depth_ref_samples,
                depth_alt_samples, genotype
            ]
    finally:
        f.close()
コード例 #25
0
def blast6_parse(fn):
    """Yield the twelve tab-separated fields of every parsable data line.

    A line is treated as a comment when it starts with "#".  Data lines lose
    only their trailing newline before being split on tabs.  A line with a
    field count other than twelve triggers a warning on stderr and is
    skipped instead of aborting the iteration.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[query_id, subject_id, identity, alignment_length,
        mismatches, gap_opens, qstart, qend, sstart, send, evalue,
        bitscore]`` as raw split results (no type conversion).
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted
    # or closed early by the consumer.
    try:
        for line in f:
            if line.startswith("#"):
                continue
            try:
                (query_id, subject_id, identity, alignment_length,
                 mismatches, gap_opens, qstart, qend, sstart, send,
                 evalue, bitscore) = line.rstrip("\n").split("\t")
            except ValueError:
                # Unpacking can only fail with ValueError; the former bare
                # except also trapped KeyboardInterrupt and SystemExit.
                sys.stderr.write("[WARN] blast can not parse '%s'" % line)
                continue
            yield [
                query_id, subject_id, identity, alignment_length, mismatches,
                gap_opens, qstart, qend, sstart, send, evalue, bitscore
            ]
    finally:
        f.close()
コード例 #26
0
ファイル: mutilstats.py プロジェクト: zju3351689/SPQC-1.0.0
	def parse_matrix_anno(self,fmatrixanno,cutoff=-10000000.0,precent=0.5,addtolog=1,log2tr=0):
		"""Load a tab-separated annotated matrix file into ``self.data``.

		Each data line consists of two annotation columns followed by the
		numeric columns.  Lines starting with "#", a newline, a space or a
		tab are ignored.  The number of numeric columns of the first data
		line is stored in ``self.n`` and every later data line must match it.

		A row is dropped when its sample standard deviation (``ddof=1``) is
		not positive, or when the number of values greater than ``cutoff``
		is at most ``int(self.n * precent)``.

		The method updates ``self.p``, ``self.data``, ``self.anno``,
		``self.anno1`` and ``self.anno2``; they must already exist on the
		instance (presumably set up in ``__init__``, which is not visible
		here).  On return ``self.data`` is a matrix of shape
		``(self.n, self.p)``.

		Args:
			fmatrixanno: Path passed to ``compress.gz_file(..., "r")``; the
				returned handle must support ``seek(0)``.
			cutoff: Threshold used by the second row filter.
			precent: Fraction of ``self.n`` used by the second row filter.
			addtolog: Offset added before ``log2`` when ``log2tr`` is set.
			log2tr: When truthy, kept rows are stored as
				``log2(value + addtolog)``.

		Returns:
			int: Always 0.

		Raises:
			AssertionError: If a data line's column count differs from
				``self.n`` or the annotation count differs from ``self.p``.
			SystemExit: With status 1 when a value cannot be converted to
				float.
		"""
		skip_prefixes = ("#", "\n", " ", "\t")
		fh = compress.gz_file(fmatrixanno,"r")
		sys.stderr.write('[INFO] Start to Build data ...\n')
		kept_rows = []
		# try/finally so the handle is closed on every exit path, including
		# the sys.exit below and a failing assertion.
		try:
			# First pass: only learn the number of numeric columns.
			for line in fh:
				if line.startswith(skip_prefixes):
					continue
				self.n = len(line.rstrip("\n").split("\t")[2:])
				break
			fh.seek(0)
			t0 = time.time()
			num = int(self.n * precent)
			for line in fh:
				if line.startswith(skip_prefixes):
					continue
				arr = line.rstrip("\n").split("\t")
				assert self.n == len(arr[2:])
				try:
					tmpdata = np.float64(arr[2:])
				except ValueError:
					sys.stderr.write("[ERROR] n is not same as exprsnums\n")
					print(arr)
					sys.exit(1)
				if np.std(tmpdata,ddof=1) <= 0:
					continue  # filter rows without variation
				if np.sum(tmpdata > cutoff) <= num:
					continue
				if log2tr:
					tmpdata = np.log2(tmpdata+addtolog)
				self.p += 1
				kept_rows.append(tmpdata)
				self.anno.append(arr[0] + "\t" + arr[1])
				self.anno1.append(arr[0])
				self.anno2.append(arr[1])
		finally:
			fh.close()
		if kept_rows:
			# ``self.data == None`` compared elementwise once data was an
			# array; use an identity test.  Concatenating once instead of per
			# row also avoids quadratic copying.
			if self.data is not None:
				kept_rows.insert(0, self.data)
			self.data = np.concatenate(kept_rows)
		self.data = np.asmatrix(np.transpose(self.data.reshape(self.p,self.n)))
		assert len(self.anno) == self.p
		sys.stderr.write('[INFO] Data Built done! cost %.2fs\n'%(time.time()-t0))
		return 0
コード例 #27
0
ファイル: mutilstats.py プロジェクト: zju3351689/SPQC-1.0.0
	def parse_factor(self,factorfile):
		"""Read a tab-separated factor table into instance attributes.

		Lines starting with "##" are ignored.  A line starting with a single
		"#" is the header: its columns after the first become ``self.fnm``,
		their count ``self.lvs``, and ``self.levels`` is reset to that many
		zeros.  Every other line appends its first column to ``self.snm``
		and the remaining columns to ``self.var``.

		After reading, ``self.var`` is turned into an array, the number of
		distinct values per column is stored in ``self.levels`` and printed,
		and ``self.var`` is finally converted with ``np.float64``.

		``self.snm`` and ``self.var`` must already be lists on the instance
		(presumably created in ``__init__``, which is not visible here), and
		the file must contain a header line before ``self.lvs`` is used.

		Args:
			factorfile: Path passed to ``compress.gz_file(..., "r")``.

		Returns:
			int: Always 0.
		"""
		f = compress.gz_file(factorfile,"r")
		# try/finally so the handle is closed even if a line cannot be
		# processed.
		try:
			for line in f:
				if line.startswith("##"):
					continue
				if line.startswith("#"):
					self.fnm = line.rstrip("\n").split("\t")[1:]
					self.lvs = len(self.fnm)
					self.levels = [0]*self.lvs
					continue
				arr = line.rstrip("\n").split("\t")
				self.snm.append(arr[0])
				# A real list (not a lazy map object) so np.asarray builds a
				# 2-D array on every Python version.
				self.var.append([str(v) for v in arr[1:]])
		finally:
			f.close()
		self.var = np.asarray(self.var)
		for i in range(self.lvs):
			self.levels[i] = len(set(self.var[:,i].tolist()))
		print(self.levels)
		self.var = np.float64(self.var)
		return 0
コード例 #28
0
 def parse_factor(self, factorfile):
     """Populate factor names, sample names and factor values from a file.

     Line handling:
       * lines starting with "##" are skipped;
       * a line starting with a single "#" is the header -- its columns
         after the first are stored in ``self.fnm``, their count in
         ``self.lvs`` and ``self.levels`` becomes a list of that many zeros;
       * any other line contributes its first column to ``self.snm`` and
         the remaining columns to ``self.var``.

     Afterwards ``self.var`` becomes an array, ``self.levels[i]`` is set to
     the number of distinct values in column ``i`` (and the list is
     printed), and ``self.var`` is converted with ``np.float64``.

     ``self.snm`` and ``self.var`` must already be lists on the instance
     (presumably created in ``__init__``, which is not visible here), and
     the file must contain a header line before ``self.lvs`` is used.

     Args:
         factorfile: Path passed to ``compress.gz_file(..., "r")``.

     Returns:
         int: Always 0.
     """
     f = compress.gz_file(factorfile, "r")
     # The finally clause guarantees the handle is closed even when a line
     # cannot be processed.
     try:
         for line in f:
             if line.startswith("##"):
                 continue
             if line.startswith("#"):
                 self.fnm = line.rstrip("\n").split("\t")[1:]
                 self.lvs = len(self.fnm)
                 self.levels = [0] * self.lvs
                 continue
             arr = line.rstrip("\n").split("\t")
             self.snm.append(arr[0])
             # A real list (not a lazy map object) so np.asarray builds a
             # 2-D array on every Python version.
             self.var.append([str(v) for v in arr[1:]])
     finally:
         f.close()
     self.var = np.asarray(self.var)
     for i in range(self.lvs):
         self.levels[i] = len(set(self.var[:, i].tolist()))
     print(self.levels)
     self.var = np.float64(self.var)
     return 0
コード例 #29
0
 def parse_matrix_anno(self,
                       fmatrixanno,
                       cutoff=-10000000.0,
                       precent=0.5,
                       addtolog=0.001,
                       log2tr=0):
     """Load a tab-separated annotated matrix file into ``self.data``.

     Each data line consists of two annotation columns followed by the
     numeric columns.  Lines starting with "#", a newline, a space or a tab
     are ignored.  The file is read three times: once to learn the number
     of numeric columns (``self.n``), once to count data lines so the
     result array can be pre-allocated, and once to fill it.

     When ``self.n >= 2`` a row is dropped (and reported on stderr) if
       * its NaN-ignoring sample standard deviation (``ddof=1``) is not
         positive,
       * it has more than ``int(self.n * precent)`` NaN values,
       * NaN values plus values below ``cutoff`` exceed that same number, or
       * all of its raw text values are identical.

     Kept rows are stored as columns, so on return ``self.data`` is a
     matrix of shape ``(self.n, self.p)`` with ``self.p`` the number of
     kept rows.  ``self.anno``, ``self.annosep``, ``self.anno1`` and
     ``self.anno2`` receive one entry per kept row and must already be
     lists on the instance (presumably created in ``__init__``, which is
     not visible here).

     Args:
         fmatrixanno: Path passed to ``compress.gz_file(..., "r")``; the
             returned handle must support ``seek(0)``.
         cutoff: Values below this count as "lower than noise".
         precent: Fraction of ``self.n`` tolerated as NaN/low values.
         addtolog: Offset added before ``log2`` when ``log2tr`` is set.
         log2tr: When truthy, kept rows are stored as
             ``log2(value + addtolog)``.

     Returns:
         int: Always 0.

     Raises:
         AssertionError: If the annotation count differs from the number
             of kept rows.
         SystemExit: With status 1 when a value cannot be converted to
             float.
     """
     skip_prefixes = ("#", "\n", " ", "\t")
     fh = compress.gz_file(fmatrixanno, "r")
     sys.stderr.write('[INFO] Start to Build data ...\n')
     realp = 0
     filterp = 0
     # try/finally so the handle is closed on every exit path, including
     # the sys.exit below.
     try:
         # Pass 1: number of numeric columns.  Strip all trailing
         # whitespace, exactly like the fill pass does, so both passes
         # agree on the column count.
         for line in fh:
             if line.startswith(skip_prefixes):
                 continue
             self.n = len(line.rstrip().split("\t")[2:])
             break
         fh.seek(0)
         t0 = time.time()
         num = int(self.n * precent)
         # Pass 2: upper bound for the number of rows (before filtering).
         self.p = 0
         for line in fh:
             if not line.startswith(skip_prefixes):
                 self.p += 1
         fh.seek(0)
         self.data = np.zeros((self.n, self.p))
         # Pass 3: parse, filter and store each row as a column.
         for line in fh:
             if line.startswith(skip_prefixes):
                 continue
             arr = line.rstrip().split("\t")
             try:
                 tmpdata = np.float64(arr[2:])
             except ValueError:
                 sys.stderr.write("[ERROR] %s" % line)
                 sys.stderr.write("[ERROR] n is not same as exprsnums\n")
                 sys.exit(1)
             if self.n >= 2:
                 if np.nanstd(tmpdata, ddof=1) <= 0:
                     sys.stderr.write(
                         "[INFO] data: %s was filtered, no variation \n" %
                         (arr[0] + ": " + arr[1]))
                     filterp += 1
                     continue
                 nan_mask = np.isnan(tmpdata)
                 nan_count = np.sum(nan_mask)
                 if nan_count > num:
                     sys.stderr.write(
                         "[WARN] data: %s was filtered, too many NANs \n" %
                         (arr[0] + ": " + arr[1]))
                     filterp += 1
                     continue
                 if nan_count + np.sum(tmpdata[~nan_mask] < cutoff) > num:
                     sys.stderr.write(
                         "[WARN] data: %s was filtered, too many exprs lower than noise \n"
                         % (arr[0] + ": " + arr[1]))
                     filterp += 1
                     continue
                 if len(set(arr[2:])) <= 1:
                     sys.stderr.write(
                         "[WARN] data: %s was filtered, because of no variation\n"
                         % (arr[0] + ": " + arr[1]))
                     filterp += 1
                     continue
             if log2tr:
                 tmpdata = np.log2(tmpdata + addtolog)
             realp += 1
             if realp % 100000 == 0:
                 sys.stderr.write("[INFO] parsed %d data\n" % realp)
             self.data[:, realp - 1] = tmpdata
             self.anno.append(arr[0] + "\t" + arr[1])
             self.annosep.append(arr[0] + "|" + arr[1])
             self.anno1.append(arr[0])
             self.anno2.append(arr[1])
     finally:
         fh.close()
     sys.stderr.write("[INFO] filter numbers: %d\n" % filterp)
     sys.stderr.write("[INFO] real numbers: %d\n" % realp)
     # Drop the unused, pre-allocated columns of filtered rows.
     self.data = np.asmatrix(self.data[:, 0:realp])
     self.p = realp
     assert len(self.anno) == self.p
     sys.stderr.write("\n")
     sys.stderr.write('[INFO] Data Built done! cost %.2fs\n' %
                      (time.time() - t0))
     return 0
コード例 #30
0
def bwt_parse(fn):
    """Yield the eight tab-separated fields of every line.

    No comment lines are skipped here.  Each line loses only its trailing
    newline before being split on tabs.

    Args:
        fn: Path passed to ``compress.gz_file(fn, "r")``.  Which formats
            that project helper accepts is not visible here.

    Yields:
        list: ``[query_id, strand, subject_id, pos, seq, qual, score,
        mismatch]`` as raw split results (no type conversion).

    Raises:
        ValueError: If a line does not have exactly eight fields.
    """
    f = compress.gz_file(fn, "r")
    # The finally clause closes the handle when the generator is exhausted,
    # closed early by the consumer, or interrupted by a malformed line.
    try:
        for line in f:
            (query_id, strand, subject_id, pos,
             seq, qual, score, mismatch) = line.rstrip("\n").split("\t")
            yield [query_id, strand, subject_id, pos,
                   seq, qual, score, mismatch]
    finally:
        f.close()