def bed6_parse(fn):
    """Yield the six leading tab-separated columns of each record in *fn*.

    The file is opened through ``compress.gz_file``. Lines starting with
    ``#`` are skipped. Every other line is stripped of trailing whitespace
    and split on at most five tabs, so anything after the fifth tab stays
    together in the last element (``strandother``).

    Yields:
        list of str: ``[chrom, start, end, name, score, strandother]``.

    Raises:
        ValueError: if a line has fewer than six columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            chrom, start, end, name, score, strandother = (
                line.rstrip().split("\t", 5))
            yield [chrom, start, end, name, score, strandother]
    finally:
        # Runs on exhaustion, on a parse error, and when the consumer
        # abandons the generator early, so the handle is not leaked.
        f.close()
def gff3_parse(fn):
    """Yield the nine tab-separated columns of every non-comment line in *fn*.

    The file is opened through ``compress.gz_file``; lines beginning with
    ``#`` are skipped and only the trailing newline is removed before
    splitting.

    Yields:
        list of str: ``[chrom, source, seqtype, start, end, score, strand,
        phase, attributes]``.

    Raises:
        ValueError: if a line does not split into exactly nine columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, source, seqtype, start, end, score, strand, phase,
             attributes) = line.rstrip("\n").split("\t")
            yield [chrom, source, seqtype, start, end, score, strand, phase,
                   attributes]
    finally:
        # Close the handle even if parsing fails or the caller stops
        # iterating before the end of the file.
        f.close()
def refgene_parse(fn):
    """Yield the sixteen tab-separated columns of each record in *fn*.

    The file is opened through ``compress.gz_file``. Lines starting with
    ``#`` are skipped; other lines have all trailing whitespace removed
    before being split on tabs.

    Yields:
        list of str: ``[num, nm_name, chrom, strand, exon_s, exon_e, cds_s,
        cds_e, exon_num, exonstarts, exonends, uniq_id, symbol, kown1,
        kown2, exon_status]``.

    Raises:
        ValueError: if a line does not split into exactly sixteen columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
             exon_num, exonstarts, exonends, uniq_id, symbol, kown1, kown2,
             exon_status) = line.rstrip().split("\t")
            yield [num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
                   exon_num, exonstarts, exonends, uniq_id, symbol, kown1,
                   kown2, exon_status]
    finally:
        # Guarantees the handle is released on errors and early exits too.
        f.close()
def bwt_parse(fn):
    """Yield the eight tab-separated columns of every line in *fn*.

    The file is opened through ``compress.gz_file``. No lines are skipped;
    only the trailing newline is removed before splitting.

    Yields:
        list of str: ``[query_id, strand, subject_id, pos, seq, qual, score,
        mismatch]``.

    Raises:
        ValueError: if a line does not split into exactly eight columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            (query_id, strand, subject_id, pos, seq, qual, score,
             mismatch) = line.rstrip("\n").split("\t")
            yield [query_id, strand, subject_id, pos, seq, qual, score,
                   mismatch]
    finally:
        # Close the handle even when a line is malformed or the consumer
        # stops iterating early.
        f.close()
def variant_snpindel_pop(total_fn):
    """Yield the first twelve of thirteen columns per record in *total_fn*.

    The file is opened through ``compress.gz_file``. Lines starting with
    ``#`` are skipped. Each remaining line must split into exactly thirteen
    tab-separated columns; the thirteenth is read but not yielded.

    Yields:
        list of str: ``[chrom, position1, position2, ref, alt, qual,
        group_test_pvalue, depth_ref, depth_alt, depth_ref_samples,
        depth_alt_samples, genotype]``.

    Raises:
        ValueError: if a line does not split into exactly thirteen columns.
    """
    f = compress.gz_file(total_fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            # The last column is unpacked only to enforce the column count.
            (chrom, position1, position2, ref, alt, qual, group_test_pvalue,
             depth_ref, depth_alt, depth_ref_samples, depth_alt_samples,
             genotype, _other) = line.rstrip("\n").split("\t")
            yield [chrom, position1, position2, ref, alt, qual,
                   group_test_pvalue, depth_ref, depth_alt,
                   depth_ref_samples, depth_alt_samples, genotype]
    finally:
        # Release the handle on errors and early termination as well.
        f.close()
def gtf_parse(fn, add="chr"):
    """Yield the nine columns of each record in *fn* with a normalised chrom.

    The file is opened through ``compress.gz_file``. Lines starting with
    ``#`` are skipped. For each remaining line a single leading ``"chr"``
    prefix is removed from the first column (if present) and *add* is
    prepended to it.

    Args:
        fn: path handed to ``compress.gz_file``.
        add: string prepended to the prefix-stripped chromosome name.

    Yields:
        list of str: ``[add + chrom, rnatype, region_type, start, end,
        score, strand, codon, comment]``.

    Raises:
        ValueError: if a line does not split into exactly nine columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (chrom, rnatype, region_type, start, end, score, strand, codon,
             comment) = line.rstrip("\n").split("\t")
            # str.lstrip("chr") removes any leading run of the characters
            # 'c', 'h', 'r' (e.g. "hs37d5" -> "s37d5"); strip the literal
            # prefix instead.
            if chrom.startswith("chr"):
                chrom = chrom[3:]
            yield [add + chrom, rnatype, region_type, start, end, score,
                   strand, codon, comment]
    finally:
        # Close the handle on errors and early termination too.
        f.close()
def arf_read(arffn):
    """Yield the thirteen tab-separated columns of each record in *arffn*.

    The file is opened through ``compress.gz_file``; lines starting with
    ``#`` are skipped and only the trailing newline is removed before
    splitting.

    Yields:
        list of str: ``[rname, rleng, rstart, rend, rseq, gname, gleng,
        gstart, gend, gseq, gstrand, nmismatch, mathclabel]``.

    Raises:
        ValueError: if a line does not split into exactly thirteen columns.
    """
    f = compress.gz_file(arffn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (rname, rleng, rstart, rend, rseq, gname, gleng, gstart, gend,
             gseq, gstrand, nmismatch, mathclabel) = (
                 line.rstrip("\n").split("\t"))
            yield [rname, rleng, rstart, rend, rseq, gname, gleng, gstart,
                   gend, gseq, gstrand, nmismatch, mathclabel]
    finally:
        # Ensure the handle is released on errors and early termination.
        f.close()
def sigfile_parse(fn):
    """Yield the five tab-separated columns of each record in *fn*.

    The file is opened through ``compress.gz_file``; lines starting with
    ``#`` are skipped and only the trailing newline is removed before
    splitting.

    Yields:
        list of str: ``[anno1, anno2, fc, rawp, fdr]``.

    Raises:
        ValueError: if a line does not split into exactly five columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            anno1, anno2, fc, rawp, fdr = line.rstrip("\n").split("\t")
            yield [anno1, anno2, fc, rawp, fdr]
    finally:
        # Close the handle even if a line is malformed or iteration stops
        # before the end of the file.
        f.close()
def sigfile_parse(fn):
    """Iterate over *fn* and yield each record as a five-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored; every other line is split on tabs after its
    trailing newline is removed.

    Yields:
        list of str: ``[anno1, anno2, fc, rawp, fdr]``.

    Raises:
        ValueError: if a line does not contain exactly five columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            anno1, anno2, fc, rawp, fdr = line.rstrip("\n").split("\t")
            yield [anno1, anno2, fc, rawp, fdr]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def soap_aln_parse(fn):
    """Yield the twelve tab-separated columns of each record in *fn*.

    The file is opened through ``compress.gz_file``; lines starting with
    ``#`` are skipped and only the trailing newline is removed before
    splitting.

    Yields:
        list of str: ``[seqid, seqread, qual, mcounts, PEtag, length,
        strand, chrom, sitestart1, mismatch, cigar, match]``.

    Raises:
        ValueError: if a line does not split into exactly twelve columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            (seqid, seqread, qual, mcounts, PEtag, length, strand, chrom,
             sitestart1, mismatch, cigar, match) = (
                 line.rstrip("\n").split("\t"))
            yield [seqid, seqread, qual, mcounts, PEtag, length, strand,
                   chrom, sitestart1, mismatch, cigar, match]
    finally:
        # Release the handle on errors and early termination as well.
        f.close()
def bed6_parse(fn):
    """Iterate over *fn* and yield each record as a six-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored. Trailing whitespace is stripped and the line
    is split on the first five tabs only, so the sixth element
    (``strandother``) holds the rest of the line.

    Yields:
        list of str: ``[chrom, start, end, name, score, strandother]``.

    Raises:
        ValueError: if a line has fewer than six columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            chrom, start, end, name, score, strandother = (
                line.rstrip().split("\t", 5))
            yield [chrom, start, end, name, score, strandother]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def fileread(fn):
    """Open *fn* for reading and return the resulting file object.

    Paths ending in ``.gz`` are opened with ``compress.gz_file`` and paths
    ending in ``.bz2`` with ``compress.bz2file``. Any other existing file is
    opened in text mode with the built-in ``open``.

    If *fn* is not an existing regular file, a message is written to stderr
    and the process exits with status 1 (``SystemExit``).
    """
    if not os.path.isfile(fn):
        sys.stderr.write("[Error] '%s' is not a file\n" % fn)
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)
    if fn.endswith(".gz"):
        return compress.gz_file(fn, "r")
    if fn.endswith(".bz2"):
        return compress.bz2file(fn)
    # Previously any other suffix reached ``return f`` with ``f`` unbound
    # and raised UnboundLocalError; fall back to a plain text file.
    return open(fn, "r")
def fileread(fn):
    """Return an open, readable file object for *fn*.

    Dispatches on the file name suffix: ``.gz`` goes to
    ``compress.gz_file``, ``.bz2`` goes to ``compress.bz2file``, and every
    other existing file is opened in text mode with the built-in ``open``.

    When *fn* is not an existing regular file an error line is written to
    stderr and ``SystemExit(1)`` is raised.
    """
    if not os.path.isfile(fn):
        sys.stderr.write("[Error] '%s' is not a file\n" % fn)
        # Use sys.exit: the bare ``exit`` name is injected by ``site`` and
        # may be missing in some interpreters.
        sys.exit(1)

    if fn.endswith(".gz"):
        handle = compress.gz_file(fn, "r")
    elif fn.endswith(".bz2"):
        handle = compress.bz2file(fn)
    else:
        # The original left the handle unbound here, so uncompressed input
        # crashed with UnboundLocalError at the return statement.
        handle = open(fn, "r")
    return handle
def gff3_parse(fn):
    """Iterate over *fn* and yield each record as a nine-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored; every other line is split on tabs after its
    trailing newline is removed.

    Yields:
        list of str: ``[chrom, source, seqtype, start, end, score, strand,
        phase, attributes]``.

    Raises:
        ValueError: if a line does not contain exactly nine columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            (chrom, source, seqtype, start, end, score, strand, phase,
             attributes) = line.rstrip("\n").split("\t")
            yield [chrom, source, seqtype, start, end, score, strand, phase,
                   attributes]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def gtf_parse(fn, add="chr"):
    """Iterate over *fn* and yield nine-element records with a renamed chrom.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored. The first column has one leading ``"chr"``
    prefix removed when present, and *add* is then prepended to it.

    Args:
        fn: path handed to ``compress.gz_file``.
        add: string placed in front of the prefix-stripped chromosome name.

    Yields:
        list of str: ``[add + chrom, rnatype, region_type, start, end,
        score, strand, codon, comment]``.

    Raises:
        ValueError: if a line does not contain exactly nine columns.
    """
    prefix = "chr"
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            (chrom, rnatype, region_type, start, end, score, strand, codon,
             comment) = line.rstrip("\n").split("\t")
            # lstrip("chr") treats its argument as a set of characters and
            # would also eat leading c/h/r letters of the real name (for
            # example "hs37d5" -> "s37d5"), so drop the exact prefix only.
            if chrom.startswith(prefix):
                chrom = chrom[len(prefix):]
            yield [add + chrom, rnatype, region_type, start, end, score,
                   strand, codon, comment]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def soap_aln_parse(fn):
    """Iterate over *fn* and yield each record as a twelve-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored; every other line is split on tabs after its
    trailing newline is removed.

    Yields:
        list of str: ``[seqid, seqread, qual, mcounts, PEtag, length,
        strand, chrom, sitestart1, mismatch, cigar, match]``.

    Raises:
        ValueError: if a line does not contain exactly twelve columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            (seqid, seqread, qual, mcounts, PEtag, length, strand, chrom,
             sitestart1, mismatch, cigar, match) = (
                 line.rstrip("\n").split("\t"))
            yield [seqid, seqread, qual, mcounts, PEtag, length, strand,
                   chrom, sitestart1, mismatch, cigar, match]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def blast6_parse(fn):
    """Yield the twelve tab-separated columns of each record in *fn*.

    The file is opened through ``compress.gz_file``. Lines starting with
    ``#`` are skipped. A line that does not split into exactly twelve
    columns is reported on stderr and skipped rather than aborting the
    parse.

    Yields:
        list of str: ``[query_id, subject_id, identity, alignment_length,
        mismatches, gap_opens, qstart, qend, sstart, send, evalue,
        bitscore]``.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            try:
                (query_id, subject_id, identity, alignment_length,
                 mismatches, gap_opens, qstart, qend, sstart, send, evalue,
                 bitscore) = line.rstrip("\n").split("\t")
            except ValueError:
                # Only a column-count mismatch is expected here; a bare
                # ``except`` would also swallow KeyboardInterrupt/SystemExit.
                sys.stderr.write("[WARN] blast can not parse '%s'" % line)
                continue
            yield [query_id, subject_id, identity, alignment_length,
                   mismatches, gap_opens, qstart, qend, sstart, send, evalue,
                   bitscore]
    finally:
        # Close the handle even when the consumer stops iterating early.
        f.close()
def arf_read(arffn):
    """Iterate over *arffn* and yield each record as a thirteen-element list.

    *arffn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored; every other line is split on tabs after its
    trailing newline is removed.

    Yields:
        list of str: ``[rname, rleng, rstart, rend, rseq, gname, gleng,
        gstart, gend, gseq, gstrand, nmismatch, mathclabel]``.

    Raises:
        ValueError: if a line does not contain exactly thirteen columns.
    """
    handle = compress.gz_file(arffn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            (rname, rleng, rstart, rend, rseq, gname, gleng, gstart, gend,
             gseq, gstrand, nmismatch, mathclabel) = (
                 line.rstrip("\n").split("\t"))
            yield [rname, rleng, rstart, rend, rseq, gname, gleng, gstart,
                   gend, gseq, gstrand, nmismatch, mathclabel]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def miRNA_target_parse(fn):
    """Yield the first three and last three columns of each record in *fn*.

    The file is opened through ``compress.gz_file``; lines starting with
    ``#`` are skipped and only the trailing newline is removed before
    splitting on tabs. Columns between the third and the third-from-last
    are ignored.

    Yields:
        list of str: ``[microRNAid, detalmciroRNA, target_Genes, UTR,
        pairing, miseq]``.

    Raises:
        ValueError: if a line has fewer than three columns.
    """
    f = compress.gz_file(fn, "r")
    try:
        for line in f:
            if line.startswith("#"):
                continue
            arr = line.rstrip("\n").split("\t")
            microRNAid, detalmciroRNA, target_Genes = arr[0:3]
            # The line above guarantees at least three columns, so this
            # slice always has exactly three elements.
            UTR, pairing, miseq = arr[-3:]
            yield [microRNAid, detalmciroRNA, target_Genes, UTR, pairing,
                   miseq]
    finally:
        # Close the handle on errors and early termination too.
        f.close()
def miRNA_target_parse(fn):
    """Iterate over *fn* and yield six selected columns per record.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored. From each remaining tab-separated line the
    first three columns and the last three columns are returned; anything
    in between is dropped.

    Yields:
        list of str: ``[microRNAid, detalmciroRNA, target_Genes, UTR,
        pairing, miseq]``.

    Raises:
        ValueError: if a line has fewer than three columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            fields = line.rstrip("\n").split("\t")
            microRNAid, detalmciroRNA, target_Genes = fields[0:3]
            UTR = fields[-3]
            pairing = fields[-2]
            miseq = fields[-1]
            yield [microRNAid, detalmciroRNA, target_Genes, UTR, pairing,
                   miseq]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def fasta_read(fn):
    """Yield every record that ``SeqIO.parse`` reads from *fn* as FASTA.

    The file is opened through ``compress.gz_file`` and the handle is
    passed to ``SeqIO.parse(handle, "fasta")``.

    Example of one yielded record when printed::

        ID: gi|2765658|emb|Z78533.1|CIZ78533
        Name: gi|2765658|emb|Z78533.1|CIZ78533
        Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
        Number of features: 0
        Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...GGG', SingleLetterAlphabet())
    """
    f = compress.gz_file(fn, "r")
    try:
        for seq in SeqIO.parse(f, "fasta"):
            yield seq
    finally:
        # Close the handle even if parsing fails or the consumer stops
        # iterating before the last record.
        f.close()
def fasta_read(fn):
    """Iterate over the FASTA records of *fn*.

    *fn* is opened through ``compress.gz_file``; records come from
    ``SeqIO.parse(handle, "fasta")`` and are yielded unchanged.

    A yielded record prints like this::

        ID: gi|2765658|emb|Z78533.1|CIZ78533
        Name: gi|2765658|emb|Z78533.1|CIZ78533
        Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
        Number of features: 0
        Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...GGG', SingleLetterAlphabet())
    """
    handle = compress.gz_file(fn, "r")
    try:
        for record in SeqIO.parse(handle, "fasta"):
            yield record
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def refgene_parse(fn):
    """Iterate over *fn* and yield each record as a sixteen-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored; every other line has all trailing whitespace
    stripped and is then split on tabs.

    Yields:
        list of str: ``[num, nm_name, chrom, strand, exon_s, exon_e, cds_s,
        cds_e, exon_num, exonstarts, exonends, uniq_id, symbol, kown1,
        kown2, exon_status]``.

    Raises:
        ValueError: if a line does not contain exactly sixteen columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            (num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
             exon_num, exonstarts, exonends, uniq_id, symbol, kown1, kown2,
             exon_status) = line.rstrip().split("\t")
            yield [num, nm_name, chrom, strand, exon_s, exon_e, cds_s, cds_e,
                   exon_num, exonstarts, exonends, uniq_id, symbol, kown1,
                   kown2, exon_status]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def variant_snpindel_pop(total_fn):
    """Iterate over *total_fn* and yield twelve columns per record.

    *total_fn* is opened through ``compress.gz_file``. Comment lines
    (starting with ``#``) are ignored. Each remaining line must contain
    exactly thirteen tab-separated columns; the final column is discarded.

    Yields:
        list of str: ``[chrom, position1, position2, ref, alt, qual,
        group_test_pvalue, depth_ref, depth_alt, depth_ref_samples,
        depth_alt_samples, genotype]``.

    Raises:
        ValueError: if a line does not contain exactly thirteen columns.
    """
    handle = compress.gz_file(total_fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            # ``_unused`` exists only so the unpacking checks the width.
            (chrom, position1, position2, ref, alt, qual, group_test_pvalue,
             depth_ref, depth_alt, depth_ref_samples, depth_alt_samples,
             genotype, _unused) = line.rstrip("\n").split("\t")
            yield [chrom, position1, position2, ref, alt, qual,
                   group_test_pvalue, depth_ref, depth_alt,
                   depth_ref_samples, depth_alt_samples, genotype]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def blast6_parse(fn):
    """Iterate over *fn* and yield each record as a twelve-element list.

    *fn* is opened through ``compress.gz_file``. Comment lines (starting
    with ``#``) are ignored. Lines whose column count is not twelve produce
    a warning on stderr and are skipped.

    Yields:
        list of str: ``[query_id, subject_id, identity, alignment_length,
        mismatches, gap_opens, qstart, qend, sstart, send, evalue,
        bitscore]``.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            if line.startswith("#"):
                continue
            try:
                (query_id, subject_id, identity, alignment_length,
                 mismatches, gap_opens, qstart, qend, sstart, send, evalue,
                 bitscore) = line.rstrip("\n").split("\t")
            except ValueError:
                # Unpacking raises ValueError on a width mismatch; catching
                # only that keeps Ctrl-C and SystemExit working.
                sys.stderr.write("[WARN] blast can not parse '%s'" % line)
                continue
            yield [query_id, subject_id, identity, alignment_length,
                   mismatches, gap_opens, qstart, qend, sstart, send, evalue,
                   bitscore]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
def parse_matrix_anno(self, fmatrixanno, cutoff=-10000000.0, precent=0.5, addtolog=1, log2tr=0):
    """Load an annotated numeric matrix file into ``self.data``.

    Each data line is tab-separated: two annotation columns followed by the
    numeric values. Lines starting with ``#``, a newline, a space or a tab
    are skipped. The number of value columns of the first data line is
    stored in ``self.n``.

    A row is dropped when its sample standard deviation (``ddof=1``) is not
    positive, or when the number of values greater than *cutoff* is at most
    ``int(self.n * precent)``. Kept rows increment ``self.p`` and append to
    ``self.anno``, ``self.anno1`` and ``self.anno2``. On return
    ``self.data`` is a matrix of shape ``(self.n, self.p)``.

    Assumes ``self.p``, ``self.data`` and the ``anno*`` lists are initialised
    by the owning class (not visible here) -- TODO confirm.

    Args:
        fmatrixanno: path handed to ``compress.gz_file``.
        cutoff: threshold used by the "values above cutoff" filter.
        precent: fraction of ``self.n`` used to derive that filter's count.
        addtolog: offset added before ``np.log2`` when *log2tr* is truthy.
        log2tr: when truthy, kept rows are log2-transformed.

    Returns:
        int: always 0.

    Raises:
        AssertionError: if a row's value count differs from ``self.n``.
        SystemExit: if a row's values cannot be converted to float.
    """
    skip_prefixes = ("#", "\n", " ", "\t")
    fh = compress.gz_file(fmatrixanno, "r")
    try:
        sys.stderr.write('[INFO] Start to Build data ...\n')
        # First pass: take the column count from the first data line.
        for line in fh:
            if line.startswith(skip_prefixes):
                continue
            self.n = len(line.rstrip("\n").split("\t")[2:])
            break
        fh.seek(0)
        t0 = time.time()
        num = int(self.n * precent)
        # Collect rows and concatenate once at the end; concatenating per
        # row copied the whole buffer each time (quadratic in rows).
        # ``is None`` is required: ``ndarray == None`` is element-wise.
        kept = [] if self.data is None else [self.data]
        for line in fh:
            if line.startswith(skip_prefixes):
                continue
            arr = line.rstrip("\n").split("\t")
            assert self.n == len(arr[2:])
            try:
                tmpdata = np.float64(arr[2:])
            except (ValueError, TypeError):
                sys.stderr.write("[ERROR] n is not same as exprsnums\n")
                print(arr)
                sys.exit(1)
            if np.std(tmpdata, ddof=1) <= 0:
                continue  # no variation in this row
            if np.sum(tmpdata > cutoff) <= num:
                continue  # too few values above the cutoff
            if log2tr:
                tmpdata = np.log2(tmpdata + addtolog)
            self.p += 1
            kept.append(tmpdata)
            self.anno.append(arr[0] + "\t" + arr[1])
            self.anno1.append(arr[0])
            self.anno2.append(arr[1])
    finally:
        # Release the handle on parse errors and on sys.exit as well.
        fh.close()
    # With no kept rows, build an empty buffer instead of crashing on None.
    flat = np.concatenate(kept) if kept else np.empty(0)
    self.data = np.asmatrix(np.transpose(flat.reshape(self.p, self.n)))
    assert len(self.anno) == self.p
    sys.stderr.write('[INFO] Data Built done! cost %.2fs\n' % (time.time() - t0))
    return 0
def parse_factor(self, factorfile):
    """Read a tab-separated factor table into attributes of ``self``.

    Lines starting with ``##`` are skipped. A line starting with a single
    ``#`` is the header: its columns after the first become ``self.fnm``,
    their count ``self.lvs``, and ``self.levels`` is reset to zeros. For
    every other line the first column is appended to ``self.snm`` and the
    remaining columns to ``self.var``.

    After reading, ``self.levels[i]`` is set to the number of distinct
    values in column ``i``, the levels are printed to stdout, and
    ``self.var`` is converted with ``np.float64``.

    Assumes ``self.snm`` and ``self.var`` are lists created by the owning
    class, and that the file contains a header line (otherwise
    ``self.lvs`` is never set here) -- TODO confirm.

    Returns:
        int: always 0.
    """
    f = compress.gz_file(factorfile, "r")
    try:
        for line in f:
            if line.startswith("##"):
                continue
            if line.startswith("#"):
                self.fnm = line.rstrip("\n").split("\t")[1:]
                self.lvs = len(self.fnm)
                self.levels = [0] * self.lvs
                continue
            arr = line.rstrip("\n").split("\t")
            self.snm.append(arr[0])
            # A real list, not map(): on Python 3 map() is a lazy iterator
            # and np.asarray would not build a 2-D array from it.
            self.var.append([str(v) for v in arr[1:]])
    finally:
        # Close the handle even if a line cannot be processed.
        f.close()
    self.var = np.asarray(self.var)
    for i in range(self.lvs):
        self.levels[i] = len(set(self.var[:, i].tolist()))
    print(self.levels)
    self.var = np.float64(self.var)
    return 0
def parse_factor(self, factorfile):
    """Populate factor names, sample names and factor values from a file.

    *factorfile* is opened through ``compress.gz_file`` and read as
    tab-separated text:

    * ``##`` lines are ignored;
    * a ``#`` line is the header -- columns after the first are stored in
      ``self.fnm``, their count in ``self.lvs``, and ``self.levels`` is
      reset to a list of zeros of that length;
    * any other line contributes its first column to ``self.snm`` and the
      rest to ``self.var``.

    Once the file is read, ``self.var`` becomes an array, the number of
    distinct values per column is stored in ``self.levels`` and printed to
    stdout, and ``self.var`` is finally converted with ``np.float64``.

    NOTE(review): ``self.snm`` / ``self.var`` are presumably lists set up
    by the owning class, and a header line is required for ``self.lvs`` to
    exist -- verify against the class definition.

    Returns:
        int: always 0.
    """
    handle = compress.gz_file(factorfile, "r")
    try:
        for line in handle:
            if line.startswith("##"):
                continue
            fields = line.rstrip("\n").split("\t")
            if line.startswith("#"):
                self.fnm = fields[1:]
                self.lvs = len(self.fnm)
                self.levels = [0] * self.lvs
                continue
            self.snm.append(fields[0])
            # Materialise a list so this also works where map() is lazy.
            self.var.append([str(value) for value in fields[1:]])
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()
    self.var = np.asarray(self.var)
    for col in range(self.lvs):
        self.levels[col] = len(set(self.var[:, col].tolist()))
    print(self.levels)
    self.var = np.float64(self.var)
    return 0
def parse_matrix_anno(self, fmatrixanno, cutoff=-10000000.0, precent=0.5, addtolog=0.001, log2tr=0):
    """Load an annotated numeric matrix file into ``self.data``.

    Each data line is tab-separated: two annotation columns followed by the
    numeric values. Lines starting with ``#``, a newline, a space or a tab
    are skipped. The file is read three times: once to take the value-column
    count (``self.n``) from the first data line, once to count data lines so
    the buffer can be preallocated, and once to parse and filter.

    With ``num = int(self.n * precent)``, a row is dropped (and reported on
    stderr) when any of these holds:

    * ``self.n >= 2`` and its NaN-ignoring sample std (``ddof=1``) is <= 0;
    * it has more than ``num`` NaN values;
    * NaN count plus count of non-NaN values below *cutoff* exceeds ``num``;
    * all of its raw value strings are identical.

    Kept rows fill successive columns of ``self.data`` and append to
    ``self.anno``, ``self.annosep``, ``self.anno1`` and ``self.anno2``. On
    return ``self.data`` is a matrix of shape ``(self.n, kept)`` and
    ``self.p`` is the number of kept rows.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source; the ``self.n >= 2`` guard is assumed to wrap only the std
    check -- confirm against the original indentation. The ``anno*`` lists
    are presumably initialised by the owning class.

    Args:
        fmatrixanno: path handed to ``compress.gz_file``.
        cutoff: values below this count as "lower than noise".
        precent: fraction of ``self.n`` giving the tolerated bad-value count.
        addtolog: offset added before ``np.log2`` when *log2tr* is truthy.
        log2tr: when truthy, kept rows are log2-transformed.

    Returns:
        int: always 0.

    Raises:
        SystemExit: if a row's values cannot be converted to float.
    """
    skip_prefixes = ("#", "\n", " ", "\t")
    fh = compress.gz_file(fmatrixanno, "r")
    try:
        sys.stderr.write('[INFO] Start to Build data ...\n')
        # Pass 1: number of value columns, from the first data line.
        for line in fh:
            if line.startswith(skip_prefixes):
                continue
            self.n = len(line.rstrip("\n").split("\t")[2:])
            break
        fh.seek(0)
        t0 = time.time()
        num = int(self.n * precent)

        # Pass 2: upper bound on kept rows, used to preallocate the buffer.
        self.p = 0
        for line in fh:
            if not line.startswith(skip_prefixes):
                self.p += 1
        fh.seek(0)
        self.data = np.zeros((self.n, self.p))

        # Pass 3: parse, filter and store.
        realp = 0
        filterp = 0
        for line in fh:
            if line.startswith(skip_prefixes):
                continue
            arr = line.rstrip("\n").rstrip().split("\t")
            try:
                tmpdata = np.float64(arr[2:])
            except (ValueError, TypeError):
                # Narrow catch so Ctrl-C / SystemExit are not swallowed.
                sys.stderr.write("[ERROR] %s" % line)
                sys.stderr.write("[ERROR] n is not same as exprsnums\n")
                sys.exit(1)
            if self.n >= 2:
                if np.nanstd(tmpdata, ddof=1) <= 0:
                    sys.stderr.write(
                        "[INFO] data: %s was filtered, no variation \n" %
                        (arr[0] + ": " + arr[1]))
                    filterp += 1
                    continue
            nan_mask = np.isnan(tmpdata)
            nan_count = np.sum(nan_mask)
            if nan_count > num:
                sys.stderr.write(
                    "[WARN] data: %s was filtered, too many NANs \n" %
                    (arr[0] + ": " + arr[1]))
                filterp += 1
                continue
            if nan_count + np.sum(tmpdata[~nan_mask] < cutoff) > num:
                sys.stderr.write(
                    "[WARN] data: %s was filtered, too many exprs lower than noise \n"
                    % (arr[0] + ": " + arr[1]))
                filterp += 1
                continue
            if len(set(arr[2:])) <= 1:
                sys.stderr.write(
                    "[WARN] data: %s was filtered, because of no variation\n"
                    % (arr[0] + ": " + arr[1]))
                filterp += 1
                continue
            if log2tr:
                tmpdata = np.log2(tmpdata + addtolog)
            realp += 1
            if realp % 100000 == 0:
                sys.stderr.write("[INFO] parsed %d data\n" % realp)
            self.data[:, realp - 1] = tmpdata
            self.anno.append(arr[0] + "\t" + arr[1])
            self.annosep.append(arr[0] + "|" + arr[1])
            self.anno1.append(arr[0])
            self.anno2.append(arr[1])
        sys.stderr.write("[INFO] filter numbers: %d\n" % filterp)
        sys.stderr.write("[INFO] real numbers: %d\n" % realp)
    finally:
        # Release the handle on parse errors and on sys.exit as well.
        fh.close()
    # Drop the unused tail of the preallocated buffer (filtered rows).
    self.data = np.asmatrix(self.data[:, 0:realp])
    self.p = realp
    assert len(self.anno) == self.p
    sys.stderr.write("\n")
    sys.stderr.write('[INFO] Data Built done! cost %.2fs\n' %
                     (time.time() - t0))
    return 0
def bwt_parse(fn):
    """Iterate over *fn* and yield each line as an eight-element list.

    *fn* is opened through ``compress.gz_file``. Every line is used (there
    is no comment skipping); the trailing newline is removed and the line
    is split on tabs.

    Yields:
        list of str: ``[query_id, strand, subject_id, pos, seq, qual, score,
        mismatch]``.

    Raises:
        ValueError: if a line does not contain exactly eight columns.
    """
    handle = compress.gz_file(fn, "r")
    try:
        for line in handle:
            (query_id, strand, subject_id, pos, seq, qual, score,
             mismatch) = line.rstrip("\n").split("\t")
            yield [query_id, strand, subject_id, pos, seq, qual, score,
                   mismatch]
    finally:
        # The close must not depend on the loop running to completion.
        handle.close()