def AnaClusterFile(infile, anaList): #{{{
    """
    Summarize one clustered-topology file and append the summary to anaList.

    Input:
        infile   path of a file with a number of clustered topologies of one
                 protein family, in FASTA format. Only the header lines
                 (those starting with ">") are analyzed, e.g.
                 >Q81PI9, nTM=8 ClusterNo=1 numSeqInCluster=15
        anaList  list that receives the summary; it is modified in place

    Output:
        Nothing is returned. If at least one cluster is found, the dict
            {'famid': famid,
             'numseq': numseq,
             'cluster': [[numTM, numSeqCls, [seqid1, seqid2, ...]], ...]}
        is appended to anaList, where
            famid      base name of infile up to the first "."
            numseq     number of header lines in the file
            numTM      value of GetNumTMFromAnnotation() for the first
                       header of the cluster
            numSeqCls  value of GetNumSeqInClusterFromAnnotation() for the
                       first header of the cluster
        Clusters are listed in order of first appearance. A sequence is
        added to the cluster given by GetClusterNoFromAnnotation() of its
        own header, even when the clusters are interleaved in the file.

        If the file cannot be read, or contains no cluster, a message is
        written to stderr and anaList is left untouched.
    """
    pfamid = os.path.basename(infile).split(".")[0]
    try:
        # "with" guarantees the handle is closed even if read() fails
        with open(infile, "r") as fpin:
            lines = fpin.read().split("\n")
    except IOError:
        sys.stderr.write("Failed to read file %s\n" % (infile))
        return

    cntseq = 0
    clusterList = []
    # cluster number -> position of that cluster in clusterList, so that a
    # sequence always joins its own cluster and not just the newest one
    idxOfCluster = {}
    for line in lines:
        if not line or line[0] != ">":
            continue
        cntseq += 1
        numCls = GetClusterNoFromAnnotation(line)
        seqid = myfunc.GetFirstWord(line).lstrip('>').rstrip(",")
        if numCls in idxOfCluster:
            clusterList[idxOfCluster[numCls]][2].append(seqid)
        else: # for a new cluster
            numSeqCls = GetNumSeqInClusterFromAnnotation(line)
            numTM = GetNumTMFromAnnotation(line)
            idxOfCluster[numCls] = len(clusterList)
            clusterList.append([numTM, numSeqCls, [seqid]])

    if len(clusterList) > 0:
        ana = {}
        ana['famid'] = pfamid
        ana['cluster'] = clusterList
        ana['numseq'] = cntseq
        anaList.append(ana)
    else:
        sys.stderr.write("No cluster in file %s\n" % (infile))
def GetDatabaseIDList(annoList):#{{{
    """
    Collect sequence IDs from a list of annotation lines.

    Empty lines and lines starting with "#" are skipped. For every other
    line the first word (myfunc.GetFirstWord) is cut at the first "(" or
    "/", whichever comes first. Words containing "target" are ignored; the
    remaining words are converted with myfunc.GetSeqIDFromAnnotation.

    Return the collected IDs after passing them through myfunc.uniquelist
    (presumably removes duplicates -- confirm in myfunc).
    """
    collected = []
    for annotation in annoList:
        if annotation == "" or annotation[0] == "#":
            continue
        word = myfunc.GetFirstWord(annotation)
        # keep only the part before the earliest "(" or "/" (if any)
        cutPoints = [len(word)]
        for mark in ('(', '/'):
            pos = word.find(mark)
            if pos != -1:
                cutPoints.append(pos)
        word = word[:min(cutPoints)]
        if "target" in word:
            continue
        collected.append(myfunc.GetSeqIDFromAnnotation(word))
    return myfunc.uniquelist(collected)
def ExtractFromPairCmpRecordContent(recordContent):#{{{
    """
    Extract pairwise topology comparison from the record content in the
    file *.paircmp
    updated 2011-11-21

    Input:
        recordContent  text of one record, lines separated by "\n"

    Output:
        {} (with a message on stderr) if the record has at most one line,
        otherwise a dict with the keys
            'general_info_line'  the "PairwiseComparison:" line ("" if none)
            'mapTMline'          the "SeqID" and "TMMap" lines as text
            'mapArray'           for each of those lines, the integers after
                                 the first ":" (with "-" characters removed)
            'member'             ScanfMemberInfo() of each "Member*" line
            'NterTopo1', 'NterTopo2'
                                 second word of the "NtermTopo1" /
                                 "NtermTopo2" line (only set if present)
            'ana1', 'ana2'       dicts that may hold 'Nterm', 'Cterm'
                                 (ScanfUnmappedRecord() of the line) and
                                 'internal' (list of ScanfUnmappedRecord()
                                 results of the "Inter1*" / "Inter2*" lines)
        ScanfOverallInfo_pairwise() is also called with the
        "PairwiseComparison:" line and the dict being built.
    """
    lines = recordContent.split('\n')
    if len(lines) <= 1: # record is empty
        # sys.stderr.write is valid for both the print statement and the
        # print function; the emitted text is the same as before
        sys.stderr.write("record is empty\n %s\n" % (recordContent,))
        return {}

    record = {}
    record['mapTMline'] = []
    record['general_info_line'] = ""
    record['mapArray'] = []
    record['ana1'] = {}
    record['ana2'] = {}
    record['member'] = []

    # tag of an unmapped terminal region -> (sub-dict of record, key in it)
    terminalTag = {
        "Nterm1": ('ana1', 'Nterm'),
        "Nterm2": ('ana2', 'Nterm'),
        "Cterm1": ('ana1', 'Cterm'),
        "Cterm2": ('ana2', 'Cterm'),
    }
    # NOTE: the keys are spelled 'NterTopo' (without "m"), unlike the tags;
    # the spelling is kept because code outside this function may read it
    ntermTopoTag = {
        "NtermTopo1": 'NterTopo1',
        "NtermTopo2": 'NterTopo2',
    }
    # 6-character tag prefix of an internal unmapped region -> sub-dict
    internalPrefix = {
        "Inter1": 'ana1',
        "Inter2": 'ana2',
    }

    for line in lines:
        tag = myfunc.GetFirstWord(line)
        prefix = tag[0:6]
        if tag == "PairwiseComparison:":
            ScanfOverallInfo_pairwise(line, record)
            record['general_info_line'] = line
        elif tag == "SeqID" or tag == "TMMap":
            record['mapTMline'].append(line)
            fields = line.split(':')[1].replace('-', '').split()
            record['mapArray'].append([int(x) for x in fields])
        elif prefix == "Member":
            record['member'].append(ScanfMemberInfo(line))
        elif tag in ntermTopoTag:
            record[ntermTopoTag[tag]] = line.split()[1]
        elif tag in terminalTag:
            (anaKey, regionKey) = terminalTag[tag]
            record[anaKey][regionKey] = ScanfUnmappedRecord(line)
        elif prefix in internalPrefix:
            ana = record[internalPrefix[prefix]]
            if 'internal' not in ana:
                ana['internal'] = []
            ana['internal'].append(ScanfUnmappedRecord(line))
    return record
def WriteIndexFasta(seqWithAnno, fpdb, dbname, fpindex, cntdbfile, #{{{
        record_offset, idSet, idtype):
    """
    Write sequence to indexed fasta file, sequences with redundant IDs are
    ignored

    Input:
        seqWithAnno    text of one record (annotation line and sequence)
        fpdb           handle of the current database file, or None; when
                       None, the file dbname + "<cntdbfile>.db" is created
                       in binary mode and used from then on
        dbname         path prefix of the database files
        fpindex        handle to which the index line
                       "seqid cntdbfile record_offset length" is written
        cntdbfile      number of the current database file
        record_offset  offset of this record in the current database file
        idSet          set of IDs already written; the ID of this record is
                       added to it
        idtype         0: the ID is myfunc.GetSeqIDFromAnnotation(seqWithAnno)
                       1: the ID is the first word of seqWithAnno after
                          removing the leading ">"

    Output:
        (fpdb, record_offset)  the (possibly newly opened) database handle
        and the offset for the next record. Both are returned unchanged if
        the ID is already in idSet.

    Raises ValueError if idtype is neither 0 nor 1.
    """
    if idtype == 0:
        seqid = myfunc.GetSeqIDFromAnnotation(seqWithAnno)
    elif idtype == 1:
        seqid = myfunc.GetFirstWord(seqWithAnno.lstrip(">"))
    else:
        # previously seqid was left unbound and the next line failed with
        # an UnboundLocalError
        raise ValueError("Unsupported idtype %r, expected 0 or 1" % (idtype,))

    if seqid in idSet:
        return (fpdb, record_offset)

    record = seqWithAnno + "\n"
    # The database file is opened in binary mode, so write bytes and count
    # the length in bytes. For a byte string this is a no-op.
    if isinstance(record, bytes):
        payload = record
    else:
        payload = record.encode("utf-8")

    if fpdb is None:
        dbfile = dbname + "%d.db" % (cntdbfile)
        fpdb = open(dbfile, "wb")
        print("dbfile %s is created." % dbfile)
    fpindex.write("%s %d %d %d\n" % (seqid, cntdbfile, record_offset,
        len(payload)))
    fpdb.write(payload)
    record_offset += len(payload)
    idSet.add(seqid)
    return (fpdb, record_offset)
def main(): #{{{ if 0: #{{{ strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo" strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo" strProtein1 = "id1" strProtein2 = "id2" fpLog = sys.stdout class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew( strTop1, strTop2, strProtein1, strProtein2, fpLog) # Note: calling the int, float, string will not change their original value # calling the dict, list will change their original value print "strTop1:", strTop1 print "strTop2:", strTop2 #}}} if 0: #{{{ PrintFuncName() print("this file name is: %s" % __file__) #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") lines = fp.readlines() fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename BLOCK_SIZE = 100000 fp = open(filename, "r") buff = fp.read(BLOCK_SIZE) while buff: buff = fp.read(BLOCK_SIZE) fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") line = fp.readline() while line: line = fp.readline() fp.close() #}}} if 0: #{{{ try: BLOCK_SIZE = 100000 infile = sys.argv[1] fpin = open(infile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) if len(recordList) > 0: for record in recordList: sys.stdout.write(">%s\n" % record[1]) sys.stdout.write("%s\n" % record[2]) if isEOFreached == True: break fpin.close() except IOError: raise #}}} if 0: #{{{ try: infile = sys.argv[1] (annoList, seqList) = myfunc.ReadFasta_without_id(infile) for i in xrange(len(seqList)): sys.stdout.write(">%s\n" % annoList[i]) sys.stdout.write("%s\n" % seqList[i]) except IOError: raise #}}} if 0: #{{{ hhrfile = 
"hhsearch/A1RZ92-Q74DY9.hhr" if IsDuplicatedByHHSearch(hhrfile): print "yes" #}}} if 0: #{{{ import pairlistwithfamid2pairaln_by_msa seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV" seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP" seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment( 
seq1, seq2) print alignFactor #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ import my_extractdb #miniking my_extractdb.py see which one is faster try: dbname = sys.argv[1] idlistfile = sys.argv[2] cls = myfunc.MyDB(dbname) if cls.failure: print >> sys.stderr, "MyDB init failed" else: idlist = open(idlistfile, "r").read().split("\n") fpout = sys.stdout for seqid in idlist: if seqid: record = cls.GetRecord(seqid) fpout.write(record) # for rd in cls.GetAllRecord(): # print rd # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) # print (seqid, anno, seq) except IndexError: print "error" pass #}}} if 0: #{{{ #test ReadLineByBlock try: infile = sys.argv[1] from myfunc import ReadLineByBlock cls = ReadLineByBlock(infile) lines = cls.readlines() while lines != None: for line in lines: print line lines = cls.readlines() except IndexError: pass #}}} if 0: #{{{ #test speed of ReadLineByBlock # ReadLineByBlock is about 3 times fater than file.readline() try: from myfunc import ReadLineByBlock infile = sys.argv[1] start = time.time() hdl = ReadLineByBlock(infile) lines = hdl.readlines() while lines != None: lines = hdl.readlines() hdl.close() end = time.time() msg = "Reading %s by ReadLineByBlock costs %.3fs seconds" print msg % (infile, (end - start)) start = time.time() hdl = open(infile, "r") line = hdl.readline() while line: line = hdl.readline() hdl.close() end = time.time() msg = "Reading %s by readline() costs %.3fs seconds" print msg % (infile, (end - start)) except IndexError: pass #}}} if 0: #{{{ #test readline try: infile = sys.argv[1] fp = open(infile, "r") line = fp.readline() while line: print line line = fp.readline() fp.close() except IndexError: pass #}}} if 0: 
#{{{ #test the speed of GetFirstWord try: nloop = int(sys.argv[1]) string = "kjdafk jasdfj j" #string = "askdf askdf " # string = "kajsdfasdfsdfjakasjdfka" # string = "kajsdfasdf,sdfjakasjdfka" delimiter = " \t\r,.\n" delimiter = " " for i in xrange(nloop): #firstword = myfunc.GetFirstWord(string, delimiter) #firstword = string.split()[0] #firstword = string.partition(" ")[0] firstword = myfunc.GetFirstWord(string) #pass #print firstword except (IndexError, ValueError): pass #}}} if 0: #{{{ # read seq by SeqIO from Bio import SeqIO try: seqfile = sys.argv[1] # 1. SeqIO #################### start = time.time() handle = open(seqfile, "rU") cnt = 0 for record in SeqIO.parse(handle, "fasta"): cnt += 1 handle.close() end = time.time() msg = "Reading %d sequences by SeqIO costs %.3fs seconds" print msg % (cnt, (end - start)) # 2. ReadFasta #################### start = time.time() seqfile = sys.argv[1] (idList, annoList, seqList) = myfunc.ReadFasta(seqfile) end = time.time() msg = "Reading %d sequences by ReadFasta costs %.3fs seconds" print msg % (len(idList), (end - start)) # 3. ReadFasta from buffer BLOCK_SIZE = 100000 start = time.time() cnt = 0 fpin = open(seqfile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) cnt += len(recordList) if isEOFreached == True: break fpin.close() end = time.time() msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds" print msg % (cnt, (end - start)) # 4. 
ReadFastaByBlock #################### start = time.time() seqfile = sys.argv[1] hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0) if hdl.failure: print >> sys.stderr, "Failed to init ReadFastaByBlock" return 1 recordList = hdl.readseq() cnt = 0 while recordList != None: cnt += len(recordList) # for rd in recordList: # print ">%s"%rd.description # print rd.seq recordList = hdl.readseq() hdl.close() end = time.time() msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds" print msg % (cnt, (end - start)) except (IndexError, ValueError): pass #}}} if 0: #{{{ #test RemoveUnnecessaryGap try: infile = sys.argv[1] start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap_old(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq except IndexError: pass #}}} if 0: #{{{ #test ReadMPAByBlock try: infile = sys.argv[1] hdl = myfunc.ReadMPAByBlock(infile) if hdl.failure: return recordList = hdl.readseq() while recordList != None: for rd in recordList: #print rd.seqid print ">%s" % (rd.description) print "%s" % (myfunc.mpa2seq(rd.mpa)) recordList = hdl.readseq() hdl.close() except IndexError: pass #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ #test subprocess import glob #invoke shell explicitly, not very good, may have 
security problems subprocess.call("seq 10", shell=True) subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True) subprocess.call("ls topo*.py", shell=True) if 1: #{{{ #test subprocess import glob #invoke shell implicitly, recommended way subprocess.call(["seq", "10"], shell=False) subprocess.call(["echo", "wait for 1 seconds..."]) subprocess.call(["sleep", "1"]) try: print subprocess.check_call(["ls", "topo*.py"]) #This will not work except subprocess.CalledProcessError, e: print "error message:", e subprocess.call(["ls"] + glob.glob("topo*.py"))