def onFileTriggered(self, event):
    if str(event.text()) == "Save":
        Pickle.dump(self.view, "current.pkl")
    elif str(event.text()) == "Open":
        self.view.scene.clear()
        self.setMode("free")
        Pickle.load(self.view, "current.pkl")
def makeIterInput(home, batch_size, MAX_LEN=16, buffer_size=buffer_size, for_prediction=False):
    CMPATH = os.path.join(home, CMNAME)
    CM = myPickle.load(CMPATH)
    CMm = max(CM.values()) + 1
    DICT_SIZE = len(CM)

    rpath = os.path.join(home, 'rules.txt')
    rules = readRULE(rpath)
    CLASS_NUM = len(rules)
    print("RULES_NUMBER: ", CLASS_NUM)

    def G(*args):
        # for each chunk
        for xpath, index_path, hits_path in zip(getAll(home, 'X.txt_*'),
                                                getAll(home, 'index_hits*'),
                                                getAll(home, 'hits*')):
            # read passwords chunk
            X = readX_strict(xpath, encoding=ENCODING)
            # parse plaintext passwords
            Xi = prepareXi(X, CM, MAX_LEN, CMm)
            # read index array
            index = read_index(index_path, len(Xi))
            with open(hits_path, 'rb') as f:
                # for each password in the chunk
                for x, nm in zip(Xi, index):
                    # get hitting rules and create a dense (multi-hot) label
                    tot_byte = nm * 4
                    b = f.read(tot_byte)
                    rhits = np.frombuffer(b, dtype=np.uint32)
                    y = np.zeros(CLASS_NUM, dtype=np.int32)
                    y[rhits] = 1
                    # skip over-long passwords only after the read, so the
                    # file offset stays aligned with the index array
                    if len(x) > MAX_LEN:
                        continue
                    yield x, y

    dataset = tf.data.Dataset.from_generator(G, (tf.int32, tf.int32),
                                             ((MAX_LEN, ), (CLASS_NUM, )))
    if not for_prediction:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=buffer_size)
    return dataset, CM, CLASS_NUM
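# --- Hedged usage sketch (not part of the original source): assumes TF 2.x
# eager execution and a data directory prepared with the X.txt_*/index_hits*/
# hits* chunks this pipeline expects; the path below is a placeholder.
dataset, CM, CLASS_NUM = makeIterInput('data_home/', batch_size=128)
for x, y in dataset.take(1):
    print(x.shape)  # (128, 16): batch of index-encoded passwords
    print(y.shape)  # (128, CLASS_NUM): multi-hot rule-hit labels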
def batchExtract(pkldir, bdir, ofname):
    """Compute the RMSD information required for all files."""
    import glob
    flist = glob.glob(pkldir + '*.pdb.pkl')
    TT = len(flist) + 0.0
    if os.path.isfile(ofname) is False:
        fdict = {}
    else:
        fdict = myPickle.load(ofname)
    for cnt, f in enumerate(flist):
        print '% Done =', cnt / TT
        (_, k, _) = getFileParts(getFileParts(f)[1])
        #pdb.set_trace()
        k = k[:-2]
        if k not in fdict:
            print "Processing", f
            try:
                U = myPDB.loader(pkldir + k + '_u.pdb.pkl')
                B = myPDB.loader(pkldir + k + '_b.pdb.pkl')
            except:
                continue
            #pdb.set_trace()  # leftover debugging breakpoint, disabled so the batch run is not interrupted
            #rmsd,Uidx,Bidx=calcRMSD(U,B)
            try:
                rpymol = calcRMSD_pymol(bdir + k + '_u.pdb', bdir + k + '_b.pdb')
            except:
                print "Error processing", k
                cmd.reinitialize()
                time.sleep(0.1)
                continue
            #pdb.set_trace()
            #useq=''.join([three_to_one(U.R[i].get_resname()) for i in Uidx])
            #bseq=''.join([three_to_one(B.R[i].get_resname()) for i in Bidx])
            #a_useq=ProteinAnalysis(U.seq)
            #a_bseq=ProteinAnalysis(B.seq)
            #asa_u=np.sum([U.ASA[i] for i in Uidx])
            #asa_b=np.sum([B.ASA[i] for i in Bidx])
            fdict[k] = rpymol  #+(BN.nanmean(U.B),BN.nanmean(B.B),BN.nanmedian(U.B),BN.nanmedian(B.B),BN.nanmax(U.B),BN.nanmax(B.B))
            #pdb.set_trace()
            myPickle.dump(ofname, fdict)
            print k, rpymol[0]
        else:
            print "Already found", f
    return fdict
def makeIterInput(home, batch_size, MAX_MASKED, INCLUDE_END_SYMBOL, MAX_LEN=32,
                  buffer_size=buffer_size, for_prediction=False):
    XPATH = os.path.join(home, XNAME)
    CMPATH = os.path.join(home, CMNAME)
    CM = myPickle.load(CMPATH)
    vocab_size = max(CM.values()) + 1

    def G(*args):
        # for each password in the file
        with open(XPATH, encoding=ENCODING, errors='ignore') as f:
            for x in f:
                x = x[:-1]  # strip the trailing newline
                xl = len(x)
                #if not INCLUDE_END_SYMBOL: print("NO <END>")
                if xl > MAX_LEN - int(INCLUDE_END_SYMBOL):
                    continue
                xi = string2idx(x, CM, MAX_LEN, vocab_size, INCLUDE_END_SYMBOL)
                xi_in, _, kk = mask(xi.copy(), xl, MAX_MASKED, INCLUDE_END_SYMBOL)
                prediction_mask = np.zeros(MAX_LEN, np.int32)
                for k in kk:
                    prediction_mask[k] = 1
                xi_out = xi
                yield xi_in, prediction_mask, xi_out

    dataset = tf.data.Dataset.from_generator(G, (tf.int32, tf.int32, tf.int32),
                                             ((None, ), (None, ), (None, )))
    if not for_prediction:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.padded_batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=buffer_size)
    return dataset, vocab_size + 1, CM
def __init__(self, mpath, dhome, ls, batch_size=4096, usegpu=-1):
    self.mpath = mpath
    self.dhome = dhome
    self.batch_size = batch_size
    self.ls = ls

    cm_ = os.path.join(dhome, 'char_map.pickle')
    self.cm = myPickle.load(cm_)
    self.cm_ = [x[0] for x in sorted(self.cm.items(), key=lambda x: x[1])]

    tf.logging.set_verbosity(tf.logging.ERROR)

    module = hub.Module(mpath)
    self.module = module

    if True:
        # pure sampling from the latent space
        z = self.samplingLatent()
        o = module(z, signature='latent', as_dict=True)
        self.latent2data = o['prediction_string']
        p = o['prediction']
        self.x_len = p.shape.as_list()[-1]
    else:
        self.x_len = 16
        print("NO LATENT")

    try:
        # proximity sampling, e.g. for SSPG
        self.x4PP = tf.placeholder(tf.int32, shape=(None, self.x_len))
        self.n4PP = tf.placeholder(tf.int32, shape=1)
        self.stddev4PP = tf.placeholder(tf.float32, shape=(1,))
        inputs = {'x': self.x4PP, 'n': self.n4PP, 'stddev': self.stddev4PP}
        self.latent2data4PP = module(inputs, as_dict=True,
                                     signature='sample_from_latent')['prediction_string']
    except:
        ...

    # simple inference
    out = module(self.x4PP, as_dict=True)
    self.infp = out['p']
    self.infx = out['x']
    self.infprediction_string = out['prediction_string']

    ###
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    if usegpu != -1:
        print("USE_GPU:", str(usegpu))
        self.config.gpu_options.visible_device_list = str(usegpu)
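# --- Hedged usage sketch (an assumption, not the confirmed API): drives the
# TF 1.x hub-module graph built above. `Sampler` stands in for the enclosing
# class name, and both paths are placeholders.
sampler = Sampler('module_dir/', 'data_home/', ls=128)
with tf.Session(config=sampler.config) as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    strings = sess.run(sampler.latent2data)  # one batch of sampled strings
    print(strings[:5])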
def makeIter(home, epochs, batch_size, MAX_LEN, buffer_size, chunk_size=2**13, test=False):
    CMPATH = os.path.join(home, 'char_map.pickle')
    char_map = myPickle.load(CMPATH)
    char_num = len(char_map)

    XPATH = os.path.join(home, 'X.h5df')
    if test:
        key = 'test'
    else:
        key = 'train'
    with h5py.File(XPATH, 'r') as f:
        f = f[key]
        N = len(f)

    def G(*args):
        with h5py.File(XPATH, 'r') as f:
            f = f[key]
            bn = math.ceil(N / chunk_size)
            for i in range(bn):
                s = i * chunk_size
                e = (i + 1) * chunk_size
                Xchunk = f[s:e]
                for x in Xchunk:
                    yield x

    def batch():
        dataset = tf.data.Dataset.from_generator(G, tf.int32, (MAX_LEN, ))
        if not test:
            dataset = dataset.repeat(epochs)
            dataset = dataset.shuffle(buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=buffer_size)
        iterator = dataset.make_one_shot_iterator()
        x = iterator.get_next()
        return {'x': x}, x

    return batch, char_num, N
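# --- Hedged usage sketch (not in the original source): `batch` is a TF 1.x
# Estimator-style input_fn returning ({'x': x}, x), i.e. the password batch
# serves as both feature and label (autoencoder-style). `estimator` and the
# path are assumptions for illustration.
train_fn, char_num, N = makeIter('data_home/', epochs=10, batch_size=256,
                                 MAX_LEN=16, buffer_size=10000)
estimator.train(input_fn=train_fn)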
# tail of read_passwords(): keep only passwords within [MIN_LEN, MAX_LEN]
        x = x[:-1]
        if MIN_LEN <= len(x) <= MAX_LEN:
            X.append(x)
    return X


def write_tsv(output, X, P, encoding=ENCODING):
    assert len(X) == len(P)
    n = len(X)
    with open(output, 'w', encoding=encoding) as f:
        for x, p in zip(X, P):
            print("%s\t%f" % (x, p), file=f)


if __name__ == '__main__':
    try:
        model_path = sys.argv[1]
        password_file = sys.argv[2]
        output_path = sys.argv[3]
    except IndexError:
        print("USAGE: model_path.h5 password_path.txt output_path.txt")
        sys.exit(1)

    X = read_passwords(password_file)
    cm = myPickle.load(CHARMAP)
    model = tf.keras.models.load_model(model_path, compile=False)
    S = Inference(model, cm, MAX_LEN, BATCH_SIZE)
    logP = S.applyBatch(X, TERMINAL_SYMBOL)
    write_tsv(output_path, X, logP)
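# --- Hedged example (illustration only, file name is a placeholder):
# write_tsv emits one "password<TAB>log-probability" row per input pair.
write_tsv('scores.tsv', ['password1', 'letmein'], [-12.4, -9.7])
# scores.tsv then contains:
# password1	-12.400000
# letmein	-9.700000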
bdir = '../DBD4N/PDBPKL4/'
exfile = bdir + 'E_125PN_15_35_50.lbl.pkl'
categs = ['All', 'RB', 'Med', 'Hard']
AAcodes = ['Ala', 'Val', 'Leu', 'Ile', 'Cys', 'Met', 'Pro', 'Phe', 'Trp', 'Tyr',
           'Gly', 'Ser', 'Thr', 'Glu', 'Gln', 'Arg', 'Lys', 'His', 'Asn', 'Asp']
AAcodes = [three_to_one(a.upper()) for a in AAcodes]
#f3=['1SBB', '1JPS', '2HMI', '1GHQ', '1KTZ', '1K74', '1D6R', '2SIC', '1GPW', '1XD3', '1EAW', '1VFB', '7CEI', '1E4K', '1I4D', '1H1V', '2PCC', '1FQ1', '2HLE', '1FQJ', '1S1Q', '2OOB', '1UDI', '1KLU', '1WQ1', '1CGI', '1ATN', '1N2C', '1GP2', '1FAK', '1NW9', '1GLA', '1GRN', '2HRK', '1AZS', '1JMO', '1PXV', '1EWY', '1RLB', '1DQJ', '2BTF', '2I25', '1I2M', '1BUH', '1BGX', '1ML0', '1EFN', '1DFJ', '1Y64', '2UUY', '1MAH', '1BVK', '1BVN', '1EER', '1MLC', '1NSN', '1AK4', '1A2K', '1QFW', '2H7V', '1T6B', '1KAC', '1YVB', '1J2J', '1QA9', '1AHW', '2OT3', '2FD6', '2AJF', '1K4C', '1NCA', '1OPH', '1XQS', '1B6C', '1PPE', '2O8V', '1HIA', '1Z0K', '1R0R', '1WEJ', '1ACB', '1KXP', '1KXQ', '1R8S', '1IRA', '1GCQ', '1F51', '2B42', '2HQS', '1AKJ', '2JEL', '1KKL', '1FC2', '1E96', '1N8O', '2MTA', '2VIS', '1IB1', '1E6J', '1Z5Y', '1EZU', '1TMQ', '2C0L', '1E6E', '1IQD', '1ZHI', '1M10', '2NZ8', '1AY7', '1HE8', '1IJK', '1HE1', '1FSK', '1F34', '2SNI', '1BJ1', '2CFH', '1BKD', '1DE4', '1IBR', '1I9R', '1K5D', '1AVX']
#f4=['2A5T', '3CPH', '1ZHH', '2ABZ', '1LFD', '2OUL', '1JIW', '2B4J', '1SYX', '1FLE', '1JTG', '2AYO', '4CPA', '1CLV', '1OC0', '1XU1', '1R6Q', '2O3B', '1US7', '3D5S', '1JZD', '1HCF', '1OYV', '2OZA', '1H9D', '2A9K', '2J0T', '2Z0E', '3BP8', '2IDO', '1WDW', '1ZLI', '2VDB', '1RV6', '1FFW', '1F6M', 'BOYV', '1JWH', '2OOR', '1MQ8', '1GL1', '1PVH', '2I9B', '1OFU', '1GXD', '3SGQ', '1JK9', '1ZM4', '1FCC', '2G77', '2J7P', '2FJU']
dbd4 = parseCSVData('..\Complete Data\DBD4_data.csv')
fig, axes = plt.subplots(nrows=2, ncols=2)
rmsdfname = 'rmsd_atomic.mkl'
rmsd = myPickle.load(rmsdfname)
for subidx, categ in enumerate(categs):
    if categ != 'All':
        clist = [cid for cid in dbd4.keys()
                 if dbd4[cid][1].upper() == categ.upper()]
    else:
        clist = dbd4.keys()
    E = getExamplesDBD.loader(exfile)
    Pcnt = getCountDict()
    Ncnt = getCountDict()
    APcnt = dict(zip(AAcodes, [[0, 0, 0, 0] for _ in AAcodes]))
    ANcnt = dict(zip(AAcodes, [[0, 0, 0, 0] for _ in AAcodes]))
    TAC = []
def getSortedR(cid, bdir, A, N=2000, M=2000):
    ifname = os.path.join(bdir, cid + '.zd3.0.2.cg.out.rmsds')
    R = np.zeros(N)
    with open(ifname, 'r') as f:
        for n in range(N):
            R[n] = f.readline().split()[1]
    sidx = np.argsort(A[cid][0][:M])
    RS = R + 0.0
    RS[:M] = R[:M][sidx]
    return RS, R


bdir = '../zdock_data/decoys_bm4_zd3.0.2_15deg/results/'
dfname = '../ascores_2K2.scr.pkl'
d4 = parseCSVData('..\Complete Data\DBD4_data.csv')
A = mPickle.load(dfname)
RS = []
R = []
dthr = 2.5
N = 2000
ncids = 0
for i, cid in enumerate(A.keys()):
    # if cid not in d4 or d4[cid][1]=='RB':
    #     continue
    rs, r = getSortedR(cid, bdir, A, N=N)
    # if np.any(r[:10]<dthr):
    #     # import pdb
    #     print cid, d4[cid][1]
    #     # pdb.set_trace()
    R.append(np.cumsum(r < dthr) > 0)
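# --- Hedged continuation sketch (an assumption, not the original script's
# confirmed tail): each appended row R[i][k] is True once complex i has a decoy
# with RMSD < dthr within the top k+1, so averaging over complexes yields a
# success-at-rank curve.
R = np.vstack(R)
success_at_k = R.mean(axis=0)
print success_at_k[0], success_at_k[9], success_at_k[99]  # top-1/10/100 rates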
def setup(model_path, home, reg_type, arch_id, latent_size, epochs, batch_size,
          max_len, learning_rate, **conf):
    cm_ = os.path.join(home, CM)
    cm = myPickle.load(cm_)
    chars = np.array([x[0] for x in sorted(cm.items(), key=lambda x: x[1])])

    if reg_type == 0:
        print("MMAE")
        from AE import makeIter
        train, char_num, M = makeIter(home, epochs, batch_size, max_len,
                                      buffer_size=conf['BUFFER_SIZE'])
        test, _, _ = makeIter(home, epochs, batch_size, max_len,
                              buffer_size=conf['BUFFER_SIZE'], test=True)
    elif reg_type == 1:
        print("Single NoisingAE")
        from denoising import makeIterNoise as makeIter
        train, char_num, M = makeIter(home, epochs, batch_size, max_len,
                                      buffer_size=conf['BUFFER_SIZE'])
        test, _, _ = makeIter(home, epochs, batch_size, max_len,
                              buffer_size=conf['BUFFER_SIZE'], test=True)
    elif reg_type == 2:
        print("MaskedAE")
        from denoising import makeIterMask as makeIter
        mask_size = conf['mask_size']
        train, char_num, M = makeIter(home, mask_size, epochs, batch_size, max_len,
                                      buffer_size=conf['BUFFER_SIZE'])
        test, _, _ = makeIter(home, mask_size, epochs, batch_size, max_len,
                              buffer_size=conf['BUFFER_SIZE'], test=True)
    elif reg_type == 3:
        print("NoisingAE")
        from denoising import makeIterMNoise as makeIter
        holes_number = conf['holes_number']
        train, char_num, M = makeIter(home, holes_number, epochs, batch_size, max_len,
                                      buffer_size=conf['BUFFER_SIZE'])
        test, _, _ = makeIter(home, holes_number, epochs, batch_size, max_len,
                              buffer_size=conf['BUFFER_SIZE'], test=True)
    elif reg_type == 4:
        print("Single NoisingAE WITH END CHAR")
        from denoising import makeIterNoise as makeIter
        train, char_num, M = makeIter(home, epochs, batch_size, max_len,
                                      buffer_size=conf['BUFFER_SIZE'],
                                      include_end_char=True)
        test, _, _ = makeIter(home, epochs, batch_size, max_len,
                              buffer_size=conf['BUFFER_SIZE'], test=True,
                              include_end_char=True)
    else:
        sys.exit(1)

    if arch_id == 0:
        enc = architecture.enc0_16
        dec = architecture.dec0_16
    elif arch_id == 1:
        enc = architecture.enc1_16
        dec = architecture.dec1_16
    elif arch_id == 2:
        enc, dec = architecture.ARCH_resnetBNK0
    elif arch_id == 3:
        enc, dec = architecture.ARCH_resnetBNK1
    elif arch_id == 4:
        enc, dec = architecture.ARCH_resnetBNK2
    elif arch_id == 5:
        enc, dec = architecture.ARCH_resnetBNK3
    elif arch_id == 6:
        enc, dec = architecture.ARCH_resnetBNK4
    elif arch_id == 7:
        enc, dec = architecture.ARCH_INAE
    elif arch_id == 8:
        enc, dec = architecture.ARCH_INAE2
    else:
        print('NO SUCH ARCH_ID')
        sys.exit(1)

    N = math.ceil((epochs * M) / batch_size)
    print('Train_iter: ', N)

    hparams = {
        'enc_arch': enc,
        'dec_arch': dec,
        'learning_rate': learning_rate,
        'char_num': char_num,
        'batch_size': batch_size,
        'loss_id': conf.get('loss_id', 0),
        'alpha': conf['alpha'],  # scales the latent-space loss term
        'beta': conf['beta'],    # scales the data-space loss term
        'latent_size': latent_size,
        'chars': chars,
    }

    # build the estimator
    ae = MMAE.MMAE(model_path, hparams)
    run_conf = ae.setupRunConfig(conf['SAVE_SUMMARY_STEPS'],
                                 conf['SAVE_CHECKPOINT_STEP'],
                                 keep_checkpoint_max=1)
    estimator = ae(run_conf)
    train_spec = tf.estimator.TrainSpec(train, max_steps=N)
    eval_spec = tf.estimator.EvalSpec(test, steps=None,
                                      throttle_secs=conf['THROTTLE_SECS'])
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return ae, max_len
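# --- Hedged invocation sketch (all values are placeholders): shows the conf
# keys that setup() reads; mask_size/holes_number are only required for
# reg_type 2/3 respectively.
conf = dict(BUFFER_SIZE=10000, alpha=1.0, beta=1.0, loss_id=0,
            SAVE_SUMMARY_STEPS=100, SAVE_CHECKPOINT_STEP=1000,
            THROTTLE_SECS=600)
ae, max_len = setup('model_dir/', 'data_home/', reg_type=1, arch_id=2,
                    latent_size=128, epochs=10, batch_size=256,
                    max_len=16, learning_rate=1e-3, **conf)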
def load_charmap(path):
    cm = myPickle.load(path)
    cm_ = [x[0] for x in sorted(cm.items(), key=lambda x: x[1])]
    return cm, cm_
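# --- Hedged round-trip example (assumes the charmap values are contiguous
# indices starting at 0, so the value-sorted key list inverts the mapping):
cm, inv = load_charmap('char_map.pickle')
encoded = [cm[c] for c in 'abc']
decoded = ''.join(inv[i] for i in encoded)  # 'abc'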
def loadMIFile(ofile):
    return cPickle.load(ofile)
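# --- Hedged usage note (the file name is a placeholder): cPickle.load expects
# an already-open (binary) file object, not a path string.
with open('mi_scores.pkl', 'rb') as f:
    MI = loadMIFile(f)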
# tail of a cached-results guard: the opening `if` (presumably a check that
# `ofname` does not already exist) lies before this excerpt
    N = 20
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        myid = comm.Get_rank()
        nprocs = comm.Get_size()
    except ImportError:
        print "Failure importing MPI4py: Not using MPI parallelization."
        comm = None
        myid = 0
        nprocs = 1
    A = parallelRun(N, pdbpklpath, pppath, ofname=ofname,
                    comm=comm, myid=myid, nprocs=nprocs)
else:
    A = Pickle.load(ofname)

if A is not None:
    rmsd = getRMSD()
    sels = ['ALL', 'RB', 'MED', 'HARD']
    #sels=['ALL']
    mrks = {'ALL': 'o', 'RB': 'o', 'MED': 's', 'HARD': '^'}
    clrs = {'ALL': 'b', 'RB': 'g', 'MED': 'b', 'HARD': 'k'}
    lbns = {'ALL': '', 'RB': 'Rigid Body', 'MED': 'Medium', 'HARD': 'Hard'}
    for sel in sels:
        if sel != 'ALL':
            incids_sel = [cid for cid in rmsd.keys() if rmsd[cid][2] == sel]
        else:
            incids_sel = incids
        lbn = lbns[sel]
        mrk = mrks[sel]
def main():
    # generate_interval()
    time_tag = myPickle.load('test_user_time_all.pkl')
    print time_tag
    symm = stxt[(stxt.find('<b>') + 3):(stxt.find('</b>'))]
    return stch, symm


def kMer(stch):
    if len(stch):
        return int(stch[stch.find('-mer') - 2:][0:2])
    else:
        return 0


if __name__ == "__main__":
    dbd4file = '..\..\Complete Data\DBD4_data.csv'
    dbd4 = parseCSVData(dbd4file)
    dbd4s = myPickle.load('../../Complete Data/DBD4_data_stocihiometry.mkl')
    from getExamplesDBD_breakup import *
    E = getExamplesDBD.loader('../../DBD4CSPKL/PKL/ENS_15_35_50.lbl.pkl')
    dbd4es = {}
    for cid in E.Pex:
        scid = cid[0][:4]
        dbd4es[cid] = dbd4s[scid] + (E.Pex[cid][1], E.Pex[cid][2],
                                     E.Pex[cid][1] * E.Pex[cid][2],
                                     len(E.Pex[cid][0]), kMer(dbd4s[scid][6]))
    with open('DBD4_broken.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for cid in dbd4es:
            spamwriter.writerow([cid] + list(dbd4es[cid]))
    # bdir='../../Complete Data/CPDB/'
    # from BISEPutils import fetchPDB
    # for c in dbd4:
import random
import myPickle

DEBUG = False
video_info = myPickle.load('../video.pkl')


def transverse(train):
    '''
    {'1': {'class1': 5, 'class2': 4}}
    '''
    item_user = {}
    for user, item in train:
        print user
        print item


def process_rate(fi, fo):
    '''
    Some users rate the same class several times, so we compute the average score:
        User1, tv, 4
        User1, tv, 2   ---->   User1, tv, 3
    '''
    # fo = open('train_rate.csv', 'w')
    new_dict = {}
    times = {}
    for line in fi:
        user, class_name, rate = line.split(',')
        user = int(user)
        rate = float(rate.strip())
        new_dict.setdefault(user, {})
def triangle_csc(pts):
    rows, cols = pts.shape
    A = np.bmat([[2 * np.dot(pts, pts.T), np.ones((rows, 1))],
                 [np.ones((1, rows)), np.zeros((1, 1))]])
    b = np.hstack((np.sum(pts * pts, axis=1), np.ones((1))))
    x = np.linalg.solve(A, b)
    bary_coords = x[:-1]
    return np.sum(pts * np.tile(bary_coords.reshape((pts.shape[0], 1)),
                                (1, pts.shape[1])), axis=0)


categs = ['All', 'Hard']
for pidx, categ in enumerate(categs):
    fname = 'Data_out/propAsabrx_nogly_' + categ + '.prp.mkl'
    (Pcnt, Ncnt, APcnt, ANcnt, TAC) = myPickle.load(fname)
    TAC = np.array(TAC)
    TAC = TAC[~np.any(TAC > 180, axis=1), :]
    if categ == 'All':
        Nc = 60
        niter = 2000
        res0, _ = kmeans2(np.vstack((TAC[:, :2], TAC[:, 2:])), Nc,
                          iter=niter, minit='points')
        res = np.zeros((Nc**2, 4))
        k = 0
        for i in range(Nc):
            for j in range(Nc):
                res[k, :] = np.hstack((res0[i, :], res0[j, :]))
                k = k + 1
    idx = vq(TAC, res)[0]
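# --- Hedged sanity check (illustration, not part of the original script):
# triangle_csc solves the standard barycentric linear system for a simplex
# circumcenter. For a right triangle the circumcenter is the midpoint of the
# hypotenuse, so this prints approximately [0.5 0.5].
print triangle_csc(np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]]))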
def makeIterInput(home, batch_size, MAX_MASKED, INCLUDE_END_SYMBOL, MAX_LEN=32,
                  buffer_size=buffer_size, for_prediction=False):
    XPATH = os.path.join(home, XNAME)  # dataset path
    CMPATH = os.path.join(home, CMNAME)
    CM = myPickle.load(CMPATH)  # load the charmap
    vocab_size = max(CM.values()) + 1  # vocabulary size

    def G(*args):
        """Generator over the password file."""
        # for each password
        with open(XPATH, encoding=ENCODING, errors='ignore') as f:
            for x in f:
                x = x[:-1]  # strip the trailing newline
                x_len = len(x)
                # if not INCLUDE_END_SYMBOL: print("NO <END>")
                if x_len > MAX_LEN - int(INCLUDE_END_SYMBOL):
                    # password too long: one position must stay reserved
                    # for the end symbol when INCLUDE_END_SYMBOL is set
                    continue
                # the password's index sequence under the charmap, padded to MAX_LEN
                x_index = string2idx(x, CM, MAX_LEN, vocab_size, INCLUDE_END_SYMBOL)
                # .copy() yields a fresh array (a deep copy at the first level),
                # so mask() cannot mutate x_index;
                # mask() randomly covers some positions and returns the masked
                # sequence together with the masked indices
                x_index_in, _, masked_index = mask(x_index.copy(), x_len,
                                                   MAX_MASKED, INCLUDE_END_SYMBOL)
                # binary vector marking the masked positions, e.g. [0, 0, 0, 1, 0, 0]
                prediction_mask = np.zeros(MAX_LEN, np.int32)
                for k in masked_index:
                    prediction_mask[k] = 1
                xi_out = x_index
                yield x_index_in, prediction_mask, xi_out

    # build the dataset pipeline from the generator (fed to training as a
    # stream): output types and shapes are given explicitly; None lets each
    # shape be inferred, and the tuple structure mirrors the generator's yield
    dataset = tf.data.Dataset.from_generator(G, (tf.int32, tf.int32, tf.int32),
                                             ((None, ), (None, ), (None, )))
    if not for_prediction:
        # shuffle the dataset with a buffer of buffer_size elements
        dataset = dataset.shuffle(buffer_size)
    # padded_shapes = (
    #     tf.TensorShape([None]),
    #     tf.TensorShape([None]),
    #     tf.TensorShape([None])
    # )
    # dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes, drop_remainder=True)
    # merge consecutive elements (possibly of different shapes) into single
    # batched elements: each tensor gains an extra outer dimension and is
    # padded to the corresponding shape. (The annotator notes the line below
    # is the original source, is buggy, and should be replaced by the
    # commented-out padded_shapes variant above.)
    dataset = dataset.padded_batch(batch_size, drop_remainder=True)
    # prefetch decouples the time data is produced from the time it is
    # consumed, fetching elements before they are requested
    dataset = dataset.prefetch(buffer_size=buffer_size)
    return dataset, vocab_size + 1, CM
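# --- Hedged consumption sketch (assumes TF 2.x eager execution and configured
# XNAME/CMNAME/ENCODING module constants; the path is a placeholder):
dataset, vocab_plus_mask, CM = makeIterInput('data_home/', batch_size=64,
                                             MAX_MASKED=4, INCLUDE_END_SYMBOL=True)
for xi_in, pred_mask, xi_out in dataset.take(1):
    # xi_in: masked index sequences; pred_mask: 1 at masked positions;
    # xi_out: the original sequences the model must reconstruct.
    print(xi_in.shape, pred_mask.shape, xi_out.shape)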
# bf = ddir + cid + '_b.pdb'
# uf = ddir + cid + '_u.pdb'
# print "RMSD =", calcRMSD(uf, bf)
ofname = 'rmsd_atomic.mkl'
loadDirect = True
ddir = '../DBD4N/DBD4/'
pkldir = '../DBD4N/PDBPKL4/'
if not loadDirect or not os.path.isfile(ofname):
    import __main__
    __main__.pymol_argv = ['pymol', '-qc']  # PyMOL: quiet and no GUI
    import pymol
    pymol.finish_launching()
    from pymol import cmd
    fdict = batchExtract(pkldir, ddir, ofname)
else:
    fdict = myPickle.load(ofname)

# cid_l/r: rmsd, (rmsd_pymol, unbound_asa_pymol, bound_asa_pymol),
# unbound_mol_weight, bound_mol_weight, unbound_asa, bound_asa
Va = []
for v in fdict.values():
    Va.append(v[:-2])
V = np.array(Va)  # rmsd, unbound_asa, bound_asa, unbound_mol_weight, bound_mol_weight
#x=V[:,2]/(4.84*(V[:,4]**0.76)); y=V[:,1];
#x=x[nidx];y=y[nidx];
x = V[:, 1] / (4.84 * (V[:, 3]**0.76))
y = V[:, 0]
#x=V[:,3]/(0.346*V[:,5]+2.5e+03)
#x=V[:,5]/(4.84*(V[:,2]**0.76)); y=V[:,1];
nidx = (y > 1e-3)
# tail of a per-complex loop inside a cached-results guard: the enclosing
# `if`, `for`, and `try` open before this excerpt
            lD = getDistMat(getCoords(L.R))
            rD = getDistMat(getCoords(R.R))
            lM = np.max(lD)
            rM = np.max(rD)
            lD = lD / lM
            rD = rD / rM
            D = []
            for k0, (l0, r0) in enumerate(pex):
                for l1, r1 in pex[k0 + 1:]:
                    d = np.max((lD[l0, l1], rD[r0, r1]))
                    D.append(d)
            C = C + np.histogram(D, bins)[0]
        except Exception as e:
            print "Error", e
            continue
    mPickle.dump(ofname, (bins, C))
else:
    (bins, C) = mPickle.load(ofname)

bb = (bins[1:] + bins[:-1]) / 2
idx = bb <= 1
bb = bb[idx]
C = C[idx]
plt.plot(bb, C, 'b', linewidth=2)
plt.grid()
plt.xlabel('Normalized pairwise distance (d)')
plt.ylabel('Number of pairs of simultaneously interacting residue pairs', color='b')
ax1 = plt.gca()
ax2 = ax1.twinx()
ax2.plot(bb, np.cumsum(C) / np.sum(C), 'r.-', linewidth=2)
ax2.set_ylabel('Cumulative proportion of pairs of simultaneously interacting residue pairs',
               color='r')
plt.show()
    passwords: the list of passwords
    log_probability: the list of unnormalized (log) probabilities
    encoding: the output file encoding

    Returns:
    """
    assert len(passwords) == len(log_probability)
    n = len(passwords)
    with open(output, 'w', encoding=encoding) as f:
        for x, p in zip(passwords, log_probability):
            print("%s\t%f" % (x, p), file=f)


if __name__ == '__main__':
    try:
        model_path = sys.argv[1]
        password_file = sys.argv[2]
        output_path = sys.argv[3]
    except IndexError:
        print("USAGE: model_path.h5 password_path.txt output_path.txt")
        sys.exit(1)

    passwords = read_passwords(password_file)
    charmap = myPickle.load(CHARMAP)
    model = tf.keras.models.load_model(model_path, compile=False)
    infer = Inference(model, charmap, MAX_LEN, BATCH_SIZE)
    logP = infer.applyBatch(passwords, TERMINAL_SYMBOL)  # compute the log-probabilities
    write_tsv(output_path, passwords, logP)