def main(argv):
    '''Entry point: sample peptides from the PDB and evolve a classifier.

    :param argv: positional script arguments —
        argv[1]: GA population size,
        argv[2]: number of peptide samples to draw,
        argv[3], argv[4]: integer options forwarded to ``data.sample_seqs``
            (presumably min hamming distance and length — TODO confirm),
        argv[5]: train/validation split fraction,
        argv[6]: retain fraction, argv[7]: random-select rate,
        argv[8]: mutation rate,
        argv[9:]: secondary-structure regex patterns to match.
    '''
    # Get random peptides that match structure patterns from PDB:
    pdb_data, hammings = data.sample_seqs(int(argv[2]), argv[9:],
                                          int(argv[3]), int(argv[4]))
    # Parenthesised form is valid on both Python 2 and Python 3.
    print(hammings)

    # Hyperparameter search space for the genetic algorithm. Commented-out
    # keys are available search dimensions that are currently disabled.
    args = {
        # 'aa_props_filter': range(1, (2**holygrail.NUM_AA_PROPS)),
        # 'input_noise': [i / 10.0 for i in range(0, 10)],
        # 'hidden_noise': [i / 10.0 for i in range(0, 10)],
        # 'num_hidden_layers': range(1, 4),
        # 'num_nodes': range(100, 5000, 100),
        'activ_func': ['relu', 'prelu', 'lgrelu'],
        'learning_rate': [x / 10000.0 for x in range(1, 100)],
        'momentum': [x / 10.0 for x in range(0, 10)],
        'patience': range(1, 10),
        'min_improvement': [i / 1000.0 for i in range(1, 100)],
        # 'validate_every': range(1, 25),
        # 'batch_size': range(10, 50, 10),
        # 'hidden_dropout': [i * 0.1 for i in range(0, 10)],
        # 'input_dropout': [i * 0.1 for i in range(0, 10)]
    }
    classifier = ClassifierGeneticAlgorithm(pop_size=int(argv[1]),
                                            pdb_data=pdb_data,
                                            split=float(argv[5]),
                                            args=args,
                                            retain=float(argv[6]),
                                            random_select=float(argv[7]),
                                            mutate=float(argv[8]),
                                            verbose=True)
    classifier.run()
def getData(numSamples, rgx, rgLbl, min_ham=3, trn=2, val=1, tst=1):
    '''Retrieves peptide strings matching a regular-expression list and
    randomly samples them into training, validation and test sets.

    :param numSamples: number of samples per category.
    :param rgx: regular expressions matching peptide secondary structures
        to retrieve.
    :param rgLbl: regex labels; replacement labels used for column headers.
    :param min_ham: minimum hamming distance between any two peptides.
    :param trn,val,tst: correspond to training:validation:test ratio.
    :returns: tuple, (train, valid, test); each element is a pandas
        DataFrame.
    '''
    sq, _ = hgd.sample_seqs(numSamples, rgx, min_hamming=min_ham)

    # Build one frame per structure class and concatenate once at the end:
    # repeated pd.concat inside the loop is quadratic in the total row
    # count. .items() (vs the Python-2-only iterkeys()) also keeps this
    # working on Python 3.
    frames = []
    for key, records in sq.items():
        tmp = pd.DataFrame(records)
        tmp['Class'] = key
        frames.append(tmp)
    dt = pd.concat(frames, ignore_index=True)

    dt.columns = ['seq', 'struct', 'PepChainID', 'StartStop', 'Class']
    # Expand the (id, chain) and (start, stop) tuple columns into scalars.
    dt[['PepID', 'ChainID']] = dt['PepChainID'].apply(pd.Series)
    dt[['pStart', 'pStop']] = dt['StartStop'].apply(pd.Series)
    dt.drop(['PepChainID', 'StartStop'], axis=1, inplace=True)

    # humanise class names in Class column
    for idx, key in enumerate(rgx):
        dt.loc[dt.Class == key, 'Class'] = rgLbl[idx]

    # Shuffle rows before splitting so the three sets are random samples.
    dt = dt.iloc[np.random.permutation(len(dt))]
    dt.reset_index(drop=True, inplace=True)

    tot = trn + val + tst
    rows, _ = np.shape(dt)
    # int() truncation keeps the split indices valid under Python 3's
    # true division as well as Python 2's floor division.
    k1 = int(rows * trn / tot)
    k2 = int(rows * (trn + val) / tot)
    trDat = dt.iloc[0:k1]
    vlDat = dt.iloc[k1:k2]
    tstDat = dt.iloc[k2:]
    return trDat, vlDat, tstDat