Beispiel #1
0
def main(argv):
    '''Run a genetic-algorithm hyperparameter search over sampled peptides.

    Expected layout of ``argv`` (index 0 is not read):

    * ``argv[1]``  -- ``pop_size`` for the genetic algorithm (int)
    * ``argv[2]``  -- first argument of ``data.sample_seqs`` (int;
      presumably the number of samples -- confirm against that function)
    * ``argv[3]``, ``argv[4]`` -- third and fourth positional arguments of
      ``data.sample_seqs`` (int; their meaning is not visible from here)
    * ``argv[5]``  -- ``split`` (float)
    * ``argv[6]``  -- ``retain`` (float)
    * ``argv[7]``  -- ``random_select`` (float)
    * ``argv[8]``  -- ``mutate`` (float)
    * ``argv[9:]`` -- structure patterns passed to ``data.sample_seqs``

    :param argv: command-line style argument list, laid out as above.
    :raises IndexError: if fewer than nine entries are supplied.
    :raises ValueError: if a numeric entry cannot be converted.
    '''
    # Parse and validate every argument *before* sampling, so a missing or
    # malformed value fails immediately instead of after the costly PDB
    # sampling step has already run.
    if len(argv) < 9:
        raise IndexError('expected at least 9 entries in argv, got %d'
                         % len(argv))

    pop_size = int(argv[1])
    sample_args = [int(argv[2]), argv[9:], int(argv[3]), int(argv[4])]
    split = float(argv[5])
    retain = float(argv[6])
    random_select = float(argv[7])
    mutate = float(argv[8])

    # Get random peptides that match structure patterns from PDB:
    pdb_data, hammings = data.sample_seqs(*sample_args)

    # Parenthesised single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3.
    print(hammings)

    # Hyperparameter search space: each key maps to its candidate values.
    # Dimensions currently excluded from the search are kept for reference:
    #   'aa_props_filter': range(1, (2**holygrail.NUM_AA_PROPS)),
    #   'input_noise': [i / 10.0 for i in range(0, 10)],
    #   'hidden_noise': [i / 10.0 for i in range(0, 10)],
    #   'num_hidden_layers': range(1, 4),
    #   'num_nodes': range(100, 5000, 100),
    #   'validate_every': range(1, 25),
    #   'batch_size': range(10, 50, 10),
    #   'hidden_dropout': [i * 0.1 for i in range(0, 10)],
    #   'input_dropout': [i * 0.1 for i in range(0, 10)]
    args = {
        'activ_func': ['relu', 'prelu', 'lgrelu'],
        'learning_rate': [x / 10000.0 for x in range(1, 100)],
        'momentum': [x / 10.0 for x in range(0, 10)],
        # list(...) keeps this a concrete list on Python 3 as well, matching
        # the other entries.
        'patience': list(range(1, 10)),
        'min_improvement': [i / 1000.0 for i in range(1, 100)],
    }

    classifier = ClassifierGeneticAlgorithm(pop_size=pop_size,
                                            pdb_data=pdb_data,
                                            split=split,
                                            args=args,
                                            retain=retain,
                                            random_select=random_select,
                                            mutate=mutate,
                                            verbose=True)

    classifier.run()
def getData(numSamples, rgx, rgLbl, min_ham=3, trn=2, val=1, tst=1):
    '''Retrieve peptides matching a list of regular expressions and randomly
    split them into training, validation and test sets.

    :param numSamples: number of samples per category.
    :param rgx: list of regular expressions matching the peptide secondary
    structures to retrieve.
    :param rgLbl: replacement labels for the ``Class`` column, one per entry
    of ``rgx`` (same order).
    :param min_ham: minimum hamming distance between any two peptides.
    :param trn,val,tst: training:validation:test ratio; the three values
    must not sum to zero.
    :returns: tuple ``(train, valid, test)``; each element is a pandas
    DataFrame with columns ``seq``, ``struct``, ``Class``, ``PepID``,
    ``ChainID``, ``pStart`` and ``pStop``.
    :raises IndexError: if ``rgLbl`` has fewer entries than ``rgx``.
    '''

    sq, _ = hgd.sample_seqs(numSamples, rgx, min_hamming=min_ham)

    # One frame per class, concatenated in a single call; concatenating
    # inside the loop would re-copy the accumulated rows on every pass.
    frames = []
    for key in sq:
        tmp = pd.DataFrame(sq[key])
        tmp['Class'] = key
        frames.append(tmp)
    dt = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Assumes every sampled record is a 4-tuple
    # (seq, struct, (pep id, chain id), (start, stop)) -- TODO confirm
    # against hgd.sample_seqs.
    dt.columns = ['seq', 'struct', 'PepChainID', 'StartStop', 'Class']

    # Expand the two paired columns into separate scalar columns.
    dt[['PepID', 'ChainID']] = dt['PepChainID'].apply(pd.Series)
    dt[['pStart', 'pStop']] = dt['StartStop'].apply(pd.Series)

    dt.drop(['PepChainID', 'StartStop'], axis=1, inplace=True)

    # Humanise class names in the Class column.  The mapping is applied in
    # one pass so that a label which happens to equal another pattern is not
    # relabelled a second time.  For duplicate patterns the first label wins.
    labels = {}
    for idx, key in enumerate(rgx):
        labels.setdefault(key, rgLbl[idx])
    dt['Class'] = dt['Class'].map(lambda cls: labels.get(cls, cls))

    # Shuffle the rows, then renumber the index from zero.
    dt = dt.iloc[np.random.permutation(len(dt))]
    dt.reset_index(drop=True, inplace=True)

    tot = trn + val + tst
    rows = len(dt)

    # Cut points for the train/validation/test boundaries.
    k1 = int(rows * trn / tot)
    k2 = int(rows * (trn + val) / tot)

    trDat = dt.iloc[0:k1]
    vlDat = dt.iloc[k1:k2]
    tstDat = dt.iloc[k2:]

    return trDat, vlDat, tstDat