Example #1
 def test_command(self):
     # absolute import (with kinomodel installed)
     #from kinomodel.features import featurize
     #JG (temporary)
     from features import featurize
     # make sure the input command line is expected
     with self.assertRaises(ValueError):
         featurize(chain=0, coord='pdb', feature='conf', pdb='3PP0')
Example #2
def main():
    names, races = load_data(sys.argv[1])
    X = featurize(names)
    y = races

    clf = deserialize_model("model")
    score = test_model(clf, (X, y))
    print(score)
Example #3
def fvec(i, j, d, v):
    """Given 4-tuple representations of mentions, return onehot feat vec
    for that pair."""
    m_i = make_mention_dict(i, d)
    m_j = make_mention_dict(j, d)
    feat_dict = featurize(m_i, m_j, d)
    fv = v.transform(feat_dict)
    return fv
Example #4
    def test_features(self):
        # absolute import (with kinomodel installed)
        #from kinomodel.features import featurize

        #JG (temporary)
        from features import featurize

        # example 1: a kinase with no gap(s) in the binding pocket residues
        (key_res, dihedrals, distances) = featurize(chain='A', coord='pdb', feature='conf', pdb='3PP0')
        self.assertEqual(key_res,
            [767, 775, 836, 838, 753, 770, 774, 864, 862, 863, 873, 814])
        self.assertEqual(
            round(
                np.asscalar(dihedrals[0][0]), 7),
            -2.0228872)  # the first dihedral value
        self.assertEqual(
            round(
                np.asscalar(distances[0][0]), 7),
            0.7770488)  # the first distance value

        # example 2: a kinase with gap(s) in the binding pocket residues
        (key_res, dihedrals, distances) = featurize(chain='A', coord='pdb', feature='conf', pdb='3RCD')
        self.assertEqual(key_res,
            [767, 775, 836, 838, 753, 770, 774, 864, 862, 863, 873, 814])
        self.assertEqual(
            round(
                np.asscalar(dihedrals[0][0]), 7),
            -2.1615183)  # the first dihedral value
        self.assertEqual(
            round(
                np.asscalar(distances[0][0]), 7),
            1.0546854)  # the first distance value

        # example 3: a kinase with multiple occupancy
        (key_res, dihedrals, distances) = featurize(chain='A', coord='pdb', feature='conf', pdb='1M17')
        self.assertEqual(key_res,
            [735, 743, 804, 806, 721, 738, 742, 832, 830, 831, 841, 782])
        self.assertEqual(
            round(
                np.asscalar(dihedrals[0][0]), 7),
            -2.4006705)  # the first dihedral value
        self.assertEqual(
            round(
                np.asscalar(distances[0][0]), 7),
            0.3558538)  # the first distance value
Example #5
def main():
    names, races = load_data(sys.argv[1])
    X = featurize(names)
    y = races

    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.33, random_state=RANDOM_STATE
    )
    clf = RandomForestClassifier(
        n_estimators=100, min_samples_split=2, random_state=RANDOM_STATE
    )

    clf.fit(Xtr, ytr)
    serialize_model("model", clf)

    score = test_model(clf, (Xte, yte))
    print(f"Score: {score}")
    top_features = np.argsort(clf.feature_importances_)[::-1][:10]
    print(f"Key feature ids: {top_features}")
Example #6
def main():
    classifier = Classifier()
    targets = dict()
    data = dict()
    for (i, genre) in enumerate(algorithm['genres']):
        targets[genre] = i
        targets[i] = genre
    # Get all training data
    for genre in algorithm['genres']:
        print "training", genre
        data[genre] = classifier.db.get_tracks_by_genre(genre)
        for track in data[genre][:algorithm['training_size']]:
            try:
                if mfcc_key in track:
                    mfcc = track[mfcc_key]
                else:
                    mfcc = compute_mfcc(track, classifier.db)
                feature = featurize(mfcc)
                classifier.add_training_data(feature, targets[genre])
            except Exception as e:
                print(e)
Example #7
def make_dataset(training_samples):
    """Make dataset usable by sklearn."""
    # X: array (n_samples, n_features)
    # y: array (n_samples,), 1 if coreferent
    X = []
    y = []
    for (doc_part, d) in tqdm(training_samples):
        for pair in doc_part:
            i = pair[0]
            j = pair[1]
            label = 1 if pair[2] else 0
            m_i = make_mention_dict(i, d)
            m_j = make_mention_dict(j, d)
            feat_dict = featurize(m_i, m_j, d)
            X.append(feat_dict)
            y.append(label)
    print "Vectorizing feature dicts..."
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(X)
    y = np.array(y)
    return X, y, v
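For context, a minimal usage sketch (not from the original repository) of how the returned X, y, and fitted vectorizer v might feed a pairwise coreference classifier. It assumes training_samples is already available, reuses fvec from Example #3, and the choice of LogisticRegression is purely illustrative:

from sklearn.linear_model import LogisticRegression

# Build the pairwise training matrix and keep the fitted DictVectorizer
# so that fvec() can reuse it at prediction time.
X, y, v = make_dataset(training_samples)

# Any binary classifier would do; logistic regression is just an example.
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

# Score a new mention pair (i, j) from document d with the same vectorizer.
pair_score = clf.predict_proba(fvec(i, j, d, v))[0, 1]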
Example #8
 def __init__( self, initialValue ):
     self.feature = features.featurize( initialValue )
Example #9
 def __init__( self, initialValue, initialFormula, variableIdentifiers=set() ):
     self.feature = features.featurize( initialValue )
     self.formula = formulas.formulize( initialFormula, variableIdentifiers )
Example #10
# populate the mean and std dictionaries
for i in range(8):  # each of the 8 clusters
    for j in range(7):  # convert each of the 7 dihedrals to radians
        dunbrack_mean[i, j] = float(
            data_mean.loc[names[i], feature_names[j]]) / 180 * np.pi
        dunbrack_std[i, j] = float(
            data_std.loc[names[i], feature_names[j]]) / 180 * np.pi
    for j in range(7, 9):  # each of the 2 distances (nm)
        dunbrack_mean[i, j] = data_mean.loc[names[i], feature_names[j]]
        dunbrack_std[i, j] = data_std.loc[names[i], feature_names[j]]
print(dunbrack_mean)
print(dunbrack_std)
# Specify the set of key atoms and calculate key dihedrals and distances
(key_res, dih, dis) = featurize(chain=f'{chain}',
                                coord='processed_pdb',
                                feature='conf',
                                pdb=f'{pdbid}')

# add dihedrals and/or distances to bias the sampling
kT_md_units = (unit.MOLAR_GAS_CONSTANT_R * temperature).value_in_unit_system(
    unit.md_unit_system)
torsion_force = dict()  # a dict of torsion forces we retain
bond_force = dict()  # a dict of bond forces we retain
for dihedral_index in range(ndihedrals):
    energy_expression = f'switch*coef*(K/2)*(1-cos(theta-phi0_dih{dihedral_index})); K = kT/(dphi_dih{dihedral_index}^2); kT = {kT_md_units}'
    torsion_force[dihedral_index] = mm.CustomTorsionForce(energy_expression)
    torsion_force[dihedral_index].addTorsion(int(dih[dihedral_index][0]),
                                             int(dih[dihedral_index][1]),
                                             int(dih[dihedral_index][2]),
                                             int(dih[dihedral_index][3]))
    torsion_force[dihedral_index].addGlobalParameter(
Example #11
from features import featurize
from sys import argv

path_to_model = 'model_300dim.pkl'
path_to_input = 's.smi'
path_to_output = 'embeddings_new_compound.csv'

featurize(path_to_input, path_to_output, path_to_model, r=1, uncommon='UNK')
Example #12
    interesting_sents = pickle.load(sents_f)
random.shuffle(interesting_sents)

features2idx, idx2feature, label2idx, idx2label = make_feature_converters(
    interesting_sents,
    dump_path=args.output_features
)

# Convert the textual dataset to one-hot-encoded features
y = []
X = []
for sent in interesting_sents:
    # Skip longer sentences to speed up the computation (could use bucketing)
    if len(sent[0]) > 30:
        continue
    X.append(featurize(sent, features2idx))  # features => morphological tags of words in the sentence
    y.append(featurize_label(sent[1][1], label2idx))  # target => morphological tags of abbreviable word
y, X = np.array(y), pad_sequences(np.array(X))

# Construct the BLSTM model
L2 = 0.0005

input_layer = Input(shape=(None,))
x = Embedding(len(features2idx), 64)(input_layer)
x = Bidirectional(LSTM(128, dropout=0.2, return_sequences=True))(x)
x = Bidirectional(LSTM(128, dropout=0.2, return_sequences=True))(x)
x = Bidirectional(LSTM(128, dropout=0.2))(x)
x = Dense(256, activation='relu', kernel_regularizer=l2(L2))(x)
x = Dropout(0.5)(x)
output_layer = Dense(len(idx2label), activation='softmax', kernel_regularizer=l2(L2))(x)
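The excerpt stops after defining the output layer. A minimal continuation sketch (not part of the original script) that assembles and trains the network, assuming one-hot targets and illustrative optimizer, batch-size, and epoch choices:

from keras.models import Model

# Assemble the BLSTM classifier defined above and train it; the optimizer,
# batch size and epoch count are assumptions, not values from the source.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.1)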
Example #13
words_with_abbreviations = retrieve_words_to_abbreviate(args.abbreviations)

with open(args.pickled_sentences, 'rb') as f:
    ncp_sentences = pickle.load(f)

features2idx, idx2feature, label2idx, idx2label = load_feature_converters(
    args.features)

eval_model = load_model(args.input_model)

y_eval = []
X_eval = []
errors = []
filt_ncp_sentences = []
for sent in ncp_sentences:
    try:
        X_eval.append(
            featurize(sent, features2idx)
        )  # features => morphological tags of words in the sentence
        y_eval.append(featurize_label(
            sent[1][1],
            label2idx))  # target => morphological tags of abbreviable word
        filt_ncp_sentences.append(sent)
    except KeyError:
        errors.append(sent)

y_eval, X_eval = np.array(y_eval), pad_sequences(np.array(X_eval))

print(eval_model.metrics_names)
print(eval_model.evaluate(X_eval, y_eval, batch_size=1024))
    print "Fitting"
    classifier.fit()
    print "Starting Classification"

    start_index = algorithm['training_size']
    prediction_data = []
    prediction_targets = list()
    for genre in algorithm['genres']:
        data[genre] = classifier.db.get_tracks_by_genre(genre)
        for track in data[genre][start_index:start_index+algorithm['testing_size']]:
            try:
                if mfcc_key in track:
                    mfcc = track[mfcc_key]
                else:
                    mfcc = compute_mfcc(track, classifier.db)
                feature = featurize(mfcc)
                prediction_data.append(feature)
                prediction_targets.append(targets[genre])
            except Exception as e:
                print(e)

    predictions = classifier.predict(prediction_data)

    cm = confusion_matrix(predictions, prediction_targets)

    print(cm)

    # Show confusion matrix in a separate window
    pl.matshow(cm)
    pl.title('Confusion matrix')
    pl.colorbar()
Example #15
from features import featurize
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import numpy as np

print("Featurizing the protein.")
(key_res, dihedrals, distances) = featurize(chain='A',
                                            coord='dcd',
                                            feature='conf',
                                            pdb='5UG9')

dih = {}
dis = {}

# set the number of states, dihedrals and distances
nstate = 8
ndih = 10
ndis = 5

# separate each dihedral and distance
for i in range(ndih):
    dih[i] = dihedrals[:, i]

for i in range(ndis):
    dis[i] = distances[:, i]
#baseline values
dunbrack_phi0 = dict()
dunbrack_dphi = dict()
dunbrack_r0 = dict()
dunbrack_dr = dict()
Example #16
    filename = directory + '/' + review_files[i]
    i += 1
    if i >= len(review_files):
        exit('There are not enough reviews for the parameters '
             'you have selected. Please try again.')
    with open(filename) as review_file:
        for line in review_file:
            try:
                review = json.loads(line)
            except:
                continue
            relevant_classes = classes.intersection(set(review['categories']))
            if len(relevant_classes) < NUM_RELEVANT_CLASSES:
                continue
            review_texts.append(review['text'])
            featurized_reviews.append(featurize(review))
            labels.append(list(relevant_classes))

# Generate train, test data
num_train_reviews = int(PERCENT_TRAIN * len(featurized_reviews))
train_reviews = featurized_reviews[:num_train_reviews]
train_labels = labels[:num_train_reviews]

num_test_reviews = NUM_REVIEWS - num_train_reviews
test_reviews = featurized_reviews[num_train_reviews:]
test_review_texts = review_texts[num_train_reviews:]
test_labels = labels[num_train_reviews:]

# Fit data
v = DictVectorizer(sparse=False)
X_train = v.fit_transform(train_reviews)
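The excerpt ends after vectorizing the training reviews; one plausible continuation (an assumption, not shown in the source) would binarize the multi-label targets and fit a one-vs-rest classifier:

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Reuse the vectorizer fitted on the training split for the test split.
X_test = v.transform(test_reviews)

# Each review carries a list of relevant classes, so binarize the labels
# and train one binary classifier per class.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_labels)
y_test = mlb.transform(test_labels)

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # subset accuracy on the held-out reviews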
Example #17
def base_forecast_linear(series,
                         target_date_range=None,
                         forecast_type="point",
                         quantile=None,
                         weights=None):
    """
    Input:
        series: input time series of daily counts
        target_date_range: dates to predict; defaults to the series start through three weeks past its end
        forecast_type: point or quantile forecast
        quantile: quantile level if a quantile forecast is requested
        weights: sample weights, which allow building rolling forecasts

    Output:
        predictions: forecast from a log-linear model
    """
    series = series.copy()
    
    base_date = dt.datetime.strptime('01/01/20', '%m/%d/%y')
    end_date = series.index[-1]
    start_date = series.index[0]
    if target_date_range is None: # default three weeks ahead forecast
        target_end_date = end_date + dt.timedelta(days=21)
        target_date_range = pd.date_range(start_date, target_end_date) 
    
#     if target_end_date < end_date:
#         print('target_end_date before end of series, not valid forecast')
#         raise ValueError
    
    
    # create prediction dates and dummy holder
    prediction_dates = target_date_range.copy()
    predictions = pd.Series(0, index=prediction_dates)
    predict_feats = features.featurize(predictions)
    
    
    # get input features and scale input and prediction features
    scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    feats = features.featurize(series)
    # standardize all features 
    feats_scaled = scaler.fit_transform(feats)
    predict_feats_scaled = scaler.transform(predict_feats)
    
    # create target
    target = series.map( lambda lnx : np.log(1+lnx), na_action='ignore')

    
    # fit linear model  
    # remove intercept because its included explicitly in features above
    lm = skllin.LinearRegression(fit_intercept=False) 
    # weight the samples if necessary
    weighted_feats_scaled = feats_scaled.copy()
    weighted_target = target.copy()
    if weights is not None:
        weighted_feats_scaled = np.diag(weights**0.5).dot(feats_scaled)
        weighted_target = (weights**0.5) * target
    
    lm.fit( weighted_feats_scaled, weighted_target )
    
    # create point predictions
    log_predictions = pd.Series( lm.predict(predict_feats_scaled), index=prediction_dates )
    
    # compute stuff needed for quantiles
    noise_var = np.mean(  (lm.predict(weighted_feats_scaled) - weighted_target)**2)
    fish_mat_inv = weighted_feats_scaled.T.dot(weighted_feats_scaled) 
    beta_var, _, _,_ = la.lstsq( fish_mat_inv, predict_feats_scaled.T   )
    beta_var = predict_feats_scaled.dot(beta_var)
    predict_var  = np.diag(noise_var*beta_var)
    predict_std = np.sqrt(predict_var)
    
    # adjust for quantiles
    # using asymptotics now
    if quantile is None:  # default to the median (0.5)
        quantile = 0.5
    log_predictions_quantile = log_predictions + predict_std*spstats.norm.ppf(quantile)

    # transform back
    predictions = log_predictions_quantile.map(lambda ilnx: np.exp(ilnx) - 1)
    
    return predictions
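A minimal usage sketch (illustrative only; it assumes the accompanying features module is importable and that the input is a daily-count pandas Series indexed by date):

import numpy as np
import pandas as pd

# Hypothetical daily counts indexed by calendar date.
dates = pd.date_range('2020-03-01', periods=60)
series = pd.Series(np.random.poisson(50, size=60), index=dates)

# Point forecast over the default horizon: from the series start
# through three weeks past its last date.
point_forecast = base_forecast_linear(series)

# Upper 90% quantile forecast over the same horizon.
upper_forecast = base_forecast_linear(series, quantile=0.9)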