def test_command(self):
    # absolute import (with kinomodel installed)
    # from kinomodel.features import featurize
    from features import featurize  # JG (temporary)

    # make sure the input command line is as expected
    with self.assertRaises(ValueError):
        featurize(chain=0, coord='pdb', feature='conf', pdb='3PP0')
def main():
    names, races = load_data(sys.argv[1])
    X = featurize(names)
    y = races
    clf = deserialize_model("model")
    score = test_model(clf, (X, y))
    print(score)
def fvec(i, j, d, v):
    """Given 4-tuple representations of mentions, return a one-hot
    feature vector for that pair."""
    m_i = make_mention_dict(i, d)
    m_j = make_mention_dict(j, d)
    feat_dict = featurize(m_i, m_j, d)
    fv = v.transform(feat_dict)
    return fv
def test_features(self):
    # absolute import (with kinomodel installed)
    # from kinomodel.features import featurize
    from features import featurize  # JG (temporary)

    # example 1: a kinase with no gap(s) in the binding pocket residues
    (key_res, dihedrals, distances) = featurize(
        chain='A', coord='pdb', feature='conf', pdb='3PP0')
    self.assertEqual(
        key_res, [767, 775, 836, 838, 753, 770, 774, 864, 862, 863, 873, 814])
    self.assertEqual(round(np.asscalar(dihedrals[0][0]), 7),
                     -2.0228872)  # the first dihedral value
    self.assertEqual(round(np.asscalar(distances[0][0]), 7),
                     0.7770488)  # the first distance value

    # example 2: a kinase with gap(s) in the binding pocket residues
    (key_res, dihedrals, distances) = featurize(
        chain='A', coord='pdb', feature='conf', pdb='3RCD')
    self.assertEqual(
        key_res, [767, 775, 836, 838, 753, 770, 774, 864, 862, 863, 873, 814])
    self.assertEqual(round(np.asscalar(dihedrals[0][0]), 7),
                     -2.1615183)  # the first dihedral value
    self.assertEqual(round(np.asscalar(distances[0][0]), 7),
                     1.0546854)  # the first distance value

    # example 3: a kinase with multiple occupancy
    (key_res, dihedrals, distances) = featurize(
        chain='A', coord='pdb', feature='conf', pdb='1M17')
    self.assertEqual(
        key_res, [735, 743, 804, 806, 721, 738, 742, 832, 830, 831, 841, 782])
    self.assertEqual(round(np.asscalar(dihedrals[0][0]), 7),
                     -2.4006705)  # the first dihedral value
    self.assertEqual(round(np.asscalar(distances[0][0]), 7),
                     0.3558538)  # the first distance value
def main():
    names, races = load_data(sys.argv[1])
    X = featurize(names)
    y = races
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.33, random_state=RANDOM_STATE)
    clf = RandomForestClassifier(
        n_estimators=100, min_samples_split=2, random_state=RANDOM_STATE)
    clf.fit(Xtr, ytr)
    serialize_model("model", clf)
    score = test_model(clf, (Xte, yte))
    print(f"Score: {score}")
    top_features = np.argsort(clf.feature_importances_)[::-1][:10]
    print(f"Key feature ids: {top_features}")
def main():
    classifier = Classifier()
    targets = dict()
    data = dict()
    # map genres to integer ids (and back)
    for (i, genre) in enumerate(algorithm['genres']):
        targets[genre] = i
        targets[i] = genre
    # Get all training data
    for genre in algorithm['genres']:
        print("training", genre)
        data[genre] = classifier.db.get_tracks_by_genre(genre)
        for track in data[genre][:algorithm['training_size']]:
            try:
                # reuse a stored MFCC if the track already has one
                if mfcc_key in track:
                    mfcc = track[mfcc_key]
                else:
                    mfcc = compute_mfcc(track, classifier.db)
                feature = featurize(mfcc)
                classifier.add_training_data(feature, targets[genre])
            except Exception as e:
                print(e)
def make_dataset(training_samples):
    """Make a dataset usable by sklearn."""
    # X: array (n_samples, n_features)
    # y: array (n_samples,), 1 if the pair is coreferent
    X = []
    y = []
    for (doc_part, d) in tqdm(training_samples):
        for pair in doc_part:
            i = pair[0]
            j = pair[1]
            label = 1 if pair[2] else 0
            m_i = make_mention_dict(i, d)
            m_j = make_mention_dict(j, d)
            feat_dict = featurize(m_i, m_j, d)
            X.append(feat_dict)
            y.append(label)
    print("Vectorizing feature dicts...")
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(X)
    y = np.array(y)
    return X, y, v
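# Usage sketch (assumed, not from the original code): train a pairwise
# coreference classifier on the vectorized pairs, then reuse the fitted
# DictVectorizer `v` through fvec() to score a new mention pair.
# `training_samples`, `i`, `j`, and `d` are hypothetical placeholders here.
from sklearn.linear_model import LogisticRegression

X, y, v = make_dataset(training_samples)
clf = LogisticRegression(max_iter=1000).fit(X, y)
p_coref = clf.predict_proba(fvec(i, j, d, v))[0, 1]  # P(pair is coreferent)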
def __init__(self, initialValue):
    self.feature = features.featurize(initialValue)
def __init__(self, initialValue, initialFormula, variableIdentifiers=set()):
    self.feature = features.featurize(initialValue)
    self.formula = formulas.formulize(initialFormula, variableIdentifiers)
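# Caveat: `variableIdentifiers=set()` is evaluated once at definition time,
# so every call relying on the default shares a single set. A minimal
# standalone demonstration of the pitfall:
def append_default(item, bucket=set()):
    bucket.add(item)
    return bucket

print(append_default(1))  # {1}
print(append_default(2))  # {1, 2} -- the same set persists across calls
# The usual guard is `bucket=None`, replaced with a fresh set() in the body.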
# populate the mean and std dictionaries
for i in range(8):  # each of the 8 clusters
    for j in range(7):  # convert each of the 7 dihedrals to radians
        dunbrack_mean[i, j] = float(
            data_mean.loc[names[i], feature_names[j]]) / 180 * np.pi
        dunbrack_std[i, j] = float(
            data_std.loc[names[i], feature_names[j]]) / 180 * np.pi
    for j in range(7, 9):  # each of the 2 distances (nm)
        dunbrack_mean[i, j] = data_mean.loc[names[i], feature_names[j]]
        dunbrack_std[i, j] = data_std.loc[names[i], feature_names[j]]
print(dunbrack_mean)
print(dunbrack_std)

# Specify the set of key atoms and calculate key dihedrals and distances
(key_res, dih, dis) = featurize(
    chain=f'{chain}', coord='processed_pdb', feature='conf', pdb=f'{pdbid}')

# add dihedrals and/or distances to bias the sampling
kT_md_units = (unit.MOLAR_GAS_CONSTANT_R * temperature).value_in_unit_system(
    unit.md_unit_system)
torsion_force = dict()  # a dict of torsion forces we retain
bond_force = dict()  # a dict of bond forces we retain
for dihedral_index in range(ndihedrals):
    energy_expression = (
        f'switch*coef*(K/2)*(1-cos(theta-phi0_dih{dihedral_index})); '
        f'K = kT/(dphi_dih{dihedral_index}^2); kT = {kT_md_units}')
    torsion_force[dihedral_index] = mm.CustomTorsionForce(energy_expression)
    torsion_force[dihedral_index].addTorsion(
        int(dih[dihedral_index][0]), int(dih[dihedral_index][1]),
        int(dih[dihedral_index][2]), int(dih[dihedral_index][3]))
    torsion_force[dihedral_index].addGlobalParameter(
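# The snippet above is cut off mid-call. A minimal self-contained sketch of
# the same pattern (hypothetical atom indices and placeholder values, not
# the original parameters): every name referenced in the energy expression
# must be registered with addGlobalParameter() before the force is used.
import openmm as mm  # on older installs: from simtk import openmm as mm

bias = mm.CustomTorsionForce(
    'switch*coef*(K/2)*(1-cos(theta-phi0)); K = kT/(dphi^2); kT = 2.494')
bias.addTorsion(0, 1, 2, 3)             # hypothetical atom indices
bias.addGlobalParameter('phi0', 0.0)    # bias center (radians), placeholder
bias.addGlobalParameter('dphi', 0.5)    # bias width (radians), placeholder
bias.addGlobalParameter('switch', 1.0)  # flag to turn the bias on/off
bias.addGlobalParameter('coef', 1.0)    # overall scaling coefficient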
from features import featurize
from sys import argv

path_to_model = 'model_300dim.pkl'
path_to_input = 's.smi'
path_to_output = 'embeddings_new_compound.csv'

featurize(path_to_input, path_to_output, path_to_model, r=1, uncommon='UNK')
interesting_sents = pickle.load(sents_f)
random.shuffle(interesting_sents)
features2idx, idx2feature, label2idx, idx2label = make_feature_converters(
    interesting_sents, dump_path=args.output_features)

# Convert the textual dataset to one-hot-encoded features
y = []
X = []
for sent in interesting_sents:
    # Skip longer sentences to speed up the computation (could use bucketing)
    if len(sent[0]) > 30:
        continue
    # features => morphological tags of words in the sentence
    X.append(featurize(sent, features2idx))
    # target => morphological tags of the abbreviable word
    y.append(featurize_label(sent[1][1], label2idx))
y, X = np.array(y), pad_sequences(np.array(X))

# Construct the BLSTM model
L2 = 0.0005
input_layer = Input(shape=(None,))
x = Embedding(len(features2idx), 64)(input_layer)
x = Bidirectional(LSTM(128, dropout=0.2, return_sequences=True))(x)
x = Bidirectional(LSTM(128, dropout=0.2, return_sequences=True))(x)
x = Bidirectional(LSTM(128, dropout=0.2))(x)
x = Dense(256, activation='relu', kernel_regularizer=l2(L2))(x)
x = Dropout(0.5)(x)
output_layer = Dense(len(idx2label), activation='softmax',
                     kernel_regularizer=l2(L2))(x)
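# Plausible continuation (assumed, not in the original): assemble, compile
# and train the model. This presumes featurize_label() yields one-hot
# targets; with integer class ids, use sparse_categorical_crossentropy.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X, y, batch_size=256, epochs=10, validation_split=0.1)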
words_with_abbreviations = retrieve_words_to_abbreviate(args.abbreviations)
with open(args.pickled_sentences, 'rb') as f:
    ncp_sentences = pickle.load(f)
features2idx, idx2feature, label2idx, idx2label = load_feature_converters(
    args.features)
eval_model = load_model(args.input_model)

y_eval = []
X_eval = []
errors = []
filt_ncp_sentences = []
for sent in ncp_sentences:
    try:
        # features => morphological tags of words in the sentence
        X_eval.append(featurize(sent, features2idx))
        # target => morphological tags of the abbreviable word
        y_eval.append(featurize_label(sent[1][1], label2idx))
        filt_ncp_sentences.append(sent)
    except KeyError:
        errors.append(sent)
y_eval, X_eval = np.array(y_eval), pad_sequences(np.array(X_eval))

print(eval_model.metrics_names)
print(eval_model.evaluate(X_eval, y_eval, batch_size=1024))
print "Fitting" classifier.fit() print "Starting Classification" start_index = algorithm['training_size'] prediction_data = [] prediction_targets = list() for genre in algorithm['genres']: data[genre] = classifier.db.get_tracks_by_genre(genre) for track in data[genre][start_index:start_index+algorithm['testing_size']]: try: if mfcc_key in track: mfcc = track[mfcc_key] else: mfcc = compute_mfcc(track, classifier.db) feature = featurize(mfcc) prediction_data.append(feature) prediction_targets.append(targets[genre]) except Exception, e: print e predictions = classifier.predict(prediction_data) cm = confusion_matrix(predictions, prediction_targets) print(cm) # Show confusion matrix in a separate window pl.matshow(cm) pl.title('Confusion matrix') pl.colorbar()
from features import featurize
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import numpy as np

print("Featurizing the protein.")
(key_res, dihedrals, distances) = featurize(
    chain='A', coord='dcd', feature='conf', pdb='5UG9')
dih = {}
dis = {}

# set the number of states, dihedrals and distances
nstate = 8
ndih = 10
ndis = 5

# separate each dihedral and distance
for i in range(ndih):
    dih[i] = dihedrals[:, i]
for i in range(ndis):
    dis[i] = distances[:, i]

# baseline values
dunbrack_phi0 = dict()
dunbrack_dphi = dict()
dunbrack_r0 = dict()
dunbrack_dr = dict()
filename = directory + '/' + review_files[i]
i += 1
if i >= len(review_files):
    exit('There are not enough reviews for the parameters '
         'you have selected. Please try again.')
with open(filename) as review_file:
    for line in review_file:
        try:
            review = json.loads(line)
        except json.JSONDecodeError:  # skip malformed lines
            continue
        relevant_classes = classes.intersection(set(review['categories']))
        if len(relevant_classes) < NUM_RELEVANT_CLASSES:
            continue
        review_texts.append(review['text'])
        featurized_reviews.append(featurize(review))
        labels.append(list(relevant_classes))

# Generate train, test data
num_train_reviews = int(PERCENT_TRAIN * len(featurized_reviews))
train_reviews = featurized_reviews[:num_train_reviews]
train_labels = labels[:num_train_reviews]
num_test_reviews = NUM_REVIEWS - num_train_reviews
test_reviews = featurized_reviews[num_train_reviews:]
test_review_texts = review_texts[num_train_reviews:]
test_labels = labels[num_train_reviews:]

# Fit data
v = DictVectorizer(sparse=False)
X_train = v.fit_transform(train_reviews)
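# Natural next step (assumed; mirrors the training transform above):
# vectorize the held-out reviews with the same fitted DictVectorizer.
X_test = v.transform(test_reviews)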
def base_forecast_linear(series, target_date_range=None,
                         forecast_type="point", quantile=None, weights=None):
    """
    Input:
        series: input time series of daily counts
        target_date_range: range of dates to predict; defaults to the start
            of the series through three weeks past its end
        forecast_type: point or quantile forecast
        quantile: quantile level if quantile forecast
        weights: per-sample weights; allows creating rolling forecasts
    Output:
        predictions: forecast from a log-linear model
    """
    series = series.copy()
    base_date = dt.datetime.strptime('01/01/20', '%m/%d/%y')
    end_date = series.index[-1]
    start_date = series.index[0]
    if target_date_range is None:
        # default three-weeks-ahead forecast
        target_end_date = end_date + dt.timedelta(days=21)
        target_date_range = pd.date_range(start_date, target_end_date)
    # if target_end_date < end_date:
    #     print('target_end_date before end of series, not valid forecast')
    #     raise ValueError

    # create prediction dates and a dummy holder
    prediction_dates = target_date_range.copy()
    predictions = pd.Series(0, index=prediction_dates)
    predict_feats = features.featurize(predictions)

    # get input features and scale input and prediction features
    scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    feats = features.featurize(series)
    # standardize all features
    feats_scaled = scaler.fit_transform(feats)
    predict_feats_scaled = scaler.transform(predict_feats)

    # create target: log(1 + x) of the daily counts
    target = series.map(lambda lnx: np.log(1 + lnx), na_action='ignore')

    # fit linear model;
    # drop the intercept because it is included explicitly in the features above
    lm = skllin.LinearRegression(fit_intercept=False)

    # weight the samples if necessary
    weighted_feats_scaled = feats_scaled.copy()
    weighted_target = target.copy()
    if weights is not None:
        weighted_feats_scaled = np.diag(weights**0.5).dot(feats_scaled)
        weighted_target = (weights**0.5) * target
    lm.fit(weighted_feats_scaled, weighted_target)

    # create point predictions
    log_predictions = pd.Series(lm.predict(predict_feats_scaled),
                                index=prediction_dates)

    # compute the quantities needed for quantiles
    noise_var = np.mean(
        (lm.predict(weighted_feats_scaled) - weighted_target)**2)
    fish_mat_inv = weighted_feats_scaled.T.dot(weighted_feats_scaled)
    beta_var, _, _, _ = la.lstsq(fish_mat_inv, predict_feats_scaled.T)
    beta_var = predict_feats_scaled.dot(beta_var)
    predict_var = np.diag(noise_var * beta_var)
    predict_std = np.sqrt(predict_var)

    # adjust for quantiles using the asymptotic normal approximation
    if quantile is None:
        # no quantile given, so use the median (0.5)
        quantile = 0.5
    log_predictions_quantile = (
        log_predictions + predict_std * spstats.norm.ppf(quantile))

    # transform back from log space
    predictions = log_predictions_quantile.map(lambda ilnx: np.exp(ilnx) - 1)
    return predictions
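# Hypothetical usage with a synthetic series (assumes features.featurize
# accepts a pandas Series of daily counts indexed by date):
dates = pd.date_range('2020-03-01', periods=60)
counts = pd.Series(np.exp(0.05 * np.arange(60)).round(), index=dates)
point_forecast = base_forecast_linear(counts)          # median by default
upper_90 = base_forecast_linear(counts, quantile=0.9)  # 90% quantile path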