def _read_csv_file(self, delimiter="\t"):
    print("Reading CSV file")
    begin_time = time.time()
    # for Micky's big file
    # wikitree_sf = SFrame.read_csv(self._input_directory_path + self._target_file_name, delimiter="\t")
    wikitree_sf = SFrame.read_csv(
        self._input_directory_path + self._target_file_name,
        delimiter=delimiter)
    end_time = time.time()
    run_time = end_time - begin_time
    print(run_time)
    return wikitree_sf
from sframe import SFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from string import punctuation

products = SFrame('review.csv')

def remove_punctuation(text):
    # str.translate needs a deletion argument here; passing `punctuation`
    # alone would be treated as a (too short) translation table.
    return text.translate(None, punctuation)

products['review_clean'] = products['text'].apply(remove_punctuation)
# Drop neutral 3-star reviews, then label >3 stars as positive (+1) and the
# rest as negative (-1).
products = products[products['stars'] != 3]
products['sentiment'] = products['stars'].apply(lambda r: +1 if r > 3 else -1)

train_data, test_data = products.random_split(.8, seed=1)

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print(test_matrix[0])

model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])

sample_test_matrix = vectorizer.transform(['ammazing wow wow'])
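# Hedged follow-up (not in the original): scoring the sample review with the
# fitted classifier. predict/predict_proba are standard scikit-learn estimator
# API; predict_proba columns are ordered by model.classes_.
print(model.predict(sample_test_matrix))
print(model.predict_proba(sample_test_matrix))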
class SRNN(object):
    def __init__(self,
                 batch_size=BATCH_SIZE,
                 seq_len=SUB_SEQ_LEN,
                 slow_fs=SLOW_FS,
                 slow_dim=SLOW_DIM,
                 dim=DIM,
                 mid_fs=MID_FS,
                 q_levels=Q_LEVELS,
                 mlp_activation='relu'):
        self.weight_norm = True
        self.stateful = True
        self.slow_fs = slow_fs
        self.mid_fs = mid_fs
        self.q_levels = q_levels
        self.dim = dim
        self.slow_dim = slow_dim
        self.batch_size = batch_size

        slow_seq_len = max(1, seq_len // slow_fs)
        mid_seq_len = max(1, seq_len // mid_fs)
        prev_sample_seq_len = seq_len + 1

        ################################################################################
        ################## Model to train
        ################################################################################
        self.slow_tier_model_input = Input(
            batch_shape=(batch_size, slow_seq_len * slow_fs, 1))
        self.slow_tier_model = Lambda(
            lambda x: scale_samples_for_rnn(x, q_levels=q_levels),
            name='slow_scale')(self.slow_tier_model_input)
        self.slow_tier_model = Reshape(
            (slow_seq_len, self.slow_fs),
            name='slow_reshape4rnn')(self.slow_tier_model)

        # Learnable initial hidden states, tiled across the batch.
        self.slow_rnn_h = K.variable(
            np.zeros((1, self.slow_dim)), dtype=K.floatx(), name='slow_h0')
        self.slow_rnn_h0 = K.tile(self.slow_rnn_h, (batch_size, 1))
        self.mid_rnn_h = K.variable(
            np.zeros((1, self.dim)), dtype=K.floatx(), name='mid_h0')
        self.mid_rnn_h0 = K.tile(self.mid_rnn_h, (batch_size, 1))
        self.state_selector = K.zeros(
            (), dtype=K.floatx(), name='slow_state_mask')

        self.slow_rnn = GruWithWeightNorm(
            slow_dim,
            use_bias=True,
            name='slow_rnn',
            recurrent_activation='sigmoid',
            return_sequences=True,
            stateful=self.stateful,
            state_selector=self.state_selector,
            weight_norm=self.weight_norm)
        self.slow_rnn._trainable_weights.append(self.slow_rnn_h)
        self.slow_tier_model = self.slow_rnn(
            self.slow_tier_model, initial_state=self.slow_rnn_h0)

        # Upscale slow RNN output to the mid-tier ticking frequency.
        self.slow_tier_model = TimeDistributed(
            DenseWithWeightNorm(
                dim * slow_fs // mid_fs,
                weight_norm=self.weight_norm),
            name='slow_project2mid')(self.slow_tier_model)
        self.slow_tier_model = Reshape(
            (mid_seq_len, dim),
            name='slow_reshape4mid')(self.slow_tier_model)

        self.mid_tier_model_input = Input(
            batch_shape=(batch_size, mid_seq_len * mid_fs, 1))
        self.mid_tier_model = Lambda(
            lambda x: scale_samples_for_rnn(x, q_levels=q_levels),
            name='mid_scale')(self.mid_tier_model_input)
        self.mid_tier_model = Reshape(
            (mid_seq_len, self.mid_fs),
            name='mid_reshape2rnn')(self.mid_tier_model)
        mid_proj = DenseWithWeightNorm(
            dim, name='mid_project2rnn', weight_norm=self.weight_norm)
        self.mid_tier_model = TimeDistributed(
            mid_proj, name='mid_project2rnn')(self.mid_tier_model)
        self.mid_tier_model = layers.add(
            [self.mid_tier_model, self.slow_tier_model])
        self.mid_rnn = GruWithWeightNorm(
            dim,
            name='mid_rnn',
            return_sequences=True,
            recurrent_activation='sigmoid',
            stateful=self.stateful,
            state_selector=self.state_selector)
        self.mid_rnn._trainable_weights.append(self.mid_rnn_h)
        self.mid_tier_model = self.mid_rnn(
            self.mid_tier_model, initial_state=self.mid_rnn_h0)
        self.mid_adapter = DenseWithWeightNorm(
            dim * mid_fs, name='mid_project2top', weight_norm=self.weight_norm)
        self.mid_tier_model = TimeDistributed(
            self.mid_adapter, name='mid_project2top')(self.mid_tier_model)
        self.mid_tier_model = Reshape(
            (mid_seq_len * mid_fs, dim),
            name='mid_reshape4top')(self.mid_tier_model)

        self.embed_size = 256
        self.sframe = SFrame()
        self.top_tier_model_input = self.sframe.build_sframe_model(
            (batch_size, prev_sample_seq_len, 1),
            frame_size=self.mid_fs,
            q_levels=self.q_levels,
            embed_size=self.embed_size)
        self.top_adapter = DenseWithWeightNorm(
            dim,
            use_bias=False,
            name='top_project2mlp',
            kernel_initializer='lecun_uniform',
            weight_norm=self.weight_norm)
        self.top_tier_model = TimeDistributed(
            self.top_adapter,
            name='top_project2mlp')(self.top_tier_model_input.output)
        self.top_tier_model_input_from_mid_tier = Input(
            batch_shape=(batch_size, 1, dim))
        self.top_tier_model_input_predictor = Input(
            batch_shape=(batch_size, mid_fs, 1))
        self.top_tier_model = layers.add(
            [self.mid_tier_model, self.top_tier_model])
        self.top_tier_mlp_l1 = DenseWithWeightNorm(
            dim,
            activation=mlp_activation,
            name='mlp_1',
            weight_norm=self.weight_norm)
        self.top_tier_mlp_l2 = DenseWithWeightNorm(
            dim,
            activation=mlp_activation,
            name='mlp_2',
            weight_norm=self.weight_norm)
        self.top_tier_mlp_l3 = DenseWithWeightNorm(
            q_levels,
            kernel_initializer='lecun_uniform',
            name='mlp_3',
            weight_norm=self.weight_norm)
        self.top_tier_model = TimeDistributed(
            self.top_tier_mlp_l1, name='mlp_1')(self.top_tier_model)
        self.top_tier_model = TimeDistributed(
            self.top_tier_mlp_l2, name='mlp_2')(self.top_tier_model)
        self.top_tier_model = TimeDistributed(
            self.top_tier_mlp_l3, name='mlp_3')(self.top_tier_model)

        self.mid_tier_model_input_from_slow_tier = Input(
            batch_shape=(batch_size, 1, dim))
        self.mid_tier_model_input_predictor = Input(
            batch_shape=(batch_size, mid_fs, 1))

        self.srnn = Model([
            self.slow_tier_model_input, self.mid_tier_model_input,
            self.top_tier_model_input.input
        ], self.top_tier_model)

        ################################################################################
        ################## Model to sample from (predictor)
        ################################################################################

        ################################################################################
        ################## Slow tier predictor
        ################################################################################
        self.slow_tier_model_predictor = Model(
            inputs=self.slow_tier_model_input, outputs=self.slow_tier_model)

        ################################################################################
        ################## Mid tier predictor
        ################################################################################
        self.mid_tier_model_predictor = Lambda(
            lambda x: scale_samples_for_rnn(x, q_levels=q_levels))(
                self.mid_tier_model_input_predictor)
        self.mid_tier_model_predictor = Reshape(
            (1, self.mid_fs))(self.mid_tier_model_predictor)
        self.mid_tier_model_predictor = TimeDistributed(mid_proj)(
            self.mid_tier_model_predictor)
        self.mid_tier_model_predictor = layers.add([
            self.mid_tier_model_predictor,
            self.mid_tier_model_input_from_slow_tier
        ])
        """
        Creating a new layer instead of sharing it with the model to train,
        due to https://github.com/keras-team/keras/issues/6939
        Sharing stateful layers causes crosstalk between the models.
        """
        self.predictor_mid_rnn = GruWithWeightNorm(
            self.dim,
            name='mid_rnn_pred',
            return_sequences=True,
            recurrent_activation='sigmoid',
            stateful=self.stateful,
            state_selector=self.state_selector)
        self.predictor_mid_rnn._trainable_weights.append(self.mid_rnn_h)
        self.mid_tier_model_predictor = self.predictor_mid_rnn(
            self.mid_tier_model_predictor, initial_state=self.mid_rnn_h0)
        self.predictor_mid_rnn.set_weights(self.mid_rnn.get_weights())
        self.mid_tier_model_predictor = TimeDistributed(self.mid_adapter)(
            self.mid_tier_model_predictor)
        self.mid_tier_model_predictor = Reshape(
            (mid_fs, dim))(self.mid_tier_model_predictor)
        self.mid_tier_model_predictor = Model([
            self.mid_tier_model_input_predictor,
            self.mid_tier_model_input_from_slow_tier
        ], self.mid_tier_model_predictor)

        ################################################################################
        ################## Top tier predictor
        ################################################################################
        self.top_predictor_embedding = self.sframe.get_embedding()
        self.top_tier_model_predictor = self.top_predictor_embedding(
            self.top_tier_model_input_predictor)
        self.top_tier_model_predictor = Reshape(
            (1, mid_fs * self.embed_size))(self.top_tier_model_predictor)
        self.top_tier_model_predictor = TimeDistributed(self.top_adapter)(
            self.top_tier_model_predictor)
        self.top_tier_model_predictor = layers.add([
            self.top_tier_model_predictor,
            self.top_tier_model_input_from_mid_tier
        ])
        self.top_tier_model_predictor = TimeDistributed(self.top_tier_mlp_l1)(
            self.top_tier_model_predictor)
        self.top_tier_model_predictor = TimeDistributed(self.top_tier_mlp_l2)(
            self.top_tier_model_predictor)
        self.top_tier_model_predictor = TimeDistributed(self.top_tier_mlp_l3)(
            self.top_tier_model_predictor)
        self.top_tier_model_predictor = Model([
            self.top_tier_model_input_predictor,
            self.top_tier_model_input_from_mid_tier
        ], self.top_tier_model_predictor)

        def categorical_crossentropy(target, output):
            # Numerically stable log-softmax cross-entropy, scaled by log2(e)
            # so the loss is reported in bits per sample.
            new_target_shape = [
                K.shape(output)[i] for i in xrange(K.ndim(output) - 1)
            ]
            output = K.reshape(output, (-1, self.q_levels))
            xdev = output - K.max(output, axis=1, keepdims=True)
            lsm = xdev - K.log(K.sum(K.exp(xdev), axis=1, keepdims=True))
            cost = -K.sum(lsm * K.reshape(target, (-1, self.q_levels)), axis=1)
            log2e = K.variable(np.float32(np.log2(np.e)))
            return K.reshape(cost, new_target_shape) * log2e

        self.srnn.compile(
            loss=categorical_crossentropy,
            optimizer=keras.optimizers.Adam(clipvalue=1.),
            sample_weight_mode='temporal')

    def set_h0_selector(self, use_learned_h0):
        if use_learned_h0:
            self.srnn.reset_states()
            self.slow_rnn.reset_states()
            self.mid_rnn.reset_states()
            self.slow_tier_model_predictor.reset_states()
            self.mid_tier_model_predictor.reset_states()
            K.set_value(self.state_selector, np.ones(()))
        else:
            K.set_value(self.state_selector, np.zeros(()))

    def save_weights(self, file_name):
        self.srnn.save_weights(file_name)

    def load_weights(self, file_name):
        self.srnn.load_weights(file_name)
        self.predictor_mid_rnn.set_weights(self.mid_rnn.get_weights())

    def numpy_one_hot(self, labels_dense, n_classes):
        """Convert class labels from scalars to one-hot vectors."""
        labels_shape = labels_dense.shape[:-1]
        labels_dtype = labels_dense.dtype
        labels_dense = labels_dense.ravel().astype("int32")
        n_labels = labels_dense.shape[0]
        index_offset = np.arange(n_labels) * n_classes
        labels_one_hot = np.zeros((n_labels, n_classes))
        labels_one_hot[np.arange(n_labels).astype("int32"),
                       labels_dense.ravel()] = 1
        labels_one_hot = labels_one_hot.reshape(labels_shape + (n_classes, ))
        return labels_one_hot.astype(labels_dtype)

    def _prep_batch(self, x, mask):
        # Slice one batch of samples into the per-tier inputs and the
        # one-hot prediction target.
        x_slow = x[:, :-self.slow_fs]
        x_mid = x[:, self.slow_fs - self.mid_fs:-self.mid_fs]
        x_prev = x[:, self.slow_fs - self.mid_fs:-1]
        target = x[:, self.slow_fs:]
        target = self.numpy_one_hot(target, self.q_levels)
        if mask is None:
            mask = np.ones((x.shape[0], x.shape[1]))
        target_mask = mask[:, self.slow_fs:]
        return x_slow, x_mid, x_prev, target, target_mask

    def train_on_batch(self, x, mask=None):
        x_slow, x_mid, x_prev, target, target_mask = self._prep_batch(x, mask)
        return self.model().train_on_batch(
            [x_slow, x_mid, x_prev], target, sample_weight=target_mask)

    def predict_on_batch(self, x, mask=None):
        x_slow, x_mid, x_prev, target, target_mask = self._prep_batch(x, mask)
        return self.model().predict_on_batch([x_slow, x_mid, x_prev])

    def test_on_batch(self, x, mask=None):
        x_slow, x_mid, x_prev, target, target_mask = self._prep_batch(x, mask)
        return self.model().test_on_batch(
            [x_slow, x_mid, x_prev], target, sample_weight=target_mask)

    def model(self):
        return self.srnn

    def numpy_sample_softmax2d(self, coeff, random_state, debug=False):
        if coeff.ndim > 2:
            raise ValueError("Unsupported dim")
        if debug:
            idx = coeff.argmax(axis=1)
        else:
            # Renormalize to avoid numpy errors about probabilities not
            # summing to 1.
            coeff = coeff / (coeff.sum(axis=1, keepdims=True) + 1E-6)
            idxs = [
                np.argmax(random_state.multinomial(1, pvals=coeff[i]))
                for i in range(len(coeff))
            ]
            idx = np.array(idxs)
        return idx.astype(K.floatx())

    def numpy_sample_softmax(self, logits, random_state, debug=False):
        old_shape = logits.shape
        flattened_logits = logits.reshape((-1, logits.shape[logits.ndim - 1]))
        new_shape = list(old_shape)
        new_shape[-1] = 1
        samples = self.numpy_sample_softmax2d(flattened_logits, random_state,
                                              debug).reshape(new_shape)
        return samples

    def numpy_softmax(self, X, temperature=1.):
        # Works for both 2D and 3D inputs.
        dim = X.ndim
        X = X / temperature
        e_X = np.exp((X - X.max(axis=dim - 1, keepdims=True)))
        out = e_X / e_X.sum(axis=dim - 1, keepdims=True)
        return out

    def sample(self, ts, random_state, debug):
        samples = np.zeros((1, ts, 1), dtype='int32')
        # Prime the first slow_fs steps with the mid-scale quantization level.
        Q_ZERO = self.q_levels // 2
        samples[:, :self.slow_fs] = Q_ZERO
        big_frame_level_outputs = None
        frame_level_outputs = None
        self.set_h0_selector(False)
        for t in xrange(self.slow_fs, ts):
            if t % self.slow_fs == 0:
                big_frame_level_outputs = self.slow_tier_model_predictor. \
                    predict_on_batch([samples[:, t - self.slow_fs:t, :]])
            if t % self.mid_fs == 0:
                frame_level_outputs = self.mid_tier_model_predictor. \
                    predict_on_batch([
                        samples[:, t - self.mid_fs:t],
                        big_frame_level_outputs[:, (t // self.mid_fs) %
                                                (self.slow_fs // self.mid_fs)][:, np.newaxis, :]
                    ])
            sample_prob = self.top_tier_model_predictor. \
                predict_on_batch([
                    samples[:, t - self.mid_fs:t],
                    frame_level_outputs[:, t % self.mid_fs][:, np.newaxis, :]
                ])
            sample_prob = self.numpy_softmax(sample_prob)
            samples[:, t] = self.numpy_sample_softmax(
                sample_prob, random_state, debug=debug > 0)
        return samples[0].astype('float32')
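# A standalone NumPy sketch (not part of the original) of what the custom
# categorical_crossentropy above computes: a numerically stable log-softmax
# cross-entropy, multiplied by log2(e) so the loss reads in bits per sample.
# The logits/target values below are illustrative only.
import numpy as np

logits = np.array([[2.0, 0.5, -1.0]])
target = np.array([[1.0, 0.0, 0.0]])  # one-hot row, as numpy_one_hot produces
xdev = logits - logits.max(axis=1, keepdims=True)
lsm = xdev - np.log(np.exp(xdev).sum(axis=1, keepdims=True))
nats = -(lsm * target).sum(axis=1)
print(nats * np.log2(np.e))  # ~0.35 bits for this example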
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.dir', symlink_path)
profile.set_preference(
    'browser.helperApps.neverAsk.saveToDisk',
    'text/html,application/pdf,application/msword,'
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document,'
    'application/vnd.ms-excel,'
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
profile.set_preference('pdfjs.disabled', True)
browser = webdriver.Firefox(profile)

# load classifier
lr_clf = joblib.load('./model/pfpj_classifier.pkl')

totalprocessos = 0
totalerros = 0
seeds = SFrame.read_csv('seedSP.csv', verbose=False,
                        column_type_hints=[str, str, int])
del seeds['Seed']

if hasattr(args, 'a') and args.a:
    # Run one search per line of the input file, accumulating the totals.
    with open(args.a, 'r') as fh:
        for busca in fh.readlines():
            numprocessos, numerro = buscaprocesso(busca)
            totalprocessos += numprocessos
            totalerros += numerro
else:
    buscas = args.q
    totalprocessos, totalerros = buscaprocesso(buscas)
    totalbuscas = 1

print("Parsing finished")
print('Total errors / processes: %d / %d' % (totalerros, totalprocessos))
def sigmoid_derivative(z):  # name assumed; the definition line is cut off in this excerpt
    """
        Compute the derivative of the sigmoid function
        ** input : **
            *z : pre-activation function
        ** return (Float value) **
    """
    return sigmoid(z) * (1 - sigmoid(z))

def sigmoid(z):
    """
        Compute the sigmoid function
        ** input : **
            *z : pre-activation function
        ** return (Float value) from 0 to 1 **
    """
    return 1 / (1 + np.exp(-z))

if __name__ == '__main__':
    dataset = SFrame.read_csv("adult.csv")

    CATEGORY_KEYS = ["workclass", "education", "marital-status", "occupation",
                     "relationship", "race", "gender", "native-country"]
    CONTINU_KEYS = ["capital-gain", "fnlwgt", "hours-per-week", "age",
                    "capital-loss", "educational-num"]

    # Process categorical columns
    dataset = columns_to_category(dataset, CATEGORY_KEYS)
    # Process continuous columns
    dataset = columns_to_normalize(dataset, CONTINU_KEYS)
    # Convert the output from string to binary
    dataset["income"] = dataset["income"].apply(
        lambda x: 1. if x == ">50K" else 0.)

    keys = CATEGORY_KEYS + CONTINU_KEYS + ["income"]
    features = []
    # Create the features matrix
    for line in dataset:
""" Shuffle the two lists keeping the order ** input : ** *features : numpy array of features *targets : numpy vector of targets ** return (numpy array of features, numpy vector of targets) ** """ c = list(zip(features.tolist(), targets.tolist())) random.shuffle(c) features[:], targets[:] = zip(*c) return np.array(features), np.array(targets) if __name__ == '__main__': # Load both csv with sframe train_data = SFrame.read_csv("train.csv") test_data = SFrame.read_csv("test.csv") test_data["Survived"] = -1 # We add a new columns for each csv to be abel to differentiate them later train_data["type"] = "train" test_data["type"] = "test" # We now can merge the two csv together data = train_data.append(test_data) # We extract features and targets from the csv train_features, train_targets, test_features = process_csv(data) # We initialize all variables. The weight is a one dimensional vector (one weight per feature) weights = np.random.randn(train_features.shape[1]) # The bias
)
# run at the start of every ipython notebook to use plotly.offline
# this injects the plotly.js source files into the notebook
# --------------------------------------------------
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
# --------------------------------------------------

# ---
# # Read data into SFrames

# In[4]:

usersSF = SFrame.read_csv("%s/users.dat" % DATADIR,
                          delimiter='::',
                          header=False,
                          verbose=False,
                          column_type_hints=[int, str, int, int, str])
usersSF = usersSF.rename({
    'X1': 'UserID',
    'X2': 'Gender',
    'X3': 'Age',
    'X4': 'Occupation',
    'X5': 'ZipCode',
})
usersDescSF = dict(zip(usersSF.column_names(), usersSF.column_types()))
print(usersDescSF)

# In[5]:

ratingsSF = SFrame.read_csv("%s/ratings.dat" % DATADIR,