def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Fits one KMeans model per candidate k on the row-normalized data,
    records the inertia of each fit and plots inertia against k.

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore

    Returns:
        scores: list of inertia scores for each k value
    '''
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)

    scores = []
    # Iterate over the caller-supplied candidates; the bare ``np.arange``
    # function object used previously is not iterable.
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++',
                    max_iter=100, n_init=1, verbose=2)
        km.fit(norm_data)
        # KMeans.score is the negated objective, so flip the sign to
        # obtain a positive inertia value.
        scores.append(-1 * km.score(norm_data))

    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
class TfIdf(Feature):
    """Feature built from TF-IDF -> truncated SVD -> row normalisation ->
    univariate selection of the 5 best components."""

    def __init__(self):
        # All stages are created and fitted in train().
        self.vect = None
        self.truncated = None
        self.normalizer = None
        self.kbest = None

    def train(self, reviews, labels):
        """Fit every stage of the pipeline on ``reviews`` / ``labels``.

        Each review is an iterable of token iterables; it is flattened and
        joined with spaces before vectorisation.
        """
        self.vect = TfidfVectorizer(analyzer='word',
                                    ngram_range=(1, 2),
                                    stop_words='english')
        documents = [' '.join(chain.from_iterable(review))
                     for review in reviews]
        dense_tfidf = self.vect.fit_transform(documents).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(dense_tfidf, labels)
        reduced = self.truncated.transform(dense_tfidf)

        self.normalizer = Normalizer()
        self.normalizer.fit(reduced)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(reduced), labels)

    def score(self, data):
        """Return the selected feature values for one review as a tuple."""
        document = ' '.join(chain.from_iterable(data))
        dense_tfidf = self.vect.transform([document]).toarray()
        reduced = self.truncated.transform(dense_tfidf)
        selected = self.kbest.transform(self.normalizer.transform(reduced))
        return tuple(selected[0, :])
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and normalizing the
    results. This process is also known as LSA.

    arguments:
    data -- Dataset, if use_tf_idf is True the object must contain a tf_idf
            table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD
                    a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
                  preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD (sum of the
         explained variance ratios, times 100).
    X -- np.array, an array with the data reduced to n_components and
         then passed through Normalizer.
    """
    frame = data.tf_idf if use_tf_idf else data.df
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    d = frame.values

    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)

    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100

    # Normalize the reduced data, not the raw input matrix: the caller
    # expects an array with n_components columns.
    norm = Normalizer()
    return e, norm.fit_transform(X)
def __init__(self,
             YTrain_file,
             XTrain_file,
             XTest_file,
             output_path,
             normalise,
             C,
             class_weight,
             ):
    """
    Load the training/test data and store the model settings.

    Arguments:
        YTrain_file: path of the joblib dump holding the training targets.
        XTrain_file: path of the joblib dump holding the training samples;
            every sample is flattened to one row.
        XTest_file: path of the joblib dump holding the test samples;
            every sample is flattened to one row.
        output_path: stored unchanged on the instance.
        normalise: when truthy, both sample matrices are passed through
            Normalizer.
        C: stored unchanged on the instance.
        class_weight: stored on the instance; the string 'none' is
            converted to None.
    """
    self.YTrain = joblib.load(YTrain_file)

    XTrain = joblib.load(XTrain_file)
    self.XTrain = XTrain.reshape(np.size(XTrain, axis=0), -1)

    XTest = joblib.load(XTest_file)
    self.XTest = XTest.reshape(np.size(XTest, axis=0), -1)

    self.output_path = output_path

    if normalise:
        normalizer = Normalizer(copy=False)
        # Keep the returned arrays: copy=False only normalizes in place
        # when no conversion of the input is needed, so relying on the
        # side effect alone can silently leave the data unnormalized.
        self.XTrain = normalizer.transform(self.XTrain)
        self.XTest = normalizer.transform(self.XTest)

    self.C = C

    if class_weight == 'none':
        class_weight = None
    self.class_weight = class_weight
def getPcaFeatures(self, images, components, image_size):
    """Normalize the image dataset and project it with PCA.

    Returns the fitted PCA, the Normalizer that was used and the
    projected dataset, in that order.
    """
    raw_dataset = self.getImagesAsDataset(images, image_size)

    norm = Normalizer()
    normalized = norm.fit_transform(raw_dataset)

    pca = PCA(n_components=components)
    projected = pca.fit_transform(normalized)

    return pca, norm, projected
def kfold(agetext, k, model, nfeatures, check=False, k2=None, max_df=0.9, min_df=3):
    """Run ``k`` shuffled train/test evaluations of ``model`` on tf-idf features.

    Each iteration reshuffles ``agetext``, vectorizes the "text" column
    with tf-idf, trains on the first 8000 documents and tests on documents
    8000-9499. The per-iteration accuracies (rounded to 2 decimals) and
    their mean are printed; nothing is returned.

    Args:
        agetext: table with a "text" column (token sequences) and an
            "agegroup" column (labels).
        k: number of shuffle/evaluate iterations.
        model: estimator exposing ``fit`` and ``predict``.
        nfeatures: ``max_features`` for the TfidfVectorizer.
        check: when true, reduce the tf-idf matrix with TruncatedSVD and
            normalize the rows before fitting the model.
        k2: number of SVD components, used only when ``check`` is true.
        max_df: ``max_df`` for the TfidfVectorizer.
        min_df: ``min_df`` for the TfidfVectorizer.
    """
    out = []
    for i in range(k):
        # Single-argument print calls behave the same on Python 2 and 3.
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        X = agetext["text"].tolist()
        label = agetext["agegroup"].tolist()

        vec = TfidfVectorizer(tokenizer=tokenize,
                              token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',
                              lowercase=False,
                              max_features=nfeatures,
                              max_df=max_df,
                              min_df=min_df,
                              use_idf=True,
                              ngram_range=(1, 2))

        # Join the tokens of every document and strip tabs/newlines.
        docs2 = [" ".join(doc).replace("\t", "").replace("\n", "")
                 for doc in X]

        # Train on [0, 8000) and test on [8000, 9500); the previous
        # ``[:7999]`` slice silently dropped document 7999 from both sets.
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]

        if check:
            lsa = TruncatedSVD(k2, algorithm='arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)

        model.fit(X, tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred), 2))

    print(str(out))
    print(np.mean(out))
def kfold(agetext, k, model, k2):
    """Run ``k`` shuffled 85/15 evaluations of ``model`` on SVD-reduced data.

    Each iteration reshuffles ``agetext``, splits it with
    ``train_test_split`` (seeded with ``i * 6``), reduces the features with
    TruncatedSVD, normalizes the rows, fits ``model`` and records the
    accuracy rounded to 5 decimals. Class counts of the training labels,
    test labels and predictions are printed, followed by the list of
    accuracies and their mean. Nothing is returned.

    Args:
        agetext: table whose "agegroup" column holds the labels and whose
            columns from index 1 onwards are used as features.
        k: number of shuffle/evaluate iterations.
        model: estimator exposing ``fit`` and ``predict``.
        k2: number of TruncatedSVD components.
    """
    import collections

    out = []
    for i in range(k):
        # Single-argument print calls behave the same on Python 2 and 3.
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        # NOTE(review): every column from index 1 is used as a feature;
        # confirm that "agegroup" is column 0, otherwise the label leaks
        # into the features.
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)

        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values

        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)

        model.fit(X, y_train)
        pred = model.predict(X_test)

        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred), 5))

    print(str(out))
    print(np.mean(out))
def normalize_test():
    """Print the result of normalizing one sample vector with Normalizer."""
    from sklearn.preprocessing import Normalizer

    # Normalizer expects a 2-D (n_samples, n_features) input, so the
    # vector is wrapped as a single-row sample.
    X = [[1, 2, 3, 4, 5, 2, 6, 8]]
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)
    # Call-form print works on both Python 2 and Python 3.
    print(X2)
def _normalize(self, X, y, X_t):
    """Fit a Normalizer on ``X`` (with ``y``) and apply it to ``X`` and ``X_t``.

    Returns the transformed ``X`` and the transformed ``X_t``.
    """
    from sklearn.preprocessing import Normalizer

    scaler = Normalizer()
    X_normalized = scaler.fit_transform(X, y)
    X_t_normalized = scaler.transform(X_t)
    return X_normalized, X_t_normalized
def readAndPreProcess():
    """Read 'responses.csv' and return (normalized features, Empathy labels).

    Steps, each announced on stdout: plot the per-column missing-value
    counts, drop rows whose 'Empathy' value is NaN or infinite, drop
    object-typed columns, fill the remaining NaNs with the column means,
    split off the 'Empathy' column as the label vector, then apply
    MinMaxScaler, StandardScaler and Normalizer in sequence.
    """
    print("\n\n********** CS-412 HW5 Mini Project **********")
    print("************ Submitted by Sankul ************\n\n")
    print("Reading data, please ensure that the dataset is in same folder.")
    resp = pd.read_csv('responses.csv')
    print("Data reading complete!")

    print("Some stats reagarding data:")
    resp.describe()  # result is neither printed nor stored

    print("\nStarting pre-processing.....")
    print("\nFinding missing values:")
    print("Missing values found, removing them")
    missing_per_column = resp.isnull().sum().sort_values(ascending=False)
    missing_per_column.plot(kind='barh', figsize = (20,35))
    plt.show()
    print("Empty values removed")

    print("\nChecking for NaN and infinite values in target column (Empathy):")
    finite_mask = np.isfinite(resp['Empathy'])
    bad_count = len(resp['Empathy']) - len(resp[finite_mask])
    if bad_count:
        print("Number of infinite or NaN values in Empathy column: ", bad_count)
        print("Removing them")
        resp = resp[finite_mask]
        print("Infinite and NaN values removed")

    print("\nChecking for categorical features:")
    if pd.Categorical(resp).dtype.name == 'category':
        print("Categorical features found. Removing them...")
        resp = resp.select_dtypes(exclude=[object])
        print("Categorical features removed")

    print("\nReplacing NaN values with the mean value:")
    resp = resp.fillna(resp.mean())
    resp.isnull().sum()  # result is neither printed nor stored
    print("Values replaced")

    print("\nSeperating labels from data:")
    Y = resp['Empathy'].values
    X = resp.drop('Empathy', axis=1)
    print("Labels seperated")

    print("\nScaling, standardizing and normalizing the data:")
    min_max = MinMaxScaler(feature_range=(0, 1))
    rescaledX = min_max.fit_transform(X)
    standardizer = StandardScaler().fit(rescaledX)
    standardizedX = standardizer.transform(rescaledX)
    row_normalizer = Normalizer().fit(standardizedX)
    normalizedX = row_normalizer.transform(standardizedX)
    print("Scaling, standardizing and normalizing completed")

    print("\nFinal data looks like:")
    print(normalizedX.shape)
    print("Values inside look like:")
    print(normalizedX[0])
    return normalizedX, Y
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    '''
    Performs k-means clustering on svd transformed data and plots it

    The top terms of (at most) the first 10 clusters are printed and drawn
    as labelled colorbars on a 2x5 grid of subplots.

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means
        n_words: number of top terms shown per cluster

    Returns:
        km: the fitted KMean object
    '''
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)

    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5,
                verbose=2)
    km.fit(norm_data)

    # Map the centroids back to term space and rank the terms per cluster.
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]

    terms = tfidf.get_feature_names()
    terms = prettify(terms)
    terms = np.array(terms)

    # Every tick label uses the darkest shade of the palette; this does
    # not depend on the cluster, so compute it once.
    palette = sns.color_palette('Purples', 9).as_hex()
    colors = np.repeat(palette[-1], n_words)

    fig = plt.figure(figsize=(10, 8))
    # The grid holds 10 clusters; never index past the centroids that
    # exist (k may be smaller than 10).
    n_shown = min(10, len(order_centroids))
    for i in range(n_shown):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print(' {:s}'.format(terms[ind]))
        print('\n')

        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i+1)
        ax.set_title('Cluster {:d}'.format(i+1))

        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)

        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # Evenly spaced ticks, dropping both end points.
        cb.set_ticks(np.linspace(mn, mx, n_words+2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])

    plt.tight_layout()
    return km
def __init__(self, img_dir):
    """Store the image directory and create empty, unfitted components.

    Two parallel sets are kept: one without prefix and one prefixed with
    ``face`` (normalizer, 3-nearest-neighbour estimator, image-name list).
    """
    self._imgdir = img_dir
    self._extractors = self.__get_extractors()

    # Independent normalizer/estimator pairs for the two sets.
    self._normalizer, self._face_normalizer = Normalizer(), Normalizer()
    self._estimator, self._face_estimator = (
        NearestNeighbors(n_neighbors=3),
        NearestNeighbors(n_neighbors=3),
    )

    # Names are filled in later; start empty.
    self._imgnames, self._face_imgnames = [], []
class ScikitNormalizer(object):
    """Wrap sklearn's Normalizer and shift/scale its output by (x + 1) / 2."""

    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        """Fit the wrapped Normalizer on ``data``."""
        self.data_normalizer.fit(data)

    def transform(self, data):
        """Normalize ``data``, then add 1 and halve every value."""
        normalized = self.data_normalizer.transform(data)
        shifted = normalized + 1
        return shifted / 2
def test_ver2_syntetic_dataset(self):
    """Run the correlation/sparsity test on the synthetic CF dataset.

    The CF matrix is l2-normalized row-wise, so its product with its own
    transpose is used as the CF proximity matrix.
    """
    self.ex = experiment.Experiment()
    ex = self.ex

    ex.cf_matrix = load_sparse_data('syntetic_cf.dat')
    row_normalizer = Normalizer(norm='l2', copy=True)
    ex.cf_matrix = row_normalizer.transform(ex.cf_matrix)  # normalized.

    ex.cb_prox = experiment.Experiment.load_data(PKL + 'cb_prox.pkl')
    ex.cf_prox = ex.cf_matrix * ex.cf_matrix.T

    ex.test_corr_sparsity(draw=True, interval=100)
def reduce_dimension(self, n_components=2):
    """
    Normalize the rows of self.data and return their PCA projection
    onto n_components components.
    """
    reducer = PCA(n_components=n_components)
    normalized_rows = Normalizer().fit_transform(self.data.values)
    return reducer.fit_transform(normalized_rows)
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):
    """Generate a regression dataset via ``_make_nn_regression``.

    Optionally densifies X, holds out a test split, adds noise to the
    training targets (clipped at 0), normalizes X with Normalizer and
    scales y with MinMaxScaler (fitted on the training part).

    Returns ``(X_train, y_train, X_test, y_test, w)`` when a test split
    was made (``test_size > 0``), otherwise ``(X_train, y_train, w)``.
    """
    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        splitter = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                                test_size=test_size, train_size=1-test_size)
        train, test = list(splitter)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
    else:
        X_train, y_train = X, y
        X_test, y_test = None, None

    # Sparse matrices get their indices sorted after the split.
    if not dense:
        X_train.sort_indices()
        if X_test is not None:
            X_test.sort_indices()

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is None:
        return X_train, y_train, w
    return X_train, y_train, X_test, y_test, w
def normalize(self, msi, norm="l1"):
    """Normalize the image held by ``msi`` in place.

    The image is collapsed with ``collapse_image``, normalized with
    Normalizer using the given ``norm``, reshaped to its original shape
    and written back with ``msi.set_image``. A masked-array mask is
    re-applied afterwards.
    """
    image = msi.get_image()
    original_shape = image.shape
    collapsed_image = collapse_image(image)

    # the normalizer drops the mask, so remember it for later
    is_masked_array = isinstance(image, np.ma.MaskedArray)
    if is_masked_array:
        mask = image.mask

    normalized_image = Normalizer(norm=norm).transform(collapsed_image)

    if is_masked_array:
        normalized_image = np.ma.MaskedArray(normalized_image, mask=mask)

    msi.set_image(np.reshape(normalized_image, original_shape))
class KNN(Model):
    """k-nearest-neighbour regressor trained on log targets."""

    def __init__(self, X_train, y_train, X_val, y_val):
        """Fit the normalizer and regressor, then print the validation result."""
        super().__init__()

        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)

        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10,
                                                 weights='distance',
                                                 p=1)
        train_features = self.normalizer.transform(X_train)
        # The regressor learns log(y); guess() undoes this with exp.
        self.clf.fit(train_features, numpy.log(y_train))

        val_features = self.normalizer.transform(X_val)
        print("Result on validation data: ",
              self.evaluate(val_features, y_val))

    def guess(self, feature):
        """Predict targets for ``feature`` on the original (non-log) scale."""
        normalized = self.normalizer.transform(feature)
        log_prediction = self.clf.predict(normalized)
        return numpy.exp(log_prediction)
def test_pipeline():
    """Check the ``what().id()`` strings of a Normalizer, a KMeans and
    a Pipeline that combines both."""
    norm = Normalizer(norm='l1')
    norm_id = norm.what().id()
    expected_norm_id = "Normalizer(norm='l1')"
    assert norm_id == expected_norm_id

    kmeans = KMeans(n_clusters=12)
    kmeans_id = kmeans.what().id()
    print(kmeans_id)
    expected_kmeans_id = "KMeans(algorithm='auto',init='k-means++',max_iter=300,n_clusters=12,n_init=10,random_state=None,tol=0.0001)"
    assert kmeans_id == expected_kmeans_id

    # The pipeline id embeds the ids of its steps.
    steps = (('norm', norm), ('kmeans', kmeans))
    # noinspection PyTypeChecker
    pipeline_id = Pipeline(steps).what().id()
    expected_pipeline_id = "Pipeline(steps=(('norm',%s),('kmeans',%s)))" % (norm_id, kmeans_id)
    assert pipeline_id == expected_pipeline_id
def get_tf_idf_M(M, tf=["bin", "raw", "log", "dnorm"],
                 idf=["c", "smooth", "max", "prob"], norm_samps=False):
    """Return the tf-idf weighted version of the term-count matrix ``M``.

    Args:
        M: 2-D array-like of term counts, one row per text and one column
            per word.
        tf: term-frequency scheme. Only "raw" (the counts themselves) is
            implemented; the list default merely enumerates the known
            scheme names and is never mutated.
        idf: inverse-document-frequency scheme. Only "c"
            (log(N / document frequency)) is implemented; the list default
            merely enumerates the known scheme names.
        norm_samps: when true, pass the result through Normalizer.

    Returns:
        Array with the same shape as ``M`` holding tf * idf.

    Raises:
        ValueError: if ``tf`` or ``idf`` names a scheme that is not
            implemented (this includes the list defaults).
    """
    # Fail with a clear message instead of an unbound-variable error
    # further down.
    if not (isinstance(tf, str) and tf == "raw"):
        raise ValueError(
            "unsupported tf scheme %r: only 'raw' is implemented" % (tf,))
    if not (isinstance(idf, str) and idf == "c"):
        raise ValueError(
            "unsupported idf scheme %r: only 'c' is implemented" % (idf,))

    M = np.asarray(M)
    N = len(M)

    # "raw" tf is just the frequency of the word in a text.
    tf_M = M

    # Number of texts that contain each word (non-zero entries per column).
    doc_freq = np.count_nonzero(M, axis=0)

    # A word that occurs in no text has tf == 0 everywhere, so its idf is
    # set to 0 rather than log(N / 0). float(N) forces true division on
    # Python 2 as well.
    idf_v = np.zeros(M.shape[1], dtype=float)
    present = doc_freq > 0
    idf_v[present] = np.log(float(N) / doc_freq[present])

    tf_idf_M = tf_M * idf_v

    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)

    return tf_idf_M
def load_data(self):
    """Load the train and test feature files and normalize both.

    A feature file that does not exist yet is first produced with
    ``self.feature_extraction`` from the matching raw text file.

    Returns dense train data, train targets, dense test data and test
    targets, in that order.
    """
    file_pairs = (('train.txt', 'features_train.txt'),
                  ('test.txt', 'features_test.txt'))

    loaded = []
    for raw_name, feature_name in file_pairs:
        if not os.path.exists(feature_name):
            self.feature_extraction(raw_name, feature_name)
        loaded.append(load_svmlight_file(feature_name))
    (data_train, target_train), (data_test, target_test) = loaded

    normalizer = Normalizer().fit(data_train)
    data_train = normalizer.transform(data_train)
    data_test = normalizer.transform(data_test)

    return (data_train.toarray(), target_train,
            data_test.toarray(), target_test)
def test_normalizer_vs_sklearn():
    """Normalizer applied to the list of trajectories must agree with
    NormalizerR applied to the first trajectory on its own."""
    reference = NormalizerR()
    reference.fit(np.concatenate(trajs))

    candidate = Normalizer()
    candidate.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
def lstm_validate(lstm_model, evaluation_dataset, create_confusion_matrix=False,
                  number_of_subframes=0, sample_strategy="random", batch_size=32):
    """Evaluate ``lstm_model`` on ``evaluation_dataset``.

    Every dataset object contributes its hoj set (bucketed with
    ``get_buckets`` when ``number_of_subframes`` > 0) and the first entry
    of its hoj label.

    Returns ``(score, acc, cnf_matrix)``; ``cnf_matrix`` is the confusion
    matrix passed through Normalizer when ``create_confusion_matrix`` is
    True, otherwise None.
    """
    print("evaluate neural network...")

    use_buckets = number_of_subframes > 0
    validation_data = []
    validation_labels = []
    for sample in evaluation_dataset:
        if use_buckets:
            hoj_set = get_buckets(sample.get_hoj_set(), number_of_subframes,
                                  sample_strategy)
        else:
            hoj_set = sample.get_hoj_set()
        validation_data.append(hoj_set)
        validation_labels.append(sample.get_hoj_label()[0])

    # evaluate neural network
    inputs = np.array(validation_data)
    targets = np.array(validation_labels)
    score, acc = lstm_model.evaluate(inputs, targets,
                                     batch_size=batch_size, verbose=0)
    print("Accuracy:",acc)

    if create_confusion_matrix is not True:
        return score, acc, None

    predictions = lstm_model.predict(np.array(validation_data),
                                     batch_size = batch_size)

    # Class index = position of the largest entry, for labels and outputs.
    sample_indices = range(len(predictions))
    real_labels = [np.argmax(validation_labels[k]) for k in sample_indices]
    predicted_labels = [np.argmax(predictions[k]) for k in sample_indices]

    cnf_matrix = confusion_matrix(real_labels, predicted_labels)
    cnf_matrix = Normalizer().fit_transform(cnf_matrix)
    return score, acc, cnf_matrix
def __init__(self, X_train, y_train, X_val, y_val):
    """Fit a Normalizer and a 10-neighbour regressor on log targets, then
    print the result of ``self.evaluate`` on the validation data."""
    super().__init__()

    self.normalizer = Normalizer()
    self.normalizer.fit(X_train)

    self.clf = neighbors.KNeighborsRegressor(n_neighbors=10,
                                             weights='distance',
                                             p=1)
    train_features = self.normalizer.transform(X_train)
    # The regressor is trained on the logarithm of the targets.
    self.clf.fit(train_features, numpy.log(y_train))

    val_features = self.normalizer.transform(X_val)
    print("Result on validation data: ",
          self.evaluate(val_features, y_val))
def __init__(self, nor='nor', fold=2):
    """Load 'wine.data' and scale its feature columns.

    Args:
        nor: 'nor' scales with Normalizer; any other value scales with
            MinMaxScaler.
        fold: stored unchanged on the instance.
    """
    self.fold = fold

    # Use a context manager so the file handle is closed after reading.
    # NOTE(review): read_csv treats the first row as a header by default;
    # confirm that 'wine.data' really has a header line.
    with open('wine.data') as data_file:
        dataframe = pandas.read_csv(data_file)
    array = dataframe.values

    # separate array into input and output components
    self.X = array[:, 1:]
    self.Y = array[:, 0]

    self.nor = nor
    # normalizer can turn length of vector into 1.
    if self.nor == 'nor':
        scaler = Normalizer().fit(self.X)
    else:
        scaler = MinMaxScaler().fit(self.X)
    self.X = scaler.transform(self.X)

    numpy.set_printoptions(precision=3)
def test_normalizer_l1():
    """Check l1 row normalization on dense and sparse inputs, with and
    without copying, including a row made entirely of zeros."""
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sp.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sp.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert X_norm is not X
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert X_norm is X
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix):
        X = init(X_dense)
        # This is the l1 test, so normalize with l1 here as well.
        X_norm = Normalizer(norm='l1', copy=False).transform(X)

        assert X_norm is not X
        assert isinstance(X_norm, sp.csr_matrix)

        X_norm = toarray(X_norm)
        # Recompute the sums for THIS result; the previous code asserted
        # on the stale row_sums left over from the loop above.
        row_sums = np.abs(X_norm).sum(axis=1)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
def test_sklearn_transform():
    """A SklearnTransform computation must emit exactly one record on
    "out" whose payload equals the transformer's own output."""
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer,
                                   istreams=[], ostream="out")
    context = ComputationContext(computation)

    # Feed X_test to the computation as a JSON list of records.
    payload = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", payload, None))

    assert len(context.records) == 1
    out_records = context.records["out"]
    assert len(out_records) == 1

    record = out_records[0]
    assert record.key == "transform"
    expected = transformer.transform(X_test)
    assert np.allclose(expected, json.loads(record.data))
def __init__(self, dataset, n_words=300, add_global_desc=True, color_sift=False):
    """Store the settings, create an l1 Normalizer and choose the
    descriptor function according to ``color_sift``."""
    self.dataset = dataset
    self.n_words = n_words
    self.add_global_desc = add_global_desc
    self.normalizer = Normalizer(norm='l1')
    self.color_sift = color_sift

    # Colour SIFT descriptors when requested, plain SIFT otherwise.
    self.feature_extractor = (color_sift_descriptors if self.color_sift
                              else sift_descriptors)
def perform_classification (corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Perform classification with a linear-kernel SVM and a deep-kernel SVM.

    The graph files are turned into count vectors over the embedding
    vocabulary, normalized, split 90/10 with a random seed and passed to
    both classifiers.

    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    '''
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    subgraph_vocab = sorted(gensim_model.vocab.keys())
    vocab_size = len(subgraph_vocab)
    logging.info('Vocab consists of {} subgraph features'.format(vocab_size))

    wlk_files = get_files(corpus_dir, extn)
    file_count = len(wlk_files)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(file_count))

    # Count subgraph occurrences per file over the fixed vocabulary.
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    counts = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(counts)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))

    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.1, random_state=seed)
    shapes = (X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(*shapes))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel (gensim_model, subgraph_vocab)
    deep_kernel_svm_classify (X_train, X_test, Y_train, Y_test, subgraph_kernel)
def vectorize(n, comp=0):
    """Vectorize the train and test streams with tf-idf and save both.

    When ``comp`` > 0 the tf-idf matrices are additionally reduced with
    TruncatedSVD(comp) and normalized. The vectorizer, SVD and normalizer
    are fitted on the training stream only. Results are written with
    ``np.save`` to ``trainvecfnm`` and ``testvecfnm``.
    """
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode',
                          ngram_range=(1,2), stop_words='english',
                          sublinear_tf=True, use_idf=True, smooth_idf=True)

    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa, scaler = None, None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = scaler.fit_transform(lsa.fit_transform(X))

    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa is not None:
        Z = scaler.transform(lsa.transform(Z))

    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
# Script: read parameters from the Java bridge, load the train/test data,
# optionally normalize the features, then fit and score a CommonModel,
# passing the bridge to both calls.
# NOTE(review): train_test_split is imported but not used in the visible
# code, and Normalizer is used below without a visible import; confirm.
from sklearn.model_selection import train_test_split
from data import load_data, parse_params
from java_bridge import JavaBridge
from common_model import CommonModel

# Create the bridge.
bridge = JavaBridge()

# Read and parse the parameters.
ps = bridge.read()
params = parse_params(ps)

# Load data from file (paths come from the parsed parameters).
train_X, train_Y = load_data(params['train_path'])
test_X, test_Y = load_data(params['test_path'])

# Normalize datasets. The normalizer is fitted on the training features
# and applied to both sets; copy=False asks for the transform to be done
# without copying the data.
if params['normalize_features']:
    norm = Normalizer().fit(train_X)
    train_X = norm.transform(train_X, copy=False)
    test_X = norm.transform(test_X, copy=False)

# Train and score the model.
model = CommonModel(params)
model.fit(train_X, train_Y, bridge)
model.score(test_X, test_Y, bridge)

# Close the bridge and end program.
bridge.end()
def _scale_feature(scaler, X_train, X_test, column):
    """Fit ``scaler`` on one training column and transform the same test column."""
    train_col = scaler.fit_transform(X_train[column].values.reshape(-1, 1))
    test_col = scaler.transform(X_test[column].values.reshape(-1, 1))
    return train_col, test_col


def _vectorize_feature(vectorizer, X_train, X_test, column):
    """Fit ``vectorizer`` on one training text column and transform the same test column."""
    train_col = vectorizer.fit_transform(X_train[column].values)
    test_col = vectorizer.transform(X_test[column].values)
    return train_col, test_col


def _grid_search(estimator, param_grid, X, y, cv, **search_kwargs):
    """Run a ROC-AUC grid search for ``estimator`` and print the best parameters."""
    search = GridSearchCV(estimator, param_grid, cv=cv, refit=False,
                          return_train_score=True, scoring='roc_auc',
                          **search_kwargs)
    search.fit(X, y)
    print('Best params for {}: {}'.format(type(estimator).__name__,
                                          search.best_params_))
    return search


def _report_model(clf, X_train_vec, X_test_vec, y_train, y_test):
    """Plot ROC curves and the test confusion matrix for a fitted ``clf``, then print F1 and accuracy."""
    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion matrix (only the test one is plotted)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    # The ROC curve is built from the second predict_proba column rather than
    # from the predicted labels.
    fpr_train, tpr_train, _ = roc_curve(y_train, y_pred_train[:, 1])
    fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train, tpr_train, color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test, tpr_test, color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]), np.array([0, 1]), color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()

    print('Best AUC for the model is {} '.format(test_auc))

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test), annot=True,
                fmt='.2%', cmap='Greens')
    plt.show()

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))


def model_build():
    """Train and evaluate five classifiers on the preprocessed review data.

    Reads ``./result_datasets/preprocessed_data.csv``, splits it 67/33
    (stratified on ``Score``), encodes every feature column, and then for
    Naive Bayes, Logistic Regression, Decision Tree, Random Forest and GBDT:
    runs a ROC-AUC grid search, fits a model with hard-coded hyper-parameters,
    and plots/prints its metrics. The fitted GBDT model is pickled to
    ``final_model.pkl``. Finally a hard-coded results table is printed.

    Returns:
        None
    """
    import pickle
    from prettytable import PrettyTable

    data = pd.read_csv('./result_datasets/preprocessed_data.csv')

    # splitting data to train and test data
    X = data.drop('Score', axis=1)
    Y = data.Score.values
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.33,
                                                        stratify=Y,
                                                        random_state=42)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Normalising all the numerical features.
    # NOTE(review): `std_scaler` is a Normalizer applied to a single-column
    # array, i.e. every row holds one value - confirm this (rather than a
    # StandardScaler) is what was intended.
    std_scaler = Normalizer()
    min_max = MinMaxScaler()
    numeric_columns = [
        'payment_sequential', 'payment_installments', 'payment_value',
        'price', 'freight_value', 'product_name_length',
        'product_description_length', 'product_photos_qty', 'delivery_days',
        'estimated_days', 'ships_in',
    ]

    # Normalising categorical features.
    onehot = CountVectorizer()
    cat = OneHotEncoder()
    text_columns = [
        'payment_type', 'customer_state', 'seller_state',
        'product_category_name', 'arrival_time', 'delivery_impression',
        'estimated_del_impression', 'ship_impression',
    ]

    # Stacking order: numeric features, text features, then seller_popularity.
    train_parts = []
    test_parts = []
    for column in numeric_columns:
        train_col, test_col = _scale_feature(std_scaler, X_train, X_test, column)
        train_parts.append(train_col)
        test_parts.append(test_col)
    for column in text_columns:
        train_col, test_col = _vectorize_feature(onehot, X_train, X_test, column)
        train_parts.append(train_col)
        test_parts.append(test_col)
    train_col, test_col = _scale_feature(min_max, X_train, X_test,
                                         'seller_popularity')
    train_parts.append(train_col)
    test_parts.append(test_col)

    # NOTE(review): existing_cust is one-hot encoded but has never been part of
    # the stacked matrices - confirm whether it should be.
    existing_cust_train, existing_cust_test = _scale_feature(
        cat, X_train, X_test, 'existing_cust')

    # stacking up all the encoded features
    X_train_vec = hstack(tuple(train_parts))
    X_test_vec = hstack(tuple(test_parts))
    print(X_train_vec.shape, X_test_vec.shape)

    # ---------------------------------------------------------------- Naive Bayes
    # Hyper parameter tuning
    _grid_search(MultinomialNB(class_prior=[0.5, 0.5]),
                 {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                 X_train_vec, y_train, cv=3)

    # Fitting the model
    clf = MultinomialNB(alpha=0.0001, class_prior=[0.5, 0.5])
    clf.fit(X_train_vec, y_train)
    _report_model(clf, X_train_vec, X_test_vec, y_train, y_test)

    # Observations
    # 1. Naive Bayes performed decently, with minimal overfitting between train and test.
    # 2. Both train and test f1 score was 0.86 and accuracy 77%.
    # 3. But the confusion matrix says it has misclassified many points as False Positives.
    # 4. AUC score for test data was 0.694.

    # -------------------------------------------------------- Logistic Regression
    # Hyper parameter tuning
    # max_iter=1000 is used because fitting raised an exception otherwise.
    _grid_search(LogisticRegression(max_iter=1000, solver='lbfgs'),
                 {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30]},
                 X_train_vec, y_train, cv=3)

    # NOTE
    # * For performance measurement we will not use accuracy as a metric as
    #   the data set is highly imbalanced.
    # * We will use AUC score and f1 score as performance metric.
    clf = LogisticRegression(C=0.1, max_iter=1000, solver='lbfgs')
    clf.fit(X_train_vec, y_train)
    _report_model(clf, X_train_vec, X_test_vec, y_train, y_test)

    # Observations
    # 1. Logistic regression performs considerably better than Naive Bayes in
    #    terms of f1 score, while the AUC score is almost the same.
    # 2. Misclassification of False Positives reduced, which raised the f1 score to 92%.
    # 3. Accuracy was 86% for both train and test, so the model doesn't overfit.

    # -------------------------------------------------------------- Decision Tree
    # Hyper parameter tuning
    _grid_search(DecisionTreeClassifier(class_weight='balanced'),
                 {'max_depth': [1, 5, 10, 15, 20],
                  'min_samples_split': [5, 10, 100, 300, 500, 1000]},
                 X_train_vec, y_train, cv=3)

    clf = DecisionTreeClassifier(class_weight='balanced', max_depth=20,
                                 min_samples_split=300)
    clf.fit(X_train_vec, y_train)
    _report_model(clf, X_train_vec, X_test_vec, y_train, y_test)

    # Observations
    # 1. Decision Tree does no better in f1 score, AUC score or accuracy
    #    (AUC and accuracy come out to 0.708 and 70%).
    # 2. It misclassified a lot of points as False Positives.
    # 3. Model doesn't overfit but doesn't perform better either.

    # -------------------------------------------------------------- Random Forest
    # Hyper parameter tuning
    # max_depth is limited to 10 so that the model doesn't overfit
    _grid_search(RandomForestClassifier(class_weight='balanced'),
                 {'min_samples_split': [5, 10, 30, 50, 100],
                  'max_depth': [5, 7, 10]},
                 X_train_vec, y_train, cv=5, n_jobs=-1, verbose=1)

    clf = RandomForestClassifier(class_weight='balanced', max_depth=10,
                                 min_samples_split=5)
    clf.fit(X_train_vec, y_train)
    _report_model(clf, X_train_vec, X_test_vec, y_train, y_test)

    # Observations
    # 1. Random forest performs better than logistic regression in terms of f1 score and accuracy.
    # 2. It gives an f1 score of 90.13% and doesn't seem to overfit.
    # 3. Misclassification rate is still not that great.
    # 4. AUC score is 0.718.
    # 5. Accuracy score is 83%.

    # ----------------------------------------------------------------------- GBDT
    # Hyper parameter tuning
    # max_depth is limited to 8 so that the model doesn't overfit.
    # The search must run on the gradient-boosting estimator itself; it was
    # previously handed the random forest by mistake.
    _grid_search(GradientBoostingClassifier(),
                 {'min_samples_split': [5, 10, 30, 50],
                  'max_depth': [3, 5, 7, 8]},
                 X_train_vec, y_train, cv=5)

    clf = GradientBoostingClassifier(max_depth=8, min_samples_split=5)
    clf.fit(X_train_vec, y_train)

    # save the model to disk
    Pkl_Filename = "final_model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(clf, file)

    _report_model(clf, X_train_vec, X_test_vec, y_train, y_test)

    # Observations
    # 1. Gradient Boosted classifier gives the best f1 score of 0.9243 and AUC score of 0.745.
    # 2. Misclassification of False Positives and True Negatives is also
    #    reduced to 11%, and the true positive rate is 83%.
    # 3. Accuracy score is 86% for test and 87% for train data.
    # 4. Model overfits slightly compared to the rest of the models.

    # Observations (deep neural network, trained outside this function)
    # 1. We created a standard deep neural network model and trained it for 20
    #    epochs; this gave an f1 score very similar to our best ML model so far, GBDT.
    # 2. Note that very little hyper-parameter tuning was done for this neural
    #    network, and it still gives a very decent performance.
    # 3. However the AUC score of GBDT is still better than the NN model.
    # 4. NN based models can be much better than conventional ML models for such problems.

    # -------------------------------------------------------------------- Results
    # The figures below are hard-coded from an earlier run, not computed here.
    table = PrettyTable()
    table.field_names = ["Model", "F1_score", " AUC_score ", " Accuracy "]
    table.add_row(["Naive Bayes", '0.8575', '0.694', '0.7689'])
    table.add_row(["Logistic Regression", '0.9217', '0.699', '0.8605'])
    table.add_row(["Decision Tree", '0.8031', '0.713', '0.7021'])
    table.add_row([
        "Random Forest",
        '0.9013',
        '0.718',
        '0.8315',
    ])
    table.add_row(["GBDT**(BEST)", '0.9243', '0.745', '0.8651'])
    # table.add_row(["Deep NN",'0.9233','0.710','0.8629'])
    print(table)
    return
from sklearn.kernel_approximation import RBFSampler, Nystroem from sklearn.cluster import FeatureAgglomeration from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100))] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) with warnings.catch_warnings(): warnings.simplefilter('ignore') for (preprocessor, C, loss, fit_intercept) in itertools.product( preprocessor_list, [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1., 10., 50., 100.],
#Add this version of X to the list X_all.append(['StdSca','All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem]) #MinMax #Apply transform only for non-categorical data X_temp = MinMaxScaler().fit_transform(X_train[:,0:size]) X_val_temp = MinMaxScaler().fit_transform(X_val[:,0:size]) #Concatenate non-categorical data and categorical X_con = numpy.concatenate((X_temp,X_train[:,size:]),axis=1) X_val_con = numpy.concatenate((X_val_temp,X_val[:,size:]),axis=1) #Add this version of X to the list X_all.append(['MinMax', 'All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem]) #Normalize #Apply transform only for non-categorical data X_temp = Normalizer().fit_transform(X_train[:,0:size]) X_val_temp = Normalizer().fit_transform(X_val[:,0:size]) #Concatenate non-categorical data and categorical X_con = numpy.concatenate((X_temp,X_train[:,size:]),axis=1) X_val_con = numpy.concatenate((X_val_temp,X_val[:,size:]),axis=1) #Add this version of X to the list X_all.append(['Norm', 'All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem]) #Impute #Imputer is not used as no data is missing #List of transformations trans_list = [] for trans,name,X,X_val,v,cols_list,rem_list,rank_list,i_cols_list,i_rem_list in X_all: trans_list.append(trans)
"syllablesPerWord", "charactersPerWord" ] ############################################################################### # Size Matters: Word Count as a Measure of Quality on Wikipedia FEATURES #features_cols = ["wordCount"] ############################################################################### # Select only the columns corresponding to the features in the list X = data[features_cols] # Select qualityClass as the response (y) y = data.qualityClass # NORMALIZE DATASET scaler = Normalizer().fit(X) X = scaler.transform(X) # STANDARDIZE DATASET #X = preprocessing.scale(X) #y = preprocessing.scale(y) # FEATURE SELECTION #from sklearn.feature_selection import VarianceThreshold #sel = VarianceThreshold(threshold=(.8 * (1 - .8))) #X = sel.fit_transform(X) # 10-fold cross-validation with multilayer perceptron #mlp = MLPClassifier() #print cross_val_score(mlp, X, y, cv=10, scoring='accuracy').mean()
# Experiment settings and synthetic inputs. `item_num` and `user_num` are
# defined outside this excerpt.
dimension=10
pool_size=20
iteration=2000
loop=1
sigma=0.01# noise
delta=0.1# high probability
alpha=1# regularizer
alpha_2=0.1# edge delete CLUB
epsilon=8 # Ts
beta=0.15# exploration for CLUB, SCLUB and GOB
thres=0.0
state=False # False for artificial dataset, True for real dataset
lambda_list=[4]

# Random Gaussian item features, passed through a Normalizer.
item_feature_matrix=Normalizer().fit_transform(np.random.normal(size=(item_num, dimension)))

# Three candidate user graphs (WS / ER / BA builders are defined elsewhere).
neighbor_num=3
ws_adj=WS_graph(user_num, neighbor_num, 0.1)
er_adj=ER_graph(user_num, 0.2)
ba_adj=BA_graph(user_num, 3)

# Random edge weights, averaged with their transpose so the matrix is symmetric,
# then applied element-wise to each adjacency matrix.
random_weights=np.round(np.random.uniform(size=(user_num, user_num)), decimals=2)
random_weights=(random_weights.T+random_weights)/2
ws_adj=ws_adj*random_weights
er_adj=er_adj*random_weights
ba_adj=ba_adj*random_weights

# The adjacency actually used as ground truth: an RBF kernel over random
# Gaussian user vectors.
true_adj=rbf_kernel(np.random.normal(size=(user_num, dimension)), gamma=0.25/dimension)
#true_adj=ws_adj
def prepare_scale_train_valid_test(
    data: Union[pd.DataFrame, pd.Series],
    n_input_days: int,
    n_predict_days: int,
    test_size: float,
    s_end_date: str,
    no_shuffle: bool,
):
    """
    Prepare and scale train, validate and test data.

    Parameters
    ----------
    data: Union[pd.DataFrame, pd.Series]
        Stock prices indexed by date
    n_input_days: int
        Length of each input window
    n_predict_days: int
        Length of each target window that follows an input window
    test_size: float
        Forwarded to train_test_split as ``test_size``
    s_end_date: str
        If truthy, rows whose index is after this date are dropped first
    no_shuffle: bool
        Forwarded to train_test_split as its ``shuffle`` argument

    Returns
    -------
    X_train: np.ndarray
        Array of training data.  Shape (# samples, n_inputs, 1)
    X_test: np.ndarray
        Array of validation data.  Shape (total sequences - #samples, n_inputs, 1)
    y_train: np.ndarray
        Array of training outputs.  Shape (#samples, n_days)
    y_test: np.ndarray
        Array of validation outputs.  Shape (total sequences -#samples, n_days)
    X_dates_train: np.ndarray
        Array of dates for X_train
    X_dates_test: np.ndarray
        Array of dates for X_test
    y_dates_train: np.ndarray
        Array of dates for y_train
    y_dates_test: np.ndarray
        Array of dates for y_test
    test_data: np.ndarray
        The last n_input_days prices (scaled when a scaler is configured)
    dates_test: np.ndarray
        Dates belonging to test_data
    scaler:
        Fitted preprocesser, or None when preprocessing is disabled
    error: bool
        True when the data is too short; every other element is then None

    Raises
    ------
    ValueError
        If PREPROCESSER is not one of the recognised settings
    """
    # Pre-process data
    if PREPROCESSER == "standardization":
        scaler = StandardScaler()
    elif PREPROCESSER == "minmax":
        scaler = MinMaxScaler()
    elif PREPROCESSER == "normalization":
        scaler = Normalizer()
    elif (PREPROCESSER == "none") or (PREPROCESSER is None):
        scaler = None
    else:
        # Previously `scaler` was simply left unbound here and the function
        # failed later with an UnboundLocalError.
        raise ValueError(
            "Unknown PREPROCESSER setting: {!r}".format(PREPROCESSER)
        )

    if s_end_date:
        data = data[data.index <= s_end_date]
        if n_input_days + n_predict_days > data.shape[0]:
            print("Cannot train enough input days to predict with loaded dataframe\n")
            return (
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                True,
            )

    # Test data is used for forecasting.  Takes the last n_input_days data points.
    # NOTE(review): the scaler is fitted on, and the training windows are cut
    # from, the FULL series (including these last points) - confirm intended.
    test_data = data.iloc[-n_input_days:]

    dates = data.index
    dates_test = test_data.index
    if scaler:
        train_data = scaler.fit_transform(data.values.reshape(-1, 1))
        test_data = scaler.transform(test_data.values.reshape(-1, 1))
    else:
        train_data = data.values.reshape(-1, 1)
        test_data = test_data.values.reshape(-1, 1)

    prices = train_data

    input_dates = []
    input_prices = []
    next_n_day_prices = []
    next_n_day_dates = []

    # Slide a window over the series: n_input_days of inputs followed by
    # n_predict_days of targets.
    for idx in range(len(prices) - n_input_days - n_predict_days):
        input_end = idx + n_input_days
        target_end = input_end + n_predict_days
        input_prices.append(prices[idx:input_end])
        input_dates.append(dates[idx:input_end])
        next_n_day_prices.append(prices[input_end:target_end])
        next_n_day_dates.append(dates[input_end:target_end])

    input_dates = np.asarray(input_dates)
    input_prices = np.array(input_prices)
    next_n_day_prices = np.array(next_n_day_prices)
    next_n_day_dates = np.asarray(next_n_day_dates)

    # NOTE(review): `no_shuffle` is passed straight through as `shuffle`, so a
    # True value shuffles - confirm against how the caller sets the flag.
    (
        X_train,
        X_valid,
        y_train,
        y_valid,
        X_dates_train,
        X_dates_valid,
        y_dates_train,
        y_dates_valid,
    ) = train_test_split(
        input_prices,
        next_n_day_prices,
        input_dates,
        next_n_day_dates,
        test_size=test_size,
        shuffle=no_shuffle,
    )
    return (
        X_train,
        X_valid,
        y_train,
        y_valid,
        X_dates_train,
        X_dates_valid,
        y_dates_train,
        y_dates_valid,
        test_data,
        dates_test,
        scaler,
        False,
    )
# Loads labels and TF-IDF features for the 'essays' baseline.
# `train_partition_name`, `test_partition_name`, `get_labels` and
# `get_features_from_text` are defined outside this excerpt.
BASELINE = 'essays'
labels = get_labels(train_partition_name, test_partition_name, BASELINE)
# labels[0] / labels[1] each unpack into (encoded labels, original labels)
# for the training and test partitions respectively.
encoded_train_labels, original_training_labels = labels[0]
encoded_test_labels, original_test_labels = labels[1]

#
# Load essays
#
# input="filename": the vectorizer is given file paths and reads the files itself.
vectorizer = TfidfVectorizer(input="filename", ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, max_features=30000)
transformer = Normalizer()  # Normalize frequencies to unit length
preprocessor = 'tokenized'
training_and_test_data_essays = get_features_from_text( train_partition_name, test_partition_name, baseline=BASELINE, preprocessor=preprocessor, vectorizer=vectorizer, transformer=transformer)
# Element 0 is the training matrix, element 1 the test matrix.
train_matrix_essays = training_and_test_data_essays[0]
test_matrix_essays = training_and_test_data_essays[1]

#-------------------------------------------------------------------------------------
# -----------------NN classifier... on essays--------------------------------------------
def apply(self, df):
    """Cluster the rows of ``df`` with spectral clustering.

    The frame's values are passed through a ``Normalizer`` first; the
    clusterer is configured from ``self.options``.

    Returns the result of ``fit_predict`` on the normalized values.
    """
    normalized = Normalizer().fit_transform(df.values)
    clusterer = sklearn.cluster.SpectralClustering(**self.options)
    return clusterer.fit_predict(normalized)
def apply(self, df):
    """Cluster the rows of ``df`` with a Gaussian mixture model.

    The frame's values are passed through a ``Normalizer`` first; the
    mixture model is configured from ``self.options``.

    Returns the result of ``fit_predict`` on the normalized values.
    """
    normalized = Normalizer().fit_transform(df.values)
    mixture = sklearn.mixture.GaussianMixture(**self.options)
    return mixture.fit_predict(normalized)
from sklearn import metrics
from sklearn.preprocessing import Normalizer
import h5py
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

# Loads the training and validation CSVs (no header row), separates column 0
# from columns 1..60, and normalizes both feature sets.
# NOTE(review): `pd` and `np` are used but not imported in this excerpt -
# confirm they are imported elsewhere.
traindata = pd.read_csv('data/train.csv', header=None)
testdata = pd.read_csv('data/valid.csv', header=None)

# Column 0 becomes the target arrays below; columns 1..60 are the features.
X = traindata.iloc[:,1:61]
Y = traindata.iloc[:,0]
C = testdata.iloc[:,0]
T = testdata.iloc[:,1:61]

scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
#print(trainX[0:5,:])

# A second Normalizer is fitted on the validation features rather than reusing
# the one fitted on X.
scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
#print(testT[0:5,:])

y_train = np.array(Y)
y_test = np.array(C)
# Hacemos la descripcion de las columnas que escalamos print(df[['age', 'diabetes', 'high_blood_pressure']].describe()) """ Metodo 2 Normalizacion """ # Creamos una figura con dos sub figuras para graficar fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(9, 5)) ax1.set_title('Antes de Normalizar') # Graficamos tres columnas para ver el comportamiento sns.kdeplot(data=copd, ax=ax1) # Definimos la variable normal para hacer el preprocesamiento de Normalizacion normal = Normalizer(norm='l2', copy=True) df1[['age', 'diabetes', 'high_blood_pressure' ]] = normal.fit_transform(df1[['age', 'diabetes', 'high_blood_pressure']]) # guardamos los datos preprocesados en un nuevo archivo df1.to_csv('preproNorm.csv', sep='\t') # Graficamos los datos escaldos ax2.set_title('Despues de Normalizar') sns.kdeplot(data=df1[['age', 'diabetes', 'high_blood_pressure']], ax=ax2) # Mostramos la grafica plt.show() print(" --- Normalizer ----")
print("Train set size: ", data_train.shape[0]) # build classification model y = data_train['label'].values X = data_train.drop(['label'], axis=1) if not cognates: svm = LinearSVC(C=10, fit_intercept=True) else: svm = svm.SVC(C=10) features = [('cst', digit_col())] clf = pipeline.Pipeline([('union', FeatureUnion(transformer_list=features, n_jobs=1)), ('scale', Normalizer()), ('svm', svm)]) clf.fit(X, y) if not predict_source and not predict_target: y = data_test['label'].values X = data_test.drop(['label'], axis=1) y_pred = clf.predict(X) if term_length_filter: result = pd.concat([ X, pd.DataFrame(y_pred, columns=['prediction']), pd.DataFrame(y, columns=['label'])
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import DatasetSelector

# Pipeline script: load the data, split it, fit the pipeline on the training
# split and predict on the testing split. 'PATH/TO/DATA/FILE' and
# 'COLUMN_SEPARATOR' are placeholders to be filled in before running.

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=85)

# Average CV score on the training set was:0.7012173913043479
# Steps: DatasetSelector configured with "module23.csv", then a Normalizer with
# the L2 norm, then a random forest.
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=12, subset_list="module23.csv"),
    Normalizer(norm="l2"),
    RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.05, min_samples_leaf=10, min_samples_split=14, n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Split the data frame into features (`data`) and target (`target`).
target = pd.DataFrame(data['CASE_STATUS'])
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# newer pandas releases no longer accept.
data = data.drop(columns=['CASE_STATUS'])

# In[29]:

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.2)

# In[30]:

from sklearn.preprocessing import Normalizer

# Fit on the training split only, then apply the same transformer to both
# splits.
normalizer = Normalizer()
normalizer.fit(X_train)
train_data = normalizer.transform(X_train)
test_data = normalizer.transform(X_test)

# In[31]:

# Dimensionality reduction: PCA
from sklearn.decomposition import PCA
import time

# `time.clock()` was removed in Python 3.8; `perf_counter()` is the
# replacement for measuring elapsed time.
# NOTE(review): whatever later computes the elapsed time from `start_time`
# must use `time.perf_counter()` as well.
start_time = time.perf_counter()
pca = PCA(n_components=100)
pca = pca.fit(train_data)
Y.append(words[1]) return X, Y if __name__ == '__main__': # Read all the documents. X, Y = read_data('all_sentiment_shuffled.txt') # Split into training and test parts. Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0) # Set up the preprocessing steps and the classifier. myclf=PegasosWithSVC() #please uncomment to run the below log loss and then comment the above #myclf=PegasosWithLogLoss() model_pl = make_pipeline( TfidfVectorizer(preprocessor = lambda x: x, tokenizer = lambda x: x), SelectKBest(k=1000), Normalizer(), myclf, ) t0 = time.time() model_pl.fit(Xtrain, Ytrain) t1 = time.time() print('Training time: {:.2f} sec.'.format(t1 - t0)) Yguess = model_pl.predict(Xtest) print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))
from sklearn.model_selection import train_test_split from sklearn.preprocessing import Normalizer from sklearn.preprocessing import StandardScaler import qsharp from ML import QClassifier stdsc = StandardScaler() iris = datasets.load_iris() x1 = iris.data[:, 0].reshape(-1, 1) x2 = iris.data[:, 1].reshape(-1, 1) y = iris.target x1_norm = stdsc.fit_transform(x1) x2_norm = stdsc.fit_transform(x2) normalized_set = Normalizer().transform(np.hstack([x1_norm, x2_norm])) data_list = normalized_set.tolist() target_list = y.tolist() data_list_12 = [] target_list_12 = [] for (data, target) in zip(data_list, target_list): if target != 2: data_list_12.append(data) target_list_12.append(target) X_train, X_test, y_train, y_test = train_test_split(data_list_12, target_list_12, test_size=0.2) for (input, desire) in zip(X_test, y_test):
x * y - np.dot(np.outer(x, x), Theta_old_vector) - alpha * np.dot(A, Theta_old_vector)) return Theta_vector user_num = 20 item_num = 100 dimension = 5 alpha = 0.1 # regularizer beta = 0.1 #regularizer mu = 0.001 #step size lambda_ = 0.1 #step size user_feature = np.random.normal(size=(user_num, dimension)) user_feature = Normalizer().fit_transform(user_feature) user_feature_vector = user_feature.flatten() adj = rbf_kernel(user_feature) lap = csgraph.laplacian(adj, normed=False) item_feature = np.random.normal(size=(item_num, dimension)) item_feature = Normalizer().fit_transform(item_feature) Y = np.dot(user_feature, item_feature.T) + np.random.normal( size=(user_num, item_num), scale=0.1) A_true = np.kron(lap, np.identity(dimension)) A = np.identity(user_num * dimension) Theta_matrix = np.zeros((user_num, dimension)) Theta_vector = Theta_matrix.flatten() L = np.identity(user_num)
def function_2(file_name, rows_to_parse):
    """Benchmark the project's grid searches on the NYC-flights data.

    Reads ``data/nycflights/<file_name>.csv``, splits it 50/50, and times the
    ``GridSearch`` helpers three ways: directly, with a dask distributed
    client, and wrapped in ``dask.delayed``. Finally fits a
    ``ParallelPostFit`` logistic regression on synthetic data and predicts on
    a replicated dask array.

    Args:
        file_name: CSV base name (without extension) under data/nycflights.
        rows_to_parse: number of rows to read; converted with ``int()``.

    Returns:
        None. Results are printed.
    """
    import time

    csv_path = os.path.join('data', 'nycflights', file_name + '.csv')
    data = pd.read_csv(csv_path, nrows=int(rows_to_parse))
    data = data.fillna(0)
    print(data)

    train, test = train_test_split(data, train_size=0.5, test_size=0.5)
    dropped = ['DepDelay', 'UniqueCarrier', 'Origin', 'Dest']
    train_x = train.drop(dropped, axis=1)
    train_y = train['DepDelay']
    test_x = test.drop(dropped, axis=1)
    test_y = test['DepDelay']

    # Support Vector Machines
    GridSearch.support_vector_machine(train_x, train_y, test_x, test_y)

    start_time = time.time()
    GridSearch.sklearn_grid_search(train_x, train_y)
    print("--- %s seconds ---" % (time.time() - start_time))

    # ____DASK____
    # Only one client is created. The original also started a default
    # `dask.distributed.Client()` first, whose handle was later overwritten
    # without ever being used or closed.
    client = Client(processes=False, threads_per_worker=4, n_workers=1,
                    memory_limit='2GB')
    print(client)

    start_time = time.time()
    GridSearch.dask_grid_search(train_x, train_y)
    print(f"--- {time.time() - start_time}seconds ---")

    # DASK DELAY: wrap each search in a delayed task and time its compute().
    output = []

    a = dask.delayed(GridSearch.support_vector_machine)(
        train_x, train_y, test_x, test_y)
    print(a)
    start_time = time.time()
    a.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    output.append(a)

    b = dask.delayed(GridSearch.sklearn_grid_search)(train_x, train_y)
    print(b)
    output.append(b)
    start_time = time.time()
    b.compute()
    print("--- %s seconds ---" % (time.time() - start_time))

    c = dask.delayed(GridSearch.dask_grid_search)(train_x, train_y)
    print(c)
    output.append(c)
    start_time = time.time()
    c.compute()
    print("--- %s seconds ---" % (time.time() - start_time))

    total = dask.delayed(sum)(output)

    # Visualize the combined task graph.
    total.visualize()

    # Other code:
    clean_dataset(train_x)
    train_x = train_x.values
    train_y = train_y.values

    from sklearn.preprocessing import Normalizer
    transformer = Normalizer().fit(train_x)
    train_x = transformer.transform(train_x).round(decimals=2)

    # NOTE(review): the normalized flight data above is immediately replaced
    # by synthetic classification data — confirm this is intended.
    train_x, train_y = make_classification(
        n_features=2, n_redundant=0, n_informative=2, random_state=1,
        n_clusters_per_class=1, n_samples=1000)

    # Scale up: increase N, the number of times we replicate the data.
    N = 2
    X_large = da.concatenate(
        [da.from_array(train_x, chunks=train_x.shape) for _ in range(N)])
    print(X_large)

    clf = ParallelPostFit(LogisticRegressionCV(cv=3), scoring="r2")
    # The wrapped estimator must be fitted before predicting; the original
    # called predict() on an unfitted estimator.
    clf.fit(train_x, train_y)
    y_pred = clf.predict(X_large)
    print(y_pred)
# Binary target convention:
#   1: bus
#   0: no bus

# Fill missing feature values with the per-column mean.
# NOTE(review): `Imputer(..., axis=0)` is the older scikit-learn imputation
# API — confirm the pinned scikit-learn version still provides it.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(X)
X = imputer.transform(X)

# Encode both targets (multi-class Y and binary Y_bus) as integers.
labelencoder_Y = LabelEncoder()
labelencoder_Y_bus = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
Y_bus = labelencoder_Y_bus.fit_transform(Y_bus)

# ================================================Splitting Training/Test Data==========================================
# The whole feature matrix is normalized at once. The train/test splits
# (multi-class and single-class, test_size=0.25, random_state=0) that used
# to follow are currently disabled.
sc_X = Normalizer()
X = sc_X.fit_transform(X)

# ================================================Model Selection=======================================================
# One RBF-kernel SVM per target.
svm_clf = SVC(kernel='rbf', random_state=0)
svm_clf_bus = SVC(kernel='rbf', random_state=0)
tpot = TPOTRegressor(generations=10, verbosity=2) tpot.fit(trX, trY) print(tpot.score(teX, teY)) # 导出 tpot.export('pipeline_yield.py') #================= use pipeline result ========================== from sklearn.ensemble import VotingClassifier from sklearn.linear_model import LassoLarsCV from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import Binarizer, FunctionTransformer, Normalizer from tpot.operators.preprocessors import ZeroCount exported_pipeline = make_pipeline(ZeroCount(), Binarizer(threshold=0.17), Normalizer(norm="l1"), LassoLarsCV(normalize=True)) exported_pipeline.fit(trX, trY) trY_pred = exported_pipeline.predict(trX) teY_pred = exported_pipeline.predict(teX) accuracy = exported_pipeline.score(teX, teY) from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error print('MSE train: %.3f, test: %.3f' % (mean_squared_error(trY, trY_pred), mean_squared_error(teY, teY_pred))) print('R^2 train: %.3f, test: %.3f' % (r2_score(trY, trY_pred), r2_score(teY, teY_pred))) """
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.svm import SVC

# Shared, module-level state: input normalizer, label encoder and classifier.
in_encoder = Normalizer()
out_encoder = LabelEncoder()
model = SVC(kernel='linear', probability=True)


def train_model(emdTrainX, trainy):
    """Fit the module-level SVC on normalized inputs and encoded labels.

    Args:
        emdTrainX: training feature vectors, passed through `in_encoder`.
        trainy: training labels; `out_encoder` is fitted on them.
    """
    normalized = in_encoder.transform(emdTrainX)
    encoded = out_encoder.fit_transform(trainy)
    model.fit(normalized, encoded)


def test(emdTestX, trainy):
    """Predict label names for `emdTestX` with the trained model.

    Args:
        emdTestX: feature vectors to classify.
        trainy: unused; kept so existing callers keep working.

    Returns:
        The predicted labels, decoded back through `out_encoder`.
    """
    predicted = model.predict(in_encoder.transform(emdTestX))
    return out_encoder.inverse_transform(predicted)
def test_random_sparse_data(self):
    """Check converted one-hot encoders against scikit-learn on random data.

    For each dtype, builds a 100 x 8 matrix of values drawn from a seeded
    table of categories. It then fits a bare OneHotEncoder, and an
    OneHotEncoder + Normalizer pipeline, for every combination of `sparse`
    and `categorical_features`. Each fitted model is converted with
    `sklearn.convert`, and `evaluate_transformer` must report zero errors.
    """
    import numpy.random as rn

    n_columns = 8
    n_categories = 20

    rn.seed(0)
    categories = rn.randint(50000, size=(n_columns, n_categories))

    def feature_selections():
        # Return fresh objects on every call so no model shares a list.
        return ["all", [3], [4], range(2, 8), range(0, 4), range(0, 8)]

    def assert_conversion_matches(fitted, spec, X, sparse):
        # Compare the converted spec with the fitted scikit-learn model.
        X_out = fitted.transform(X)
        if sparse:
            X_out = X_out.todense()

        input_data = [{"data": row} for row in X]
        output_data = [{"out": row} for row in X_out]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0

    for dt in ["int32", "float32", "float64"]:
        rows = []
        for _ in range(100):
            rows.append([
                categories[j, rn.randint(n_categories)]
                for j in range(n_columns)
            ])
        _X = np.array(rows, dtype=dt)

        # Test this data on a bunch of possible inputs.
        for sparse in (True, False):
            for categorical_features in feature_selections():
                X = _X.copy()

                # This appears to be the only type now working.
                assert X.dtype == np.dtype(dt)

                model = OneHotEncoder(
                    categorical_features=categorical_features, sparse=sparse)
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(model, [("data", Array(n_columns))],
                                       "out")
                assert_conversion_matches(model, spec, X, sparse)

        # Test normal data inside a pipeline
        for sparse in (True, False):
            for categorical_features in feature_selections():
                X = _X.copy()

                encoder = OneHotEncoder(
                    categorical_features=categorical_features,
                    sparse=sparse,
                )
                model = Pipeline([
                    ("OHE", encoder),
                    ("Normalizer", Normalizer()),
                ])
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(model, [("data", Array(n_columns))],
                                       "out").get_spec()
                assert_conversion_matches(model, spec, X, sparse)
# Third group of dummy (indicator) feature columns.
dummy_cols3 = [
    "dummy_living",
    "dummy_luminoso",
    "dummy_terraza",
    "dummy_laundry",
    "dummy_cochera",
    "dummy_split",
    "dummy_piscina",
    "dummy_spa",
    "dummy_acondicionado",
    "dummy_subte",
    "dummy_pozo",
    "dummy_balcon",
    "dummy_sum",
    "dummy_vigilancia",
]
# To disable a feature group, set it to an empty list:
#dummy_cols2=[]
#dummy_cols3=[]

# Every column of the training frame whose name starts with 'dist'.
distance_cols = [col for col in df_train if col.startswith('dist')]
#distance_cols=[]

# Full feature list: all dummy groups, distances and three numeric columns.
cols = dummy_cols + dummy_cols2 + dummy_cols3 + distance_cols + [
    'surface_total_in_m2', 'expenses', 'rooms'
]

# NOTE(review): the target column is scaled together with the features, so
# it contributes to each row's norm — confirm this is intended.
scaler = Normalizer()
scalercols = cols + ["price_usd_per_m2"]
df_train[scalercols] = scaler.fit_transform(df_train[scalercols])
# Reuse the scaler fitted on the training frame instead of re-fitting it on
# the test frame, so both splits always get the same transformation.
df_test[scalercols] = scaler.transform(df_test[scalercols])

X_train = df_train[cols]
y_train = df_train["price_usd_per_m2"]
X_test = df_test[cols]
y_test = df_test["price_usd_per_m2"]
#print("X:",X)
#print("y:",y)

v = CarlosLib.PolyDictVectorizer(sparse=False)
#print(X_train.T.to_dict())
#hasher = CarlosLib.PolyFeatureHasher()
df = df.drop(['Surname'], axis='columns')

# There are no NULL or missing values here, so no imputation is done.

#%% Look for categorical values
from sklearn import preprocessing

# 1. INSTANTIATE
# Encode labels with values between 0 and n_classes-1.
le = preprocessing.LabelEncoder()

# 2/3. FIT AND TRANSFORM
# The same encoder object is re-fitted for each column, so afterwards it only
# remembers the classes of the last column ("Gender").
df["Geography"] = le.fit_transform(df["Geography"])
df["Gender"] = le.fit_transform(df["Gender"])
df.head()

#%% feature scaling
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
df1 = pd.DataFrame(scaler.fit_transform(df), columns=df.columns.values)

#%% Create correlation matrix
corr_matrix = df1.corr().abs()

# Select the upper triangle of the correlation matrix (k=1 excludes the
# diagonal). The builtin `bool` replaces the `np.bool` alias, which was
# removed from NumPy.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find the feature columns with an absolute correlation greater than 0.9.
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# %%
min_df=2, stop_words='english', use_idf=opts.use_idf) X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() if opts.n_components: print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(opts.n_components) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) X = lsa.fit_transform(X) print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() ############################################################################### # Do the actual clustering
# Use 5,000 samples for training and 10,000 for testing.
# To actually reproduce the results in the original Tensor Sketch paper,
# select 100,000 for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=5_000,
    test_size=10_000,
    random_state=42,
)

# %%
# Scale the features to the range [0, 1] to match the format of the dataset
# on the LIBSVM webpage, then normalize each sample to unit length as done in
# the original Tensor Sketch paper [1].

mm = make_pipeline(MinMaxScaler(), Normalizer())
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

# %%
# As a baseline, train a linear SVM on the original features and compute its
# accuracy. Accuracies and training times are measured and stored so they can
# be plotted later.

results = dict()

lsvm = LinearSVC()
start = time.time()
lsvm.fit(X_train, y_train)
lsvm_time = time.time() - start
lsvm_score = 100 * lsvm.score(X_test, y_test)
def _build_tpot_classifier(model_name, inputs):
    """Return the unfitted classifier pipeline selected by `model_name`.

    Imports are kept inside each branch so optional dependencies (tpot,
    xgboost) are only needed for the pipelines that use them.
    """
    from sklearn.pipeline import make_pipeline

    if model_name == "tpot_select":
        return tpot_classi(inputs)

    if model_name == "SVM":
        logging_info("Training model... %s", str(model_name))
        from sklearn.preprocessing import StandardScaler
        from sklearn.svm import SVC

        # LinearSVC does not have a predict_proba function, so a linear-kernel
        # SVC with probability estimates is used instead.
        return make_pipeline(
            StandardScaler(),
            SVC(kernel='linear', probability=True, random_state=0, tol=1e-5))

    if model_name == "estimator_SVM":
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.feature_selection import SelectFwe, f_classif
        from sklearn.linear_model import LogisticRegression
        # SVC must be imported here too: the original only imported it in the
        # "SVM" branch, which left the name unbound in this branch.
        from sklearn.svm import SVC
        from tpot.builtins import StackingEstimator
        from xgboost import XGBClassifier

        # Score on the training set was:0.968003998605
        return make_pipeline(
            StackingEstimator(
                estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                     max_depth=9,
                                                     max_features=0.05,
                                                     min_samples_leaf=2,
                                                     min_samples_split=17,
                                                     n_estimators=100,
                                                     subsample=1.0)),
            SelectFwe(score_func=f_classif, alpha=0.02),
            StackingEstimator(estimator=LogisticRegression(
                C=1.0, dual=True, penalty="l2")),
            StackingEstimator(estimator=XGBClassifier(learning_rate=0.001,
                                                      max_depth=7,
                                                      min_child_weight=16,
                                                      n_estimators=100,
                                                      nthread=1,
                                                      subsample=0.65)),
            SVC(kernel='linear', probability=True, C=1.0, tol=0.001))

    if model_name == "log_reg":
        logging_info("Training model... %s", str(model_name))
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.linear_model import LogisticRegression
        from tpot.builtins import StackingEstimator, ZeroCount

        # Score on humap was:0.986160063433
        return make_pipeline(
            ZeroCount(),
            StackingEstimator(
                estimator=ExtraTreesClassifier(bootstrap=False,
                                               criterion="entropy",
                                               max_features=0.6,
                                               min_samples_leaf=4,
                                               min_samples_split=6,
                                               n_estimators=100)),
            LogisticRegression(C=15.0, dual=False, penalty="l2"))

    if model_name == "extra_trees":
        from copy import copy
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.pipeline import make_union
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.preprocessing import Normalizer
        from tpot.builtins import StackingEstimator

        # Score on the training set was:0.948305771055
        return make_pipeline(
            make_union(
                FunctionTransformer(copy),
                make_pipeline(
                    StackingEstimator(estimator=ExtraTreesClassifier(
                        bootstrap=False,
                        criterion="gini",
                        max_features=0.25,
                        min_samples_leaf=8,
                        min_samples_split=11,
                        n_estimators=100)),
                    Normalizer(norm="l1"))),
            StackingEstimator(
                estimator=ExtraTreesClassifier(bootstrap=False,
                                               criterion="entropy",
                                               max_features=0.75,
                                               min_samples_leaf=15,
                                               min_samples_split=18,
                                               n_estimators=100)),
            ExtraTreesClassifier(bootstrap=True,
                                 criterion="entropy",
                                 max_features=0.85,
                                 min_samples_leaf=5,
                                 min_samples_split=4,
                                 n_estimators=100))

    # Any other model name: random forest.
    logging_info("Training model... %s", str(model_name))
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.preprocessing import PolynomialFeatures

    # Score on humap was:0.986160063433
    return make_pipeline(
        VarianceThreshold(threshold=0.05),
        PolynomialFeatures(degree=2,
                           include_bias=False,
                           interaction_only=False),
        RandomForestClassifier(bootstrap=False,
                               criterion="entropy",
                               max_features=0.35,
                               min_samples_leaf=1,
                               min_samples_split=11,
                               n_estimators=100))


def train_classi(model_name, inputs, X_pos, y_pos, X, y, X_neg, y_neg):
    """Train a classifier and append its training metrics to a file.

    Args:
        model_name: which pipeline to build when the model type is "tpot"
            ("tpot_select", "SVM", "estimator_SVM", "log_reg", "extra_trees";
            anything else selects the random forest).
        inputs: dict read for 'model_type', 'dir_nm' and 'out_comp_nm'; also
            passed on to `tpot_classi`.
        X_pos, y_pos: positive samples and labels, scored separately.
        X, y: full training features and labels.
        X_neg, y_neg: negative samples and labels, scored separately.

    Returns:
        (model, scaler): the trained model, and the fitted StandardScaler
        for the "NN" model type (None for "tpot").

    Raises:
        ValueError: if inputs['model_type'] is neither "tpot" nor "NN".
    """
    scaler = None
    model_type = inputs['model_type']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    metrics_path = out_comp_nm + '_metrics.out'

    # Only the "tpot" path computes precision/recall/F1; they stay None for
    # "NN" so the summary below can skip them instead of failing.
    Precision = Recall = F1_score = None

    if model_type == "tpot":
        logging_info("Training model... %s", str(model_type))
        clf = _build_tpot_classifier(model_name, inputs)
        clf.fit(X, y)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")

        # Training accuracy
        acc_overall_train = clf.score(X, y)
        acc_pos_train = clf.score(X_pos, y_pos)
        acc_neg_train = clf.score(X_neg, y_neg)

        res_pos = clf.predict(X_pos)
        res = clf.predict(X_neg)

        n_pos = len(X_pos)
        n_neg = len(X_neg)

        _acc, _acc_neg, Recall, Precision, F1_score = calc_metrics(
            res, res_pos, n_neg, n_pos)
        analyze_sizewise_accuracies(
            X_pos, res_pos, X_neg, res,
            out_comp_nm + '_size_wise_accuracies_train.png')

        train_fit_probs = clf.predict_proba(X)[:, 1]
        train_aps = sklearn_metrics_average_precision_score(y, train_fit_probs)
        with open(metrics_path, "a") as fid:
            print("Training set average precision score = %.3f" % train_aps,
                  file=fid)

        model = clf

        # Save raw decision scores when the model exposes them.
        if hasattr(model, 'decision_function'):
            score = model.decision_function(X_neg)
            np_savetxt(out_comp_nm + '_train_neg_score.out', score)
            score = model.decision_function(X_pos)
            np_savetxt(out_comp_nm + '_train_pos_score.out', score)

    elif model_type == "NN":
        # Standardizing the feature matrix
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Scaling X_pos and X_neg as well now for testing with them later
        X_pos = scaler.transform(X_pos)
        X_neg = scaler.transform(X_neg)

        import tensorflow as tf
        from tensorflow import keras

        logging_info("Training model... %s", str(model_type))

        # Multi-layer perceptron with one hidden layer whose width is the
        # mean of the input and output layer widths.
        print()
        n_feats = X.shape[1]
        n_classes = 2
        logging_info("No. of nodes in input layer = %s", str(n_feats))
        logging_info("No. of nodes in output layer (since softmax) = %s",
                     str(n_classes))
        hidden_nodes = int((n_feats + n_classes) / 2)
        logging_info("No. of nodes in the one hidden layer = %s",
                     str(hidden_nodes))

        model = keras.Sequential([
            keras.layers.Dense(n_feats, activation=tf.nn.relu),
            keras.layers.Dense(hidden_nodes, activation=tf.nn.relu),
            keras.layers.Dense(n_classes, activation=tf.nn.softmax)
        ])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        N_epochs = 1000
        model.fit(X, y, epochs=N_epochs, verbose=0)
        with open(metrics_path, "a") as fid:
            print("No. of epochs = ", N_epochs, file=fid)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        _loss_overall, acc_overall_train = model.evaluate(X, y, verbose=0)
        _loss_pos, acc_pos_train = model.evaluate(X_pos, y_pos, verbose=0)
        _loss_neg, acc_neg_train = model.evaluate(X_neg, y_neg, verbose=0)

    else:
        # There is nothing to train or report for an unknown type. Fail with
        # a clear error instead of a NameError on the undefined metrics below.
        print("Model type not found")
        raise ValueError("Model type not found: %r" % (model_type,))

    logging_info("Finished Evaluating training accuracy.")
    with open(metrics_path, "a") as fid:
        print("Accuracy overall train = %.3f" % acc_overall_train, file=fid)
        print("Accuracy positive train = %.3f" % acc_pos_train, file=fid)
        print("Accuracy negative train = %.3f" % acc_neg_train, file=fid)
        if Precision is not None:
            print("Train Precision = %.3f" % Precision, file=fid)
            print("Train Recall = %.3f" % Recall, file=fid)
            print("Train F1 score = %.3f" % F1_score, file=fid)

    return model, scaler
def LSA():
    """Run LSA (TF-IDF + truncated SVD) and K-Means on stored tweets.

    Loads the tokenized tweets saved for the query 'Israel' from the
    'myproject' MongoDB database, vectorizes them, reduces them with
    TruncatedSVD, normalizes the result and clusters it with K-Means.
    Prints the silhouette coefficient and the top terms per topic, then
    plots the clusters.
    """

    def connect():
        """Return a handle to the 'myproject' MongoDB database."""
        client = MongoClient()
        return client['myproject']

    def get_documents(query):
        '''
        Retrieves the tokenized tweets stored for `query`.

        Returns a list of dicts with the keys 'user_id' and 'tokens', one
        per tweet in the 'tweet_data' field of the matching record.
        '''
        condition = {'query': query}
        tweets = db.tweets.find_one(condition)['tweet_data']
        return [{
            'user_id': tw['user']['id'],
            'tokens': tw['tokens']
        } for tw in tweets]

    def display_topics(svd,
                       terms,
                       n_components,
                       n_out=7,
                       n_weight=5,
                       topic=None):
        '''
        Prints the `n_out` highest-weighted words of each topic (dimension).

        Input:
            svd: the TruncatedSVD model that has been fitted
            terms: the list of words
            n_components: The reduced dimension
            topic: by default prints all topics in the SVD; if topic (int)
                is given, prints only that topic together with its weight
            n_out: Number of words per topic to display
            n_weight: Number of words to average on to calculate the weight
                of the topic. The smaller, the more spread between the
                topic relative weights
        '''

        def ranked_terms(k):
            # (term index, |loading|) pairs for topic k, largest first.
            magnitudes = {i: abs(w) for i, w in enumerate(svd.components_[k])}
            return sorted(magnitudes.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

        def print_terms(ranked):
            # Slice to n_out: the original sliced to n_out - 1 and so printed
            # one word fewer than documented.
            for idx, magnitude in ranked[:n_out]:
                print(" %0.3f*%s" % (magnitude, terms[idx]))
            print()

        if topic is None:
            for k in range(n_components):
                print("T%s)" % k)
                print_terms(ranked_terms(k))
        else:
            ranked = ranked_terms(topic)
            weight = np.mean([magnitude for _, magnitude in ranked[:n_weight]])
            print("* T %s) weight: %0.2f" % (topic, weight))
            print_terms(ranked)

    def plot_clusters(svdX, y_pred, centers):
        """Scatter-plot the documents and the cluster centers."""
        plt.style.use('fivethirtyeight')
        f, ax1 = plt.subplots(1, 1, figsize=(16, 8), facecolor='white')
        ax1.set_xlabel("")
        ax1.set_ylabel("")
        ax1.set_title("K-Means")
        # Only plots the first 2 dimensions of the svdX matrix
        ax1.scatter(svdX[:, 0], svdX[:, 1], c=y_pred, cmap=plt.cm.Paired, s=45)
        ax1.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="black",
                    alpha=1,
                    s=150)
        ax1.axis('off')
        plt.show()

    # -------------------------------------
    #  Params
    # -------------------------------------
    n_components = 3  # Number of dimensions for TruncatedSVD
    n_clusters = 3

    db = connect()

    # Get the already tokenized version of the timelines
    documents = get_documents('Israel')

    # This is hacky and due to the fact that we re-use previously tokenized
    # documents: the tokens are re-assembled before being tokenized again.
    tokenized = [' '.join(doc['tokens']) for doc in documents]

    vectorizer = TfidfVectorizer(max_df=0.9,
                                 min_df=6,
                                 max_features=500,
                                 use_idf=True,
                                 strip_accents='ascii')

    # X contains the TF-IDF weight of each token in each document
    X = vectorizer.fit_transform(tokenized)

    # SVD decomposition
    svd = TruncatedSVD(n_components, random_state=10)
    svdX = svd.fit_transform(X)

    # Normalization.
    # Note: each row is rescaled to unit length, so with 2 dimensions all
    # points end up on a circle. Comment the 2 lines below to produce more
    # meaningful plots.
    nlzr = Normalizer(copy=False)
    svdX = nlzr.fit_transform(svdX)

    # Clustering
    km = KMeans(n_clusters=n_clusters,
                init='k-means++',
                max_iter=100,
                n_init=4,
                verbose=False,
                random_state=10)
    km.fit(svdX)

    print(" --------------------- ")
    print(" Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(svdX, km.labels_, sample_size=1000))
    print(" --------------------- ")

    # Array mapping from word integer indices to actual words.
    # `get_feature_names` no longer exists in recent scikit-learn releases;
    # prefer `get_feature_names_out` and fall back for older ones.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    display_topics(svd, terms, n_components)

    # Plot the documents and cluster centers.
    # Only relevant for K = 2
    y_pred = km.predict(svdX)
    centers = km.cluster_centers_
    plot_clusters(svdX, y_pred, centers)
else: os.mkdir(args.d2v_dir + pathname) with timed('Running Doc2Vec'): model = Doc2Vec(documents, dm=1, sample=args.sample, size=args.size, window=args.window, min_count=args.min_count, workers=args.workers) if args.norm: with timed('Norming vectors'): from sklearn.preprocessing import Normalizer nrm = Normalizer('l2') normed = nrm.fit_transform(model.docvecs.doctag_syn0) words_normed = nrm.fit_transform(model.wv.syn0) with timed('Saving data'): if args.norm: np.save( '{0}{1}/user_features_normed_{1}.npy'.format( args.d2v_dir, pathname), normed) np.save( '{0}{1}/song_features_normed_{1}.npy'.format( args.d2v_dir, pathname), words_normed) model.save('{0}{1}/model_{1}'.format(args.d2v_dir, pathname)) with open('{0}{1}/song_indices_{1}'.format(args.d2v_dir, pathname), 'w') as out: for song in model.wv.index2word: