def __init__(self, YTrain_file, XTrain_file, XTest_file, output_path,
             normalise, C, class_weight):
    """
    Arguments:
        YTrain_file: path to a joblib-pickled array of training labels.
        XTrain_file: path to a joblib-pickled array of training features.
        XTest_file: path to a joblib-pickled array of test features.
        output_path: directory where results are written.
        normalise: if True, scale each sample to unit norm in place.
        C: regularisation strength passed to the downstream classifier.
        class_weight: class-weight setting; the string 'none' maps to None.
    """
    self.YTrain = joblib.load(YTrain_file)
    XTrain = joblib.load(XTrain_file)
    # Flatten all trailing dimensions so each sample is a single row.
    self.XTrain = XTrain.reshape(np.size(XTrain, axis=0), -1)
    XTest = joblib.load(XTest_file)
    self.XTest = XTest.reshape(np.size(XTest, axis=0), -1)
    self.output_path = output_path

    if normalise:
        # copy=False normalises the arrays in place.
        normalizer = Normalizer(copy=False)
        normalizer.transform(self.XTrain)
        normalizer.transform(self.XTest)

    self.C = C
    if class_weight == 'none':
        class_weight = None
    self.class_weight = class_weight
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                    stop_words='english')
        reviews_text = [' '.join(list(chain.from_iterable(review)))
                        for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)
        trunc = self.truncated.transform(tfidf_matrix)

        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()
        trunc = self.truncated.transform(tfidf_matrix)
        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
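# A minimal usage sketch for the TfIdf feature above, assuming the Feature
# base class and the class's imports (TfidfVectorizer, TruncatedSVD,
# Normalizer, SelectKBest, f_classif, chain) are in scope. The synthetic
# corpus below is purely illustrative: each review is a list of sentences,
# each sentence a list of tokens, and the vocabulary is made large enough
# for the 50-component SVD to be valid.
import random
random.seed(0)
vocab = ['word%d' % i for i in range(80)]
reviews = [[[random.choice(vocab) for _ in range(8)] for _ in range(3)]
           for _ in range(40)]
labels = [i % 2 for i in range(40)]

feat = TfIdf()
feat.train(reviews, labels)
print(feat.score(reviews[0]))  # a 5-dimensional feature tuple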
class KNN(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10,
                                                 weights='distance', p=1)
        # Train on log-transformed targets so the regressor works on a more
        # symmetric scale; guess() undoes this with exp(). This requires
        # strictly positive targets.
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ",
              self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
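# A self-contained sketch of the same normalize-then-log-target KNN pattern
# on synthetic data; the data, shapes, and hyperparameters here are
# illustrative stand-ins, not from the original project.
import numpy as np
from sklearn import neighbors
from sklearn.preprocessing import Normalizer

rng = np.random.RandomState(0)
X_train = rng.rand(200, 5)
y_train = np.exp(X_train.sum(axis=1)) + rng.rand(200)  # positive targets
X_new = rng.rand(3, 5)

normalizer = Normalizer().fit(X_train)
knn = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
knn.fit(normalizer.transform(X_train), np.log(y_train))

# Predictions come back on the log scale and must be exponentiated.
print(np.exp(knn.predict(normalizer.transform(X_new))))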
def load_data(self):
    if not os.path.exists('features_train.txt'):
        self.feature_extraction('train.txt', 'features_train.txt')
    data_train, target_train = load_svmlight_file('features_train.txt')

    if not os.path.exists('features_test.txt'):
        self.feature_extraction('test.txt', 'features_test.txt')
    data_test, target_test = load_svmlight_file('features_test.txt')

    normalizer = Normalizer().fit(data_train)
    data_train = normalizer.transform(data_train)
    data_test = normalizer.transform(data_test)

    return data_train.toarray(), target_train, data_test.toarray(), target_test
def kfold(agetext, k, model, nfeatures, check=False, k2=None, max_df=0.9, min_df=3):
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        X = agetext["text"].tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer=tokenize,
                              token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',
                              lowercase=False, max_features=nfeatures,
                              max_df=max_df, min_df=min_df,
                              use_idf=True, ngram_range=(1, 2))
        docs = [" ".join(doc) for doc in X]
        docs2 = [doc.replace("\t", "").replace("\n", "") for doc in docs]
        # Train on the first 8000 documents, test on the next 1500
        # (the original sliced [:7999], silently dropping document 7999).
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]
        if check:
            lsa = TruncatedSVD(k2, algorithm='arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X, tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred), 2))
    print(str(out))
    print(np.mean(out))
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        # sklearn.cross_validation is the legacy name of sklearn.model_selection
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        print(collections.Counter(y_train))
        testdata = X_test.values

        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)

        model.fit(X, y_train)
        pred = model.predict(X_test)
        print(collections.Counter(y_test))
        print(collections.Counter(pred))
        out.append(round(accuracy_score(y_test, pred), 5))
    print(str(out))
    print(np.mean(out))
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sp.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sp.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert X_norm is not X
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert X_norm is X
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix):
        X = init(X_dense)
        # these formats are converted to CSR, so a copy happens anyway
        X_norm = Normalizer(norm='l1', copy=False).transform(X)
        assert X_norm is not X
        assert isinstance(X_norm, sp.csr_matrix)

        X_norm = toarray(X_norm)
        # recompute the row sums for the freshly normalized matrix
        # (the original reused stale values from the previous loop,
        # and mixed in an l2 normalizer in this l1 test)
        row_sums = np.abs(X_norm).sum(axis=1)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(row_sums[3], 0.0)
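# The copy/no-copy behaviour the test above exercises can be seen directly;
# a minimal standalone sketch:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import Normalizer

X = np.abs(np.random.RandomState(0).randn(3, 4))

# copy=False normalizes a float CSR matrix in place and returns it...
X_csr = sp.csr_matrix(X)
out = Normalizer(norm='l1', copy=False).transform(X_csr)
print(out is X_csr)            # True

# ...but COO input must be converted to CSR first, which forces a copy.
X_coo = sp.coo_matrix(X)
out = Normalizer(norm='l1', copy=False).transform(X_coo)
print(out is X_coo)            # False
print(sp.isspmatrix_csr(out))  # True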
def _normalize(self, X, y, X_t):
    from sklearn.preprocessing import Normalizer
    norm = Normalizer()
    # Normalizer is stateless; y is accepted by fit_transform but ignored.
    X = norm.fit_transform(X, y)
    X_t = norm.transform(X_t)
    return X, X_t
def readAndPreProcess():
    print("\n\n********** CS-412 HW5 Mini Project **********")
    print("************ Submitted by Sankul ************\n\n")
    print("Reading data, please ensure that the dataset is in same folder.")
    resp = pd.read_csv('responses.csv')
    print("Data reading complete!")
    print("Some stats regarding data:")
    print(resp.describe())

    print("\nStarting pre-processing.....")
    print("\nCounting missing values per column:")
    emptyVals = resp.isnull().sum().sort_values(ascending=False)
    emptyPlot = emptyVals.plot(kind='barh', figsize=(20, 35))
    plt.show()

    print("\nChecking for NaN and infinite values in target column (Empathy):")
    if len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]):
        print("Number of infinite or NaN values in Empathy column: ",
              len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]))
        print("Removing them")
        resp = resp[np.isfinite(resp['Empathy'])]
        print("Infinite and NaN values removed")

    print("\nChecking for categorical features:")
    if len(resp.select_dtypes(include=[object]).columns):
        print("Categorical features found. Removing them...")
        resp = resp.select_dtypes(exclude=[object])
        print("Categorical features removed")

    print("\nReplacing NaN values with the mean value:")
    resp = resp.fillna(resp.mean())
    print("Values replaced")

    print("\nSeparating labels from data:")
    Y = resp['Empathy'].values
    X = resp.drop('Empathy', axis=1)
    print("Labels separated")

    print("\nScaling, standardizing and normalizing the data:")
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(X)
    scaler = StandardScaler().fit(rescaledX)
    standardizedX = scaler.transform(rescaledX)
    normalizer = Normalizer().fit(standardizedX)
    normalizedX = normalizer.transform(standardizedX)
    print("Scaling, standardizing and normalizing completed")

    print("\nFinal data looks like:")
    print(normalizedX.shape)
    print("Values inside look like:")
    print(normalizedX[0])
    return normalizedX, Y
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)
        return self

    def transform(self, data):
        # Shift the unit-norm output from [-1, 1] into [0, 1].
        return (self.data_normalizer.transform(data) + 1) / 2
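# The wrapper above first scales each row to unit L2 norm (so every entry
# lies in [-1, 1]) and then maps it affinely into [0, 1]; a quick check:
import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, -4.0], [0.5, 0.5]])
wrapped = (Normalizer().transform(X) + 1) / 2
print(wrapped)                                     # e.g. [3, -4] -> [0.8, 0.1]
print(wrapped.min() >= 0.0, wrapped.max() <= 1.0)  # True True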
def test_ver2_syntetic_dataset(self):
    self.ex = experiment.Experiment()
    self.ex.cf_matrix = load_sparse_data('syntetic_cf.dat')
    n = Normalizer(norm='l2', copy=True)
    self.ex.cf_matrix = n.transform(self.ex.cf_matrix)  # normalized
    self.ex.cb_prox = experiment.Experiment.load_data(PKL + 'cb_prox.pkl')
    self.ex.cf_prox = self.ex.cf_matrix * self.ex.cf_matrix.T
    self.ex.test_corr_sparsity(draw=True, interval=100)
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                          test_size=test_size, train_size=1 - test_size)
        train, test = list(cv)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
def normalize(self, msi, norm="l1"): original_shape = msi.get_image().shape collapsed_image = collapse_image(msi.get_image()) # temporarily save mask, since scipy normalizer removes mask is_masked_array = isinstance(msi.get_image(), np.ma.MaskedArray) if is_masked_array: mask = msi.get_image().mask normalizer = Normalizer(norm=norm) normalized_image = normalizer.transform(collapsed_image) if is_masked_array: normalized_image = np.ma.MaskedArray(normalized_image, mask=mask) msi.set_image(np.reshape(normalized_image, original_shape))
def test_normalizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Normalizer
    # with sklearn.preprocessing.Normalizer
    normalizerr = NormalizerR()
    normalizerr.fit(np.concatenate(trajs))

    normalizer = Normalizer()
    normalizer.fit(trajs)

    y_ref1 = normalizerr.transform(trajs[0])
    y1 = normalizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
class SiftBOW(object):
    def __init__(self, dataset, n_words=300, add_global_desc=True,
                 color_sift=False):
        self.dataset = dataset
        self.n_words = n_words
        self.add_global_desc = add_global_desc
        self.normalizer = Normalizer(norm='l1')
        self.color_sift = color_sift
        if self.color_sift:
            self.feature_extractor = color_sift_descriptors
        else:
            self.feature_extractor = sift_descriptors

    def fit_transform(self, image_names, superpixels):
        descriptors, coordinates = self.feature_extractor(image_names,
                                                          self.dataset)
        print("end sift descriptors")
        vq, X = bag_of_words(descriptors, superpixels, coordinates)
        X = [self.normalizer.transform(x) for x in X]
        self.vq_ = vq
        Y = [gt_in_sp(self.dataset, f, sp)
             for f, sp in zip(image_names, superpixels)]
        return DataBunch(X, Y, image_names, superpixels)

    def fit(self, image_names, spixel):
        # the original called self.fit_predict, which does not exist
        self.fit_transform(image_names, spixel)
        return self

    def transform(self, image_names, superpixels):
        descriptors, coordinates = self.feature_extractor(image_names,
                                                          self.dataset)
        _, X = bag_of_words(descriptors, superpixels, coordinates, vq=self.vq_)
        Y = [gt_in_sp(self.dataset, f, sp)
             for f, sp in zip(image_names, superpixels)]
        X = [self.normalizer.transform(x) for x in X]
        return DataBunch(X, Y, image_names, superpixels)
def __init__(self, nor='nor', fold=2):
    self.fold = fold
    dataframe = pandas.read_csv(open('wine.data'))
    array = dataframe.values
    # separate array into input and output components
    self.X = array[:, 1:]
    self.Y = array[:, 0]
    self.nor = nor
    # Normalizer rescales each sample (row) to unit length;
    # MinMaxScaler rescales each feature (column) to [0, 1].
    if self.nor == 'nor':
        scaler = Normalizer().fit(self.X)
    else:
        scaler = MinMaxScaler().fit(self.X)
    self.X = scaler.transform(self.X)
    numpy.set_printoptions(precision=3)
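# The difference between the two choices above is worth seeing on a toy
# matrix: Normalizer works row-wise (per sample), MinMaxScaler column-wise
# (per feature). A minimal sketch:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer

X = np.array([[1.0, 10.0],
              [3.0, 40.0]])

print(Normalizer().fit_transform(X))
# each row is divided by its own L2 length, so rows have unit norm

print(MinMaxScaler().fit_transform(X))
# each column is rescaled independently to [0, 1]:
# [[0. 0.]
#  [1. 1.]]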
class LineTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, *args):
        self.args = args
        self.normalize = Normalizer()

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        X_test = [numpy.array([numpy.array([1 if item else 0], dtype='float64')
                               for item in X[arg]])
                  for arg in self.args]
        X_test = numpy.concatenate(X_test, axis=1)
        X_test = self.normalize.transform(X_test)
        return X_test

    def transform(self, X, y=None):
        return self.predict(X)
def test_sklearn_transform():
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer,
                                   istreams=[], ostream="out")
    context = ComputationContext(computation)
    data = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", data, None))

    assert len(context.records) == 1
    assert len(context.records["out"]) == 1

    record = context.records["out"][0]
    assert record.key == "transform"
    assert np.allclose(transformer.transform(X_test), json.loads(record.data))
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode',
                          ngram_range=(1, 2), stop_words='english',
                          sublinear_tf=True, use_idf=True, smooth_idf=True)

    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)

    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)

    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
)
tfv.fit(text)
Q = tfv.transform(list(train["q"]))
R = tfv.transform(list(train["q_ex"]))
X = tfv.transform(list(train["t"]))
Qt = tfv.transform(list(test["q"]))
Rt = tfv.transform(list(test["q_ex"]))
Xt = tfv.transform(list(test["t"]))

svd = TruncatedSVD(n_components=svd_components)
svd.fit(vstack([Q, X, Qt, Xt]))
normalizer = Normalizer()

# project every matrix into the LSA space and L2-normalize it
Q, R, X, Qt, Rt, Xt = [normalizer.transform(svd.transform(M))
                       for M in (Q, R, X, Qt, Rt, Xt)]

colname = "cos_dist_%d_%d" % (ngram, svd_components)
# with unit-norm rows, the dot product is the cosine similarity
D = [np.dot(a, b) for (a, b) in zip(Q, X)]
Dt = [np.dot(a, b) for (a, b) in zip(Qt, Xt)]
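# Because every row is L2-normalized above, the per-pair dot products are
# exactly cosine similarities. A small self-contained check of that identity
# (the random arrays here are stand-ins for the SVD outputs):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

rng = np.random.RandomState(0)
A = rng.rand(4, 8)
B = rng.rand(4, 8)

An = Normalizer().transform(A)
Bn = Normalizer().transform(B)

dots = [np.dot(a, b) for a, b in zip(An, Bn)]
cosines = [cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0, 0]
           for a, b in zip(A, B)]
print(np.allclose(dots, cosines))  # True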
class IWTBA():
    def __init__(self, svd=True, categorizer=True):
        # Triggers for building models
        self.svd = svd
        self.categorizer = categorizer

        # Initialize Tokenizer Parts
        # Create regex pattern
        self.re_pattern = re.compile('[^a-zA-Z]')
        # Create stemmer
        self.stemmer = SnowballStemmer('english')
        # Create stopwords
        self.eng_stop = set(stopwords.words('english'))

        # Placeholders
        self.feat_mat = None
        self.feat_labels = None
        self.course_list = None
        self.course_id_to_index = None
        self.jobs_titles = None
        self.cat_id_to_name = None
        self.course_cats_binarized = None
        self.label_arr_to_cat_id = None

    # ----------------
    # Read in Corpus
    # ----------------
    def concatenate_coursera_text_data(self, course_dict):
        """helper function for parsing coursera courses"""
        name = course_dict['name']
        syllabus = BeautifulSoup(course_dict['courseSyllabus']).text
        short_desc = course_dict['shortDescription']
        about = BeautifulSoup(course_dict['aboutTheCourse']).text
        return " ".join([name, syllabus, short_desc, about])

    def _get_coursera_corpus(self):
        """collect coursera course text and metadata"""
        with open('./data/coursera/coursera_courses.json') as c_file:
            coursera_courses = json.load(c_file)

        course_id_to_index = {}  # dict to allow reverse searching from id
        course_text_list = []
        course_list = []
        course_categories = []
        i = 0
        for course in coursera_courses['elements']:
            if course['language'] == 'en':
                course_id_to_index[course['id']] = i
                course_text_list.append(self.concatenate_coursera_text_data(course))
                course_list.append(course)
                if self.categorizer:
                    course_categories.append(course['links'].get('categories', [-1]))
                i += 1

        if self.categorizer:
            # get category list
            cat_info_list = coursera_courses['linked']['categories']
            self.cat_id_to_name = {cat['id']: {'name': cat['name'],
                                               'shortName': cat['shortName']}
                                   for cat in cat_info_list}

            # binarize labels and discard low-count categories
            mlb = MultiLabelBinarizer()
            course_cats_binarized = mlb.fit_transform(course_categories)

            # filter to only tags with > 40 courses
            mask = course_cats_binarized.sum(axis=0) > 40
            course_cats_binarized = course_cats_binarized[:, mask]
            self.course_cats_binarized = course_cats_binarized

            # create dict to get back from masked index, to index, to id
            label_arr_to_cat_id = {}
            for i, k in enumerate(mask.nonzero()[0].tolist()):
                label_arr_to_cat_id[i] = mlb.classes_[k]
            self.label_arr_to_cat_id = label_arr_to_cat_id

        return course_list, course_text_list, course_id_to_index

    def _get_nyc_corpus(self):
        """collect nyc gov't job descriptions and titles"""
        ny_jobs_raw = open('./data/nyc/ny_jobs_data.json').read()
        ny_jobs_data = json.loads(ny_jobs_raw)

        ny_jobs_columns = [col['fieldName']
                           for col in ny_jobs_data['meta']['view']['columns']]
        ny_jobs_desc_index = ny_jobs_columns.index('job_description')
        ny_jobs_title_index = ny_jobs_columns.index('business_title')  # there's also civic title

        ny_jobs_descriptions = []
        ny_jobs_titles = []
        for job in ny_jobs_data['data']:
            ny_jobs_descriptions.append(job[ny_jobs_desc_index])
            ny_jobs_titles.append(job[ny_jobs_title_index])

        return ny_jobs_titles, ny_jobs_descriptions

    def _get_github_corpus(self):
        """collect github job descriptions and titles"""
        git_data = json.load(open('./data/github/github_postings'))

        git_jobs_titles = []
        git_jobs_descriptions = []
        for job in git_data:
            git_jobs_titles.append(job['title'])
            git_jobs_descriptions.append(BeautifulSoup(job['description']).text)

        return git_jobs_titles, git_jobs_descriptions

    def get_corpus(self, coursera=True, nyc=True, github=True):
        """collect data sets, return combined corpus and store metadata"""
        combined_text = []
        job_titles = []
        if coursera:
            course_list, course_text_list, course_id_to_index = self._get_coursera_corpus()
            combined_text += course_text_list
            self.course_list = course_list
            self.course_id_to_index = course_id_to_index
        if nyc:
            ny_jobs_titles, ny_jobs_descriptions = self._get_nyc_corpus()
            combined_text += ny_jobs_descriptions
            job_titles.extend(ny_jobs_titles)
        if github:
            git_jobs_titles, git_jobs_descriptions = self._get_github_corpus()
            combined_text += git_jobs_descriptions
            job_titles.extend(git_jobs_titles)
        if job_titles:
            self.job_titles = job_titles
        return combined_text

    # --------------------
    # Model Building and Processing
    # --------------------
    def tokenize_text(self, text):
        """clean and tokenize a job description"""
        # should modify this to get rid of single letter words or ' caused junk
        clean_text = self.re_pattern.sub(" ", text).lower()
        tokenized_desc = [self.stemmer.stem(word) for word in clean_text.split()
                          if word not in self.eng_stop]
        return tokenized_desc

    def _fit_svd(self, feat_mat, svd_comps):
        self.svd = TruncatedSVD(n_components=svd_comps)
        feat_mat = self.svd.fit_transform(feat_mat)
        self.normalizer = Normalizer(copy=False)
        self.normalizer.transform(feat_mat)  # copy=False: normalizes in place
        return feat_mat

    def _fit_categorizer(self):
        classifier = SVC(kernel='linear', probability=True, class_weight='auto')
        cat_clf = OneVsRestClassifier(classifier)
        cat_clf.fit(self.feat_mat[:len(self.course_list), :],
                    self.course_cats_binarized)
        self.categorizer = cat_clf

    def fit(self, svd_comps=1000):
        """fit the tfidf vectorizer (and svd) and store it and the
        resulting feature matrix"""
        vectorizer = TfidfVectorizer(tokenizer=self.tokenize_text)
        feat_mat = vectorizer.fit_transform(self.get_corpus())
        self.vectorizer = vectorizer

        if self.svd:
            feat_mat = self._fit_svd(feat_mat, svd_comps)

        self.feat_mat = feat_mat
        if self.categorizer:
            self._fit_categorizer()
        self.feat_labels = vectorizer.get_feature_names()

    def vectorize(self, input_text):
        vector = self.vectorizer.transform([input_text])
        if self.svd:
            vector = self.svd.transform(vector)
            self.normalizer.transform(vector)
        return vector

    # --------------------
    # Result Functions
    # --------------------
    def _get_course_sims(self, input_text):
        """get course similarities."""
        input_vect = self.vectorize(input_text)
        c_feat_mat = self.feat_mat[:len(self.course_list), :]
        cos_sims = np.dot(c_feat_mat, input_vect.T)  # nx1 shape
        if type(cos_sims) != np.ndarray:  # tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        return cos_sims

    def get_n_most_similar_course_indices(self, input_text, n=5, threshold=.3):
        """get n most similar indices, sorted, from a sparse matrix"""
        input_vect = self.vectorize(input_text)
        c_feat_mat = self.feat_mat[:len(self.course_list), :]
        cos_sims = np.dot(c_feat_mat, input_vect.T)
        if type(cos_sims) != np.ndarray:  # tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        n = min(n, np.sum(cos_sims > threshold))  # return only good courses
        n = max(n, 1)  # return at least 1 course
        top_n_indices = np.argsort(cos_sims, axis=0)[-1:-(n + 1):-1, 0]
        return top_n_indices.ravel().tolist()

    def build_recommend_table(self, input_text, n=5):
        """
        Collect meta data from recommended courses, and then
        return a table for displaying recommendations.
        """
        indices = self.get_n_most_similar_course_indices(input_text, n=n)
        header = ['Course Name', 'Course Description']
        table = [header]
        for i in indices:
            course = self.course_list[i]
            name = course['name']
            short_desc = course['shortDescription']
            url = 'https://www.coursera.org/course/' + course['shortName']
            table.append([name, short_desc, url])
        return table

    def _get_job_category_scores(self, input_text):
        """
        Get decision function results for categorizer.
        """
        vect = self.vectorize(input_text)
        cat_scores = self.categorizer.decision_function(vect)
        return cat_scores

    def get_job_categories(self, input_text, threshold=.034):
        """
        Classify posting and return categories.
        Threshold of 0.034 corresponds to a .05 false positive rate.
        """
        cat_scores = self._get_job_category_scores(input_text)
        cat_predictions = cat_scores > threshold
        cat_names = []
        for i in cat_predictions.nonzero()[1].tolist():
            cat_id = self.label_arr_to_cat_id[i]
            cat_name = self.cat_id_to_name[cat_id]['name']
            cat_names.append(cat_name)
        return cat_names

    def get_n_most_similar_job_indices(self, input_text, n=3, threshold=.3):
        """get n most similar job indices, sorted"""
        input_vect = self.vectorize(input_text)
        j_feat_mat = self.feat_mat[len(self.course_list):, :]
        cos_sims = np.dot(j_feat_mat, input_vect.T)
        if type(cos_sims) != np.ndarray:  # tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        n = min(n, np.sum(cos_sims > threshold))  # return only good jobs
        n = max(n, 1)  # return at least 1 job
        top_n_indices = np.argsort(cos_sims, axis=0)[-1:-(n + 1):-1, 0]
        return top_n_indices.ravel().tolist()

    def get_job_titles_from_indices(self, top_n_indices):
        """return titles from indices."""
        titles = []
        for i in top_n_indices:
            job_title = self.job_titles[i]
            if job_title not in titles:
                titles.append(job_title)
        return titles

    def build_course_row(self, course_id):
        """collect course metadata and build row for recommendation page.

        Input: Course ID
        Output: List of strings:
            Course IMG as url
            Course Title
            Course URL
            Course Description
            Course Categories
        """
        course = self.course_list[course_id]
        c_name = course['name']
        c_img = course['smallIcon']  # url works for coursera only
        c_url = 'https://www.coursera.org/course/' + course['shortName']
        c_desc = course['shortDescription']
        c_cats = [self.cat_id_to_name[cat_id]['name']
                  for cat_id in course['links']['categories']]
        return c_name, c_img, c_url, c_desc, c_cats

    def build_recommend_page(self, input_text, thresh=.3):
        have_recommendations = False

        # get n job titles > threshold
        job_titles = []
        job_indices = self.get_n_most_similar_job_indices(input_text, n=3,
                                                          threshold=thresh)
        job_titles = self.get_job_titles_from_indices(job_indices)

        # get and sort course similarities
        course_sims = self._get_course_sims(input_text)  # nx1 shape
        sorted_sim_indices = course_sims.argsort(axis=0)[::-1, :]  # sorted descending

        # get category scores for job posting
        job_cat_scores = self._get_job_category_scores(input_text)

        # get category scores for each recommended course:
        # 1 * job cat score if cat has tag, 0 otherwise
        course_cat_scores = job_cat_scores * self.course_cats_binarized

        # for each course, the largest value becomes its parent category
        course_parent_cat = np.argmax(course_cat_scores, axis=1)

        # best recommendations thresholds:
        # get courses with cos sim > high thresh
        max_best = 3
        high_thresh = .5
        best_course_ids = []
        for course_id in sorted_sim_indices:
            if course_sims[course_id] > high_thresh:
                best_course_ids.append(course_id)
        if best_course_ids:
            best_course_ids = best_course_ids[:max_best]
            have_recommendations = True

        # for each category in job_cat_scores > thresh,
        # create a list of courses with sim > thresh
        # to get course recommendations by category
        cat_list = []
        thresh_mask = (course_sims > thresh).ravel()
        for i in np.argsort(job_cat_scores[0]):
            if job_cat_scores[0][i] > .034:
                cat_mask = course_parent_cat == i
                cat_and_thresh_mask = np.logical_and(cat_mask, thresh_mask)
                valid_courses = cat_and_thresh_mask.nonzero()[0].tolist()
                if valid_courses:
                    course_order = np.argsort(course_sims[cat_and_thresh_mask],
                                              axis=0)[::-1, :]
                    cat_id = self.label_arr_to_cat_id[i]
                    cat_name = self.cat_id_to_name[cat_id]['name']
                    cat_list.append([cat_name,
                                     [valid_courses[i] for i in course_order]])
        if cat_list:
            have_recommendations = True

        # if have_recommendations is False and job_indices:
        #     job_titles, best_course_ids, cat_list = self._rec_from_jobs_only(input_text)

        return job_titles, best_course_ids, cat_list, have_recommendations
# .. your code here ..
from sklearn.preprocessing import Normalizer
normalizer = Normalizer(copy=True)
normalizer.fit(X_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
# .. your code here ..
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
# .. your code here ..
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
class ContentRecommend(object):
    create_date = datetime.utcnow()
    days = 15
    training_end = datetime.utcnow()
    db = None
    n_components = 20  # Number of dimensions for TruncatedSVD
    account = ''
    svd = None
    normalizer = None
    svdX = None
    vectorizer = None  # (was misspelled 'vectorizor'; setup_training assigns self.vectorizer)
    training_docs = None
    threshold = 0.25
    k_means = None
    sil_score = -1.0
    cluster_count = 0
    range_n_clusters = [3, 4, 5, 6, 7, 8]
    missionId = ''

    def __init__(self, mission_id, db_name='plover_development', db_port=27017,
                 db_host='localhost'):
        self.missionId = mission_id
        config.LOGGER.info('Instantiating recommender')
        self.connect(db_name, self.missionId, db_port=db_port, db_host=db_host)
        config.LOGGER.debug("Loading NLTK stopword list for English")

    def connect(self, db_name="plover_development", mission_id="",
                db_port=27017, db_host='localhost'):
        config.LOGGER.info('Instantiating recommender object for mission %s',
                           mission_id)
        config.LOGGER.debug('Using database %s, host %s and port %s',
                            db_name, db_host, db_port)
        try:
            client = MongoClient(db_host, db_port)
            self.db = client[db_name]
            profile = self.db.socialProfile.find_one(
                {'mission': ObjectId(self.missionId)})
            self.account = self.db.linkedAccount.find_one(
                {'_id': profile['account']})
            if self.account is None:
                config.LOGGER.debug('No such account id')
            self.setup_training(days=30)
        except Exception as ex:
            config.LOGGER.error("Error %s opening mission _id=%s",
                                str(ex), self.missionId)

    def get_updates(self, maximum=100, conditions={}):
        documents = []
        config.LOGGER.info('Getting timeline updates for mission %s',
                           self.missionId)
        config.LOGGER.debug('  query condition: %s',
                            json.dumps(conditions, default=json_util.default))
        try:
            if self.account is None:
                config.LOGGER.debug('No account id')
            else:
                projection = {'keywords': 1, 'text': 1, 'externalID': 1,
                              'postTime': 1, 'sender': 1, 'quotedStatus': 1}
                updates = self.db.statusUpdate.find(conditions, projection) \
                    .sort('postTime', pymongo.DESCENDING).limit(maximum)
                for tw in updates:
                    if 'quotedStatus' in tw:
                        tw['text'] += " QT " + tw['quotedStatus']['text']
                        for keyword in tw['quotedStatus']['keywords']:
                            tw['keywords'].append(keyword)
                    smu = self.db.socialMediaUser.find_one(
                        {'_id': tw['sender']}, {'screenNameLC': 1})
                    if smu is not None:
                        tw['keywords'].append(smu['screenNameLC'])
                    documents.append(tw)
        except Exception as ex:
            config.LOGGER.error("Error %s getting updates from timeline for mission %s",
                                str(ex), self.missionId)
        config.LOGGER.debug('Found %d updates in timeline', len(documents))
        return documents

    def topics(self, n_components, n_out=7, n_weight=5, topic=None):
        config.LOGGER.info('Get topics of timeline for %s',
                           self.account['profile']['preferredUsername'])
        results = []
        terms = self.vectorizer.get_feature_names()
        if topic is None:
            for k in range(n_components):
                idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
                sorted_idx = sorted(idx.items(), key=operator.itemgetter(1),
                                    reverse=True)
                weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
                for item in sorted_idx[0:n_out - 1]:
                    results.append({'term': terms[item[0]], 'weight': item[1]})
        else:
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[topic])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1),
                                reverse=True)
            weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
            for item in sorted_idx[0:n_out - 1]:
                results.append({'term': terms[item[0]], 'weight': item[1]})
        return results  # (the original fell off the end without returning)

    def get_componentCount(self, min_weight=.05):
        count = 0
        for k in range(len(self.svd.components_)):
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1),
                                reverse=True)
            kcount = 0
            for entry in sorted_idx:
                if entry[1] > min_weight:
                    kcount += 1
                else:
                    break
            if kcount > count:
                count = kcount
        return count

    def setup_training(self, end_time=None, days=15, maximum=1000):
        try:
            # a datetime.utcnow() default argument would be frozen at
            # definition time, so resolve it here instead
            if end_time is None:
                end_time = datetime.utcnow()
            start = end_time - timedelta(minutes=days * 24 * 60)
            condition = {'missions': ObjectId(self.missionId),
                         '$or': [{'favorited': True}, {'sentByMe': True}],
                         'postTime': {'$gt': start, '$lte': end_time},
                         '$nor': [{'keywords': {'$exists': False}},
                                  {'keywords': {'$size': 1}},
                                  {'keywords': {'$size': 2}}]}
            self.training_docs = self.get_updates(conditions=condition,
                                                  maximum=10000)
            config.LOGGER.info('Train model for %s',
                               self.account['profile']['preferredUsername'])
            if len(self.training_docs) > 50:
                config.LOGGER.debug('Found %d updates for training from %s',
                                    len(self.training_docs),
                                    self.account['profile']['preferredUsername'])
                self.training_end = end_time
                self.days = days
                trainingRaw = [' '.join(doc['keywords'])
                               for doc in self.training_docs]
                # trainingRaw = [tw['text'] for tw in self.training_docs]
                self.vectorizer = TfidfVectorizer(max_df=0.6, min_df=2,
                                                  max_features=500,
                                                  use_idf=True,
                                                  strip_accents='ascii')
                X = self.vectorizer.fit_transform(trainingRaw)
                if X.shape[1] <= self.n_components:
                    self.n_components = X.shape[1] - 1
                config.LOGGER.debug('%d components found for SVD',
                                    self.n_components)
                self.svd = TruncatedSVD(self.n_components, algorithm='arpack')
                self.svdX = self.svd.fit_transform(X)
                # self.n_components = self.get_componentCount(self.threshold)
                # self.svd = TruncatedSVD(self.n_components, random_state=10)
                # self.svdX = self.svd.fit_transform(X)
                self.normalizer = Normalizer().fit(self.svdX)
                self.svdX = self.normalizer.transform(self.svdX)

                # Clustering
                config.LOGGER.debug('Determining cluster count')
                for n_clusters in self.range_n_clusters:
                    self.k_means = KMeans(n_clusters=n_clusters,
                                          init='k-means++', max_iter=300,
                                          n_init=10, verbose=False,
                                          random_state=10)
                    self.k_means.fit(self.svdX)
                    score = metrics.silhouette_score(self.svdX,
                                                     self.k_means.labels_)
                    if score > self.sil_score:
                        self.sil_score = score
                        self.cluster_count = n_clusters
                config.LOGGER.debug('Cluster count is %d, Silhouette Coefficient is %0.3f',
                                    self.cluster_count, self.sil_score)
                self.k_means = KMeans(n_clusters=self.cluster_count,
                                      init='k-means++', max_iter=100,
                                      n_init=4, verbose=False, random_state=10)
                self.k_means.fit(self.svdX)

                # now get the top tweets for each cluster
                x_transform = self.k_means.transform(self.svdX)
                x_predict = self.k_means.predict(self.svdX)
                self.all_cluster_dist = []
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(x_predict)):
                        if x_predict[j] == i and sum(self.svdX[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i,
                                 'dist': np.sqrt(sum([y * y for y in x_transform[j]]))})
                    newlist = sorted(cluster_distance,
                                     key=operator.itemgetter('dist'),
                                     reverse=False)
                    self.all_cluster_dist.append(newlist)
                # now verify this
                self.self_test()
            else:
                config.LOGGER.info('Too few training updates from user timeline')
                self.svd = None
        except Exception as ex:
            config.LOGGER.exception("Error %s computing SVD and kmeans from user history for mission %s",
                                    str(ex), self.missionId)

    def self_test(self):
        try:
            config.LOGGER.info("Beginning self test. Better if it were cross "
                               "validation but not enough data for that")
            results = self.find_recommendations(self.training_docs, top=10,
                                                quality=.001, min_examples=1)
            config.LOGGER.info("Self test found %d recommendations", len(results))
            for rec in results:
                if rec['text'] != rec['samples_svd'][0]:
                    config.LOGGER.error("Error training SVD for mission %s in tweet %s",
                                        self.missionId, rec['text'])
        except Exception as ex:
            config.LOGGER.error("Error %s in self test building training for mission %s",
                                str(ex), self.missionId)

    def find_recommendations(self, tweets=[], top=10, quality=.1, min_examples=1):
        working_list = []
        result_list = []
        try:
            config.LOGGER.info('Generating content recommendations for user %s',
                               self.account['profile']['preferredUsername'])
            if self.svd is not None:
                if len(tweets) < top:
                    config.LOGGER.debug("Too few tweets passed for recommendation")
                    return []
                # tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
                # tweetText = [tw['text'] for tw in tweets]
                tweetText = [' '.join(tw['keywords']) for tw in tweets]
                Y = self.vectorizer.transform(tweetText)
                svdY = self.svd.transform(Y)
                svdY = self.normalizer.transform(svdY)
                y_transform = self.k_means.transform(svdY)
                # terms = self.vectorizer.get_feature_names()
                selected_updates = []
                y_predict = self.k_means.predict(svdY)
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(y_predict)):
                        if y_predict[j] == i and sum(svdY[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i,
                                 'dist': np.sqrt(sum([y * y for y in y_transform[j]]))})
                    newlist = sorted(cluster_distance,
                                     key=operator.itemgetter('dist'),
                                     reverse=False)
                    selected_updates.append(newlist)
                temp = [entry for entry in it.zip_longest(*selected_updates)]
                clean_list = [entry for tup in temp for entry in tup
                              if entry is not None][0:top]
                clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
                config.LOGGER.debug("Found %i possible matches in topic clusters"
                                    % len(clean_list_svdY))
                neigh = NearestNeighbors()
                neigh.fit(self.svdX)
                if len(clean_list_svdY) > 0:
                    distances, svd_neighbors = neigh.radius_neighbors(
                        X=clean_list_svdY, radius=quality)
                else:
                    svd_neighbors = []
                examples = []
                for idx, entry in enumerate(svd_neighbors):
                    if len(entry) >= min_examples:
                        config.LOGGER.debug("Suggested tweet has %d examples"
                                            % len(entry))
                        original = tweets[clean_list[idx]['index']]['text']
                        for jdx, neighbor in enumerate(entry):
                            examples.append(
                                {'text': self.training_docs[neighbor]['text'],
                                 'dist': distances[idx][jdx]})
                        sorted_examples = sorted(examples,
                                                 key=operator.itemgetter('dist'),
                                                 reverse=False)
                        # use a distinct name: the original clobbered the
                        # min_examples parameter here
                        min_example_texts = [item['text']
                                             for item in sorted_examples][:min_examples]
                        t1 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][0]['index']]['text']
                        t2 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][1]['index']]['text']
                        working_list.append(
                            {"dist": sorted_examples[0]['dist'],
                             "text": original,
                             "id": str(tweets[clean_list[idx]['index']]['_id']),
                             "sender": str(tweets[clean_list[idx]['index']]['sender']),
                             'samples_svd': min_example_texts,
                             'samples_cluster': [t1, t2]})
                result_list = sorted(working_list,
                                     key=operator.itemgetter('dist'),
                                     reverse=False)
            return result_list[:top]
        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s",
                                str(ex), self.missionId)
            return []

    def recommend_from_timeline(self, end_time=None, minutes_prior=15, top=10,
                                quality=.1, min_examples=1):
        try:
            if end_time is None:
                end_time = datetime.utcnow()
            config.LOGGER.info("generating content recommendation from timeline for %s"
                               % self.account['profile']['preferredUsername'])
            results = []
            if self.svd is not None:
                start = end_time - timedelta(minutes=minutes_prior)
                condition = {'missions': ObjectId(self.missionId),
                             '$or': [{'favorited': False}, {'sentByMe': False},
                                     {'mentionsMe': False}, {'retweetOfMe': False}],
                             'postTime': {'$gt': start, '$lte': end_time},
                             '$nor': [{'keywords': {'$exists': False}},
                                      {'keywords': {'$size': 1}},
                                      {'keywords': {'$size': 2}}]}
                tweets = self.get_updates(maximum=10000, conditions=condition)
                config.LOGGER.debug('%d updates from account timeline read from database',
                                    len(tweets))
                results = self.find_recommendations(tweets, top=top,
                                                    quality=quality,
                                                    min_examples=min_examples)
                config.LOGGER.debug('%d recommendations found for mission %s',
                                    len(results), self.missionId)
            return results[:top]
        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s",
                                str(ex), self.missionId)
            return []
normalizer = Normalizer(copy=False)
X = lsa.fit_transform(X)
data = normalizer.fit_transform(X)

### after several experiments linear svc gave the best result:
from sklearn.svm import SVC
clf = OneVsRestClassifier(SVC(kernel='linear'))

#### prepare training set and testing set:
# ***************************** TRAINING AND TESTING SEPARATE *****************************
clf.fit(data, label)
print("finished fitting model for age-group classification")

#### prepare the test data text datasets #######
X_test = vec.transform(testdocs)
testdata = lsa.transform(X_test)
testdata = normalizer.transform(testdata)

### getting the predicted values:
pred = clf.predict(testdata)
outdfage = pd.DataFrame({"userid": testuids, "agegroup": pred.tolist()})
# outdf["agegroup"] = pred.tolist()
outdf = pd.merge(outdf, outdfage, on="userid")
print("Gender and Age Group Classification Process finished")

# ####################################### personality ##############################
profile_path = path + "profile/profile.csv"
o = open(profile_path, 'r')
profiletb = csv.DictReader(o)
### read the texttb csv file to get the user ids and corresponding cleaned text data
# texttb = pd.read_csv("texttb.csv", encoding="latin-1")
# traindata = pd.read_csv('kdd/binary/kddtrain.csv', header=None)
testdata = pd.read_csv('kdd/binary/kddtest.csv', header=None)

# X = traindata.iloc[:, 0:42]
# Y = traindata.iloc[:, 0]
C = testdata.iloc[:, 0]
T = testdata.iloc[:, 1:42]

'''
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
# print(trainX[0:5, :])
'''

scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
# print(testT[0:5, :])

# y_train = np.array(Y)
y_test = np.array(C)

# reshape input to be [samples, time steps, features]
# X_train = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
# note: the normalized *test* split is reshaped into the X_train variable here
X_train = np.reshape(testT, (testT.shape[0], 1, testT.shape[1]))

batch_size = 32

# 1. define the network
model = Sequential()
import os
import pickle

import cv2
import numpy as np
from PIL import Image
from sklearn.preprocessing import Normalizer

from libs.arcface.arcface import Arcface

l2_normalizer = Normalizer('l2')
model = Arcface()

face_data = "FaceName\\"
encoding_dict = {}
for face_names in os.listdir(face_data):
    person_dir = os.path.join(face_data, face_names)
    embeddings = []
    for image_name in os.listdir(person_dir):
        image_path = os.path.join(person_dir, image_name)
        img = cv2.imread(image_path)
        encode = model(img)
        encode = encode[0]
        embeddings.append(encode)
    if embeddings:
        # sum the person's embeddings, then L2-normalize the result
        # (equivalent to normalizing the mean embedding)
        encode = np.sum(embeddings, axis=0)
        encode = l2_normalizer.transform(encode.reshape(1, -1))[0]
        encoding_dict[face_names] = encode

path = 'encodings/encodings.pkl'
with open(path, 'wb') as file:
    pickle.dump(encoding_dict, file)
# Normalize data (length of 1)
from sklearn.preprocessing import Normalizer
import pandas
import numpy

url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values

# separate array into input and output components
X = array[:, 0:8]
Y = array[:, 8]

scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)

# summarize transformed data
numpy.set_printoptions(precision=3)
print(normalizedX[0:5, :])
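# To confirm the rescaling did what the comment says (each row now has a
# length of 1), check the row norms; a quick sanity check reusing the
# normalizedX array and numpy import from above:
row_norms = numpy.linalg.norm(normalizedX, axis=1)
print(row_norms[0:5])  # all (approximately) 1.0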
def normalize_scaler(self):
    # Rescale each sample (row) to unit norm.
    scaler = Normalizer().fit(self.__x)
    nor_x = scaler.transform(self.__x)
    return nor_x
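# Since Normalizer scales each row only by that row's own norm, fit learns
# nothing from the data; the fit call above is needed only for scikit-learn
# API compatibility. A minimal demonstration of that statelessness:
import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[1.0, 2.0], [30.0, 40.0]])
# fitting on completely different data changes nothing
a = Normalizer().fit(X).transform(X)
b = Normalizer().fit(np.ones((5, 2))).transform(X)
print(np.allclose(a, b))  # True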
testdata = pd.read_csv('kdd/multiclass/Testing.csv', header=None)

X = traindata.iloc[:, 0:42]
Y = traindata.iloc[:, 42]
C = testdata.iloc[:, 42]
T = testdata.iloc[:, 0:42]

trainX = np.array(X)
testT = np.array(T)
# astype returns a new array; the original discarded the result
trainX = trainX.astype(float)
testT = testT.astype(float)

scaler = Normalizer().fit(trainX)
trainX = scaler.transform(trainX)

scaler = Normalizer().fit(testT)
testT = scaler.transform(testT)

y_train = np.array(Y)
y_test = np.array(C)

X_train = np.array(trainX)
X_test = np.array(testT)

batch_size = 64

# 1. define the network
from sklearn import metrics
from sklearn.preprocessing import Normalizer
import h5py
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

traindata = pd.read_csv('classical/data/train.csv', header=None)
testdata = pd.read_csv('data/valid.csv', header=None)

X = traindata.iloc[:, 1:61]
Y = traindata.iloc[:, 0]
C = testdata.iloc[:, 0]
T = testdata.iloc[:, 1:61]

scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
# print(trainX[0:5, :])

scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
# print(testT[0:5, :])

y_train = np.array(Y)
y_test = np.array(C)

# reshape input to be [samples, time steps, features]
X_train = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
def regression_step(self, file, step, mdl):
    data = pd.read_csv(file)  # , index_col='Munic&Year')
    features = list(data.columns[3:-1])
    print(features)
    out = pd.Series()
    NN = int(len(data) / step)
    datay = data.loc[:, "NextYear"]
    datax = data[features]
    names = data['Munic'].drop_duplicates().values

    scaler = Normalizer().fit(datax)
    # scalx = scaler.transform(datax)
    # normedx = normalize(scalx)
    normedx = scaler.transform(datax)
    norm = pd.DataFrame(data=normedx, columns=features, index=data.index)
    norm_data = pd.merge(norm, datay, how='right', left_index=True,
                         right_index=True, sort=False)

    prog = 0
    for nn in names:
        test = norm_data.loc[data['Munic'] == nn]
        train = norm_data.loc[data['Munic'] != nn]
        y = train["NextYear"]
        x = train[features]
        x_test = test[features]
        # use a distinct name here: `names` is the loop's municipality list
        test_names = data.loc[test.index, 'Munic&Year']
        rates = data.loc[test.index, 'NextYear']

        if mdl == 1:
            clf = RandomForestRegressor(n_estimators=200)
            model = clf.fit(x, y)
        elif mdl == 2:
            clf = RandomForestRegressor(n_estimators=500)
            abc = AdaBoostClassifier(n_estimators=100, base_estimator=clf,
                                     learning_rate=1)
            model = abc.fit(x, y)
        elif mdl == 3:
            gb = GradientBoostingRegressor(n_estimators=500, learning_rate=1,
                                           warm_start=True)
            model = gb.fit(x, y)

        # if nn % 10 == 0:
        if prog % 7 < 2:
            print(sorted(zip(map(lambda x: round(x, 4),
                                 model.feature_importances_),
                             features), reverse=True))

        est = model.predict(x_test)
        aa = {'Munic&Year': test_names, 'model': est, 'R/E': rates}
        add = pd.DataFrame(aa)
        out = out.append(add)
        prog += len(test)
        print(str(round(prog / len(data) * 100, 2)) + '% Complete')

    out.to_csv("./Model/AllYears_Regressed.csv")
    'arr_2'], data['arr_3']
print('Dataset: train=%d, test=%d' % (trainX.shape[0], testX.shape[0]))
print(trainX.shape)

PCA_COMPONENT = 400
pca = PCA(n_components=PCA_COMPONENT)
trainX = np.reshape(trainX, (len(trainX), 4096))
testX = np.reshape(testX, (len(testX), 4096))
trainX = pca.fit_transform(trainX)
# project the test set with the PCA fit on the training set
# (the original refit PCA on the test set, giving inconsistent spaces)
testX = pca.transform(testX)

# normalize input vectors
in_encoder = Normalizer(norm='l2')
trainX = in_encoder.transform(trainX)
testX = in_encoder.transform(testX)

# label encode targets
# out_encoder = LabelEncoder()
# out_encoder.fit(trainy)
# trainy = out_encoder.transform(trainy)
# testy = out_encoder.transform(testy)

# fit model
model = SVC(kernel='linear', probability=True)
model.fit(trainX, trainy)

# predict
yhat_train = model.predict(trainX)
yhat_test = model.predict(testX)
# score
# In this lab setting, you have both train+test data; but in the wild,
# you'll only have your training data, and then unlabeled data you want to
# apply your models to.
#
normalizer = Normalizer()
normalizer.fit(X_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
X_train_n, X_test_n = normalizer.transform(X_train), normalizer.transform(X_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
pca = RandomizedPCA(n_components=2)
pca.fit(X_train_n)
X_train_pca = pca.transform(X_train_n)
X_test_pca = pca.transform(X_test_n)
    mean_absolute_error, roc_curve, classification_report, auc)

data = pd.read_csv('CICIDS2017-full.csv', header=0)
print(data.head())

X = data.iloc[0:, 0:78]
print(X.head())
Y = data.iloc[0:, 78]
print(Y.head())

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)
print("1.....")

scaler = Normalizer().fit(Xtrain)  # train
trainX = scaler.transform(Xtrain)

scaler = Normalizer().fit(Xtest)
testT = scaler.transform(Xtest)

traindata = np.array(trainX)
trainlabel = np.array(Ytrain)
testdata = np.array(testT)
testlabel = np.array(Ytest)

model = LogisticRegression()
model.fit(traindata, trainlabel)

# make predictions
expected = testlabel
############################################################################
# Rescale the feature values of observations to have a unit norm
# (a total length of 1.0). The following relies on
# from sklearn.preprocessing import Normalizer
# Use Normalizer with a norm argument.

# Create feature matrix
features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

# Create normalizer
normalizer = Normalizer(norm="l2")

# Transform feature matrix
print(normalizer.transform(features), "use norm to transform features")

## Many rescaling methods (e.g., min-max scaling and standardization)
## operate on features; however, we can also rescale across individual
## observations. Normalizer rescales the values on individual observations
## to have unit norm (the sum of their lengths is 1). This type of rescaling
## is often used when we have many equivalent features (e.g., text
## classification, where every word or n-word group is a feature).
## Normalizer provides three norm options, with the Euclidean norm (often
## called L2) being the default argument.

# Transform feature matrix
features_l2_norm = Normalizer(norm="l2").transform(features)
# Show feature matrix
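# The three norm options mentioned above behave differently on the same row;
# a small comparison, reusing the features matrix and the Normalizer import
# from the snippet above:
for norm in ("l2", "l1", "max"):
    print(norm, Normalizer(norm=norm).transform(features)[0])
# "l2":  divide by sqrt(0.5**2 + 0.5**2) -> [0.7071, 0.7071]
# "l1":  divide by |0.5| + |0.5|         -> [0.5, 0.5]
# "max": divide by max(|0.5|, |0.5|)     -> [1.0, 1.0]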
def K_Neighbors(train_A, words_of_tweets, extra_features, feature_selection,
                encoding, print_file):
    # Import the ClassRead.py file, to get the encoding
    reading = Twitter_Depression_Detection.Reader()
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the roc-auc score running average list,
    # a count to print the number of folds,
    # and the metrics whose averages will be printed
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize your 10-fold cross validation
    # Set shuffle equals True to randomize your splits on your training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Set up for loop to run for the number of cross vals you defined in your parameter
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes your train and test data for your cross validation and
        # sorts them in random order, since we used shuffle equals True
        x_train = reading.get_enc(x[train_index], 1, y[train_index], train_index,
                                  extra_features, feature_selection, encoding,
                                  print_file)
        x_test = reading.get_enc(x[test_index], 0, y[test_index], test_index,
                                 extra_features, feature_selection, encoding,
                                 print_file)
        y_train, y_test = y[train_index], y[test_index]

        #######################################################################
        # KNeighborsClassifier parameters (for reference):
        #   leaf_size: int, optional (default=30)
        #   p: integer, optional (default=2)
        #     When p = 1, this is equivalent to using manhattan_distance (l1),
        #     and euclidean_distance (l2) for p = 2.
        #     For arbitrary p, minkowski_distance (l_p) is used.
        #   algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        #     Algorithm used to compute the nearest neighbors:
        #       'ball_tree' will use BallTree
        #       'kd_tree' will use KDTree
        #       'brute' will use a brute-force search
        #       'auto' will attempt to decide the most appropriate algorithm
        #       based on the values passed to the fit method
        #   weights: str or callable, optional (default='uniform')
        #     Weight function used in prediction. Possible values:
        #       'uniform': all points in each neighborhood are weighted equally
        #       'distance': weight points by the inverse of their distance, so
        #       closer neighbors of a query point have a greater influence
        #       than neighbors which are further away

        scaler = Normalizer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        classifier = KNeighborsClassifier(n_neighbors=40)

        # 'minority': resample only the minority class
        oversample = SMOTE(sampling_strategy='minority', k_neighbors=10,
                           random_state=0)
        x_train, y_train = oversample.fit_resample(x_train, y_train)

        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)

        #######################################################################
        # Your model is fit. Time to predict our output and test our training data
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: '
                         + str(av_roc / count) + '\n')

        # ---------------------------------------------------------------------
        '''
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # ---------------------------------------------------------------------

        y_pred = (y_pred > 0.5)

        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write(str(cm) + '\n')

        report = classification_report(y_test, y_pred)
        print(report)

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print your final average ROC-AUC score and organize your models predictions in a dataframe
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n'
                     + "Average Accuracy: " + str(accuracy / 10) + '\n'
                     + "Average Recall: " + str(recall / 10) + '\n'
                     + "Average F1-score: " + str(f1score / 10) + '\n'
                     + 'Average ROC:' + str(av_roc / 10) + '\n')
# apply your models to.
#
# .. your code here ..
from sklearn.preprocessing import Normalizer
data_train_fit = Normalizer().fit(data_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
# .. your code here ..
Trans_datatrain = data_train_fit.transform(data_train)
Trans_datatest = data_train_fit.transform(data_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
# .. your code here ..
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(Trans_datatrain)
def normalize(data):
    in_encoder = Normalizer(norm='l2')
    data = in_encoder.transform(data)
    return data
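# One caveat for a helper like this: scikit-learn transformers expect 2-D
# input, so a single embedding vector must be reshaped first (the same
# reshape(1, -1) trick used in the ArcFace snippet above). A minimal sketch:
import numpy as np
from sklearn.preprocessing import Normalizer

vec = np.array([3.0, 4.0])  # a single sample
out = Normalizer(norm='l2').transform(vec.reshape(1, -1))[0]
print(out)                  # [0.6 0.8]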
le2 = LabelEncoder()
le2.fit(np.ravel(data_np[:, [10]]))
# print(le2.classes_)
data_np[:, [10]] = le2.transform(np.ravel(data_np[:, [10]])).reshape(n_lin, 1)

# Replace missing values by 0 for the columns 16 and 17
data_np = preprocess_replace_NaN(data_np, [15, 16], 'nan')

# plot_NA_ratio_features(data_np, feature_names)

# Normalize the dataset for columns 5, 6, 7, 10, 11, 13, 14, 17 and 25
nor = Normalizer(norm='l1')
nor.fit(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))
# [0, 1, 2, 6, 11, 17, 18, 19, 20, 21, 22, 23]
data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]] = \
    nor.transform(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))

# Replace missing values for the risk_factor using a svm classifier
preprocess_missing_risk_factor(data_np)

# plot_pourcentage_result(data_np, feature_names, [17, 18, 19, 20, 21, 22, 23])
# plot_NA_ratio_features(data_np, feature_names)

################################################################################
# # Replace all missing values for the columns 12, 16 and 17 with the median value
# imp = Imputer(strategy='median', axis=0)
# imp.fit(data_np[:, [11, 15, 16]])
# data_np[:, [11, 15, 16]] = imp.transform(data_np[:, [11, 15, 16]])
# Append new features
newAct_train = np.zeros((activation_train.shape[0], activation_train.shape[1] + 3))
for i in range(activation_train.shape[0]):
    newAct_train[i] = np.append(activation_train[i], pttImg_sample_train[i][:3])

newAct_valid = np.zeros((activation_valid.shape[0], activation_valid.shape[1] + 3))
for i in range(activation_valid.shape[0]):
    newAct_valid[i] = np.append(activation_valid[i], valid_pttImg[i][:3])

newAct_test = np.zeros((activation_test.shape[0], activation_test.shape[1] + 3))
for i in range(activation_test.shape[0]):
    newAct_test[i] = np.append(activation_test[i], test_blogImg[i][:3])

# Normalize
normalizer = Normalizer()
normalizer.fit(newAct_train)
newAct_train = normalizer.transform(newAct_train)
newAct_valid = normalizer.transform(newAct_valid)
newAct_test = normalizer.transform(newAct_test)

# Final model
model3 = Sequential()
model3.add(Dense(2, input_shape=(newAct_train.shape[1],), activation='softmax'))
adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model3.compile(loss='categorical_crossentropy', optimizer=adam,
               metrics=['accuracy'])
print(model3.summary())
model3.fit(newAct_train, y_train_sample, epochs=epochs3, batch_size=batch_size)

# Evaluating by using validation data or testing data
print("Valid:")
scores = model3.evaluate(newAct_valid, y_valid_sample, verbose=0)
print("Loss:", scores[0])
if dataset[column].dtype == object:
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])
array = dataset.values
#print(dataset.describe)
#print(dataset.shape)
#print(pd.DataFrame(dataset))
# Visualization
scatter_matrix(dataset)
plt.show()
#plt.plot(dataset.iloc[:,[0,1,2,3]],label='names[]')
X = array[:, 0:4]
Y = array[:, 4]
# Step 2: Pre-processing the data to best expose the structure of the problem.
scaler = Normalizer().fit(X)
rescaled = scaler.transform(X)
np.set_printoptions(precision=3)
#print('The result after pre-processing of data :\n')
#print(rescaled[0:6,:])
# Step 3: Spot-checking a number of algorithms using your own test harness.
scoring = 'accuracy'
models = []
results = []
names = []
models.append(('SVC', SVC()))
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
print('The result after spot-checking is :\n')
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    # evaluate on the normalized features from step 2 rather than the raw X
    cv_results = cross_val_score(model, rescaled, Y, cv=kfold, scoring=scoring)
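    # (sketch) the customary completion of the spot-check loop, assuming the
    # usual harness: store each model's CV scores and report mean/std
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))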
def predict_proba(self, X, test_data, test_data_idx=None):
    '''
    Predict based on a simple name-similarity (tf.idf cosine) threshold
    '''
    tfs = []
    # words that only occur in the test data
    test_word_idx = {}
    test_words = []
    test_station_ids = set()
    # take only the pairs present in the test data set
    for i in range(X.shape[0]):
        lookup = i
        if test_data_idx is not None:
            lookup = test_data_idx[i]
        stid1 = test_data.pairs[lookup][0]
        stid2 = test_data.pairs[lookup][1]
        test_station_ids.add(stid1)
        test_station_ids.add(stid2)
    test_station_ids = list(test_station_ids)
    for sid in test_station_ids:
        station = test_data.stations[sid]
        tfs.append({})
        for word in re.split(r"[^\w]+", station.name):
            word = word.strip()
            if len(word) == 0:
                continue
            if word not in self.word_idx:
                if word not in test_word_idx:
                    test_words.append(word)
                    test_word_idx[word] = len(test_words) - 1
                # test word ids begin at the end of the training word ids
                wid = len(self.words) + test_word_idx[word]
            else:
                wid = self.word_idx[word]
            if wid not in tfs[-1]:
                tfs[-1][wid] = 0
            tfs[-1][wid] += 1
    # build tf.idf matrix
    data = []
    indices = []
    indptr = [0]
    # mapping maps station ids to rows in the tf.idf matrix
    glob_mapping = {}
    for iid, sid in enumerate(test_station_ids):
        station = test_data.stations[sid]
        glob_mapping[sid] = iid
        for wid in tfs[iid]:
            indices.append(wid)
            tf = tfs[iid][wid]
            if wid >= len(self.dfs):
                # no document frequency, count as unique word
                idf = math.log(self.train_num_stations)
            else:
                idf = math.log(1 / self.dfs[wid])
            data.append(tf * idf)
        indptr.append(len(indices))
    matrix = csr_matrix(
        (data, indices, indptr),
        shape=(len(test_station_ids), len(self.words) + len(test_words)),
        dtype=float)
    # L2 normalize in place
    norm = Normalizer(norm="l2", copy=False)
    norm.transform(matrix)
    ret = numpy.empty([X.shape[0], 2], dtype=float)
    # we build the similarity scores in chunks, because
    # otherwise the cosine similarity matrix would get way too big
    chunksize = 5000

    def proc_chunk(a, b, out):
        for minr in range(a, b, chunksize):
            maxr = min(b, minr + chunksize)
            locret = numpy.empty([maxr - minr, 2], dtype=float)
            chunkstationids = set()
            for i in range(minr, maxr):
                lookup = i
                if test_data_idx is not None:
                    lookup = test_data_idx[i]
                stid1 = test_data.pairs[lookup][0]
                stid2 = test_data.pairs[lookup][1]
                chunkstationids.add(stid1)
                chunkstationids.add(stid2)
            chunkstationids = list(chunkstationids)
            # build a view of our tf.idf score matrix containing the
            # stations in this chunk
            chunk_map = {}
            mapping_l = []
            for iid, sid in enumerate(chunkstationids):
                chunk_map[sid] = iid
                mapping_l.append(glob_mapping[sid])
            chunk_matrix = matrix[mapping_l, :]
            simi_mat = cosine_similarity(chunk_matrix)
            for i in range(minr, maxr):
                lookup = i
                if test_data_idx is not None:
                    lookup = test_data_idx[i]
                simi = simi_mat[chunk_map[test_data.pairs[lookup][0]],
                                chunk_map[test_data.pairs[lookup][1]]]
                if simi > self.t:
                    simi = 0.5 + (simi - self.t) / (2.0 * (1.0 - self.t))
                else:
                    simi = simi / (2 * self.t)
                locret[i - minr, 1] = simi
                locret[i - minr, 0] = 1 - locret[i - minr, 1]
            out.append([locret, minr, maxr])

    manager = mp.Manager()
    rets = manager.list()
    procs = []
    processors = mp.cpu_count()
    csize = max(1, int(X.shape[0] / processors))  # guard against a zero range step
    for a in range(0, X.shape[0], csize):
        b = min(X.shape[0], a + csize)
        procs.append(mp.Process(target=proc_chunk, args=(a, b, rets)))
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    for locret in rets:
        ret[locret[1]:locret[2], :] = locret[0]
    return ret
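# The threshold rescaling inside proc_chunk maps a raw cosine similarity into
# [0, 1] so that the decision threshold t lands exactly at 0.5. A standalone
# sketch of the same piecewise-linear map, with a few sanity checks:
def rescale_similarity(simi, t):
    """Piecewise-linear map: [0, t] -> [0, 0.5], (t, 1] -> (0.5, 1]."""
    if simi > t:
        return 0.5 + (simi - t) / (2.0 * (1.0 - t))
    return simi / (2.0 * t)

assert rescale_similarity(0.0, 0.4) == 0.0
assert rescale_similarity(0.4, 0.4) == 0.5  # the threshold maps to 0.5
assert rescale_similarity(1.0, 0.4) == 1.0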
def load_blood_data(train=True, SEED=97,
                    scale=False,
                    minmax=False,
                    norm=False,
                    nointercept=False,
                    engineering=False):
    """
    Load training and test datasets for DrivenData's
    Predict Blood Donations warmup contest

    The training data is shuffled before it's returned; test data is not

    Note: patsy returns float64 data; Theano requires float32 so conversion
          will be required; the y values are converted to int32, so they're OK

    Arguments
    ---------
        train       (bool) if True
                        y_train, X_train = load_blood_data(train=True, ...
                    if False
                        X_test, IDs = load_blood_data(train=False, ...

        SEED        (int) random seed

        scale       (bool) if True, scale the data to mean zero, var 1; standard normal

        minmax      (2-tuple) to scale the data to a specified range, provide a 2-tuple (min, max)

        norm        (bool) if True, L2 normalize for distance and similarity measures

        nointercept (bool) if True, patsy will not create an intercept

        engineering (bool) if True, add the engineered ratio features

    Usage
    -----
        from load_blood_data import load_blood_data
    """
    from sklearn.utils import shuffle
    from patsy import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy as np
    import pandas as pd
    import re

    global scaler
    global minmaxer
    global normalizer

    if (scale and minmax):
        raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):
        raise ValueError("cannot specify both scale and norm")
    if (norm and minmax):
        raise ValueError("cannot specify both norm and minmax")

    if type(train) is not bool:
        raise ValueError("train must be boolean")
    if type(SEED) is not int:
        raise ValueError("SEED must be int")
    if type(scale) is not bool:
        raise ValueError("scale must be boolean")
    if type(norm) is not bool:
        raise ValueError("norm must be boolean")
    if type(nointercept) is not bool:
        raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool:
        raise ValueError("engineering must be boolean")

    # ------------- read the file -------------
    file_name = '../data/train.csv' if train else '../data/test.csv'
    data = pd.read_csv(file_name)

    # ------------- shorten the column names -------------
    column_names = ['ID', 'moSinceLast', 'numDonations', 'volume', 'moSinceFirst', 'donated']
    data.columns = column_names if train else column_names[:-1]

    # ------------- create new variables -------------
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst, index=data.index)

        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume / data.numDonations) / data.moSinceFirst,
                                        index=data.index)

        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations, index=data.index)

    # ------------- scale the data -------------
    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        exclude = ['ID', 'donated']
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            data.loc[:, data.columns.difference(exclude)] = scaler.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            data.loc[:, data.columns.difference(exclude)] = scaler.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))

    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2:
            raise ValueError("minmax must be a 2-tuple")
        exclude = ['ID', 'donated']
        if train:
            minmaxer = MinMaxScaler(feature_range=minmax)
            data.loc[:, data.columns.difference(exclude)] = minmaxer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            data.loc[:, data.columns.difference(exclude)] = minmaxer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))

    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        exclude = ['ID', 'donated']
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            data.loc[:, data.columns.difference(exclude)] = normalizer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            data.loc[:, data.columns.difference(exclude)] = normalizer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))

    # ------------- create the design matrix -------------
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst + numDonations + volume'
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
    if nointercept:
        formula = formula + ' -1'

    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))

    # ------------- return the values -------------
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs = data.ID.values
        return X_test, IDs
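# Usage sketch: because the fitted scaler/minmaxer/normalizer are stored in
# module-level globals, the train=True call must run before train=False so
# the test set is transformed with statistics fit on the training data.
# y_train, X_train = load_blood_data(train=True, SEED=97, norm=True)
# X_test, IDs = load_blood_data(train=False, norm=True)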
with open('./data/{}/a_list.npy'.format(prefix), 'w') as file: json.dump(a_list, file) with open('./data/{}/h_list.npy'.format(prefix), 'w') as file: json.dump(h_list, file) with open('./data/{}/q_id_list.npy'.format(prefix), 'w') as file: json.dump(q_id_list, file) # Create the map from sklearn.preprocessing import Normalizer from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA transformer = Normalizer(copy=True, norm='l2') normalized = transformer.transform(v_array) #X = PCA(n_components=50).fit_transform(normalized) #X = StandardScaler().fit_transform(reduced_data) import time import umap time_start = time.time() umap_ = umap.UMAP(n_neighbors=100, verbose=True) embedding = umap_.fit_transform(normalized) print('umap done! Time elapsed: {} seconds'.format(time.time() - time_start)) np.save('./data/{}/lgd_umap.npy'.format(prefix), embedding) new_list = [] for i, md5 in enumerate(h_list):
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression (C=0.1,max_iter=100,n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1,n_iter=1,n_jobs=8,class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1,n_jobs=8,class_weight='balanced')
    # earlier single-estimator choices, superseded by the GridSearchCV below:
    # Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    # Clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=8, class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=8, class_weight='balanced'), cv=5,
                       param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=1000), cv=3,
    #                    param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)
    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2
    print 'Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram)
    PosSamples = [l.split('\t')[0].strip() for l in open(File).xreadlines() if l.strip().endswith('1')]#[:100]
    NegSamples = [l.split('\t')[0].strip() for l in open(File).xreadlines() if l.strip().endswith('0')]#[:100]
    print 'loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples))
    X = PosSamples + NegSamples
    y = [1 for _ in xrange(len(PosSamples))] + [-1 for _ in xrange(len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=random.randint(0, 100))
    print '# TrainLabels', len(y_train)
    print '# TestLabels', len(y_test)
    print 'performing CVectorizer'
    CVectorizer = CountVectorizer(lowercase = True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer = SGTokenizer,
                                  tokenizer = Tokenizer,
                                  ngram_range=(1,2),
                                  dtype=np.float64,
                                  decode_error = 'ignore',
                                  max_df=0.8)
    print 'performing TfidfTransformer and Normalizer'
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()
    print 'creating Train and Test FVs'
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print 'feat ext time', time() - T0
    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)
    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)
    print 'Train/test split'
    print TrainFVs.shape
    print TestFVs.shape
    # raw_input('hit any key...')
    print 'training classifier with train samples shape:', TrainFVs.shape
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit(TrainFVs, y_train)  # re-train on current training set (daily)
    print 'batch fitted'
    print 'training time', time() - T0
    # memory_dump('after_train_mem.txt')
    print 'testing classifier with test samples shape:', TestFVs.shape
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print 'testing time', time() - T0
    # memory_dump('after_test_mem.txt')
    print '*'*100
    print 'classification report'
    print '-'*20
    Accuracy = np.mean(PredictedLabels == y_test)
    print "Test Set Accuracy = ", Accuracy
    print(metrics.classification_report(y_test,
                                        PredictedLabels,
                                        target_names=['Neg', 'Pos']))
    print "Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels)
    print "Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels)
    # hinge_loss expects decision-function scores and log_loss expects
    # probabilities, not hard label predictions
    print "Average hinge loss:", metrics.hinge_loss(y_test, Clf.decision_function(TestFVs))
    print "Log loss:", metrics.log_loss(y_test, Clf.predict_proba(TestFVs))
    print "F1 Score:", metrics.f1_score(y_test, PredictedLabels)
    print "Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels)
    print '*'*100

    Vocab = CVectorizer.get_feature_names()
    # print Vocab[:100]
    # raw_input()
    try:
        FeatureImportances = Clf.coef_[0]
    except AttributeError:
        FeatureImportances = Clf.best_estimator_.coef_[0]
    print FeatureImportances.shape
    raw_input()
    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
        print Vocab[PosFIndex], '+-', Vocab[NegFIndex]
    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1], TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)
    AllFVsTimesW = TestFVs * FeatureImportancesSparseArray
    print AllFVsTimesW.shape
    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print 'Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind])
        # print TestFV
        # print TestFV.shape
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print CurTestFV.shape
        # raw_input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps = CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp
        # print NegTopFeatureIndices, NegFeatImps
        if y_test[Ind] == 1:
            print 'top positive feats:', colored(', '.join(['['+Vocab[PosFIndex]+']' for PosFIndex in PosTopFeatureIndices]), 'green')
        else:
            print 'top negative feats:', colored(', '.join(['['+Vocab[NegFIndex]+']' for NegFIndex in NegTopFeatureIndices]), 'red')
        Ind += 1
        raw_input()
def normalize_features(self): normalizer = Normalizer() self.X_train = normalizer.fit_transform(self.X_train) self.X_test = normalizer.transform(self.X_test) return self.X_train, self.X_test
import numpy
import pandas
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential  # assuming standalone Keras is installed
from keras.layers import Dense

# column names of the Russian real-estate dataset (kept verbatim so they
# match the CSV: apartment type, district, room count, areas, floor, etc.;
# the last column is the price per square metre)
names = ('квартира', 'хрущевка', 'улучшенной', 'брежневка', 'старой', 'проект',
         'свердловка', 'сталинка', 'нестандартная', 'новостройка', 'многокомнатная',
         'левый берег', 'ленинский', 'правобережный', 'орджоникидзевский', 'поселок',
         'агаповка', 'другой р-н', 'кол-во комнат', 'общ площадь', 'жил площадь',
         'кухня', 'этаж', 'ремонт', 'соcтояние', 'окна', 'балкон', 'балкон застеклен',
         'торг', 'ипотека', 'срочно', 'цена 1кв м')
array = pandas.read_csv('cs_datasets.csv', names=names)
dataset = array.values
X = dataset[:, 0:31]
Y = dataset[:, 31]
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
numpy.set_printoptions(precision=3)
(trainX, testX, trainY, testY) = train_test_split(normalizedX, Y, test_size=0.25, random_state=42)
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(trainX.shape[1], )))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
mse, mae = model.evaluate(testX, testY, verbose=0)
print("Mean absolute error (thousands of rubles): ", round(mae, 0))
import numpy as np
from sklearn.preprocessing import Normalizer

features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])
# transformer object
normalizer = Normalizer(norm="l2")
# transform the feature matrix
normalizer.transform(features)
# transform the feature matrix, keeping the result this time
features_l2_norm = Normalizer(norm="l2").transform(features)
features_l2_norm
'''
norm comes in two kinds: l1 and l2
'''
features_l1_norm = Normalizer(norm="l1").transform(features)
features_l1_norm
print("Sum of the first sample's values:",
      features_l1_norm[0, 0] + features_l1_norm[0, 1])
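# The row-wise effect of each norm can be verified directly: every
# L2-normalized row has unit Euclidean length, and every L1-normalized row
# (of non-negative values) sums to 1.
print("L2 row norms:", np.linalg.norm(features_l2_norm, axis=1))
print("L1 row sums:", features_l1_norm.sum(axis=1))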
def normalize(self, data): scaler = Normalizer().fit(data) data = scaler.transform(data) return data
def facerecog(video):
    # slot='slot5'
    slot = filecreator(video)
    trainX, trainy = load_datasettrain('facenettest/train/')
    print(trainX.shape, trainy.shape)
    # load test dataset; `now` avoids shadowing the builtin all()
    now = datetime.datetime.now()
    date = str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    path_test = 'facenettest/storagevideo/' + date + '/' + slot + '/'
    testX = loadfacetest(path_test)
    print(testX.shape)
    # save arrays to one file in compressed format
    savez_compressed('facenettest.npz', trainX, trainy, testX)
    data = load('facenettest.npz')
    trainX, trainy, testX = data['arr_0'], data['arr_1'], data['arr_2']
    print('Loaded: ', trainX.shape, trainy.shape, testX.shape)
    model = load_model('facenet_keras.h5')
    print('Loaded Model')
    newTrainX = list()
    for face_pixels in trainX:
        embedding = get_embedding(model, face_pixels)
        newTrainX.append(embedding)
    newTrainX = asarray(newTrainX)
    print(newTrainX.shape)
    newTestX = list()
    for face_pixels in testX:
        embedding = get_embedding(model, face_pixels)
        newTestX.append(embedding)
    newTestX = asarray(newTestX)
    print(newTestX.shape)
    savez_compressed('facenettest.npz', newTrainX, trainy, newTestX)
    data = load('facenettest.npz')
    trainX, trainy, testX = data['arr_0'], data['arr_1'], data['arr_2']
    # keep the raw test embeddings before normalization rebinds testX
    testX_faces = testX
    in_encoder = Normalizer(norm='l2')
    trainX = in_encoder.transform(trainX)
    testX = in_encoder.transform(testX)
    out_encoder = LabelEncoder()
    out_encoder.fit(trainy)
    trainy = out_encoder.transform(trainy)
    model = SVC(kernel='linear', probability=True)
    model.fit(trainX, trainy)
    print(testX.shape[0])
    present = list()
    for i in range(testX.shape[0]):
        random_face_pixels = testX_faces[i]
        random_face_emb = testX[i]
        samples = expand_dims(random_face_emb, axis=0)
        yhat_class = model.predict(samples)
        yhat_prob = model.predict_proba(samples)
        class_index = yhat_class[0]
        class_probability = yhat_prob[0, class_index] * 100
        predict_names = out_encoder.inverse_transform(yhat_class)
        print(predict_names[0] + ' ' + str(class_probability))
        # a membership test replaces the inner index loop, which reused `i`
        if predict_names[0] not in present and class_probability > 40:
            present.append(predict_names[0])
    print(present)
    return present
# (reconstructed head of the truncated standardization example, assuming the
# same toy matrix used by the blocks below)
from sklearn.preprocessing import StandardScaler
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1]
]
scale = StandardScaler()
scale.fit(X)
scale.transform(X)

# Normalization
from sklearn.preprocessing import Normalizer
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1]
]
normalizer = Normalizer(norm='l2')
normalizer.transform(X)

# Filter-style feature selection: VarianceThreshold removes features whose
# variance is below the given threshold
from sklearn.feature_selection import VarianceThreshold
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1]
]
selected = VarianceThreshold(2)
selected.fit(X)
selected.transform(X)

# Filter-style feature selection: univariate selection; SelectKBest keeps the
# k features scoring highest on a statistic, SelectPercentile keeps a percentage
from sklearn.feature_selection import SelectKBest, f_classif
# (reconstructed from the truncated opening line; train_dir is a hypothetical
# variable standing in for the training-image root directory)
face = face_recognition.load_image_file(train_dir + "/" + person + "/" + person_img)
face_bounding_boxes = face_recognition.face_locations(face)
# If the training image contains exactly one face
if len(face_bounding_boxes) == 1:
    face_enc = face_recognition.face_encodings(face)[0]
    # Add face encoding for current image with corresponding label (name) to the training data
    encodings.append(face_enc)
    names.append(person)
else:
    print(person + "/" + person_img + " was skipped and can't be used for training")
# normalize input vectors
in_encoder = Normalizer(norm='l2')
encodings = in_encoder.transform(encodings)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(names)
names = out_encoder.transform(names)
# Create and train the SVC classifier
clf = svm.SVC(gamma='scale', probability=True)
# clf = svm.SVC(kernel='linear', probability=True)
clf.fit(encodings, names)
# Load the test image with unknown faces into a numpy array
test_image = face_recognition.load_image_file('test/test.jpg')
# Find all the faces in the test image using the default HOG-based model
face_locations = face_recognition.face_locations(test_image)
def make_nn_regression(n_samples=100, n_features=100, n_informative=10, dense=False, noise=0.0, test_size=0, normalize_x=True, normalize_y=True, shuffle=True, random_state=None): X, y, w = _make_nn_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, shuffle=shuffle, random_state=random_state) if dense: X = X.toarray() if test_size > 0: cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state, test_size=test_size, train_size=1 - test_size) train, test = list(cv)[0] X_train, y_train = X[train], y[train] X_test, y_test = X[test], y[test] if not dense: X_train.sort_indices() X_test.sort_indices() else: X_train, y_train = X, y if not dense: X_train.sort_indices() X_test, y_test = None, None # Add noise if noise > 0.0: generator = check_random_state(random_state) y_train += generator.normal(scale=noise * np.std(y_train), size=y_train.shape) y_train = np.maximum(y_train, 0) if normalize_x: normalizer = Normalizer() X_train = normalizer.fit_transform(X_train) if X_test is not None: X_test = normalizer.transform(X_test) if normalize_y: scaler = MinMaxScaler() y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel() if y_test is not None: y_test = scaler.transform(y_test.reshape(-1, 1)).ravel() if X_test is not None: return X_train, y_train, X_test, y_test, w else: return X_train, y_train, w
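# Usage sketch (the private helper _make_nn_regression and the old-style
# ShuffleSplit API above must be in scope for this to run):
# X_train, y_train, X_test, y_test, w = make_nn_regression(
#     n_samples=200, n_features=50, n_informative=5,
#     noise=0.1, test_size=0.2, random_state=0)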
t1.start() t2.start() t1.join() t2.join() X, Y = result_queue1.get() test, label = result_queue2.get() print(np.shape(test)) print(np.shape(X)) partx, party = random_partition(X, Y, 600000) print(np.shape(partx)) nrm = Normalizer() partx = nrm.fit_transform(partx) print(np.shape(partx)) test = nrm.transform(test) print(np.shape(test)) parameters = [[35, 45, 36], 2.0, 0.0012044393115519438, 1.0, 3.5784753152461046e-06, 0.2751874654816516, 0.0023804489838965626, 0.0, -0.23317429215526964] hidden_layers = parameters[0] solver = ["lbfgs", "sgd", "adam"] learning_rate = ["constant", "adaptive"] cur_solver = solver[int(parameters[1])] cur_learning_rate = learning_rate[int(parameters[3])] cur_nester = False if parameters[7] == 1: cur_nester = True cur_momentum = parameters[6]
df_test_set = df_total.drop(df_train_set.index)
# final training and test data
df_train_input = pd.DataFrame(
    df_train_set.loc[:, df_train_set.columns != 'class'])
df_train_output = pd.DataFrame(df_train_set['class'])
df_test_input = pd.DataFrame(
    df_test_set.loc[:, df_test_set.columns != 'class'])
df_test_output = pd.DataFrame(df_test_set['class'])
# _____________________________________________________________________________________________________________________
# Standardize/Normalize data
# data are now named with an additional "_" at the end
# print("Standardizing the data\n" + line_str)
# alternative: row-wise normalization instead of standardization (only one of
# the two transforms can be kept, since the second overwrites the first)
# stsc = Normalizer().fit(df_train_input)
# df_train_input_ = pd.DataFrame(stsc.transform(df_train_input))
# df_test_input_ = pd.DataFrame(stsc.transform(df_test_input))
stsc = StandardScaler().fit(df_train_input)
df_train_input_ = pd.DataFrame(stsc.transform(df_train_input))
df_test_input_ = pd.DataFrame(stsc.transform(df_test_input))
# PCA
minimum_explained_variance = 0.95
pca = PCA(minimum_explained_variance)
# alternative:
# choose the number of components on our own
# number_principal_components = 10
# pca = PCA(n_components=number_principal_components)
pca.fit(df_train_input_)
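# With PCA(0.95) the number of components is chosen automatically after
# fitting; a quick sketch of inspecting what was kept:
print(pca.n_components_, "components explain",
      pca.explained_variance_ratio_.sum(), "of the variance")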
X_train = X_train.drop('index', axis=1)
X_test = X_test.drop('index', axis=1)
Y_train = Y_train.drop('index', axis=1)
Y_test = Y_test.drop('index', axis=1)
X_train.head(5)

# In[4]:

# Normalization
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
normalizer.fit(X_train)
train_norm = normalizer.transform(X_train)
test_norm = normalizer.transform(X_test)

# In[5]:

# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_norm)
# apply the fitted StandardScaler to both sets
train_data = scaler.transform(train_norm)
test_data = scaler.transform(test_norm)

# In[6]:

# Dimensionality reduction: Truncated SVD
from numpy import genfromtxt
from sklearn import cross_validation  # pre-0.18 sklearn module used below
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.externals import joblib
from grid_search import grid_estimation

# load the matrix of text features and assigned clusters
all_data = genfromtxt('features_and_clusters.csv', delimiter=',')
data = all_data[:, 0:29]
target = all_data[:, 29]

# normalization and scaling of the data
normalizer = Normalizer()
normalizer.fit(data)
data = normalizer.transform(data)
scaler = StandardScaler()
data = scaler.fit_transform(data)

# split into training and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target,
                                                                     test_size=0.4, random_state=0)
#clf = svm.SVC(kernel="rbf", gamma=0.001, C=1000).fit(X_train, y_train)
clf = svm.SVC(kernel="linear", gamma=1.0, C=1).fit(X_train, y_train)

# save the classifier, scaler and normalizer
joblib.dump(clf, 'classifier_data\\model.pkl')
joblib.dump(scaler, 'classifier_data\\scaler.pkl')
joblib.dump(normalizer, 'classifier_data\\normalizer.pkl')
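# Inference-side sketch: the persisted normalizer and scaler must be
# re-applied in the same order before predicting; new_data is a hypothetical
# array with the same 29 feature columns:
# clf = joblib.load('classifier_data\\model.pkl')
# scaler = joblib.load('classifier_data\\scaler.pkl')
# normalizer = joblib.load('classifier_data\\normalizer.pkl')
# prediction = clf.predict(scaler.transform(normalizer.transform(new_data)))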
def norm_pre(X_train): norm = Normalizer() norm.fit(X_train) return norm.transform(X_train)
attribute, label = dataset(benign_attribute, gafgyt_attribute, mirai_attribute,
                           benign_label, gafgyt_label, mirai_label)
# print(attribute[0])
#print("num of samples: ", len(label))
attribute_train, label_train, attribute_test, label_test = div_train_test(
    attribute, label, len(attribute))
#print("num of test samples: ", len(label_test))
#print("attribute_train: ",attribute_train)

# data process
# normalize the data; Normalizer copies by default, so the transformed
# arrays must be assigned back
normal = Normalizer().fit(attribute_train)
attribute_train = normal.transform(attribute_train)
attribute_test = normal.transform(attribute_test)
#print("normal processed: ", attribute_test)
print("original number of features for each sample: ", len(attribute_train[0]))
pca = PCA(n_components=0.98)
processed_train_attribute = pca.fit_transform(attribute_train)
processed_test_attribute = pca.transform(attribute_test)
print("number of features for each sample after PCA processed: ",
      len(processed_train_attribute[0]))
#print("PCA processed: ", processed_train_attribute)
print("before normalization: ", processed_train_attribute)
#scale the data and delete the outliers
X.compactness.fillna(X.compactness.mean(), inplace=True)
X.width.fillna(X.width.mean(), inplace=True)
X.groove.fillna(X.groove.mean(), inplace=True)
X.isnull().sum()

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

from sklearn.preprocessing import Normalizer
norml = Normalizer()
norml.fit(X_train)
# the trained normalizer now transforms both the train and the test data
X_trainNrm = norml.transform(X_train)
X_testNrm = norml.transform(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='randomized')
pca.fit(X_trainNrm)
train_pca = pca.transform(X_trainNrm)
test_pca = pca.transform(X_testNrm)

from sklearn.neighbors import KNeighborsClassifier
modelknn = KNeighborsClassifier(n_neighbors=9)
modelknn.fit(train_pca, Y_train)
plotDecisionBoundary(modelknn, train_pca, Y_train)
print(modelknn.score(test_pca, Y_test))
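# plotDecisionBoundary is defined elsewhere; a minimal sketch of such a helper
# for a model trained on two PCA features, assuming numeric class labels and
# that matplotlib is available:
import numpy as np
import matplotlib.pyplot as plt

def plot_decision_boundary_sketch(model, X2d, y, step=0.05):
    # evaluate the model on a grid spanning the two features
    x_min, x_max = X2d[:, 0].min() - 1, X2d[:, 0].max() + 1
    y_min, y_max = X2d[:, 1].min() - 1, X2d[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)          # shaded decision regions
    plt.scatter(X2d[:, 0], X2d[:, 1], c=y, edgecolors='k')
    plt.show()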