Example #1
    def __init__(self, 
                 YTrain_file,
                 XTrain_file,
                 XTest_file,
                 output_path,
                 normalise,
                 C,
                 class_weight,
                 ):
        """
        Arguments:
            YTrain_file: path to a joblib dump of the training labels.
            XTrain_file: path to a joblib dump of the training features.
            XTest_file: path to a joblib dump of the test features.
            output_path: where results are written.
            normalise: if True, scale each sample of XTrain/XTest to unit norm.
            C: regularisation parameter stored for the classifier.
            class_weight: class weighting; the string 'none' is mapped to None.
        """
        self.YTrain = joblib.load(YTrain_file)
        XTrain = joblib.load(XTrain_file)
        self.XTrain = XTrain.reshape(np.size(XTrain, axis=0), -1)
       
        XTest = joblib.load(XTest_file)   
        self.XTest = XTest.reshape(np.size(XTest, axis=0), -1)

        self.output_path = output_path
    
        if normalise:
            normalizer = Normalizer(copy=False)
            normalizer.transform(self.XTrain)
            normalizer.transform(self.XTest)

        self.C = C
        if class_weight == 'none':
            class_weight = None
        self.class_weight = class_weight
Example #2
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)

        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()

        trunc = self.truncated.transform(tfidf_matrix)

        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
class KNN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
Example #4
    def load_data(self):
        if not os.path.exists('features_train.txt'):
            self.feature_extraction('train.txt', 'features_train.txt')
        data_train, target_train = load_svmlight_file('features_train.txt')

        if not os.path.exists('features_test.txt'):
            self.feature_extraction('test.txt', 'features_test.txt')
        data_test, target_test = load_svmlight_file('features_test.txt')

        normalizer = Normalizer().fit(data_train)
        data_train = normalizer.transform(data_train)
        data_test = normalizer.transform(data_test)

        return data_train.toarray(), target_train, data_test.toarray(), target_test
Example #5
def kfold(agetext,k,model,nfeatures,check=False,k2 = None,max_df=0.9,min_df=3):
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        X = agetext["text"]
        X = X.tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer = tokenize,token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',lowercase=False,max_features=nfeatures,max_df = max_df,min_df = min_df,use_idf=True,ngram_range=(1,2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t","").replace("\n","") for doc in docs]
        traindocs = docs2[:7999]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:7999]
        testl = label[8000:9500]
        if(check):
            lsa = TruncatedSVD(k2, algorithm = 'arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X,tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred),2))
    print str(out)
    print np.mean(out)
Example #6
def kfold(agetext,k,model,k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm = 'arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X,y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred),5))
    print str(out)
    print np.mean(out)
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sp.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sp.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert X_norm is not X
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert X_norm is X
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix):
        X = init(X_dense)
        X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)

        assert X_norm is not X
        assert isinstance(X_norm, sp.csr_matrix)

        X_norm = toarray(X_norm)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
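
A small illustration (not part of the original test) of the pruned/unpruned distinction used above: assigning zeros into a CSR matrix's data array keeps them as explicitly stored zeros until eliminate_zeros() prunes them, which is exactly the situation the test constructs.

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1.0, 0.0], [2.0, 3.0]]))
X.data[X.indptr[1]:X.indptr[2]] = 0.0   # zero out row 1 without pruning
print(X.nnz)             # 3 -- the zeros are still stored explicitly
X.eliminate_zeros()      # pruning removes the explicitly stored zeros
print(X.nnz)             # 1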
Example #8
    def _normalize(self, X, y, X_t):
        from sklearn.preprocessing import Normalizer
        NORM = Normalizer()

        X = NORM.fit_transform(X, y)
        X_t = NORM.transform(X_t)

        return X, X_t
def readAndPreProcess():
	print("\n\n********** CS-412 HW5 Mini Project **********")
	print("************ Submitted by Sankul ************\n\n")
	print("Reading data, please ensure that the dataset is in same folder.")
	resp = pd.read_csv('responses.csv')
	print("Data reading complete!")
	print("Some stats reagarding data:")
	resp.describe()
	
	print("\nStarting pre-processing.....")
	
	print("\nFinding missing values:")
	print("Missing values found, removing them")
	emptyVals = resp.isnull().sum().sort_values(ascending=False)
	emptyPlot = emptyVals.plot(kind='barh', figsize = (20,35))
	plt.show()
	print("Empty values removed")
	
	print("\nChecking for NaN and infinite values in target column (Empathy):")
	if len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]):
		print("Number of infinite or NaN values in Empathy column: ", len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]))
		print("Removing them")
		resp = resp[np.isfinite(resp['Empathy'])]
		print("Infinite and NaN values removed")
		
	print("\nChecking for categorical features:")
	if pd.Categorical(resp).dtype.name == 'category':
		print("Categorical features found. Removing them...")
		resp = resp.select_dtypes(exclude=[object])	
		print("Categorical features removed")
		
	print("\nReplacing NaN values with the mean value:")
	resp=resp.fillna(resp.mean()) 
	resp.isnull().sum()
	print("Values replaced")
	
	print("\nSeperating labels from data:")
	Y = resp['Empathy'].values
	X = resp.drop('Empathy',axis=1)
	print("Labels seperated")
	
	print("\nScaling, standardizing and normalizing the data:")
	scaler = MinMaxScaler(feature_range=(0, 1))
	rescaledX = scaler.fit_transform(X)
	
	scaler = StandardScaler().fit(rescaledX)
	standardizedX = scaler.transform(rescaledX)
	
	normalizer = Normalizer().fit(standardizedX)
	normalizedX = normalizer.transform(standardizedX)
	print("Scaling, standardizing and normalizing completed")
	
	print("\nFinal data looks like:")
	print(normalizedX.shape)
	print("Values inside look like:")
	print(normalizedX[0])
	
	return normalizedX,Y
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)

    def transform(self, data):
        return (self.data_normalizer.transform(data) + 1) / 2
    def test_ver2_syntetic_dataset(self):

        self.ex = experiment.Experiment()
        self.ex.cf_matrix = load_sparse_data('syntetic_cf.dat')
        n = Normalizer(norm='l2', copy=True)
        self.ex.cf_matrix = n.transform(self.ex.cf_matrix) #normalized.
        self.ex.cb_prox = experiment.Experiment.load_data(PKL + 'cb_prox.pkl')
        self.ex.cf_prox = self.ex.cf_matrix * self.ex.cf_matrix.T
        self.ex.test_corr_sparsity(draw=True, interval=100)
Example #12
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                          test_size=test_size, train_size=1-test_size)

        train, test = list(cv)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
Example #13
    def normalize(self, msi, norm="l1"):
        original_shape = msi.get_image().shape
        collapsed_image = collapse_image(msi.get_image())
        # temporarily save mask, since scipy normalizer removes mask
        is_masked_array = isinstance(msi.get_image(), np.ma.MaskedArray)
        if is_masked_array:
            mask = msi.get_image().mask
        normalizer = Normalizer(norm=norm)
        normalized_image = normalizer.transform(collapsed_image)
        if is_masked_array:
            normalized_image = np.ma.MaskedArray(normalized_image, mask=mask)
        msi.set_image(np.reshape(normalized_image, original_shape))
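
A minimal sketch (assumed, not part of the original module) of why the mask is saved and re-attached around the transform above: once a MaskedArray is converted to a plain ndarray, which is effectively what happens to the transform's input, the mask is lost.

import numpy as np

masked = np.ma.MaskedArray([[3.0, 4.0]], mask=[[False, True]])
plain = np.asarray(masked)                              # the mask is dropped here
print(np.ma.isMaskedArray(plain))                       # False
restored = np.ma.MaskedArray(plain, mask=masked.mask)   # re-attach the saved mask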
def test_normalizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Normalizer
    # with sklearn.preprocessing.Normalizer

    normalizerr = NormalizerR()
    normalizerr.fit(np.concatenate(trajs))

    normalizer = Normalizer()
    normalizer.fit(trajs)

    y_ref1 = normalizerr.transform(trajs[0])
    y1 = normalizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #15
class SiftBOW(object):
    def __init__(self, dataset, n_words=300, add_global_desc=True,
                 color_sift=False):
        self.dataset = dataset
        self.n_words = n_words
        self.add_global_desc = add_global_desc
        self.normalizer = Normalizer(norm='l1')
        self.color_sift = color_sift
        if self.color_sift:
            self.feature_extractor = color_sift_descriptors
        else:
            self.feature_extractor = sift_descriptors

    def fit_transform(self, image_names, superpixels):
        descriptors, coordinates = self.feature_extractor(image_names,
                                                          self.dataset)
        print("end sift descriptors")
        vq, X = bag_of_words(descriptors, superpixels, coordinates)
        X = [self.normalizer.transform(x) for x in X]

        self.vq_ = vq
        Y = [gt_in_sp(self.dataset, f, sp) for f, sp in zip(image_names,
                                                            superpixels)]
        return DataBunch(X, Y, image_names, superpixels)

    def fit(self, image_names, spixel):
        self.fit_transform(image_names, spixel)
        return self

    def transform(self, image_names, superpixels):
        descriptors, coordinates = self.feature_extractor(image_names,
                                                          self.dataset)
        _, X = bag_of_words(descriptors, superpixels, coordinates, vq=self.vq_)
        Y = [gt_in_sp(self.dataset, f, sp) for f, sp in zip(image_names,
                                                            superpixels)]
        X = [self.normalizer.transform(x) for x in X]
        return DataBunch(X, Y, image_names, superpixels)
Example #16
	def __init__(self, nor='nor', fold=2):
		self.fold = fold
		dataframe = pandas.read_csv(open('wine.data'))
		array = dataframe.values
		# separate array into input and output components
		self.X = array[:,1:]
		self.Y = array[:,0]
		self.nor = nor
		# normalizer can turn length of vector into 1.
		if self.nor == 'nor':
			scaler = Normalizer().fit(self.X)
		else:
			scaler = MinMaxScaler().fit(self.X)

		self.X = scaler.transform(self.X)
		numpy.set_printoptions(precision=3)
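
A tiny sketch (not part of the wine example) confirming the comment above: after Normalizer, every sample vector has Euclidean length 1. The two rows are made-up values for illustration.

import numpy
from sklearn.preprocessing import Normalizer

sample = numpy.array([[3.0, 4.0], [1.0, 1.0]])     # made-up rows
normalized = Normalizer().fit_transform(sample)    # default norm is 'l2'
print(normalized)                                  # [[0.6 0.8] [0.707 0.707]]
print(numpy.linalg.norm(normalized, axis=1))       # each row has length 1.0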
Example #17
class LineTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, *args):
        self.args = args
        self.normalize = Normalizer()

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        X_test = [numpy.array([numpy.array([1 if item else 0], dtype='float64') for item in X[arg]]) for arg in
                  self.args]
        X_test = numpy.concatenate(X_test, axis=1)
        X_test = self.normalize.transform(X_test)
        return X_test

    def transform(self, X, y=None):
        return self.predict(X)
Example #18
def test_sklearn_transform():
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer,
                                   istreams=[], ostream="out")
    context = ComputationContext(computation)

    data = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", data, None))

    assert len(context.records) == 1
    assert len(context.records["out"]) == 1

    record = context.records["out"][0]
    assert record.key == "transform"
    assert np.allclose(transformer.transform(X_test), json.loads(record.data))
Example #19
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode', ngram_range=(1,2), stop_words='english',
        sublinear_tf=True, use_idf=True, smooth_idf=True)

    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)

    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)
    
    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
            )
            tfv.fit(text)

            Q = tfv.transform(list(train["q"]))
            R = tfv.transform(list(train["q_ex"]))
            X = tfv.transform(list(train["t"]))
            Qt = tfv.transform(list(test["q"]))
            Rt = tfv.transform(list(test["q_ex"]))
            Xt = tfv.transform(list(test["t"]))

            svd = TruncatedSVD(n_components=svd_components)
            svd.fit(vstack([Q, X, Qt, Xt]))
            normalizer = Normalizer()

            Q = svd.transform(Q)
            Q = normalizer.transform(Q)
            R = svd.transform(R)
            R = normalizer.transform(R)
            X = svd.transform(X)
            X = normalizer.transform(X)

            Qt = svd.transform(Qt)
            Qt = normalizer.transform(Qt)
            Rt = svd.transform(Rt)
            Rt = normalizer.transform(Rt)
            Xt = svd.transform(Xt)
            Xt = normalizer.transform(Xt)

            colname = "cos_dist_%d_%d" % (ngram, svd_components)
            D = [np.dot(a, b) for (a, b) in zip(Q, X)]
            Dt = [np.dot(a, b) for (a, b) in zip(Qt, Xt)]
Example #21
class IWTBA():
    def __init__(self, svd=True, categorizer=True):
        # Triggers for building models
        self.svd=svd
        self.categorizer=categorizer
        # Initialize Tokenizer Parts
        # Create regex pattern
        self.re_pattern = re.compile('[^a-zA-Z]')

        # Create stemmer
        self.stemmer = SnowballStemmer('english')

        # Create stopwords
        self.eng_stop = set(stopwords.words('english'))

        # Placeholders
        self.feat_mat = None
        self.feat_labels = None

        self.course_list = None
        self.course_id_to_index = None
        self.jobs_titles = None

        self.cat_id_to_name = None
        self.course_cats_binarized = None
        self.label_arr_to_cat_id = None

    # ----------------
    # Read in Corpus
    # ----------------
    def concatenate_coursera_text_data(self, course_dict):
        """helper function for parsing coursera courses"""
        name = course_dict['name']
        syllabus = BeautifulSoup(course_dict['courseSyllabus']).text
        short_desc = course_dict['shortDescription']
        about = BeautifulSoup(course_dict['aboutTheCourse']).text
        return " ".join([name, syllabus, short_desc, about])

    def _get_coursera_corpus(self):
        """collect coursera course text and metadata"""
        with open('./data/coursera/coursera_courses.json') as c_file:
            coursera_courses = json.load(c_file)

        course_id_to_index = {} # dict to allow reverse searching from id
        course_text_list = []
        course_list = []
        course_categories = []

        i = 0
        for course in coursera_courses['elements']:
            if course['language'] == 'en':
                course_id_to_index[course['id']] = i
                course_text_list.append(self.concatenate_coursera_text_data(course))
                course_list.append(course)
                if self.categorizer:
                    course_categories.append(course['links'].get('categories', [-1]))
                i += 1


        if self.categorizer:
            # get category list
            cat_info_list = coursera_courses['linked']['categories']
            self.cat_id_to_name = {cat['id']:
                {'name':cat['name'], 'shortName':cat['shortName']} for cat in cat_info_list}

            # binarize labels and discard low-count categories    
            mlb = MultiLabelBinarizer()
            course_cats_binarized = mlb.fit_transform(course_categories)

            # filter to only tags with > 40 courses
            mask = course_cats_binarized.sum(axis=0) > 40
            course_cats_binarized = course_cats_binarized[:, mask]
            self.course_cats_binarized = course_cats_binarized

            # create dict to get back from masked index, to index, to id
            label_arr_to_cat_id = {}
            for i, k in enumerate(mask.nonzero()[0].tolist()):
                label_arr_to_cat_id[i] = mlb.classes_[k]

            self.label_arr_to_cat_id = label_arr_to_cat_id


        return course_list, course_text_list, course_id_to_index

    def _get_nyc_corpus(self):
        """collect nyc gov't job descriptions and titles"""
        ny_jobs_raw = open('./data/nyc/ny_jobs_data.json').read()
        ny_jobs_data = json.loads(ny_jobs_raw)

        ny_jobs_columns = [col['fieldName'] for col in ny_jobs_data['meta']['view']['columns']]

        ny_jobs_desc_index = ny_jobs_columns.index('job_description')
        ny_jobs_title_index = ny_jobs_columns.index('business_title') #there's also civic title

        ny_jobs_descriptions = []
        ny_jobs_titles = []

        for job in ny_jobs_data['data']:
            ny_jobs_descriptions.append(job[ny_jobs_desc_index])
            ny_jobs_titles.append(job[ny_jobs_title_index])

        return ny_jobs_titles, ny_jobs_descriptions

    def _get_github_corpus(self):
        """collect github job descriptions and titles"""
        git_data = json.load(open('./data/github/github_postings'))
        git_jobs_titles = []
        git_jobs_descriptions = []

        for job in git_data:
            git_jobs_titles.append(job['title'])
            git_jobs_descriptions.append(BeautifulSoup(job['description']).text)

        return git_jobs_titles, git_jobs_descriptions

    def get_corpus(self, coursera=True, nyc=True, github=True):
        """collect data sets, return combined corpus and store metadata"""
        combined_text = []
        job_titles = []
        if coursera:
            course_list, course_text_list, course_id_to_index = self._get_coursera_corpus()
            combined_text += course_text_list
            self.course_list = course_list
            self.course_id_to_index = course_id_to_index
        if nyc:
            ny_jobs_titles, ny_jobs_descriptions = self._get_nyc_corpus()
            combined_text += ny_jobs_descriptions
            job_titles.extend(ny_jobs_titles)
        if github:
            git_jobs_titles, git_jobs_descriptions = self._get_github_corpus()
            combined_text += git_jobs_descriptions
            job_titles.extend(git_jobs_titles)
        if job_titles:
            self.job_titles = job_titles

        return combined_text

    #--------------------
    # Model Building and Processing
    #--------------------
    def tokenize_text(self, text):
        """clean and tokenize a job description"""
        #should modify this to get rid of single letter words or ' caused junk
        clean_text = self.re_pattern.sub(" ", text).lower()
        tokenized_desc = [self.stemmer.stem(word) for word in clean_text.split() if word not in self.eng_stop]
        return tokenized_desc

    def _fit_svd(self, feat_mat, svd_comps):
        self.svd = TruncatedSVD(n_components=svd_comps)
        feat_mat = self.svd.fit_transform(feat_mat)
        self.normalizer = Normalizer(copy=False)
        self.normalizer.transform(feat_mat)
        return feat_mat        

    def _fit_categorizer(self):
        classifier = SVC(kernel='linear', probability=True, class_weight='auto')
        cat_clf = OneVsRestClassifier(classifier)
        cat_clf.fit(self.feat_mat[:len(self.course_list), :], self.course_cats_binarized)
        self.categorizer = cat_clf

    def fit(self, svd_comps=1000):
        """fit the tfidf vectorizer (and svd) and store it and the resulting feature matrix"""
        vectorizer = TfidfVectorizer(tokenizer=self.tokenize_text)
        feat_mat = vectorizer.fit_transform(self.get_corpus())
        self.vectorizer = vectorizer
        if self.svd:
            feat_mat = self._fit_svd(feat_mat, svd_comps)
        self.feat_mat = feat_mat
        if self.categorizer:
            self._fit_categorizer()
        self.feat_labels = vectorizer.get_feature_names()

    def vectorize(self, input_text):
        vector = self.vectorizer.transform([input_text])
        if self.svd:
            vector = self.svd.transform(vector)
            self.normalizer.transform(vector)
        return vector

    #--------------------
    # Result Functions
    #--------------------
    def _get_course_sims(self, input_text):
        """get course similarities."""
        input_vect = self.vectorize(input_text)
        c_feat_mat = self.feat_mat[:len(self.course_list), :]
        cos_sims = np.dot(c_feat_mat, input_vect.T) #nx1 shape
        if type(cos_sims) != np.ndarray: #tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        return cos_sims   

    def get_n_most_similar_course_indices(self, input_text, n=5, threshold=.3):
        """get n most similar indices, sorted, from a sparse matrix"""
        input_vect = self.vectorize(input_text)
        c_feat_mat = self.feat_mat[:len(self.course_list), :]
        cos_sims = np.dot(c_feat_mat, input_vect.T)
        if type(cos_sims) != np.ndarray: #tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        n = min(n, np.sum(cos_sims > threshold)) # return only good courses
        n = max(n, 1) # return at least 1 course
        top_n_indices = np.argsort(cos_sims, axis=0)[-1:-(n + 1):-1, 0]
        return top_n_indices.ravel().tolist()

    def build_recommend_table(self, input_text, n=5):
        """
        Collect meta data from recommended courses,
        and then return a table for displaying recommendations.
        """
        indices = self.get_n_most_similar_course_indices(input_text, n=n)
        header = ['Course Name', 'Course Description']
        table = [header]
        for i in indices:
            course = self.course_list[i]
            name = course['name']
            short_desc = course['shortDescription']
            url = 'https://www.coursera.org/course/' + course['shortName']
            table.append([name, short_desc, url])
        return table

    def _get_job_category_scores(self, input_text):
        """
        Get decision function results for categorizer.
        """
        vect = self.vectorize(input_text)
        cat_scores = self.categorizer.decision_function(vect)
        return cat_scores

    def get_job_categories(self, input_text, threshold=.034):
        """
        Classify posting and return categories.
        Threshold of 0.034 corresponds to a .05 false positive rate
        """
        cat_scores = self._get_job_category_scores(input_text)
        cat_predictions = cat_scores > threshold
        cat_names = []
        for i in cat_predictions.nonzero()[1].tolist():
            cat_id = self.label_arr_to_cat_id[i]
            cat_name = self.cat_id_to_name[cat_id]['name']
            cat_names.append(cat_name)
        return cat_names

    def get_n_most_similar_job_indices(self, input_text, n=3, threshold=.3):
        """get n most similar job indices, sorted"""
        input_vect = self.vectorize(input_text)
        j_feat_mat = self.feat_mat[len(self.course_list):, :]
        cos_sims = np.dot(j_feat_mat, input_vect.T)
        if type(cos_sims) != np.ndarray: #tfidf is in sparse format
            cos_sims = np.array(cos_sims.todense())
        n = min(n, np.sum(cos_sims > threshold)) # return only good courses
        n = max(n, 1) # return at least 1 course
        top_n_indices = np.argsort(cos_sims, axis=0)[-1:-(n + 1):-1, 0]
        return top_n_indices.ravel().tolist()

    def get_job_titles_from_indices(self, top_n_indices):
        """return titles from indices."""
        titles = []
        for i in top_n_indices:
            job_title = self.job_titles[i]
            if job_title not in titles:
                titles.append(job_title)
        return titles

    def build_course_row(self, course_id):
        """collect course metadata and build row for recommendation page.
        Input: Course ID
        Output: List of strings.
            Course IMG as url
            Course Title
            Course URL
            Course Description
            Course Categories
        """
        course = self.course_list[course_id]
        c_name = course['name']
        c_img = course['smallIcon']
        #url works for coursera only
        c_url = 'https://www.coursera.org/course/' + course['shortName']
        c_desc = course['shortDescription']
        c_cats = [self.cat_id_to_name[cat_id]['name'] for cat_id in course['links']['categories']]
        return c_name, c_img, c_url, c_desc, c_cats

    def build_recommend_page(self, input_text, thresh=.3):
        have_recommendations = False

        # get n job titles > threshold
        job_titles = []
        job_indices = self.get_n_most_similar_job_indices(input_text, n=3, threshold=thresh)
        job_titles = self.get_job_titles_from_indices(job_indices)

        # get and sort course similarities
        course_sims = self._get_course_sims(input_text) #nx1 shape
        sorted_sim_indices = course_sims.argsort(axis=0)[::-1, :] #sorted descending

        # get category scores for job posting
        job_cat_scores = self._get_job_category_scores(input_text)

        # get category scores for each recommended course
        # 1 * job cat score if cat has tag, 0 otherwise
        course_cat_scores = job_cat_scores * self.course_cats_binarized

        # for each course, the largest value becomes its parent category
        course_parent_cat = np.argmax(course_cat_scores, axis = 1)

        # best recommendations thresholds
        # get courses with cos sim > high thresh
        max_best = 3
        high_thresh = .5
        best_course_ids = []
        for course_id in sorted_sim_indices:
            if course_sims[course_id] > high_thresh:
                best_course_ids.append(course_id)
        if best_course_ids:
            best_course_ids = best_course_ids[:max_best]
            have_recommendations = True
                
        # for each category in job_cat_scores > thresh
        # create list of courses with sim > thresh
        # get course recommendations by category
        cat_list = []
        thresh_mask = (course_sims > thresh).ravel()
        for i in np.argsort(job_cat_scores[0]):
            if job_cat_scores[0][i] > .034:
                cat_mask = course_parent_cat == i
                cat_and_thresh_mask = np.logical_and(cat_mask, thresh_mask)
                valid_courses = cat_and_thresh_mask.nonzero()[0].tolist()
                if valid_courses:
                    course_order = np.argsort(course_sims[cat_and_thresh_mask], axis=0)[::-1, :]
                    cat_id = self.label_arr_to_cat_id[i]
                    cat_name = self.cat_id_to_name[cat_id]['name']
                    cat_list.append([cat_name, [valid_courses[i] for i in course_order]])

        if cat_list:
            have_recommendations = True

        # if have_recommendations is False and job_indices:
        #     job_titles, best_course_ids, cat_list = self._rec_from_jobs_only(input_text)

        return job_titles, best_course_ids, cat_list, have_recommendations
# .. your code here ..

from sklearn.preprocessing import Normalizer
normalizer = Normalizer(copy=True)
normalizer.fit(X_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
# .. your code here ..
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
# .. your code here ..
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
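
The notes above boil down to one rule: fit every pre-processing step on the training data only, then reuse the fitted objects on the test data. A minimal sketch of the same idea with a Pipeline, assuming the X_train/X_test arrays from above plus label arrays y_train/y_test that this excerpt does not show:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

model = make_pipeline(Normalizer(copy=True), PCA(n_components=2),
                      KNeighborsClassifier(n_neighbors=5))
model.fit(X_train, y_train)           # every step is fit on the training data only
print(model.score(X_test, y_test))    # test data is only transformed, never fit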
class ContentRecommend(object):
    create_date = datetime.utcnow()
    days = 15
    training_end = datetime.utcnow()
    db = None
    n_components = 20  # Number of dimension for TruncatedSVD
    account = ''
    svd = None
    normalizer = None
    svdX = None
    vectorizor = None
    training_docs = None
    threshold = 0.25
    k_means = None
    sil_score = -1.0
    cluster_count = 0
    range_n_clusters = [3, 4, 5, 6, 7, 8]
    missionId = ''

    def __init__(self, mission_id, db_name='plover_development', db_port=27017, db_host='localhost'):
        self.missionId = mission_id
        config.LOGGER.info('Instantiation recommender')
        self.connect(db_name, self.missionId, db_port=db_port, db_host=db_host)
        config.LOGGER.debug("Loading NLTK stopword list for English")

    def connect(self, db_name="plover_development", mission_id="", db_port=27017, db_host='localhost'):
        config.LOGGER.info('Instantiating recommender object for mission %s', mission_id)
        config.LOGGER.debug('Using database %s, host %s and port %s', db_name, db_host, db_port)

        try:
            client = MongoClient(db_host, db_port)
            self.db = client[db_name]
            profile = self.db.socialProfile.find_one({'mission': ObjectId(self.missionId)})
            self.account = self.db.linkedAccount.find_one({'_id': profile['account']})
            if self.account is None:
                config.LOGGER.debug('No such account id')
            self.setup_training(days=30)
        except Exception as ex:
            config.LOGGER.error("Error %s opening mission _id=%s", ex.message, self.missionId)


    def get_updates(self, maximum=100, conditions={}):
        documents = []
        config.LOGGER.info('Getting timeline updates for mission %s', self.missionId)
        config.LOGGER.debug(' query condition: %s', json.dumps(conditions, default=json_util.default))
        try:
            if self.account is None:
                config.LOGGER.debug('No account id')
            else:
                projection = {'keywords': 1, 'text': 1, 'externalID': 1, 'postTime': 1, 'sender': 1,
                              'quotedStatus': 1}
                updates = self.db.statusUpdate.find(conditions, projection).sort('postTime', pymongo.DESCENDING).limit(maximum)
                for tw in updates:
                    if 'quotedStatus' in tw:
                        tw['text'] += " QT " + tw['quotedStatus']['text']
                        for keyword in tw['quotedStatus']['keywords']:
                            tw['keywords'].append(keyword)
                    smu = self.db.socialMediaUser.find_one({'_id': tw['sender']}, {'screenNameLC': 1})
                    if smu is not None:
                        tw['keywords'].append(smu['screenNameLC'])
                    documents.append(tw)


        except Exception as ex:
            config.LOGGER.error("Error %s getting updates from timeline for mission %s", ex.message, self.missionId)

        config.LOGGER.debug('Found %d updates in timeline', len(documents))
        return documents



    def topics(self, n_components, n_out=7, n_weight=5, topic=None):
        config.LOGGER.info('Get topics from timeline for %s', self.account['profile']['preferredUsername'])
        results = []
        terms = self.vectorizer.get_feature_names()
        if topic is None:
            for k in range(n_components):
                idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
                sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
                weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])

                for item in sorted_idx[0:n_out - 1]:
                    results.append({'term': terms[item[0]], 'weight': item[1]})
        else:
            m = max(self.svd.components_[topic])
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[topic])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])

            for item in sorted_idx[0:n_out - 1]:
                results.append({'term': terms[item[0]], 'weight': item[1]})
        return results

    def get_componentCount(self, min=.05):
        count = 0
        for k in range(len(self.svd.components_)):
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            kcount = 0
            for entry in (sorted_idx):
                if entry[1] > min:
                    kcount += 1
                else:
                    break
            if kcount > count:
                count = kcount
        return count

    def setup_training(self, end_time=datetime.utcnow(), days=15, maximum=1000):
        try:
            start = end_time - timedelta(minutes=days*24*60)
            condition = {'missions': ObjectId(self.missionId), '$or': [{'favorited': True}, {'sentByMe': True}],
                        'postTime': {'$gt': start, '$lte': end_time},
                         '$nor':[{'keywords':{'$exists':False}},{'keywords':{'$size':1}},{'keywords':{'$size':2}}]}
            self.training_docs = self.get_updates(conditions=condition, maximum=10000)
            config.LOGGER.info('Train model for %s', self.account['profile']['preferredUsername'])
            if len(self.training_docs) > 50:
                config.LOGGER.debug('Found %d updates for training from %s', len(self.training_docs),
                                    self.account['profile']['preferredUsername'])
                self.training_end = end_time
                self.days = days

                trainingRaw = [' '.join(doc['keywords']) for doc in self.training_docs]
                #trainingRaw = [tw['text'] for tw in self.training_docs]
                self.vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=500, use_idf=True,
                                                  strip_accents='ascii', )
                X = self.vectorizer.fit_transform(trainingRaw)
                if X.shape[1] <= self.n_components:
                    self.n_components = X.shape[1] - 1
                config.LOGGER.debug('%d components found for  SVD', self.n_components)
                self.svd = TruncatedSVD(self.n_components, algorithm='arpack')
                self.svdX = self.svd.fit_transform(X)
                # self.n_components = self.get_componentCount(self.threshold)
                # self.svd = TruncatedSVD(self.n_components, random_state=10)
                # self.svdX = self.svd.fit_transform(X)
                self.normalizer = Normalizer().fit(self.svdX)
                self.svdX = self.normalizer.transform(self.svdX)

                # Clustering
                config.LOGGER.debug('Determining cluster count ')
                for n_clusters in self.range_n_clusters:
                    self.k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10,
                                          verbose=False, random_state=10)
                    self.k_means.fit(self.svdX)
                    score = metrics.silhouette_score(self.svdX, self.k_means.labels_)
                    if score > self.sil_score:
                        self.sil_score = score
                        self.cluster_count = n_clusters

                config.LOGGER.debug('Cluster count is %d, Silhouette Coefficient is %0.3f  ', self.cluster_count,
                                    self.sil_score)
                self.k_means = KMeans(n_clusters=self.cluster_count, init='k-means++', max_iter=100, n_init=4,
                                      verbose=False, random_state=10)
                self.k_means.fit(self.svdX)

                # now get the top tweets for each cluster
                x_transform = self.k_means.transform(self.svdX)
                x_predict = self.k_means.predict(self.svdX)

                self.all_cluster_dist = []
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(x_predict)):
                        if x_predict[j] == i and sum(self.svdX[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i, 'dist': np.sqrt(sum([y * y for y in x_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    self.all_cluster_dist.append(newlist)

                #now verify this
                self.self_test()

            else:
                config.LOGGER.info('Too few training updates from user timeline')
                self.svd = None
        except Exception as ex:
            config.LOGGER.exception("Error %s computing SVD and kmeans from user history for mission %s", ex.message,
                                self.missionId)


    def self_test(self):
        try:
            config.LOGGER.info("Beginning self test. Better if it were cross validation but not enough data for that")
            results = self.find_recommendations(self.training_docs, top=10, quality=.001, min_examples=1)
            config.LOGGER.info("Self test found %d recommendations", len(results))
            for rec in results:
                if rec['text'] != rec['samples_svd'][0]:
                    config.LOGGER.error("Error training SVD for mission %s in tweet %s", self.missionId, rec['text'])
        except Exception as ex:
            config.LOGGER.error("Error in self test building training for mission %s", ex.message, self.missionId)


    def find_recommendations(self, tweets=[], top=10, quality=.1, min_examples=1):

        working_list = []
        result_list = []
        try:
            config.LOGGER.info('Generating content recommendations for user %s',
                               self.account['profile']['preferredUsername'])
            if self.svd is not None:
                if len(tweets) < top:
                    config.LOGGER.debug("Too few tweets passed for recommendation")
                    return []

                #tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
                #tweetText = [tw['text'] for tw in tweets]
                tweetText = [' '.join(tw['keywords']) for tw in tweets]
                Y = self.vectorizer.transform(tweetText)
                svdY = self.svd.transform(Y)
                svdY = self.normalizer.transform(svdY)
                y_transform = self.k_means.transform(svdY)
                # terms = self.vectorizer.get_feature_names()

                selected_updates = []
                y_predict = self.k_means.predict(svdY)

                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(y_predict)):
                        if y_predict[j] == i and sum(svdY[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i, 'dist': np.sqrt(sum([y * y for y in y_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    selected_updates.append(newlist)

                temp = [entry for entry in it.izip_longest(*selected_updates)]
                clean_list = filter(lambda x: x is not None, [entry for tuple in temp for entry in tuple])[0:top]
                clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
                config.LOGGER.debug("Found %i possible matches in topic clusters " % len(clean_list_svdY))

                neigh = NearestNeighbors()
                neigh.fit(self.svdX)
                if len(clean_list_svdY) > 0:
                    distances, svd_neighbors = neigh.radius_neighbors(X=clean_list_svdY, radius=quality)
                else:
                    svd_neighbors =[]

                examples=[]
                for idx, entry in enumerate(svd_neighbors):
                    if len(entry) >= min_examples:
                        config.LOGGER.debug("Suggested tweet has %d examples" % len(entry))
                        original = tweets[clean_list[idx]['index']]['text']
                        for jdx, neighbor in enumerate(entry):
                            examples.append({'text':self.training_docs[neighbor]['text'], 'dist':distances[idx][jdx]})
                        sorted_examples = sorted(examples, key=operator.itemgetter('dist'), reverse=False)
                        min_examples = [item['text'] for item in sorted_examples][:min_examples]
                        t1 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][0]['index']]['text']
                        t2 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][1]['index']]['text']
                        working_list.append({"dist": sorted_examples[0]['dist'], "text": original,
                                                     "id": str(tweets[clean_list[idx]['index']]['_id']),
                                                     "sender": str(tweets[clean_list[idx]['index']]['sender']),
                                                     'samples_svd': min_examples, 'samples_cluster':[t1,t2]})

                result_list = sorted(working_list, key=operator.itemgetter('dist'), reverse=False)
            return result_list[:top]

        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []

    def recommend_from_timeline(self, end_time=datetime.utcnow(), minutes_prior=15, top=10, quality=.1, min_examples=1):

        try:
            config.LOGGER.info("generating content recommendation from timeline for %s" % self.account['profile']['preferredUsername'])
            results = []
            if self.svd is not None:
                start = end_time - timedelta(minutes=minutes_prior)
                condition = {'missions': ObjectId(self.missionId), '$or': [{'favorited': False}, {'sentByMe': False}, {'mentionsMe' : False},{'retweetOfMe':False}],
                            'postTime': {'$gt': start, '$lte': end_time},
                              '$nor':[{'keywords':{'$exists':False}},{'keywords':{'$size':1}},{'keywords':{'$size':2}}]}
                tweets = self.get_updates(maximum=10000, conditions=condition)
                config.LOGGER.debug('%d updates from account timeline read from database', len(tweets))
                results = self.find_recommendations(tweets, top=top, quality=quality, min_examples=min_examples)
                config.LOGGER.debug('%d recommendations found for mission %s', len(tweets), self.missionId)
            return results[:top]

        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []
Example #24
normalizer = Normalizer(copy=False)
X = lsa.fit_transform(X)
data = normalizer.fit_transform(X)

### after several experiments linear svc gave the best result:
from sklearn.svm import SVC
clf = OneVsRestClassifier(SVC(kernel='linear'))
#### prepare training set and testing set:
#******************************************TRAINING AND TESTING SEPERATE*****************************************************
clf.fit(data, label)
print "finished fitting model for age-group classification"

#### prepare the test data text datasets #######
X_test = vec.transform(testdocs)
testdata = lsa.transform(X_test)
testdata = normalizer.transform(testdata)
### getting the predicted values: 
pred = clf.predict(testdata)
outdfage = pd.DataFrame({"userid": testuids, "agegroup": pred.tolist()})
#outdf["agegroup"] = pred.tolist()
outdf = pd.merge(outdf,outdfage,on="userid")

print "Gender and Age Group Classification Process finished"
#
####################################### personality ##############################
profile_path = path + "profile/profile.csv"
o = open(profile_path,'rU')
profiletb = csv.DictReader(o)

### read the texttb csv file to get the user ids and corresponding cleaned text data
# texttb = pd.read_csv("texttb.csv",encoding="latin-1")
#traindata = pd.read_csv('kdd/binary/kddtrain.csv', header=None)
testdata = pd.read_csv('kdd/binary/kddtest.csv', header=None)

#X = traindata.iloc[:,0:42]
#Y = traindata.iloc[:,0]
C = testdata.iloc[:, 0]
T = testdata.iloc[:, 1:42]
'''
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
#print(trainX[0:5,:])
'''
scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
#print(testT[0:5,:])

#y_train = np.array(Y)
y_test = np.array(C)

# reshape input to be [samples, time steps, features]
#X_train = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
X_train = np.reshape(testT, (testT.shape[0], 1, testT.shape[1]))

batch_size = 32

# 1. define the network
model = Sequential()
Example #26
import pickle
import numpy as np
from libs.arcface.arcface import Arcface
from sklearn.preprocessing import Normalizer
l2_normalizer = Normalizer('l2')

model = Arcface()
from PIL import Image
face_data = "FaceName\\"
encoding_dict = {}
for face_names in os.listdir(face_data):
    person_dir = os.path.join(face_data, face_names)

    embeddings = []
    for image_name in os.listdir(person_dir):
        image_path = os.path.join(person_dir, image_name)

        img = cv2.imread(image_path)
        encode = model(img)
        encode = encode[0]
        embeddings.append(encode)

    if embeddings:
        encode = np.sum(embeddings, axis=0)
        encode = l2_normalizer.transform(encode.reshape(1, -1))[0]
        encoding_dict[face_names] = encode

path = 'encodings/encodings.pkl'
with open(path, 'wb') as file:
    pickle.dump(encoding_dict, file)
# Normalize data (length of 1)
from sklearn.preprocessing import Normalizer
import pandas
import numpy
url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data
numpy.set_printoptions(precision=3)
print(normalizedX[0:5,:])
    def normalize_scaler(self):  # Normalize the data to unit norm.
        scaler = Normalizer().fit(self.__x)
        nor_x = scaler.transform(self.__x)
        return nor_x
testdata = pd.read_csv('kdd/multiclass/Testing.csv', header=None)


X = traindata.iloc[:,0:42]
Y = traindata.iloc[:,42]
C = testdata.iloc[:,42]
T = testdata.iloc[:,0:42]

trainX = np.array(X)
testT = np.array(T)

trainX.astype(float)
testT.astype(float)

scaler = Normalizer().fit(trainX)
trainX = scaler.transform(trainX)

scaler = Normalizer().fit(testT)
testT = scaler.transform(testT)

y_train = np.array(Y)
y_test = np.array(C)


X_train = np.array(trainX)
X_test = np.array(testT)


batch_size = 64

# 1. define the network
Example #30
from sklearn import metrics
from sklearn.preprocessing import Normalizer
import h5py
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

traindata = pd.read_csv('classical/data/train.csv', header=None)
testdata = pd.read_csv('data/valid.csv', header=None)

X = traindata.iloc[:, 1:61]
Y = traindata.iloc[:, 0]
C = testdata.iloc[:, 0]
T = testdata.iloc[:, 1:61]

scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
#print(trainX[0:5,:])

scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
#print(testT[0:5,:])

y_train = np.array(Y)
y_test = np.array(C)

# reshape input to be [samples, time steps, features]
X_train = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    def regression_step(self, file, step, mdl):

        data = pd.read_csv(file)  #, index_col = 'Munic&Year')
        features = list(data.columns[3:-1])
        print(features)
        out = pd.Series()
        NN = len(data) / step
        NN = int(NN)
        datay = data.loc[:, "NextYear"]
        datax = data[features]

        names = data['Munic'].drop_duplicates().values

        scaler = Normalizer().fit(datax)
        #scalx = scaler.transform(datax)
        #normedx = normalize(scalx)
        normedx = scaler.transform(datax)
        norm = pd.DataFrame(data=normedx, columns=features, index=data.index)

        norm_data = pd.merge(norm,
                             datay,
                             how='right',
                             left_index=True,
                             right_index=True,
                             sort=False)

        prog = 0

        for nn in names:

            test = norm_data.loc[data['Munic'] == nn]
            train = norm_data.loc[data['Munic'] != nn]

            y = train["NextYear"]
            x = train[features]

            x_test = test[features]

            names = data.loc[test.index, 'Munic&Year']
            rates = data.loc[test.index, 'NextYear']

            if mdl == 1:

                clf = RandomForestRegressor(n_estimators=200)
                model = clf.fit(x, y)

            elif mdl == 2:

                clf = RandomForestRegressor(n_estimators=500)
                abc = AdaBoostClassifier(n_estimators=100,
                                         base_estimator=clf,
                                         learning_rate=1)
                model = abc.fit(x, y)

            elif mdl == 3:

                gb = GradientBoostingRegressor(n_estimators=500,
                                               learning_rate=1,
                                               warm_start=True)
                model = gb.fit(x, y)

        # if nn % 10 == 0 :
            if prog % 7 < 2:
                print(
                    sorted(zip(
                        map(lambda x: round(x, 4), model.feature_importances_),
                        features),
                           reverse=True))

            est = model.predict(x_test)

            aa = {'Munic&Year': names, 'model': est, 'R/E': rates}
            add = pd.DataFrame(aa)

            out = out.append(add)
            prog += len(test)
            print(str(round(prog / len(data) * 100, 2)) + '% Complete')

        out.to_csv("./Model/AllYears_Regressed.csv")
Example #32
    'arr_2'], data['arr_3']
print('Dataset: train=%d, test=%d' % (trainX.shape[0], testX.shape[0]))
print(trainX.shape)

PCA_COMPONENT = 400
pca = PCA(n_components=PCA_COMPONENT)

trainX = np.reshape(trainX, (len(trainX), 4096))
testX = np.reshape(testX, (len(testX), 4096))

trainX = pca.fit_transform(trainX)
testX = pca.transform(testX)  # reuse the PCA already fit on the training data

# normalize input vectors
in_encoder = Normalizer(norm='l2')
trainX = in_encoder.transform(trainX)
testX = in_encoder.transform(testX)

# label encode targets
#out_encoder = LabelEncoder()
#out_encoder.fit(trainy)
#trainy = out_encoder.transform(trainy)
#testy = out_encoder.transform(testy)

# fit model
model = SVC(kernel='linear', probability=True)
model.fit(trainX, trainy)
# predict
yhat_train = model.predict(trainX)
yhat_test = model.predict(testX)
# score
# In this lab setting, you have both train+test data; but in the wild,
# you'll only have your training data, and then unlabeled data you want to
# apply your models to.
#
normalizer = Normalizer()
normalizer.fit(X_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
X_train_n, X_test_n = normalizer.transform(X_train), normalizer.transform(
    X_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
pca = PCA(n_components=2, svd_solver='randomized')  # RandomizedPCA was removed from sklearn
pca.fit(X_train_n)
X_train_pca = pca.transform(X_train_n)
X_test_pca = pca.transform(X_test_n)
                             mean_absolute_error, roc_curve,
                             classification_report, auc)

data = pd.read_csv('CICIDS2017-full.csv', header=0)
print(data.head())

X = data.iloc[0:, 0:78]
print(X.head())
Y = data.iloc[0:, 78]
print(Y.head())

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)
print("1.....")

scaler = Normalizer().fit(Xtrain)  # train
trainX = scaler.transform(Xtrain)

testT = scaler.transform(Xtest)  # reuse the scaler fitted on the training data

traindata = np.array(trainX)
trainlabel = np.array(Ytrain)

testdata = np.array(testT)
testlabel = np.array(Ytest)

model = LogisticRegression()
model.fit(traindata, trainlabel)

# make predictions
expected = testlabel
Beispiel #35
0
############################################################################

# rescale feature values of observations to have unit norm (a total length of 1.0)
# the following relies on from sklearn.preprocessing import Normalizer

# Use normalizer with a norm argument

# Create feature matrix
features = np.array([[0.5, 0.5], [1.1, 3.4], [1.5, 20.2], [1.63, 34.4],
                     [10.9, 3.3]])

# Create normalizer
normalizer = Normalizer(norm="l2")

# Transform feature matrix
print(normalizer.transform(features), "use norm to transform features")

##Many rescaling methods (e.g., min-max scaling and standardization)
##operate on features; however, we can also rescale across individual
##observations. Normalizer rescales the values of individual observations
##to have unit norm (a total length of 1). This type of rescaling
##is often used when we have many equivalent features (e.g., text classification
##when every word or n-word group is a feature).

##Normalizer provides three norm options with Euclidean norm (often called L2)
##being the default argument.

# Transform feature matrix
features_l2_norm = Normalizer(norm="l2").transform(features)

# Show feature matrix
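# A quick illustrative check (not part of the original snippet): each row of the
# L2-normalized matrix has Euclidean length 1.
print(features_l2_norm)
print(np.linalg.norm(features_l2_norm, axis=1))  # expected: ones, up to rounding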
Beispiel #36
0
def K_Neighbors(train_A, words_of_tweets, extra_features, feature_selection,
                encoding, print_file):
    reading = Twitter_Depression_Detection.Reader(
    )  # Import the ClassRead.py file, to get the encoding

    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the roc-auc score running average list
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Above 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize your 10-fold cross-validation
    # Set shuffle=True to randomize your splits on your training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Set up for loop to run for the number of cross vals you defined in your parameter
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes your train and test data for your cross-validation and sorts them in random order, since we set shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index],
                                          train_index, extra_features,
                                          feature_selection, encoding,
                                          print_file), reading.get_enc(
                                              x[test_index], 0, y[test_index],
                                              test_index, extra_features,
                                              feature_selection, encoding,
                                              print_file)
        y_train, y_test = y[train_index], y[test_index]

        #######################################################################################################################

        # leaf_size: int, optional(default=30)

        # p : integer, optional (default = 2)
        # When p = 1, this is equivalent to using manhattan_distance (l1),
        # and euclidean_distance (l2) for p = 2.
        # For arbitrary p, minkowski_distance (l_p) is used.

        # algorithm : {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, optional Algorithm used to compute the nearest neighbors:
        # ‘ball_tree’ will use BallTree
        # ‘kd_tree’ will use KDTree
        # ‘brute’ will use a brute-force search.
        # ‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.

        # weights : str or callable, optional (default = ‘uniform’) weight function used in prediction. Possible values:
        # ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
        # ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
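        # A hedged illustration (not from the original code): the defaults described
        # above could also be spelled out explicitly, e.g.
        # classifier = KNeighborsClassifier(n_neighbors=40, weights='uniform',
        #                                   algorithm='auto', leaf_size=30, p=2)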

        scaler = Normalizer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        classifier = KNeighborsClassifier(n_neighbors=40)

        # 'minority': resample only the minority class;
        oversample = SMOTE(sampling_strategy='minority',
                           k_neighbors=10,
                           random_state=0)
        x_train, y_train = oversample.fit_resample(x_train, y_train)

        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)

        #######################################################################################################################

        # Your model is fit. Time to predict our output and test our training data
        print("Evaluating model...")

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' +
                         str(av_roc / count) + '\n')

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        y_pred = (y_pred > 0.5)

        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(str(cm) + '\n')

        report = classification_report(y_test, y_pred)
        print(report)

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)

# Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print your final average ROC-AUC score and organize your models predictions in a dataframe
    print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
Beispiel #37
0
# apply your models to.
#
# .. your code here ..
from sklearn.preprocessing import Normalizer
data_train_fit = Normalizer().fit(data_train)

#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
# .. your code here ..
Trans_datatrain = data_train_fit.transform(data_train)
Trans_datatest = data_train_fit.transform(data_test)

#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
# .. your code here ..
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(Trans_datatrain)
def normalize(data):
    in_encoder = Normalizer(norm='l2')
    data = in_encoder.transform(data)
    return data
le2 = LabelEncoder()
le2.fit(np.ravel(data_np[:, [10]]))
#print le2.classes_
data_np[:, [10]] = le2.transform(np.ravel(data_np[:, [10]])).reshape(n_lin, 1)

# Replace missing values by 0 for the column 16 and 17
data_np = preprocess_replace_NaN(data_np, [15, 16], 'nan')

# plot_NA_ratio_features(data_np, feature_names)

# Normalize the dataset for columns 5, 6, 7, 10, 11, 13, 14, 17 and 25
nor = Normalizer(norm='l1')
nor.fit(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))
# [0, 1, 2, 6, 11, 17, 18, 19, 20, 21, 22, 23]
data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]] = \
	nor.transform(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))
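# Note: Normalizer works row-wise, so for each sample the selected columns are
# rescaled to unit L1 norm (their absolute values sum to 1 across those columns).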

# Replace missing values for the risk_factor using a svm classifier
preprocess_missing_risk_factor(data_np)

# plot_pourcentage_result(data_np, feature_names, [17, 18, 19, 20, 21 ,22, 23])

# plot_NA_ratio_features(data_np, feature_names)


################################################################################

# # Replace all missing values for the column 12, 16 and 17 with the median value
# imp = Imputer(strategy='median', axis=0)
# imp.fit(data_np[:, [11, 15, 16]])
# data_np[:, [11, 15, 16]] = imp.transform(data_np[:, [11, 15, 16]])
    # Append new features
    newAct_train = np.zeros((activation_train.shape[0], activation_train.shape[1]+3))
    for i in range(activation_train.shape[0]):
        newAct_train[i] = np.append(activation_train[i], pttImg_sample_train[i][:3])

    newAct_valid = np.zeros((activation_valid.shape[0], activation_valid.shape[1]+3))
    for i in range(activation_valid.shape[0]):
        newAct_valid[i] = np.append(activation_valid[i], valid_pttImg[i][:3])

    newAct_test = np.zeros((activation_test.shape[0], activation_test.shape[1]+3))
    for i in range(activation_test.shape[0]):
        newAct_test[i] = np.append(activation_test[i], test_blogImg[i][:3])
    # Normalize
    normalizer = Normalizer()
    normalizer.fit(newAct_train)
    newAct_train = normalizer.transform(newAct_train)
    newAct_valid = normalizer.transform(newAct_valid)
    newAct_test = normalizer.transform(newAct_test)

    # Final model
    model3 = Sequential()
    model3.add(Dense(2, input_shape=(newAct_train.shape[1],), activation='softmax'))
    adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model3.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    print(model3.summary())
    model3.fit(newAct_train, y_train_sample, epochs=epochs3, batch_size=batch_size)

    # Evaluating by using validation data or testing data
    print("Valid:")
    scores = model3.evaluate(newAct_valid, y_valid_sample, verbose=0)
    print("Loss:", scores[0])
Beispiel #41
0
    if dataset[column].dtype == type(object):
        le = LabelEncoder()
        dataset[column] = le.fit_transform(dataset[column])
array = dataset.values
#print(dataset.describe)
#print(dataset.shape)
#print(pd.DataFrame(dataset))
#Visualization
scatter_matrix(dataset)
plt.show()
#plt.plot(dataset.iloc[:,[0,1,2,3]],label='names[]')
X = array[:, 0:4]
Y = array[:, 4]
# Step 2: Pre-Processing the data to best expose the structure of the problem.
scaler = Normalizer().fit(X)
rescaled = scaler.transform(X)
np.set_printoptions(precision=3)
#print('The result after pre-processing of data :\n')
#print(rescaled[0:6,:])
#Setp3 :Spot-checking a number of algorithms using your own test harness.
scoring = 'accuracy'
models = []
results = []
names = []
models.append(('SVC', SVC()))
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
print('The result after spot-checking is :\n')
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    def predict_proba(self, X, test_data, test_data_idx=None):
        '''
        Predict based by a simple geographical distance threshold
        '''

        tfs = []

        # words that only occur in the test data
        test_word_idx = {}
        test_words = []

        test_station_ids = set()

        # take only the pairs present in the test data set
        for i in range(X.shape[0]):
            lookup = i
            if test_data_idx is not None:
                lookup = test_data_idx[i]

            stid1 = test_data.pairs[lookup][0]
            stid2 = test_data.pairs[lookup][1]

            test_station_ids.add(stid1)
            test_station_ids.add(stid2)

        test_station_ids = list(test_station_ids)

        for _, sid in enumerate(test_station_ids):
            station = test_data.stations[sid]

            tfs.append({})

            for word in re.split(r"[^\w]+", station.name):
                word = word.strip()

                if len(word) == 0:
                    continue

                if word not in self.word_idx:
                    if word not in test_word_idx:
                        test_words.append(word)
                        test_word_idx[word] = len(test_words) - 1

                    # test word ids begin at the end of the training word ids
                    wid = len(self.words) + test_word_idx[word]
                else:
                    wid = self.word_idx[word]

                if wid not in tfs[-1]:
                    tfs[-1][wid] = 0

                tfs[-1][wid] += 1

        # build tf.idf matrix
        data = []
        indices = []
        indptr = [0]
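        # CSR layout: data[k] holds the tf-idf weight of term indices[k], and
        # indptr[i]:indptr[i+1] delimits the entries belonging to row (station) i.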

        # mapping maps station ids to rows in the td.idf matrix
        glob_mapping = {}

        for iid, sid in enumerate(test_station_ids):
            station = test_data.stations[sid]
            glob_mapping[sid] = iid
            for _, wid in enumerate(tfs[iid]):
                indices.append(wid)
                tf = tfs[iid][wid]

                if wid >= len(self.dfs):
                    # no document frequency, count as unique word
                    idf = math.log(self.train_num_stations)
                else:
                    idf = math.log(1 / self.dfs[wid])
                data.append(tf * idf)
            indptr.append(len(indices))

        matrix = csr_matrix(
            (data, indices, indptr),
            shape=(len(test_station_ids), len(self.words) + len(test_words)),
            dtype=float)

        # L2 normalize
        norm = Normalizer(norm="l2", copy=False)
        norm.transform(matrix)

        ret = numpy.empty([X.shape[0], 2], dtype=float)

        # we build the similarity scores in chunks, because
        # otherwise the cosine similarity matrix would get way too big
        chunksize = 5000

        def proc_chunk(a, b, out):
            for minr in range(a, b, chunksize):
                maxr = min(b, minr + chunksize)

                locret = numpy.empty([maxr - minr, 2], dtype=float)

                chunkstationids = set()
                for i in range(minr, maxr):
                    lookup = i
                    if test_data_idx is not None:
                        lookup = test_data_idx[i]

                    stid1 = test_data.pairs[lookup][0]
                    stid2 = test_data.pairs[lookup][1]

                    chunkstationids.add(stid1)
                    chunkstationids.add(stid2)

                chunkstationids = list(chunkstationids)

                # build a view of our tfidf score matrix containing the
                # stations in this chunk
                chunk_map = {}
                mapping_l = []
                chunk_matrix = None

                for iid, sid in enumerate(chunkstationids):
                    chunk_map[sid] = iid
                    mapping_l.append(glob_mapping[sid])

                chunk_matrix = matrix[mapping_l, :]
                simi_mat = cosine_similarity(chunk_matrix)

                for i in range(minr, maxr):
                    lookup = i
                    if test_data_idx is not None:
                        lookup = test_data_idx[i]

                    simi = simi_mat[chunk_map[test_data.pairs[lookup][0]],
                                    chunk_map[test_data.pairs[lookup][1]]]

                    if simi > self.t:
                        simi = 0.5 + (simi - self.t) / (2.0 * (1.0 - self.t))
                    else:
                        simi = simi / (2 * self.t)

                    locret[i - minr, 1] = simi
                    locret[i - minr, 0] = 1 - locret[i - minr, 1]
                out.append([locret, minr, maxr])

        manager = mp.Manager()
        rets = manager.list()
        procs = []

        processors = mp.cpu_count()
        csize = int(X.shape[0] / processors)

        for a in range(0, X.shape[0], csize):
            b = min(X.shape[0], a + csize)
            procs.append(mp.Process(target=proc_chunk, args=(a, b, rets)))

        for p in procs:
            p.start()

        for p in procs:
            p.join()

        for locret in rets:
            ret[locret[1]:locret[2], :] = locret[0]

        return ret
def load_blood_data(train=True, SEED=97, scale  = False, 
                                         minmax = False,
                                         norm   = False,
                                         nointercept = False,
                                         engineering = False):
    """
    Load training and test datasets
    for DrivenData's Predict Blood Donations warmup contest
    
    The training data is shuffled before it's returned; test data is not
    
    Note: patsy returns float64 data; Theano requires float32 so conversion
          will be required; the y values are converted to int32, so they're OK
    
    Arguments
    ---------
        train (bool) if True
                         y_train, X_train = load_blood_data(train=True, ...
                     if False
                         X_test, IDs = load_blood_data(train=False, ...
                         
        SEED (int)   random seed
        
        scale (bool) if True, scale the data to mean zero, var 1; standard normal
        
        minmax (2-tuple) to scale the data to a specified range, provide a
                         2-tuple (min, max)
                         
        norm (bool)  if True, L2 normalize for distance and similarity measures
        
        nointercept (bool) if True, patsy will not create an intercept
                         
                         
    Usage
    -----
    from load_blood_data import load_blood_data
    """
    from sklearn.utils         import shuffle
    from patsy                 import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy  as np
    import pandas as pd
    import re
    
    global scaler
    global minmaxer
    global normalizer
    
    if (scale and minmax): raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):   raise ValueError("cannot specify both scale and norm")
    if (norm  and minmax): raise ValueError("cannot specify both norm and minmax")
        
    if type(train) is not bool: raise ValueError("train must be boolean")
    if type(SEED)  is not int:  raise ValueError("SEED must be int")
    if type(scale) is not bool: raise ValueError("scale must be boolean")
    if type(norm)  is not bool: raise ValueError("norm must be boolean")
    if type(nointercept) is not bool: raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool: raise ValueError("engineering must be boolean")
    
    # ------------- read the file -------------
    
    file_name = '../data/train.csv' if train else '../data/test.csv'
    data = pd.read_csv(file_name)
    
    
    # ------------- shorten the column names -------------
    
    column_names = ['ID','moSinceLast','numDonations','volume','moSinceFirst','donated']
    data.columns = column_names if train else column_names[:-1]
    
    
    # ------------- create new variables -------------
    
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst, index=data.index)
    
        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume/data.numDonations) / data.moSinceFirst, index=data.index)
    
        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations, index=data.index)

        
    # ------------- scale the data -------------

    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = scaler.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = scaler.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2: raise ValueError("minmax must be a 2-tuple")
        if train:
            minmaxer = MinMaxScaler(feature_range = minmax)
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = minmaxer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = minmaxer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = normalizer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = normalizer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        
        
    # ------------- create the design matrix -------------
        
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst +  numDonations + volume'
    
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
        
    if nointercept: 
        formula = formula + ' -1'
        
    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))
            
            
    # ------------- return the values -------------
            
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs    = data.ID.values
        return X_test, IDs
Beispiel #44
0
with open('./data/{}/a_list.npy'.format(prefix), 'w') as file:
    json.dump(a_list, file)
with open('./data/{}/h_list.npy'.format(prefix), 'w') as file:
    json.dump(h_list, file)
with open('./data/{}/q_id_list.npy'.format(prefix), 'w') as file:
    json.dump(q_id_list, file)

# Create the map

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

transformer = Normalizer(copy=True, norm='l2')
normalized = transformer.transform(v_array)
#X = PCA(n_components=50).fit_transform(normalized)
#X = StandardScaler().fit_transform(reduced_data)

import time
import umap

time_start = time.time()
umap_ = umap.UMAP(n_neighbors=100, verbose=True)
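# n_neighbors trades off local vs. global structure: larger values push UMAP to
# preserve more of the broad layout of the data at the expense of fine detail.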
embedding = umap_.fit_transform(normalized)
print('umap done! Time elapsed: {} seconds'.format(time.time() - time_start))

np.save('./data/{}/lgd_umap.npy'.format(prefix), embedding)

new_list = []
for i, md5 in enumerate(h_list):
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression (C=0.1,max_iter=100,n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1,n_iter=1,n_jobs=8,class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1,n_jobs=8,class_weight='balanced')

    Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    Clf = LogisticRegression (C=0.1,max_iter=1000,n_jobs=8,class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000,n_jobs=8,class_weight='balanced'), cv=5,
                   param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=1000), cv=3,
    #                param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)

    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2

    print 'Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram)


    PosSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('1')]#[:100]
    NegSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('0')]#[:100]
    print 'loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples))
    X = PosSamples + NegSamples
    y = [1 for _ in xrange(len(PosSamples))] + [-1 for _ in xrange (len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=random.randint(0,100))
    print '# TrainLabels', len(y_train)
    print '# TestLabels', len(y_test)

    print 'performing CVectorizer'
    CVectorizer = CountVectorizer(lowercase = True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer = SGTokenizer,
                                  tokenizer = Tokenizer,
                                  ngram_range=(1,2),
                                  dtype=np.float64,
                                  decode_error = 'ignore',
                                  max_df=0.8)
    print 'performing TfidfTransformer and Normalizer'
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()
    print 'creating Train and Test FVs'
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print 'feat ext time', time() - T0

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print 'Train/test split'
    print TrainFVs.shape
    print TestFVs.shape
    # raw_input('hit any key...')

    print 'training classifier with train samples shape:', TrainFVs.shape
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit (TrainFVs, y_train) # re-train on current training set (daily)
    print 'batch fitted'
    print 'training time', time() - T0
    # memory_dump('after_train_mem.txt')

    print 'testing classifier with test samples shape:', TestFVs.shape
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print 'testing time', time() - T0
    # memory_dump('after_test_mem.txt')

    print '*'*100
    print 'classification report'
    print '-'*20
    Accuracy = np.mean(PredictedLabels == y_test)
    print "Test Set Accuracy = ", Accuracy

    print(metrics.classification_report(y_test,
                PredictedLabels, target_names=['Neg', 'Pos']))

    print "Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels)
    print "Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels)
    print "Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels)
    print "Log loss:", metrics.log_loss(y_test, PredictedLabels)
    print "F1 Score:", metrics.f1_score(y_test, PredictedLabels)
    print "Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels)
    print '*'*100

    Vocab = CVectorizer.get_feature_names()
    # print Vocab[:100]
    # raw_input()
    try:
        FeatureImportances = Clf.coef_[0]
    except:
        FeatureImportances = Clf.best_estimator_.coef_[0]

    print FeatureImportances.shape
    raw_input()
    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
                print Vocab[PosFIndex], '+-', Vocab[NegFIndex]


    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1],TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs*FeatureImportancesSparseArray
    print AllFVsTimesW.shape

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print 'Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind])
        # print TestFV
        # print TestFV.shape
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print CurTestFV.shape
        # raw_input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps= CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp
        # print NegTopFeatureIndices, NegFeatImps
        if y_test[Ind] == 1:
            print 'top postive feats:', colored(', '.join(['['+Vocab[PosFIndex]+']' for PosFIndex in PosTopFeatureIndices]), 'green')

        else:
            print 'top negative feats: ', colored(', '.join (['['+Vocab[NegFIndex]+']' for NegFIndex in NegTopFeatureIndices]), 'red')
        Ind += 1
        raw_input()
Beispiel #46
0
    def normalize_features(self):
        normalizer = Normalizer()
        self.X_train = normalizer.fit_transform(self.X_train)
        self.X_test = normalizer.transform(self.X_test)
        return self.X_train, self.X_test
Beispiel #47
0
import pandas

names = ('квартира', 'хрущевка', 'улучшенной', 'брежневка', 'старой', 'проект',
         'свердловка', 'сталинка', 'нестандартная', 'новостройка',
         'многокомнатная', 'левый берег', 'ленинский', 'правобережный',
         'орджоникидзевский', 'поселок', 'агаповка', 'другой р-н',
         'кол-во комнат', 'общ площадь', 'жил площадь', 'кухня', 'этаж',
         'ремонт', 'соcтояние', 'окна', 'балкон', 'балкон застеклен', 'торг',
         'ипотека', 'срочно', 'цена 1кв м')

array = pandas.read_csv('cs_datasets.csv', names=names)
dataset = array.values
X = dataset[:, 0:31]
Y = dataset[:, 31]
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
numpy.set_printoptions(precision=3)
(trainX, testX, trainY, testY) = train_test_split(normalizedX,
                                                  Y,
                                                  test_size=0.25,
                                                  random_state=42)

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(trainX.shape[1], )))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
mse, mae = model.evaluate(testX, testY, verbose=0)

print("Средняя абсолютная ошибка (тысяч рублей): ", round(mae, 0))

import numpy as np
from sklearn.preprocessing import Normalizer

features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

# Transformer object
normalizer = Normalizer(norm="l2")

# Transform the feature matrix
normalizer.transform(features)

# Transform the feature matrix and keep the result
features_l2_norm = Normalizer(norm="l2").transform(features)

features_l2_norm

'''
There are two kinds of norm available: l1 and l2.
'''
features_l1_norm = Normalizer(norm="l1").transform(features)

features_l1_norm
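# A quick illustrative check: with norm="l1", the absolute values in each row sum to 1.
print(np.abs(features_l1_norm).sum(axis=1))  # expected: all ones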

print("첫 번째 샘플 값의 합:",
    def normalize(self, data):
        scaler = Normalizer().fit(data)
        data = scaler.transform(data)
        return data
Beispiel #50
0
def facerecog(video):
    #slot='slot5'
    slot = filecreator(video)
    trainX, trainy = load_datasettrain('facenettest/train/')
    print(trainX.shape, trainy.shape)
    # load test dataset
    all = datetime.datetime.now()
    date = str(all.day) + '_' + str(all.month) + '_' + str(all.year)
    path_test = 'facenettest/storagevideo/' + date + '/' + slot + '/'
    testX = loadfacetest(path_test)
    print(testX.shape)
    # save arrays to one file in compressed format
    savez_compressed('facenettest.npz', trainX, trainy, testX)

    data = load('facenettest.npz')
    trainX, trainy, testX = data['arr_0'], data['arr_1'], data['arr_2']
    print('Loaded: ', trainX.shape, trainy.shape, testX.shape)

    model = load_model('facenet_keras.h5')
    print('Loaded Model')

    newTrainX = list()
    for face_pixels in trainX:
        embedding = get_embedding(model, face_pixels)
        newTrainX.append(embedding)
    newTrainX = asarray(newTrainX)
    print(newTrainX.shape)

    newTestX = list()
    for face_pixels in testX:
        embedding = get_embedding(model, face_pixels)
        newTestX.append(embedding)
    newTestX = asarray(newTestX)
    print(newTestX.shape)

    savez_compressed('facenettest.npz', newTrainX, trainy, newTestX)

    data = load('facenettest.npz')
    testX_faces = data['arr_2']

    data = load('facenettest.npz')
    trainX, trainy, testX = data['arr_0'], data['arr_1'], data['arr_2']

    in_encoder = Normalizer(norm='l2')
    trainX = in_encoder.transform(trainX)
    testX = in_encoder.transform(testX)

    out_encoder = LabelEncoder()
    out_encoder.fit(trainy)
    trainy = out_encoder.transform(trainy)
    #n=zeros()

    model = SVC(kernel='linear', probability=True)
    model.fit(trainX, trainy)
    print(testX.shape[0])
    present = list()

    for i in range(testX.shape[0]):
        random_face_pixels = testX_faces[i]
        random_face_emb = testX[i]

        samples = expand_dims(random_face_emb, axis=0)
        yhat_class = model.predict(samples)
        yhat_prob = model.predict_proba(samples)

        class_index = yhat_class[0]
        class_probability = yhat_prob[0, class_index] * 100

        predict_names = out_encoder.inverse_transform(yhat_class)
        c = 0

        if i != 0:
            for j in range(0, len(present)):  # j avoids shadowing the outer loop index i
                if present[j] == predict_names[0]:
                    c = 1

        print(predict_names[0] + '  ' + str(class_probability))
        if c == 0 and class_probability > 40:
            present.append(predict_names[0])

    print(present)

    return present
    ]
scale = StandardScaler()
scale.fit(X)
scale.transform(X)


# Normalization (scale each sample to unit norm)
from sklearn.preprocessing import Normalizer
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
    ]
normalizer = Normalizer(norm='l2')
normalizer.transform(X)

# Filter-based feature selection: VarianceThreshold removes features whose variance is below a given threshold
from sklearn.feature_selection import VarianceThreshold
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
    ]
selected = VarianceThreshold(2)
selected.fit(X)
selected.transform(X)

# Filter-based feature selection: univariate selection; SelectKBest keeps the k features with the highest score on a given statistic, SelectPercentile keeps a given percentage
from sklearn.feature_selection import SelectKBest,f_classif
Beispiel #52
0
                                                person_img)
        face_bounding_boxes = face_recognition.face_locations(face)

        #If training image contains exactly one face
        if len(face_bounding_boxes) == 1:
            face_enc = face_recognition.face_encodings(face)[0]
            # Add face encoding for current image with corresponding label (name) to the training data
            encodings.append(face_enc)
            names.append(person)
        else:
            print(person + "/" + person_img +
                  " was skipped and can't be used for training")

# normalize input vectors
in_encoder = Normalizer(norm='l2')
encodings = in_encoder.transform(encodings)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(names)
names = out_encoder.transform(names)

# Create and train the SVC classifier
clf = svm.SVC(gamma='scale', probability=True)
#clf = svm.SVC(kernel='linear', probability=True)
clf.fit(encodings, names)

# Load the test image with unknown faces into a numpy array
test_image = face_recognition.load_image_file('test/test.jpg')

# Find all the faces in the test image using the default HOG-based model
face_locations = face_recognition.face_locations(test_image)
def make_nn_regression(n_samples=100,
                       n_features=100,
                       n_informative=10,
                       dense=False,
                       noise=0.0,
                       test_size=0,
                       normalize_x=True,
                       normalize_y=True,
                       shuffle=True,
                       random_state=None):
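    # Build a sparse, non-negative regression problem; optionally densify it, hold
    # out a test split, add noise to the targets, L2-normalize the features, and
    # min-max scale the targets.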

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(n_splits=1,
                          random_state=random_state,
                          test_size=test_size,
                          train_size=1 - test_size)

        train, test = next(cv.split(X))
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
Beispiel #54
0
t1.start()
t2.start()
t1.join()
t2.join()

X, Y = result_queue1.get()
test, label = result_queue2.get()
print(np.shape(test))
print(np.shape(X))

partx, party = random_partition(X, Y, 600000)
print(np.shape(partx))
nrm = Normalizer()
partx = nrm.fit_transform(partx)
print(np.shape(partx))
test = nrm.transform(test)
print(np.shape(test))

parameters = [[35, 45, 36], 2.0, 0.0012044393115519438, 1.0,
              3.5784753152461046e-06, 0.2751874654816516,
              0.0023804489838965626, 0.0, -0.23317429215526964]
hidden_layers = parameters[0]
solver = ["lbfgs", "sgd", "adam"]
learning_rate = ["constant", "adaptive"]
cur_solver = solver[int(parameters[1])]
cur_learning_rate = learning_rate[int(parameters[3])]
cur_nester = False
if parameters[7] == 1:
    cur_nester = True

cur_momentum = parameters[6]
    df_test_set = df_total.drop(df_train_set.index)
    # final training and test data
    df_train_input = pd.DataFrame(
        df_train_set.loc[:, df_train_set.columns != 'class'])
    df_train_output = pd.DataFrame(df_train_set['class'])
    df_test_input = pd.DataFrame(
        df_test_set.loc[:, df_test_set.columns != 'class'])
    df_test_output = pd.DataFrame(df_test_set['class'])

    # _____________________________________________________________________________________________________________________
    # Standardize/Normalize data
    # data are now named with an additional "_" at the end
    # print("Standardizing the data\n" + line_str)

    stsc = Normalizer().fit(df_train_input)
    df_train_input_ = pd.DataFrame(stsc.transform(df_train_input))
    df_test_input_ = pd.DataFrame(stsc.transform(df_test_input))
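    # Note that the StandardScaler block below refits and overwrites df_train_input_
    # and df_test_input_, so the Normalizer result above is effectively discarded.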
    stsc = StandardScaler().fit(df_train_input)
    df_train_input_ = pd.DataFrame(stsc.transform(df_train_input))
    df_test_input_ = pd.DataFrame(stsc.transform(df_test_input))

    # PCA
    minimum_explained_variance = 0.95
    pca = PCA(minimum_explained_variance)
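    # Passing a float in (0, 1) as n_components keeps the smallest number of
    # components whose cumulative explained variance reaches that fraction.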

    # alternative:
    # Choose the number of components by our own
    # number_principal_components = 10
    # pca = PCA(n_components=number_principal_components)

    pca.fit(df_train_input_)
Beispiel #56
0
X_train = X_train.drop('index', axis=1)
X_test = X_test.drop('index', axis=1)
Y_train = Y_train.drop('index', axis=1)
Y_test = Y_test.drop('index', axis=1)

X_train.head(5)

# In[4]:

#Normalization

from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
normalizer.fit(X_train)
train_norm = normalizer.transform(X_train)
test_norm = normalizer.transform(X_test)

# In[5]:

#Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_norm)
train_data = scaler.transform(train_norm)
test_data = scaler.transform(test_norm)

# In[6]:

#Dimensionality reduction : Truncated SVD
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, Normalizer
import joblib  # sklearn.externals.joblib has been removed; use the standalone package
from sklearn.model_selection import train_test_split
from grid_search import grid_estimation

# downloading matrix of text features and assigned clusters
all_data = genfromtxt('features_and_clusters.csv', delimiter=',')

data = all_data[:, 0:29]
target = all_data[:, 29]

# normalization and scaling of data
normalizer = Normalizer()
normalizer.fit(data)
data = normalizer.transform(data)
scaler = StandardScaler()
data = scaler.fit_transform(data)

# choosing of training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=0)

#clf = svm.SVC(kernel="rbf", gamma=0.001, C=1000).fit(X_train, y_train)
clf = svm.SVC(kernel="linear", gamma=1.0, C=1).fit(X_train, y_train)

# saving of classifier, scaler and normalizer
joblib.dump(clf, 'classifier_data\\model.pkl')
joblib.dump(scaler, 'classifier_data\\scaler.pkl')
joblib.dump(normalizer, 'classifier_data\\normalizer.pkl')

Beispiel #58
0
def norm_pre(X_train):
    norm = Normalizer()
    norm.fit(X_train)
    return norm.transform(X_train)
Beispiel #59
0
    attribute, label = dataset(benign_attribute, gafgyt_attribute,
                               mirai_attribute, benign_label, gafgyt_label,
                               mirai_label)
    # print(attribute[0])

    #print("num of samples: ", len(label))
    attribute_train, label_train, attribute_test, label_test = div_train_test(
        attribute, label, len(attribute))
    #print("num of test samples: ", len(label_test))

    #print("attribute_train: ",attribute_train)
    #data process

    #normalize the data
    normal = Normalizer().fit(attribute_train)
    attribute_train = normal.transform(attribute_train)
    attribute_test = normal.transform(attribute_test)
    #print("normal proecessed: ", attribute_test)
    print("original number of features for each sample: ",
          len(attribute_train[0]))
    pca = PCA(n_components=0.98)
    processed_train_attribute = pca.fit_transform(attribute_train)
    processed_test_attribute = pca.transform(attribute_test)

    print("number of features for each sample after PCA processed: ",
          len(processed_train_attribute[0]))
    #print("PCA processed: ", processed_train_attribute)

    print("before normalization: ", processed_train_attribute)

    #scale the data and delete the outliers
Beispiel #60
0
X.compactness.fillna(X.compactness.mean(), inplace=True)
X.width.fillna(X.width.mean(), inplace=True)
X.groove.fillna(X.groove.mean(), inplace=True)
X.isnull().sum()

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=1)

from sklearn.preprocessing import Normalizer
norml = Normalizer()
norml.fit(X_train)
# we have a trained normalizer here - now use it to transform both the train and test data
X_trainNrm = norml.transform(X_train)
X_testNrm = norml.transform(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='randomized')
pcx = pca.fit(X_trainNrm)

train_pca = pca.transform(X_trainNrm)
test_pca = pca.transform(X_testNrm)

from sklearn.neighbors import KNeighborsClassifier
modelknn = KNeighborsClassifier(n_neighbors=9)
modelknn.fit(train_pca, Y_train)

plotDecisionBoundary(modelknn, train_pca, Y_train)
print(modelknn.score(test_pca, Y_test))