def dsto_norm_labeled_points(
        input_dataset, feature_keys,
        preprocess_script="item['CloseDate'] = (datetime.datetime.strptime(item['CloseDate'], '%Y-%m-%d') - datetime.datetime(1970, 1, 1)).total_seconds()"):
    """Build normalized LabeledPoints with price-per-living-area targets from a raw dataset."""
    import datetime  # needed by the default preprocess_script executed below
    from pyspark.mllib.regression import LabeledPoint
    from cognub.propmixapi.normalizers import NPNormalizer

    datamap = {'targets': [], 'features': []}
    for item in input_dataset:
        features = []
        # Run the preprocessing snippet against the current item,
        # e.g. converting CloseDate into seconds since the epoch.
        exec(preprocess_script)
        for column in feature_keys:
            try:
                if item[column] is not None and str(item[column]).replace(
                        '.', '', 1).strip('-').isdigit():
                    features.append(float(item[column]))
                else:
                    raise InvalidFeatureValue()
            except Exception:
                # Skip records with any non-numeric feature value.
                break
        else:
            # Only reached when every feature was valid: target is price per living area.
            datamap['targets'].append(
                float(item['ClosePrice']) / float(item['LivingArea']))
            datamap['features'].append(features)

    # Normalize the feature matrix before building LabeledPoints.
    datamap['features'] = NPNormalizer().fit(datamap['features']).transform(
        datamap['features'])
    dataset = []
    for index, target in enumerate(datamap['targets']):
        dataset.append(LabeledPoint(target, datamap['features'][index]))
    return dataset
def main(self):
    """
    Execute the process of loading, training and testing features to classify them.
    Input: nothing
    Output: result file named best_classifiers.json
    """
    print("#1: Reads the bucket specified in argument and extracts both classes and values from features")
    self.load_classes_and_values_from_features()

    print("#2: Processing 1 vs All classification")
    for class1 in self.classes:
        print("#3: Dataframes construction for classifier %s vs All" % (class1))
        print("#4: Loading features values into main dataframe")
        self.load_features_in_dataframe_1_vs_All(class1)

        print("#8: Splitting dataframe into train and test samples")
        self.train_features_df, self.test_features_df = self.features_df.randomSplit([0.5, 0.5])
        print("#8.1: %i training data" % (self.train_features_df.count()))
        print("#8.2: %i testing data" % (self.test_features_df.count()))

        print("#9: Convert string labels into float with an estimator")
        self.convert_labels_to_float()

        print("#10: Convert dataframe into LabeledPoint RDDs")
        self.train_features_labelpoints = self.train_features_df.rdd.map(
            lambda row: LabeledPoint(row.label_index, row.features))
        self.test_features_labelpoints = self.test_features_df.rdd.map(
            lambda row: LabeledPoint(row.label_index, row.features))

        print("#11: Training classifier")
        self.training(class1, "All")

    print("#Final results: writing the best_models dictionary to best_classifiers.json")
    with open("./best_classifiers.json", "w") as out:
        json.dump(self.best_models, out)
def build_regressors(self, split_dataset, split_kmeans_dataset, feature_keys):
    self.logger.info('building regressors')
    mce_tuples = []
    for dataset, kmeans_dataset in zip(split_dataset, split_kmeans_dataset):
        # Train a KMeans model on the raw feature vectors of this split.
        kmeans_train_set = []
        for item in kmeans_dataset:
            features = [item[column] for column in feature_keys]
            kmeans_train_set.append(array(features))
        self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))
        del kmeans_dataset
        kmeans_train_set = sc.parallelize(kmeans_train_set)
        clusters = KMeans.train(kmeans_train_set,
                                100,
                                maxIterations=200,
                                runs=10,
                                initializationMode="random")
        del kmeans_train_set

        # Build LabeledPoints with the regression target taken from target_key.
        data = []
        for item in dataset:
            features = [item[column] for column in feature_keys]
            data.append(LabeledPoint(item[self.target_key], features))
        del dataset
        data = sc.parallelize(data)

        def preprocess(observation):
            # Scale the target down to keep regression values small.
            observation.label = float(observation.label / 10000)
            return observation

        data = data.map(preprocess)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        # del data

        model = RandomForest.trainRegressor(
            trainingData,
            categoricalFeaturesInfo={},
            numTrees=self.rfr_config['num_trees'],
            featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # "all"
            impurity='variance',
            maxDepth=self.rfr_config['max_depth'])

        # Evaluate the regressor with the mean squared error on the test split.
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        testMSE = -1
        try:
            testMSE = labelsAndPredictions.map(
                lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
        except Exception:
            pass
        mce_tuples.append((model, clusters, testMSE))
    self.logger.info('regressors build finished')
    return mce_tuples
def create_labeled_point(labels_and_features, wanted_category):
    """
    Creates a LabeledPoint with the 'wanted' category as label.

    :param labels_and_features: a tuple of (labels, features), where labels is indexable by category
    :param wanted_category: the index of the label to use as the LabeledPoint label
    :return: a LabeledPoint
    """
    labels = labels_and_features[0]
    features = labels_and_features[1]
    return LabeledPoint(labels[wanted_category], features)
def preprocess(sc, data, labels=None):
    # Convert raw documents into libsvm-style feature vectors.
    data = classifier.tolibsvm(data)
    points = []
    for i in range(len(data)):
        wordarr = data[i]
        # Default the label to 0 when no labels are supplied (e.g. at prediction time).
        label = 0
        if labels:
            label = labels[i]
        point = LabeledPoint(label, wordarr)
        points.append(point)
    rdd = sc.parallelize(points)
    return rdd
def tuple_to_labeled_point(entry, category, l_encoder=None):
    """
    Creates a LabeledPoint from a text line that is formatted as a tuple.

    :param entry: a tuple of format (3, 2, 1, [3, 4, 4, ...]), where the first entries
                  in the tuple are labels, and the last entry is a list of features
    :param category: which one of the labels in the tuple to keep for the labeled point
                     (0 to 2 for the imr dataset)
    :param l_encoder: the label encoder to encode the label (if any)
    :return: a LabeledPoint
    """
    from pyspark.mllib.regression import LabeledPoint

    label = entry[category]
    if l_encoder:
        label = l_encoder.transform(label)
    features = entry[-1]
    return LabeledPoint(label, features)
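# A minimal usage sketch for tuple_to_labeled_point, assuming an RDD of tuples in
# the (label0, label1, label2, [features]) format described in the docstring above.
# The sample data and the choice of category are illustrative, not from the original code.
def _example_tuple_to_labeled_point(sc):
    entries = sc.parallelize([
        (3, 2, 1, [0.5, 1.0, 2.0]),
        (1, 0, 2, [0.1, 0.2, 0.3]),
    ])
    # Keep the second label (category=1) of each tuple as the LabeledPoint label.
    return entries.map(lambda entry: tuple_to_labeled_point(entry, 1))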
def dsto_labeled_points(dataset, feature_keys):
    """Build LabeledPoints with price-per-living-area targets from a raw dataset."""
    from pyspark.mllib.regression import LabeledPoint

    data = []
    for item in dataset:
        features = []
        for column in feature_keys:
            try:
                if item[column] is not None and str(item[column]).replace(
                        '.', '', 1).replace('-', '').isdigit():
                    features.append(float(item[column]))
                else:
                    raise InvalidFeatureValue()
            except Exception:
                # Skip records with any non-numeric feature value.
                break
        else:
            # Only reached when every feature was valid: target is price per living area.
            data.append(
                LabeledPoint(
                    float(item['ClosePrice']) / float(item['LivingArea']),
                    features))
    return data
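# A minimal usage sketch for dsto_labeled_points, assuming a SparkContext `sc`, a list of
# listing dicts `records`, and a list of numeric column names `feature_keys`. The variable
# names and the choice of LinearRegressionWithSGD are illustrative, not from the original code.
def _example_train_on_labeled_points(sc, records, feature_keys):
    from pyspark.mllib.regression import LinearRegressionWithSGD

    points = dsto_labeled_points(records, feature_keys)
    rdd = sc.parallelize(points)
    # Train a simple linear regressor on the price-per-living-area targets.
    return LinearRegressionWithSGD.train(rdd, iterations=100)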
def build_classifier(self, dataset, kmeans_dataset, feature_keys):
    self.logger.info('building classifier')

    # Train a KMeans model on the raw feature vectors.
    kmeans_train_set = []
    for item in kmeans_dataset:
        features = [item[column] for column in feature_keys]
        kmeans_train_set.append(array(features))
    self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))
    kmeans_train_set = sc.parallelize(kmeans_train_set)
    clusters = KMeans.train(kmeans_train_set,
                            100,
                            maxIterations=500,
                            runs=10,
                            initializationMode="random")
    del kmeans_dataset
    del kmeans_train_set

    # Build LabeledPoints with the precomputed classifier label.
    data = []
    for item in dataset:
        features = [item[column] for column in feature_keys]
        data.append(LabeledPoint(int(item['classifier_label']), features))
    del dataset
    data = sc.parallelize(data)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    del data

    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=self.total_splits,
        categoricalFeaturesInfo={},
        numTrees=self.rfc_config['num_trees'],
        featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # "all"
        impurity='gini',
        maxDepth=self.rfc_config['max_depth'],
        maxBins=32)

    # Evaluate the classifier with the misclassification rate on the test split.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    self.logger.info('classifier build finished')
    return model, clusters, testErr
def main():
    # parameters
    features_dir = sys.argv[1]

    global train_features_lp
    global test_features_lp
    global best_models
    global features_value_list

    features_value_list = []
    best_models = {}
    classes = []
    for feature_file in os.listdir(features_dir):
        new_class = re.sub(r'[0-9]', '', feature_file)
        new_class = new_class[:-9].strip('_')
        classes.append(new_class)
    classes = sorted(list(set(classes)))
    classes_dup = classes

    # [FEATURES EXTRACTION]
    # subprocess.call(["python", "features_extract.py"])

    # [LOADING FEATURE VALUES] loading feature values into a list of rows
    print(">>>>> Loading features values into list of rows..")
    features_value_list = load_features_value(features_dir)

    # [CLASSIFIER SELECTION] Selecting classifiers (1vs1, 1vsAll)
    # 1vs1 classifiers
    for class1 in classes:
        class2_set = [x for x in classes_dup]
        del class2_set[0:(classes.index(class1) + 1)]
        print("classes")
        print(classes)
        print("class2_set")
        print(class2_set)
        for class2 in class2_set:
            print(">>>>> Building dataframes for classifier %s vs. %s.." % (class1, class2))

            # [LOADING FEATURES] loading features values into dataframe
            print("_____ Loading features values into main dataframe")
            features_df = load_features_df_1vs1(features_value_list)
            print("_____ Filtering data within dataframe")
            features_classifier_df = features_df\
                .filter((features_df.label == class1)
                        | (features_df.label == class2))

            # [SPLIT DATA] Split data into train & test
            print("_____ Splitting data into training & test data..")
            train_features_df, test_features_df = features_classifier_df.randomSplit([0.8, 0.20])
            train_count = train_features_df.count()
            test_count = test_features_df.count()
            print("%i training data" % (train_count))
            print("%i testing data" % (test_count))

            # [CONVERT LABELS] Convert string labels into floats with an estimator
            print("_____ Converting string labels into floats with an estimator..")
            train_features_df, test_features_df = convert_labels(train_features_df, test_features_df)

            # [CONVERT INTO LABELEDPOINTS]
            print(">>>>> Converting dataframe into LabeledPoint rdd..")
            train_features_lp = train_features_df.rdd.map(
                lambda row: LabeledPoint(row.label_index, row.features))
            test_features_lp = test_features_df.rdd.map(
                lambda row: LabeledPoint(row.label_index, row.features))

            # [BUILD MODEL] Learn classifier on training data
            print(">>>>> Training classifier..")
            training(class1, class2)

    # 1vsAll classifiers
    print("1vsALL---------------------------------------------------------------------------------------")
    print("classes")
    print(classes)
    for class1 in classes:
        print(">>>>> Building dataframes for classifier %s vs. All.." % (class1))

        # [LOADING FEATURES] loading features values into dataframe
        print("_____ Loading features values into main dataframe")
        features_df = load_features_df_1vsAll(features_value_list, class1)

        # [SPLIT DATA] Split data into train & test
        print("_____ Splitting data into training & test data..")
        train_features_df, test_features_df = features_df.randomSplit([0.8, 0.20])
        train_count = train_features_df.count()
        test_count = test_features_df.count()
        print("%i training data" % (train_count))
        print("%i testing data" % (test_count))

        # [CONVERT LABELS] Convert string labels into floats with an estimator
        print("_____ Converting string labels into floats with an estimator..")
        train_features_df, test_features_df = convert_labels(train_features_df, test_features_df)

        # [CONVERT INTO LABELEDPOINTS]
        print(">>>>> Converting dataframe into LabeledPoint rdd..")
        train_features_lp = train_features_df.rdd.map(
            lambda row: LabeledPoint(row.label_index, row.features))
        test_features_lp = test_features_df.rdd.map(
            lambda row: LabeledPoint(row.label_index, row.features))

        # [BUILD MODEL] Learn classifier on training data
        print(">>>>> Training classifier..")
        training(class1, "All")

    # [OUTPUT]
    # For each classifier, send model parameters to best_classifiers.json
    print(">>>>> Sending best model information to \"best_classifiers.json\"..")
    with open("./output/best_classifiers.json", "w") as out:
        json.dump(best_models, out)

    # hang script to tune it with Spark Web UI (available @ http://localhost:4040)
    raw_input("press ctrl+c to exit")
def parsePoint(line):
    # Parse a space-separated line: the first value is the label, the rest are features.
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
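# A minimal usage sketch for parsePoint, assuming a SparkContext `sc` and a
# whitespace-separated text file of numbers; the path below is hypothetical.
def _example_parse_points(sc):
    lines = sc.textFile("data/labeled_points.txt")
    # Each parsed line becomes a LabeledPoint ready for MLlib training.
    return lines.map(parsePoint)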
def clean_feature(rec):
    # The last element of the record is the label; the rest are numeric features.
    label = int(rec[-1])
    features = [float(x) for x in rec[:-1]]
    return LabeledPoint(label, features)