def generate_feature_from_data(self, inp_data): try: LOGGER.info("What features do we need") except Exception as e: LOGGER.error(traceback.format_exc()) raise e
def get_features_for_prediction(self, data): try: LOGGER.info("I can predict") except Exception: LOGGER.error(traceback.format_exc()) raise
def train(self, df, target, regularization=None, num_of_iterations=100):
    try:
        LOGGER.info("Generating logistic regression model")
        spark_df = self.sql_context.createDataFrame(df)
        feature_columns = spark_df.columns
        feature_columns.remove(target)
        # build feature and label RDDs (go through .rdd so this also works on Spark 2.x)
        X_train = spark_df.select(*feature_columns).rdd.map(lambda x: list(x))
        y_train = spark_df.select(target).rdd.map(lambda x: x[0])
        zipped = y_train.zip(X_train)
        train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
        numOfClasses = len(df[target].unique())
        logistic_model = LogisticRegressionWithLBFGS.train(
            train_data,
            numClasses=numOfClasses,
            regParam=0,
            regType=regularization,
            intercept=True,
            iterations=num_of_iterations,
            validateData=False)
        self.model = logistic_model
    except Exception as e:
        raise e
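# A minimal, self-contained sketch (made-up data, a local pyspark installation assumed)
# of the zip -> LabeledPoint pattern that train() uses to build its training RDD.
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

sc = SparkContext.getOrCreate()
X = sc.parallelize([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]])
y = sc.parallelize([1.0, 0.0, 1.0, 0.0])
# pair each label with its feature list, then wrap in LabeledPoint for MLlib
train_data = y.zip(X).map(lambda p: LabeledPoint(p[0], p[1]))
model = LogisticRegressionWithLBFGS.train(train_data, numClasses=2, iterations=10)
print(model.predict([1.0, 0.0]))
sc.stop()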
def complete_task(fileobj):
    try:
        with fileobj.open('w') as f:
            f.write('Done')
    except Exception as e:
        LOGGER.error(traceback.format_exc())
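# Hedged sketch: the run()/output() methods further down look like Luigi tasks, so
# complete_task() would typically receive a luigi.LocalTarget. The task name and
# path below are invented purely for illustration.
import luigi

class ExampleTask(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/example_task.done')

    def run(self):
        complete_task(self.output())  # writes 'Done' into the target, marking the task complete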
def get_data(self): try: LOGGER.info("Howdy data cruncher") except Exception as e: LOGGER.error(traceback.format_exc()) raise e
def test_train(self, df, target, train_split, test_split,
               regularization=None, num_of_iterations=100):
    try:
        LOGGER.info("Generating logistic regression model")
        spark_df = self.sql_context.createDataFrame(df)
        feature_columns = spark_df.columns
        feature_columns.remove(target)
        train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)
        # build feature and label RDDs for training (go through .rdd so this also works on Spark 2.x)
        X_train = train.select(*feature_columns).rdd.map(lambda x: list(x))
        y_train = train.select(target).rdd.map(lambda x: x[0])
        zipped = y_train.zip(X_train)
        train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
        numOfClasses = len(df[target].unique())
        logistic_model = LogisticRegressionWithLBFGS.train(
            train_data,
            numClasses=numOfClasses,
            regParam=0,
            regType=regularization,
            intercept=True,
            iterations=num_of_iterations,
            validateData=False)
        # evaluate on the held-out split and log the accuracy
        X_test = test.select(*feature_columns).rdd.map(lambda x: list(x))
        y_test = test.select(target).rdd.map(lambda x: x[0])
        prediction = X_test.map(lambda lp: float(logistic_model.predict(lp)))
        prediction_and_label = prediction.zip(y_test)
        LOGGER.info(prediction_and_label.map(
            lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
    except Exception as e:
        raise e
def train_model(self, df):
    try:
        # use this data to train the model
        model = LogisticRegression()
        model.train(df=df, target='target')
        return model
    except Exception as e:
        LOGGER.error(traceback.format_exc())
def train_test_model(self, df):
    try:
        # use this data to train and evaluate the model
        model = LogisticRegression()
        LOGGER.info("training the model")
        model.test_train(df=df, target='target', train_split=0.8, test_split=0.2)
    except Exception as e:
        LOGGER.error(traceback.format_exc())
        raise e
def get_data(self):
    try:
        conn = DBConn().get_connection()
        query = '''SELECT * FROM trips'''
        LOGGER.info("Reading data from db : %s" % query)
        df = pd.read_sql(query, con=conn)
        return df
    except Exception as e:
        LOGGER.error(traceback.format_exc())
        raise e
def predict(self, df): try: LOGGER.info("Predicting using logistic regression") spark_df = self.sql_context.createDataFrame(df) feature_columns = spark_df.columns inp_data = spark_df.select(*feature_columns).map(lambda x: list(x)) inp_data = spark_df.map(lambda x: list(x)) result = self.model.predict(inp_data.map(lambda x: x)).collect() LOGGER.info("Predicted output is %s" % str(result)) return result except Exception as e: raise e
def predict(self, df): try: LOGGER.info("Predicting using logistic regression") spark_df = self.sql_context.createDataFrame(df) feature_columns = spark_df.columns inp_data = spark_df.select(*feature_columns).map(lambda x: list(x)) inp_data = spark_df.map(lambda x: list(x)) result = self.model.predict(inp_data.map(lambda x: x)).collect() LOGGER.info("Predicted output is %s"%str(result)) return result except Exception as e: raise e
def generate_target(self, df):
    try:
        LOGGER.info("generating target column")
        # flag rows whose trip count is above the overall mean
        mean_trip_count = df.trip_count.mean()
        df["target"] = 0
        df.loc[df.trip_count > mean_trip_count, "target"] = 1
        LOGGER.info(df.target.value_counts())
        df = df.loc[:, ["hour", "day", "terminal_code", "target"]]
        return df
    except Exception as e:
        LOGGER.error(traceback.format_exc())
        raise e
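# A tiny illustration (with invented counts) of the mean-threshold target assignment above.
import pandas as pd

toy = pd.DataFrame({"trip_count": [2, 8, 9, 1]})
toy["target"] = 0
toy.loc[toy.trip_count > toy.trip_count.mean(), "target"] = 1
print(toy)  # rows with trip_count above the mean (5.0) get target 1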
def get_categorical_codes(data_df, categorical_columns):
    try:
        # normalise string values and convert each column to a pandas categorical
        for column in categorical_columns:
            LOGGER.info(column)
            data_df[column] = data_df[column].str.strip()
            data_df[column] = data_df[column].str.lower()
            data_df[column] = data_df[column].astype('category')
        # add a "<column>_code" column holding the integer category codes
        modified_string = '_code'
        categorical_columns_modified = [
            column + modified_string for column in categorical_columns]
        data_df[categorical_columns_modified] = data_df[categorical_columns].apply(
            lambda x: x.cat.codes)
        return data_df
    except Exception as e:
        LOGGER.error(traceback.format_exc())
        raise e
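# A minimal, self-contained sketch of what .astype('category') and .cat.codes produce;
# the column name and values below are invented for illustration.
import pandas as pd

df = pd.DataFrame({'terminal': [' SF ', 'sf', 'Oakland', 'oakland']})
df['terminal'] = df['terminal'].str.strip().str.lower().astype('category')
df['terminal_code'] = df['terminal'].cat.codes
print(df)  # after stripping/lower-casing, both 'sf' rows share one code and both 'oakland' rows share another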
def get_features_for_prediction(self, data):
    try:
        json_data = json.dumps(data)
        df = pd.read_json(json_data)
        LOGGER.info("feature set is")
        LOGGER.info(df.head())
        return df
    except Exception:
        LOGGER.error(traceback.format_exc())
        raise
def run(self): try: LOGGER.info("starting load trip task") bike_share = BayBikeShare() bike_share.load_data() LOGGER.info("Load trip data complete") complete_task(self.output()) except Exception as e: LOGGER.error(traceback.format_exc())
def persist(self, location): try: LOGGER.info("Writing the model to location %s"%location) data = 'data' meta_data = 'metadata' data_location = os.path.join(location, data) if os.path.exists(data_location): LOGGER.info("Removing directory %s"%data_location) shutil.rmtree(data_location) data_location = os.path.join(location, meta_data) if os.path.exists(data_location): LOGGER.info("Removing directory %s"%data_location) shutil.rmtree(data_location) self.model.save(self.sc, location) except Exception as e: raise e
def persist(self, location): try: LOGGER.info("Writing the model to location %s" % location) data = 'data' meta_data = 'metadata' data_location = os.path.join(location, data) if os.path.exists(data_location): LOGGER.info("Removing directory %s" % data_location) shutil.rmtree(data_location) data_location = os.path.join(location, meta_data) if os.path.exists(data_location): LOGGER.info("Removing directory %s" % data_location) shutil.rmtree(data_location) self.model.save(self.sc, location) except Exception as e: raise e
def run(self): try: LOGGER.info("starting terminal traffic train task") terminal_traffic = TerminalTraffic() LOGGER.info("get traffic terminal data") data = terminal_traffic.get_data() LOGGER.info("generate features for terminal traffic") df = terminal_traffic.generate_feature_from_data(inp_data=data) LOGGER.info("generate target for terminal traffic") df = terminal_traffic.generate_target(df) LOGGER.info("train traffic model") model = terminal_traffic.train_model(df) self.set_path() dir = os.getcwd() main_dir = os.path.join(dir, self.main_directory, self.models_directory, self.name) LOGGER.info("persisting predictive model") model.persist(location=main_dir) complete_task(self.output()) except Exception: LOGGER.error(traceback.format_exc())
def generate_feature_from_data(self, inp_data): try: LOGGER.info("Generating features from data") LOGGER.info("Input data has the shape %s" % str(inp_data.shape)) inp_data['start_hour'] = inp_data["Start Date"].apply( mlUtils.get_hour) inp_data['start_day'] = inp_data["Start Date"].apply( mlUtils.get_day) inp_data['end_hour'] = inp_data["End Date"].apply(mlUtils.get_hour) inp_data['end_day'] = inp_data["End Date"].apply(mlUtils.get_day) LOGGER.info(inp_data.head()) #now lets find the count traffic for an hour given a day of the week and terminal start_df = inp_data.groupby( by=["start_hour", "start_day", "Start Terminal" ]).count().copy() start_df = start_df.reset_index() LOGGER.info(start_df.head()) LOGGER.info("creating start df") # getting only the required columns start_df = start_df.ix[:, [ "start_hour", "start_day", "Start Terminal", "Trip ID" ]] start_df.columns = ["hour", "day", "terminal_code", "trip_id"] start_df.head() LOGGER.info("creating end df") end_df = inp_data.groupby( by=["end_hour", "end_day", "End Terminal"]).count().copy() end_df = end_df.reset_index() end_df = end_df.ix[:, [ "end_hour", "end_day", "End Terminal", "Trip ID" ]] end_df.columns = ["hour", "day", "terminal_code", "trip_id"] LOGGER.info(end_df.head()) LOGGER.info("merging start and end df") # merge start and end data frames to generate traffic counts for a terminal merged_df = start_df.merge(end_df, how="inner", on=["hour", "day", "terminal_code"]) merged_df[ "trip_count"] = merged_df["trip_id_x"] + merged_df["trip_id_y"] merged_df = merged_df.ix[:, [ "hour", "day", "terminal_code", "trip_count" ]] return merged_df except Exception as e: LOGGER.error(traceback.format_exc()) raise e
def generate_feature_from_data(self, inp_data): try: LOGGER.info("Generating features from data") LOGGER.info("Input data has the shape %s" % str(inp_data.shape)) inp_data['start_hour'] = inp_data["Start Date"].apply(mlUtils.get_hour) inp_data['start_day'] = inp_data["Start Date"].apply(mlUtils.get_day) inp_data['end_hour'] = inp_data["End Date"].apply(mlUtils.get_hour) inp_data['end_day'] = inp_data["End Date"].apply(mlUtils.get_day) LOGGER.info(inp_data.head()) #now lets find the count traffic for an hour given a day of the week and terminal start_df = inp_data.groupby(by=["start_hour", "start_day", "Start Terminal"]).count().copy() start_df = start_df.reset_index() LOGGER.info(start_df.head()) LOGGER.info("creating start df") # getting only the required columns start_df = start_df.ix[:, ["start_hour", "start_day", "Start Terminal", "Trip ID"]] start_df.columns = ["hour", "day", "terminal_code", "trip_id"] start_df.head() LOGGER.info("creating end df") end_df = inp_data.groupby(by=["end_hour", "end_day", "End Terminal"]).count().copy() end_df = end_df.reset_index() end_df = end_df.ix[:, ["end_hour", "end_day", "End Terminal", "Trip ID"]] end_df.columns = ["hour", "day", "terminal_code", "trip_id"] LOGGER.info(end_df.head()) LOGGER.info("merging start and end df") # merge start and end data frames to generate traffic counts for a terminal merged_df = start_df.merge(end_df, how="inner", on=["hour", "day", "terminal_code"]) merged_df["trip_count"] = merged_df["trip_id_x"] + merged_df["trip_id_y"] merged_df = merged_df.ix[:, ["hour", "day", "terminal_code", "trip_count"]] return merged_df except Exception as e: LOGGER.error(traceback.format_exc()) raise e
def train_test_model(self, df): try: LOGGER.info("Train and test people, train and test") except Exception as e: LOGGER.error(traceback.format_exc()) raise e
def train_model(self, df): try: LOGGER.info("I love machine learning") except Exception as e: LOGGER.error(traceback.format_exc())