Example 1
    def __init__(self, kafka_configfile, schema_file, s3_configfile):
        """
        class constructor that initializes the instance according to the configurations
        of the S3 bucket and Kafka
        :type kafka_configfile: str     path to kafka config file
        :type schema_file     : str     path to schema file
        :type s3_configfile   : str     path to S3 config file
        """
        self.kafka_config = utility.parse_config(kafka_configfile)

        self.schema = utility.parse_config(schema_file)
        self.s3_config = utility.parse_config(s3_configfile)

        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_config["BROKERS_IP"])
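All of these examples rely on a utility.parse_config helper that is not shown. A minimal sketch of such a helper, assuming the config files are JSON (the real projects may use a different format), could look like this:

import json

def parse_config(configfile):
    # Hypothetical helper: read a config file and return its contents as a dict.
    # Assumes JSON; the actual format used by these projects is not shown here.
    with open(configfile) as f:
        return json.load(f)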
Example 2
    def __init__(self, s3_configfile, schema_configfile, psql_configfile):
        """
        class constructor that initializes the instance according to the configurations
        of the S3 bucket, raw data and PostgreSQL table
        
        :type s3_configfile:     str        path to s3 config file
        :type schema_configfile: str        path to schema config file
        :type psql_configfile:   str        path to psql config file
        """
        self.s3_config   = utility.parse_config(s3_configfile)
        self.schema      = utility.parse_config(schema_configfile)
        self.psql_config = utility.parse_config(psql_configfile)

        self.sc = pyspark.SparkContext.getOrCreate()
        self.sc.setLogLevel("ERROR")
Example 3
def predict(config_file):
    """
    Main function that runs predictions
    Args:
        config_file [str]: path to config file
    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("../log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)
    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']
    predict_img = config['predict']['folder_path']
    weights_path = config['common']['weights_path']

    X, img_names = preprocess(predict_img, image_width, image_height)

    model = KeyPointModel().getModel()
    logger.info(f"Loading weights from {weights_path}")
    model.load_weights(weights_path)
    # logger.info("-----------Model Summary------------")
    # logger.info(model.summary())

    predicted_keypoints = model.predict(X)
    logger.info("Prediction Completed. Writing output to predicted.csv")
    write_output(predicted_keypoints, img_names)
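The set_logger helper used across these prediction scripts is not shown either. A minimal sketch, assuming it simply configures a file logger at INFO level (the handler setup is an assumption):

import logging

def set_logger(log_path):
    # Hypothetical logger factory: one file handler plus console output.
    logger = logging.getLogger(log_path)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid adding duplicate handlers on repeated calls
        formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        for handler in (logging.FileHandler(log_path), logging.StreamHandler()):
            handler.setFormatter(formatter)
            logger.addHandler(handler)
    return logger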
Example 4
def predict(config_file):
    """
    Main function that runs predictions

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("./log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    model_path = Path(config["predict"]["model_path"])
    processed_test = config["predict"]["processed_test"]
    predicted_file = config["predict"]["predicted_file"]
    export_result = config["predict"]["export_result"]

    logger.info(f"config: {config['predict']}")

    ##################
    # Load model & test set
    ##################
    # Load model
    logger.info(
        f"-------------------Load the trained model-------------------")
    with open(model_path, "rb") as f:
        trained_model = load(f)

    # Load test set
    logger.info(f"Load the test data from {processed_test}")
    X, y, cols = load_data(processed_test)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Make prediction and evaluate
    ##################
    logger.info(f"-------------------Predict and evaluate-------------------")
    y_hat = trained_model.predict(X)
    logger.info(f"Classification report: \n {classification_report(y, y_hat)}")
    output = pd.DataFrame(y)
    output["prediction"] = y_hat
    if export_result:
        output.to_csv(predicted_file, index=False)
        logger.info(f"Export prediction to : {predicted_file}")
Example 5
    def __init__(self, kafka_configfile, schema_configfile, stream_configfile,
                 start_offset):
        """
        class constructor that initializes the instance according to the configurations
        of Kafka (brokers, topic, offsets), data schema and batch interval for streaming
        :type kafka_configfile:  str        path to kafka config file
        :type schema_configfile: str        path to schema config file
        :type stream_configfile: str        path to stream config file
        :type start_offset:      int        offset from which to read from partitions of Kafka topic
        """
        self.kafka_config = utility.parse_config(kafka_configfile)
        self.stream_config = utility.parse_config(stream_configfile)
        self.schema = utility.parse_config(schema_configfile)

        self.start_offset = start_offset

        self.sc = pyspark.SparkContext.getOrCreate()
        self.ssc = pyspark.streaming.StreamingContext(
            self.sc, self.stream_config["INTERVAL"])

        self.sc.setLogLevel("ERROR")
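A hedged usage sketch for this streamer class: the class name SparkStreamerFromKafka is taken from the parent call in Example 8, and the config file paths are hypothetical.

streamer = SparkStreamerFromKafka("config/kafka.config",
                                  "config/schema.config",
                                  "config/stream.config",
                                  start_offset=0)
streamer.ssc.start()             # start the Spark Streaming computation
streamer.ssc.awaitTermination()  # block until the streaming job is stopped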
Example 6
def main(repo_owner, repo_name, start_date, end_date):
    """
    main logic of the ETL process: call the GitHub API to get commits -> process the commits -> load them into PostgreSQL
    """
    print('start_date:', start_date, 'end_date:', end_date)
    repo_url = 'https://api.github.com/repos/{}/{}/commits?since={}T00:00:00Z&until={}T23:59:59Z'.format(repo_owner, repo_name, start_date, end_date)
    df = Commit2df(repo_url)
    output_df = extract_inform(df)
    postgre_config = parse_config('config/postgre.config')
    dumptopostgre = DumpToPostgre()
    dumptopostgre.insert_all_to_table(output_df, 'git_commit', postgre_config)
    # sleep 3 sec after every scraping to avoid being blocked by the server
    time.sleep(3)
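Commit2df and extract_inform are defined elsewhere in that project. A minimal sketch of what Commit2df might do, assuming it flattens the commits JSON into a DataFrame with requests and pandas (an assumption, not the original implementation; pagination is ignored):

import requests
import pandas as pd

def Commit2df(repo_url):
    # Hypothetical sketch: fetch the GitHub commits endpoint and flatten the JSON payload.
    response = requests.get(repo_url)
    response.raise_for_status()
    return pd.json_normalize(response.json())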
Example 7
def main():
    """
    main function that creates the fact tables and an attribute table from the raw git data
    """
    postgre_config = parse_config('config/postgre.config')
    dumptopostgre = DumpToPostgre()
    connection = dumptopostgre.get_conn(postgre_config)
    sql_list = [sql_commit_fact, sql_commit_commitor, sql_commited_repo]
    for sql in sql_list:
        try:
            with connection.cursor() as cursor:
                print(sql)
                cursor.execute(sql)
                connection.commit()
                print('>>>> table build ok')
        except Exception as e:
            print('>>>> table build failed', str(e))
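DumpToPostgre.get_conn is not shown. A plausible sketch built on psycopg2, assuming the parsed config exposes standard connection keys (the driver choice and key names are assumptions):

import psycopg2

class DumpToPostgre:
    def get_conn(self, postgre_config):
        # Hypothetical connection helper; the config key names are assumed.
        return psycopg2.connect(host=postgre_config["host"],
                                port=postgre_config.get("port", 5432),
                                user=postgre_config["user"],
                                password=postgre_config["password"],
                                dbname=postgre_config["dbname"])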
Example 8
    def __init__(self,
                 kafka_configfile,
                 schema_configfile,
                 stream_configfile,
                 psql_configfile,
                 start_offset=0):
        """
        class constructor that initializes the instance according to the configurations
        of Kafka (brokers, topic, offsets), PostgreSQL database, data schema and batch interval for streaming
        :type kafka_configfile:  str        path to kafka config file
        :type schema_configfile: str        path to schema config file
        :type stream_configfile: str        path to stream config file
        :type psql_configfile:   str        path to psql config file
        :type start_offset:      int        offset from which to read from partitions of Kafka topic
        """
        SparkStreamerFromKafka.__init__(self, kafka_configfile,
                                        schema_configfile, stream_configfile,
                                        start_offset)
        self.psql_config = utility.parse_config(psql_configfile)
        self.sqlContext = pyspark.sql.SQLContext(self.sc)
        self.load_batch_data()
        self.psql_n = 0
Example 9
def train(config_file):
    """
    Main function that trains and persists a model based on the training set

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ################
    # config logger
    ################
    logger = set_logger("../log/train.log")

    ###############################
    # Load config from config file
    ###############################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    keypoints_csv = Path(config['common']['labels_csv_path'])
    val_split = config['common']['val_split']
    train_img_scr_path = config['common']['img_source_path']
    test_img_scr_path = config['common']['img_source_path']
    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']

    epochs = config['train']['epochs']
    train_batch_size = config['train']['batch_size']
    weight_path = config['common']['weight_path']
    no_of_aug = config['train']['no_of_aug']
    test_batch_size = config['test']['batch_size']

    ############
    # Load Data
    ############
    logger.info(f"----------------Load the data----------------")
    selected_img, keypoint_df = load_data(keypoints_csv)
    logger.info(f"Number of selected images are {selected_img.shape}")
    logger.info(f"Few of the selected images are {selected_img[0:5]}")

    ####################################
    # Get train and test data generators
    ####################################

    X_train, y_train, X_test, y_test = train_test_split(
        selected_img, keypoint_df, val_split)

    train_gen = Car(x_set=X_train,
                    y_set=y_train,
                    mode='Train',
                    data_path=train_img_scr_path,
                    image_width=image_width,
                    image_height=image_height,
                    batch_size=train_batch_size,
                    augmentations='Self',
                    no_of_aug=no_of_aug)
    test_gen = Car(
        x_set=X_test,
        y_set=y_test,
        mode='Test',
        data_path=test_img_scr_path,
        image_width=image_width,
        image_height=image_height,
        batch_size=test_batch_size,
    )

    #####################
    # Set and train model
    #####################

    logger.info(
        f"-------------------------Initiate Model---------------------")
    model = KeyPointModel().getModel()

    logger.info(
        f"--------------------Model Summary---------------------------")
    logger.info(f"{model.summary}")

    # compile the model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_absolute_error'])

    # modelCheckPoint = ModelCheckpoint('car-{val_loss:.2f}.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    earlyS = EarlyStopping(monitor='val_loss',
                           min_delta=1,
                           patience=3,
                           restore_best_weights=True)
    reducelr = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.1,
                                 patience=2,
                                 min_lr=1e-7)

    history = model.fit(x=train_gen,
                        validation_data=test_gen,
                        callbacks=[earlyS, reducelr],
                        epochs=epochs)
    logger.info(history)
    logger.info("------------Saving Weights--------------")
    model.save_weights(weight_path)
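The Car generator passed to model.fit is not shown; it is presumably a keras.utils.Sequence subclass. A skeleton sketch of that pattern, with the actual image loading and augmentation omitted (all details beyond the constructor arguments are assumptions):

import math
import numpy as np
from tensorflow.keras.utils import Sequence

class Car(Sequence):
    # Hypothetical skeleton of the batch generator used above.
    def __init__(self, x_set, y_set, mode, data_path, image_width,
                 image_height, batch_size, augmentations=None, no_of_aug=0):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.data_path, self.mode = data_path, mode
        self.image_width, self.image_height = image_width, image_height

    def __len__(self):
        # number of batches per epoch
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        # slice out one batch; real image loading, resizing and augmentation omitted
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        images = np.zeros((len(batch_x), self.image_height, self.image_width, 3))
        return images, np.asarray(batch_y)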
Example 10
def etl(config_file):
    """
    ETL function that load raw data and convert to train and test set

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """

    ##################
    # configure logger
    ##################
    logger = set_logger("./log/etl.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    raw_data_file = config["etl"]["raw_data_file"]
    processed_path = Path(config["etl"]["processed_path"])
    test_size = config["etl"]["test_size"]
    random_state = config["etl"]["random_state"]
    logger.info(f"config: {config['etl']}")

    ##################
    # Data transformation
    ##################
    logger.info(
        "-------------------Start data transformation-------------------")
    wine = pd.read_csv(raw_data_file)

    bins = (2, 6.5, 8)
    group_names = ["bad", "good"]
    wine["quality"] = pd.cut(wine["quality"], bins=bins, labels=group_names)

    label_quality = LabelEncoder()

    wine["quality"] = label_quality.fit_transform(wine["quality"])
    logger.info("End data transformation")

    ##################
    # train test split & Export
    ##################
    # train test split
    logger.info(
        "-------------------Train test split & Export-------------------")
    train, test = train_test_split(wine,
                                   test_size=test_size,
                                   random_state=random_state)

    # export data
    logger.info(f"write data to {processed_path}")
    train.to_csv(processed_path / "train.csv", index=False)
    test.to_csv(processed_path / "test.csv", index=False)
    logger.info(f"train: {train.shape}")
    logger.info(f"test: {test.shape}")
    logger.info("\n")
def train(config_file):
    """
    Main function that trains & persists model based on training set

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("../log/train.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    processed_train = Path(config["train"]["processed_train"])
    ensemble_model = config["train"]["ensemble_model"]
    model_config = config["train"]["model_config"]
    model_path = Path(config["train"]["model_path"])

    logger.info(f"config: {config['train']}")

    ##################
    # Load data
    ##################
    logger.info(
        f"-------------------Load the processed data-------------------")
    X, y, cols = load_data(processed_train)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Set & train model
    ##################
    # Load model
    # Limited to sklearn ensemble for the moment
    logger.info(f"-------------------Initiate model-------------------")
    model = initiate_model(ensemble_model, model_config)

    # Train model
    logger.info(f"Train model using {ensemble_model}, {model_config}")
    model.fit(X, y)
    logger.info(f"Train score: {model.score(X, y)}")
    logger.info(
        f"CV score: {cross_val_score(estimator=model, X=X, y=y, cv=5).mean()}"
    )
    ##################
    # Persist model
    ##################

    logger.info(f"-------------------Persist model-------------------")
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as f:
        dump(model, f)
    logger.info(f"Persisted model to {model_path}")