Example #1
0
def metric_by_test_size(df: pd.DataFrame, test_sizes=range(1, 10), train_size: int = 5,
                        date=None):
    """
            fix train size and adjust test size
    :param df: pd.DataFrame with a "date" column
    :param test_sizes: list of test sizes (in days) to evaluate
    :param train_size: 5 -> 5 days (one week, no weekend)
    :param date: research date formatted "%m%Y"; defaults to the current
        month/year, resolved per call
    """
    # BUG FIX: the original default `date=datetime.datetime.now().strftime("%m%Y")`
    # was evaluated once at import time, so long-running processes kept a stale
    # date. Resolve it at call time instead.
    if date is None:
        date = datetime.datetime.now().strftime("%m%Y")

    # split dataset to train, test: one (train, test) pair per candidate test size
    time_series = df["date"].drop_duplicates().tolist()
    train_test = [
        split_time_series(
            str_d=time_series[0], mid_d=time_series[train_size],
            end_d=time_series[train_size + test_size], df=df)
        for test_size in test_sizes
    ]

    # scoring with the tuned hyper-parameters stored on S3
    param = S3Manager(bucket_name="production-bobsim").load_dump(
        key="food_material_price_predict_model/research/tuned_params.pkl")
    scores = [scoring(tt, param=param) for tt in train_test]
    ser = pd.Series(scores, index=test_sizes)

    # plot
    series_plot(ser=ser, kind="bar", x_label="test size", y_label="customized RMSE",
                title="train_size: {}".format(train_size))

    # save the rendered figure to S3
    S3Manager(bucket_name="production-bobsim").save_plt_to_png(
        key="food_material_price_predict_model/research/{date}/image/metric_by_test_size_train{train}.png".format(
            date=date, train=train_size))
Example #2
0
def metric_by_other_term(df: pd.DataFrame, train_size: int, test_size: int, n_days=range(10),
                         date=None):
    """
        measure the metric by pushing the term of train/test data set by day
    :param df: dataset with a "date" column
    :param train_size: term of train data
    :param test_size:  term of test data
    :param n_days: how many days do you push aside.
    :param date: research date formatted "%m%Y"; defaults to the current
        month/year, resolved per call
    """
    # BUG FIX: the original default `date=datetime.datetime.now().strftime("%m%Y")`
    # was evaluated once at import time; resolve it at call time instead.
    if date is None:
        date = datetime.datetime.now().strftime("%m%Y")

    # split dataset to train, test: shift the whole window forward by `shift` days
    time_series = df["date"].drop_duplicates().tolist()
    train_test = [
        split_time_series(
            str_d=time_series[shift], mid_d=time_series[shift + train_size],
            end_d=time_series[shift + train_size + test_size], df=df)
        for shift in n_days
    ]

    # scoring with the tuned hyper-parameters stored on S3
    param = S3Manager(bucket_name="production-bobsim").load_dump(
        key="food_material_price_predict_model/research/tuned_params.pkl")
    scores = [scoring(tt, param=param) for tt in train_test]
    ser = pd.Series(scores, index=n_days)

    # plot
    series_plot(ser=ser, kind="bar", x_label="day", y_label="customized RMSE",
                title="train, test size: {}, {}".format(train_size, test_size), d=0.3)

    # save the rendered figure to S3
    S3Manager(bucket_name="production-bobsim").save_plt_to_png(
        key="food_material_price_predict_model/research/{date}/image/metric_by_other_term_train{train}/test{test}.png".format(
            date=date, train=train_size, test=test_size))
Example #3
0
    def tuned_process(self, dataset, intercept_shift: float = 150):
        """
            tuned ElasticNet for production
        :param dataset: merged 3 dataset (raw material price, terrestrial weather, marine weather)
            as a (train_x, train_y, test_x, test_y) tuple
        :param intercept_shift: constant added to the fitted intercept so the
            model predicts conservatively (defaults to the previous magic 150)
        :return: metric (customized rmse)
        """
        train_x, train_y, test_x, test_y = dataset

        # init model & fit with the tuned hyper-parameters stored on S3
        model = ElasticNetModel(
            bucket_name=self.bucket_name,
            x_train=train_x, y_train=train_y,
            params=S3Manager(bucket_name=self.bucket_name).load_dump(
                key="food_material_price_predict_model/research/tuned_params.pkl"
            )
        )
        model.fit()

        # adjust intercept for conservative prediction (over- rather than under-predict)
        model.model.intercept_ = model.model.intercept_ + intercept_shift

        # predict & metric
        pred_y = model.predict(X=test_x)
        # r_test, r_pred = inverse_price(test_y), inverse_price(pred_y)
        metric = model.estimate_metric(scorer=customized_rmse, y=test_y, predictions=pred_y)

        # save
        # TODO self.now -> date set term, e.g. 010420 - 120420
        model.save(prefix="food_material_price_predict_model/{term}".format(term=self.term))
        return metric
Example #4
0
def main():
    """
        standalone process to load and save CSV file data with AWS S3
    """
    s3 = S3Manager(bucket_name="production-bobsim", )
    fetched = s3.fetch_objects(key="crawled_recipe", conversion_type="json")
    print(fetched)
Example #5
0
    def __init__(
            self, x_train, y_train, bucket_name,
            grid_params=None, score=mean_squared_error
    ):
        """
            Grid-search wrapper over ElasticNet with a time-series CV split.
        :param x_train: training features
        :param y_train: training target
        :param bucket_name: S3 bucket used for persistence
        :param grid_params: hyper-parameter grid; a sensible default when None
        :param score: scoring callable (lower is better)
        """
        default_grid = {
            "max_iter": [1, 5, 10],
            "alpha": [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
            "l1_ratio": np.arange(0.0, 1.0, 0.1)
        }
        if grid_params is None:
            grid_params = default_grid

        self.x_train = x_train
        self.y_train = y_train
        self.scorer = score

        # filled later during estimation
        self.error = None  # pd.Series
        self.metric = None

        # s3
        self.s3_manager = S3Manager(bucket_name=bucket_name)

        # logger
        self.logger = init_logger()

        # we have to know the relationship before and after obviously, so n_splits: 2
        cv_splits = TimeSeriesSplit(n_splits=2).split(self.x_train)
        super().__init__(
            estimator=ElasticNet(),
            param_grid=grid_params,
            scoring=make_scorer(self.scorer, greater_is_better=False),
            cv=cv_splits
        )
Example #6
0
def load_from_s3(bucket_name, key):
    """
        Download a dumped object from S3 and deserialize it.
    :param bucket_name: S3 bucket name
    :param key: object key within the bucket
    :return: the deserialized object (e.g. a fitted transformer)
    """
    with tempfile.TemporaryFile() as fp:
        S3Manager(bucket_name=bucket_name).s3_bucket.download_fileobj(
            Fileobj=fp, Key=key)
        fp.seek(0)  # rewind before reading the freshly written dump
        transformer = load(fp)
        # FIX: dropped the redundant fp.close() — the context manager already
        # closes the temporary file on exit.
    return transformer
Example #7
0
    def load(self):
        """
            fetch DataFrame and astype and filter by columns
        :return: pd DataFrame
        """
        # TODO: no use index to get first element.
        df_list = S3Manager(bucket_name=self.bucket_name).fetch_df_from_csv(key=self.load_key)
        return df_list[0]
Example #8
0
    def load(self):
        """
            init S3Manager instances and fetch objects
        :return: list of pd DataFrame (origin)
        """
        s3 = S3Manager(bucket_name=self.bucket_name)
        origin_dfs = s3.fetch_df_from_csv(key=self.s3_key)

        self.logger.info("{num} files is loaded".format(num=len(origin_dfs)))
        self.logger.info("load df from origin bucket")
        return origin_dfs
Example #9
0
    def load(self):
        """
            fetch DataFrame and astype and filter by columns
        :return: pd DataFrame
        """
        # TODO: no use index to get first element.
        raw = S3Manager(bucket_name=self.bucket_name).fetch_df_from_csv(key=self.load_key)[0]

        # filter by column and check types, then translate column names
        typed = raw[self.dtypes.keys()].astype(dtype=self.dtypes)
        return typed.rename(columns=self.translate, inplace=False)
Example #10
0
def load(filename="2014-2020"):
    """
        fetch DataFrame and astype and filter by columns
    :return: pd DataFrame
    """
    key = "public_data/open_data_terrestrial_weather/origin/csv/(unknown).csv".format(filename=filename)
    df_list = S3Manager(bucket_name="production-bobsim").fetch_df_from_csv(key=key)

    # TODO: no use index to get first element.
    return df_list[0]
    def call_price(self):
        """
            Fetch daily raw-material price rows from the public open API for every
            product code in self.code_list, concatenate them, and save the result
            to S3 as CSV.
        """
        # NOTE: a large block of commented-out single-request prototype code
        # (marked "TODO: remove #") was deleted here; the per-code loop below
        # supersedes it.

        def func(x: int):
            # build the query string for one product code
            a = {
                'EXAMIN_DE' : "20200504", #self.today,
                "&EXAMIN_PRDLST_CODE": x
            }
            args_str = ""
            for k, v in a.items():
                args_str += '%s=%s' % (k, v)
            res = requests.get('http://211.237.50.150:7080/openapi/7c785c42110451cba1eeb8b572111a4c48b98cba8d49c92fdb801607727df47c/json/Grid_20151128000000000315_1/1/5?{arg}'.format(arg=args_str))
            data = res.json()
            print(data)
            items = data["Grid_20151128000000000315_1"]["row"]
            return pd.DataFrame(items)

        df_list = [func(code) for code in self.code_list]
        print(df_list)

        def concat(x, y):
            # BUG FIX: the original returned x when x was empty, silently
            # dropping y — an empty left frame must yield the right frame.
            if x.empty:
                return y
            elif y.empty:
                return x
            else:
                return pd.concat([x, y])

        full_df = reduce(concat, df_list)
        full_df.drop(["ROW_NUM"], axis=1, inplace=True)
        print(full_df)
        load_key = "public_data/open_data_raw_material_price/origin/csv/(unknown).csv".format(filename="20205040") #self.today)
        manager = S3Manager(bucket_name="production-bobsim")
        manager.save_df_to_csv(df=full_df, key=load_key)
Example #12
0
    def __init__(self, bucket_name: str, x_train, y_train, params=None):
        """
            ElasticNet wrapper; builds the estimator from tuned hyper-parameters
            when provided, sklearn defaults otherwise.
        :param bucket_name: S3 bucket used for persistence
        :param x_train: training features
        :param y_train: training target
        :param params: dict of ElasticNet keyword arguments, or None
        """
        # logger
        self.logger = init_logger()

        # s3
        self.s3_manager = S3Manager(bucket_name=bucket_name)

        # tuned hyper-parameters when provided, library defaults otherwise
        self.model = ElasticNet() if params is None else ElasticNet(**params)

        self.x_train, self.y_train = x_train, y_train

        # filled later during estimation
        self.error = None
        self.metric = None
Example #13
0
    def __init__(self, base_url, bucket_name, key, head=False):
        """
            Selenium crawler base: opens a Chrome driver (headless by default)
            and wires up S3 persistence.
        :param base_url: root URL the crawler starts from
        :param bucket_name: S3 bucket used for persistence
        :param key: S3 key prefix for crawled output
        :param head: False -> run Chrome headless
        """
        self.logger = init_logger()

        self.bucket_name = bucket_name
        self.s3_manager = S3Manager(bucket_name=self.bucket_name)
        self.prefix = key
        self.base_url = base_url

        # NOTE(review): executable_path/chrome_options are deprecated in newer
        # selenium versions — kept as-is to match the pinned dependency.
        self.chrome_path = "C:/chromedriver"
        options = webdriver.ChromeOptions()
        if head is False:
            options.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=self.chrome_path,
                                       chrome_options=options)
def main():
    """
        save list of non-sparse item names (std_list).
    :return: exit code
    """
    bucket_name = "production-bobsim"

    # get standard list
    df, key = build_origin_price(bucket_name=bucket_name, date="201908")
    std_list = get_std_list(column=df["standard_item_name"], number=48)
    # print(std_list)

    # save standard list
    S3Manager(bucket_name=bucket_name).save_dump(
        x=std_list,
        key="food_material_price_predict_model/constants/std_list.pkl")
    return 0
Example #15
0
    def __init__(self, bucket_name: str, date: str):
        """
            Raw-material-price preprocessor: loads the origin CSV from S3 and
            keeps the dtype-filtered, column-translated DataFrame in input_df.
        :param bucket_name: S3 bucket used for load/save
        :param date: date token substituted into the S3 keys
        """
        self.logger = init_logger()
        self.date = date

        # s3
        # TODO: bucket_name -> parameterized
        self.s3_manager = S3Manager(bucket_name=bucket_name)
        key_args = {"filename": self.date}
        self.load_key = "public_data/open_data_raw_material_price/origin/csv/(unknown).csv".format(**key_args)
        self.save_key = "public_data/open_data_raw_material_price/process/csv/(unknown).csv".format(**key_args)

        self.dtypes = dtype["raw_material_price"]
        self.translate = translation["raw_material_price"]

        # load filtered df
        self.input_df = self.load()
Example #16
0
 def get_recipes(prefix, source):
     """Return crawled recipe JSON for the given prefix/source as a Flask response."""
     key = "crawled_{p}/{s}".format(p=prefix, s=source)
     data = S3Manager("production-bobsim").fetch_dict_from_json(key=key)
     if data is None:
         return 'there is no data'
     return jsonify(data)
Example #17
0
def save_to_s3(transformer, bucket_name, key):
    """
        Serialize an object into a temporary file and upload it to S3.
    :param transformer: object to dump (e.g. a fitted transformer)
    :param bucket_name: S3 bucket name
    :param key: object key within the bucket
    """
    with tempfile.TemporaryFile() as fp:
        dump(transformer, fp)
        fp.seek(0)  # rewind so read() returns the whole dump
        S3Manager(bucket_name=bucket_name).save_object(body=fp.read(), key=key)
        # FIX: dropped the redundant fp.close() — the context manager already
        # closes the temporary file on exit.
Example #18
0
def inverse_price(self, price):
    """Undo the (mean, std) normalization previously applied to prices."""
    stats_key = "food_material_price_predict_model/price_(mean,std)_{date}.pkl".format(date=self.date)
    mean, std = S3Manager(bucket_name=self.bucket_name).load_dump(key=stats_key)
    return std * price + mean
 def save_coef(self, bucket_name, key):
     """Persist the model's coefficient DataFrame to S3 as CSV."""
     manager = S3Manager(bucket_name=bucket_name)
     manager.save_df_to_csv(self.coef_df, key=key)
 def save_model(self, bucket_name, key):
     """Dump the fitted model object to S3."""
     S3Manager(bucket_name=bucket_name).save_dump(self.model, key=key)
Example #21
0
 def save(self, df: pd.DataFrame):
     """Write the given DataFrame to S3 at self.save_key as CSV."""
     S3Manager(bucket_name=self.bucket_name).save_df_to_csv(df=df, key=self.save_key)