Ejemplo n.º 1
0
    def _select_enterprises(self):
        """Return the ids of enterprises that have enough frequent customers.

        Reads the enterprise id list and the transaction rows through
        ``read_df_from_mysql_db`` (``self.init_date`` is passed as
        ``start_time``). For every enterprise id, a customer is counted at
        most once per calendar day; a customer is "frequent" when it has at
        least ``self.min_purchase_count`` such purchase days. An enterprise is
        kept when at least ``self.customer_threshold`` customers are frequent.

        :return: list of enterprise ids that satisfy both thresholds, in the
                 order of the retrieved enterprise id list
        :raises ValueError: if the transaction data of an enterprise lacks the
                            ``customer_id`` or ``create_time`` column
        """
        print("Scanning all enterprises transaction data to filter enterprises whose number of frequent customer reach the minimum threshold ...")
        # get enterprise_id list in the enterprise db
        print("Retrieving enterprise id list from enterprise table ...")
        enterprises_id_df = next(read_df_from_mysql_db(
            localhost=self.localhost, username=self.username, password=self.password,
            dbname=self.dbname, tbname=self.enter_tbname, enter_field=self.enter_field,
            enterprise_id=self.enter_list, fields="enterprise_id"))
        enterprises_trans_df = next(read_df_from_mysql_db(
            localhost=self.localhost, username=self.username, password=self.password,
            dbname=self.dbname, tbname=self.trans_tbname, enter_field=self.enter_field,
            enterprise_id=self.enter_list,
            fields=["customer_id", "enterprise_id", "create_time"],
            time_field="create_time", start_time=self.init_date.strftime("%Y-%m-%d")))

        required_columns = ('customer_id', 'create_time')

        # filter enterprises
        filter_enters = []
        for enterprise_id in enterprises_id_df.enterprise_id:
            print("Analyzing current enterprise: {}".format(enterprise_id))
            enter_df = enterprises_trans_df[enterprises_trans_df['enterprise_id'] == enterprise_id]
            # next loop if df is empty:
            if len(enter_df.index) == 0:
                continue
            # Validate before touching the columns, with an explicit raise:
            # an `assert` would be stripped when Python runs with -O.
            if any(column not in enter_df.columns for column in required_columns):
                raise ValueError("Input df must have customer_id header and create_time header!")
            # Truncate timestamps to midnight so that several purchases of one
            # customer on the same day collapse into a single row below.
            # `assign` returns a new frame, so nothing is written into the
            # filtered slice of `enterprises_trans_df`.
            enter_df = enter_df.assign(
                create_time=enter_df['create_time'].apply(
                    lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)))
            enter_df = enter_df.drop_duplicates(['customer_id', 'create_time'])
            # number of customers with at least `min_purchase_count` purchase days
            cus_count = (enter_df.customer_id.value_counts() >= self.min_purchase_count).sum()
            if cus_count >= self.customer_threshold:
                filter_enters.append(enterprise_id)
                print("enterprise {} satisfied: {} customers.".format(enterprise_id, cus_count))
        print("Analyzing enterprise done!")
        return filter_enters
Ejemplo n.º 2
0
 def trainingset_generation(self, enterprise_id_list_file=None, fields=["customer_id", "enterprise_id", "price", "create_time"], outdir=".", override=False):
     """
     Write the purchase-interval data and the training set of every filtered enterprise to ``outdir``.

     For each enterprise id, two files are produced:
     ``<outdir>/<enterprise>.intervals.csv`` (the positive time intervals) and
     ``<outdir>/<enterprise>.csv`` (rows of ``X1..Xn``, ``Y``, ``customer_id``,
     ``enterprise_id`` with ``n = self.train_input_length``).

     :param enterprise_id_list_file: file listing the enterprises whose data meets the minimum requirement.
                                     Defaults to ``<outdir>/filtered_enterprise_id.txt``; when the file does not
                                     exist (or ``override`` is set) it is (re)created from ``_select_enterprises``
     :param fields: column headers to retrieve from the transaction table
                    (the default list is only passed through, never modified here)
     :param outdir: output directory for the generated files; created if missing
     :param override: re-generate files that already exist
     :return: None; the results are written to files in ``outdir``
     Note: 1. Total transaction period should be larger than train_input_length + 1 (test_set_times)
           2. if init date is not current date, it should follow time format: yyyy-mm-dd
     """
     print("Get training dataset of each enterprise...")
     # create output dir if not exists
     if not os.path.exists(outdir):
         os.makedirs(outdir)
     # get enterprise id list
     if not enterprise_id_list_file:
         enterprise_id_list_file = outdir + "/filtered_enterprise_id.txt"
     if not os.path.exists(enterprise_id_list_file) or override:
         filter_enterprises = self._select_enterprises()
         # save filtered enterprise ids to file
         list2file(filter_enterprises, enterprise_id_list_file)
     # always read the ids back from the file so both paths yield the same representation
     filter_enterprises = file2list(enterprise_id_list_file)
     # get transaction df
     # NOTE(review): unlike the query in _select_enterprises, no time_field is passed
     # alongside start_time here - confirm the helper's default is the intended column.
     trans_df = next(read_df_from_mysql_db(localhost=self.localhost, username=self.username, password=self.password,
                                      dbname=self.dbname, tbname=self.trans_tbname, enter_field=self.enter_field, enterprise_id=self.enter_list, fields=fields,
                                      start_time=self.init_date.strftime("%Y-%m-%d")))
     # input column names X1..Xn are the same for every customer and enterprise
     X_cols = ['X' + str(x) for x in range(1, 1 + self.train_input_length)]
     for enterprise in filter_enterprises:
         outfile = outdir + "/" + str(enterprise) + ".csv"
         interval_file = outdir + "/" + str(enterprise) + ".intervals.csv"
         # an existing interval file marks the enterprise as already processed
         if os.path.exists(interval_file) and not override:
             continue
         print("Retrieving transaction data of {} from transaction table".format(enterprise))
         enter_df = trans_df[trans_df['enterprise_id'] == int(enterprise)]
         # df with interval
         df_interval = self._calculate_time_interval(enter_df)
         # remove lines with time interval is 0 (when encounter new customers)
         # (.loc with a boolean mask replaces the removed DataFrame.ix indexer)
         df_interval = df_interval.loc[df_interval.time_interval > 0, :]
         # output intervals data to file for later distribution assessment and data merging
         interval_output = df_interval.time_interval
         interval_output.to_csv(interval_file)
         # keep an existing training set unless asked to regenerate it
         if os.path.exists(outfile) and not override:
             continue
         print("Filtering customers whose purchase times meet the minimum threshold: {}".format(self.min_purchase_count))
         df_interval = self.check_transaction_data(df_interval, init_date=self.init_date)
         # get all unique customer_ids
         all_cus_ids = df_interval.customer_id.unique()
         print("Formating the dataset...")
         # Collect the per-customer frames and concatenate once; appending to a
         # growing DataFrame inside the loop copies all previous rows each time.
         customer_frames = []
         for current_customer in all_cus_ids:
             dataset = np.asarray(df_interval.time_interval[df_interval.customer_id == current_customer])
             dataX, dataY = self._create_interval_dataset(dataset, look_back=self.train_input_length)
             dfX = pd.DataFrame(dataX, columns=X_cols)
             dfY = pd.DataFrame(dataY, columns=['Y'])
             dfY['customer_id'] = current_customer
             dfY['enterprise_id'] = enterprise
             customer_frames.append(pd.concat((dfX, dfY), axis=1))
         if customer_frames:
             # ignore_index gives the consecutive 0..N-1 index written to the file
             df_cur_enter = pd.concat(customer_frames, axis=0, ignore_index=True)
         else:
             # no customer left after filtering: still emit an (empty) file
             df_cur_enter = pd.DataFrame()
         # output training dataset of current enterprise to output directory
         print("Output formated training dataset to file: {}".format(outfile))
         df_cur_enter.to_csv(outfile)
     print("End generation!")