def isNumber(self, string):
    """Return True when *string* parses as a number via cleanfloat()."""
    try:
        cleanfloat(string)
    except ValueError:
        return False
    return True
# Beispiel #2 (score: 0) — listing-export artifact, commented out so the file parses
 def cast(self, string):
     """Convert *string* to an int, a float, or leave it as a string.

     None and the empty string map to None; anything that fails both
     numeric conversions is returned unchanged.
     """
     if string is None:
         return None
     try:
         return int(string)
     except ValueError:
         pass
     try:
         return cleanfloat(string)
     except ValueError:
         return None if string == '' else string
# Beispiel #3 (score: 0) — listing-export artifact, commented out so the file parses
    def _getRowExtraVector(self, ret, column_name, col_row_index, distances):

        predict_columns = self.train_meta_data.model_predict_columns

        desired_total = self.train_meta_data.window_size
        batch_height = len(ret[column_name])
        remaining_row_count = batch_height - (col_row_index + 1)

        harvest_count = desired_total if desired_total < remaining_row_count else remaining_row_count
        empty_count = desired_total - harvest_count
        empty_vector_len = (len(ret[column_name][col_row_index]) + sum([
            len(ret[predict_col_name][0])
            for predict_col_name in predict_columns
        ]) + 1) * empty_count  # this is the width of the padding

        row_extra_vector = []

        for i in range(harvest_count):
            try:
                row_extra_vector += ret[column_name][col_row_index + i + 1]
                row_extra_vector += [distances[col_row_index + i + 1]]

                # append the target values before:
                for predict_col_name in predict_columns:
                    row_extra_vector += [
                        cleanfloat(v)
                        for v in ret[predict_col_name][col_row_index + i + 1]
                    ]
            except:
                logging.error(traceback.format_exc())
                logging.error(
                    'something is not right, seems like we got here with np arrays and they should not be!'
                )

        if empty_count > 0:
            # complete with empty
            row_extra_vector += [0] * empty_vector_len

        return row_extra_vector
    def run(self):
        """Generate per-column statistics over a sample of the input data.

        Samples the input rows (sample size driven by the configured margin
        of error / confidence level), infers each column's data type, then
        computes numeric stats (mean/median/variance/skew/kurtosis,
        histogram, percentage buckets) or text stats (word dictionary,
        histogram). Stores everything on
        ``self.transaction.persistent_model_metadata`` and returns the
        stats dict keyed by column name.
        """
        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        header = self.transaction.input_data.columns
        origData = {}

        for column in header:
            origData[column] = []

        empty_count = {}
        column_count = {}

        # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(sampleSize(population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size), sample_size)
        # NOTE(review): `self.logging` here but module-level `logging` below — confirm both exist
        self.logging.info('population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100))

        # bucket each sampled cell into its column; null-like cells only bump the counters
        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]
            for i, val in enumerate(row):
                column = header[i]
                value = tryCastToNumber(val)
                if column not in empty_count:  # fixed idiom: was `not column in`
                    empty_count[column] = 0
                    column_count[column] = 0
                if value is None:  # fixed: was `value == None`
                    empty_count[column] += 1
                else:
                    origData[column].append(value)
                column_count[column] += 1

        stats = {}

        for col_name in origData:  # fixed: dropped unused enumerate index that shadowed `i` below
            col_data = origData[col_name]  # all rows in just one column
            data_type = self.getColumnDataType(col_data)

            # NOTE: Enable this if you want to assume that some numeric values can be text
            # We noticed that by default this should not be the behavior
            # TODO: Evaluate if we want to specify the problem type on predict statement as regression or classification
            #
            # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
            #     unique_count = len(set(col_data))
            #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
            #         data_type = DATA_TYPES.TEXT

            if data_type == DATA_TYPES.DATE:
                # convert parseable dates to unix timestamps, null-like markers to None
                for i, element in enumerate(col_data):
                    if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                        col_data[i] = None
                    else:
                        try:
                            col_data[i] = int(parseDate(element).timestamp())
                        except Exception:  # fixed: bare `except:` also swallowed KeyboardInterrupt
                            logging.warning('Could not convert string to date and it was expected, current value {value}'.format(value=element))
                            col_data[i] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                # strip empty/whitespace cells, drop null-like markers, cast to float
                newData = [value for value in col_data if value != '' and value != '\r' and value != '\n']
                col_data = [cleanfloat(i) for i in newData if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0  # bin edges -> bin centers
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

                    # build geometrically growing bucket edges from min toward max
                    inc_rate = 0.05
                    initial_step_size = abs(max_value - min_value) / 100

                    xp += [min_value]
                    edge = min_value + initial_step_size  # fixed: was `i`, shadowing the loop index above

                    while edge < max_value:
                        xp += [edge]
                        edge += abs(edge - min_value) * inc_rate

                    # TODO: solve inc_rate analytically for a target bucket count N
                else:
                    # empty numeric column: neutral defaults, no buckets
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                # fixed: `max([...])` raised ValueError for an empty column; any() is safe and equivalent
                is_float = any(int(v) != v for v in col_data)

                col_stats = {
                    "column": col_name,
                    KEYS.DATA_TYPE: data_type,
                    # "distribution": best_fit_name,
                    # "distributionParams": distribution_params,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "emptyColumns": empty_count[col_name],
                    "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
                stats[col_name] = col_stats
            # else if its text
            else:
                # see if its a sentence or a word
                is_full_text = True if data_type == DATA_TYPES.FULL_TEXT else False
                dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100
                    # if the number of uniques is too large then treat is a text
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                        dictionary = []
                        dictionary_available = False

                col_stats = {
                    "column": col_name,
                    KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "emptyColumns": empty_count[col_name],
                    "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                    "histogram": histogram
                }
                stats[col_name] = col_stats

        # persist row counts alongside the per-column stats
        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.persistent_model_metadata.column_stats = stats
        self.transaction.persistent_model_metadata.total_row_count = total_rows
        self.transaction.persistent_model_metadata.test_row_count = test_rows
        self.transaction.persistent_model_metadata.train_row_count = train_rows
        self.transaction.persistent_model_metadata.validation_row_count = validation_rows

        self.transaction.persistent_model_metadata.update()

        return stats