Beispiel #1
0
def computePeriod(X_df, orderedValues, suffix_feature="SumImports"):
    """
    Rebuild the hidden time ordering of the dataset as integer periods.

    According to the original author, the dataset holds 134 different
    periods and the lagged columns of any "Sum..." statistic take a finite
    number of values that overlap from one period to the next. This function
    exploits that overlap: starting from the last known value, it selects the
    rows whose lag column ``k`` equals that value and reads the single value
    they carry in lag column ``k + 1``, which becomes the next element of the
    chain. The walk is done first with ``k = 1`` and then, continuing from
    where the first walk stopped, with ``k = 11``.

    :param X_df: The original dataframe.
    :param orderedValues: List of already ordered values. It is extended
        in place; when empty it is seeded with ``getFirstValue``.
    :param suffix_feature: Feature name handed to ``get_suffix`` to resolve
        the lag column names.
    :return: A dataframe indexed by the first lag column's name, with one
        ``period`` column numbering the ordered values from 0.
    :raises ValueError: If the rows matching a value disagree on the next
        value.
    """

    # Seed the chain when the caller did not supply any known values.
    if len(orderedValues) == 0:
        orderedValues += [getFirstValue(X_df, suffix_feature)]

    for columnNumber in [1, 11]:
        # Column names do not change while walking, so resolve them once.
        current_column = get_suffix(suffix_feature, columnNumber)[0]
        next_column = get_suffix(suffix_feature, columnNumber + 1)[0]
        while True:
            prev_value = orderedValues[-1]
            matching_rows = X_df[current_column] == prev_value
            array_next = list(
                set(X_df.loc[matching_rows, [next_column]].values.ravel()))
            if len(array_next) == 0:
                # No row carries prev_value in this column: the chain
                # cannot be extended any further with this column pair.
                break
            if len(array_next) != 1:
                # A bare string cannot be raised in Python 3; use a real
                # exception type so the message reaches the caller.
                raise ValueError("There is not one unique value")
            orderedValues += [array_next[0]]

    index_feature = get_suffix(suffix_feature, 1)[0]
    period_df = pd.DataFrame(
        {index_feature: orderedValues,
         "period": range(len(orderedValues))}).set_index(index_feature)
    return period_df
Beispiel #2
0
def getFirstValue(X_df, suffix_feature):
    """
    Return the first value of lag column 1 that never occurs in lag column 2.

    The original author takes this value to be the one carried by the first
    period: every later period's lag-1 value is expected to show up in lag
    column 2 of some row.

    :param X_df: The original dataframe.
    :param suffix_feature: Feature name handed to ``get_suffix`` to resolve
        the lag column names.
    :return: The first matching value in order of appearance, or ``None``
        when every lag-1 value also occurs in lag column 2.
    """
    first_lag_values = X_df[get_suffix(suffix_feature, 1)[0]].unique()

    def absent_from_second_lag(candidate):
        # Count the rows of lag column 2 equal to the candidate.
        second_lag = X_df[get_suffix(suffix_feature, 2)[0]]
        return sum(second_lag == candidate) == 0

    return next(
        (candidate for candidate in first_lag_values
         if absent_from_second_lag(candidate)),
        None,
    )
    def transform(self, X_df):
        """
        Attach the engineered features to ``X_df`` and keep the model columns.

        Steps, in order:
        1. register the frame returned by ``computePeriod`` under the name
           ``"period"``;
        2. merge every frame stored in ``self.engineered_df`` into ``X_df``
           with ``mergeDf``;
        3. let ``createFeature`` add its column (it also extends
           ``self.engineered_features``);
        4. select the ``sumprod``, ``exports`` and ``refinery`` columns
           resolved by ``get_suffix`` plus the engineered features.

        :param X_df: The original dataframe.
        :return: The dataframe restricted to the selected columns.
        """
        self.registerEngineeredFeatures(
            computePeriod(X_df, self.ordered_values,
                          suffix_feature="SumImports"),
            "period",
            left_on=get_suffix("SumImports", 1))
        for engineered_df in self.engineered_df.values():
            X_df = mergeDf(X_df, engineered_df)

        X_df = createFeature(X_df, self.engineered_features)

        selected_columns = (get_suffix('sumprod', [11, 12])
                            + get_suffix(["exports", 'refinery'], [10, 11, 12])
                            + self.engineered_features)
        # ``.ix`` was removed from pandas; ``.loc`` performs the same
        # label-based column selection.
        return X_df.loc[:, selected_columns]
def computeCountryQuotient(X_df):
    """
    Compute, per country, the ratio of mean absolute imports to exports.

    For each of the ``Imports`` and ``Exports`` column groups resolved by
    ``get_suffix``, the absolute values are averaged per country and then
    across the group's columns. The result is the import figure divided by
    the export figure.

    :param X_df: Dataframe holding a ``country`` column and the columns
        returned by ``get_suffix('Imports')`` and ``get_suffix('Exports')``.
    :return: A Series indexed by country with the Imports / Exports ratio.
    """
    country_mean = dict()
    for columns_group in ['Imports', 'Exports']:
        # Take absolute values of the measurements only. Applying abs() to
        # the grouping key as well would fail on string country labels and
        # would merge groups whose numeric codes differ only by sign.
        magnitudes = X_df[get_suffix(columns_group)].abs()
        per_country = magnitudes.groupby(X_df["country"]).mean()
        country_mean[columns_group] = per_country.mean(axis=1)

    return country_mean['Imports'] / country_mean['Exports']
Beispiel #5
0
def computeVariance(X_df):
    """
    Add a normalised log-variance feature computed over Imports columns.

    The row-wise variance of the columns returned by
    ``get_suffix("Imports", range(7, 13))`` is passed through ``log(v + 1)``
    and divided by the largest resulting value. The result is stored in the
    ``variance_diffImports(kmt)`` column of ``X_df``, which is modified in
    place.

    Note: when the largest value is 0 (every row has zero variance) the
    division yields NaN for all rows.

    :param X_df: The original dataframe.
    :return: The same dataframe with the new column added.
    """
    variance = np.log(X_df[get_suffix("Imports", range(7, 13))].var(axis=1) +
                      1)
    # Series.max() skips NaN and tolerates an empty frame, whereas the
    # builtin max() gives an order-dependent answer when NaN is present and
    # raises on empty input.
    variance = variance / variance.max()
    X_df["variance_diff" + "Imports(kmt)"] = variance
    return X_df
Beispiel #6
0
def createFeature(X_df, engineered_features):
    """
    Add the ``Exports_10_11_12`` feature: the row-wise sum of export columns.

    The summed columns are those returned by
    ``get_suffix("exports", range(10, 13))``. Both arguments are modified in
    place: the column is written into ``X_df`` and its name is recorded in
    ``engineered_features``.

    :param X_df: The original dataframe.
    :param engineered_features: List of engineered feature names. The new
        name is appended only if it is not already present, so calling this
        function repeatedly with the same list does not create duplicates.
    :return: The same dataframe with the new column added.
    """
    feature_name = "Exports_10_11_12"
    if feature_name not in engineered_features:
        engineered_features += [feature_name]
    X_df[feature_name] = X_df[get_suffix("exports",
                                         range(10, 13))].sum(axis=1)
    return X_df