def howFill(self, df, colums):
        """Apply the null-handling strategy configured for one column.

        The column name is checked against the strategy entries read from
        `cf`, in a fixed order; the first match decides which fill routine
        runs. A column that matches nothing is returned untouched.

        Args:
            df: the data frame to process.
            colums: name of the column whose nulls should be handled.

        Returns:
            The data frame returned by the selected routine, or `df` itself
            when no strategy applies.
        """
        if colums in cf.get('null_processing_delete'):
            return FillMethods.delete(df, colums)

        if colums in cf.get('null_processing_assignMean'):
            return FillMethods.assignMean(df, colums)

        # This entry is compared by equality rather than by membership.
        if colums == cf.get('null_processing_property_fee'):
            return FillMethods.property_fee(df, colums)

        # NOTE: the 'null_processing_assingNumber' columns are intentionally
        # not dispatched here; per the original author's note that step
        # already performs both the fill and the value normalisation, so it
        # is run on its own.

        if colums in cf.get('null_processing_assignZero'):
            return FillMethods.assignZero(df, colums)

        if colums in cf.get('null_processing_assignMode'):
            return FillMethods.assignMode(df, colums)

        if colums == 'crawl_time':
            return self.dateDayNull(df)

        return df
    def assingNumber(df, null_processing_assingNumber):
        """Fill missing values in one column, then map it to numbers.

        Steps, in order:
          1. rows where the column is null are filled via `df.na.fill(0, ...)`;
          2. if any row holds the literal string 'NULL', the column is
             rewritten through `UDFMethods.udf_NULL_assignZero`;
          3. if the column is listed under 'null_processing_assingNumber' in
             `cf`, 'floor' and 'decoration' are rewritten through their
             dedicated UDFs; other listed columns are left as they are.

        Args:
            df: Spark data frame to process.
            null_processing_assingNumber: name of the column to process.

        Returns:
            The processed data frame.
        """
        target = null_processing_assingNumber

        def _rewrite_column(frame, converter):
            # Append the converted values under a temporary name, drop the
            # source column, then give the new column the original name.
            frame = frame.select('*', converter(frame[target]).alias('temp_name'))
            frame = frame.drop(target)
            return frame.withColumnRenamed('temp_name', target)

        # Year -> weight mapping (first configured year maps to 0). It is
        # built here but not consumed anywhere in this method.
        year_map_num_dict = {
            year: weight
            for weight, year in enumerate(
                range(cf.get('year_map_num_start'),
                      cf.get('year_map_num_end') + 1))
        }

        # --- missing-value fill ---
        null_to_zero = udf(UDFMethods.udf_NULL_assignZero, FloatType())

        # Both filters are taken from the incoming frame before it is changed.
        rows_is_null = df.filter(df[target].isNull())
        rows_null_literal = df.filter(df[target] == 'NULL')

        if rows_is_null.count() > 0:
            # NOTE(review): the original author observed that this fill leaves
            # the nulls in place without raising. DataFrame.na.fill skips
            # subset columns whose type does not match the fill value, so an
            # integer 0 is presumably ignored for a string column - confirm
            # against the actual column type.
            df = df.na.fill(0, target)

        if rows_null_literal.count() > 0:
            df = _rewrite_column(df, null_to_zero)

        # --- value conversion ---
        if target not in cf.get('null_processing_assingNumber'):
            return df

        if target == 'floor':
            return _rewrite_column(df, udf(UDFMethods.udf_floor, FloatType()))

        if target == 'decoration':
            return _rewrite_column(
                df, udf(UDFMethods.udf_decoration, FloatType()))

        return df
 def udf_room_type(s):
     """Return `s` if it is an accepted room type, otherwise '其他'.

     Accepted values are read from the 'uniformity_room_type' entry of `cf`.
     Any error raised by the membership test also yields '其他'.
     """
     allowed = cf.get('uniformity_room_type')
     try:
         return s if s in allowed else '其他'
     except Exception:
         return '其他'
    def udf_direction(s):
        """Map a raw direction value to its configured form.

        The mapping is read from the 'uniformity_direction' entry of `cf`.
        Values without a mapping, and any error during the lookup, give '其他'.
        """
        mapping = cf.get('uniformity_direction')

        try:
            return mapping.get(s, '其他')
        except Exception:
            return '其他'
    def udf_payType(s):
        """Map a raw pay-type value to its configured form.

        The mapping is read from the 'uniformity_pay_type' entry of `cf`.
        Values without a mapping, and any error during the lookup, give '其他'.
        """
        pay_type_map = cf.get('uniformity_pay_type')
        # Taken outside the try block, so a broken config entry still raises.
        known_values = list(pay_type_map.keys())

        try:
            return pay_type_map.get(s) if s in known_values else '其他'
        except Exception:
            return '其他'
Exemple #6
0
 def udf_floor_total(s):
     """Normalise a raw total-floor-count value to a float.

     * `None` or the literal string 'NULL' gives 0.0.
     * A value below 0 or above the 'uniformity_floor_total_max' entry of
       `cf` is replaced by `float(floor_total_mode)` (a name defined
       outside this function).
     * Any other value is returned as a float.
     * Anything that cannot be processed gives 0.0.

     The original guard `(s != None) | (s != 'NULL')` was always true, so
     missing values only reached 0.0 because `float()` raised; the check is
     now explicit.
     """
     try:
         if s is None or s == 'NULL':
             return 0.0
         value = float(s)
         if value < 0 or value > cf.get('uniformity_floor_total_max'):
             return float(floor_total_mode)
         return value
     except Exception:
         # Deliberate best effort: a bad cell must not abort the whole job.
         return 0.0
def trainDataSplit(df):
    """Split a frame into train/test features and 'price' targets.

    Every column except 'price' is used as a feature. The number of training
    rows is the row count multiplied by the 'train_size_rate' entry of `cf`,
    truncated to an int. The split uses a fixed random_state of 4.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).
    """
    feature_names = df.columns.tolist()
    feature_names.remove('price')

    features = df[feature_names]
    target = df['price']
    n_train = int(features.shape[0] * cf.get('train_size_rate'))

    return tuple(
        train_test_split(features,
                         target,
                         train_size=n_train,
                         random_state=4))
Exemple #8
0
def dataUniform(df):
    """Run the configured uniformity step for each listed column.

    Columns are taken from the 'uniformity_fields' entry of `cf`; those not
    present in `df` are skipped. For each processed column the null count is
    printed before and after the step, and the column is shown.

    Returns:
        The data frame after all applicable steps.
    """

    def _uniform_one(frame, name):
        # Pick the routine for this column; unknown names are only reported.
        if name == 'direction':
            return DataUniformity.direction(frame)
        if name == 'floor_total':
            return DataUniformity.floorTotal(frame)
        if name == 'is_broker':
            return DataUniformity.isBroker(frame)
        if name == 'rent_type':
            return DataUniformity.rentType(frame)
        # 'room_type' is intentionally not handled here.
        if name == 'pay_type':
            return DataUniformity.payType(frame)
        if name in ['price', 'score', 'house_count', 'area']:
            return Math.XiGeMa(frame, name, 3)
        if name == 'agency_name':
            return DataUniformity.agencyName(frame)
        if name == 'zone':
            return DataUniformity.zone(frame)
        print('the feature need not be processed')
        return frame

    fields_to_process = cf.get('uniformity_fields')
    present = df.columns
    print('dataUniform==========df.count()')
    df.count()

    for column in fields_to_process:
        if column not in present:
            continue

        print(
            'df.filter(df[colums].isNull()).count()=======data_uniformity_before',
            column,
            df.filter(df[column].isNull()).count())

        print('column=========', column, df.count())

        df = _uniform_one(df, column)

        print(
            'df.filter(df[colums].isNull()).count()=======data_uniformity_after',
            column,
            df.filter(df[column].isNull()).count())
        df.select(column).show()

    return df
Exemple #9
0
def oneHotAll(df):
    """One-hot encode the configured columns and join them to the rest.

    The columns to encode are those listed under 'null_no_processing' in
    `cf` that are also present in `df`. Each one is encoded with
    `oneHot(df, "id", <column>)`, the results are inner-joined on 'id', and
    that is inner-joined on 'id' with the remaining columns of `df`.

    Changes from the previous version:
      * if none of the configured columns is present, `df` is returned
        unchanged instead of raising IndexError;
      * columns are encoded in the order they appear in `df.columns`
        rather than in set-iteration order, so the output column order is
        the same on every run.

    Args:
        df: Spark data frame; presumably contains an 'id' column, since
            every join is keyed on it - confirm against callers.

    Returns:
        The joined data frame, or `df` itself when nothing needs encoding.
    """
    columns = df.columns
    configured = set(cf.get('null_no_processing'))
    onHotFields = [name for name in columns if name in configured]

    if not onHotFields:
        # Nothing to encode: every column is already in its final form.
        return df

    encoded = oneHot(df, "id", onHotFields[0])
    for field in onHotFields[1:]:
        encoded = encoded.join(oneHot(df, "id", field), on='id', how='inner')

    remaining = [name for name in columns if name not in configured]
    return df.select(remaining).join(encoded, 'id', 'inner')