def remove_columns_containing_all_nan_values(df, threshold=80):
    """
    Receives a dataframe and a threshold; removes every column whose
    percentage of NaN/null-like values is >= threshold.
    :param df: original dataframe containing data
    :param threshold: NaN threshold as a percentage (0-100)
    :return: dataframe after removing the columns
    """
    try:
        null_counts = df.select(
            [funct.count(funct.when(funct.col(col).isNull()
                                    | funct.col(col).contains("NULL")
                                    | funct.col(col).contains("null")
                                    | funct.col(col).contains("Null")
                                    | funct.col(col).contains("None")
                                    | funct.col(col).contains("NONE")
                                    | funct.col(col).contains("none"),
                                    col)).alias(col)
             for col in df.columns]).collect()[0].asDict()
        size_df = df.count()
        to_drop = [k for k, v in null_counts.items()
                   if ((v / size_df) * 100) >= threshold]
        logger.warning("columns to drop: %s", to_drop)
        df = df.drop(*to_drop)
        return df
    except Exception as e:
        logger.error(e)
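# Usage sketch (illustrative only): assumes a SparkSession bound to `spark`,
# pyspark.sql.functions imported as `funct`, and that this function lives on
# the Duplication class, as the tests below suggest; the path is hypothetical.
#
#   df = spark.read.csv("data/input.csv", inferSchema=True, header=True)
#   df = Duplication().remove_columns_containing_all_nan_values(df, threshold=80)
#   # any column whose null-like share is >= 80% is dropped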
def read(self, address="", local="yes", file_format="csv", s3={}):
    """
    :param address: path of the file to read
    :param local: "yes" when the file is on the local filesystem
    :param file_format: format of the file, e.g. "csv"
    :param s3: dict of s3 connection details; non-empty when reading from s3
    :return: dataframe on success, or a dict with success/message on failure
    """
    try:
        if local == "yes":
            # Read the file saved locally.
            rf = ReadFileFromLocal()
            self.dataframe = rf.read(address, file_format)
        elif s3 != {}:
            # Read the data from s3.
            self.dataframe = ReadFileFromS3(address, file_format, s3)
        else:
            # Not sure where the file is saved.
            message = ("Please make sure you have the file saved on "
                       "either your local system or s3.")
            logger.debug(message)
            self.dataframe = {"success": False, "message": message}
        return self.dataframe
    except Exception as e:
        logger.error(e)
def removing_stop_words(self, x, base_url):
    """
    Receives a url value and base_url; returns the cleaned url.
    :param x: row on which cleaning needs to be performed
    :param base_url: contains the base_url
    :return: cleaned url
    """
    try:
        # If the base_url param is empty, derive it with urllib.
        if base_url == '':
            parsed = urlparse(x)
            base_url = (parsed.netloc if parsed.scheme != ''
                        else parsed.path.split("/")[0])
        x = x.replace("https://", "").replace("http://", "").replace(base_url, "")
        # Keep word characters, colons and apostrophes; drop other special characters.
        tokens = re.findall(r"[\w:']+", x)
        # Remove duplicate words from the url while preserving order.
        tokens = list(dict.fromkeys(tokens))
        # Remove stop words from the url.
        elem = [word for word in tokens if word not in self.stop_words]
        # Prepend the base_url.
        elem.insert(0, base_url)
        return '/'.join(elem)
    except Exception as e:
        logger.error(e)
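# Example (illustrative): assumes self.stop_words == {"to", "the"} and that
# `cleaner` is an instance of the owning class.
#
#   cleaner.removing_stop_words("https://example.com/how-to-clean-the-data", "")
#   # -> "example.com/how/clean/data"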
def date_formatting(self, x):
    """
    The dateutil library is used to convert different date formats
    into a standard format.
    :param x: row-wise date value
    :return: date in standard format
    """
    try:
        return str(parser.parse(x))
    except Exception as e:
        logger.error(e)
        return str(x)
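# Example (illustrative): dateutil's parser normalizes many layouts; here
# `formatter` stands for an instance of the owning class.
#
#   formatter.date_formatting("03/21/2020")   # -> "2020-03-21 00:00:00"
#   formatter.date_formatting("21 Mar 2020")  # -> "2020-03-21 00:00:00"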
def on_message(self, message):
    for msg in message:
        logger.debug("TOPIC: %s - PARTITION: %s - KEY: %s"
                     % (msg.topic, msg.partition, msg.key))
        data = json.loads(msg.value)
        logger.debug("POLL _ID: %s" % data.get("_id"))
        try:
            self.callback(data.get("data"))
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(e.args)
            raise e
def run(self, df):
    try:
        time_variables = self.find_time_variables(df)
        logger.debug(time_variables)
        for v in time_variables:
            df = self.string_to_timestamp(df, v)
            self.update_metadata(v)
        return df
    except Exception as e:
        logger.error(e)
def read_parquet(self, path):
    """
    :param path: path of the parquet file
    :return: dataframe on success, or a dict with success/message on failure
    """
    try:
        self.dataframe = self.spark_session.read.load(path)
        return self.dataframe
    except Exception as e:
        logger.error(e)
        return {"success": False, "message": str(e)}
def fetch_columns_containing_url(df):
    """
    Automatically fetch the names of columns that contain urls.
    :param df: orig dataframe
    :return: list of columns containing urls
    """
    try:
        col_dict = df.select(
            [funct.col(col).rlike(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))').alias(col)
             for col in df.columns]).collect()[0].asDict()
        col_containing_url = [k for k, v in col_dict.items() if v is True]
        return col_containing_url
    except Exception as e:
        logger.error(e)
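# Usage sketch (illustrative): only the first collected row is inspected,
# so a column is detected only when row 0 already holds a url.
#
#   url_cols = fetch_columns_containing_url(df)   # e.g. ["landing_page"]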
def run(self, df):
    try:
        time_variables = self.find_time_variables(df)
        for v in time_variables:
            df = self.make_new_variables(df, v)
            self.update_metadata(v)
        return df
    except Exception as e:
        logger.error(e)
        return e
def prep(self, dataframe=None, s3={}, local="no", file_name="",
         file_format="csv", output_address="", output_format=""):
    """
    :param dataframe: optional pandas or spark dataframe to prepare
    :param s3: dict of s3 connection details
    :param local: "yes" when the input file is on the local filesystem
    :param file_name: name of the input file
    :param file_format: format of the input file, e.g. "csv"
    :param output_address: where to write the output
    :param output_format: format of the output
    :return: the preprocessed dataframe
    """
    self.local = local
    self.s3 = s3
    self.file_format = file_format
    self.file_name = file_name
    self.output_address = output_address
    self.output_format = output_format
    try:
        # Convert the dataframe into a spark dataframe if one was provided.
        if dataframe is not None:
            if isinstance(dataframe, pd.DataFrame):
                self.dataframe = self.spark_session.createDataFrame(dataframe)
            else:
                self.dataframe = dataframe
        # Read the file from local or s3 when no dataframe was passed.
        if dataframe is None:
            self.dataframe = self.read_as_dataframe()
        # Time to prepare the recipe.
        self.dataframe_output = self.preprocess()
        return self.dataframe_output
    except Exception as e:
        logger.error(e)
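# Usage sketch (illustrative): `pipeline` stands for an instance of the
# owning class; prep() accepts either an in-memory dataframe or
# file-location details, and both paths end in preprocess().
#
#   pdf = pd.DataFrame({"dates": ["03/21/2020", "21 Mar 2020"]})
#   out = pipeline.prep(dataframe=pdf)                        # pandas -> spark
#   out = pipeline.prep(local="yes", file_name="input.csv")   # read from disk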
def remove_cols_containing_nan():
    try:
        df = sql.read.csv("./run/rem_test.csv", inferSchema=True, header=True)
        return_df = Duplication().remove_columns_containing_all_nan_values(df)
        return_df.toPandas().to_csv('./run/rem_test_result.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
def cleaning_test():
    try:
        df = sql.read.csv("./run/column_rem.csv", inferSchema=True, header=True)
        return_df = Duplication().remove_columns_contains_same_value(df)
        return_df.toPandas().to_csv('./run/rem_test.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
def read_excel(self, path):
    """
    :param path: path of the excel file
    :return: dataframe on success, or a dict with success/message on failure
    """
    try:
        # NOTE: spark has no built-in excel reader, so this currently
        # falls back to the csv reader.
        self.dataframe = self.spark_session.read.csv(path, inferSchema=True,
                                                     header=True)
        return self.dataframe
    except Exception as e:
        logger.error(e)
        return {"success": False, "message": str(e)}
def date_cleaning():
    try:
        df = sql.read.csv("./run/testing_dates.csv", inferSchema=True, header=True)
        print(df.columns)
        return_df = DatetimeFormatting().date_cleaning(df, ['dates'])
        return_df.toPandas().to_csv('./run/date_test_res.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
def read_as_dataframe(self):
    """
    :return: the dataframe read from local or s3
    """
    try:
        read = ReadFile()
        self.dataframe = read.read(address=self.file_address,
                                   local=self.local,
                                   file_format=self.file_format,
                                   s3=self.s3)
        return self.dataframe
    except Exception as e:
        logger.error(e)
def run(self, df):
    try:
        numeric_columns = self.find_numeric_variables_saved_as_string(df)
        df = self.update_variable_types(df, numeric_columns)
        # Assumption: metadata is updated per column, mirroring the other
        # run() methods in this module.
        for v in numeric_columns:
            self.update_metadata(v)
        return df
    except Exception as e:
        logger.error(e)
def fetch_columns_containing_datetime(df):
    """
    Automatically detect the columns that contain date values.
    :param df: orig dataframe
    :return: list of column names containing date values
    """
    try:
        col_dict = df.select([
            funct.col(col).rlike(r'(\d+[/-]\d+[/-]\d{2,4})').alias(col)
            for col in df.columns
        ]).collect()[0].asDict()
        col_containing_datetime = [k for k, v in col_dict.items() if v is True]
        return col_containing_datetime
    except Exception as e:
        logger.error(e)
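# Usage sketch (illustrative): like the url detector, only the first row is
# inspected; values such as "21/03/2020" or "2020-03-21" match the pattern.
#
#   date_cols = fetch_columns_containing_datetime(df)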
def remove_columns_contains_same_value(df):
    """
    Remove columns that contain only one distinct value.
    :param df: original dataframe containing data
    :return: dataframe after removing the columns
    """
    try:
        col_counts = df.select(
            [funct.countDistinct(funct.col(col)).alias(col)
             for col in df.columns]).collect()[0].asDict()
        to_drop = [k for k, v in col_counts.items() if v == 1]
        df = df.drop(*to_drop)
        return df
    except Exception as e:
        logger.error(e)
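# Usage sketch (illustrative): assumes a SparkSession bound to `spark` and
# the Duplication class seen in the tests below.
#
#   df = spark.createDataFrame([(1, "x"), (2, "x")], ["id", "constant"])
#   Duplication().remove_columns_contains_same_value(df).columns
#   # -> ["id"]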
def send(self, msg, key=None, partition=None, timestamp_ms=None):
    _id = allocate_uuid()
    msg = {"_id": _id, "data": msg}
    msg = self.__format_msg(msg)
    self.connector()
    try:
        self.client.send(MQ_KAFKA_TOPIC, value=msg, key=key,
                         partition=partition, timestamp_ms=timestamp_ms)
        logger.debug("SEND _ID: %s" % _id)
        return _id
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(e.args)
        raise e
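# Usage sketch (illustrative): `producer` stands for an instance of the
# owning class; the payload is wrapped as {"_id": ..., "data": ...}, which
# is the shape on_message above unwraps.
#
#   _id = producer.send({"action": "clean", "file": "input.csv"})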
def date_cleaning(self, df, column_name=[]):
    """
    Converts all the columns containing dates into a standard date format.
    In a for loop, every column's values are traversed and the
    udf_date_formatting function is applied.
    :param df: orig dataframe
    :param column_name: list of column names containing dates
    :return: new df containing new columns with the updated date values
    """
    try:
        for i in column_name:
            df = df.withColumn(
                i + '_new',
                self.udf_date_formatting()(funct.col(i).cast("String")))
        return df
    except Exception as e:
        logger.error(e)
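# Usage sketch (illustrative, mirroring the date_cleaning test below):
#
#   df = DatetimeFormatting().date_cleaning(df, ["dates"])
#   # df gains a "dates_new" column holding the standardized values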
def remove_duplicate_urls(self, df, column_name, base_url=''):
    """
    The orig dataframe is received along with the columns containing urls;
    those columns are cleaned to remove duplication.
    :param df: dataframe containing the data that needs to be cleaned
    :param column_name: list of columns containing urls
    :param base_url: base_url (optional)
    :return: dataframe with a "_new" column appended for each cleaned column
    """
    try:
        for i in column_name:
            df = df.withColumn(
                i + '_new',
                self.udf_remove_stop_words(base_url)(
                    funct.trim(funct.lower(funct.col(i).cast("string")))))
        return df
    except Exception as e:
        logger.error(e)
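# Usage sketch (illustrative): pairs with fetch_columns_containing_url;
# `cleaner` stands for an instance of the owning class.
#
#   url_cols = fetch_columns_containing_url(df)
#   df = cleaner.remove_duplicate_urls(df, url_cols, base_url="example.com")
#   # each column in url_cols gains a cleaned "<name>_new" companion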
def run(self, df):
    try:
        self.df = df
        variables = self.get_variables_types()
        df = self.train_model_for_categorical_variables()
        df = self.impute_categorical_variables()
        del_variables = self.delete_extra_categorical_variables()
        # Assumption: metadata is updated per variable, mirroring the other
        # run() methods in this module.
        for v in variables:
            self.update_metadata(v)
        # df = self.train_model_for_numerical_variables()
        # df = self.impute_numerical_variables()
        # del_variables = self.delete_extra_numerical_variables()
        return df
    except Exception as e:
        logger.error(e)