def read_local_files(self):
    """Read plain-text files from a local path and build training documents.

    Each file may contain one or more text documents separated by a
    configured separator string; every document is cleaned and kept only if
    non-blank after cleanup.

    Returns:
        tuple[list, list]: (cleaned text documents, label values). The label
        list is populated only when ``label_value_override`` is configured,
        otherwise it stays empty.
    """
    tfr = TextFileReader()
    tc = TextCleaner()
    access = self.source_config["access"]

    # Optional filename filter; fall back to "" (match everything) when the
    # key is absent or falsy — the original indexed access["file_filter"]
    # directly and would raise KeyError on a missing key, unlike the
    # `in access` guards used for every other optional key.
    file_filter = access.get("file_filter") or ""

    # Read file names from given path
    files_list = utils.get_files_in_path(access["path"], file_filter)

    # Loop-invariant config reads, hoisted out of the per-file loop.
    # If a label is provided globally, it is the class/prediction used for
    # training for every document this reader produces.
    global_label_value = None
    if "label_value_override" in access:
        global_label_value = access["label_value_override"]

    has_separator = "document_separator" in access

    file_data_list = []
    label_value_list = []
    for file in files_list:
        file_data = tfr.read_file(file)

        # Text read from the file may contain one or more documents,
        # separated by some character or string — split into a list of docs.
        if has_separator:
            multi_docs = tc.split_multi_text_by_separator(
                file_data, separator_code=access["document_separator"])
        else:
            multi_docs = [file_data]

        self.logger.info("Found {} markup documents ".format(len(multi_docs)))

        # Iterate through each document, clean it, and keep non-blank results.
        for textdoc in multi_docs:
            clean_data = self.cleanup_data(textdoc)
            if clean_data is not None and len(clean_data.strip()) > 0:
                file_data_list.append(clean_data)
                # Labels are tracked only when a global override exists.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)

    return file_data_list, label_value_list
def cleanup_data(self, clean_data):
    """Apply the configured sequence of cleaning steps to a text document.

    Steps are read from ``source_config["clean"]`` and applied in order;
    recognized step names are ``remove_all_markup``,
    ``remove_html_encoded_chars``, ``remove_special_chars`` and
    ``remove_white_spaces``. Unknown step names are silently skipped.

    Args:
        clean_data: Raw text to clean.

    Returns:
        The text after all configured cleaning steps have been applied.
    """
    tc = TextCleaner()
    clean_steps = self.source_config["clean"]

    # A step name can match at most one action, so use elif instead of the
    # original chain of independent ifs (same behavior, no redundant tests).
    for cstep in clean_steps:
        stepname = cstep["step"]
        if stepname == "remove_all_markup":
            clean_data = tc.remove_all_markup(clean_data, valid_markup=False)
        elif stepname == "remove_html_encoded_chars":
            clean_data = tc.remove_html_encoded_chars(clean_data,
                                                      replace_char=' ')
        elif stepname == "remove_special_chars":
            # Per-step override of the character set, else the class default.
            special_chars = cstep.get("special_chars", self.def_special_chars)
            clean_data = tc.remove_special_chars(special_chars, clean_data)
        elif stepname == "remove_white_spaces":
            white_space_chars = cstep.get("white_space_chars",
                                          self.def_white_space_chars)
            clean_data = tc.remove_white_spaces(
                white_space_chars=white_space_chars, doc=clean_data)

    return clean_data
def read_local_files(self):
    """Read JSON files from a local path and extract text documents + labels.

    Each file holds either a single JSON object or an array of objects. The
    configured ``data_element`` names the field that contains the text; an
    optional ``label_element`` field on each object supplies a per-object
    label unless a global ``label_value_override`` is configured, which wins.

    Returns:
        tuple[list, list]: (cleaned text documents, label values).
    """
    tfr = TextFileReader()
    tc = TextCleaner()
    access = self.source_config["access"]

    # Optional filename filter; "" matches everything. Guarded with .get so a
    # missing key does not raise, matching the other optional-key checks.
    file_filter = access.get("file_filter") or ""

    # Read file names from given path
    files_list = utils.get_files_in_path(access["path"], file_filter)

    # Which data element is to be read from the JSON object for text data
    data_element = access["data_element"]

    # If a label is provided globally, it overrides any per-object label.
    global_label_value = None
    if "label_value_override" in access:
        global_label_value = access["label_value_override"]

    file_data_list = []
    label_value_list = []

    def process_object(jsonobj):
        # Extract, clean and record one JSON object's text and label.
        # NOTE(review): the label field name is hard-coded as
        # "label_element" while the text field is configurable via
        # data_element — confirm this asymmetry is intended.
        label_value = None
        if "label_element" in jsonobj:
            label_value = utils.if_null(jsonobj["label_element"], None)
        if data_element in jsonobj:
            clean_data = self.cleanup_data(jsonobj[data_element])
            # After cleanup, keep only documents with non-blank content.
            if clean_data is not None and len(clean_data.strip()) > 0:
                file_data_list.append(clean_data)
                # Global override wins over the per-object label.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)
                else:
                    label_value_list.append(label_value)

    for file in files_list:
        # Read file content and convert the JSON string to Python objects.
        data = tc.string_to_json_object(tfr.read_file(file))

        # A file can be a single object or an array of objects; the original
        # duplicated the whole extraction block in each branch — both paths
        # now funnel through process_object.
        if isinstance(data, dict):
            process_object(data)
        elif isinstance(data, list):
            for jsonobj in data:
                process_object(jsonobj)

    return file_data_list, label_value_list
def read_local_files(self):
    """Read delimited (CSV-style) files from a local path into documents/labels.

    Config keys under ``access``: ``delimiter`` (default ","), ``header_row``
    (default None), ``data_column`` (default 0), optional ``label_column``,
    and optional ``label_value_override`` which wins over per-row labels.

    Returns:
        tuple[list, list]: (cleaned text documents, label values).
    """
    tfr = TextFileReader()
    access = self.source_config["access"]

    # Optional filename filter; "" matches everything.
    file_filter = access.get("file_filter") or ""

    # Read column delimiter and layout options, with defaults.
    delimiter = ","
    if "delimiter" in access:
        delimiter = utils.if_null(access["delimiter"], ",")
    header_row = None
    if "header_row" in access:
        header_row = utils.if_null(access["header_row"], None)
    data_column = 0
    if "data_column" in access:
        data_column = utils.if_null(access["data_column"], 0)
    label_column = None
    if "label_column" in access:
        label_column = utils.if_null(access["label_column"], None)

    # Columns to load: text first, then (optionally) the label column.
    # NOTE(review): if read_csv_file delegates to pandas.read_csv, usecols
    # selects columns in *file* order, not list order — verify that
    # data_column < label_column or that the reader reorders, otherwise the
    # iloc[:, 0] / iloc[:, 1] accesses below would be swapped.
    usecols = [data_column]
    if label_column is not None:
        usecols.append(label_column)

    # If a label is provided globally, it overrides any per-row label.
    global_label_value = None
    if "label_value_override" in access:
        global_label_value = utils.if_null(access["label_value_override"],
                                           None)

    # Read file names from given path
    files_list = utils.get_files_in_path(access["path"], file_filter)

    file_data_list = []
    label_value_list = []
    for file in files_list:
        # Read the csv file as a pandas dataframe (first selected column is
        # the text, second — if present — the label).
        data_df = tfr.read_csv_file(file_path=file, separator=delimiter,
                                    header_row=header_row,
                                    select_cols=usecols)
        self.logger.info("Found {} markup documents ".format(len(data_df)))
        if utils.df_size(data_df) < 1:
            continue

        # Convert the data column (text rows) into a list.
        text_list = list(data_df.iloc[:, 0].values)
        label_list = None
        if label_column is not None:
            label_list = list(data_df.iloc[:, 1].values)

        # enumerate keeps text rows aligned with their label rows even when
        # a row is dropped after cleanup, and replaces the original manual
        # counter which shadowed the builtin `id` and risked label
        # misalignment if not incremented on every row.
        for row_idx, textdoc in enumerate(text_list):
            clean_data = self.cleanup_data(textdoc)
            if clean_data is not None and len(clean_data.strip()) > 0:
                file_data_list.append(clean_data)
                if global_label_value is not None:
                    label_value_list.append(global_label_value)
                elif label_list is not None:
                    label_value_list.append(label_list[row_idx])

    return file_data_list, label_value_list
def read_local_files(self):
    """Read markup files from a local path, split them into markup documents,
    extract the text inside the configured container tag, and clean it.

    Returns:
        tuple[list, list]: (cleaned text documents, label values). Labels are
        recorded only when ``label_value_override`` is configured.
    """
    tfr = TextFileReader()
    tc = TextCleaner()
    access = self.source_config["access"]

    # Optional filename filter; "" matches everything. Guarded with .get so a
    # missing key does not raise, matching the other optional-key checks.
    file_filter = access.get("file_filter") or ""

    # Read file names from given path
    files_list = utils.get_files_in_path(access["path"], file_filter)

    # Tag whose inner text constitutes the document body. The original also
    # re-assigned data_element (and re-read the label override) inside the
    # per-file loop; both are loop-invariant and read once here.
    data_element = access["data_element"]

    has_doc_separator = "document_element" in access

    # If a label is provided globally, it is used for every document.
    global_label_value = None
    if "label_value_override" in access:
        global_label_value = access["label_value_override"]

    file_data_list = []
    label_value_list = []
    for file in files_list:
        file_data = tfr.read_file(file)

        # Marked-up data read from the file may contain one or more markup
        # blocks; split on the configured end tag, else treat the whole file
        # as a single document.
        if has_doc_separator:
            markup_docs = tc.split_multi_content_by_end_tag(
                file_data, separator_markup=access["document_element"])
        else:
            markup_docs = [file_data]

        self.logger.info("Found {} markup documents ".format(
            len(markup_docs)))

        # Iterate through each markup document.
        for markupdoc in markup_docs:
            tagtext = tc.get_text_within_tags(markupdoc,
                                              container_tag=data_element)
            # The extractor may return a list of fragments; join them into
            # one document.
            if isinstance(tagtext, list):
                text_data = " ".join(tagtext)
            else:
                text_data = tagtext

            clean_data = self.cleanup_data(text_data)
            if clean_data is not None and len(clean_data.strip()) > 0:
                file_data_list.append(clean_data)
                # Labels are tracked only when a global override exists.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)

    return file_data_list, label_value_list