Ejemplo n.º 1
0
    def read_local_files(self):
        """Read plain-text files from the configured path and clean them.

        Configuration is taken from ``self.source_config["access"]``:

        * ``path`` (required): location handed to ``utils.get_files_in_path``.
        * ``file_filter`` (optional): filter handed to the same helper; a
          missing or empty value is passed on as ``""``.
        * ``document_separator`` (optional): when present, each file's content
          is split into several documents with
          ``TextCleaner.split_multi_text_by_separator``.
        * ``label_value_override`` (optional): label recorded for every
          document that is kept.

        Every document goes through ``self.cleanup_data`` and is kept only if
        non-blank text remains afterwards.

        Returns:
            tuple: ``(file_data_list, label_value_list)``. The label list holds
            one entry per kept document when an override is configured and is
            empty otherwise.
        """
        tfr = TextFileReader()
        tc = TextCleaner()
        access = self.source_config["access"]

        # file_filter is optional: a missing key or an empty value both mean
        # "no filter", consistent with how the other optional keys are read.
        file_filter = ""
        if "file_filter" in access and access["file_filter"]:
            file_filter = access["file_filter"]

        # Read file names from given path
        files_list = utils.get_files_in_path(access["path"], file_filter)

        # These settings do not change between files, so read them once.
        split_documents = "document_separator" in access
        global_label_value = None
        if "label_value_override" in access:
            global_label_value = access["label_value_override"]

        file_data_list = []
        label_value_list = []
        for file_path in files_list:
            file_data = tfr.read_file(file_path)

            # A file may hold several text documents separated by some
            # character or string; split them into a list of documents.
            if split_documents:
                multi_docs = tc.split_multi_text_by_separator(
                    file_data, separator_code=access["document_separator"])
            else:
                multi_docs = [file_data]

            self.logger.info("Found {} markup documents ".format(len(multi_docs)))

            for textdoc in multi_docs:
                clean_data = self.cleanup_data(textdoc)

                # Drop documents that are empty after cleaning.
                if clean_data is None or not clean_data.strip():
                    continue

                file_data_list.append(clean_data)

                # The global label, when given, applies to every kept document.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)

        return file_data_list, label_value_list
Ejemplo n.º 2
0
    def cleanup_data(self, clean_data):
        """Run the configured cleaning steps, in order, over a piece of text.

        The steps are read from ``self.source_config["clean"]``. Each entry's
        ``"step"`` value selects the ``TextCleaner`` operation to apply; step
        names that match none of the known operations leave the text as is.

        Args:
            clean_data: The text to clean.

        Returns:
            The value produced by the last applied step, or the input
            unchanged when no step applied.
        """
        cleaner = TextCleaner()

        for step_cfg in self.source_config["clean"]:
            action = step_cfg["step"]

            if action == "remove_all_markup":
                clean_data = cleaner.remove_all_markup(
                    clean_data, valid_markup=False)

            elif action == "remove_html_encoded_chars":
                clean_data = cleaner.remove_html_encoded_chars(
                    clean_data, replace_char=' ')

            elif action == "remove_special_chars":
                # A per-step character set wins over the instance default.
                chars = (step_cfg["special_chars"]
                         if "special_chars" in step_cfg
                         else self.def_special_chars)
                clean_data = cleaner.remove_special_chars(chars, clean_data)

            elif action == "remove_white_spaces":
                # A per-step character set wins over the instance default.
                chars = (step_cfg["white_space_chars"]
                         if "white_space_chars" in step_cfg
                         else self.def_white_space_chars)
                clean_data = cleaner.remove_white_spaces(
                    white_space_chars=chars, doc=clean_data)

        return clean_data
Ejemplo n.º 3
0
    def read_local_files(self):
        """Read JSON files from the configured path and clean their text.

        Configuration is taken from ``self.source_config["access"]``:

        * ``path`` (required): location handed to ``utils.get_files_in_path``.
        * ``file_filter`` (optional): filter handed to the same helper; a
          missing or empty value is passed on as ``""``.
        * ``data_element`` (required): key of the text inside each JSON object.
        * ``label_value_override`` (optional): label that replaces whatever
          label the individual objects carry.

        A file may contain a single JSON object or an array of objects; any
        other parsed value is ignored. Objects without the ``data_element``
        key are skipped, and so are texts that are blank after
        ``self.cleanup_data``.

        Returns:
            tuple: ``(file_data_list, label_value_list)`` with one label per
            kept document. The label is the global override when configured,
            otherwise the object's own label, which may be ``None``.
        """
        tfr = TextFileReader()
        tc = TextCleaner()
        access = self.source_config["access"]

        # file_filter is optional: a missing key or an empty value both mean
        # "no filter", consistent with how the other optional keys are read.
        file_filter = ""
        if "file_filter" in access and access["file_filter"]:
            file_filter = access["file_filter"]

        # Read file names from given path
        files_list = utils.get_files_in_path(access["path"], file_filter)

        # Which data element is to be read from the JSON object for text data
        data_element = access["data_element"]

        # If a label is provided globally, read it from config.
        # Label will be the class/prediction used for training purposes.
        global_label_value = None
        if "label_value_override" in access:
            global_label_value = access["label_value_override"]

        file_data_list = []
        label_value_list = []
        for file_path in files_list:
            # Read file content and parse the JSON string.
            data = tc.string_to_json_object(tfr.read_file(file_path))

            # Treat a single object as a one-element array so both shapes
            # share the same handling; anything else yields no records.
            if isinstance(data, dict):
                records = [data]
            elif isinstance(data, list):
                records = data
            else:
                records = []

            for record in records:
                # NOTE(review): the label is looked up under the literal key
                # "label_element", whereas the text key is configurable via
                # access["data_element"] - confirm this asymmetry is intended.
                label_value = None
                if "label_element" in record:
                    label_value = utils.if_null(record["label_element"], None)

                if data_element not in record:
                    continue

                clean_data = self.cleanup_data(record[data_element])

                # Keep the document only if valid text remains after cleanup.
                if clean_data is None or not clean_data.strip():
                    continue

                file_data_list.append(clean_data)

                # The global override, when provided, wins over the
                # per-object label.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)
                else:
                    label_value_list.append(label_value)

        return file_data_list, label_value_list
Ejemplo n.º 4
0
    def read_local_files(self):
        """Read CSV files from the configured path and clean the text column.

        Configuration is taken from ``self.source_config["access"]``:

        * ``path`` (required): location handed to ``utils.get_files_in_path``.
        * ``file_filter`` (optional): filter handed to the same helper; a
          missing or empty value is passed on as ``""``.
        * ``delimiter`` (optional, default ``","``): column separator.
        * ``header_row`` (optional, default ``None``): passed to
          ``TextFileReader.read_csv_file``.
        * ``data_column`` (optional, default ``0``): column holding the text.
        * ``label_column`` (optional, default ``None``): column holding labels.
        * ``label_value_override`` (optional): label recorded for every kept
          row, taking precedence over ``label_column``.

        Each text row goes through ``self.cleanup_data`` and is kept only if
        non-blank text remains afterwards.

        Returns:
            tuple: ``(file_data_list, label_value_list)``. A label is appended
            for a kept row only when an override or a label column is
            configured; otherwise the label list stays empty.
        """
        tfr = TextFileReader()
        access = self.source_config["access"]

        def _option(key, default):
            # Missing keys fall back to the default directly; present keys are
            # passed through utils.if_null with the same default.
            if key in access:
                return utils.if_null(access[key], default)
            return default

        # file_filter is optional: a missing key or an empty value both mean
        # "no filter", consistent with how the other optional keys are read.
        file_filter = ""
        if "file_filter" in access and access["file_filter"]:
            file_filter = access["file_filter"]

        delimiter = _option("delimiter", ",")
        header_row = _option("header_row", None)
        data_column = _option("data_column", 0)
        label_column = _option("label_column", None)

        # Only the text column and, when configured, the label column are read.
        usecols = [data_column]
        if label_column is not None:
            usecols.append(label_column)

        # If a label is provided globally, read it from config.
        # Label will be the class/prediction used for training purposes.
        global_label_value = _option("label_value_override", None)

        # Read file names from given path
        files_list = utils.get_files_in_path(access["path"], file_filter)

        file_data_list = []
        label_value_list = []
        for file_path in files_list:
            # Read the CSV file (the positional .iloc access below treats the
            # result as a pandas DataFrame).
            data_df = tfr.read_csv_file(file_path=file_path,
                                        separator=delimiter,
                                        header_row=header_row,
                                        select_cols=usecols)

            self.logger.info("Found {} markup documents ".format(len(data_df)))

            if utils.df_size(data_df) < 1:
                continue

            # NOTE(review): positions 0 and 1 assume the frame's columns come
            # back in the order given in select_cols (text first, label
            # second) - confirm read_csv_file guarantees that order.
            text_list = list(data_df.iloc[:, 0].values)
            label_list = None
            if label_column is not None:
                label_list = list(data_df.iloc[:, 1].values)

            for row_index, textdoc in enumerate(text_list):
                clean_data = self.cleanup_data(textdoc)

                # Drop rows that are empty after cleaning.
                if clean_data is None or not clean_data.strip():
                    continue

                file_data_list.append(clean_data)

                # The global override wins; otherwise use the row's own label
                # when a label column was configured.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)
                elif label_list is not None:
                    label_value_list.append(label_list[row_index])

        return file_data_list, label_value_list
Ejemplo n.º 5
0
    def read_local_files(self):
        """Read marked-up files from the configured path and clean their text.

        Configuration is taken from ``self.source_config["access"]``:

        * ``path`` (required): location handed to ``utils.get_files_in_path``.
        * ``file_filter`` (optional): filter handed to the same helper; a
          missing or empty value is passed on as ``""``.
        * ``data_element`` (required): tag whose enclosed text is extracted
          with ``TextCleaner.get_text_within_tags``.
        * ``document_element`` (optional): when present, each file's content
          is split into several markup documents with
          ``TextCleaner.split_multi_content_by_end_tag``.
        * ``label_value_override`` (optional): label recorded for every
          document that is kept.

        The extracted text goes through ``self.cleanup_data`` and is kept only
        if non-blank text remains afterwards.

        Returns:
            tuple: ``(file_data_list, label_value_list)``. The label list holds
            one entry per kept document when an override is configured and is
            empty otherwise.
        """
        tfr = TextFileReader()
        tc = TextCleaner()
        access = self.source_config["access"]

        # file_filter is optional: a missing key or an empty value both mean
        # "no filter", consistent with how the other optional keys are read.
        file_filter = ""
        if "file_filter" in access and access["file_filter"]:
            file_filter = access["file_filter"]

        # Read file names from given path
        files_list = utils.get_files_in_path(access["path"], file_filter)

        # These settings do not change between files, so read them once.
        data_element = access["data_element"]
        split_documents = "document_element" in access
        global_label_value = None
        if "label_value_override" in access:
            global_label_value = access["label_value_override"]

        file_data_list = []
        label_value_list = []
        for file_path in files_list:
            file_data = tfr.read_file(file_path)

            # A file may contain one or more markup blocks; split them into a
            # list of markup documents.
            if split_documents:
                markup_docs = tc.split_multi_content_by_end_tag(
                    file_data, separator_markup=access["document_element"])
            else:
                markup_docs = [file_data]

            self.logger.info("Found {} markup documents ".format(
                len(markup_docs)))

            for markupdoc in markup_docs:
                tagtext = tc.get_text_within_tags(markupdoc,
                                                  container_tag=data_element)

                # Several matches are merged into one space-separated text.
                if isinstance(tagtext, list):
                    text_data = " ".join(tagtext)
                else:
                    text_data = tagtext

                clean_data = self.cleanup_data(text_data)

                # Drop documents that are empty after cleaning.
                if clean_data is None or not clean_data.strip():
                    continue

                file_data_list.append(clean_data)

                # The global label, when given, applies to every kept document.
                if global_label_value is not None:
                    label_value_list.append(global_label_value)

        return file_data_list, label_value_list