Esempio n. 1
0
    def create_documents(self, rows: List[List[str]],
                         heading: List[str] = None,
                         file_name: str = None,
                         dataset: str = None,
                         nested_key: str = None,
                         doc_id_field: str = None) -> List[Document]:
        """Build one document per usable row of a table.

        Rows that are entirely empty, or that lack a value in any required
        column (``self.required_columns``), are skipped.

        Args:
            rows: table content, one list of cell values per row.
            heading: column names used as document keys. When None, keys are
                ``self.column_name_prefix`` followed by the column index.
            file_name: when given, stored under the ``file_name`` key.
            dataset: when given, stored under the ``dataset`` key.
            nested_key: when given, the row fields are nested under this key
                instead of sitting at the top level of the document.
            doc_id_field: when given, this key is looked up on the top level
                of the output dict and its value is passed on as ``doc_id``
                (row fields are therefore not found when ``nested_key`` is
                used).

        Returns:
            The objects returned by ``self.etk.create_document``, one per
            retained row.

        Raises:
            InvalidArgumentsError: required columns are configured but no
                heading is available to match them against.
        """
        # Required columns can only be located through the heading, so both
        # the configured heading row and the actual heading must be present.
        if self.required_columns is not None and (
                self.heading_row is None or heading is None):
            raise InvalidArgumentsError("cannot match the required columns since heading is not specified")

        # positions of the required columns within the heading
        required_idx = []
        if self.required_columns is not None:
            required_idx = [pos for pos, name in enumerate(heading)
                            if name in self.required_columns]

        documents = []
        for row in rows:
            # if the row is empty, skip it
            if not any(row):
                continue
            # A row that ends before a required column is missing that field
            # just like a row holding an empty cell there, so skip it too.
            if not all(pos < len(row) and row[pos] for pos in required_idx):
                continue
            # convert datetime obj to ISO format str
            row = self.datetime_to_string(row)

            # create doc for each row; short rows are padded with ''
            doc = {}
            for i in range(self.heading_columns[1] - self.heading_columns[0]):
                if heading is not None:
                    key = heading[i]
                else:
                    key = self.column_name_prefix + str(i)
                doc[key] = row[i] if i < len(row) else ''

            cdr_doc = {nested_key: doc} if nested_key is not None else doc

            if file_name is not None:
                cdr_doc['file_name'] = file_name
            if dataset is not None:
                cdr_doc['dataset'] = dataset

            doc_id = cdr_doc.get(doc_id_field, None) if doc_id_field else None
            documents.append(self.etk.create_document(cdr_doc, doc_id=doc_id))

        return documents
Esempio n. 2
0
    def __init__(self, etk, **mapping_spec) -> None:
        """Store the ETK handle and the mapping specification.

        Args:
            etk: object kept on the instance as ``self.etk``.
            **mapping_spec: must contain the keys ``annotation`` and
                ``spreadsheet`` (a missing key raises ``KeyError``).
                ``annotation`` may be None, in which case ``self.annotation``
                is left unset. ``spreadsheet`` must not be None.

        Raises:
            InvalidArgumentsError: ``spreadsheet`` is None.
        """
        self.etk = etk

        # < annotation > < spreadsheet > < output >
        annotation = mapping_spec['annotation']
        if annotation is not None:
            self.annotation = annotation
        # TODO: adding auto infer annotation code for the None case

        spreadsheet = mapping_spec['spreadsheet']
        if spreadsheet is None:
            raise InvalidArgumentsError("for argument 'spreadsheet', please specify spreadsheet path")
        self.spreadsheet = spreadsheet
Esempio n. 3
0
    def tabular_extractor(self,
                          table_str: str = None,
                          filename: str = None,
                          sheet_name: str = None,
                          dataset: str = None,
                          nested_key: str = None,
                          doc_id_field: str = None) -> List[Document]:
        """Read tabular input and turn it into a list of Document(s).

        Args:
            table_str: comma-separated table content passed as a string.
            filename: path of the file to read; its (lower-cased) extension
                selects the reader from ``self._get_data_function``.
            sheet_name: sheet to use for .xls/.xlsx files; defaults to the
                first sheet returned by the reader.
            dataset: forwarded to ``create_documents``.
            nested_key: forwarded to ``create_documents``.
            doc_id_field: forwarded to ``create_documents``.

        Returns:
            Whatever ``self.create_documents`` returns for the recognized
            table content.

        Raises:
            InvalidArgumentsError: both ``table_str`` and ``filename`` given.
            InvalidFilePathError: no reader is registered for the extension.
        """
        data = list()

        if table_str is not None and filename is not None:
            raise InvalidArgumentsError(
                message=
                "for arguments 'table_str' and 'filename', please specify only one "
                "argument!")

        if table_str is not None:
            data = list(csv.reader(StringIO(table_str), delimiter=','))
        elif filename is not None:
            # always read the entire file first
            fn, extension = os.path.splitext(filename)
            extension = extension.lower()

            if extension not in self._get_data_function:
                raise InvalidFilePathError("file extension can not read")
            get_data = self._get_data_function[extension]

            # Try each encoding in turn; the error of the last attempt is the
            # one reported. Only Exception is caught so that KeyboardInterrupt
            # and SystemExit abort immediately instead of triggering a retry.
            encodings = ("utf-8", "latin_1", "utf-8-sig")
            for attempt, encoding in enumerate(encodings, start=1):
                try:
                    data = get_data(filename,
                                    auto_detect_datetime=False,
                                    auto_detect_float=False,
                                    encoding=encoding)
                    break
                except Exception:
                    if attempt == len(encodings):
                        raise

            if extension == '.xls' or extension == '.xlsx':
                if sheet_name is None:
                    sheet_name = list(data.keys())[0]

                data = data[sheet_name]
            else:
                # Single-sheet formats are looked up by the file's base name
                # (with the lower-cased extension); basename also copes with
                # the platform's own path separator.
                data = data[os.path.basename(fn) + extension]

        table_content, heading = self.content_recognizer(data)

        return self.create_documents(rows=table_content,
                                     heading=heading,
                                     file_name=filename,
                                     dataset=dataset,
                                     nested_key=nested_key,
                                     doc_id_field=doc_id_field)
Esempio n. 4
0
    def tabular_extractor(self,
                          table_str: str = None,
                          filename: str = None,
                          file_content=None,
                          file_type=None,
                          sheet_name: str = None,
                          dataset: str = None,
                          nested_key: str = None,
                          doc_id_field: str = None,
                          dataframe: pd.DataFrame = None,
                          encoding=None,
                          fillnan=None,
                          df_string=False) -> List[Document]:
        """
        Read the input file/content and return a list of Document(s)

        At most one of table_str, filename and dataframe may be given.
        file_content and file_type are only consulted when filename is also
        given, and only when both of them are set.

        Args:
            table_str: use this parameter, if you are 100% sure that the content is a csv
            filename: use this parameter if the file extension is one of tab, csv, tsv, xls, xlsx
            file_content: if the input has some arbitrary extension, read it yourself and pass the contents along
            file_type: use this parameter with file_content, can be tsv, csv, etc
            sheet_name: sheet name as in xls or xlsx files
            dataset: user provided string to be added to output Document(s)
            nested_key: user provided string to be added to output Document(s)
            doc_id_field: specify this field(should be present in the input file), its value will be used as doc_id
            dataframe: use this parameter if the contents being passed along are a pandas DataFrame
            encoding: encoding tried first when reading a file or file_content
                (defaults to utf-8); latin_1 and utf-8-sig are tried afterwards
            fillnan: specify the value to be filled for NaNs in the dataframe
            df_string: converts all dataframe columns to type str

        Returns: List[Document]

        Raises:
            InvalidArgumentsError: more than one of table_str, filename and
                dataframe is given, or a dataframe is passed while
                self.heading_row is greater than 1.
        """
        data = list()

        sources_given = sum(source is not None
                            for source in (table_str, filename, dataframe))
        if sources_given > 1:
            raise InvalidArgumentsError(
                message=
                "for arguments 'table_str', 'filename' and 'dataframe', please specify only one "
                "argument!")

        if table_str is not None:
            data = list(csv.reader(StringIO(table_str), delimiter=','))

        elif dataframe is not None:
            if self.heading_row is not None and self.heading_row > 1:
                raise InvalidArgumentsError(
                    message="Use pandas skiprows to decide the heading row!")
            if fillnan is not None:
                dataframe = dataframe.fillna(fillnan)
            if df_string:
                dataframe = dataframe.astype(str)
            # first row is the column names, followed by the data rows
            data = [dataframe.columns.values.tolist()] + \
                dataframe.values.tolist()

        elif filename is not None:
            # always read the entire file first
            fn, extension = os.path.splitext(filename)
            extension = extension.lower()

            if extension in self._get_data_function:
                get_data = self._get_data_function[extension]
            else:
                # in pyexcel we trust
                # if there is an extension we have not mapped, just let pyexcel
                # figure it out
                get_data = pyexcel_io.get_data

            # Decide once where the data comes from, so that the encoding
            # fallbacks retry the SAME source instead of silently switching
            # from the supplied content to the file on disk.
            use_content = bool(file_content and file_type)
            source = file_content if use_content else filename
            reader_kwargs = {'file_type': file_type} if use_content else {}

            # The error of the last attempt is the one reported. Only
            # Exception is caught so that KeyboardInterrupt and SystemExit
            # abort immediately instead of triggering a retry.
            encodings = (encoding if encoding else "utf-8",
                         "latin_1",
                         "utf-8-sig")
            for attempt, candidate in enumerate(encodings, start=1):
                try:
                    # a stream consumed by a failed attempt must be rewound
                    if attempt > 1 and hasattr(source, 'seek'):
                        source.seek(0)
                    data = get_data(source,
                                    auto_detect_datetime=False,
                                    auto_detect_float=False,
                                    encoding=candidate,
                                    **reader_kwargs)
                    break
                except Exception:
                    if attempt == len(encodings):
                        raise

            if extension == '.xls' or extension == '.xlsx':
                if sheet_name is None:
                    sheet_name = list(data.keys())[0]

                data = data[sheet_name]
            elif use_content:
                # content was read with an explicit file_type; the result is
                # looked up under that same name
                data = data[file_type]
            else:
                # data read from the file is looked up by the file's base
                # name (with the lower-cased extension)
                data = data[os.path.basename(fn) + extension]

        table_content, heading = self.content_recognizer(data)
        return self.create_documents(rows=table_content,
                                     heading=heading,
                                     file_name=filename,
                                     dataset=dataset,
                                     nested_key=nested_key,
                                     doc_id_field=doc_id_field)