Example #1
    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
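        # map input path -> prior output record so already-processed files can be skipped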
        processed_file_paths = {}
        if step_name in control_data:
            for x in control_data[step_name]:
                if x['status'] == 'processed':
                    processed_file_paths[x['input']] = x

        accumulator['file_count'] = 0
        for file, path in self.__source_iter(file_paths):
            filename = os.path.basename(path)
            if not filename.startswith(HIDDEN_FILE_PREFIXES):
                if path in processed_file_paths and not self._overwrite:
                    accumulator['files_output'].append(
                        processed_file_paths[path])
                    continue

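                # stop once the configured maximum file count is exceeded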
                if accumulator['file_count'] > self.__max_file_count:
                    break

                self.process_file(file, path, control_data, logger,
                                  accumulator)
                if self._delete:
                    os.remove(path)

                accumulator['file_count'] += 1
Example #2
    def process_file(self,
                     file: IO[AnyStr],
                     path: str,
                     control_data: Dict[str, Any],
                     logger: Logger,
                     accumulator: Dict[str, Any]
                     ) -> str:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        data = input_doc['data']
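        # classify each structured-content text item as a question or not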
        if 'structured_content' in data:
            for item in data['structured_content']:
                if 'text' in item:
                    is_question = self.predict_question(item['text'])
                    if is_question:
                        accumulator['found_questions'].append(item['text'])

                    item['is_question'] = is_question

        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        update_control_info_(file.name, path, output_filename, output_path,
                             accumulator)
        self.__output_handler(output_path, input_doc)
        return output_path
Example #3
    def process_file(self,
                     file: IO[AnyStr],
                     path: str,
                     control_data: Dict[str, Any],
                     logger: Logger,
                     accumulator: Dict[str, Any]
                     ) -> None:
        logger.debug('process file: {}'.format(file.name))
        questions = []
        non_questions = []
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        text = input_doc['data']['text']
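        # simple heuristic: a sentence is a question if it ends with '?' or
        # starts with one of the configured question words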
        for t in text:
            words = [x.lower() for x in t.split()]
            if t.endswith('?') or (words and words[0] in self.__q_words):
                questions.append(t)
            else:
                non_questions.append(t)

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        content = {'questions': list(set(questions)), 'non_questions': non_questions}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)
Example #4
    def process_file(self, file: IO[AnyStr], control_data: Dict[str, Any],
                     logger: Logger, accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        matcher = self.__matcher
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        text = input_doc['data']['text']
        sentences = []
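        # annotate each sentence with spaCy: named entities, per-token
        # attributes, POS tags and a question match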
        for t in text:
            doc = self.__nlp(t)
            entities = []
            annotated = []
            pos_tags = []
            for ent in doc.ents:
                entity = {
                    'text': ent.text,
                    'start_char': ent.start_char,
                    'end_char': ent.end_char,
                    'label': ent.label_,
                }
                entities.append(entity)

            for token in doc:
                annotated.append({
                    'text': token.text,
                    'lemma': token.lemma_,
                    'pos': token.pos_,
                    'tag': token.tag_,
                    'dep': token.dep_,
                    'shape': token.shape_,
                    'is_alpha': token.is_alpha,
                    'is_stop': token.is_stop,
                })
                pos_tags.append(token.tag_)

            is_question = len(matcher(doc)) > 0

            sentence = {
                'text': t,
                'annotated': annotated,
                'entities': entities,
                'is_question': is_question,
                'pos_tags': ' '.join(pos_tags)
            }
            sentences.append(sentence)

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        content = {'metadata': metadata, 'data': {'sentences': sentences}}
        accumulator['files_output'].append({
            'filename': output_filename,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)
Example #5
    def add_step(self, step: AbstractStep):
        if not step:
            return self

        # set name of previous step as source key for this step
        if self.__steps and not step.source_key:
            step.source_key = convert_name_to_underscore(self.__steps[-1].name)

        self.__steps.append(step)
        return self
Example #6
    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> str:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        texts = []
        data = input_doc['data']
        accumulator['files_processed'].append({
            'path': file.name,
            'time': datetime.utcnow().isoformat()
        })
        if 'structured_content' in data:
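            # flatten text blocks, lists and tables into text passages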
            for x in data['structured_content']:
                if x['type'] == 'text':
                    texts.append(x['text'])
                elif x['type'] == 'list':
                    text = ''
                    items = x['items']
                    if 'heading' in x:
                        text = x['heading']

                    text += '<ul>'
                    for it in items:
                        text += '<li>{}</li>'.format(it)

                    text += '</ul>'
                    texts.append(text)

                elif x['type'] == 'table':
                    df = table_to_dataframe(x)
                    schema = infer_schema(df, n_header_rows=len(x['head']))
                    texts.extend(table_to_natural_text(df, schema))

        formatted = []
        for t in texts:
            formatted.append({'id': str(uuid.uuid4()), 'text': t})

        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.jsonl'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        update_control_info_(file.name, path, output_filename, output_path,
                             accumulator)
        self.__output_handler(output_path, formatted, self._overwrite)
        return output_path
Example #7
    def __initialize_steps(self):
        for step in self.__uninitialized_steps:
            if not step.source_key:
                if self.__initialized_steps:
                    # set name of previous step as source key for this step
                    step.source_key = convert_name_to_underscore(
                        self.__initialized_steps[-1].name)
                else:
                    # otherwise use the source key of the parent, which is
                    # from the step before the parent
                    step.source_key = self.source_key

            self.__initialized_steps.append(step)
Example #8
    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        text = input_doc['data']['text']
        graphs = []
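        # build a relation graph for each sentence using the pretrained
        # relation classifier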
        # configure the word embeddings and load the relation parser once,
        # rather than once per sentence
        keras_models.model_params['wordembeddings'] = str(GLOVE_PATH)
        rel_parser = RelParser('model_ContextWeighted',
                               models_foldes=str(MODELS_PATH))
        for t in text:
            tagged = self.get_tagged_from_server(t)
            entity_fragments = entity_extraction.extract_entities(tagged)
            edges = entity_extraction.generate_edges(entity_fragments)
            tokens = [token for token, _, _ in tagged]
            non_parsed_graph = {'tokens': tokens, 'edgeSet': edges}
            parsed_graph = rel_parser.classify_graph_relations(
                non_parsed_graph)
            # e.g.:
            # {'tokens': ['Germany', 'is', 'a', 'country', 'in', 'Europe'], 'edgeSet': [{'left': [0],
            # 'right': [5], 'kbID': 'P30', 'lexicalInput': 'continent'}, {'left': [0], 'right': [3],
            # 'kbID': 'P0', 'lexicalInput': 'ALL_ZERO'}, {'left': [5], 'right': [3], 'kbID': 'P31',
            # 'lexicalInput': 'instance of'}]}

            relations = []
            if parsed_graph:
                graphs.append(parsed_graph)
                for edge in parsed_graph['edgeSet']:
                    if edge['kbID'] != 'P0':
                        left = ' '.join([tokens[t] for t in edge['left']])
                        right = ' '.join([tokens[t] for t in edge['right']])
                        relations.append([left, edge['lexicalInput'], right])

                parsed_graph['relations'] = relations

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        content = {'metadata': metadata, 'data': {'graphs': graphs}}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)
Example #9
    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
        processed_file_paths = {}
        if step_name in control_data:
            for x in control_data[step_name]:
                if x['status'] == 'processed':
                    processed_file_paths[x['input']] = x

        for file, path in self.__source_iter(file_paths):
            if not self._overwrite and path in processed_file_paths:
                accumulator['files_output'].append(processed_file_paths[path])
                continue

            self.process_file(file, path, control_data, logger, accumulator)
Example #10
    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
        processed_file_paths = []
        if step_name in control_data:
            processed_file_paths = [
                x['path'] for x in control_data[step_name]
                if x['status'] == 'processed'
            ]

        for file, path in self.__source_iter(file_paths):
            if not self._overwrite and path in processed_file_paths:
                continue

            self.process_file(file, control_data, logger, accumulator)
Example #11
    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> str:
        logger.debug('process file: {}'.format(file.name))
        write_root_dir = control_data['job']['write_root_dir']
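        # extract metadata and XML content from the source document (Word/PDF)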
        parsed = parser.from_file(path, xmlContent=True)
        ext = os.path.splitext(path)[1].lower()
        if ext == '.docx':
            doc_type = 'Word'
        elif ext == '.pdf':
            doc_type = 'PDF'
        else:
            doc_type = None

        metadata = parsed['metadata']
        record_id = metadata['dc:title'].replace(' ', '_')
        created_date = metadata['dcterms:created'][0]
        last_mod_date = metadata['dcterms:modified'][0]
        author = metadata.get('meta:last-author', '')
        word_count = int(metadata.get('meta:word-count', '-1'))
        accumulator.update({
            'data': {},
            'is_data': False,
            'metadata': {
                'doc_type': doc_type,
                'record_id': record_id,
                'created_date': created_date,
                'last_mod_date': last_mod_date,
                'author': author,
                'word_count': word_count
            }
        })
        self.process_doc(parsed['content'], accumulator)

        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        update_control_info_(file.name, path, output_filename, output_path,
                             accumulator)
        self.write_output(accumulator, output_path)
        return output_path
Example #12
def run_step(step: AbstractStep, control_data: Dict[str, Any], logger: Logger,
             accumulator: Dict[str, Any], temp_path: str) -> Dict[str, Any]:
    step_name = convert_name_to_underscore(step.name)
    control_data = write_control_file_start(step_name, control_data, temp_path)

    logged = logged_decorator(logger, step.name)
    tracked = tracked_decorator(step_name, control_data, accumulator,
                                temp_path)

    with logged, tracked:
        # noinspection PyBroadException
        try:
            step.run(control_data, logger, accumulator)
        except Exception:
            # re-raise so the decorators can handle it (same effect as `break`);
            # a bare raise preserves the original traceback
            raise

    return control_data
Example #13
    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}.txt'.format(step_name)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
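        # this step aggregates text from all input files into a single output file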
        output = {
            'filename': output_filename,
            'path': output_path,
            'status': 'processed',
            'time': datetime.utcnow().isoformat()
        }
        accumulator['files_output'].append(output)
        processed_file_paths = []
        if (step_name in control_data
                and control_data[step_name]['status'] == 'processed'):
            processed_file_paths = control_data[step_name]['input']

        paths = []
        text = []
        j = 0
        for file, path in self.__source_iter(file_paths):
            paths.append(path)
            if not self._overwrite and path in processed_file_paths:
                continue

            text.extend(
                self.process_file(file, path, control_data, logger,
                                  accumulator))
            j += 1
            # manage memory use - flush buffered text every FLUSH_FILE_COUNT files
            if j % FLUSH_FILE_COUNT == 0:
                self.__output_handler(output_path, text, self._overwrite)
                text = []

        self.__output_handler(output_path, text, self._overwrite)
        output['input'] = paths
Example #14
    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> str:
        logger.debug('process file: {}'.format(file.name))
        write_root_dir = control_data['job']['write_root_dir']
        accumulator.update({
            'data': {},
            'is_data': False,
            'metadata': {
                'doc_type': None,
                'record_id': None
            }
        })
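        # stream through the XML, letting the element handler populate the accumulator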
        for event, el in self.element_iterator(file):
            self.process_xml_element(el, event, accumulator)

        record_id = accumulator['metadata']['record_id']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        update_control_info_(file.name, path, output_filename, output_path,
                             accumulator)
        self.write_output(accumulator, output_path)
        return output_path
Example #15
def extract_text(c: Dict[str, Any],
                 a: Dict[str, Any],
                 step_name: str,
                 excluded_tags: List[str],
                 output_handler: Callable,
                 f: TextIO) -> str:
    # note: `step_name` is passed in explicitly; this is a module-level
    # function, so the original reference to `self.name` below was undefined
    a.update({
        'data': {},
        'is_data': False,
        'metadata': {'doc_type': None, 'record_id': None}
    })
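    # stream-parse the XML, skipping any excluded tags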
    it = etree.iterparse(f, events=('start', 'end'))
    stream = ((event, el) for event, el in it if el.tag not in excluded_tags)
    for event, el in stream:
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')

        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = el.text

        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = el.text

        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = el.text

        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = el.text

        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = el.text

        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = el.text

        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')

        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []
            list_extractor = ListExtractor(excluded_tags=['table'])
            table_extractor = TableExtractor()
            text_extractor = TextExtractor(excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
            heading_extractor = HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            # use a separate name so the outer iterparse stream is not shadowed
            html_stream = BytesIO(fix_content(el.text).encode('utf-8'))

            for ev, elem in etree.iterparse(html_stream, events=('start', 'end'), html=True):
                heading_extractor.extract(elem, ev, structured_content, text_list)
                text_extractor.extract(elem, ev, structured_content, text_list)
                list_extractor.extract(elem, ev, structured_content, text_list)
                table_extractor.extract(elem, ev, structured_content, text_list)

            data = {}
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list

            if structured_content:
                data['structured_content'] = structured_content

            a['data'][el.tag.lower()] = data

    now = datetime.utcnow().isoformat()
    a['files_processed'].append({
        'path': f.name,
        'time': now
    })
    write_root_dir = c['job']['write_root_dir']
    output_filename = '{}_{}.json'.format(
        convert_name_to_underscore(step_name), a['metadata']['record_id'])
    output_path = os.path.join(write_root_dir, output_filename)
    a['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    content = {'metadata': a['metadata'], 'data': a['data']}
    output_handler(output_path, content)
    return output_path
Example #16
    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        data = input_doc['data']
        text = data['text']
        nlp_text = []
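        # find entities in each text block via keyword matching, regular
        # expressions and system entity matchers, then prune overlaps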
        for t in text:
            entities = []
            keywords_found = self.keyword_processor.extract_keywords(
                t, span_info=True)
            for keyword in keywords_found:
                entities.append({
                    'entity': self.entity_reverse_lookup[keyword[0]],
                    'location': keyword[1:],
                    'value': keyword[0],
                    'confidence': 1.0
                })

            matches = match_regexprs(t, self.regexprs)
            for match in matches:
                match['entity'] = self.entity_reverse_lookup[match['value']]

            entities.extend(matches)
            entities.extend(self.match_system_entities(t))

            # is the span of an entity contained within the span
            # of another entity
            def is_contained(entity):
                start, end = entity['location']
                for ent in entities:
                    s, e = ent['location']
                    # contained within the other span, excluding exact matches
                    if s <= start and end <= e and (start, end) != (s, e):
                        return True

                return False

            def is_valid(entity):
                # remove spurious dates
                if entity['entity'] == 'sys-date':
                    start, end = entity['location']
                    if (end - start) < 8:
                        return False

                    value = entity['value']
                    if isinstance(value, str):
                        try:
                            date = parse(value)
                        except ValueError:
                            return False

                        year = date.year
                        if year < 1990 or year > 2025:
                            return False

                return True

            # keep the entity with the longest span where an entity
            # is contained within the span of another
            pruned_entities = [
                ent for ent in entities
                if not is_contained(ent) and is_valid(ent)
            ]
            nlp_text.append({'text': t, 'entities': pruned_entities})

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        data = {'nlp_text': nlp_text}
        content = {'metadata': metadata, 'data': data}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)