def run(self, control_data: Dict[str, Any], logger: Logger,
        accumulator: Dict[str, Any]) -> None:
    file_paths = [x['path'] for x in control_data[self.source_key]]
    step_name = convert_name_to_underscore(self.name)

    # index files already processed by a previous run, keyed by input path
    processed_file_paths = {}
    if step_name in control_data:
        for x in control_data[step_name]:
            if x['status'] == 'processed':
                processed_file_paths[x['input']] = x

    accumulator['file_count'] = 0
    for file, path in self.__source_iter(file_paths):
        filename = os.path.basename(path)
        if not filename.startswith(HIDDEN_FILE_PREFIXES):
            # skip files already processed unless overwrite is requested
            if path in processed_file_paths and not self._overwrite:
                accumulator['files_output'].append(processed_file_paths[path])
                continue

            if accumulator['file_count'] > self.__max_file_count:
                break

            self.process_file(file, path, control_data, logger, accumulator)
            if self._delete:
                os.remove(path)

            accumulator['file_count'] += 1
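The `run` methods here iterate over `(file, path)` pairs from a private `__source_iter` helper that is not shown. A minimal sketch of what such a generator might look like, assuming plain local files and read-only access (the actual implementation may differ), is:

from typing import IO, AnyStr, Iterator, List, Tuple


def source_iter(file_paths: List[str]) -> Iterator[Tuple[IO[AnyStr], str]]:
    """Hypothetical stand-in for the step's __source_iter helper: yield an open
    handle and its path for each input file, closing the handle afterwards."""
    for path in file_paths:
        with open(path, 'rb') as file:
            yield file, path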
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> str:
    logger.debug('process file: {}'.format(file.name))
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    data = input_doc['data']
    if 'structured_content' in data:
        for item in data['structured_content']:
            if 'text' in item:
                is_question = self.predict_question(item['text'])
                if is_question:
                    accumulator['found_questions'].append(item['text'])

                item['is_question'] = is_question

    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    update_control_info_(file.name, path, output_filename, output_path, accumulator)
    self.__output_handler(output_path, input_doc)
    return output_path
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> None:
    logger.debug('process file: {}'.format(file.name))
    questions = []
    non_questions = []
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    text = input_doc['data']['text']
    for t in text:
        words = [x.lower() for x in t.split()]
        # a sentence is treated as a question if it ends with '?' or starts
        # with a question word; guard against empty strings
        if t.endswith('?') or (words and words[0] in self.__q_words):
            questions.append(t)
        else:
            non_questions.append(t)

    now = datetime.utcnow().isoformat()
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    content = {
        'questions': list(set(questions)),  # de-duplicated
        'non_questions': non_questions
    }
    accumulator['files_output'].append({
        'filename': output_filename,
        'input': path,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    self.__output_handler(output_path, content)
def process_file(self, file: IO[AnyStr], control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> None:
    logger.debug('process file: {}'.format(file.name))
    matcher = self.__matcher
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    text = input_doc['data']['text']
    sentences = []
    for t in text:
        doc = self.__nlp(t)
        entities = []
        annotated = []
        pos_tags = []
        for ent in doc.ents:
            entities.append({
                'text': ent.text,
                'start_char': ent.start_char,
                'end_char': ent.end_char,
                'label': ent.label_,
            })

        for token in doc:
            annotated.append({
                'text': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_,
                'shape': token.shape_,
                'is_alpha': token.is_alpha,
                'is_stop': token.is_stop,
            })
            pos_tags.append(token.tag_)

        is_question = len(matcher(doc)) > 0
        sentences.append({
            'text': t,
            'annotated': annotated,
            'entities': entities,
            'is_question': is_question,
            'pos_tags': ' '.join(pos_tags)
        })

    now = datetime.utcnow().isoformat()
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    content = {'metadata': metadata, 'data': {'sentences': sentences}}
    accumulator['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    self.__output_handler(output_path, content)
def add_step(self, step: AbstractStep):
    if not step:
        return self

    # set name of previous step as source key for this step
    if self.__steps and not step.source_key:
        step.source_key = convert_name_to_underscore(self.__steps[-1].name)

    self.__steps.append(step)
    return self
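Because `add_step` returns `self`, steps can be chained fluently, and each step's `source_key` defaults to the underscored name of the step before it. The pipeline and step class names below are placeholders for illustration only, not classes defined here:

# Illustrative only: Pipeline, ExtractTextStep and AnnotateTextStep are
# hypothetical names standing in for the real pipeline and step classes.
pipeline = (Pipeline()
            .add_step(ExtractTextStep(name='Extract Text'))
            .add_step(AnnotateTextStep(name='Annotate Text')))
# the second step reads from 'extract_text', the underscored name of the first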
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> str:
    logger.debug('process file: {}'.format(file.name))
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    texts = []
    data = input_doc['data']
    accumulator['files_processed'].append({
        'path': file.name,
        'time': datetime.utcnow().isoformat()
    })
    if 'structured_content' in data:
        for x in data['structured_content']:
            if x['type'] == 'text':
                texts.append(x['text'])
            elif x['type'] == 'list':
                # render the list (and its optional heading) as one HTML fragment
                text = ''
                items = x['items']
                if 'heading' in x:
                    text = x['heading']

                text += '<ul>'
                for it in items:
                    text += '<li>{}</li>'.format(it)

                text += '</ul>'
                texts.append(text)
            elif x['type'] == 'table':
                # convert tables to natural-language sentences
                df = table_to_dataframe(x)
                schema = infer_schema(df, n_header_rows=len(x['head']))
                texts.extend(table_to_natural_text(df, schema))

    formatted = [{'id': str(uuid.uuid4()), 'text': t} for t in texts]
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.jsonl'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    update_control_info_(file.name, path, output_filename, output_path, accumulator)
    self.__output_handler(output_path, formatted, self._overwrite)
    return output_path
def __initialize_steps(self):
    for step in self.__uninitialized_steps:
        if not step.source_key:
            if self.__initialized_steps:
                # set name of previous step as source key for this step
                step.source_key = convert_name_to_underscore(
                    self.__initialized_steps[-1].name)
            else:
                # otherwise use the source key of the parent, which is
                # from the step before the parent
                step.source_key = self.source_key

        self.__initialized_steps.append(step)
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> None:
    logger.debug('process file: {}'.format(file.name))
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    text = input_doc['data']['text']
    graphs = []
    for t in text:
        tagged = self.get_tagged_from_server(t)
        entity_fragments = entity_extraction.extract_entities(tagged)
        edges = entity_extraction.generate_edges(entity_fragments)
        tokens = [t for t, _, _ in tagged]
        non_parsed_graph = {'tokens': tokens, 'edgeSet': edges}
        keras_models.model_params['wordembeddings'] = str(GLOVE_PATH)
        rel_parser = RelParser('model_ContextWeighted', models_foldes=str(MODELS_PATH))
        parsed_graph = rel_parser.classify_graph_relations(non_parsed_graph)
        # e.g.:
        # {'tokens': ['Germany', 'is', 'a', 'country', 'in', 'Europe'],
        #  'edgeSet': [{'left': [0], 'right': [5], 'kbID': 'P30', 'lexicalInput': 'continent'},
        #              {'left': [0], 'right': [3], 'kbID': 'P0', 'lexicalInput': 'ALL_ZERO'},
        #              {'left': [5], 'right': [3], 'kbID': 'P31', 'lexicalInput': 'instance of'}]}
        relations = []
        if parsed_graph:
            graphs.append(parsed_graph)
            for edge in parsed_graph['edgeSet']:
                # skip 'P0' (no relation) edges
                if edge['kbID'] != 'P0':
                    left = ' '.join([tokens[t] for t in edge['left']])
                    right = ' '.join([tokens[t] for t in edge['right']])
                    relations.append([left, edge['lexicalInput'], right])

            parsed_graph['relations'] = relations

    now = datetime.utcnow().isoformat()
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    content = {'metadata': metadata, 'data': {'graphs': graphs}}
    accumulator['files_output'].append({
        'filename': output_filename,
        'input': path,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    self.__output_handler(output_path, content)
def run(self, control_data: Dict[str, Any], logger: Logger,
        accumulator: Dict[str, Any]) -> None:
    file_paths = [x['path'] for x in control_data[self.source_key]]
    step_name = convert_name_to_underscore(self.name)
    processed_file_paths = {}
    if step_name in control_data:
        for x in control_data[step_name]:
            if x['status'] == 'processed':
                processed_file_paths[x['input']] = x

    for file, path in self.__source_iter(file_paths):
        if not self._overwrite and path in processed_file_paths:
            accumulator['files_output'].append(processed_file_paths[path])
            continue

        self.process_file(file, path, control_data, logger, accumulator)
def run(self, control_data: Dict[str, Any], logger: Logger,
        accumulator: Dict[str, Any]) -> None:
    file_paths = [x['path'] for x in control_data[self.source_key]]
    step_name = convert_name_to_underscore(self.name)
    processed_file_paths = []
    if step_name in control_data:
        processed_file_paths = [
            x['path'] for x in control_data[step_name]
            if x['status'] == 'processed'
        ]
    for file, path in self.__source_iter(file_paths):
        if not self._overwrite and path in processed_file_paths:
            continue

        self.process_file(file, control_data, logger, accumulator)
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> str:
    logger.debug('process file: {}'.format(file.name))
    write_root_dir = control_data['job']['write_root_dir']
    parsed = parser.from_file(path, xmlContent=True)
    ext = os.path.splitext(path)[1].lower()
    if ext == '.docx':
        doc_type = 'Word'
    elif ext == '.pdf':
        doc_type = 'PDF'
    else:
        doc_type = None

    metadata = parsed['metadata']
    record_id = metadata['dc:title'].replace(' ', '_')
    created_date = metadata['dcterms:created'][0]
    last_mod_date = metadata['dcterms:modified'][0]
    author = metadata.get('meta:last-author', '')
    word_count = int(metadata.get('meta:word-count', '-1'))
    accumulator.update({
        'data': {},
        'is_data': False,
        'metadata': {
            'doc_type': doc_type,
            'record_id': record_id,
            'created_date': created_date,
            'last_mod_date': last_mod_date,
            'author': author,
            'word_count': word_count
        }
    })
    self.process_doc(parsed['content'], accumulator)
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    update_control_info_(file.name, path, output_filename, output_path, accumulator)
    self.write_output(accumulator, output_path)
    return output_path
def run_step(step: AbstractStep, control_data: Dict[str, Any], logger: Logger,
             accumulator: Dict[str, Any], temp_path: str) -> Dict[str, Any]:
    step_name = convert_name_to_underscore(step.name)
    control_data = write_control_file_start(step_name, control_data, temp_path)
    logged = logged_decorator(logger, step.name)
    tracked = tracked_decorator(step_name, control_data, accumulator, temp_path)
    with logged, tracked:
        # noinspection PyBroadException
        try:
            step.run(control_data, logger, accumulator)
        except Exception:
            # allow decorators to handle, same effect as `break`
            raise

    return control_data
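`logged` and `tracked` are used here as context managers wrapped around the step. A minimal sketch of what `logged_decorator` might look like, assuming it logs entry and exit and absorbs the re-raised exception so the caller can stop cleanly (the real decorator may behave differently), is:

from contextlib import contextmanager
from logging import Logger


@contextmanager
def logged_decorator(logger: Logger, name: str):
    """Hypothetical sketch of the logging context manager: log start and end of a
    step, and log rather than propagate any exception raised inside the block."""
    logger.debug('start step: {}'.format(name))
    try:
        yield
    except Exception:
        logger.exception('step failed: {}'.format(name))
    finally:
        logger.debug('end step: {}'.format(name))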
def run(self, control_data: Dict[str, Any], logger: Logger,
        accumulator: Dict[str, Any]) -> None:
    file_paths = [x['path'] for x in control_data[self.source_key]]
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}.txt'.format(step_name)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    output = {
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': datetime.utcnow().isoformat()
    }
    accumulator['files_output'].append(output)
    processed_file_paths = []
    if step_name in control_data and control_data[step_name]['status'] == 'processed':
        processed_file_paths = control_data[step_name]['input']

    paths = []
    text = []
    j = 0
    for file, path in self.__source_iter(file_paths):
        paths.append(path)
        if not self._overwrite and path in processed_file_paths:
            continue

        text.extend(self.process_file(file, path, control_data, logger, accumulator))
        j += 1
        # manage memory use - flush every FLUSH_FILE_COUNT files
        if j % FLUSH_FILE_COUNT == 0:
            self.__output_handler(output_path, text, self._overwrite)
            text = []

    # write any remaining text
    self.__output_handler(output_path, text, self._overwrite)
    output['input'] = paths
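The flush-every-N-files logic above only accumulates correctly if the output handler appends on each call. A minimal sketch of such a handler, assuming plain text output and leaving aside how the overwrite flag is honoured, is:

import os
from typing import List


def text_output_handler(output_path: str, lines: List[str], overwrite: bool = True) -> None:
    """Hypothetical sketch of an appending output handler: each call appends, so
    successive flushes build up a single combined file. Handling of the overwrite
    flag (e.g. removing a stale file before the first flush of a run) is omitted."""
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'a', encoding='utf-8') as out:
        for line in lines:
            out.write(line + '\n')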
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> str:
    logger.debug('process file: {}'.format(file.name))
    write_root_dir = control_data['job']['write_root_dir']
    accumulator.update({
        'data': {},
        'is_data': False,
        'metadata': {
            'doc_type': None,
            'record_id': None
        }
    })
    for event, el in self.element_iterator(file):
        self.process_xml_element(el, event, accumulator)

    record_id = accumulator['metadata']['record_id']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    update_control_info_(file.name, path, output_filename, output_path, accumulator)
    self.write_output(accumulator, output_path)
    return output_path
def extract_text(c: Dict[str, Any],
                 a: Dict[str, Any],
                 excluded_tags: List[str],
                 output_handler: Callable,
                 f: TextIO,
                 step_name: str) -> str:
    # step_name is passed in explicitly since this is a module-level
    # function with no `self` to take the step name from
    a.update({
        'data': {},
        'is_data': False,
        'metadata': {'doc_type': None, 'record_id': None}
    })
    it = etree.iterparse(f, events=('start', 'end'))
    stream = ((event, el) for event, el in it if el.tag not in excluded_tags)
    for event, el in stream:
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')
        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = el.text
        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = el.text
        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = el.text
        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = el.text
        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = el.text
        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = el.text
        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')
        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in para, body and html tags
            structured_content = []
            text_list = []
            list_extractor = ListExtractor(excluded_tags=['table'])
            table_extractor = TableExtractor()
            text_extractor = TextExtractor(
                excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
            heading_extractor = HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            stream = BytesIO(fix_content(el.text).encode('utf-8'))
            for ev, elem in etree.iterparse(stream, events=('start', 'end'), html=True):
                heading_extractor.extract(elem, ev, structured_content, text_list)
                text_extractor.extract(elem, ev, structured_content, text_list)
                list_extractor.extract(elem, ev, structured_content, text_list)
                table_extractor.extract(elem, ev, structured_content, text_list)

            data = {}
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list

            if structured_content:
                data['structured_content'] = structured_content

            a['data'][el.tag.lower()] = data

    now = datetime.utcnow().isoformat()
    a['files_processed'].append({
        'path': f.name,
        'time': now
    })
    write_root_dir = c['job']['write_root_dir']
    output_filename = '{}_{}.json'.format(step_name, a['metadata']['record_id'])
    output_path = os.path.join(write_root_dir, output_filename)
    a['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    content = {'metadata': a['metadata'], 'data': a['data']}
    output_handler(output_path, content)
    return output_path
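`extract_text` converts the millisecond timestamps with a `get_iso_datetime_from_millis` helper that is not shown; a minimal sketch, assuming UTC output, is:

from datetime import datetime, timezone


def get_iso_datetime_from_millis(millis: int) -> str:
    """Assumed behaviour: convert epoch milliseconds to an ISO-8601 timestamp (UTC)."""
    return datetime.fromtimestamp(millis / 1000, tz=timezone.utc).isoformat()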
def process_file(self, file: IO[AnyStr], path: str, control_data: Dict[str, Any],
                 logger: Logger, accumulator: Dict[str, Any]) -> None:
    logger.debug('process file: {}'.format(file.name))
    input_doc = json.load(file)
    metadata = input_doc['metadata']
    record_id = metadata['record_id']
    data = input_doc['data']
    text = data['text']
    nlp_text = []
    for t in text:
        entities = []

        # dictionary/keyword-based entities (with span info)
        keywords_found = self.keyword_processor.extract_keywords(t, span_info=True)
        for keyword in keywords_found:
            entities.append({
                'entity': self.entity_reverse_lookup[keyword[0]],
                'location': keyword[1:],
                'value': keyword[0],
                'confidence': 1.0
            })

        # regular-expression and system entities
        matches = match_regexprs(t, self.regexprs)
        for match in matches:
            match['entity'] = self.entity_reverse_lookup[match['value']]

        entities.extend(matches)
        entities.extend(self.match_system_entities(t))

        # is the span of an entity contained within the span
        # of another entity
        def is_contained(entity):
            start, end = entity['location']
            for ent in entities:
                s, e = ent['location']
                # exclude exact span matches
                if (start == s and end < e) or (start > s and end == e) or (start > s and end < e):
                    return True

            return False

        def is_valid(entity):
            # remove spurious dates
            if entity['entity'] == 'sys-date':
                start, end = entity['location']
                if (end - start) < 8:
                    return False

                value = entity['value']
                if isinstance(value, str):
                    try:
                        date = parse(value)
                    except ValueError:
                        return False

                    year = date.year
                    if year < 1990 or year > 2025:
                        return False

            return True

        # keep the entity with the longest span where an entity
        # is contained within the span of another
        pruned_entities = [
            ent for ent in entities if not is_contained(ent) and is_valid(ent)
        ]
        nlp_text.append({'text': t, 'entities': pruned_entities})

    now = datetime.utcnow().isoformat()
    write_root_dir = control_data['job']['write_root_dir']
    step_name = convert_name_to_underscore(self.name)
    output_filename = '{}_{}.json'.format(step_name, record_id)
    output_path = os.path.join(write_root_dir, step_name, output_filename)
    content = {'metadata': metadata, 'data': {'nlp_text': nlp_text}}
    accumulator['files_output'].append({
        'filename': output_filename,
        'input': path,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    self.__output_handler(output_path, content)