def delete_file(filename):
    """Delete a dataset file and drop it from the dataset status bookkeeping."""
    global api
    if '.json' in filename and filename in api.shared_folder_manager.get_file_names_in_folder():
        # The file still has a copy in the shared upload folder: remove both the upload
        # and the converted .data dataset.
        try:
            file_path = os.path.join(BASE_DIR, SHARE_FOLDER, filename)
            dataset_folder = os.path.join(BASE_DIR, DATASET_FOLDER)
            dataset_path = os.path.join(dataset_folder, filename).replace('.jsonl', '.data').replace('.json', '.data')
            remove_file(file_path)
            remove_file(dataset_path)
            api.dataset_status.pop(filename)
            dataset_status_file_path = os.path.join(dataset_folder, DATASET_STATUS_FILE)
            write_json(api.dataset_status, dataset_status_file_path)
            api.selected_dataset = None
            return make_response(jsonify({'message': '{0} deleted'.format(filename)}), 200)
        except Exception as e:
            return make_response(jsonify({'message': '{0}'.format(e)}), 400)
    else:
        # Only the converted .data dataset exists; remove it and update the status file.
        dataset_folder = os.path.join(BASE_DIR, DATASET_FOLDER)
        dataset_path = os.path.join(dataset_folder, filename).replace('.jsonl', '.data').replace('.json', '.data')
        remove_file(dataset_path)
        api.dataset_status.pop(filename)
        dataset_status_file_path = os.path.join(dataset_folder, DATASET_STATUS_FILE)
        write_json(api.dataset_status, dataset_status_file_path)
        api.selected_dataset = None
        return make_response(jsonify({'message': '{0} deleted'.format(filename)}), 200)
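# The handlers in this module lean on a few small file helpers (remove_file, write_json,
# get_basename) that are defined elsewhere. A minimal sketch of what they are assumed to
# do, based only on how they are called here; illustrative, not the actual implementations:
#
#     def remove_file(path):
#         """Delete a file, ignoring it if it does not exist."""
#         if os.path.exists(path):
#             os.remove(path)
#
#     def write_json(data, path):
#         """Serialize `data` to `path` as JSON."""
#         with open(path, 'w', encoding='utf-8') as fp:
#             json.dump(data, fp, ensure_ascii=False, indent=2)
#
#     def get_basename(path):
#         """Return the final path component, e.g. 'diagnosis.txt'."""
#         return os.path.basename(path)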
def build_med_terminology(terminology_file_path, entity_name=None, save=False):
    """Read a terminology file and add its synonym/preferred-term entries to the lexicon trie."""
    global added_terminology, split_characters, global_specialist_lexicon_parser
    if entity_name is None:
        entity_name = get_basename(terminology_file_path).replace('.txt', '')
    with open(terminology_file_path, 'r', encoding='utf-8', errors='replace') as fp:
        for line in fp:
            code, attr, desc, generic_code, generic_terminology, terminology_entry_type = \
                normalize_line_of_terminology(line)
            if attr in ['SY', 'PT']:
                tags = {
                    'cat': 'noun',
                    't2': {
                        'code': code,
                        'entity': entity_name
                    }
                }
                terminologies = normalize_and_expand_to_build_terminology(desc, terminology_entry_type,
                                                                          code, entity_name)
                for terminology in terminologies:
                    if (code, terminology, terminology_entry_type, entity_name) not in added_terminology:
                        global_specialist_lexicon_parser.build_trie(terminology, tags)
                        added_terminology.add((code, terminology, terminology_entry_type, entity_name))
                if generic_code and generic_terminology:
                    terminology = generic_terminology.strip()
                    if (generic_code, terminology, terminology_entry_type, entity_name) not in added_terminology:
                        tags['t2_code'] = generic_code
                        global_specialist_lexicon_parser.build_trie(terminology, tags)
                        added_terminology.add((generic_code, terminology, terminology_entry_type, entity_name))
    if save:
        save_specialist_lexicon_parser()
        write_json(list(added_terminology), '{0}_added.json'.format(entity_name))
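# Typical call, assuming terminology files are plain-text exports named after their entity
# type (the path below is illustrative only, not taken from this module):
#
#     build_med_terminology('terminologies/diagnosis.txt', entity_name='diagnosis', save=True)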
def clean_orphan_dataset():
    """Drop dataset status entries whose backing .data file no longer exists on disk."""
    global api
    orphan_dataset = []
    for ds in api.dataset_status.keys():
        if ds == 'updated':
            continue
        dataset_path = os.path.join(DATASET_DIR, ds.replace('.jsonl', '.data').replace('.json', '.data'))
        if not os.path.exists(dataset_path):
            orphan_dataset.append(ds)
    if len(orphan_dataset) > 0:
        for ds in orphan_dataset:
            api.dataset_status.pop(ds)
        dataset_status_file = os.path.join(DATASET_DIR, DATASET_STATUS_FILE)
        write_json(api.dataset_status, dataset_status_file)
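# clean_orphan_dataset() only reconciles api.dataset_status against the .data files under
# DATASET_DIR, so a natural place to run it is once at startup, after the status file has
# been loaded and before requests are served. The wiring below is an assumption about that
# startup flow, not code taken from this module:
#
#     status_path = os.path.join(DATASET_DIR, DATASET_STATUS_FILE)
#     if os.path.exists(status_path):
#         with open(status_path, 'r', encoding='utf-8') as fp:
#             api.dataset_status = json.load(fp)
#     clean_orphan_dataset()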
def api_skip():
    """Skip the current dataset context and move on to the next one."""
    global api
    if request.method == 'POST':
        context, entity_type, extracted_code, original_highlighted, inprogress = get_next_dataset_context()
        api.dataset[context]['skipped'] = True
        dataset_file_path = os.path.join(DATASET_DIR,
                                         api.selected_dataset.replace('.jsonl', '.data').replace('.json', '.data'))
        write_json(api.dataset, dataset_file_path)
        api.dataset_status[api.selected_dataset]['processing_dataset'] -= 1
        api.dataset_status[api.selected_dataset]['skipped_dataset'] += 1
        api.dataset_status['updated'] = datetime.now().strftime(DATETIME_FORMAT)
        dataset_status_file_path = os.path.join(DATASET_DIR, DATASET_STATUS_FILE)
        write_json(api.dataset_status, dataset_status_file_path)
        return infer_next_code()
def api_reject_and_learn_code():
    """Reject the inferred code and record the user-supplied code and terminology for learning."""
    global api
    if request.method == 'POST':
        new_code = request.json['new_code']
        new_code_terminology = request.json['new_code_terminology']
        highlighted = request.json['highlighted']
        context, entity_type, extracted_code, original_highlighted, inprogress = get_next_dataset_context()
        api.dataset[context]['rejected'] = {
            'selected': 'new_learn',
            'new_code_terminology': new_code_terminology,
            'highlighted': highlighted,
            'code': new_code,
        }
        dataset_file_path = os.path.join(DATASET_DIR,
                                         api.selected_dataset.replace('.jsonl', '.data').replace('.json', '.data'))
        write_json(api.dataset, dataset_file_path)
        api.dataset_status[api.selected_dataset]['processing_dataset'] -= 1
        api.dataset_status[api.selected_dataset]['rejected_dataset'] += 1
        api.dataset_status['updated'] = datetime.now().strftime(DATETIME_FORMAT)
        dataset_status_file_path = os.path.join(DATASET_DIR, DATASET_STATUS_FILE)
        write_json(api.dataset_status, dataset_status_file_path)
        return infer_next_code()
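# api_reject_and_learn_code() expects three keys in the JSON body: new_code,
# new_code_terminology and highlighted. A client-side sketch of such a request; the route
# path, host, and example values are placeholders, not taken from this module:
#
#     import requests
#
#     requests.post('http://localhost:5000/api/reject_and_learn_code', json={
#         'new_code': 'EXAMPLE_CODE',
#         'new_code_terminology': 'example preferred terminology',
#         'highlighted': 'text span the reviewer highlighted',
#     })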
def infer_next_code():
    """find code from unprocessed dataset"""
    global api
    find_code_results = []
    context, entity_type, extracted_code, original_highlighted, inprogress = get_next_dataset_context()
    if context is None:
        response = {
            "message": "All contexts were processed.",
            "method": "POST",
            "results": [],
            "status-code": 200
        }
        return make_response(jsonify(response), response.get("status-code", 400))

    # if original_highlighted in ['', None]:
    processed_context_lines = (
        context.lower().replace('\\n', '\n').replace('\n\n', '\n')
        .replace(';', '\n').replace(':', '\n').replace('.', ' . ').split('\n')
    )
    payloads = generate_payload_by_line(processed_context_lines, entity_type=entity_type)
    # else:
    #     processed_context_lines = context.lower().replace('\\n', '\n').replace('\n\n', '\n').split('\n')
    #     payloads = generate_payload_by_highlighted(original_highlighted, processed_context_lines,
    #                                                entity_type=entity_type)

    response = {}
    for payload in payloads:
        r = get_t2_find_code(payload)
        if r.status_code == 200:
            response = json.loads(r.content)
            for result in response['results']:
                result['synonym'] = payload['concept_text']
            find_code_results.extend(response['results'])
        else:
            pass
    sorted_results = sorted(find_code_results, key=lambda x: (x['confidence']), reverse=True)
    sorted_top_concept = sort_by_code_weight_with_same_parent(sorted_results)
    response['results'] = sorted_top_concept

    """
    if original_highlighted in ['', None]:
        selected_concept = sorted_top_concept[0].get('synonym', sorted_top_concept[0].get('preferred_terminology'))
        # selected_concept = sorted_top_concept[0].get('preferred_terminology', [sorted_top_concept[0].get('synonym')])[0]
    else:
        selected_concept = original_highlighted
    response_context_lines = (
        context.replace('\\n', '\n').replace('\n\n', '\n')
        .replace(';', '\n').replace(':', '\n').replace('.', ' . ').split('\n')
    )
    index = 0
    selected_concept_tokens = set(selected_concept.lower().replace(';', '').replace(':', '').split())
    for processed_context in processed_context_lines:
        processed_context_tokens = set(processed_context.split())
        if selected_concept_tokens.intersection(processed_context_tokens) == selected_concept_tokens:
            response_context_lines[index] = get_highlight_from_concept(response_context_lines[index],
                                                                       selected_concept)
        index += 1
    response['context'] = '<br>'.join(response_context_lines)
    """
    response['context'] = context

    entity_codes = []
    if entity_type in med_terminology_code_verbose:
        for code, detail in med_terminology_code_verbose[entity_type].items():
            if code in med_terminology_code_tree:
                terminology = med_terminology_code_tree[code].get('PT', detail.get('SY', detail.get('STY'))[0])
            else:
                terminology = detail.get('SY', detail.get('STY'))[0]
            entity_codes.append([code, terminology])
    response['entity_codes'] = entity_codes
    response['entity_type'] = entity_type
    response['extracted_code'] = extracted_code
    response['original_highlighted'] = original_highlighted
    response['current_process'] = api.selected_dataset

    if len(sorted_top_concept) == 0:
        response['message'] = "OK"
        response['match_with_extracted'] = False
    else:
        if extracted_code and extracted_code == sorted_top_concept[0]['code']:
            response['match_with_extracted'] = True
        elif extracted_code:
            response['match_with_extracted'] = False
        response['message'] = "OK"
    jsonify_response = jsonify(response)

    api.dataset[context]['inferred'] = sorted_top_concept
    dataset_file_path = os.path.join(DATASET_DIR,
                                     api.selected_dataset.replace('.jsonl', '.data').replace('.json', '.data'))
    write_json(api.dataset, dataset_file_path)
    if not inprogress:
        api.dataset_status[api.selected_dataset]['processing_dataset'] += 1
        api.dataset_status[api.selected_dataset]['not_started'] -= 1
    api.dataset_status['updated'] = datetime.now().strftime(DATETIME_FORMAT)
    dataset_status_file_path = os.path.join(DATASET_DIR, DATASET_STATUS_FILE)
    write_json(api.dataset_status, dataset_status_file_path)
    return make_response(jsonify_response, response.get("status-code", 400))
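# infer_next_code() treats the value returned by get_t2_find_code(payload) like an HTTP
# response: it checks r.status_code and parses r.content as JSON. One possible shape,
# assuming the T2 find-code service is called over HTTP with `requests`; T2_FIND_CODE_URL
# is a placeholder and the real implementation may differ:
#
#     import requests
#
#     def get_t2_find_code(payload):
#         """POST a find-code payload to the T2 service and return the raw response."""
#         return requests.post(T2_FIND_CODE_URL, json=payload)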