def phenotype_subjects(job_id: str, phenotype_final: bool):
    client = util.mongo_client()
    db = client[util.mongo_db]
    res = []
    # db.phenotype_results.aggregate([
    #     {"$match": {"job_id": {"$eq": 10201}, "phenotype_final": {"$eq": true}}},
    #     {"$group": {_id: "$subject", count: {$sum: 1}}}
    # ])
    try:
        q = [
            {
                "$match": {
                    "phenotype_final": {"$eq": phenotype_final},
                    "job_id": {"$eq": int(job_id)}
                }
            },
            {
                "$group": {
                    "_id": "$subject",
                    "count": {"$sum": 1}
                }
            }
        ]
        res = list(db.phenotype_results.aggregate(q))
        res = sorted(res, key=lambda r: r['count'], reverse=True)
    except Exception as e:
        log(e, ERROR)
    finally:
        client.close()
    return res
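# Hedged usage sketch (not part of the original module): summarize the
# per-subject counts returned by phenotype_subjects(). The job id below is a
# hypothetical value; a reachable MongoDB instance configured via util is assumed.
def _example_top_subjects(job_id='10201', limit=10):
    """Return the `limit` subjects with the most final phenotype results."""
    counts = phenotype_subjects(job_id, True)
    return [(row['_id'], row['count']) for row in counts[:limit]]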
def get_predicate_boolean(expression: nlpql_parserParser.PredicateBooleanContext, define_name, final):
    log(expression)
    operator = ""
    entities = list()
    op = PhenotypeOperations(define_name, operator, entities, final=final,
                             raw_text=get_pretty_text(expression))
    return op
def lookup_phenotype_results_by_id(id_list: list):
    client = util.mongo_client()
    db = client[util.mongo_db]
    obj = dict()
    obj['results'] = list()
    obj['indexes'] = dict()

    try:
        # db.phenotype_results.find({"_id": {$in: [ObjectId("5b117352bcf26f020e392a9c"),
        #     ObjectId("5b117352bcf26f020e3926e2")]}})
        # TODO TODO TODO
        ids = list(map(lambda x: ObjectId(x), id_list))
        res = db.phenotype_results.find({"_id": {"$in": ids}})
        obj['results'] = list(res)
        n = 0
        for o in obj['results']:
            o = display_mapping(o)
            id = str(o['_id'])
            obj['indexes'][id] = n
            n = n + 1
    except Exception as e:
        log(e, ERROR)
        traceback.print_exc(file=sys.stdout)
        obj['success'] = False
    finally:
        client.close()

    return obj
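# Hedged usage sketch (not part of the original module): look up result
# documents by their ObjectId strings and recover each document's position in
# the returned list via the 'indexes' map. The hex ids below are hypothetical.
def _example_lookup_by_id():
    ids = ['5b117352bcf26f020e392a9c', '5b117352bcf26f020e3926e2']
    obj = lookup_phenotype_results_by_id(ids)
    # obj['indexes'] maps the string form of each _id to its index in obj['results']
    return [obj['results'][obj['indexes'][i]] for i in ids if i in obj['indexes']]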
def handle_tuple(context, phenotype: PhenotypeModel, define_name, final):
    log('tuple')
    obj = get_obj_context(context.getChild(0).getChild(1), to_string=True)
    num_children = len(context.children)
    op_raw_text = ''
    tuple_name = '{}_Step1'.format(define_name)
    if num_children == 2:
        operation = parse_operation(context.getChild(1), tuple_name, final)
        if operation:
            if not phenotype.operations:
                phenotype.operations = list()
            phenotype.operations.append(operation)
            op_raw_text = operation.get('raw_text')
    else:
        operation = None

    raw_text = '''
    Tuple {}
    {}
    '''
    if not operation:
        where = ''
    else:
        where = 'where {}'.format(op_raw_text)
    tuple_str = json.dumps(obj, indent=4)
    pe = PhenotypeEntity(tuple_name, 'define',
                         final=final,
                         tuple_=True,
                         tuple_object=obj,
                         tuple_predicate=operation,
                         raw_text=raw_text.format(tuple_str, where),
                         tuple_raw_text='Tuple {}'.format(tuple_str))

    if not phenotype.tuples:
        phenotype.tuples = list()
    phenotype.tuples.append(pe)
def get_not_expression(expression: nlpql_parserParser.ExpressionContext, define_name, final):
    log(expression)
    entities = list()
    operator = ""
    op = PhenotypeOperations(define_name, operator, entities, final=final,
                             raw_text=get_pretty_text(expression))
    return op
def show_help():
    log(get_version())
    log("""
    USAGE: python3 ./{0}

    OPTIONS:

        -i, --infile <quoted string>               (required) input filename, matrix market format

        [-r, --min_docs_per_term] <positive int>   minimum number of docs per term, default is 3
                                                   (minimum allowable row sum in result matrix)
        [-c, --min_terms_per_doc] <positive int>   minimum number of terms per document, default is 5
                                                   (minimum allowable column sum in result matrix)
        [-p, --precision] <positive int>           precision of result matrix (default is 4 digits)
        [-hvzbw]

    FLAGS:

        -h, --help       Print this information and exit.
        -v, --version    Print version information and exit.
        -z, --selftest   Run self-tests and exit.
        -b, --boolean    Replace nonzero entries in input matrix with 1
                         (i.e. use 1 if value present, 0 if not)
        -w, --weights    Compute tf-idf weights in result matrix
                         (i.e. output a term-document matrix instead of a term-frequency matrix)

    """.format(_MODULE_NAME))
def handle_data_entity(context, phenotype: PhenotypeModel, define_name, final):
    log('data entity')

    pe = PhenotypeEntity(define_name, 'define', final=final)
    call = get_method_call(context.getChild(0))
    # hasSepsis = PhenotypeEntity('hasSepsis', 'define',
    #                             library='ClarityNLP',
    #                             funct='ProviderAssertion',
    #                             named_arguments={
    #                                 "termsets": ['Sepsis'],
    #                                 "documentsets": [
    #                                     'ProviderNotes',
    #                                     "Radiology"
    #                                 ]
    #                             })
    pe["funct"] = call["funct"]
    pe["library"] = call["library"]
    named_args = call["named_arguments"]
    args = call["arguments"]
    if named_args:
        pe["named_arguments"] = named_args
    if args and len(args) > 0:
        pe["arguments"] = args

    if not phenotype.data_entities:
        phenotype.data_entities = list()
    phenotype.data_entities.append(pe)
def insert_pipeline_config(pipeline: PipelineConfig, connection_string: str):
    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()
    pipeline_id = -1

    try:
        if pipeline:
            pipeline_json = pipeline.to_json()
            cursor.execute("""
                INSERT INTO nlp.pipeline_config(owner, config, pipeline_type, name, description, date_created)
                VALUES(%s, %s, %s, %s, %s, current_timestamp) RETURNING pipeline_id
                """,
                (pipeline.owner, pipeline_json, pipeline.config_type,
                 pipeline.name, pipeline.description))
            pipeline_id = cursor.fetchone()[0]
            conn.commit()
    except Exception as ex:
        log('failed to insert pipeline')
        log(ex)
    finally:
        conn.close()

    return pipeline_id
def get_pipeline_config(pipeline_id, connection_string):
    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()

    try:
        cursor.execute("""
            SELECT * FROM nlp.pipeline_config WHERE pipeline_id = %s
            """, [str(pipeline_id)])
        row = cursor.fetchone()
        if row:
            obj = PipelineConfig.from_json(row[2])
            if obj:
                return obj
            else:
                return get_default_config()
        else:
            log("no rows returned")
    except Exception as ex:
        log(ex)
    finally:
        conn.close()

    return get_default_config()
def custom_cleanup(self, pipeline_id, job, owner, pipeline_type, pipeline_config, client, db):
    log('removing intermediate n-gram records')
    db.phenotype_results.remove({
        "nlpql_feature": pipeline_config.name,
        "job_id": job,
        "phenotype_final": False
    })
def try_match_anchored_left(sentence_index, start, end, start_offsets, end_offsets, words):
    """If no match, try removing final words one at a time. This is less
    preferable than the previous loop, which anchors at the right to the
    terminating symbol.
    """

    if TRACE:
        log("\tTrying match_anchored_left...", DEBUG)

    candidates = []
    new_start = start + start_offsets[0]
    for i in reversed(range(1, len(words))):
        test_text = ' '.join(words[0:i])
        if TRACE:
            log("\t\ttrial text:\t\t->{0}<-".format(test_text), DEBUG)
        if test_text in synonym_map:
            # found exact match, so save all associated concepts
            new_end = start + end_offsets[i-1]
            #if TRACE: print("\t\tspan: {0}-{1}".format(new_start, new_end))
            for c in synonym_map[test_text]:
                cid = concept_to_cid_map[c]
                treecode_list = graph.treecode_list(cid)
                sh = SectionHeader(sentence_index, new_start, new_end,
                                   test_text, c, treecode_list)
                candidates.append(sh)
            break

    if TRACE:
        print_num_found(len(candidates))

    return candidates
def try_match_anchored_right(sentence_index, start, end, start_offsets, end_offsets, words):
    """
    Remove initial words one at a time and try for exact match. This finds
    word sequences anchored at the right, near the terminating symbol.
    """

    if TRACE:
        log("\tTrying match_anchored_right...", DEBUG)

    candidates = []
    new_end = start + end_offsets[-1]
    for i in range(1, len(words)):
        test_text = ' '.join(words[i:])
        if TRACE:
            log("\t\ttrial text:\t\t->{0}<-".format(test_text), DEBUG)
        if test_text in synonym_map:
            # found exact match, so save all associated concepts
            new_start = start + start_offsets[i]
            for c in synonym_map[test_text]:
                cid = concept_to_cid_map[c]
                treecode_list = graph.treecode_list(cid)
                sh = SectionHeader(sentence_index, new_start, new_end,
                                   test_text, c, treecode_list)
                candidates.append(sh)
            break

    if TRACE:
        print_num_found(len(candidates))

    return candidates
def phenotype_subject_results(job_id: str, phenotype_final: bool, subject: str):
    client = util.mongo_client()
    db = client[util.mongo_db]
    res = []

    try:
        query = {
            "job_id": int(job_id),
            "phenotype_final": phenotype_final,
            "subject": subject
        }
        temp = list(db["phenotype_results"].find(query))
        for r in temp:
            obj = r.copy()
            for k in r.keys():
                val = r[k]
                if (isinstance(val, int) or isinstance(val, float)) and math.isnan(val):
                    del obj[k]
            res.append(obj)
    except Exception as e:
        log(e, ERROR)
    finally:
        client.close()

    return res
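# Hedged usage sketch (not part of the original module): fetch the final
# phenotype results for a single subject and list the NLPQL features that
# produced them. The job id and subject below are hypothetical; NaN-valued
# fields are already stripped by phenotype_subject_results().
def _example_subject_features(job_id='10201', subject='12345'):
    rows = phenotype_subject_results(job_id, True, subject)
    return sorted({row.get('nlpql_feature') for row in rows if 'nlpql_feature' in row})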
def requires(self):
    try:
        self.solr_query, total_docs, doc_limit, ranges = initialize_task_and_get_documents(
            self.pipeline, self.job, self.owner)

        task = registered_pipelines[str(self.pipelinetype)]
        if task.parallel_task:
            matches = [
                task(pipeline=self.pipeline,
                     job=self.job,
                     start=n,
                     solr_query=self.solr_query,
                     batch=n)
                for n in ranges
            ]
        else:
            matches = [
                task(pipeline=self.pipeline,
                     job=self.job,
                     start=0,
                     solr_query=self.solr_query,
                     batch=0)
            ]

        return matches
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        log(ex)
    return list()
def handle_operation(context, phenotype: PhenotypeModel, define_name, final):
    log('operation')

    # SepsisState = PhenotypeOperations('SepsisState', 'OR', ['onVentilator', 'hasSepsis'], final=True)
    # : notOperator=(NOT | BANG) expression
    # | expression logicalOperator expression
    # | predicate IS NOT? BOOL
    expression_context = context.getChild(1)
    first = expression_context.getChild(0)
    res = None
    if type(first) == nlpql_parserParser.NotOperatorContext:
        res = get_not_expression(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.ExpressionContext:
        res = get_logical_expression(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.PredicateBooleanContext:
        res = get_predicate_boolean(expression_context, define_name, final)
    elif type(first) == nlpql_parserParser.PredicateContext:
        res = get_predicate_expression(expression_context.getChild(0), define_name, final)

    if not phenotype.operations:
        phenotype.operations = list()

    if res:
        phenotype.operations.append(res)
def _get_datetime_window(custom_args, data_earliest, data_latest):
    """
    Extract and parse the datetime_start and datetime_end custom arguments.
    These specify the start/end datetime filters for the CQL results.
    """

    datetime_start = None
    datetime_end = None

    if _ARG_TIME_START in custom_args:
        time_start = custom_args[_ARG_TIME_START]
        datetime_start = tc.parse_time_command(time_start, data_earliest, data_latest)

    if _ARG_TIME_END in custom_args:
        time_end = custom_args[_ARG_TIME_END]
        datetime_end = tc.parse_time_command(time_end, data_earliest, data_latest)

    if _TRACE:
        log('\n*** datetime_start: {0}'.format(datetime_start))
        log('*** datetime_end: {0}'.format(datetime_end))

    return (datetime_start, datetime_end)
def _log_error(msg):
    """
    Print an error message to the log file.
    """
    log('TermProximityTask error: {0}'.format(msg))
    return
def _get_custom_arg(str_key, str_variable_name, job_id, custom_arg_dict):
    """
    Extract a value at the given key from the given dict, or return None
    if not found. Attempt to read from environment variables, if known.
    """

    value = None
    if str_key in custom_arg_dict:
        value = custom_arg_dict[str_key]
    if value is None:
        if str_key in util.properties:
            value = util.properties[str_key]

    # treat empty strings as None
    if str == type(value) and 0 == len(value):
        value = None

    # echo in job status and in log file
    msg = 'CQLExecutionTask: {0} == {1}'.format(str_variable_name, value)
    data_access.update_job_status(job_id, util.conn_string,
                                  data_access.IN_PROGRESS, msg)

    # write msg to log file
    log(msg)
    return value
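# Hedged usage sketch (not part of the original module): resolve a custom
# argument with the precedence shown above (explicit custom args first, then
# util.properties). The 'source' key and variable name are hypothetical.
def _example_resolve_source(job_id, custom_args):
    return _get_custom_arg('source', 'str_source', job_id, custom_args)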
def run_measurement_finder(filename):
    measurements = list()
    try:
        p = Popen(
            ['java', '-jar', FULL_JAR, '-f', filename, '-m', MODEL_FULL_DIR],
            stdout=PIPE, stderr=STDOUT)
        json_str = ''
        for line in p.stdout:
            json_str += str(line, 'utf-8')
        json_obj = json.loads(json_str)
        results = json_obj['results']
        for res in results:
            meas_count = int(res["measurementCount"])
            if meas_count > 0:
                # print("found %d measurements" % meas_count)
                meas_results = res['measurements']
                for meas in meas_results:
                    meas_obj = Measurement.from_dict(meas)
                    meas_obj.__setattr__('sentence', res['sentence'])
                    measurements.append(meas_obj)
    except Exception as e:
        log(e, ERROR)

    return measurements
def _process_bundle(name, bundle_obj, result_type_str):
    """
    Process a DSTU2 or DSTU3 resource bundle returned from the CQL Engine.
    """

    if _TRACE:
        log('Decoding BUNDLE resource...')

    # this bundle should be a string representation of a list of dicts
    obj_type = type(bundle_obj)
    assert str == obj_type

    try:
        obj = json.loads(bundle_obj)
    except json.decoder.JSONDecodeError as e:
        log('\t{0}: String conversion (bundle) failed with error: "{1}"'.
            format(_MODULE_NAME, e))
        return []

    # now find out what type of obj was created from the string
    obj_type = type(obj)
    assert list == obj_type

    rts = result_type_str.lower()

    bundled_objs = []
    for elt in obj:
        if rts.endswith('stu2') or rts.endswith('stu3') or rts.endswith('list'):
            result = _process_resource(elt)
            if result is not None:
                # insert the name as a new 'cql_feature' field
                result[KEY_CQL_FEATURE] = name
                bundled_objs.append(result)

    return bundled_objs
def scrape_common_verbs():
    """
    Scrape a list of the 100 most common English verbs.
    """

    page = requests.get(url_common)
    if 200 != page.status_code:
        log('Error getting url: ' + url_common)
        sys.exit(-1)

    soup = BeautifulSoup(page.text, 'html.parser')

    verbs = []

    # get all 'a' elements, look for the first one with href='/dictionary/be'
    a_elts = soup.find_all('a')
    start = 0
    for a in a_elts:
        if 'be' == a.get_text().strip().lower():
            break
        else:
            start += 1

    # skip the verb 'be', which has a complex entry
    for i in range(start+1, start+1000):
        verb = a_elts[i].get_text().strip().lower()
        verbs.append(verb)

    return verbs
def _convert_units(value, units, is_area_measurement, is_vol_measurement):
    """
    Given a token value in the indicated units, convert to mm and return
    the new value.
    """

    convert_cm = units.startswith('cm')    or \
                 units.startswith('centi') or \
                 units.startswith('cc')
    convert_in = units.startswith('in')

    is_area   = is_area_measurement or _is_area_unit(units)
    is_volume = is_vol_measurement or _is_vol_unit(units)

    if _TRACE:
        log('_convert_units::is_area: {0}'.format(is_area))
        log('_convert_units::is_volume: {0}'.format(is_volume))

    if convert_cm:
        # convert from cm to mm
        value = value * _CM_TO_MM
        if is_area:
            value = value * _CM_TO_MM
        elif is_volume:
            value = value * _CM_TO_MM_SQ
    elif convert_in:
        # convert from inches to mm
        value = value * _IN_TO_MM
        if is_area:
            value = value * _IN_TO_MM
        elif is_volume:
            value = value * _IN_TO_MM_SQ

    return value
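# Worked examples (hedged, not part of the original module). These assume the
# conversion constants take the expected values _CM_TO_MM = 10,
# _CM_TO_MM_SQ = 100, _IN_TO_MM = 25.4, _IN_TO_MM_SQ = 645.16:
#
#   linear: 3.5 cm       -> 3.5 * 10        = 35 mm
#   area:   2 cm^2       -> 2 * 10 * 10     = 200 mm^2
#   volume: 4 cc (cm^3)  -> 4 * 10 * 100    = 4000 mm^3
#   linear: 1.5 in       -> 1.5 * 25.4      = 38.1 mm
#
# i.e. the extra multiply for area/volume raises the linear factor to the
# power matching the dimensionality of the measurement.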
def get_descendants(conn_string, concept, vocabulary):
    conn = psycopg2.connect(conn_string)
    cursor = conn.cursor()
    if vocabulary is None:
        vocabulary = "SNOMED"
    try:
        cursor.execute(
            """
            SELECT concept_name
            FROM nlp.concept_ancestor
            INNER JOIN nlp.concept on concept_id = descendant_concept_id
            WHERE ancestor_concept_id in
                (SELECT concept_id from nlp.concept
                 where lower(concept_name) = %s
                   AND vocabulary_id = %s
                   AND invalid_reason IS null)
              AND vocabulary_id = %s
              AND invalid_reason is null
            order by max_levels_of_separation asc
            """,
            (concept.lower(), vocabulary, vocabulary))
        result = cursor.fetchall()
        return result
    except Exception as ex:
        log('Failed to get descendants')
        log(str(ex))
    finally:
        conn.close()

    return list()
def _run(json_obj):

    results = []

    # assumes we either have a list of objects or a single obj
    obj_type = type(json_obj)
    if list == obj_type:
        #log('**** SAW A LIST ****')
        for e in json_obj:
            result_obj = crp.decode_top_level_obj(e)
            if result_obj is None:
                continue
            if list is not type(result_obj):
                results.append(result_obj)
            else:
                results.extend(result_obj)
    elif dict == obj_type:
        #log('**** SAW A DICT ****')
        result_obj = crp.decode_top_level_obj(json_obj)
        if result_obj is not None:
            if list is not type(result_obj):
                results.append(result_obj)
            else:
                results.extend(result_obj)

    #log('found {0} results'.format(len(results)))

    counter = 0
    for obj in results:
        log('Result {0}: {1}'.format(counter, obj))
        counter += 1
def get_synonyms(conn_string, concept, vocabulary):
    conn = psycopg2.connect(conn_string)
    cursor = conn.cursor()
    if vocabulary is None:
        vocabulary = "SNOMED"
    try:
        cursor.execute(
            """
            SELECT concept_synonym_name
            FROM nlp.concept_synonym s
            INNER JOIN nlp.concept c on c.concept_id = s.concept_id
            WHERE lower(concept_name) = %s
              and c.vocabulary_id = %s
              and invalid_reason is null
            order by concept_synonym_name
            """,
            (concept.lower(), vocabulary))
        result = cursor.fetchall()
        return result
    except Exception as ex:
        log('Failed to get synonyms')
        log(str(ex))
    finally:
        conn.close()

    return list()
def get_related_terms(conn_string, concept, vocabulary, get_synonyms_bool=True,
                      get_descendants_bool=False, get_ancestors_bool=False, escape=True):
    related_terms = []
    escaped = []
    if get_synonyms_bool:
        res = get_synonyms(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))
    if get_descendants_bool:
        res = get_descendants(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))
    if get_ancestors_bool:
        res = get_ancestors(conn_string, concept, vocabulary)
        if res:
            for r in res:
                related_terms.append(r[0])
                escaped.append(re.escape(r[0]))

    if len(related_terms) > 0:
        log(related_terms)

    if escape:
        return list(set(escaped))
    else:
        return list(set(related_terms))
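# Hedged usage sketch (not part of the original module): expand a concept to
# its vocabulary synonyms and build a single case-insensitive regex from the
# regex-escaped terms. The concept name and connection string are hypothetical;
# re is assumed to be imported at module level (it is used by re.escape above).
def _example_synonym_pattern(conn_string, concept='sepsis'):
    terms = get_related_terms(conn_string, concept, 'SNOMED',
                              get_synonyms_bool=True, escape=True)
    return re.compile('|'.join(terms), re.IGNORECASE) if terms else None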
def delete_job_by_id(job_id: int):
    log('deleting job now ' + str(job_id))
    flag = delete_job(str(job_id), util.conn_string)
    if flag == 1:
        return "Successfully deleted Job!"
    else:
        return "Unable to delete Job!"
def get_section_source():
    """GET source file for sections and synonyms"""
    try:
        file_path = get_sec_tag_source_tags()
        return send_file(file_path)
    except Exception as ex:
        log(ex)
        return "Failed to retrieve sections source file"
def get_document_set_attributes(model):
    tags = dict()
    types = dict()
    custom_query = dict()
    filter_query = dict()
    source = dict()
    # Clarity.createDocumentSet({
    #     report_tags: [...],      (optional)
    #     report_types: [...],     (optional)
    #     filter_query: "...",     (optional)
    #     query: "query"           (optional)
    # })
    if model.document_sets:
        for d in model.document_sets:
            log(d)
            if d['library'] == "Clarity" or d["library"] == "ClarityNLP":
                args = d['arguments']
                named_args = d['named_arguments']
                funct = d['funct']
                doc_set_name = d['name']
                if funct == "createReportTagList":
                    if len(args) == 1 and type(args[0]) == list:
                        tags[doc_set_name] = args[0]
                    else:
                        tags[doc_set_name] = args
                elif funct == "createDocumentSet":
                    if named_args:
                        if "report_tags" in named_args:
                            arg_report_tags = named_args["report_tags"]
                            if len(arg_report_tags) == 1 and type(arg_report_tags[0]) == list:
                                tags[doc_set_name] = arg_report_tags[0]
                            else:
                                tags[doc_set_name] = arg_report_tags
                        if "report_types" in named_args:
                            types[doc_set_name] = named_args["report_types"]
                        if "filter_query" in named_args:
                            fq = normalize_query_quotes(named_args["filter_query"])
                            filter_query[doc_set_name] = fq
                        if "query" in named_args:
                            query = normalize_query_quotes(named_args["query"])
                            custom_query[doc_set_name] = query
                        if "source" in named_args:
                            if type(named_args["source"]) == str:
                                source[doc_set_name] = named_args["source"].split(",")
                            else:
                                source[doc_set_name] = named_args["source"]
                        elif "sources" in named_args:
                            if type(named_args["sources"]) == str:
                                source[doc_set_name] = named_args["sources"].split(",")
                            else:
                                source[doc_set_name] = named_args["sources"]
                elif funct == "createReportTypeList":
                    if len(args) == 1 and type(args[0]) == list:
                        types[doc_set_name] = args[0]
                    else:
                        types[doc_set_name] = args

    return tags, types, custom_query, filter_query, source
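# Hedged usage sketch (not part of the original module): how the five parallel
# dicts returned above might be consumed for a single document set. The
# PhenotypeModel instance and the document set name 'ProviderNotes' are
# hypothetical.
def _example_document_set_summary(model, name='ProviderNotes'):
    tags, types, custom_query, filter_query, source = get_document_set_attributes(model)
    return {
        'report_tags': tags.get(name),
        'report_types': types.get(name),
        'query': custom_query.get(name),
        'filter_query': filter_query.get(name),
        'source': source.get(name)
    }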
def run_custom_task(self, pipeline_id, job, owner, pipeline_type, pipeline_config, client, db):
    group_key = get_config_string(pipeline_config, "group_by", default='report_type')
    log('run custom task collector')
    q = [
        {
            "$match": {
                "nlpql_feature": {"$eq": pipeline_config.name},
                "job_id": {"$eq": job}
            }
        },
        {
            "$group": {
                "_id": ("$" + group_key),
                "avg_word_cnt": {"$avg": "$words"},
                "avg_grade_level": {"$avg": "$grade_level"},
                "avg_sentences": {"$avg": "$sentences"},
                "avg_long_words": {"$avg": "$long_words"},
                "avg_polysyllable_words": {"$avg": "$polysyllable_words"}
            }
        }
    ]

    results = list(db.phenotype_results.aggregate(q))
    group_by = "text_stats_group_by_field_" + group_key
    for r in results:
        pipeline_mongo_writer(
            client, pipeline_id, pipeline_type, job, 0, pipeline_config, None,
            {
                group_by: r['_id'],
                'average_word_count': r['avg_word_cnt'],
                'average_grade_level': r['avg_grade_level'],
                'average_sentences': r['avg_sentences'],
                'average_long_words': r['avg_long_words'],
                'average_polysyllable_words': r['avg_polysyllable_words']
            },
            phenotype_final=True)