def db_validate(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2', action='store')
  opt.add_option('--release_type', action='store', choices=['delta','snapshot','full'])
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--exceptions', action='store')
  opt.add_option('--logfile', action='store', default='-')
  opt.add_option('--output_dir', action='store', default='.')
  opt.add_option('--mode', action='store', default='validate')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: db_validate --rf2 <dir> --release_type full --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64:  # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 \
                 else base64.decodestring(opts.neopw64)  # py2
  # open logfile
  logfile = open(opts.logfile, 'w') if opts.logfile != '-' else sys.stdout
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
  #---------------------------------------------------------------------------
  pathsep = '/'
  # determine snomed_g_bin -- the bin directory where snomed_g_rf2_tools.py exists --
  # try the SNOMED_G_BIN and SNOMED_G_HOME environment variables,
  # and prompt directly if neither variable exists
  snomed_g_bin = os.environ.get('SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
    if snomed_g_home:
      snomed_g_bin = snomed_g_home.rstrip(pathsep) + pathsep + 'bin'
    else:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
  validated = False
  while not validated:
    if len(snomed_g_bin)==0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else:  # try to validate -- look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated:
        print('Can\'t find [%s]' % target_file)
        snomed_g_bin = ''
  snomed_g_bin = os.path.abspath(snomed_g_bin)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # connect to NEO4J, make sure the information given is good
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)
  # connect to RF2 files, make sure the rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep) + pathsep + 'validate_status.db'))
  # create YYYYMMDD string
  d = datetime.datetime.now()  # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
  job_start_datetime = datetime.datetime.now()
  # Commands needed to validate a SNOMED_G Graph Database
  commands_d = {
    'JOB_START':
      {'stepname': 'JOB_START',
       'log': 'JOB-START(release_type:[%s], rf2:[%s], date:[%s])'
              % (opts.release_type, opts.rf2, yyyymmdd)},
    'VALIDATE_CONCEPTS':
      {'stepname': 'VALIDATE_CONCEPTS',
       'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element concept --release_type %s --rf2 %s --neopw %s'
              % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
       'mode': ['validate']},
    'VALIDATE_DESCRIPTIONS':
      {'stepname': 'VALIDATE_DESCRIPTIONS',
       'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element description --release_type %s --rf2 %s --neopw %s'
              % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
       'mode': ['validate']},
    'VALIDATE_ISA_RELS':
      {'stepname': 'VALIDATE_ISA_RELS',
       'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element isa_rel --release_type %s --rf2 %s --neopw %s'
              % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
       'mode': ['validate']},
    'VALIDATE_DEFINING_RELS':
      {'stepname': 'VALIDATE_DEFINING_RELS',
       'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element defining_rel --release_type %s --rf2 %s --neopw %s'
              % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
       'mode': ['validate']},
    'JOB_END':
      {'stepname': 'JOB_END', 'log': 'JOB-END'}
  }
  command_list_validate_build = \
      [ commands_d[x] for x in ['JOB_START', 'VALIDATE_CONCEPTS', 'VALIDATE_DESCRIPTIONS',
                                'VALIDATE_ISA_RELS', 'VALIDATE_DEFINING_RELS', 'JOB_END'] ]
  command_list = command_list_validate_build
  stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (VALIDATE)
  results_d = {}
  for command_d in command_list:
    # extract from the step dictionary
    stepname, cmd, logmsg, expected_status, mode_requirement = \
        command_d['stepname'], command_d.get('cmd', None), command_d.get('log', None), \
        command_d.get('expected_status', 0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement:
      continue  # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname != 'JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    output, err = '', ''
    results_d[stepname]['result'] = 'SUCCESS'  # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    results_d[stepname]['error_count'] = 0  # default
    print(stepname)
    print(stepname, file=logfile)  # indicate to the user what step we are on
    if logmsg:  # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg  # LOG everything after 'LOG:'
    else:  # execute command (cmd) in a subprocess
      print(cmd, file=logfile)
      try:  # SUBPROCESS creation
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir)  # move to output_dir, to start the subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir != '.': os.chdir(saved_pwd)  # get back (popd)
        status = 0  # if no exception -- status guaranteed to be zero
      except subprocess.CalledProcessError as e:
        status = e.returncode  # by validate_graphdb convention, this code is the number of discrepancies found
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = False  # keep validating -- might be fine, should_break controls termination
      except:  # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname, cmd)
        should_break = True
      else:  # no exception
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          results_d[stepname]['error_count'] = status  # validate_graphdb convention is to return the discrepancy count
          should_break = True  # no steps are optional
    # Book-keeping
    cmd_end = datetime.datetime.now()  # stop timer
    cmd_seconds = (cmd_end-cmd_start).seconds
    results_d[stepname]['elapsed_seconds'] = cmd_seconds
    if len(output) > 0: results_d[stepname]['STDOUT'] = output.replace('\n','<EOL>')
    if len(err) > 0: results_d[stepname]['STDERR'] = err.replace('\n','<EOL>')
    results_d[stepname]['cmd_start'] = cmd_start
    results_d[stepname]['cmd_end'] = cmd_end
    if should_break: break
  # Write results to the database
  save_and_report_results(DB, seqnum, stepnames, results_d)
  # Done
  sys.exit(0)
  return
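# The --neopw64 flag above expects the Neo4j password base64-encoded (the
# v1.2 --neopw flag takes it in the clear).  A minimal sketch of producing
# such a value, assuming only the standard 'base64' module; the password and
# helper name are hypothetical placeholders, not part of the pipeline:
def _demo_make_neopw64(password='changeme'):  # hypothetical helper for illustration
  import base64
  encoded = base64.b64encode(password.encode('utf-8'))  # eg: b'Y2hhbmdlbWU='
  return encoded.decode('ascii')  # pass this string as --neopw64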
def validate_graphdb(arglist):

  def rf2_filename(element, view=None):  # rf2_folders is set in validate_graphdb initialization
    return rf2_folders.rf2_file_path(element, view)  # element is eg: 'concept'

  def old_compute_hist_changes(new_field_values, prev_field_values, field_names):
    # find a map with only the modified fields
    return { field_names[idx]: new_field_values[idx]
             for idx in range(len(field_names))
             if db_data_prep(new_field_values[idx]) != db_data_prep(prev_field_values[idx]) }

  '''
  HISTORY COMPUTATION -- example information state for a concept
  (need to understand this for history computation):
    csv_fields     = ['id','effectiveTime','active','moduleId','definitionStatusId','FSN','history']
    field_names    = ['id','effectiveTime','active','moduleId','definitionStatusId']
    renamed_fields = {}
    id -- '293672009'
    concepts_d[id]['20160301'] -- concepts_d[id] is a map keyed by effectiveTime; its value
        is the list of attribute values for that time, in the same order as in the RF2 file
    graph_matches_d[id] (graph) --
        {u'nodetype': u'concept', u'effectiveTime': u'20060131',
         u'FSN': u'Antiemetic allergy (disorder)', u'definitionStatusId': u'900000000000073002',
         u'sctid': u'293672009', u'active': u'1', u'moduleId': u'900000000000207008',
         u'id': u'293672009',
         u'history': u'[{"active": "1", "effectiveTime": "20020131", ...}, ...]'}
  '''

  def compute_history_string(id, rf2_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields):
    if opts.release_type == 'full':  # compute history -- we have all the information
      historical_effectiveTimes = sorted(rf2_d[id].keys())[:-1]  # exclude 'current' (latest)
      hist = [ { nm: rf2_d[id][effTime][rf2_fields_d[renamed_fields.get(nm,nm)]] for nm in field_names }
               for effTime in historical_effectiveTimes ] \
             if len(rf2_d[id].keys()) > 1 else []
    else:  # not FULL -- can be missing historical info
      if id not in graph_matches_d:
        hist = []
      else:
        old_history = graph_matches_d[id]['history']  # JSON string or empty string
        old_field_values = [ graph_matches_d[id][nm] for nm in field_names ]
        if len(old_history) == 0:  # no previous history -- old values become the previous history
          hist = [ { a: b for a, b in zip(field_names, old_field_values) } ]
        else:  # existing history, not a FULL release -- append previous values from the graph
          hist = json.loads(old_history) + [ { a: b for a, b in zip(field_names, old_field_values) } ]
    return json.dumps(hist) if len(hist) > 0 else ''

  def build_csv_output_line(id, non_rf2_fields, current_effTime, rf2_d, csv_fields_d,
                            field_names, rf2_fields_d, renamed_fields, quoted_fields):
    csv_data = [None]*len(csv_fields_d.keys())
    for nm in field_names:
      csv_data[csv_fields_d[nm]] = db_data_prep(rf2_d[id][current_effTime][rf2_fields_d[renamed_fields.get(nm,nm)]])
    for k, v in non_rf2_fields:  # eg: [('history','<hist-json-str>'),...]
      csv_data[csv_fields_d[k]] = db_data_prep(v)
    if None in csv_data:
      raise ValueError('csv_data %s' % str(csv_data))
    for nm in quoted_fields:  # quote only the necessary fields
      csv_data[csv_fields_d[nm]] = csv_clean_str(csv_data[csv_fields_d[nm]])
    return db_data_prep( ','.join(csv_data) )  # output_line

  #------------------------------------------------------------------------------|
  # CONCEPT CSV files creation -- concept_new.csv, concept_chg.csv                |
  #------------------------------------------------------------------------------|
  def validate_concepts():

    def concept_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in concepts_d:
        concepts_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** Concept id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
      if effTime in concepts_d[id]:
        raise ValueError('*** Concept id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      concepts_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def Fsn_cb(fields, fields_d, hist):
      all_Fsn_in_Rf2_d[ db_data_prep(fields[ fields_d['conceptId'] ]) ] = db_data_prep(fields[ fields_d['term'] ])  # FSN

    def Fsn_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_FSN

    # validate_concepts:
    # ==> generate concept_new.csv, concept_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '{:04d}_validate_concepts'.format(timing_idx); timing_start(timing_d, timing_overall_nm)
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_description'.format(timing_idx); timing_start(timing_d, timing_nm)
    all_Fsn_in_Rf2_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(Fsn_cb, Fsn_filter, False)
    timing_end(timing_d, timing_nm)
    f_new, f_chg = io.open('concept_new.csv','w',encoding='utf8'), io.open('concept_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['concept']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['concept']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['FSN','history'] ]  # exclude non-RF2 history and FSN (external)
    renamed_fields = attributes_by_file.renamed_fields['concept']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['concept']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f)  # header
    # create concepts_d with information from the DELTA/SNAPSHOT/FULL concept file
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_concept'.format(timing_idx); timing_start(timing_d, timing_nm)
    concepts_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('concept') ).process_file(concept_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = list(concepts_d.keys())
    Fsn_d = { k: all_Fsn_in_Rf2_d[k] for k in set(all_Fsn_in_Rf2_d.keys()).intersection(set(rf2_idlist)) }  # set compare handles ascii+unicode
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # Look for existing FSN values in the graph
    print('count of FSNs in RF2: %d' % len(Fsn_d.keys()))
    if opts.action=='create':
      graph_matches_d = {}
    else:  # NEO4J -- look for these concepts (N at a time)
      timing_idx += 1; timing_nm = '{:04d}_neo4j_lookup_concepts'.format(timing_idx); timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_concepts_for_ids(rf2_idlist)  # this includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_concepts()
      timing_end(timing_d, timing_nm)
      print('Found %d of the IDs+FSNs in the graph DB' % len(graph_matches_d.keys()))
      # Set any missing FSN values from the graph
      target_id_set = set(graph_matches_d.keys()) - set(Fsn_d.keys())
      print('Filling in %d FSN values from the graph' % len(target_id_set))
      for id in list(target_id_set):
        Fsn_d[id] = graph_matches_d[id]['FSN']
      print('count of FSNs after merge with RF2 FSNs: %d' % len(Fsn_d.keys()))
    # Make sure all ids have an FSN
    if sorted(Fsn_d.keys()) != sorted(rf2_idlist):
      raise ValueError('*** (sanity check failure) Can\'t find FSN for all IDs in release ***')
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '{:04d}_generate_csvs'.format(timing_idx); timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(concepts_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif concepts_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> SKIP
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, concepts_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('FSN',Fsn_d[id]),('history',hist_str)], current_effTime,
                                          concepts_d, csv_fields_d, field_names, rf2_fields_d,
                                          renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION -- return the number of errors as the exit code (zero ==> SUCCESS)
  # END validate_concepts

  #------------------------------------------------------------------------------|
  # DESCRIPTION CSV files -- descrip_new.csv, descrip_chg.csv                     |
  #------------------------------------------------------------------------------|
  def validate_descriptions():

    def description_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in description_d:
        description_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** Description id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
      if effTime in description_d[id]:
        raise ValueError('*** Description id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      description_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]  # DON'T USE "id" -- use the id associated with the Description
      if id in language_d and language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA:
        return  # PREFER the US definition
      language_d[id] = { nm: fields[ fields_d[nm] ] for nm in fields_d.keys() }

    def snapshot_language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]
      if id in snapshot_language_d and snapshot_language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA:
        return  # prefer the US definition
      snapshot_language_d[id] = { nm: fields[ fields_d[nm] ] for nm in fields_d.keys() }

    def compute_descriptionType(typeId, acceptabilityId):
      return 'FSN'       if typeId=='900000000000003001' \
        else 'Preferred' if typeId=='900000000000013009' and acceptabilityId=='900000000000548007' \
        else 'Synonym'

    # validate_descriptions:
    # ==> generate descrip_new.csv, descrip_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0, 'no_language': 0 }
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_validate_descriptions' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 DESCRIPTION FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_description' % timing_idx; timing_start(timing_d, timing_nm)
    description_d, language_d, snapshot_language_d = {}, {}, {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(description_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = list(description_d.keys())
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # READ RF2 LANGUAGE FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_language' % timing_idx; timing_start(timing_d, timing_nm)
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language') ).process_file(language_cb, None, False)
    timing_end(timing_d, timing_nm)
    if opts.release_type=='delta':  # need the snapshot file as fallback for potentially missing historical information
      print('read snapshot language values')
      timing_idx += 1; timing_nm = '%04d_read_rf2_language_snapshot' % timing_idx; timing_start(timing_d, timing_nm)
      snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language','Snapshot') ).process_file(snapshot_language_cb, None, False)
      print('read')
      timing_end(timing_d, timing_nm)
    # CSV INIT, ATTRIBUTE NAMES MANAGEMENT
    f_new, f_chg = io.open('descrip_new.csv','w',encoding='utf8'), io.open('descrip_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['description']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['description']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['id128bit','acceptabilityId','refsetId','descriptionType','history'] ]
    renamed_fields = attributes_by_file.renamed_fields['description']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['description']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action=='create':
      graph_matches_d = {}
    else:  # 'update' (compare vs Graph)
      # READ NEO4J DESCRIPTIONS
      timing_idx += 1; timing_nm = '%04d_neo4j_lookup_DESCRIPTIONS' % timing_idx; timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_descriptions_for_ids(rf2_idlist)
      else:
        graph_matches_d = neo4j.lookup_all_descriptions()
      timing_end(timing_d, timing_nm)
    print('count of Descriptions in NEO4J: %d' % len(graph_matches_d.keys()))
    print('count of Language Descriptions in RF2: %d' % len(set(language_d.keys()).intersection(set(rf2_idlist))))
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '%04d_generate_csvs' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(description_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif description_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> no additional processing for this entry
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, description_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      # Need to add the following to the description_d definition ==>
      #   'id128bit','acceptabilityId','descriptionType' (computed from acceptabilityId),'refsetId'
      computed = {}
      current_typeId = description_d[id][current_effTime][rf2_fields_d['typeId']]
      if id in language_d:
        computed['id128bit'] = language_d[id]['id']
        computed['acceptabilityId'] = language_d[id]['acceptabilityId']
        computed['refsetId'] = language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId, language_d[id]['acceptabilityId'])
      elif id in snapshot_language_d:  # empty unless release_type=='delta'; covers things not necessarily in the Graph (any missing releases in the graph)
        computed['id128bit'] = snapshot_language_d[id]['id']
        computed['acceptabilityId'] = snapshot_language_d[id]['acceptabilityId']
        computed['refsetId'] = snapshot_language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId, snapshot_language_d[id]['acceptabilityId'])
      elif id in graph_matches_d:
        computed['id128bit'] = graph_matches_d[id]['id128bit']
        computed['acceptabilityId'] = graph_matches_d[id]['acceptabilityId']
        computed['refsetId'] = graph_matches_d[id]['refsetId']
        computed['descriptionType'] = graph_matches_d[id]['descriptionType']
      else:
        stats['no_language'] += 1
        computed['id128bit'] = '<NA>'
        computed['acceptabilityId'] = '<NA>'
        computed['refsetId'] = '<NA>'
        computed['descriptionType'] = '<NA>'
        if stats['no_language']<=1000:
          print('*** Missing LANGUAGE records for Description %s ***' % id)
        elif stats['no_language']==1001:
          print('*** Missing more than 1000 LANGUAGE records ***')
      non_rf2_fields = [ (x, computed[x]) for x in ['id128bit','acceptabilityId','refsetId','descriptionType'] ] \
                       + [('history', hist_str)]
      output_line = build_csv_output_line(id, non_rf2_fields, current_effTime, description_d, csv_fields_d,
                                          field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    if stats['no_language'] > 0: print('Missing %d LANGUAGE records' % stats['no_language'])
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION -- return the number of errors as the exit code (zero ==> SUCCESS)
  # END validate_descriptions

  #------------------------------------------------------------------------------|
  # ISA_REL CSV files -- isa_rel_new.csv, isa_rel_chg.csv                         |
  #------------------------------------------------------------------------------|
  def validate_isa_rels():

    def isa_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in isa_rel_d:
        isa_rel_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** ISA id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
      if effTime in isa_rel_d[id]:
        raise ValueError('*** ISA id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      isa_rel_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def isa_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_isa_rels:
    # ==> generate isa_rel_new.csv, isa_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_isa_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 RELATIONSHIP FILE -- EXTRACT ISA
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    isa_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(isa_rel_cb, isa_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = list(isa_rel_d.keys())
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('isa_rel_new.csv','w',encoding='utf8'), io.open('isa_rel_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['isa_rel']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['isa_rel']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['isa_rel']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['isa_rel']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action=='create':
      graph_matches_d = {}
    else:
      # EXTRACT ISA RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_ISA' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_isa_rels()  # looking up an ISA by its 'id' is SLOW -- get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL ISA in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in set(all_in_graph.keys()).intersection(set(rf2_idlist)) }  # set intersection handles ascii+unicode, much faster than an "if" test
      print('count of ISA in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:  # must compute updated history for each
      current_effTime = sorted(isa_rel_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif isa_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> no additional processing for this entry
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, isa_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('history',hist_str)], current_effTime, isa_rel_d, csv_fields_d,
                                          field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION -- return the number of errors as the exit code (zero ==> SUCCESS)
  # END validate_isa_rels

  #------------------------------------------------------------------------------|
  # DEFINING_REL CSV files -- defining_rel_new.csv, defining_rel_chg.csv          |
  #------------------------------------------------------------------------------|
  def validate_defining_rels():

    def defining_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in defining_rel_d:
        defining_rel_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** DEFINING-REL id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
      if effTime in defining_rel_d[id]:
        raise ValueError('*** DEFINING-REL id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      defining_rel_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def defining_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] != snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_defining_rels:
    # ==> generate defining_rel_new.csv, defining_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_defining_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ all_roles.csv (tiny file)
    timing_idx += 1; timing_nm = '%04d_read_all_roles' % timing_idx; timing_start(timing_d, timing_nm)
    roleHash = {}
    with open('all_roles.csv') as f:
      for idx, line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
        if idx==0: continue  # skip header -- typeId,rolename
        typeId, rolename = line.split(',')
        roleHash[typeId] = rolename
    timing_end(timing_d, timing_nm)
    # READ RF2 RELATIONSHIP FILE -- EXTRACT DEFINING-RELS
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    defining_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(defining_rel_cb, defining_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = list(defining_rel_d.keys())
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('defining_rel_new.csv','w',encoding='utf8'), io.open('defining_rel_chg.csv','w',encoding='utf8')
    f_edge_rem = io.open('defining_rel_edge_rem.csv','w',encoding='utf8')
    print(db_data_prep('id,rolegroup,sourceId,destinationId'), file=f_edge_rem)
    outfile_list = [f_new, f_chg]
    f_DRs = {}  # per-defining-relationship-type output files
    rf2_fields = attributes_by_file.rf2_fields['defining_rel']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['defining_rel']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['defining_rel']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['defining_rel']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # EXTRACT DEFINING RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_DEFINING_RELS' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_defining_rels()  # looking up a rel by its 'id' is SLOW -- get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL DEFINING-REL in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in set(all_in_graph.keys()).intersection(set(rf2_idlist)) }  # set intersection handles ascii+unicode, much faster than an "if" test
      print('count of DEFINING-REL in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    f_used_roles = open('used_roles.csv','w'); print('typeId,rolename', file=f_used_roles)
    for id in rf2_idlist:  # must compute updated history for each
      current_effTime = sorted(defining_rel_d[id].keys())[-1]  # highest effectiveTime is current
      current_typeId = defining_rel_d[id][current_effTime][rf2_fields_d['typeId']]
      rolegroup_changed = False  # if this occurred, treat as create instead of change (it requires edge remove + edge create)
      if id not in graph_matches_d:
        stats['new'] += 1
        if current_typeId not in f_DRs:
          f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId], 'w')
          print(csv_header, file=f_DRs[current_typeId])
          print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      elif defining_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> no additional processing for this entry
      else:
        stats['change'] += 1
        # see if the rolegroup changed
        if graph_matches_d[id]['rolegroup'] != defining_rel_d[id][current_effTime][ rf2_fields_d['relationshipGroup'] ]:  # rolegroup change?
          print('%s,%s,%s,%s' % (id, graph_matches_d[id]['rolegroup'], graph_matches_d[id]['sctid'], graph_matches_d[id]['destinationId']), file=f_edge_rem)
          rolegroup_changed = True  # treat this as an edge-create case
      hist_str = compute_history_string(id, defining_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('history',hist_str)], current_effTime, defining_rel_d, csv_fields_d,
                                          field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      for f in ([f_chg] if rolegroup_changed==False and id in graph_matches_d else [f_new, f_DRs[current_typeId]]):
        print(output_line, file=f)
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list + [f_edge_rem] + [f_DRs[typeId] for typeId in f_DRs.keys()] + [f_used_roles]:
      f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION -- return the number of errors as the exit code (zero ==> SUCCESS)
  # END validate_defining_rels

  # validate_graphdb:
  # Output: result displayed to STDOUT (plus exceptions file)
  opt = optparse.OptionParser()
  opt.add_option('--verbose', action='store_true', dest='verbose')
  opt.add_option('--rf2', action='store', dest='rf2')
  opt.add_option('--element', action='store', choices=['concept','description','isa_rel','defining_rel'])
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta','snapshot','full'])
  opt.add_option('--exceptions_file', action='store', dest='exceptions_file')
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--action', action='store', default='update', choices=['create','update'])
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.element and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: validate_graphdb --element concept/description/isa_rel/defining_rel --rf2 <dir> --release_type delta/snapshot/full [--verbose] --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64:  # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 \
                 else base64.decodestring(opts.neopw64)  # py2
  # Connect to NEO4J
  #neopw = base64.decodestring( json.loads(open('necares_config.json').read())['salt'] )
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)
  # Connect to RF2 files
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Information for comparing RF2 to Graph
  attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
  # POSSIBILITY -- open exceptions file (append if it exists, write header if it did not exist)
  if opts.exceptions_file:
    exceptions_file = open(opts.exceptions_file, 'a')
    if exceptions_file.tell()==0:
      print('element,id,description', file=exceptions_file)  # header
  # determine the field names; NOTE: history is assumed to be the last field added
  if opts.element=='concept':        validate_concepts()
  elif opts.element=='description':  validate_descriptions()
  elif opts.element=='isa_rel':      validate_isa_rels()
  elif opts.element=='defining_rel': validate_defining_rels()
  else:
    print('unknown element [%s]' % opts.element); sys.exit(1)
  return
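# The 'history' CSV column written above is a JSON array of an element's
# prior attribute states, oldest first, with the current state excluded.
# A minimal standalone sketch of that convention, mirroring the 'full'
# branch of compute_history_string; the id and field values are toy data,
# not real RF2 content:
def _demo_history_string():  # hypothetical illustration
  import json
  field_names = ['id', 'effectiveTime', 'active']
  states_by_effTime = {  # toy stand-in for concepts_d[id]: effectiveTime -> field values
    '20020131': ['293672009', '20020131', '1'],
    '20060131': ['293672009', '20060131', '0'],
    '20160301': ['293672009', '20160301', '1'],
  }
  historical = sorted(states_by_effTime.keys())[:-1]  # exclude the current (latest) state
  hist = [ dict(zip(field_names, states_by_effTime[t])) for t in historical ]
  return json.dumps(hist)  # eg: '[{"id": "293672009", "effectiveTime": "20020131", ...}, ...]'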
def TC_from_graph(arglist):

  #-------------------------------------------------------------------------------
  # build_ISA_graph(children, isa_rels)
  # Concept: Reads ISA edges from the extracted relationships, stores them in the
  #          children hash
  #-------------------------------------------------------------------------------
  def build_ISA_graph(children, isa_rels):
    for idvalue in isa_rels.keys():
      isa_map = isa_rels[idvalue]
      active, sourceId, destinationId = isa_map['active'], isa_map['sourceId'], isa_map['destinationId']
      if active=='1':  # active ISA relationship
        if destinationId not in children:  # parent discovered
          children[destinationId] = set([sourceId])  # 1st child, create the set
        else:
          children[destinationId].add(sourceId)  # nth child, add to the set
    return  # done

  #-------------------------------------------------------------------------------
  # compute_TC_table(startnode, children, descendants, visited)
  #-------------------------------------------------------------------------------
  # Based on a method described in "Transitive Closure Algorithms Based on
  # Graph Traversal" by Yannis Ioannidis, Raghu Ramakrishnan, and Linda Winger,
  # ACM Transactions on Database Systems, Vol. 18, No. 3, September 1993,
  # pages 512-576.  Simplified version of their "DAG_DFTC" algorithm.
  #-------------------------------------------------------------------------------
  def compute_TC_table(startnode, children, descendants, visited):
    # recursively depth-first traverse the graph
    visited.add(startnode)
    descendants[startnode] = set([])  # no descendants yet
    if startnode not in children: return  # no-children case, leaf node
    for childnode in children[startnode]:  # for all the children of the startnode
      if childnode not in visited:  # if not yet visited (Note: DFS traversal)
        compute_TC_table(childnode, children, descendants, visited)  # recursively visit the childnode, set descendants
      for descendant in list(descendants[childnode]):  # each descendant of childnode
        descendants[startnode].add(descendant)  # mark descendants of startnode
      descendants[startnode].add(childnode)  # mark immediate child of startnode
    return

  def print_TC_table(descendants, outfile_name):
    fout = open(outfile_name, 'w')
    for startnode in descendants.keys():
      for endnode in list(descendants[startnode]):
        print('%s,%s' % (startnode, endnode), file=fout)
    fout.close()
    return

  def show_timings(t):
    print('NEO4J Graph DB open:        %g' % (t['graph_open_end']-t['graph_open_start']))
    print('ISA extraction from NEO4J:  %g' % (t['isa_get_end']-t['isa_get_start']))
    print('TC computation:             %g' % (t['TC_end']-t['TC_start']))
    print('Output (csv):               %g' % (t['output_write_end']-t['output_write_start']))
    print('Total time:                 %g' % (t['end']-t['start']))

  # TC_from_graph:
  # command-line parsing
  opt = optparse.OptionParser()
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==1 and opts.neopw64):
    print('Usage: cmd TC_from_graph <TCfile-out> --neopw64 <pw>'); sys.exit(1)
  output_TC_filename = args[0]
  # Extract ISA relationships from the graph (active and inactive)
  timings = {}
  timings['start'] = timer()
  timings['graph_open_start'] = timer()
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(base64.decodestring(opts.neopw64))
  timings['graph_open_end'] = timer()
  timings['isa_get_start'] = timer()
  isa_rels = neo4j.lookup_all_isa_rels()
  timings['isa_get_end'] = timer()
  print('Result class: %s' % str(type(isa_rels)))
  print('Returned %d objects' % len(isa_rels))
  # Compute the TC table from the ISA relationships, output to the specified file.
  timings['TC_start'] = timer()
  children, visited, descendants, concept_node = ({}, set(), {}, "138875005")  # init; 138875005 is the SNOMED CT root concept
  build_ISA_graph(children, isa_rels)  # build the 'children' hash
  compute_TC_table(concept_node, children, descendants, visited)
  timings['TC_end'] = timer()
  timings['output_write_start'] = timer()
  print_TC_table(descendants, output_TC_filename)
  timings['output_write_end'] = timer()
  timings['end'] = timer()
  show_timings(timings)
  # All done
  return
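# A tiny self-contained check of the DAG_DFTC-style traversal used above: on
# the toy DAG A->B, A->C, B->D the closure of A must be {B, C, D}.  The node
# names are made up, and since compute_TC_table is a nested helper, the same
# depth-first scheme is restated here for illustration:
def _demo_tc_toy_graph():  # hypothetical illustration of the closure computation
  children = {'A': set(['B', 'C']), 'B': set(['D'])}
  descendants, visited = {}, set()
  def tc(node):  # same traversal shape as compute_TC_table
    visited.add(node)
    descendants[node] = set()
    for child in children.get(node, set()):
      if child not in visited:
        tc(child)
      descendants[node] |= descendants[child]  # fold in the child's descendants
      descendants[node].add(child)             # plus the immediate child itself
  tc('A')
  assert descendants['A'] == set(['B', 'C', 'D'])
  return descendants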
def db_build(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2', action='store', dest='rf2')
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta','snapshot','full'])
  opt.add_option('--action', action='store', dest='action', default='create', choices=['create','update'])
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opt.add_option('--mode', action='store', dest='mode', default='build',
                 choices=['build','prep','make_csvs','run_cypher','validate'])  # build is end-to-end, the others are subsets
  opt.add_option('--logfile', action='store', dest='logfile')
  opt.add_option('--output_dir', action='store', dest='output_dir', default='.')
  opt.add_option('--relationship_file', action='store', dest='relationship_file', default='Relationship')
  opt.add_option('--language_code', action='store', dest='language_code', default='en')
  opt.add_option('--language_name', action='store', dest='language_name', default='Language')
  opt.add_option('--prep_only', action='store_true', dest='prep_only')
  opts, args = opt.parse_args(arglist)
  if not (len(args) == 0 and opts.rf2 and opts.release_type and opts.neopw64):
    print('Usage: db_build --rf2 <dir> --release_type delta/snapshot --neopw64 <base64pw>')
    sys.exit(1)
  # file path separator
  pathsep = '/'
  # make sure the output directory exists and is empty
  opts.output_dir = get_path(opts.output_dir, pathsep)
  if not (os.path.isdir(opts.output_dir) and len(os.listdir(opts.output_dir)) == 0):
    print('*** Output directory is not an empty directory [%s] ***' % opts.output_dir)
    sys.exit(1)
  # open logfile
  logfile = open(opts.output_dir+'build.log', 'w') if not opts.logfile else \
            (sys.stdout if opts.logfile == '-' else open(opts.logfile, 'w'))
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
  #---------------------------------------------------------------------------
  # determine snomed_g_bin -- the bin directory where snomed_g_rf2_tools.py exists --
  # try the SNOMED_G_HOME and SNOMED_G_BIN environment variables,
  # and ask directly if these variables don't exist
  snomed_g_bin = os.environ.get('SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
    if snomed_g_home:
      snomed_g_bin = get_path(snomed_g_home, pathsep) + 'bin'
    else:
      snomed_g_bin = get_path(os.path.dirname(os.path.abspath(__file__)), pathsep)  # default to the python script dir
  validated = False
  while not validated:
    if len(snomed_g_bin) == 0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else:  # try to validate -- look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated:
        print('Can\'t find [%s]' % target_file)
        snomed_g_bin = ''
  snomed_g_bin = get_path(snomed_g_bin, pathsep)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # db_build ==> connect to NEO4J, make sure the information given is good
  if opts.mode == 'build':
    neo4j = snomed_g_lib_neo4j.Neo4j_Access(base64.decodestring(opts.neopw64))
  # Connect to RF2 files, make sure the rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type, opts.relationship_file, opts.language_code)
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep) + pathsep + 'build_status.db'))
  # create YYYYMMDD string
  d = datetime.datetime.now()  # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
  job_start_datetime = datetime.datetime.now()
  # Commands needed to Create/Update a SNOMED_G Graph Database
  command_list_db_build = [
    {'stepname': 'JOB_START',
     'log': 'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
            % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)},
    {'stepname': 'FIND_ROLENAMES',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'FIND_ROLEGROUPS',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs']},
    {'stepname': 'MAKE_CONCEPT_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DESCRIPTION_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_ISA_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DEFINING_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'TEMPLATE_PROCESSING',
     'cmd': 'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
            % (snomed_g_bin, snomed_g_bin, ('create' if opts.action == 'create' else 'update'), opts.rf2, opts.release_type),
     'mode': ['build', 'prep']},
    {'stepname': 'CYPHER_EXECUTION',
     'cmd': 'python %s/snomed_g_neo4j_tools.py run_cypher %s/build.cypher --verbose --neopw64 %s'
            % (snomed_g_bin, opts.output_dir, opts.neopw64),
     'mode': ['build', 'run_cypher']},
    {'stepname': 'CHECK_RESULT',
     'cmd': 'python %s/snomed_g_neo4j_tools.py run_cypher %s/snomed_g_graphdb_update_failure_check.cypher --verbose --neopw64 %s'
            % (snomed_g_bin, snomed_g_bin, opts.neopw64),
     'mode': ['build', 'run_cypher']},
    {'stepname': 'JOB_END', 'log': 'JOB-END'}
  ]
  command_list_db_build_prep = [
    {'stepname': 'JOB_START',
     'log': 'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
            % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)},
    {'stepname': 'FIND_ROLENAMES',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'FIND_ROLEGROUPS',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs']},
    {'stepname': 'MAKE_CONCEPT_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DESCRIPTION_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_ISA_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DEFINING_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'TEMPLATE_PROCESSING',
     'cmd': 'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
            % (snomed_g_bin, snomed_g_bin, ('create' if opts.action == 'create' else 'update'), opts.rf2, opts.release_type),
     'mode': ['build', 'prep']},
    {'stepname': 'JOB_END', 'log': 'JOB-END'}
  ]
  # OLD -- {'stepname':'CYPHER_EXECUTION', 'cmd':'%s/neo4j-shell -localhost -file build.cypher' % neo4j_bin, 'mode':['build','run_cypher']},
  command_list = command_list_db_build if not opts.prep_only else command_list_db_build_prep
  stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (BUILD)
  results_d = {}
  for command_d in command_list:
    # extract from the step dictionary
    stepname, cmd, logmsg, expected_status, mode_requirement = \
        command_d['stepname'], command_d.get('cmd', None), command_d.get('log', None), \
        command_d.get('expected_status', 0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement:
      continue  # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname != 'JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    output, err = '', ''
    results_d[stepname]['result'] = 'SUCCESS'  # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    print(stepname)
    print(stepname, file=logfile)  # indicate to the user what step we are on
    if logmsg:  # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg  # LOG everything after 'LOG:'
    else:  # execute command (cmd) in a subprocess
      print(cmd, file=logfile)
      try:  # SUBPROCESS creation
        #p = subprocess.Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        #output, err = p.communicate(b"")
        #status = p.returncode
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir)  # move to output_dir, to start the subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir != '.': os.chdir(saved_pwd)  # get back (popd)
        status = 0  # if no exception -- status is zero
      except subprocess.CalledProcessError as e:
        status = e.returncode
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = True  # might be fine, should_break controls termination
      except:  # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname, cmd)
        should_break = True
      else:  # no exception
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = True  # no steps are optional
    # Book-keeping
    cmd_end = datetime.datetime.now()  # stop timer
    cmd_seconds = (cmd_end-cmd_start).seconds
    results_d[stepname]['elapsed_seconds'] = cmd_seconds
    if len(output) > 0: results_d[stepname]['STDOUT'] = output.replace('\n','<EOL>')
    if len(err) > 0: results_d[stepname]['STDERR'] = err.replace('\n','<EOL>')
    results_d[stepname]['cmd_start'] = cmd_start
    results_d[stepname]['cmd_end'] = cmd_end
    if should_break: break
  # Write results to the database
  save_and_report_results(DB, seqnum, stepnames, results_d)
  # Done
  sys.exit(0)
  return
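# Each build/validate step above is a small dictionary: a 'cmd' entry is run
# in a subprocess, a 'log' entry is only recorded, and the optional 'mode'
# list gates which --mode values execute the step.  A minimal sketch of that
# gating convention; the step names and echo commands are made up:
def _demo_mode_gating(mode='prep'):  # hypothetical illustration of the command-table convention
  steps = [
    {'stepname': 'LOG_ONLY',   'log': 'JOB-START'},                        # no 'mode' -> always runs
    {'stepname': 'CSV_STEP',   'cmd': 'echo make_csv',   'mode': ['build', 'prep']},
    {'stepname': 'NEO4J_STEP', 'cmd': 'echo run_cypher', 'mode': ['build']},
  ]
  return [s['stepname'] for s in steps if not s.get('mode') or mode in s['mode']]

# _demo_mode_gating('prep') returns ['LOG_ONLY', 'CSV_STEP'] -- the Neo4j
# step is skipped, matching the loop's "continue" behavior above.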
def TC_fordate_from_graph(arglist):

  def active_at_date(datestring, isa_edge):
    active = '0'  # if no information applies (possible), default to inactive
    # check the current definition -- it may be in effect at the given date
    if isa_edge['effectiveTime'] <= datestring:  # the current definition is in effect
      active = isa_edge['active']
    elif len(isa_edge['history']) > 2:  # current definition doesn't apply -- check history ('[]' is length 2, so >2 means non-empty)
      # eg: datestring = 20050101 and current effectiveTime is 20160101 ==> current def not in effect;
      #     history items 20030101 and 20040101 exist ==> 20040101 is in effect at 20050101.
      # note: no need to check the current element again -- already determined not in effect.
      # JSON example: [{"typeId": "116680003", "sourceId": "900000000000441003", ...}, {...}]
      ordered_history_list = json.loads(isa_edge['history'])
      for hist_elem in ordered_history_list:  # list of maps
        if hist_elem['effectiveTime'] > datestring: break  # in the future vs the given date
        if 'active' in hist_elem: active = hist_elem['active']
    return active=='1'

  #-------------------------------------------------------------------------------
  # build_ISA_graph(children, isa_rels, yyyymmdd)
  # Concept: Reads ISA edges from the extracted relationships, stores in the
  #          children hash those active at the given date
  #-------------------------------------------------------------------------------
  def build_ISA_graph(children, isa_rels, yyyymmdd):
    for idvalue in isa_rels.keys():
      isa_map = isa_rels[idvalue]
      sourceId, destinationId = isa_map['sourceId'], isa_map['destinationId']
      if active_at_date(yyyymmdd, isa_map):
        if destinationId not in children:  # parent discovered
          children[destinationId] = set([sourceId])  # 1st child, create the set
        else:
          children[destinationId].add(sourceId)  # nth child, add to the set
    return  # done

  #-------------------------------------------------------------------------------
  # compute_TC_table(startnode, children, descendants, visited)
  #-------------------------------------------------------------------------------
  # Based on a method described in "Transitive Closure Algorithms Based on
  # Graph Traversal" by Yannis Ioannidis, Raghu Ramakrishnan, and Linda Winger,
  # ACM Transactions on Database Systems, Vol. 18, No. 3, September 1993,
  # pages 512-576.  Simplified version of their "DAG_DFTC" algorithm.
  #-------------------------------------------------------------------------------
  def compute_TC_table(startnode, children, descendants, visited):
    # recursively depth-first traverse the graph
    visited.add(startnode)
    descendants[startnode] = set([])  # no descendants yet
    if startnode not in children: return  # no-children case, leaf node
    for childnode in children[startnode]:  # for all the children of the startnode
      if childnode not in visited:  # if not yet visited (Note: DFS traversal)
        compute_TC_table(childnode, children, descendants, visited)  # recursively visit the childnode, set descendants
      for descendant in list(descendants[childnode]):  # each descendant of childnode
        descendants[startnode].add(descendant)  # mark descendants of startnode
      descendants[startnode].add(childnode)  # mark immediate child of startnode
    return

  def print_TC_table(descendants, outfile_name):
    fout = open(outfile_name, 'w')
    for startnode in descendants.keys():
      for endnode in list(descendants[startnode]):
        print('%s,%s' % (startnode, endnode), file=fout)
    fout.close()
    return

  def show_timings(t):
    print('NEO4J Graph DB open:        %g' % (t['graph_open_end']-t['graph_open_start']))
    print('ISA extraction from NEO4J:  %g' % (t['isa_get_end']-t['isa_get_start']))
    print('TC computation:             %g' % (t['TC_end']-t['TC_start']))
    print('Output (csv):               %g' % (t['output_write_end']-t['output_write_start']))
    print('Total time:                 %g' % (t['end']-t['start']))

  # TC_fordate_from_graph:
  # command-line parsing
  opt = optparse.OptionParser()
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==2 and opts.neopw64):
    print('Usage: cmd TC_fordate_from_graph YYYYMMDD <TCfile-out> --neopw64 <pw>'); sys.exit(1)
  yyyymmdd, output_TC_filename = args[0], args[1]
  # Extract ISA relationships from the graph (active and inactive)
  timings = {}
  timings['start'] = timer()
  timings['graph_open_start'] = timer()
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(base64.decodestring(opts.neopw64))
  timings['graph_open_end'] = timer()
  timings['isa_get_start'] = timer()
  isa_rels = neo4j.lookup_all_isa_rels()
  timings['isa_get_end'] = timer()
  print('Result class: %s' % str(type(isa_rels)))
  print('Returned %d objects' % len(isa_rels))
  # Compute the TC table from the ISA relationships active at the given date, output to the specified file.
  timings['TC_start'] = timer()
  children, visited, descendants, concept_node = ({}, set(), {}, "138875005")  # init; 138875005 is the SNOMED CT root concept
  build_ISA_graph(children, isa_rels, yyyymmdd)  # build the 'children' hash
  compute_TC_table(concept_node, children, descendants, visited)
  timings['TC_end'] = timer()
  timings['output_write_start'] = timer()
  print_TC_table(descendants, output_TC_filename)
  timings['output_write_end'] = timer()
  timings['end'] = timer()
  show_timings(timings)
  return
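# active_at_date() walks an edge's 'history' JSON whenever the current
# definition postdates the requested date.  A standalone sketch with made-up
# dates: the current record (20160101) is ignored for a 20050101 query, and
# the newest history entry at or before that date (20040101) decides the
# answer:
def _demo_active_at_date(datestring='20050101'):  # hypothetical illustration
  import json
  isa_edge = {
    'effectiveTime': '20160101', 'active': '1',
    'history': json.dumps([
      {'effectiveTime': '20030101', 'active': '1'},
      {'effectiveTime': '20040101', 'active': '0'},
    ]),
  }
  active = '0'  # default when no record applies
  if isa_edge['effectiveTime'] <= datestring:
    active = isa_edge['active']
  elif len(isa_edge['history']) > 2:  # '[]' is length 2, so this means a non-empty history
    for hist_elem in json.loads(isa_edge['history']):
      if hist_elem['effectiveTime'] > datestring:
        break  # in the future vs the given date
      if 'active' in hist_elem:
        active = hist_elem['active']
  return active == '1'  # False here -- the edge was inactive as of 20040101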