def instantiate(arglist):
  # instantiate:
  # PARSE COMMAND
  # syntax: instantiate <template-file> <outfile>
  opt = optparse.OptionParser()
  opt.add_option('--rf2', action='store', dest='rf2')
  opt.add_option('--release_type', action='store', dest='release_type',
                 choices=['delta', 'snapshot', 'full'])
  opt.add_option('--verbose', action='store_true', dest='verbose')
  opt.add_option('--action', action='store', dest='action', default='create',
                 choices=['create', 'update'])
  opts, args = opt.parse_args(arglist)
  if not (len(args) == 2 and opts.rf2 and opts.release_type):
    print('Usage: instantiate <template-file> <output-file> --rf2 <dir> --release_type {Full,Snapshot,Delta}')
    sys.exit(1)
  template_file, output_file = args
  # Connect to RF2 files
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Information for creating the CSV files
  attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
  # CONFIGURATION
  config = {}
  config['terminology_dir'] = rf2_folders.get_terminology_dir()
  config['release_date'] = rf2_folders.get_release_date()
  config['release_center'] = rf2_folders.get_release_center()
  config['output_dir'] = './'
  # Process template file
  fout = open(output_file, 'w')
  release_date = config['release_date'].strip()
  release_center = config.get('release_center', 'INT')
  os_pathsep = config.get('os_pathsep', '/')  # JGP 2015/10/07, no default previously
  output_dir = get_path(config['output_dir'], os_pathsep)
  if sys.platform == 'win32':
    # JGP 2016/07/30 -- issue "c:\sno\build\us20160301/defining_rel_edge_rem.csv"
    output_dir = output_dir.replace('\\', '/')
  terminology_dir = get_path(config['terminology_dir'], os_pathsep)
  config_file_suffix = '%s_%s' % (release_center, release_date)
  # ubuntu is the else case
  file_protocol = 'file:///' if sys.platform in ['cygwin', 'win32', 'darwin'] else 'file:'
  # NOTE: can result in 'file:////Users/<rest>' on Mac, replace by 'file:///Users/<rest>'
  # INSTANTIATION PT1 -- PROCESS FILES IN TEMPLATE, REPLACING TEMPLATES WITH INSTANTIATED VALUES
  for line in [x.rstrip('\n').rstrip('\r') for x in open(template_file)]:
    line = line.replace('<<<release_date>>>', release_date) \
               .replace('<<<output_dir>>>', output_dir) \
               .replace('<<<terminology_dir>>>', terminology_dir) \
               .replace('<<<config_file_suffix>>>', config_file_suffix) \
               .replace('<<<file_protocol>>>', file_protocol) \
               .replace('file:////', 'file:///')
    print(line, file=fout)
  # INSTANTIATION PT2 -- DEFINING RELATIONSHIPS PROCESSING
  # Handle NEW defining relationships
  # Data source (for new defining relationships):
  #   <output_dir>/defining_rels_new_sorted.csv file
  #     id,active,sctid,rolegroup,typeId,rolename,destinationId,effectiveTime,
  #     moduleId,characteristicTypeId,modifierId,history
  #     4661958023,1,471280008,1,FINDING_SITE,589001,20140731,
  #     900000000000207008,900000000000011006,900000000000451002,
  # Algorithm:
  #   NOTE: already sorted by rolename, so all FINDING_SITE elements together, etc
  #     ./snomed_sort_csv.py --fields 'rolename' --string
  #         defining_rels_new.csv defining_rels_new_sorted.csv
  #   ==> create separate files for each defining-relationship type that
  #       is found, eg: DR_<snomedct-code>_new.csv
  #   ==> add CYPHER code to process the created files and add the
  #       defining relationships.
  with open(output_dir + 'used_roles.csv') as f:
    for idx, line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
      if idx == 0: continue  # typeId,rolename
      typeId, rolename = line.split(',')
      # create CYPHER to load the file and add the relationships to ROLE_GROUP nodes
      # JGP 2017-10-31.
      # Use a 2-step procedure for creating the defining relationships,
      # to support systems with smaller amounts of memory (use smaller transactions).
      # The first step creates any necessary role groups, and the second step creates
      # the defining relationship edges from role groups to the specified target concepts.
      print('// %s defining relationships' % rolename, file=fout)
      print('''RETURN 'NEW Defining relationships of type %s';''' % rolename, file=fout)
      print(file=fout)
      print('USING PERIODIC COMMIT 200', file=fout)
      load_csv_line = ('LOAD CSV with headers from "%s%sDR_%s_new.csv" as line'
                       % (('file:///' if sys.platform in ['cygwin', 'win32', 'darwin'] else 'file:'),
                          output_dir, typeId)).replace('file:////', 'file:///')
      print(load_csv_line, file=fout)
      print('with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id', file=fout)
      # EMonson changed : typo in original
      print('MERGE (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id });', file=fout)
      print(file=fout)
      print('// Add defining relationship edge in 2nd step, Java memory issue', file=fout)
      print('USING PERIODIC COMMIT 200', file=fout)
      load_csv_line = ('LOAD CSV with headers from "%s%sDR_%s_new.csv" as line'
                       % (('file:///' if sys.platform in ['cygwin', 'win32', 'darwin'] else 'file:'),
                          output_dir, typeId)).replace('file:////', 'file:///')
      print(load_csv_line, file=fout)
      print('with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id', file=fout)
      print('MATCH (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id })', file=fout)
      print('WITH line,rg,source_id,dest_id,rolegroup_id', file=fout)
      print('MATCH (c:ObjectConcept { sctid: dest_id })', file=fout)
      print('CREATE UNIQUE (rg)-[:%s { id: line.id, active: line.active, sctid: source_id,' % rolename, file=fout)
      print('    typeId: line.typeId,', file=fout)
      print('    rolegroup: rolegroup_id, effectiveTime: line.effectiveTime,', file=fout)
      print('    moduleId: line.moduleId, characteristicTypeId: line.characteristicTypeId,', file=fout)
      print('    modifierId: line.modifierId,', file=fout)
      print('    history: line.history }]->(c);', file=fout)
  # close CSV, wrap up
  print('// Finito', file=fout)
  fout.close()
  return
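
# For orientation, a sketch of the Cypher that the PT2 loop above emits for one role.
# FINDING_SITE / typeId 363698007 is an illustrative example only; the real
# rolename/typeId pairs come from used_roles.csv:
#
#   // FINDING_SITE defining relationships
#   RETURN 'NEW Defining relationships of type FINDING_SITE';
#
#   USING PERIODIC COMMIT 200
#   LOAD CSV with headers from "file:///<output_dir>DR_363698007_new.csv" as line
#   with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id
#   MERGE (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id });
#
#   // Add defining relationship edge in 2nd step, Java memory issue
#   USING PERIODIC COMMIT 200
#   LOAD CSV with headers from "file:///<output_dir>DR_363698007_new.csv" as line
#   with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id
#   MATCH (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id })
#   WITH line,rg,source_id,dest_id,rolegroup_id
#   MATCH (c:ObjectConcept { sctid: dest_id })
#   CREATE UNIQUE (rg)-[:FINDING_SITE { id: line.id, ..., history: line.history }]->(c);
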
def validate_graphdb(arglist):

  def rf2_filename(element, view=None):
    # rf2_folders is set in validate_graphdb initialization
    return rf2_folders.rf2_file_path(element, view)  # eg: 'concept'

  def old_compute_hist_changes(new_field_values, prev_field_values, field_names):
    # find map with only modified fields
    return { field_names[idx]: new_field_values[idx]
             for idx in range(len(field_names))
             if db_data_prep(new_field_values[idx]) != db_data_prep(prev_field_values[idx]) }

  '''
  HISTORY COMPUTATION -- Example information for a concept:
  Information state example (need to understand for history computation)
    csv_fields  = ['id','effectiveTime','active','moduleId','definitionStatusId','FSN','history']
    field_names = ['id','effectiveTime','active','moduleId','definitionStatusId']
    renamed_fields = {}
    id -- '293672009'
    concepts_d[id]['20160301'] --
        concepts_d[id] is a map keyed by effectiveTime, its value ==> list of attribute
        values for that time, in same order as in RF2 file
    graph_matches_d[id] (graph) --
        {u'nodetype': u'concept', u'effectiveTime': u'20060131',
         u'FSN': u'Antiemetic allergy (disorder)',
         u'definitionStatusId': u'900000000000073002', u'sctid': u'293672009',
         u'active': u'1', u'moduleId': u'900000000000207008', u'id': u'293672009',
         u'history': u'[{"active": "1", "effectiveTime": "20020131", ...}, ...]'}
  '''

  def compute_history_string(id, rf2_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields):
    if opts.release_type == 'full':  # compute history, have all information
      historical_effectiveTimes = sorted(rf2_d[id].keys())[:-1]  # exclude 'current' (latest)
      hist = [ { nm: rf2_d[id][effTime][rf2_fields_d[renamed_fields.get(nm, nm)]] for nm in field_names }
               for effTime in historical_effectiveTimes ] \
             if len(rf2_d[id].keys()) > 1 else []
    else:  # not FULL, can be missing historical info
      if id not in graph_matches_d:
        hist = []
      else:
        old_history = graph_matches_d[id]['history']  # JSON string or empty string
        old_field_values = [ graph_matches_d[id][nm] for nm in field_names ]
        if len(old_history) == 0:  # no prev history, old values ==> previous history
          hist = [ { a: b for a, b in zip(field_names, old_field_values) } ]
        else:  # existing history, not FULL release, append previous values from graph (previous history)
          hist = json.loads(old_history) + [ { a: b for a, b in zip(field_names, old_field_values) } ]
    return json.dumps(hist) if len(hist) > 0 else ''

  def build_csv_output_line(id, non_rf2_fields, current_effTime, rf2_d, csv_fields_d,
                            field_names, rf2_fields_d, renamed_fields, quoted_fields):
    csv_data = [None]*len(csv_fields_d.keys())
    for nm in field_names:
      csv_data[csv_fields_d[nm]] = db_data_prep(rf2_d[id][current_effTime][rf2_fields_d[renamed_fields.get(nm, nm)]])
    for k, v in non_rf2_fields:  # eg: [('history','<hist-json-str>'),...]
      csv_data[csv_fields_d[k]] = db_data_prep(v)
    if None in csv_data:
      raise ValueError('csv_data %s' % str(csv_data))
    for nm in quoted_fields:  # quote only necessary fields
      csv_data[csv_fields_d[nm]] = csv_clean_str(csv_data[csv_fields_d[nm]])
    return db_data_prep(','.join(csv_data))  # output_line

  #------------------------------------------------------------------------------|
  # CONCEPT CSV files creation -- concept_new.csv, concept_chg.csv                |
  #------------------------------------------------------------------------------|
  def validate_concepts():

    def concept_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in concepts_d:
        concepts_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** Concept id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
        if effTime in concepts_d[id]:
          raise ValueError('*** Concept id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      concepts_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def Fsn_cb(fields, fields_d, hist):
      all_Fsn_in_Rf2_d[ db_data_prep(fields[ fields_d['conceptId'] ]) ] = db_data_prep(fields[ fields_d['term'] ])  # FSN

    def Fsn_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_FSN

    # validate_concepts:
    # ==> generate concept_new.csv, concept_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }  # counters incremented below
    timing_d = { }
    timing_idx = 0
    timing_overall_nm = '{:04d}_validate_concepts'.format(timing_idx); timing_start(timing_d, timing_overall_nm)
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_description'.format(timing_idx); timing_start(timing_d, timing_nm)
    all_Fsn_in_Rf2_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(Fsn_cb, Fsn_filter, False)
    timing_end(timing_d, timing_nm)
    f_new, f_chg = io.open('concept_new.csv','w',encoding='utf8'), io.open('concept_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['concept']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['concept']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['FSN','history'] ]  # exclude non-RF2 history and FSN (external)
    renamed_fields = attributes_by_file.renamed_fields['concept']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['concept']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
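    # Example row shape (values are illustrative, taken from the HISTORY COMPUTATION notes above):
    #   csv_fields -> ['id','effectiveTime','active','moduleId','definitionStatusId','FSN','history']
    #   one line   -> 293672009,20060131,1,900000000000207008,900000000000073002,
    #                 "Antiemetic allergy (disorder)","[{...prior states...}]"
    # FSN and history are computed (non-RF2) fields; the rest are copied from the RF2 concept file.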
    for f in outfile_list: print(csv_header, file=f)  # header
    # create concepts_d with information from DELTA/SNAPSHOT/FULL concept file
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_concept'.format(timing_idx); timing_start(timing_d, timing_nm)
    concepts_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('concept') ).process_file(concept_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = concepts_d.keys()
    Fsn_d = { k: all_Fsn_in_Rf2_d[k] for k in list(set(all_Fsn_in_Rf2_d.keys()).intersection(set(rf2_idlist))) }  # sets compare ascii+unicode
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # Look for existing FSN values in graph
    print('count of FSNs in RF2: %d' % len(Fsn_d.keys()))
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # NEO4J -- look for these concepts (N at a time)
      timing_idx += 1; timing_nm = '{:04d}_neo4j_lookup_concepts'.format(timing_idx); timing_start(timing_d, timing_nm)
      if opts.release_type == 'delta':
        graph_matches_d = neo4j.lookup_concepts_for_ids(rf2_idlist)  # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_concepts()
      timing_end(timing_d, timing_nm)
      print('Found %d of the IDs+FSNs in the graph DB:' % len(graph_matches_d.keys()))
    # Set any missing FSN values from the Graph
    target_id_set = set(graph_matches_d.keys()) - set(Fsn_d.keys())
    print('Filling in %d FSN values from the graph' % len(target_id_set))
    for id in list(target_id_set):
      Fsn_d[id] = graph_matches_d[id]['FSN']
    print('count of FSNs after merge with RF2 FSNs: %d' % len(Fsn_d.keys()))
    # Make sure all ids have an FSN
    if sorted(Fsn_d.keys()) != sorted(rf2_idlist):
      raise ValueError('*** (sanity check failure) Cant find FSN for all IDs in release ***')
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '{:04d}_generate_csvs'.format(timing_idx); timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(concepts_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif concepts_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> SKIP
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, concepts_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('FSN',Fsn_d[id]),('history',hist_str)], current_effTime,
                                          concepts_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_concepts

  #------------------------------------------------------------------------------|
  # DESCRIPTION CSV files -- descrip_new.csv, descrip_chg.csv                     |
  #------------------------------------------------------------------------------|
  def validate_descriptions():

    def description_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in description_d:
        description_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** Description id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
        if effTime in description_d[id]:
          raise ValueError('*** Description id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      description_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]  # DONT USE "id", use the id associated with the Description
      if id in language_d and language_d[id]['refsetId'] == snomedct_constants.SNOMEDCT_REFSETID_USA:
        return  # PREFER US definition
      language_d[id] = { nm: fields[ fields_d[nm] ] for nm in fields_d.keys() }

    def snapshot_language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]
      if id in snapshot_language_d and snapshot_language_d[id]['refsetId'] == snomedct_constants.SNOMEDCT_REFSETID_USA:
        return  # prefer US def
      snapshot_language_d[id] = { nm: fields[ fields_d[nm] ] for nm in fields_d.keys() }

    def compute_descriptionType(typeId, acceptabilityId):
      return 'FSN' if typeId == '900000000000003001' \
        else 'Preferred' if typeId == '900000000000013009' and acceptabilityId == '900000000000548007' \
        else 'Synonym'

    # validate_descriptions:
    # ==> generate descrip_new.csv, descrip_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0, 'no_language': 0 }  # counters incremented below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_validate_descriptions' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 DESCRIPTION FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_description' % timing_idx; timing_start(timing_d, timing_nm)
    description_d, language_d, snapshot_language_d = {}, {}, {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(description_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = description_d.keys()
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # READ RF2 LANGUAGE FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_language' % timing_idx; timing_start(timing_d, timing_nm)
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language') ).process_file(language_cb, None, False)
    timing_end(timing_d, timing_nm)
    if opts.release_type == 'delta':
      # need snapshot file for fallback of potential missing historical information
      print('read snapshot language values')
      timing_idx += 1; timing_nm = '%04d_read_rf2_language_snapshot' % timing_idx; timing_start(timing_d, timing_nm)
      snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language','Snapshot') ).process_file(snapshot_language_cb, None, False); print('read')
      timing_end(timing_d, timing_nm)
    # CSV INIT, ATTRIBUTE NAMES MANAGEMENT
    f_new, f_chg = io.open('descrip_new.csv','w',encoding='utf8'), io.open('descrip_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['description']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['description']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['id128bit','acceptabilityId','refsetId','descriptionType','history'] ]
    renamed_fields = attributes_by_file.renamed_fields['description']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['description']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
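    # compute_descriptionType mapping, per the SNOMED CT metadata ids used above:
    #   typeId 900000000000003001 ==> 'FSN'
    #   typeId 900000000000013009 (synonym) + acceptabilityId 900000000000548007 (preferred) ==> 'Preferred'
    #   anything else ==> 'Synonym'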
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:  # 'update' (compare vs Graph)
      # READ NEO4J DESCRIPTIONS
      timing_idx += 1; timing_nm = '%04d_neo4j_lookup_DESCRIPTIONS' % timing_idx; timing_start(timing_d, timing_nm)
      if opts.release_type == 'delta':
        graph_matches_d = neo4j.lookup_descriptions_for_ids(rf2_idlist)  # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_descriptions()
      timing_end(timing_d, timing_nm)
      print('count of Descriptions in NEO4J: %d' % len(graph_matches_d.keys()))
    print('count of Language Descriptions in RF2: %d' % len(list(set(language_d.keys()).intersection(set(rf2_idlist)))))
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '%04d_generate_csvs' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(description_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif description_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, description_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      # Need to add the following to the description_d definition ==>
      #   'id128bit','acceptabilityId','descriptionType' (compute from acceptabilityId),'refsetId'
      computed = {}
      current_typeId = description_d[id][current_effTime][rf2_fields_d['typeId']]
      if id in language_d:
        computed['id128bit'] = language_d[id]['id']
        computed['acceptabilityId'] = language_d[id]['acceptabilityId']
        computed['refsetId'] = language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId, language_d[id]['acceptabilityId'])
      elif id in snapshot_language_d:  # empty unless view=='delta', things not necessarily in Graph (any missing releases in graph)
        computed['id128bit'] = snapshot_language_d[id]['id']
        computed['acceptabilityId'] = snapshot_language_d[id]['acceptabilityId']
        computed['refsetId'] = snapshot_language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId, snapshot_language_d[id]['acceptabilityId'])
      elif id in graph_matches_d:
        computed['id128bit'] = graph_matches_d[id]['id128bit']
        computed['acceptabilityId'] = graph_matches_d[id]['acceptabilityId']
        computed['refsetId'] = graph_matches_d[id]['refsetId']
        computed['descriptionType'] = graph_matches_d[id]['descriptionType']
      else:
        stats['no_language'] += 1
        computed['id128bit'] = '<NA>'
        computed['acceptabilityId'] = '<NA>'
        computed['refsetId'] = '<NA>'
        computed['descriptionType'] = '<NA>'
        if stats['no_language'] <= 1000:
          print('*** Missing LANGUAGE records for Description %s ***' % id)
        elif stats['no_language'] == 1001:
          print('*** Missing more than 1000 LANGUAGE records ***')
      non_rf2_fields = [ (x, computed[x]) for x in ['id128bit','acceptabilityId','refsetId','descriptionType'] ] + [('history', hist_str)]
      output_line = build_csv_output_line(id, non_rf2_fields, current_effTime, description_d,
                                          csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    if stats['no_language'] > 0: print('Missing %d LANGUAGE records' % stats['no_language'])
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    # DONE
    sys.exit(stats['error_count'])  # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_descriptions

  #------------------------------------------------------------------------------|
  # ISA_REL CSV files -- isa_rel_new.csv, isa_rel_chg.csv                         |
  #------------------------------------------------------------------------------|
  def validate_isa_rels():

    def isa_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in isa_rel_d:
        isa_rel_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** ISA id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
        if effTime in isa_rel_d[id]:
          raise ValueError('*** ISA id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      isa_rel_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def isa_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_isa_rels:
    # ==> generate isa_rel_new.csv, isa_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }  # counters incremented below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_isa_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 RELATIONSHIP FILE - EXTRACT ISA
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    isa_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(isa_rel_cb, isa_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = isa_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('isa_rel_new.csv','w',encoding='utf8'), io.open('isa_rel_chg.csv','w',encoding='utf8')
    outfile_list = [f_new, f_chg]
    rf2_fields = attributes_by_file.rf2_fields['isa_rel']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['isa_rel']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['isa_rel']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['isa_rel']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
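    # isa_rel_filter above keeps only rows whose typeId is SNOMEDCT_TYPEID_ISA (the
    # SNOMED CT "is a" attribute, 116680003); validate_defining_rels below applies the
    # complementary filter, so the two passes partition the relationship file.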
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # EXTRACT ISA RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_ISA' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_isa_rels()  # looking for ISA by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL ISA in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) }  # set intersection compares ascii+unicode, way faster than "if" test
      print('count of ISA in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:  # must compute updated history for each
      current_effTime = sorted(isa_rel_d[id].keys())[-1]  # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif isa_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, isa_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('history',hist_str)], current_effTime, isa_rel_d,
                                          csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line, file=(f_new if id not in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_isa_rels

  #------------------------------------------------------------------------------|
  # DEFINING_REL CSV files -- defining_rel_new.csv, defining_rel_chg.csv          |
  #------------------------------------------------------------------------------|
  def validate_defining_rels():

    def defining_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in defining_rel_d:
        defining_rel_d[id] = {}  # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full':
          raise ValueError('*** DEFINING-REL id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id, opts.release_type))
        if effTime in defining_rel_d[id]:
          raise ValueError('*** DEFINING-REL id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      defining_rel_d[id][effTime] = fields[:]  # attributes in RF2-defined order

    def defining_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] != snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_defining_rels:
    # ==> generate defining_rel_new.csv, defining_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 }  # counters incremented below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_defining_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ all_roles.csv (tiny file)
    timing_idx += 1; timing_nm = '%04d_read_all_roles' % timing_idx; timing_start(timing_d, timing_nm)
    roleHash = {}
    with open('all_roles.csv') as f:
      for idx, line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
        if idx == 0: continue  # typeId,rolename
        typeId, rolename = line.split(',')
        roleHash[typeId] = rolename
    timing_end(timing_d, timing_nm)
    # READ RF2 RELATIONSHIP FILE - EXTRACT DEFINING-RELS
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    defining_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(defining_rel_cb, defining_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = defining_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('defining_rel_new.csv','w',encoding='utf8'), io.open('defining_rel_chg.csv','w',encoding='utf8')
    f_edge_rem = io.open('defining_rel_edge_rem.csv','w',encoding='utf8')
    print(db_data_prep('id,rolegroup,sourceId,destinationId'), file=f_edge_rem)
    outfile_list = [f_new, f_chg]
    f_DRs = {}  # per-defining-relationship type
    rf2_fields = attributes_by_file.rf2_fields['defining_rel']
    rf2_fields_d = { nm: idx for idx, nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['defining_rel']  # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx, nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['defining_rel']  # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['defining_rel']
    csv_header = db_data_prep(','.join(csv_fields))  # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f)  # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # EXTRACT DEFINING RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_DEFINING_RELS' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_defining_rels()  # looking for rel by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL DEFINING-REL in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) }  # set intersection compares ascii+unicode, way faster than "if" test
      print('count of DEFINING-REL in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    f_used_roles = open('used_roles.csv','w'); print('typeId,rolename', file=f_used_roles)
    for id in rf2_idlist:  # must compute updated history for each
      current_effTime = sorted(defining_rel_d[id].keys())[-1]  # highest effectiveTime is current
      current_typeId = defining_rel_d[id][current_effTime][rf2_fields_d['typeId']]
      rolegroup_changed = False  # if this occurred, treat as create instead of change (as it requires edge remove+edge create)
      if id not in graph_matches_d:
        stats['new'] += 1
        if current_typeId not in f_DRs:
          f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId], 'w')
          print(csv_header, file=f_DRs[current_typeId])
          print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      elif defining_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue  # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
        # see if rolegroup changed
        if graph_matches_d[id]['rolegroup'] != defining_rel_d[id][current_effTime][ rf2_fields_d['relationshipGroup'] ]:  # rolegroup change?
          print('%s,%s,%s,%s' % (id, graph_matches_d[id]['rolegroup'], graph_matches_d[id]['sctid'], graph_matches_d[id]['destinationId']), file=f_edge_rem)
          rolegroup_changed = True  # treat this as an edge create case
      hist_str = compute_history_string(id, defining_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id, [('history',hist_str)], current_effTime, defining_rel_d,
                                          csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      if rolegroup_changed and current_typeId not in f_DRs:
        # guard: the per-type CSV may not exist yet in the edge re-create case (avoids a KeyError below)
        f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId], 'w')
        print(csv_header, file=f_DRs[current_typeId])
        print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      for f in ([f_chg] if rolegroup_changed == False and id in graph_matches_d else [f_new, f_DRs[current_typeId]]):
        print(output_line, file=f)
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list + [f_edge_rem] + [f_DRs[typeId] for typeId in f_DRs.keys()] + [f_used_roles]:
      f.close()  # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count'])  # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_defining_rels

  # validate_graphdb:
  # Output: result displayed to STDOUT, exceptions
  opt = optparse.OptionParser()
  opt.add_option('--verbose', action='store_true', dest='verbose')
  opt.add_option('--rf2', action='store', dest='rf2')
  opt.add_option('--element', action='store', choices=['concept','description','isa_rel','defining_rel'])
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta','snapshot','full'])
  opt.add_option('--exceptions_file', action='store', dest='exceptions_file')
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opts, args = opt.parse_args(arglist)
  if not (len(args) == 0 and opts.rf2 and opts.element and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: validate_graphdb --element concept/description/isa_rel/defining_rel --rf2 <dir> --release_type delta/snapshot [--verbose] --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:  # was "opts.neopw and (opts.neopw or opts.neopw64)", which rejected any use of --neopw alone
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64:  # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0] == 3 else base64.decodestring(opts.neopw64)  # py2
  # Connect to NEO4J
  #neopw = base64.decodestring( json.loads(open('necares_config.json').read())['salt'] )
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)
  # Connect to RF2 files
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Information for comparing RF2 to Graph
  attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
  # POSSIBILITY - open exception file (append if it exists, write header if it did not exist)
  fn = opts.exceptions_file
  if fn:  # guard: --exceptions_file is optional
    exceptions_file = open(fn, 'a')
    if exceptions_file.tell() == 0:
      print('element,id,description', file=exceptions_file)  # header
  # determine the fields names, NOTE: history is assumed as added last field
  if opts.element == 'concept': validate_concepts()
  elif opts.element == 'description': validate_descriptions()
  elif opts.element == 'isa_rel': validate_isa_rels()
  elif opts.element == 'defining_rel': validate_defining_rels()
  else: print('unknown element [%s]' % opts.element); sys.exit(1)
  return
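
# Example invocation (hypothetical paths and password; by the convention above the
# process exit status is the error count, zero ==> success):
#   python snomed_g_validate_graphdb_tools.py validate_graphdb --element concept \
#          --release_type full --rf2 /data/SnomedCT_RF2Release_INT_20160131 --neopw secret
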
def db_validate(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2', action='store')
  opt.add_option('--release_type', action='store', choices=['delta','snapshot','full'])
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--exceptions', action='store')
  opt.add_option('--logfile', action='store', default='-')
  opt.add_option('--output_dir', action='store', default='.')  # referenced below, previously undefined
  opt.add_option('--mode', action='store', default='validate')  # referenced below, previously undefined
  opts, args = opt.parse_args(arglist)
  if not (len(args) == 0 and opts.rf2 and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: db_validate --rf2 <dir> --release_type full --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64:  # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0] == 3 else base64.decodestring(opts.neopw64)  # py2
  # open logfile
  logfile = open(opts.logfile, 'w') if opts.logfile != '-' else sys.stdout
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, etal
  #---------------------------------------------------------------------------
  pathsep = '/'
  # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
  # ... ask directly if these variables don't exist
  snomed_g_bin = os.environ.get('SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
    if snomed_g_home:
      snomed_g_bin = snomed_g_home.rstrip(pathsep) + pathsep + 'bin'
    else:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
  validated = False
  while not validated:
    if len(snomed_g_bin) == 0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else:  # try to validate, look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated:
        print('Cant find [%s]' % target_file); snomed_g_bin = ''
  snomed_g_bin = os.path.abspath(snomed_g_bin)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # connect to NEO4J, make sure information given is good
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)  # was base64.decodestring(opts.neopw64); opts.neopw is already decoded above
  # Connect to RF2 files, make sure rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Build
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep) + pathsep + 'validate_status.db'))
  # create YYYYMMDD string
  d = datetime.datetime.now()  # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
  job_start_datetime = datetime.datetime.now()
  # Commands needed to Create/Update a SNOMED_G Graph Database
  commands_d = {
    'JOB_START': {'stepname': 'JOB_START',
                  'log': 'JOB-START(release_type:[%s], rf2:[%s], date:[%s])'
                         % (opts.release_type, opts.rf2, yyyymmdd)},
    'VALIDATE_CONCEPTS': {'stepname': 'VALIDATE_CONCEPTS',
                          'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element concept --release_type %s --rf2 %s --neopw %s'
                                 % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                          'mode': ['validate']},
    'VALIDATE_DESCRIPTIONS': {'stepname': 'VALIDATE_DESCRIPTIONS',
                              'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element description --release_type %s --rf2 %s --neopw %s'
                                     % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                              'mode': ['validate']},
    'VALIDATE_ISA_RELS': {'stepname': 'VALIDATE_ISA_RELS',
                          'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element isa_rel --release_type %s --rf2 %s --neopw %s'
                                 % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                          'mode': ['validate']},
    'VALIDATE_DEFINING_RELS': {'stepname': 'VALIDATE_DEFINING_RELS',
                               'cmd': 'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element defining_rel --release_type %s --rf2 %s --neopw %s'
                                      % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                               'mode': ['validate']},
    'JOB_END': {'stepname': 'JOB_END', 'log': 'JOB-END'}
  }
  command_list_validate_build = \
      [ commands_d[x] for x in ['JOB_START', 'VALIDATE_CONCEPTS', 'VALIDATE_DESCRIPTIONS',
                                'VALIDATE_ISA_RELS', 'VALIDATE_DEFINING_RELS', 'JOB_END'] ]
  command_list = command_list_validate_build
  stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (BUILD)
  results_d = {}
  for command_d in command_list:
    # extract from tuple
    stepname, cmd, logmsg, expected_status, mode_requirement = \
        command_d['stepname'], command_d.get('cmd', None), command_d.get('log', None), \
        command_d.get('expected_status', 0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement:
      continue  # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname != 'JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    output, err = '', ''  # guard: previously unset when a subprocess command ran
    results_d[stepname]['result'] = 'SUCCESS'  # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    results_d[stepname]['error_count'] = 0  # default
    print(stepname)
    print(stepname, file=logfile)  # indicate to user what step we are on
    if logmsg:  # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg  # LOG everything after 'LOG:'
    else:  # execute command (cmd) in subprocess
      print(cmd, file=logfile)
      try:
        # SUBPROCESS creation
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir)  # move to output_dir, to start subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir != '.': os.chdir(saved_pwd)  # get back (popd)
        status = 0  # if no exception -- status guaranteed to be zero
      except subprocess.CalledProcessError as e:
        status = e.returncode  # by validate_graphdb convention, this code is the number of discrepancies found
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
        should_break = False  # keep validating; might be fine, should_break controls termination
      except:  # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname, cmd)
        should_break = True
      else:  # no exception
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          results_d[stepname]['error_count'] = status  # graphdb_validate convention is to return discrepancy count
          should_break = True  # no steps are optional
    # Book-keeping
    cmd_end = datetime.datetime.now()  # stop timer
    cmd_seconds = (cmd_end - cmd_start).seconds
    results_d[stepname]['elapsed_seconds'] = cmd_seconds
    if len(output) > 0: results_d[stepname]['STDOUT'] = output.replace('\n','<EOL>')
    if len(err) > 0: results_d[stepname]['STDERR'] = err.replace('\n','<EOL>')
    results_d[stepname]['cmd_start'] = cmd_start
    results_d[stepname]['cmd_end'] = cmd_end
    if should_break: break
  # Write results to the database
  save_and_report_results(DB, seqnum, stepnames, results_d)
  # Done
  sys.exit(0)
  return
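
# Example invocation (hypothetical paths and password; assumes db_validate is dispatched
# from the same tools script -- it runs the four VALIDATE_* steps above in sequence and
# records per-step status and timings in validate_status.db):
#   python snomed_g_validate_graphdb_tools.py db_validate --rf2 /data/SnomedCT_RF2Release_INT_20160131 \
#          --release_type full --neopw secret --logfile validate.log
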
def db_build(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2', action='store', dest='rf2')
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta', 'snapshot', 'full'])
  opt.add_option('--action', action='store', dest='action', default='create', choices=['create', 'update'])
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opt.add_option('--mode', action='store', dest='mode', default='build',
                 choices=['build', 'prep', 'make_csvs', 'run_cypher', 'validate'])  # build is end-to-end, others are subsets
  opt.add_option('--logfile', action='store', dest='logfile')
  opt.add_option('--output_dir', action='store', dest='output_dir', default='.')
  opt.add_option('--relationship_file', action='store', dest='relationship_file', default='Relationship')
  opt.add_option('--language_code', action='store', dest='language_code', default='en')
  opt.add_option('--language_name', action='store', dest='language_name', default='Language')
  opt.add_option('--prep_only', action='store_true', dest='prep_only')
  opts, args = opt.parse_args(arglist)
  if not (len(args) == 0 and opts.rf2 and opts.release_type and opts.neopw64):
    print('Usage: db_build --rf2 <dir> --release_type delta/snapshot --neopw64 <base64pw>')
    sys.exit(1)
  # file path separator
  pathsep = '/'
  # make sure output directory exists and is empty
  opts.output_dir = get_path(opts.output_dir, pathsep)
  if not (os.path.isdir(opts.output_dir) and len(os.listdir(opts.output_dir)) == 0):
    print('*** Output directory is not an empty directory [%s] ***' % opts.output_dir)
    sys.exit(1)
  # open logfile
  logfile = open(opts.output_dir + 'build.log', 'w') if not opts.logfile else \
            (sys.stdout if opts.logfile == '-' else open(opts.logfile, 'w'))  # was "sys.output", which does not exist
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, etal
  #---------------------------------------------------------------------------
  # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
  # ... ask directly if these variables don't exist
  snomed_g_bin = os.environ.get('SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
    if snomed_g_home:
      snomed_g_bin = get_path(snomed_g_home, pathsep) + 'bin'
    else:
      snomed_g_bin = get_path(os.path.dirname(os.path.abspath(__file__)), pathsep)  # default to python script dir
  validated = False
  while not validated:
    if len(snomed_g_bin) == 0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else:  # try to validate, look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated:
        print('Cant find [%s]' % target_file)
        snomed_g_bin = ''
  snomed_g_bin = get_path(snomed_g_bin, pathsep)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # db_build ==> connect to NEO4J, make sure information given is good
  if opts.mode == 'build':
    neo4j = snomed_g_lib_neo4j.Neo4j_Access(base64.decodestring(opts.neopw64))
  # Connect to RF2 files, make sure rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type, opts.relationship_file, opts.language_code)
  # Build
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep) + pathsep + 'build_status.db'))
  # create YYYYMMDD string
  d = datetime.datetime.now()  # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
  job_start_datetime = datetime.datetime.now()
  # Commands needed to Create/Update a SNOMED_G Graph Database
  command_list_db_build = [
    {'stepname': 'JOB_START',
     'log': 'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
            % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)},
    {'stepname': 'FIND_ROLENAMES',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'FIND_ROLEGROUPS',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs']},
    {'stepname': 'MAKE_CONCEPT_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DESCRIPTION_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_ISA_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file,
               opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DEFINING_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'TEMPLATE_PROCESSING',
     'cmd': 'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
            % (snomed_g_bin, snomed_g_bin, ('create' if opts.action == 'create' else 'update'), opts.rf2, opts.release_type),
     'mode': ['build', 'prep']},
    {'stepname': 'CYPHER_EXECUTION',
     'cmd': 'python %s/snomed_g_neo4j_tools.py run_cypher %s/build.cypher --verbose --neopw64 %s'
            % (snomed_g_bin, opts.output_dir, opts.neopw64),
     'mode': ['build', 'run_cypher']},
    {'stepname': 'CHECK_RESULT',
     'cmd': 'python %s/snomed_g_neo4j_tools.py run_cypher %s/snomed_g_graphdb_update_failure_check.cypher --verbose --neopw64 %s'
            % (snomed_g_bin, snomed_g_bin, opts.neopw64),
     'mode': ['build', 'run_cypher']},
    {'stepname': 'JOB_END', 'log': 'JOB-END'}
  ]
  command_list_db_build_prep = [
    {'stepname': 'JOB_START',
     'log': 'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
            % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)},
    {'stepname': 'FIND_ROLENAMES',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'FIND_ROLEGROUPS',
     'cmd': 'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs']},
    {'stepname': 'MAKE_CONCEPT_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DESCRIPTION_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_ISA_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'MAKE_DEFINING_REL_CSVS',
     'cmd': 'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
            % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action, opts.relationship_file, opts.language_code, opts.language_name),
     'mode': ['build', 'prep', 'make_csvs', 'validate']},
    {'stepname': 'TEMPLATE_PROCESSING',
     'cmd': 'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
            % (snomed_g_bin, snomed_g_bin, ('create' if opts.action == 'create' else 'update'), opts.rf2, opts.release_type),
     'mode': ['build', 'prep']},
    {'stepname': 'JOB_END', 'log': 'JOB-END'}
  ]
  # OLD --
  #  {'stepname':'CYPHER_EXECUTION', 'cmd':'%s/neo4j-shell -localhost -file build.cypher' % neo4j_bin, 'mode':['build','run_cypher']},
  command_list = command_list_db_build if not opts.prep_only else command_list_db_build_prep
  stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (BUILD)
  results_d = {}
  for command_d in command_list:
    # extract from tuple
    stepname, cmd, logmsg, expected_status, mode_requirement = \
        command_d['stepname'], command_d.get('cmd', None), command_d.get('log', None), \
        command_d.get('expected_status', 0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement:
      continue  # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname != 'JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    output, err = '', ''  # guard: previously unset when a subprocess command ran
    results_d[stepname]['result'] = 'SUCCESS'  # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    print(stepname)
    print(stepname, file=logfile)  # indicate to user what step we are on
    if logmsg:  # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg  # LOG everything after 'LOG:'
    else:  # execute command (cmd) in subprocess
      print(cmd, file=logfile)
      try:
        #p = subprocess.Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        #output, err = p.communicate(b"")
        #status = p.returncode
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir)  # move to output_dir, to start subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir != '.': os.chdir(saved_pwd)  # get back (popd)
        status = 0  # if no exception -- status is zero
      except subprocess.CalledProcessError as e:  # was py2-only "except ..., e" syntax
        status = e.returncode
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = True
        # might be fine, should_break controls termination
      except:  # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname, cmd)
        should_break = True