def instantiate(arglist):
    # instantiate:
    # PARSE COMMAND
    # syntax: instantiate <template-file> <outfile>
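    # Example invocation (hypothetical paths):
    #   instantiate(['snomed_g_graphdb_cypher_create.template', 'build.cypher',
    #                '--rf2', '/data/SnomedCT_RF2Release', '--release_type', 'full'])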
    opt = optparse.OptionParser()
    opt.add_option('--rf2', action='store', dest='rf2')
    opt.add_option('--release_type',
                   action='store',
                   dest='release_type',
                   choices=['delta', 'snapshot', 'full'])
    opt.add_option('--verbose', action='store_true', dest='verbose')
    opt.add_option('--action',
                   action='store',
                   dest='action',
                   default='create',
                   choices=['create', 'update'])
    opts, args = opt.parse_args(arglist)
    if not (len(args) == 2 and opts.rf2 and opts.release_type):
        print(
            'Usage: instantiate <template-file> <output-file> --rf2 <dir> --release_type {full,snapshot,delta}'
        )
        sys.exit(1)
    template_file, output_file = args
    # Connect to RF2 files
    rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
    # Information for creating the CSV files
    attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
    # CONFIGURATION
    config = {}
    config['terminology_dir'] = rf2_folders.get_terminology_dir()
    config['release_date'] = rf2_folders.get_release_date()
    config['release_center'] = rf2_folders.get_release_center()
    config['output_dir'] = './'

    # Process template file
    fout = open(output_file, 'w')
    release_date = config['release_date'].strip()
    release_center = config.get('release_center', 'INT')
    os_pathsep = config.get('os_pathsep',
                            '/')  # JGP 2015/10/07, no default previously
    output_dir = get_path(config['output_dir'], os_pathsep)
    if sys.platform == 'win32':
        output_dir = output_dir.replace(
            '\\', '/'
        )  # JGP 2016/07/30 -- issue "c:\sno\build\us20160301/defining_rel_edge_rem.csv"
    terminology_dir = get_path(config['terminology_dir'], os_pathsep)
    config_file_suffix = '%s_%s' % (release_center, release_date)
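    # e.g. config_file_suffix -> 'INT_20160131' (illustrative release center and date)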
    file_protocol = 'file:///' if sys.platform in [
        'cygwin', 'win32', 'darwin'
    ] else 'file:'  # ubuntu is else case
    # NOTE: can result in 'file:////Users/<rest>' on Mac, replace by 'file:///Users/<rest>'
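    # Illustrative results (assumed directory layouts): 'file:///C:/sno/build/...' on
    # win32/darwin/cygwin, 'file:/home/user/build/...' on ubuntu; the replace in the
    # loop below collapses any 'file:////...' into 'file:///...'.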
    # INSTANTIATION PT1 -- PROCESS FILES IN TEMPLATE, REPLACING TEMPLATES WITH INSTANTIATED VALUES
    for line in [x.rstrip('\n').rstrip('\r') for x in open(template_file)]:
        line = line.replace('<<<release_date>>>', release_date) \
                   .replace('<<<output_dir>>>', output_dir) \
                   .replace('<<<terminology_dir>>>', terminology_dir) \
                   .replace('<<<config_file_suffix>>>', config_file_suffix) \
                   .replace('<<<file_protocol>>>', file_protocol) \
                   .replace('file:////','file:///')
        print(line, file=fout)

    # INSTANTIATION PT2 -- DEFINING RELATIONSHIPS PROCESSING

    #                    Handle NEW defining relationships
    # Data source (for new defining relationships):
    #     <output_dir>/defining_rels_new_sorted.csv file
    #     id,active,sctid,rolegroup,typeId,rolename,destinationId,effectiveTime,
    #         moduleId,characteristicTypeId,modifierId,history
    #     4661958023,1,471280008,1,FINDING_SITE,589001,20140731,
    #         900000000000207008,900000000000011006,900000000000451002,
    # Algorithm:
    # NOTE: already sorted by rolename, so all FINDING_SITE elements together, etc
    #       ./snomed_sort_csv.py --fields 'rolename' --string
    #          defining_rels_new.csv defining_rels_new_sorted.csv
    # ==> create separate files for each defining-relationship type that
    #     is found, eg: DR_<snomedct-code>_new.csv
    # ==> add CYPHER code to process the created files and add the
    #     defining relationships.
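    # For example (hypothetical rolename FINDING_SITE with typeId 363698007), the loop
    # below emits two-step Cypher roughly like:
    #   USING PERIODIC COMMIT 200
    #   LOAD CSV with headers from "file:///<output_dir>DR_363698007_new.csv" as line
    #   MERGE (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id });
    # ...followed by a MATCH of the RoleGroup and a
    #   CREATE UNIQUE (rg)-[:FINDING_SITE { ... }]->(c);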

    with open(output_dir + 'used_roles.csv') as f:
        for idx, line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
            if idx == 0: continue  # typeId,rolename
            typeId, rolename = line.split(',')
            # create CYPHER to load the file and add the relationships to ROLE_GROUP nodes
            # JGP 2017-10-31.  Use a 2-step procedure for creating the defining relationships,
            #   to support systems with smaller amounts of memory (use smaller transactions).
            #   The first step creates any necessary role groups, and the second step creates
            #   the defining relationship edges from role groups to the specified target concepts.
            print('// %s defining relationships' % rolename, file=fout)
            print('''RETURN 'NEW Defining relationships of type %s';''' %
                  rolename,
                  file=fout)
            print(file=fout)
            print('USING PERIODIC COMMIT 200', file=fout)
            load_csv_line = (
                'LOAD CSV with headers from "%s%sDR_%s_new.csv" as line' %
                (('file:///' if sys.platform in ['cygwin', 'win32', 'darwin']
                  else 'file:'), output_dir, typeId)).replace(
                      'file:////', 'file:///')
            print(load_csv_line, file=fout)
            print(
                'with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id',
                file=fout)
            # EMonson changed : typo in original
            print(
                'MERGE (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id });',
                file=fout)
            print(file=fout)
            print(
                '// Add defining relationship edge in 2nd step, Java memory issue',
                file=fout)
            print('USING PERIODIC COMMIT 200', file=fout)
            load_csv_line = (
                'LOAD CSV with headers from "%s%sDR_%s_new.csv" as line' %
                (('file:///' if sys.platform in ['cygwin', 'win32', 'darwin']
                  else 'file:'), output_dir, typeId)).replace(
                      'file:////', 'file:///')
            print(load_csv_line, file=fout)
            print(
                'with line, line.sctid as source_id, line.destinationId as dest_id, line.rolegroup as rolegroup_id',
                file=fout)
            print(
                'MATCH (rg:RoleGroup { sctid: source_id, rolegroup: rolegroup_id })',
                file=fout)
            print('WITH line,rg,source_id,dest_id,rolegroup_id', file=fout)
            print('MATCH (c:ObjectConcept { sctid: dest_id })', file=fout)
            print(
                'CREATE UNIQUE (rg)-[:%s { id: line.id, active: line.active, sctid: source_id,'
                % rolename,
                file=fout)
            print('                                    typeId: line.typeId,',
                  file=fout)
            print(
                '                                    rolegroup: rolegroup_id, effectiveTime: line.effectiveTime,',
                file=fout)
            print(
                '                                    moduleId: line.moduleId, characteristicTypeId: line.characteristicTypeId,',
                file=fout)
            print(
                '                                    modifierId: line.modifierId,',
                file=fout)
            print(
                '                                    history: line.history }]->(c);',
                file=fout)
    # close CSV, wrap up
    print('// Finito', file=fout)
    fout.close()
    return
def validate_graphdb(arglist):

  def rf2_filename(element, view=None): # rf2_folders is set in validate_graphdb initialization
    return rf2_folders.rf2_file_path(element, view) # eg: 'concept'

  def old_compute_hist_changes(new_field_values, prev_field_values, field_names): # find map with only modified fields
    return { field_names[idx] : new_field_values[idx] for idx in range(len(field_names)) if db_data_prep(new_field_values[idx]) != db_data_prep(prev_field_values[idx]) }

  '''
  HISTORY COMPUTATION -- Example information for a concept:
  
  Information state example (need to understand for history computation)
    csv_fields  = ['id','effectiveTime','active','moduleId','definitionStatusId','FSN','history']
    field_names = ['id','effectiveTime','active','moduleId','definitionStatusId']
    renamed_fields = {}
    id -- '293672009'
    concepts_d[id]['20160301'] -- concepts_d[id] is a map keyed by effectiveTime,
                                  its value ==> list of attribute values for that time,
                                  in same order as in RF2 file
    graph_matches_d[id] (graph) --
              {u'nodetype': u'concept', u'effectiveTime': u'20060131', u'FSN': u'Antiemetic allergy (disorder)',
               u'definitionStatusId': u'900000000000073002', u'sctid': u'293672009', u'active': u'1',
               u'moduleId': u'900000000000207008', u'id': u'293672009',
               u'history': u'[{"active": "1", "effectiveTime": "20020131", ...}, ...]'}
  '''

  def compute_history_string(id, rf2_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields):
    if opts.release_type == 'full': # compute history, have all information
      historical_effectiveTimes = sorted(rf2_d[id].keys())[:-1] # exclude 'current' (latest)
      hist = [ { nm: rf2_d[id][effTime][rf2_fields_d[renamed_fields.get(nm,nm)]] for nm in field_names } for effTime in historical_effectiveTimes ] \
             if len(rf2_d[id].keys()) > 1 else []
    else: # not FULL, can be missing historical info
      if id not in graph_matches_d:
        hist = []
      else:
        old_history =  graph_matches_d[id]['history'] # JSON string or empty string
        old_field_values = [ graph_matches_d[id][nm] for nm in field_names ]
        if len(old_history) == 0: # no prev history, old values become the previous history
          hist = [ { a:b for a,b in zip(field_names, old_field_values) } ]
        else: # existing history, not FULL release, append previous values from graph (previous history)
          hist = json.loads(old_history) + [ ( { a:b for a,b in zip(field_names, old_field_values) } ) ]
    return json.dumps(hist) if len(hist) > 0 else ''
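  # Illustrative compute_history_string result (shape follows the example above):
  #   '[{"id": "293672009", "effectiveTime": "20020131", "active": "1", ...}]'
  # -- one map per superseded effectiveTime, or '' when there is no history.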

  def build_csv_output_line(id, non_rf2_fields, current_effTime, rf2_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_fields):
    csv_data = [None]*len(csv_fields_d.keys())
    for nm in field_names: csv_data[csv_fields_d[nm]] = db_data_prep(rf2_d[id][current_effTime][rf2_fields_d[renamed_fields.get(nm,nm)]])
    for k,v in non_rf2_fields: csv_data[csv_fields_d[k]] = db_data_prep(v) # eg: [('history','<hist-json-str>'),...]
    if None in csv_data: raise ValueError('csv_data %s' % str(csv_data))
    for nm in quoted_fields: csv_data[csv_fields_d[nm]] = csv_clean_str(csv_data[csv_fields_d[nm]]) # quote only necessary fields
    return db_data_prep( ','.join(csv_data) ) # output_line

  #------------------------------------------------------------------------------|
  #        CONCEPT CSV files creation -- concept_new.csv, concept_chg.csv        |
  #------------------------------------------------------------------------------|

  def validate_concepts():

    def concept_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in concepts_d: concepts_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** Concept id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in concepts_d[id]: raise ValueError('*** Concept id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      concepts_d[id][effTime] = fields[:] # attributes in RF2-defined order

    def Fsn_cb(fields, fields_d, hist):
      all_Fsn_in_Rf2_d[ db_data_prep(fields[ fields_d['conceptId'] ]) ] = db_data_prep(fields[ fields_d['term'] ]) # FSN

    def Fsn_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_FSN

    # validate_concepts:
    # ==> generate concept_new.csv, concept_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # counters used below
    timing_d = { }
    timing_idx = 0
    timing_overall_nm = '{:04d}_validate_concepts'.format(timing_idx); timing_start(timing_d, timing_overall_nm)
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_description'.format(timing_idx); timing_start(timing_d, timing_nm)
    all_Fsn_in_Rf2_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(Fsn_cb, Fsn_filter, False)
    timing_end(timing_d, timing_nm)
    f_new, f_chg = io.open('concept_new.csv','w',encoding='utf8'),io.open('concept_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['concept']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['concept'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['FSN','history'] ] # exclude non-RF2 history and FSN (external)
    renamed_fields = attributes_by_file.renamed_fields['concept'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['concept']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    # create concepts_d with information from DELTA/SNAPSHOT/FULL concept file
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_concept'.format(timing_idx); timing_start(timing_d, timing_nm)
    concepts_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('concept') ).process_file(concept_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = concepts_d.keys()
    Fsn_d = { k: all_Fsn_in_Rf2_d[k] for k in list(set(all_Fsn_in_Rf2_d.keys()).intersection(set(rf2_idlist))) } # sets compare ascii+unicode
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # Look for existing FSN values in graph
    print('count of FSNs in RF2: %d' % len(Fsn_d.keys()))
    if opts.action=='create':
      graph_matches_d = {}
    else:
      # NEO4J -- look for these concepts (N at a time)
      timing_idx += 1; timing_nm = '{:04d}_neo4j_lookup_concepts'.format(timing_idx); timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_concepts_for_ids(rf2_idlist) # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_concepts()
      timing_end(timing_d, timing_nm)
      print('Found %d of the IDs+FSNs in the graph DB:' % len(graph_matches_d.keys()))
      # Set any missing FSN values from the Graph
      target_id_set = set(graph_matches_d.keys()) - set(Fsn_d.keys())
      print('Filling in %d FSN values from the graph' % len(target_id_set))
      for id in list(target_id_set): Fsn_d[id] = graph_matches_d[id]['FSN']
      print('count of FSNs after merge with RF2 FSNs: %d' % len(Fsn_d.keys()))
    # Make sure all ids have an FSN
    if sorted(Fsn_d.keys()) != sorted(rf2_idlist): raise ValueError("*** (sanity check failure) Can't find FSN for all IDs in release ***")
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '{:04d}_generate_csvs'.format(timing_idx); timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(concepts_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif concepts_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> SKIP
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, concepts_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('FSN',Fsn_d[id]),('history',hist_str)],current_effTime, concepts_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_concepts

  #------------------------------------------------------------------------------|
  #        DESCRIPTION CSV files  -- descrip_new.csv, descrip_chg.csv            |
  #------------------------------------------------------------------------------|
  def validate_descriptions():

    def description_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in description_d: description_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** Description id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in description_d[id]: raise ValueError('*** Description id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      description_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ] # DONT USE "id", use the id associated with the Description
      if id in language_d and language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA: return # PREFER US definition
      language_d[id] = { nm : fields[ fields_d[nm] ] for nm in fields_d.keys() }
    def snapshot_language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]
      if id in snapshot_language_d and snapshot_language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA: return # prefer US def
      snapshot_language_d[id] = { nm : fields[ fields_d[nm] ] for nm in fields_d.keys() }
    def compute_descriptionType(typeId,acceptabilityId):
      return 'FSN' if typeId=='900000000000003001' \
             else 'Preferred' if typeId=='900000000000013009' and acceptabilityId=='900000000000548007' \
             else 'Synonym'
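    # For instance: typeId '900000000000003001' -> 'FSN';
    # typeId '900000000000013009' + acceptabilityId '900000000000548007' -> 'Preferred'; otherwise 'Synonym'.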

    # validate_descriptions:
    # ==> generate descrip_new.csv, descrip_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0, 'no_language': 0 } # counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_validate_descriptions' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 DESCRIPTION FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_description' % timing_idx; timing_start(timing_d, timing_nm)
    description_d, language_d, snapshot_language_d = {}, {}, {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(description_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = description_d.keys()
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # READ RF2 LANGUAGE FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_language' % timing_idx; timing_start(timing_d, timing_nm)
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language') ).process_file(language_cb, None, False)
    timing_end(timing_d, timing_nm)
    if opts.release_type=='delta': # need snapshot file for fallback of potential missing historical information
      print('read snapshot language values');
      timing_idx += 1; timing_nm = '%04d_read_rf2_language_snapshot' % timing_idx; timing_start(timing_d, timing_nm)
      snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language','Snapshot') ).process_file(snapshot_language_cb, None, False); print('read')
      timing_end(timing_d, timing_nm)
    # CSV INIT, ATTRIBUTE NAMES MANAGEMENT
    f_new, f_chg = io.open('descrip_new.csv','w',encoding='utf8'),io.open('descrip_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['description']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['description'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['id128bit','acceptabilityId','refsetId','descriptionType','history'] ]
    renamed_fields = attributes_by_file.renamed_fields['description'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['description']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action=='create':
      graph_matches_d = {}
    else: # 'update' (compare vs Graph)
      # READ NEO4J DESCRIPTIONS
      timing_idx += 1; timing_nm = '%04d_neo4j_lookup_DESCRIPTIONS' % timing_idx; timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_descriptions_for_ids(rf2_idlist) # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_descriptions()
      timing_end(timing_d, timing_nm)
      print('count of Descriptions in NEO4J: %d' % len(graph_matches_d.keys()))
      print('count of Language Descriptions in RF2: %d' % len(list(set(language_d.keys()).intersection(set(rf2_idlist)))))
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '%04d_generate_csvs' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(description_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif description_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, description_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      # Need to add the following to the description_d definition ==>
      #  'id128bit','acceptabilityId','descriptionType' (compute from acceptabilityId),'refsetId'
      computed = {}
      current_typeId = description_d[id][current_effTime][rf2_fields_d['typeId']]
      if id in language_d:
        computed['id128bit']        = language_d[id]['id']
        computed['acceptabilityId'] = language_d[id]['acceptabilityId']
        computed['refsetId']        = language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId,language_d[id]['acceptabilityId'])
      elif id in snapshot_language_d: # empty unless view=='delta', things not necessarily in Graph (any missing releases in graph)
        computed['id128bit']        = snapshot_language_d[id]['id']
        computed['acceptabilityId'] = snapshot_language_d[id]['acceptabilityId']
        computed['refsetId']        = snapshot_language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId,snapshot_language_d[id]['acceptabilityId'])
      elif id in graph_matches_d:
        computed['id128bit']        = graph_matches_d[id]['id128bit']
        computed['acceptabilityId'] = graph_matches_d[id]['acceptabilityId']
        computed['refsetId']        = graph_matches_d[id]['refsetId']
        computed['descriptionType'] = graph_matches_d[id]['descriptionType']
      else:
        stats['no_language'] += 1
        computed['id128bit']        = '<NA>'
        computed['acceptabilityId'] = '<NA>'
        computed['refsetId']        = '<NA>'
        computed['descriptionType'] = '<NA>'
        if stats['no_language']<=1000: print('*** Missing LANGUAGE records for Description %s ***' % id)
        elif stats['no_language']==1001: print('*** Missing more than 1000 LANGUAGE records ***')
      non_rf2_fields = [(x,computed[x]) for x in ['id128bit','acceptabilityId','refsetId','descriptionType']]+[('history',hist_str)]
      output_line = build_csv_output_line(id, non_rf2_fields, current_effTime, description_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    if stats['no_language'] > 0: print('Missing %d LANGUAGE records' % stats['no_language'])
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    # DONE
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_descriptions

  #------------------------------------------------------------------------------|
  #            ISA_REL CSV files  -- isa_rel_new.csv, isa_rel_chg.csv            |
  #------------------------------------------------------------------------------|
  def validate_isa_rels():

    def isa_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in isa_rel_d: isa_rel_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** ISA id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in isa_rel_d[id]: raise ValueError('*** ISA id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      isa_rel_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def isa_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_isa_rels:
    # ==> generate isa_rel_new.csv, isa_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_isa_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 RELATIONSHIP FILE - EXTRACT ISA
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    isa_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(isa_rel_cb, isa_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = isa_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('isa_rel_new.csv','w',encoding='utf8'),io.open('isa_rel_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['isa_rel']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['isa_rel'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['isa_rel'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['isa_rel']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action=='create':
      graph_matches_d = {}
    else:
      # EXTRACT ISA RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_ISA' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_isa_rels() # looking for ISA by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL ISA in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) } # successful compare ascii+unicode, way faster than "if" test
      print('count of ISA in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist: # must compute updated history for each
      current_effTime = sorted(isa_rel_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif isa_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, isa_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('history',hist_str)],current_effTime, isa_rel_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_isa_rels

  #------------------------------------------------------------------------------|
  #    DEFINING_REL CSV files  -- defining_rel_new.csv, defining_rel_chg.csv     |
  #------------------------------------------------------------------------------|
  def validate_defining_rels():

    def defining_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in defining_rel_d: defining_rel_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** DEFINING-REL id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in defining_rel_d[id]: raise ValueError('*** DEFINING-REL id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      defining_rel_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def defining_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] != snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_defining_rels:
    # ==> generate defining_rel_new.csv, defining_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_defining_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ all_roles.csv (tiny file)
    timing_idx += 1; timing_nm = '%04d_read_all_roles' % timing_idx; timing_start(timing_d, timing_nm)
    roleHash = {}
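    # e.g. an all_roles.csv data row (illustrative): '363698007,FINDING_SITE'
    #      yielding roleHash['363698007'] == 'FINDING_SITE'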
    with open('all_roles.csv') as f:
      for idx,line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
        if idx==0: continue # typeId,rolename
        typeId, rolename = line.split(',')
        roleHash[typeId] = rolename
    timing_end(timing_d, timing_nm)
    # READ RF2 RELATIONSHIP FILE - EXTRACT DEFINING-RELS
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    defining_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(defining_rel_cb, defining_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = defining_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('defining_rel_new.csv','w',encoding='utf8'),io.open('defining_rel_chg.csv','w',encoding='utf8')
    f_edge_rem = io.open('defining_rel_edge_rem.csv','w',encoding='utf8')
    print(db_data_prep('id,rolegroup,sourceId,destinationId'),file=f_edge_rem)
    outfile_list = [f_new,f_chg]
    f_DRs = {} # per-defining-relationship type
    rf2_fields = attributes_by_file.rf2_fields['defining_rel']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['defining_rel'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['defining_rel'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['defining_rel']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # EXTRACT DEFINING RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_DEFINING_RELS' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_defining_rels() # looking for rel by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL DEFINING-REL in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) } # successful compare ascii+unicode, way faster than "if" test
      print('count of DEFINING-REL in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    f_used_roles = open('used_roles.csv','w'); print('typeId,rolename',file=f_used_roles)
    for id in rf2_idlist: # must compute updated history for each
      current_effTime = sorted(defining_rel_d[id].keys())[-1] # highest effectiveTime is current
      current_typeId = defining_rel_d[id][current_effTime][rf2_fields_d['typeId']]
      rolegroup_changed = False # if this occurred, treat as create instead of change (as it requires edge remove+edge create)
      if id not in graph_matches_d:
        stats['new'] += 1
        if current_typeId not in f_DRs:
          f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId],'w'); print(csv_header, file=f_DRs[current_typeId])
          print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      elif defining_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
        # see if rolegroup changed
        if graph_matches_d[id]['rolegroup'] != defining_rel_d[id][current_effTime][ rf2_fields_d['relationshipGroup'] ]: # rolegroup change?
          print('%s,%s,%s,%s' % (id,graph_matches_d[id]['rolegroup'],graph_matches_d[id]['sctid'],graph_matches_d[id]['destinationId']),file=f_edge_rem)
          rolegroup_changed = True # treat this as an edge create case
      hist_str = compute_history_string(id, defining_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('history',hist_str)],current_effTime, defining_rel_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      if rolegroup_changed and current_typeId not in f_DRs: # rolegroup change also needs the per-type file (avoids a KeyError below)
        f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId],'w'); print(csv_header, file=f_DRs[current_typeId])
        print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      for f in ([f_chg] if not rolegroup_changed and id in graph_matches_d else [f_new, f_DRs[current_typeId]]): print(output_line,file=f)
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list+[f_edge_rem]+[f_DRs[typeId] for typeId in f_DRs.keys()]+[f_used_roles]: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_defining_rels

  # validate_graphdb:
  # Output: results displayed to STDOUT; exceptions appended to the exceptions file (if given)
  opt = optparse.OptionParser()
  opt.add_option('--verbose',action='store_true',dest='verbose')
  opt.add_option('--rf2',action='store',dest='rf2')
  opt.add_option('--element',action='store', choices=['concept','description','isa_rel','defining_rel'])
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta','snapshot','full'])
  opt.add_option('--exceptions_file', action='store', dest='exceptions_file')
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--action', action='store', dest='action', default='update', choices=['create','update']) # referenced below; 'update' default is assumed
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.element and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: validate_graphdb --element concept/description/isa_rel/defining_rel --rf2 <dir> --release_type delta/snapshot/full [--verbose] --neopw <password> (or --neopw64 <base64pw>)')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64: # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 else base64.decodestring(opts.neopw64) # py2
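  # e.g. --neopw64 c2VjcmV0 is equivalent to --neopw secret (base64.b64encode(b'secret') == b'c2VjcmV0')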
  # Connect to NEO4J
  #neopw = base64.decodestring( json.loads(open('necares_config.json').read())['salt'] )
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)
  # Connect to RF2 files
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Information for comparing RF2 to Graph
  attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
  # Optionally open the exceptions file (append if it exists, write header if it did not exist)
  exceptions_file = open(opts.exceptions_file, 'a') if opts.exceptions_file else None
  if exceptions_file and exceptions_file.tell()==0: print('element,id,description',file=exceptions_file) # header
  # determine the field names, NOTE: history is assumed to be the last added field
  if   opts.element=='concept':      validate_concepts()
  elif opts.element=='description':  validate_descriptions()
  elif opts.element=='isa_rel':      validate_isa_rels()
  elif opts.element=='defining_rel': validate_defining_rels()
  else:
    print('unknown element [%s]' % opts.element); sys.exit(1)
  return
def db_validate(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2',action='store')
  opt.add_option('--release_type', action='store', choices=['delta','snapshot','full'])
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--exceptions', action='store')
  opt.add_option('--logfile', action='store', default='-')
  opt.add_option('--mode', action='store', default='validate')    # referenced by the command loop below
  opt.add_option('--output_dir', action='store', default='.')     # referenced by the command loop below
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: db_validate --rf2 <dir> --release_type delta/snapshot/full --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64: # snomed_g v1.2, convert neopw64 to neopw
      opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 else base64.decodestring(opts.neopw64) # py2
  # open logfile
  logfile = open(opts.logfile, 'w') if opts.logfile != '-' else sys.stdout
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
  #---------------------------------------------------------------------------
  pathsep = '/'
  # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
  # ... ask directly if these variables don't exist
  snomed_g_bin = os.environ.get('SNOMED_G_BIN',None) # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME',None)
    if snomed_g_home:
      snomed_g_bin = snomed_g_home.rstrip(pathsep) + pathsep + 'bin'
    else:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
  validated = False
  while not validated:
    if len(snomed_g_bin)==0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else: # try to validate, look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin+pathsep+'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated: print("Can't find [%s]" % target_file); snomed_g_bin = ''
  snomed_g_bin = os.path.abspath(snomed_g_bin)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # connect to NEO4J, make sure information given is good
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw) # opts.neopw was derived from --neopw64 above when needed
  # Connect to RF2 files, make sure rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Build
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep)+pathsep+'validate_status.db'))

  # create YYYYMMDD string
  d = datetime.datetime.now() # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year,d.month,d.day)
  job_start_datetime = datetime.datetime.now()

  # Commands needed to Create/Update a SNOMED_G Graph Database
  commands_d = {
      'JOB_START':
          {'stepname': 'JOB_START',
           'log':      'JOB-START(release_type:[%s], rf2:[%s], date:[%s])' \
                           % (opts.release_type, opts.rf2, yyyymmdd)},
      'VALIDATE_CONCEPTS':
          {'stepname': 'VALIDATE_CONCEPTS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element concept      --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
           'mode':     ['validate']},
      'VALIDATE_DESCRIPTIONS':
          {'stepname': 'VALIDATE_DESCRIPTIONS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element description  --release_type %s --rf2 %s --neopw %s' \
                       % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'VALIDATE_ISA_RELS':
          {'stepname': 'VALIDATE_ISA_RELS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element isa_rel      --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'VALIDATE_DEFINING_RELS':
          {'stepname': 'VALIDATE_DEFINING_RELS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element defining_rel --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'JOB_END':
          {'stepname': 'JOB_END',
           'log':      'JOB-END'}
  }

  command_list_validate_build = \
      [ commands_d[x] for x in
        ['JOB_START',
         'VALIDATE_CONCEPTS',
         'VALIDATE_DESCRIPTIONS',
         'VALIDATE_ISA_RELS',
         'VALIDATE_DEFINING_RELS',
         'JOB_END'] ]
  command_list = command_list_validate_build
  stepnames = [x['stepname'] for x in command_list] # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (BUILD)
  results_d = {}
  for command_d in command_list:
    # extract fields from the command dictionary
    stepname, cmd, logmsg, expected_status, mode_requirement = \
      command_d['stepname'], command_d.get('cmd',None), command_d.get('log',None), command_d.get('expected_status',0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement: continue # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname!='JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    results_d[stepname]['result'] = 'SUCCESS' # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    results_d[stepname]['error_count'] = 0 # default
    print(stepname)
    print(stepname, file=logfile) # indicate to user what step we are on
    output, err = '', ''  # defaults; the subprocess path streams its output to the logfile
    if logmsg: # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg # LOG everything after 'LOG:'
    else: # execute command (cmd) in subprocess
      print(cmd, file=logfile)
      try:
        # SUBPROCESS creation
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir) # move to output_dir, to start subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir !='.': os.chdir(saved_pwd) # get back (popd)
        status = 0 # if no exception -- status guaranteed to be zero
      except subprocess.CalledProcessError as e:
        status = e.returncode # by validate_graphdb convention, this code is the number of discrepancies found
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = False # keep validating
        pass # might be fine, should_break controls termination
      except: # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname,cmd)
        should_break = True
        pass
      else: # no exception
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          results_d[stepname]['error_count'] = status # validate_graphdb convention is to return the discrepancy count
          should_break = True # no steps are optional
    # Book-keeping
    cmd_end = datetime.datetime.now() # stop timer
    cmd_seconds = (cmd_end-cmd_start).seconds
    results_d[stepname]['elapsed_seconds'] = cmd_seconds
    if len(output) > 0: results_d[stepname]['STDOUT'] = output.replace('\n','<EOL>')
    if len(err) > 0: results_d[stepname]['STDERR'] = err.replace('\n','<EOL>')
    results_d[stepname]['cmd_start'] = cmd_start
    results_d[stepname]['cmd_end'] = cmd_end

    if should_break: break
  # Write results to the database
  save_and_report_results(DB, seqnum, stepnames, results_d)

  # Done
  sys.exit(0)
  return
def db_build(arglist):
    saved_pwd = os.getcwd()
    opt = optparse.OptionParser()
    opt.add_option('--rf2', action='store', dest='rf2')
    opt.add_option('--release_type',
                   action='store',
                   dest='release_type',
                   choices=['delta', 'snapshot', 'full'])
    opt.add_option('--action',
                   action='store',
                   dest='action',
                   default='create',
                   choices=['create', 'update'])
    opt.add_option('--neopw64', action='store', dest='neopw64')
    opt.add_option(
        '--mode',
        action='store',
        dest='mode',
        default='build',
        choices=['build', 'prep', 'make_csvs', 'run_cypher',
                 'validate'])  # build is end-to-end, others are subsets
    opt.add_option('--logfile', action='store', dest='logfile')
    opt.add_option('--output_dir',
                   action='store',
                   dest='output_dir',
                   default='.')
    opt.add_option('--relationship_file',
                   action='store',
                   dest='relationship_file',
                   default='Relationship')
    opt.add_option('--language_code',
                   action='store',
                   dest='language_code',
                   default='en')
    opt.add_option('--language_name',
                   action='store',
                   dest='language_name',
                   default='Language')
    opt.add_option('--prep_only', action='store_true', dest='prep_only')
    opts, args = opt.parse_args(arglist)
    if not (len(args) == 0 and opts.rf2 and opts.release_type
            and opts.neopw64):
        print(
            'Usage: db_build --rf2 <dir> --release_type delta/snapshot/full --neopw64 <base64pw>'
        )
        sys.exit(1)
    # file path separator
    pathsep = '/'
    # make sure output directory exists and is empty
    opts.output_dir = get_path(opts.output_dir, pathsep)
    if not (os.path.isdir(opts.output_dir)
            and len(os.listdir(opts.output_dir)) == 0):
        print('*** Output directory is not an empty directory [%s] ***' %
              opts.output_dir)
        sys.exit(1)
    # open logfile
    logfile = open(opts.output_dir+'build.log', 'w') if not opts.logfile else \
              (sys.stdout if opts.logfile == '-' else open(opts.logfile, 'w'))
    #---------------------------------------------------------------------------
    # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
    #---------------------------------------------------------------------------
    # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
    # ... ask directly if these variables don't exist
    snomed_g_bin = os.environ.get(
        'SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
    if not snomed_g_bin:
        snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
        if snomed_g_home:
            snomed_g_bin = get_path(snomed_g_home, pathsep) + 'bin'
        else:
            snomed_g_bin = get_path(os.path.dirname(os.path.abspath(__file__)),
                                    pathsep)  # default to python script dir
    validated = False
    while not validated:
        if len(snomed_g_bin) == 0:
            snomed_g_bin = raw_input(
                'Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: '
            ).rstrip(pathsep)
        else:  # try to validate, look for snomed_g_rf2_tools.py
            target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
            validated = os.path.isfile(target_file)
            if not validated:
                print("Can't find [%s]" % target_file)
                snomed_g_bin = ''
    snomed_g_bin = get_path(snomed_g_bin, pathsep)
    print('SNOMED_G bin directory [%s]' % snomed_g_bin)
    # db_build ==> connect to NEO4J, make sure information given is good
    if opts.mode == 'build':
        neo4j = snomed_g_lib_neo4j.Neo4j_Access(
            str(base64.b64decode(opts.neopw64), 'utf-8')
            if sys.version_info[0] == 3 else base64.decodestring(opts.neopw64))
    # Connect to RF2 files, make sure rf2 directory given is good
    rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type,
                                               opts.relationship_file,
                                               opts.language_code)
    # Build
    # open SQLITE database
    DB = StatusDb(
        os.path.abspath(
            opts.output_dir.rstrip(pathsep) + pathsep + 'build_status.db'))

    # create YYYYMMDD string
    d = datetime.datetime.now()  # determine current date
    yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
    job_start_datetime = datetime.datetime.now()

    # Commands needed to Create/Update a SNOMED_G Graph Database
    command_list_db_build = [{
        'stepname':
        'JOB_START',
        'log':
        'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
        % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)
    }, {
        'stepname':
        'FIND_ROLENAMES',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'FIND_ROLEGROUPS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs']
    }, {
        'stepname':
        'MAKE_CONCEPT_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DESCRIPTION_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_ISA_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DEFINING_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'TEMPLATE_PROCESSING',
        'cmd':
        'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
        % (snomed_g_bin, snomed_g_bin,
           ('create' if opts.action == 'create' else 'update'), opts.rf2,
           opts.release_type),
        'mode': ['build', 'prep']
    }, {
        'stepname':
        'CYPHER_EXECUTION',
        'cmd':
        'python %s/snomed_g_neo4j_tools.py run_cypher %s/build.cypher --verbose --neopw64 %s'
        % (snomed_g_bin, opts.output_dir, opts.neopw64),
        'mode': ['build', 'run_cypher']
    }, {
        'stepname':
        'CHECK_RESULT',
        'cmd':
        'python %s/snomed_g_neo4j_tools.py run_cypher %s/snomed_g_graphdb_update_failure_check.cypher --verbose --neopw64 %s'
        % (snomed_g_bin, snomed_g_bin, opts.neopw64),
        'mode': ['build', 'run_cypher']
    }, {
        'stepname': 'JOB_END',
        'log': 'JOB-END'
    }]
    command_list_db_build_prep = [{
        'stepname':
        'JOB_START',
        'log':
        'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
        % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)
    }, {
        'stepname':
        'FIND_ROLENAMES',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'FIND_ROLEGROUPS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs']
    }, {
        'stepname':
        'MAKE_CONCEPT_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DESCRIPTION_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_ISA_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DEFINING_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'TEMPLATE_PROCESSING',
        'cmd':
        'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
        % (snomed_g_bin, snomed_g_bin,
           ('create' if opts.action == 'create' else 'update'), opts.rf2,
           opts.release_type),
        'mode': ['build', 'prep']
    }, {
        'stepname': 'JOB_END',
        'log': 'JOB-END'
    }]
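    # NOTE: command_list_db_build_prep mirrors command_list_db_build but stops after
    # TEMPLATE_PROCESSING -- no CYPHER_EXECUTION or CHECK_RESULT steps (no NEO4J access needed).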
    # OLD --     #{'stepname':'CYPHER_EXECUTION',       'cmd':'%s/neo4j-shell -localhost -file build.cypher' % neo4j_bin, 'mode':['build','run_cypher']},
    command_list = command_list_db_build if not opts.prep_only else command_list_db_build_prep
    stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
    seqnum = DB.get_next_sequence_number()
    # Execute commands (BUILD)
    results_d = {}
    for command_d in command_list:
        # extract fields from the command dictionary
        stepname, cmd, logmsg, expected_status, mode_requirement = \
          command_d['stepname'], command_d.get('cmd',None), command_d.get('log',None), command_d.get('expected_status',0), command_d.get('mode', None)
        if mode_requirement and opts.mode not in mode_requirement:
            continue  # eg: NEO4J execution only in build mode
        results_d[stepname] = {}
        cmd_start = datetime.datetime.now(
        ) if stepname != 'JOB_END' else job_start_datetime  # start timer
        status = -1
        should_break = False
        results_d[stepname][
            'result'] = 'SUCCESS'  # assumption of success until failure determined
        results_d[stepname]['expected_status'] = expected_status
        results_d[stepname]['command'] = cmd
        print(stepname)
        print(stepname, file=logfile)  # indicate to user what step we are on
        if logmsg:  # no command to execute in a separate process
            results_d[stepname]['status'] = 0
            results_d[stepname][
                'STDOUT'] = logmsg  # LOG everything after 'LOG:'
            output, err = '', ''
        else:  # execute command (cmd) in subprocess
            print(cmd, file=logfile)
            try:
                #p = subprocess.Popen(cmd, shell=True,stdin=PIPE, stdout=PIPE, stderr=PIPE)
                #output, err = p.communicate(b"")
                #status = p.returncode
                cmd_as_list = cmd.split(' ')
                if opts.output_dir != '.':
                    os.chdir(opts.output_dir
                             )  # move to output_dir, to start subprocess
                subprocess.check_call(cmd_as_list,
                                      stdout=logfile,
                                      stderr=logfile)
                if opts.output_dir != '.':
                    os.chdir(saved_pwd)  # get back (popd)
                status = 0  # if no exception -- status is zero
            except subprocess.CalledProcessError as e:
                status = e.returncode
                results_d[stepname]['status'] = status
                if status != expected_status:
                    results_d[stepname][
                        'result'] = 'FAILED (STATUS %d)' % status
                    should_break = True
                pass  # might be fine, should_break controls termination
            except:  # NOTE: result defaulted to -1 above