def __init__(self, tables_not_to_include=None, debug=True, debugMore=True, recalc_struct_to_upload=True):
    '''
    recalc_struct_to_upload : True to rebuild the struct to upload from the database.
                              False to upload the JSON doc already saved in logs/bulk_import_data.json
    '''
    if tables_not_to_include is None:  # avoid a mutable default argument
        tables_not_to_include = []
    INDEX = 'metadata'
    DOCTYPE = 'tables'
    dbm = dbManager()  # initialize the database manager
    # NOTE: the keys will be tablenames.
    if recalc_struct_to_upload:
        mm = MapManager(debug=False)  # initialize the map manager
        if debug:
            mm.log_mappings('logs/mappings_used')  # check that the mappings are correct
        # open the bulk_upload file to print import statements to.
        bulk_upload_fp = u.log_file('logs/bulk_import_data.json', keep_alive=True)
        # open the database and enumerate through tablenames.
        # open_db() yields pypyodbc tables() rows; parse_row reads the tablename from index 2.
        for i, tablename in enumerate(dbm.open_db()):
            if tablename not in tables_not_to_include:
                # break the tablename into its individual parts.
                table_info_raw = dbm.parse_row(tablename, debug=debug)
                # apply the mapping based on those parts and save them to the struct.
                table_info_raw = mm.apply_mapping(table_info_raw, debug=debug)
                # if there is an instrument, get the scales & extra info
                if 'instrument abbreviation' in table_info_raw:
                    if debugMore:
                        print('\n\nAbout to look at scales for %s' % u.prettify_str(table_info_raw))
                    # get the scales & instrument name from the mappings
                    scales, instrument_name = mm.get_info(table_info_raw['instrument abbreviation'], debug=debugMore)
                    if scales:  # get_info may return None if it has no scales/instrument name
                        table_info_raw['scales'] = []
                        for scale in scales:
                            scale_info = {'scale': scale, 'description': scales[scale]}
                            table_info_raw['scales'].append(scale_info)
                    if instrument_name:
                        table_info_raw['instrument name'] = instrument_name
                # write the bulk action line followed by the document line.
                table_indexing = {'index': {'_index': INDEX, '_type': DOCTYPE, '_id': i}}
                json.dump(table_indexing, bulk_upload_fp)
                json.dump(table_info_raw, bulk_upload_fp)
    else:
        # grab the saved logs/bulk_import_data.json and upload that instead.
        bulk_upload_fp = json.load(open('logs/bulk_import_data.json'))
    dbm.close_db()
    dbm.open_es()
    print('exporting struct to es.... may take some time....')
    dbm.insert_struct_to_es(bulk_upload_fp)
    if not recalc_struct_to_upload:
        # only meaningful on the preloaded path; on the recalc path bulk_upload_fp is a file handle.
        print(u.prettify_str(bulk_upload_fp))
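# A minimal sketch of the two lines written per table into logs/bulk_import_data.json above,
# assuming a hypothetical table named 'data_4_abes_t'; the actual fields depend on
# MapManager's mappings and on the scales get_info returns:
#
#   {"index": {"_index": "metadata", "_type": "tables", "_id": 0}}
#   {"type": "data", "tablename": "data_4_abes_t", "phase": "4",
#    "instrument abbreviation": "abes", "respondent": "t",
#    "instrument name": "...", "scales": [{"scale": "...", "description": "..."}]}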
def log_mappings(self, path):  # save the mappings to a file
    import time
    f = open(path + time.strftime("-%a_%b_%Y-%H") + '.md', 'w')
    f.write('Raw Parameters File :\n')
    f.write(u.prettify_str(self.json_raw))
    f.write('\n\n---------------------------------\n')
    f.write('Mappings Struct:\n')
    f.write(u.prettify_str(self.maps))
    f.write('\n\n---------------------------------\n')
    f.write('Instrument Parameters:\n')
    f.write(u.prettify_str(self.instr_params))
    f.close()
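# Usage sketch: assuming it is 2pm on Mon, Jan 2 2017, the call below would write
# 'logs/mappings_used-Mon_Jan_2017-14.md' (the suffix comes from time.strftime above):
#
#   mm = MapManager(debug=False)
#   mm.log_mappings('logs/mappings_used')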
def parse_row(self, row, debug=True):  # get a dict with the attributes for this tablename
    """
    Takes in a row from the database cursor and returns a dict with all attributes parsed from the tablename.

    args:
        row : ['wtp_data', '', tablename, 'table', '']
              This is the definition from the pypyodbc package; each row of cursor.tables() looks like that.

    returns:
        {
            'type': 'data'/'calc'/'misc',
            'tablename': tablename,
            'instrument abbreviation': 'ABES',
            'respondent': 't'/'m'/'f',
            'phase': '4',
            etc...
        }

    This function uses the mappings defined in the tablename_maps dict below, which describes how the
    WTP naming conventions are parsed.
    """
    tablename_maps = \
        {
            'parsing maps': {
                'data': {
                    'tablename': 'table_split[0:]',
                    'phase': 'table_split[1: 2]',
                    'instrument abbreviation': 'table_split[2 : table_split.__len__()-1]',
                    'respondent': 'table_split[-1]',
                },
                'calc': {
                    'tablename': 'table_split[0:]',
                    'phase': 'table_split[1: 2]',
                    'instrument abbreviation': 'table_split[2: table_split.__len__()-1]',
                    'respondent': 'table_split[-1]',
                },
                'misc': {
                    'tablename': 'table_split[0:]',
                }
            },
            'filters': {
                'data': {
                    'length': 4,
                    'disallowed words': ['dates']
                },
                'calc': {
                    'length': 4,
                    'disallowed words': ['dates']
                }
            },
            '__doc__': \
                """
                This is the readme for this dict. It defines the parameters for the WTP naming conventions
                and has 3 sections:
                - this readme
                - parsing maps
                - filters

                ___This readme___ is self explanatory.

                ___Parsing maps___ holds the commands used to parse a tablename into its attributes.
                The structure is:
                    type{
                        'attribute this type should have 1'
                        'attribute this type should have 2'
                        etc.
                    }
                It is assumed that the tablename has already been split into an array at '_'s and called
                table_split. Note that the selection made by each command returns an array of strings that
                should be joined with '_'s. Check that it is an array and not just a single string; if it
                is a string, don't join it.

                ___Filters___ list what each type needs in order to be considered part of that type,
                e.g. data needs at least 4 parts: type, phase, respondent, instrument. If it doesn't have
                them it is some kind of special meta-data, like data_dates, and we don't know what to do
                with those yet.

                Be aware that the type should always be the first segment of any tablename in the WTP
                database. We use that to select which type to try, but it is filtered first; if it doesn't
                match the filter's criteria it is categorized as misc.

                If you start updating this filter you'll need to update get_table_type as well.
                """
        }
    attributes_dict = {}
    tablename = row[2]  # the pypyodbc tables() row has the tablename at index 2
    if debug:
        print('Parsing table %s' % tablename)
    table_split = tablename.split('_')  # the sections should be split by _
    table_type = self.get_table_type(table_split, tablename_maps['filters'])
    if debug:
        print('type determined to be %s' % table_type)
    for attrib in tablename_maps['parsing maps'][table_type]:
        parts = eval(tablename_maps['parsing maps'][table_type][attrib])  # evaluate the slice command against table_split
        if type(parts) is list:
            parts = '_'.join(parts)
        attributes_dict[attrib] = parts
    attributes_dict['type'] = table_type
    if debug:
        print('\tbecame->\n\t' + u.prettify_str(attributes_dict).replace('\n', '\n\t'))
    return attributes_dict
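# A minimal usage sketch with a hypothetical cursor row (real rows come from pypyodbc's
# cursor.tables()), assuming get_table_type classifies it as 'data':
#
#   row = ['wtp_data', '', 'data_4_abes_t', 'TABLE', '']
#   info = dbm.parse_row(row, debug=False)
#   # -> {'tablename': 'data_4_abes_t', 'phase': '4',
#   #     'instrument abbreviation': 'abes', 'respondent': 't', 'type': 'data'}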
def get_info(self, instr_abbrev, instr_full=None, instr_list=None, line='', debug=True):  # gets scales and name
    """
    Search for the instrument whose abbreviation matches. If it has any scales, return them as a dict.
    If the instrument name can be determined, return that as well. If no scales are found, return None.

    ARGS:
        instr_abbrev -> the abbreviation of the instrument we're looking for
        instr_list   -> the list of instrument maps; defaults to self.instr_params

    RETURNS:
        the dict of scales if found, None if not found,
        and the name of the instrument as best as it can be determined.

    Recall that constructs support nested instruments by way of extensions. constructs is of the form:
        {
            'instr name': {
                'abbreviation': '',
                'extensions': {
                    'instrument name': {
                        'abbreviation': '',
                        'scales': {},
                        'memo': '',
                        'details': ''
                    }
                },
                'scales': {}
            }
        }
    """
    if not instr_list:  # if no list was provided use the whole parameters list.
        instr_list = self.instr_params
    scales = None  # scales will hold the scales from the maps.
    for instr in instr_list:  # iterate through all instruments provided.
        instr_dict = instr_list[instr]  # instr_dict is the parameters dict for this instrument
        # _check_ if this is the correct instrument.
        # NOTE: as this is a prefix match, there is a potential bug when abbreviations share a prefix, e.g. 'au' and 'aub'.
        if instr_abbrev.startswith(instr_dict['abbreviation']):
            # if(debug): print('%sFound beginning of %s in:' % (line, instr_abbrev))
            if debug:
                print('%s"%s" %s' % (line, instr, u.prettify_str(instr_dict).replace('\n', '\n\t' + line)))
            if not instr_full:
                instr_full = ''
            instr_full = instr_full + instr + ' '  # track the full instrument name; lower levels append their additions
            if debug:
                print('%sinstrument name = %s...' % (line, instr_full))
            # if there are extensions, recurse
            if 'extensions' in instr_dict:
                if debug:
                    print('%s---Extensions found: Recursing' % (line + '\t'))
                instr_abbrev_suffix = instr_abbrev[len(instr_dict['abbreviation']):]  # keep what follows the matched abbreviation
                # recurse down
                lower_scales, instr_full = self.get_info(
                    instr_abbrev_suffix,
                    instr_full=instr_full,
                    instr_list=instr_dict['extensions'],
                    line=(line + '\t'),
                    debug=debug)
                if lower_scales:  # if it found something
                    if not scales:
                        scales = {}  # if scales doesn't exist, initialize it
                    scales.update(lower_scales)  # add it to scales.
            # if there are no extensions and instr_abbrev doesn't match exactly, return None.
            elif instr_dict['abbreviation'] != instr_abbrev:
                if debug:
                    print('%sNothing Found. The abbreviations don\'t match. .%s:%s.'
                          % (line + '\t', instr_abbrev, instr_dict['abbreviation']))
                return None, instr_full
            if 'scales' in instr_dict:  # then add them to the dict for return
                if debug:
                    print('%sScales found!' % line)
                if not scales:
                    scales = {}  # if scales doesn't exist, initialize it
                scales.update(instr_dict['scales'])  # add the scales from instr_list to scales
            if debug:
                print('%sReturning scales as %s' % (line, str(scales)[:30]))
            return scales, instr_full
    return None, instr_full  # if we iterate through all instruments and none match, the search failed.
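# A minimal sketch of the nested lookup above with a hypothetical instrument map
# (the real map lives in self.instr_params); 'macsf' matches 'mac', then its
# 'sf' extension, so scales from both levels are merged:
#
#   params = {'MacArthur': {'abbreviation': 'mac',
#                           'scales': {'emp': 'Empathy'},
#                           'extensions': {'Short Form': {'abbreviation': 'sf',
#                                                         'scales': {'agg': 'Aggression'}}}}}
#   scales, name = mm.get_info('macsf', instr_list=params, debug=False)
#   # -> scales == {'emp': 'Empathy', 'agg': 'Aggression'}
#   #    name   == 'MacArthur Short Form '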