def refine_schemas(self):
    """Infer foreign-key relationships between tables from schema similarities.

    For each column of each schema, score how likely the column is a key
    into every table (including its own) using primary-key counts,
    column-name equality, help-text similarity (via ``nlp``), field-type
    equality, and the candidate table's column/key ratio.  The result is
    encoded into the schema's "is_key" element, replacing the True/False
    1/0 value there currently: 1 when the column keys its own table, or
    the target table's key (as a string) when it references another table.
    """
    if self.verbosity:
        print('Refining {0} schemas... '.format(len(self.schemas)))
    # Count keys and columns per table BEFORE any 'is_key' entries are
    # rewritten below.  Finds foreign-key references even when the column
    # is not labeled 'is_key', because the FDA doesn't label the footnote
    # column for the NDB No as a primary key (though maybe it should).
    Nkeys = {}
    Ncols = {}
    for k, schema in self.schemas.items():
        Nkeys[k] = schema['is_key'].count(True)
        Ncols[k] = len(schema['is_key'])  # also available as schema['numcols']
    for k, schema in self.schemas.items():
        if self.verbosity:
            print('Refining schema ' + str(k))
        for i, schema_is_key in enumerate(schema['is_key']):
            max_target_likelihood = 0.0
            # BUGFIX: max_target_key was never reset per column, so it was
            # unbound (NameError) on the first column -- or stale from the
            # previous column -- whenever no candidate scored above 0.0.
            max_target_key = None
            # Base score shared by every candidate target for this column.
            source_likelihood = 0.3 * (Nkeys[k] - 1)
            source_likelihood += 0.5 * bool(schema_is_key)
            source_likelihood -= 0.5 * bool(schema['is_blank'][i])
            target_likelihoods = []
            for fsk, foreign_schema in self.schemas.items():
                target_likelihood = source_likelihood
                # A table's sole key column most likely keys its own table.
                if fsk == k and i == 0 and Nkeys[k] == 1:
                    target_likelihood += .5
                # Assumes each table's key is its first column -- not the
                # general case, but it works for the FDA schema.
                target_likelihood += 0.5 * (schema['column_name'][i] == foreign_schema['column_name'][0])
                target_likelihood += nlp.similarity(nlp.essence(schema['help_text'][i][0:32]),
                                                    nlp.essence(foreign_schema['help_text'][0][0:32]))
                target_likelihood += 0.2 * (schema['field_type'][i] == foreign_schema['field_type'][0])
                # +1 to help avoid divide by zero
                target_likelihood += 0.5 * foreign_schema['numcols'] / (2 * Nkeys[fsk] + 1)
                if self.verbosity:
                    print(str(k) + '.' + str(schema['column_name'][i]) +
                          '->' + str(fsk) + '.' + str(foreign_schema['column_name'][0]) +
                          ' likelihood=' + str(target_likelihood))
                if target_likelihood > max_target_likelihood:
                    max_target_likelihood = target_likelihood
                    max_target_key = fsk
                target_likelihoods.append(target_likelihood)
            if self.verbosity:
                print('max likelihood=' + str(max_target_likelihood) + '=' + str(max(target_likelihoods)))
            if max_target_key == k:
                schema['is_key'][i] = 1  # actHL: should already be 1
            elif max_target_likelihood > 1:
                # Record the foreign table's key in place of the boolean flag.
                schema['is_key'][i] = str(max_target_key)  # +'.'+schema['column_name']
def read_tables(self):
    """Read *.txt.table files and build an in-memory schema for each table file.

    This is an extremely brittle translation of .table files (created by
    cutting and pasting text from the FDA database documentation in
    sr22.pdf) into *.txt.schema files.  Cut-and-paste must be done with
    Adobe Acrobat reader into gedit or a similar text editor; the gnome
    document viewer garbles tables, shuffling the text order.

    NOTE(review): this module defines read_tables twice with identical
    logic; only the later definition takes effect -- delete one copy.
    """
    print('Reading {0} table files... '.format(len(self.file_names)))
    valid_fields = False  # True once the first line with a valid set of fields has been read
    # lre = re.compile(r'['+nlp.SPACE+']+')  # use to split on a custom whitespace definition
    for i, fn in enumerate(self.file_names):
        # actHL: should be stored in the schemas dictionary of dictionaries
        fmn = self.file_model_names[i]
        table_path = os.path.join(self.full_path, fn + '.table')
        schema_path = os.path.join(self.full_path, fn + '.schema')
        if os.path.isfile(schema_path) and not self.force:
            print('Warning: {0} already exists and "--force" (overwrite) option not enabled. Skipping schema.'.format(schema_path))
            continue
        if not os.path.isfile(table_path):
            continue
        # NOTE(review): the 'U' (universal newlines) open mode was removed in
        # Python 3.11; kept here for the Python 2 runtime this file targets.
        with open(table_path, 'Ur') as infile:
            print('Opened {0}'.format(table_path))
            for l in infile:
                l = l.strip()  # leading/trailing whitespace (incl. \r \n, regardless of locale eol)
                if nlp.similarity(nlp.essence(l), nlp.essence('Field name Type Description')) > 0.7:
                    print('Found a header row: "{0}"\n so skipping to next line...'.format(l))
                    continue  # header row detected, skip it
                # list() copies; plain assignment would alias self.default_fields
                fields = list(self.default_fields)
                # careful: passing an explicit whitespace character to split()
                # selects a different splitting algorithm
                s = l.split(None, self.num_table_fields_before_asterisk_split - 1)
                if fmn not in self.schemas:  # was dict.has_key(), removed in Python 3
                    self.schemas[fmn] = {}
                    for k in self.schema_keys:
                        self.schemas[fmn][k] = []
                if len(s) < self.num_table_fields_before_asterisk_split:
                    if not valid_fields:
                        # BUGFIX: the old message formatted len(s) into a
                        # "Row {0}" placeholder, reporting the field count
                        # as a row number.
                        print('Warning (RecipeParser.read_tables()): number of fields ({0}) insufficient (<{1}), skipping to next line.'.format(
                            len(s), self.num_table_fields_before_asterisk_split))
                        print('The faulty table line was: "{0}"'.format(l))
                    else:
                        # a short line continues the previous column's help text
                        self.schemas[fmn]['help_text'][-1] += ' ' + l.strip(nlp.SPACE + eol)
                    continue
                # actHL: need a generic way to specify this syntax where more than
                # one schema field lives in a single space-delimited token.
                # Split the token fusing the second number with the primary-key
                # asterisk into the number plus a synthetic Y/N key token.
                if s[2][0].isdigit() and s[2].endswith('*'):
                    s[2:3] = [s[2].rstrip('*'), 'Y']
                else:
                    s[2:3] = [s[2], 'N']
                for j in range(len(self.field_order)):
                    # self.field_order entries may be ints or slices, so a simple
                    # "index < len(s)" bound check doesn't work here.
                    idx = self.field_order[j]
                    if isinstance(idx, (int, slice)):
                        s2 = s[idx]  # BUGFIX: direct indexing instead of eval()
                    else:
                        # FIXME: eval on a textual index spec -- never feed this
                        # untrusted data; convert specs to slice objects instead.
                        s2 = eval('s[' + str(idx) + ']')
                    if isinstance(s2, list):  # slice specs yield a sub-list
                        fields[j] = ' '.join(s2)
                    else:
                        fields[j] = s2
                # Sanity-check the parsed fields; anything else is treated as a
                # continuation of the previous column's help text.
                if (fields[0][0].isalpha() and fields[0][0].isupper()
                        and (fields[1] == 'A' or fields[1] == 'N')
                        and fields[2][0].isdigit()
                        and (fields[3] == 'Y' or fields[3] == 'N')
                        and (fields[4] == 'Y' or fields[4] == 'N')
                        and (fields[5] == 'Y' or fields[5] == 'N')):  # and len(fields[7])>10:
                    # actHL: automate this with translation definitions (Y/N into
                    # 1/0 etc, colname goes with schema_key[0] etc)
                    valid_fields = True
                    for j in range(len(self.schema_keys)):
                        for k, v in self.schema_translations[j].items():
                            # actHL: should take into account the type of the
                            # variable being translated into, somehow
                            fields[j] = fields[j].replace(k, str(v))
                        self.schemas[fmn][self.schema_keys[j]].append(
                            type(self.default_fields[j])(fields[j]))
                else:
                    # line continues the help text rather than starting a new column
                    self.schemas[fmn]['help_text'][-1] += ' ' + l.strip(nlp.SPACE + eol)
        # not necessary, but convenient
        self.schemas[fmn]['numcols'] = len(self.schemas[fmn][self.schema_keys[0]])
        # necessary because the "variablize" conversion (title case, no punc) is not reversible
        self.schemas[fmn]['file_name'] = fn
def read_tables(self):
    """Read *.txt.table files and build an in-memory schema for each table file.

    This is an extremely brittle translation of .table files (created by
    cutting and pasting text from the FDA database documentation in
    sr22.pdf) into *.txt.schema files.  Cut-and-paste must be done with
    Adobe Acrobat reader into gedit or a similar text editor; the gnome
    document viewer garbles tables, shuffling the text order.

    NOTE(review): this is the second, shadowing definition of read_tables
    in this module -- the earlier identical copy is dead code; delete one.
    """
    print('Reading {0} table files... '.format(len(self.file_names)))
    valid_fields = False  # True once the first line with a valid set of fields has been read
    # lre = re.compile(r'['+nlp.SPACE+']+')  # use to split on a custom whitespace definition
    for i, fn in enumerate(self.file_names):
        # actHL: should be stored in the schemas dictionary of dictionaries
        fmn = self.file_model_names[i]
        table_path = os.path.join(self.full_path, fn + '.table')
        schema_path = os.path.join(self.full_path, fn + '.schema')
        if os.path.isfile(schema_path) and not self.force:
            print('Warning: {0} already exists and "--force" (overwrite) option not enabled. Skipping schema.'.format(schema_path))
            continue
        if not os.path.isfile(table_path):
            continue
        # NOTE(review): the 'U' (universal newlines) open mode was removed in
        # Python 3.11; kept here for the Python 2 runtime this file targets.
        with open(table_path, 'Ur') as infile:
            print('Opened {0}'.format(table_path))
            for l in infile:
                l = l.strip()  # leading/trailing whitespace (incl. \r \n, regardless of locale eol)
                if nlp.similarity(nlp.essence(l), nlp.essence('Field name Type Description')) > 0.7:
                    print('Found a header row: "{0}"\n so skipping to next line...'.format(l))
                    continue  # header row detected, skip it
                # list() copies; plain assignment would alias self.default_fields
                fields = list(self.default_fields)
                # careful: passing an explicit whitespace character to split()
                # selects a different splitting algorithm
                s = l.split(None, self.num_table_fields_before_asterisk_split - 1)
                if fmn not in self.schemas:  # was dict.has_key(), removed in Python 3
                    self.schemas[fmn] = {}
                    for k in self.schema_keys:
                        self.schemas[fmn][k] = []
                if len(s) < self.num_table_fields_before_asterisk_split:
                    if not valid_fields:
                        # BUGFIX: the old message formatted len(s) into a
                        # "Row {0}" placeholder, reporting the field count
                        # as a row number.
                        print('Warning (RecipeParser.read_tables()): number of fields ({0}) insufficient (<{1}), skipping to next line.'.format(
                            len(s), self.num_table_fields_before_asterisk_split))
                        print('The faulty table line was: "{0}"'.format(l))
                    else:
                        # a short line continues the previous column's help text
                        self.schemas[fmn]['help_text'][-1] += ' ' + l.strip(nlp.SPACE + eol)
                    continue
                # actHL: need a generic way to specify this syntax where more than
                # one schema field lives in a single space-delimited token.
                # Split the token fusing the second number with the primary-key
                # asterisk into the number plus a synthetic Y/N key token.
                if s[2][0].isdigit() and s[2].endswith('*'):
                    s[2:3] = [s[2].rstrip('*'), 'Y']
                else:
                    s[2:3] = [s[2], 'N']
                for j in range(len(self.field_order)):
                    # self.field_order entries may be ints or slices, so a simple
                    # "index < len(s)" bound check doesn't work here.
                    idx = self.field_order[j]
                    if isinstance(idx, (int, slice)):
                        s2 = s[idx]  # BUGFIX: direct indexing instead of eval()
                    else:
                        # FIXME: eval on a textual index spec -- never feed this
                        # untrusted data; convert specs to slice objects instead.
                        s2 = eval('s[' + str(idx) + ']')
                    if isinstance(s2, list):  # slice specs yield a sub-list
                        fields[j] = ' '.join(s2)
                    else:
                        fields[j] = s2
                # Sanity-check the parsed fields; anything else is treated as a
                # continuation of the previous column's help text.
                if (fields[0][0].isalpha() and fields[0][0].isupper()
                        and (fields[1] == 'A' or fields[1] == 'N')
                        and fields[2][0].isdigit()
                        and (fields[3] == 'Y' or fields[3] == 'N')
                        and (fields[4] == 'Y' or fields[4] == 'N')
                        and (fields[5] == 'Y' or fields[5] == 'N')):  # and len(fields[7])>10:
                    # actHL: automate this with translation definitions (Y/N into
                    # 1/0 etc, colname goes with schema_key[0] etc)
                    valid_fields = True
                    for j in range(len(self.schema_keys)):
                        for k, v in self.schema_translations[j].items():
                            # actHL: should take into account the type of the
                            # variable being translated into, somehow
                            fields[j] = fields[j].replace(k, str(v))
                        self.schemas[fmn][self.schema_keys[j]].append(
                            type(self.default_fields[j])(fields[j]))
                else:
                    # line continues the help text rather than starting a new column
                    self.schemas[fmn]['help_text'][-1] += ' ' + l.strip(nlp.SPACE + eol)
        # not necessary, but convenient
        self.schemas[fmn]['numcols'] = len(self.schemas[fmn][self.schema_keys[0]])
        # necessary because the "variablize" conversion (title case, no punc) is not reversible
        self.schemas[fmn]['file_name'] = fn