def learnfields(df, fields_path='', table='', **kwds):
    """Learn a field mapping for ``df`` and persist it as JSON.

    Args:
        df: Data whose columns are passed to ``learn_fields``.
        fields_path: Location of the stored mapping.  When empty, it falls
            back to ``<StageTable.CONFIGDIR>/fields_map/<table>_fields_map.json``.
        table: Table name, used only to build the fallback path.
        **kwds: Forwarded untouched to ``learn_fields``.

    Returns:
        The mapping returned by ``learn_fields``, which is also written back
        to the resolved path.
    """
    target_path = fields_path or mkpath(
        StageTable.CONFIGDIR, 'fields_map', "%s_fields_map.json" % table
    )
    # Start from whatever mapping is already stored at the target path.
    known_map = StageTable.get_config(target_path)
    learned_map = learn_fields(df, known_map, **kwds)
    to_json(target_path, learned_map)
    return learned_map
class USAddr(object):
    """Parse a US address string into the components named in the label config.

    The label configuration is read once, at class-definition time, from
    ``config/labels/address_labels.json`` next to this module.

    Attributes:
        orig: The address exactly as passed in.
        prepped: The address after ``preclean``.
        components: Dict produced by ``parse``.
        type: Address type reported by ``usaddress.tag``, or
            ``"Questionable"`` when tagging hit a repeated label.
    """

    cnfg = from_json(mkpath(*[os.path.dirname(__file__),
                              'config', 'labels', 'address_labels.json']))

    # Maps each output label to the list of parser labels that feed it;
    # scalar config values are wrapped in a one-element list.
    labels = {k: ([v] if not isinstance(v, list) else v)
              for k, v in cnfg['labels'].items()}

    # Lower-cased keys followed by lower-cased values of the 'states' config.
    # Built as a real list (rather than map(unicode.lower, keys + values)) so
    # repeated membership tests work and the expression does not depend on
    # Python 2-only names.
    states = [s.lower() for s in
              list(cnfg['states'].keys()) + list(cnfg['states'].values())]

    def __init__(self, address):
        self.orig = address
        self.prepped = self.preclean(address)
        self.components = self.parse()

    @classmethod
    def disect(cls, x):
        """Return the parsed components dict for address ``x``."""
        return cls(x).components

    @staticmethod
    def preclean(x):
        """Normalize ``x`` before tagging.

        The whitespace-separated tokens are passed through ``uniq``, joined
        with single spaces, then run through ``to_single_space``,
        ``remove_non_ascii`` and ``strip`` in that order.
        """
        return strip(remove_non_ascii(to_single_space(
            ' '.join(uniq(x.split()))
        )))

    def __repr__(self):
        return "\n".join(" : ".join(map(str, [k, v]))
                         for k, v in self.components.items())

    def get_parts(self):
        """Tag ``self.prepped`` with ``usaddress`` and record the address type.

        Returns:
            The tagged address mapping.  When ``usaddress`` raises
            ``RepeatedLabelError``, the partial parse carried by the exception
            is returned as an ``OrderedDict`` and ``self.type`` is set to
            ``"Questionable"``.
        """
        try:
            tagged_address, self.type = usaddress.tag(self.prepped)
        except usaddress.RepeatedLabelError as e:
            tagged_address, self.type = (OrderedDict(e.parsed_string),
                                         "Questionable")
        return tagged_address

    def is_valid(self, addr_dict):
        """Return True when ``addr_dict`` has a zip or a plausible state.

        Args:
            addr_dict: Mapping that must contain the keys ``'zip'`` and
                ``'state'`` (a missing key raises ``KeyError``).

        Returns:
            True if the zip is truthy, or if the state is truthy and either
            at least two characters long or listed in ``self.states``;
            False otherwise.
        """
        # Plain truth test instead of assert/except AssertionError: asserts
        # are removed under ``python -O``, which made every address valid.
        if addr_dict['zip']:
            return True
        state = addr_dict['state']
        if state and (len(state) >= 2 or state in self.states):
            return True
        return False

    def parse(self):
        """Build the components dict from the tagged address.

        For every configured label, the matching tagged parts are joined
        with a space (``None`` when nothing matched).  ``'valid'`` and
        ``'type'`` entries are added last.
        """
        parts = self.get_parts()
        d = {}
        for mylabel, label in self.labels.items():
            part = ' '.join(parts[i] for i in label if i in parts)
            d[mylabel] = part if part else None
        # Validity is computed before 'valid'/'type' are inserted, so
        # is_valid only sees the label-derived entries.
        validity = self.is_valid(d)
        d.update({'valid': validity, 'type': self.type})
        return d
class StageTable(Table):
    """``Table`` whose chunks are renamed, cleaned and conformed to a schema.

    Per-table settings are read from JSON files under ``CONFIGDIR``; every
    key of the table's schema config becomes an instance attribute (see
    ``load``).

    Attributes:
        learn: Whether ``preprocess`` should learn field mappings.
        table: Name of the target table.
        fields_path: Path of the JSON field-mapping file in use.
        omit: Values passed positionally to ``DataFrame.clean``.
        fields_map: Mapping of source column name -> target field name.
    """

    CONFIGDIR = os.path.join(os.path.dirname(__file__), 'config')
    DEFAULT_FIELDSPATH = mkpath(CONFIGDIR, 'fields_config.json')

    def __init__(self, path, fields_path='', learn=False, table='',
                 omit=('\\', '='), **kwds):
        """Store the options, load configuration, then initialise ``Table``.

        ``**kwds`` is accepted but not used by this initialiser.
        """
        self.learn = learn
        self.table = table
        self.fields_path = self.DEFAULT_FIELDSPATH
        if fields_path:
            self.fields_path = fields_path
        self.omit = omit
        # Configuration is loaded before the base class initialiser runs.
        self.load()
        super(StageTable, self).__init__(path)

    @staticmethod
    def get_config(path):
        """Return the JSON content at ``path``, or ``{}`` when it is falsy."""
        return from_json_if_exists(path) or {}

    @staticmethod
    def get_table_config(table):
        """Return the config stored at ``CONFIGDIR/schema/<table>.json``."""
        schema_path = mkpath(StageTable.CONFIGDIR, 'schema', table + '.json')
        return StageTable.get_config(schema_path)

    @staticmethod
    def get_table_fields(table, login=None):
        """Return the field list for ``table``.

        The ``'fields'`` entry of the table config is used when present;
        otherwise the fields are listed through ``Database``.

        Args:
            table: Table name.
            login: Login settings forwarded to ``Database``; ``None`` is
                treated as an empty dict.
        """
        if login is None:
            login = {}
        config = StageTable.get_table_config(table)
        if 'fields' not in config:
            return Database(login=login).list_fields(table)
        return config['fields']

    @staticmethod
    def learnfields(df, fields_path='', table='', **kwds):
        """Learn a field mapping for ``df``, write it to JSON and return it.

        When ``fields_path`` is empty it defaults to
        ``CONFIGDIR/fields_map/<table>_fields_map.json``.  Extra keyword
        arguments are forwarded to ``learn_fields``.
        """
        if not fields_path:
            fields_path = mkpath(StageTable.CONFIGDIR, 'fields_map',
                                 "%s_fields_map.json" % table)
        known_map = StageTable.get_config(fields_path)
        fields_map = learn_fields(df, known_map, **kwds)
        to_json(fields_path, fields_map)
        return fields_map

    @staticmethod
    def conform(df, table, learn=False, fields_path='', fields=None,
                fields_map=None, **kwds):
        """Rename ``df`` columns in place and select the table's fields.

        Args:
            df: DataFrame to conform; its columns are renamed in place.
            table: Table name used to look up fields and mapping paths.
            learn: When true, learn a mapping first and merge it into
                ``fields_map``.
            fields_path: Mapping file path forwarded to ``learnfields``.
            fields: Fields to select; looked up via ``get_table_fields``
                when empty or ``None``.
            fields_map: Column rename mapping.  A mapping supplied by the
                caller is updated in place when ``learn`` is true.
            **kwds: Forwarded to ``get_table_fields``.

        Returns:
            ``df`` restricted to ``fields``.
        """
        # A fresh dict per call: the previous ``fields_map={}`` default was
        # shared between calls and accumulated learned mappings.
        if fields_map is None:
            fields_map = {}
        if not fields:
            fields = StageTable.get_table_fields(table, **kwds)
        if learn:
            fields_map.update(
                StageTable.learnfields(df, fields_path=fields_path,
                                       table=table, fields=fields))
        df.rename(columns=fields_map, inplace=True)
        # NOTE(review): ``.ix`` is deprecated in newer pandas releases; kept
        # because its handling of missing labels differs from ``.loc``.
        return df.ix[:, fields]

    @property
    def fieldgroups(self):
        """``attrlist(self)`` entries named ``*_fields`` whose value is a list."""
        return [
            i for i in attrlist(self)
            if isinstance(i[1], list) and i[0].endswith('_fields')
        ]

    def _conform(self, df, **kwds):
        """Call ``conform`` with this instance's table and field mapping."""
        return self.conform(df, self.table, fields_map=self.fields_map, **kwds)

    def __iter__(self):
        """Yield each chunk of the base iterator after ``normalize``."""
        # NOTE(review): the original line layout was lost; the per-chunk
        # collect and the trailing disable/collect are reconstructed from the
        # statement order -- confirm the intended nesting (and whether
        # gc.disable() is really wanted here).
        for df in super(StageTable, self).__iter__():
            yield self.normalize(df)
            gc.collect()
        gc.disable()
        gc.collect()

    def load(self):
        """Load the field mapping and copy the table config onto ``self``.

        Returns silently, leaving only ``fields_map`` set, when the table
        has no (or an empty) schema config.
        """
        self.fields_map = self.get_config(self.fields_path)
        config = self.get_table_config(self.table)
        if not config:
            # The original raised "No config file for table ..." and caught
            # it immediately with a bare except; the net effect was this
            # silent return.
            return
        for k, v in config.items():
            setattr(self, k, v)

    def field_mapper(self):
        """Learn a mapping from every sample and merge it into ``fields_map``."""
        for _, df in self.samples.items():
            learned = self.learnfields(df, table=self.table,
                                       fields_path=self.fields_path)
            self.fields_map.update(learned)

    def preprocess(self):
        """Run the base preprocessing, then learn mappings if ``learn`` is set."""
        super(StageTable, self).preprocess()
        if self.learn:
            self.field_mapper()

    def getfunc(self, name):
        """Return ``pd.Series.to_<prefix>``.

        ``<prefix>`` is the part of ``name`` before its first underscore.
        """
        return getattr(pd.Series, "to_{}".format(name.split('_')[0]))

    def normalize(self, df, *args, **kwds):
        """Rename, clean, convert and conform one chunk.

        Steps: rename columns through ``fields_map`` and ``clean`` them with
        ``omit``; report each mapping whose target is present; for every
        non-empty field group convert the matching columns with the group's
        ``to_<prefix>`` function; finally conform to the table's fields.
        ``*args`` and ``**kwds`` are accepted but unused.
        """
        df = df.rename(columns=self.fields_map).clean(*self.omit)
        for k, v in self.fields_map.items():
            if v in df.columns:
                self.info((renformat(field=k, choice=v)))
        for name, fields in self.fieldgroups:
            if fields:
                # Was ``items=fieldlist``: an undefined name; the group's own
                # field list is what is being iterated here.
                cols = df.filter_fields(items=fields)
                df[cols] = df[cols].apply(self.getfunc(name))
        # Was ``self.conform(df)``, which omits the required ``table``
        # argument; ``_conform`` supplies the table and field mapping.
        return self._conform(df)
def get_table_config(table):
    """Return the schema configuration for ``table``.

    The configuration is fetched with ``StageTable.get_config`` from
    ``<StageTable.CONFIGDIR>/schema/<table>.json``.
    """
    return StageTable.get_config(
        mkpath(StageTable.CONFIGDIR, 'schema', table + '.json')
    )
def df_2_update_file(df, table, outdir='', **kwds):
    """Write the update query built from ``df`` to a timestamped ``.sql`` file.

    The file is named ``update_<YYYY-MM-DD_HH.MM.SS>.sql`` (timestamp taken
    from ``utcnow()``) and is placed under the path returned by
    ``mkdir(outdir, '<table>_updates')``.

    Args:
        df: Data handed to ``df_2_update_query``.
        table: Table name; used for the directory name and the query.
        outdir: First argument given to ``mkdir``.
        **kwds: Forwarded to ``df_2_update_query``.
    """
    target_dir = mkdir(outdir, '%s_updates' % table)
    # %H (24-hour) rather than %I (12-hour): without an AM/PM marker, files
    # written twelve hours apart on the same day shared one name and the
    # later write replaced the earlier one.
    stamp = utcnow().strftime("%Y-%m-%d_%H.%M.%S")
    outfile = mkpath(target_dir, "update_%s.sql" % stamp)
    writedata(outfile, df_2_update_query(df, table, **kwds))