コード例 #1
0
ファイル: table.py プロジェクト: yoshi788/stagelib
    def learnfields(df, fields_path='', table='', **kwds):
        """Learn a field mapping for ``df`` and persist it as JSON.

        Args:
            df: Data passed straight through to ``learn_fields``.
            fields_path: JSON file holding the mapping.  When empty, the
                path ``<CONFIGDIR>/fields_map/<table>_fields_map.json`` is
                used instead.
            table: Table name used to build the default path.
            **kwds: Forwarded unchanged to ``learn_fields``.

        Returns:
            The mapping returned by ``learn_fields`` (also written to
            ``fields_path``).
        """
        target = fields_path or mkpath(StageTable.CONFIGDIR, 'fields_map',
                                       "%s_fields_map.json" % table)

        # Seed the learner with whatever mapping is already stored on disk.
        known = StageTable.get_config(target)
        learned = learn_fields(df, known, **kwds)
        to_json(target, learned)
        return learned
コード例 #2
0
class USAddr(object):
    """Split a free-form US address string into labelled components.

    The address is cleaned, tagged with ``usaddress`` and the tagged parts
    are regrouped under the project's own label names taken from
    ``config/labels/address_labels.json``.

    Attributes:
        orig: The address exactly as passed to the constructor.
        prepped: The address after ``preclean``.
        type: Address type reported by ``usaddress.tag`` or
            ``"Questionable"`` when tagging hit a repeated label.
        components: Dict produced by ``parse``.
    """

    # Label configuration, read once when the class body is executed.
    cnfg = from_json(mkpath(os.path.dirname(__file__),
                            'config', 'labels', 'address_labels.json'))

    # Project label -> list of usaddress labels (scalars are wrapped).
    labels = {k: (v if isinstance(v, list) else [v])
              for k, v in cnfg['labels'].items()}

    # Lower-cased keys and values of the ``states`` config section.
    # Built with list()/str.lower so it works on both Python 2 and 3; a
    # generator is used so no loop variable leaks into the class namespace.
    states = list(name.lower() for name in
                  list(cnfg['states'].keys()) + list(cnfg['states'].values()))

    def __init__(self, address):
        self.orig = address
        self.prepped = self.preclean(address)
        self.components = self.parse()

    @classmethod
    def disect(cls, x):
        """Return the parsed components of address ``x`` in one call."""
        return cls(x).components

    @staticmethod
    def preclean(x):
        """Normalise ``x`` before tagging.

        The whitespace-separated tokens are passed through ``uniq``
        (presumably dropping repeated tokens -- confirm against the helper),
        re-joined with single spaces and then run through
        ``to_single_space``, ``remove_non_ascii`` and ``strip``.
        """
        rejoined = ' '.join(uniq(x.split()))
        return strip(remove_non_ascii(to_single_space(rejoined)))

    def __repr__(self):
        return "\n".join(" : ".join((str(k), str(v)))
                         for k, v in self.components.items())

    def get_parts(self):
        """Tag ``self.prepped`` and return a label -> text mapping.

        Sets ``self.type`` as a side effect.  When ``usaddress`` raises
        ``RepeatedLabelError`` the partial parse is still used and the type
        is recorded as ``"Questionable"``.
        """
        try:
            tagged_address, self.type = usaddress.tag(self.prepped)
        except usaddress.RepeatedLabelError as e:
            # ``parsed_string`` is a sequence of (token, label) pairs, so it
            # has to be regrouped by label; feeding it straight to
            # OrderedDict would key the result by token and ``parse`` would
            # then find no components at all.
            tagged_address = OrderedDict()
            for token, label in e.parsed_string:
                if label in tagged_address:
                    tagged_address[label] += ' ' + token
                else:
                    tagged_address[label] = token
            self.type = "Questionable"
        return tagged_address

    def is_valid(self, addr_dict):
        """Return True when ``addr_dict`` has a zip or a plausible state.

        A truthy ``'zip'`` entry is sufficient.  Otherwise the ``'state'``
        entry must be non-empty and either at least two characters long or
        listed in ``self.states``.  Missing keys count as empty.
        """
        # Plain conditionals instead of assert/except AssertionError:
        # asserts are stripped under ``python -O``, which made every address
        # valid in optimised runs.
        if addr_dict.get('zip'):
            return True
        state = addr_dict.get('state')
        # NOTE(review): ``self.states`` is lower-cased while ``state`` is
        # compared as-is, and the length test already accepts any value of
        # two or more characters -- confirm the intended rule.
        return bool(state) and (len(state) >= 2 or state in self.states)

    def parse(self):
        """Build the component dict for this address.

        Every project label maps to the space-joined text of its usaddress
        labels that were found, or ``None`` when none were.  The keys
        ``'valid'`` and ``'type'`` are added on top.
        """
        parts = self.get_parts()
        components = {}
        for mylabel, sources in self.labels.items():
            joined = ' '.join(parts[src] for src in sources if src in parts)
            components[mylabel] = joined or None
        # Validity is judged on the label components only, before the
        # bookkeeping keys are added.
        components['valid'] = self.is_valid(components)
        components['type'] = self.type
        return components
コード例 #3
0
ファイル: table.py プロジェクト: yoshi788/stagelib
class StageTable(Table):
    """A ``Table`` whose chunks are renamed, cleaned and conformed to a schema.

    Field mappings are read from a JSON file (``fields_path``) and the
    per-table schema from ``<CONFIGDIR>/schema/<table>.json``; every key of
    that schema file becomes an instance attribute (see ``load``).
    """

    CONFIGDIR = os.path.join(os.path.dirname(__file__), 'config')
    DEFAULT_FIELDSPATH = mkpath(CONFIGDIR, 'fields_config.json')

    def __init__(self,
                 path,
                 fields_path='',
                 learn=False,
                 table='',
                 omit=('\\', '='),
                 **kwds):
        """Store the options, load the configuration, then init ``Table``.

        Args:
            path: Passed to ``Table.__init__``.
            fields_path: JSON field-map file; ``DEFAULT_FIELDSPATH`` is used
                when empty.
            learn: When true, ``preprocess`` also runs ``field_mapper``.
            table: Table name used to locate the schema config.
            omit: Values forwarded to ``df.clean`` in ``normalize``.
            **kwds: Accepted for call compatibility; not used here.
        """
        self.learn = learn
        self.table = table
        self.fields_path = fields_path or self.DEFAULT_FIELDSPATH
        self.omit = omit
        # Configuration is loaded before the parent initialiser runs.
        self.load()
        super(StageTable, self).__init__(path)

    @staticmethod
    def get_config(path):
        """Return the JSON content at ``path``, or ``{}`` if it is falsy."""
        return from_json_if_exists(path) or {}

    @staticmethod
    def get_table_config(table):
        """Return the schema config stored for ``table`` (``{}`` if none)."""
        schema_path = mkpath(StageTable.CONFIGDIR, 'schema', table + '.json')
        return StageTable.get_config(schema_path)

    @staticmethod
    def get_table_fields(table, login=None):
        """Return the field list for ``table``.

        The ``'fields'`` entry of the table config wins; without it the
        fields are listed through ``Database(login=login)``.  ``login``
        defaults to an empty dict.
        """
        config = StageTable.get_table_config(table)
        if 'fields' in config:
            return config['fields']
        # A fresh dict per call instead of a shared mutable default.
        return Database(login={} if login is None else login).list_fields(table)

    @staticmethod
    def learnfields(df, fields_path='', table='', **kwds):
        """Learn a field mapping for ``df`` and write it to ``fields_path``.

        When ``fields_path`` is empty the file
        ``<CONFIGDIR>/fields_map/<table>_fields_map.json`` is used.  The
        mapping already stored there seeds ``learn_fields``; ``**kwds`` are
        forwarded to it.  Returns the learned mapping.
        """
        if not fields_path:
            fields_path = mkpath(StageTable.CONFIGDIR, 'fields_map',
                                 "%s_fields_map.json" % table)

        known = StageTable.get_config(fields_path)
        fields_map = learn_fields(df, known, **kwds)
        to_json(fields_path, fields_map)
        return fields_map

    @staticmethod
    def conform(df,
                table,
                learn=False,
                fields_path='',
                fields=None,
                fields_map=None,
                **kwds):
        """Rename ``df`` columns and select the table's fields.

        Args:
            df: DataFrame to conform; its columns are renamed IN PLACE.
            table: Table name used to look up fields when ``fields`` is empty.
            learn: When true, ``fields_map`` is extended via ``learnfields``.
            fields_path: Forwarded to ``learnfields``.
            fields: Columns to select; looked up with ``get_table_fields``
                when empty or omitted.
            fields_map: Rename mapping.  A dict passed by the caller is
                updated in place when ``learn`` is true.
            **kwds: Forwarded to ``get_table_fields``.

        Returns:
            ``df.ix[:, fields]``.
        """
        # None defaults: the former ``fields_map={}`` default was mutated by
        # ``update`` below, so learned mappings leaked into later calls.
        if fields_map is None:
            fields_map = {}
        if not fields:
            fields = StageTable.get_table_fields(table, **kwds)

        if learn:
            fields_map.update(
                StageTable.learnfields(df,
                                       fields_path=fields_path,
                                       table=table,
                                       fields=fields))
        df.rename(columns=fields_map, inplace=True)
        # NOTE(review): ``.ix`` was removed in pandas 1.0; kept because the
        # replacement (``reindex``/``loc``) differs for missing or duplicated
        # columns -- choose deliberately when upgrading pandas.
        return df.ix[:, fields]

    @property
    def fieldgroups(self):
        """``attrlist(self)`` pairs whose name ends in ``_fields`` and whose
        value is a list."""
        return [
            pair for pair in attrlist(self)
            if isinstance(pair[1], list) and pair[0].endswith('_fields')
        ]

    def _conform(self, df, **kwds):
        """Call ``conform`` with this instance's table and field map."""
        return self.conform(df, self.table, fields_map=self.fields_map, **kwds)

    def __iter__(self):
        """Yield each chunk of the parent iterator after ``normalize``."""
        for df in super(StageTable, self).__iter__():
            yield self.normalize(df)
            gc.collect()
        # NOTE(review): this turns the cyclic garbage collector off for the
        # whole process once iteration finishes -- confirm this is intended
        # (``gc.enable()`` may have been meant).
        gc.disable()
        gc.collect()

    def load(self):
        """Load the field map and copy the table config onto ``self``.

        Every key/value pair of the table config becomes an instance
        attribute.  A missing or empty config is silently ignored.
        """
        self.fields_map = self.get_config(self.fields_path)
        config = self.get_table_config(self.table)
        if not config:
            # The original raised an exception here and swallowed it at once
            # with a bare ``except``; a plain return has the same effect.
            return

        for key, value in config.items():
            setattr(self, key, value)

    def field_mapper(self):
        """Extend ``self.fields_map`` with mappings learned from each sample."""
        for _, df in self.samples.items():
            learned = self.learnfields(df,
                                       table=self.table,
                                       fields_path=self.fields_path)
            self.fields_map.update(learned)

    def preprocess(self):
        """Run the parent preprocessing, then learn fields if ``self.learn``."""
        super(StageTable, self).preprocess()
        if self.learn:
            self.field_mapper()

    def getfunc(self, name):
        """Return ``pd.Series.to_<prefix>`` for a field-group ``name``.

        ``<prefix>`` is the part of ``name`` before its first underscore.
        Presumably the project attaches matching ``to_*`` methods to
        ``pd.Series`` -- confirm.
        """
        return getattr(pd.Series, "to_{}".format(name.split('_')[0]))

    def normalize(self, df, *args, **kwds):
        """Rename, clean, convert and conform one chunk.

        Columns are renamed through ``self.fields_map`` and cleaned with
        ``self.omit``; each applied rename is logged; every non-empty field
        group is converted with its ``getfunc`` converter; finally the frame
        is conformed to the table's fields.  ``*args`` and ``**kwds`` are
        accepted but unused.
        """
        df = df.rename(columns=self.fields_map).clean(*self.omit)
        for k, v in self.fields_map.items():
            if v in df.columns:
                self.info((renformat(field=k, choice=v)))

        for name, fields in self.fieldgroups:
            if fields:
                # Was ``items=fieldlist`` -- an undefined name (NameError).
                cols = df.filter_fields(items=fields)
                df[cols] = df[cols].apply(self.getfunc(name))
        # ``conform`` needs the table name, so go through the instance
        # wrapper; ``self.conform(df)`` raised TypeError.
        return self._conform(df)
コード例 #4
0
ファイル: table.py プロジェクト: yoshi788/stagelib
 def get_table_config(table):
     """Return the schema config stored for ``table``.

     The config is read from ``<CONFIGDIR>/schema/<table>.json`` through
     ``StageTable.get_config``.
     """
     return StageTable.get_config(
         mkpath(StageTable.CONFIGDIR, 'schema', table + '.json'))
コード例 #5
0
ファイル: sql.py プロジェクト: yoshi788/stagelib
def df_2_update_file(df, table, outdir='', **kwds):
    """Write the UPDATE query generated for ``df`` to a timestamped file.

    The file is named ``update_<timestamp>.sql`` and placed in the directory
    returned by ``mkdir(outdir, '<table>_updates')``.

    Args:
        df: Data handed to ``df_2_update_query``.
        table: Table name; used for the query and the directory name.
        outdir: Parent directory passed to ``mkdir``.
        **kwds: Forwarded to ``df_2_update_query``.
    """
    target_dir = mkdir(outdir, '%s_updates' % table)
    # 24-hour ``%H`` instead of 12-hour ``%I``: without an AM/PM marker two
    # runs twelve hours apart produced the same name and overwrote each other.
    stamp = utcnow().strftime("%Y-%m-%d_%H.%M.%S")
    outfile = mkpath(target_dir, "update_%s.sql" % stamp)
    writedata(outfile, df_2_update_query(df, table, **kwds))