Example #1
    def clean_header(self):
        header = self.header
        if not header:
            return

        newheader = []
        timesseen = Counter()
        attridx = 0
        for value in header:
            try:
                # drop non-ascii chars, collapse whitespace to '_', and
                # strip any leftover junk at the start of the name
                ret = re_nonasciistart.sub('', re_space.sub('_', re_nonascii.sub('', value.strip()).strip()))
                ret = to_utf(ret)
                if not ret:
                    ret = 'attr%d' % attridx
                if re.match(r'\d+', ret):
                    # names can't start with a digit; prefix them
                    ret = 'n_%s' % ret
            except Exception:
                _log.info('clean_header\t%s', value)
                ret = 'attr%d' % attridx
            attridx += 1
            ret = ret.lower()
            if timesseen[ret] > 3:
                # give up: the same attribute appears more than three times
                self.header = None
                return
            elif timesseen[ret] > 0:
                # rename duplicates with a numeric suffix
                newheader.append('%s_%d' % (ret, timesseen[ret]))
            else:
                newheader.append(ret)
            timesseen[ret] += 1

        # XXX: ensure that header doesn't have overlapping values
        if len(set(newheader)) < len(newheader):
            _log.info("duplicate elements in header\t%s", str(newheader))
            self.header = None
        else:
            self.header = newheader
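
A minimal sketch of the helpers clean_header leans on; the regex patterns and to_utf below are assumptions standing in for dbtruck's real definitions, which may differ.

import re
from collections import Counter

re_nonascii = re.compile(r'[^\x00-\x7f]')          # non-ascii chars to drop
re_nonasciistart = re.compile(r'^[^a-zA-Z0-9_]+')  # leading junk to strip
re_space = re.compile(r'\s+')                      # whitespace runs -> '_'

def to_utf(s):
    # stand-in for dbtruck's to_utf helper
    return s.encode('utf-8', 'ignore') if isinstance(s, unicode) else str(s)

With these definitions, a header like ['Zip Code', 'Zip Code', '1st Qtr', ''] would come out roughly as ['zip_code', 'zip_code_1', 'n_1st_qtr', 'attr3'].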
Example #2
    def get_data_iter(self):
        # if the first row with data has a style and none of the other
        # rows do, then it is a header row
        sheet = self.s
        nrows = sheet.nrows
        rows = [sheet.row(i) for i in xrange(nrows)]
        if len(rows) <= 1:
            return None

        # skip empty rows
        # empty rows typically have much fewer non-empty columns
        # than the rest of the rows
        idx = 0
        while idx < len(rows):
            ncols = len(rows[idx])
            ncontents = len([c for c in rows[idx] if c.ctype != 0])
            # a row counts as populated relative to its own width (ncols),
            # not the sheet's row count
            if ncontents > 0.3 * ncols or ncontents > max(1, ncols - 2):
                break
            idx += 1
        rows = rows[idx:]
        
        # header detection (disabled): treat the first row as a header
        # when most of its cells are non-empty
        # header = None
        # if sum(1 for c in rows[0] if c.ctype != 0) > 0.8 * len(rows[0]):
        #     header = [c.value for c in rows[0]]
        #     rows = rows[1:]

        rows = [[to_utf(c.value) for c in r] for r in rows]

        return DataIterator(lambda: iter(rows), fname=self.fname)
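
For context, a hedged sketch of how the xlrd objects used above are typically obtained; 'report.xls' is a placeholder file name.

import xlrd

wb = xlrd.open_workbook('report.xls')
sheet = wb.sheet_by_index(0)
for i in xrange(sheet.nrows):
    row = sheet.row(i)  # list of xlrd Cell objects
    # ctype 0 is xlrd.XL_CELL_EMPTY, the same test used above
    nonempty = [c for c in row if c.ctype != xlrd.XL_CELL_EMPTY]
    print i, [c.value for c in nonempty]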
Example #3
    def _geocode(self, address, restriction=''):
        """
        Try to pick the best geocoder for the address. Google has a low
        daily limit, so prefer the other geocoders when possible.

        If the address looks like a standard street address (NUMBER WORDS+),
        use Yahoo, Bing, or geocoder.us.

        Otherwise, if it is a place name, try GeoNames.

        If all else fails, use Google.
        """
        geocoder = None
        format_string = self.get_format_string(restriction)
        query = to_utf((format_string % address).lower())

        if not query:
            return []

        if query in self.cache:
            try:
                return pickle.loads(self.cache[query])
            except Exception:
                # corrupt cache entry; fall through and geocode again
                pass

        if re_addr2.search(address) and restriction:
            # looks like a street address: spread load across the cheaper geocoders
            rand = random.random()
            if rand < 0.4:
                geocoder = geocoders.Yahoo(settings.YAHOO_APPID)
            elif rand < 0.8:
                geocoder = geocoders.Bing(settings.BING_APIKEY)
            else:
                geocoder = geocoders.GeocoderDotUS()
        else:
            geocoder = geocoders.GeoNames()

        try:
            result = geocoder.geocode(query, exactly_one=False)
            self.ncalls += 1
        except Exception:
            # fall back to Google, which is rate limited but reliable
            try:
                geocoder = geocoders.Google()
                result = geocoder.geocode(query, exactly_one=False)
                self.ncalls += 1
            except Exception:
                result = []

        self.cache[query] = pickle.dumps(result)
        return result
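
One plausible shape for re_addr2, following the docstring's "NUMBER WORDS+" heuristic; the project's actual pattern is not shown here and may differ.

import re

re_addr2 = re.compile(r'^\s*\d+\s+\w+', re.I)  # e.g. "77 Massachusetts Ave"

for s in ('77 Massachusetts Ave', 'Central Park'):
    print s, bool(re_addr2.search(s))  # True for the street address only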
Example #4
File: pg.py Project: imclab/dbtruck
    def prepare_row_for_copy(self, row):
        newrow = []
        for col in row:
            if col is None:
                newrow.append('NULL')
            elif isinstance(col, basestring):
                # tabs are the COPY delimiter, so squash them; carriage
                # returns must appear as the escaped sequence '\r'
                newrow.append(cr_re.sub(r'\r', to_utf(col).replace('\t', ' ')))
            else:
                newrow.append(str(col).replace('\t', ' '))

        # pad short rows with NULLs and truncate long ones so every row
        # matches the table's column count
        if len(newrow) < len(self.types):
            newrow += ['NULL'] * (len(self.types) - len(newrow))
        if len(newrow) > len(self.types):
            newrow = newrow[:len(self.types)]
        return newrow
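
A sketch of how rows prepared this way typically feed Postgres COPY through psycopg2; copy_rows and the loader argument are hypothetical names, and the sep/null settings must match what prepare_row_for_copy emits.

from cStringIO import StringIO

def copy_rows(cur, table, rows, loader):
    buf = StringIO()
    for row in rows:
        buf.write('\t'.join(loader.prepare_row_for_copy(row)) + '\n')
    buf.seek(0)
    # tab-separated with the literal string NULL for missing values
    cur.copy_from(buf, table, sep='\t', null='NULL')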
Example #5
def populate_shadow_cols(db, tablemd, schema):
    tablename = tablemd.tablename
    colnames = schema.columns.keys()

    arg = ','.join(colnames)
    resproxy = db.execute("""select %s from %s order by id asc""" % (arg, tablename))

    annotations = defaultdict(list)
    for anno in tablemd.annotations:
        annotations[anno.name].append((anno.loctype, anno.extractor()))

    def annotate_shadow(shadow_data, loctype, vals):
        for sd, v in zip(shadow_data, vals):
            sd[loctype] = v

    while True:
        rows = resproxy.fetchmany(2000)
        if not rows:
            break
        
        coldatas = zip(*rows)
        ids = None
        shadow_data = [dict() for row in rows]
        for cn, cd in zip(colnames, coldatas):
            # clean values but keep the list aligned with the rows;
            # dropping falsy values here would misalign the zips below
            cd = [re_badchar.sub(' ', to_utf(v)).lower().strip() if v else '' for v in cd]
            annos = annotations[cn]
            for loctype, extractor in annos:
                extracted = map(extractor, cd)
                if loctype == 'latlon':
                    lats, lons = zip(*extracted)

                    annotate_shadow(shadow_data, 'latitude', lats)
                    annotate_shadow(shadow_data, 'longitude', lons)
                else:
                    annotate_shadow(shadow_data, loctype, extracted)

            ids = cd if cn == 'id' else ids

        print 'saving', len(rows)
        save_shadow(db, tablename, ids, shadow_data)


    # report whether the table still lacks latitude/longitude annotations
    loctypes = set(anno.loctype for anno in tablemd.annotations)
    if ('latlon' in loctypes or
        ('latitude' in loctypes and 'longitude' in loctypes)):
        return False
    return True
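
A toy illustration of the shadow_data structure the loop builds: one dict per row, keyed by location type, which save_shadow then persists.

def annotate_shadow(shadow_data, loctype, vals):
    for sd, v in zip(shadow_data, vals):
        sd[loctype] = v

shadow_data = [dict() for _ in range(2)]
annotate_shadow(shadow_data, 'zipcode', ['02139', '10001'])
annotate_shadow(shadow_data, 'state', ['ma', 'ny'])
print shadow_data
# [{'zipcode': '02139', 'state': 'ma'}, {'zipcode': '10001', 'state': 'ny'}]
# (dict key order may vary)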
Example #6
    def clean_header(self):
        """Remove non-ascii values from the header, fill in any gaps with generic
        attribute names, and rename duplicate attributes. Fails if the same attribute
        is present more than three times."""

        header = self.header
        if not header:
            return

        newheader = []
        timesseen = Counter()
        attridx = 0
        for value in header:
            try:
                # drop non-ascii chars, collapse whitespace to '_', and
                # strip any leftover junk at the start of the name
                ret = re_nonasciistart.sub('', re_space.sub('_', re_nonascii.sub('', value.strip()).strip()))
                ret = to_utf(ret)
                if not ret:
                    ret = 'attr%d' % attridx
                if re.match(r'\d+', ret):
                    # names can't start with a digit; prefix them
                    ret = 'n_%s' % ret
            except Exception:
                _log.info('clean_header\t%s', value)
                ret = 'attr%d' % attridx
            attridx += 1
            ret = ret.lower()
            if timesseen[ret] > 3:
                # give up: the same attribute appears more than three
                # times, as the docstring promises
                self.header = None
                return
            elif timesseen[ret] > 0:
                # rename duplicates with a numeric suffix
                newheader.append('%s_%d' % (ret, timesseen[ret]))
            else:
                newheader.append(ret)
            timesseen[ret] += 1

        # ensure that header doesn't have overlapping values
        if len(set(newheader)) < len(newheader):
            _log.info("duplicate elements in header\t%s", str(newheader))
            self.header = None
        else:
            self.header = newheader
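
A quick sketch of the documented failure path; loader stands in for any instance exposing header and clean_header, and assumes the helper definitions sketched under Example #1.

loader.header = ['name', 'name', 'name', 'name', 'name']
loader.clean_header()
print loader.header  # None: 'name' appeared more than three times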
Example #7
    def get_data_iter(self):
        # this sheet type has no header row
        sheet = self.s
        nrows = sheet.nrows
        rows = [sheet.row(i) for i in xrange(nrows)]
        if len(rows) <= 1:
            return None

        # skip empty rows
        # empty rows typically have much fewer non-empty columns
        # than the rest of the rows
        idx = 0
        while idx < len(rows):
            ncols = len(rows[idx])
            ncontents = len([c for c in rows[idx] if c.ctype != 0])
            # a row counts as populated relative to its own width (ncols),
            # not the sheet's row count
            if ncontents > 0.3 * ncols or ncontents > max(1, ncols - 2):
                break
            idx += 1
        rows = rows[idx:]

        rows = [[to_utf(c.value) for c in r] for r in rows]

        return DataIterator(lambda: iter(rows), fname=self.fname)
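
A sketch of consuming the result; this DataIterator is a minimal stand-in for dbtruck's class, assuming only that it wraps a zero-argument callable returning a fresh iterator.

class DataIterator(object):
    def __init__(self, make_iter, fname=None):
        self.make_iter = make_iter
        self.fname = fname
    def __iter__(self):
        return self.make_iter()

data = DataIterator(lambda: iter([['a', 'b'], ['1', '2']]), fname='report.xls')
for row in data:
    print row
for row in data:  # a second pass works: the lambda rebuilds the iterator
    print row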
Example #8
def possible_loc(colname, vals):
    def is_ok(new_vals, maxposs=vals, thresh=0.65):
        # count the values a parser managed to extract something from
        n = 0
        for v in new_vals:
            if isinstance(v, (list, tuple)):
                if any(v):
                    n += 1
            elif v is not None:
                n += 1
        # succeed only if more than thresh of the candidates parsed
        if float(n) > thresh * len(maxposs):
            return n
        return False

    vals = [re_badchar.sub(' ', to_utf(v)).lower().strip() for v in vals if v]
    nonempty = [v for v in vals if v]
    colname = colname.lower().strip()
    ret = {}

    if 'lat' in colname:
        lats = map(parse_lat, vals)
        if is_ok(lats, nonempty, thresh=0.8):
            ret['latitude'] = 'parse_lat'

    if 'lon' in colname:
        lons = map(parse_lon, vals)
        if is_ok(lons, nonempty, thresh=0.8):
            ret['longitude'] = 'parse_lon'

    if 'latitude' in ret and 'longitude' in ret:
        return ret

    if is_ok(map(parse_coords, vals), nonempty, thresh=0.5):
        ret['latlon'] = 'parse_coords'
        return ret

    if 'zip' in colname:
        zips = map(parse_zip, vals)
        if is_ok(zips, nonempty):
            return {"zipcode" : 'parse_zip'}
            
    if colname.startswith('st'):
        states = map(parse_state, vals)
        if is_ok(states, nonempty):
            return {'state': 'parse_state'}

    zips = map(parse_per_word_zip, vals)
    if is_ok(zips, nonempty, thresh=0.8):
        ret['zipcode'] = 'parse_per_word_zip'

    states = map(parse_per_word_state, vals)
    if is_ok(states, nonempty, thresh=0.8):
        ret['state'] = 'parse_per_word_state'


    # TODO: county codes, countries

    # street addresses (number string string suffix): the column may not be
    # a single attribute, so look for addresses embedded in the text
    addrs = map(parse_addrs, vals)
    if is_ok(addrs, nonempty, thresh=0.55):
        ret['address'] = 'parse_addrs'
    return ret
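
Hypothetical shapes for two of the parse_* helpers possible_loc dispatches to; each returns a parsed value or None so is_ok can count successes. The real extractors in the project may be more involved.

import re

def parse_zip(s):
    # five-digit zip, optionally with a +4 suffix
    m = re.search(r'\b(\d{5})(?:-\d{4})?\b', s)
    return m.group(1) if m else None

def parse_state(s):
    # tiny lookup table for illustration only
    states = {'ma': 'MA', 'massachusetts': 'MA', 'ny': 'NY', 'new york': 'NY'}
    return states.get(s.strip())  # None when the value is not a state

for v in ('02139', '10001-1234', 'n/a'):
    print v, parse_zip(v)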