def clean_header(self):
    header = self.header
    if not header:
        return

    newheader = []
    timesseen = Counter()
    attridx = 0
    for value in header:
        try:
            # strip non-ascii characters and turn whitespace into underscores
            ret = re_nonasciistart.sub('', re_space.sub('_', re_nonascii.sub('', value.strip()).strip()))
            ret = to_utf(ret)
            if not ret:
                ret = 'attr%d' % attridx
            if re.match(r'\d+', ret):
                ret = 'n_%s' % ret
        except:
            _log.info('clean_header\t%s', value)
            ret = 'attr%d' % attridx
        attridx += 1

        ret = ret.lower()
        if timesseen[ret] > 3:
            # same attribute name seen more than three times: give up
            break
        elif timesseen[ret] > 0:
            newheader.append('%s_%d' % (ret, timesseen[ret]))
        else:
            newheader.append(ret)
        timesseen[ret] += 1

    # XXX: ensure that header doesn't have overlapping values
    if len(set(newheader)) < len(newheader):
        _log.info("duplicate elements in header\t%s", str(newheader))
        self.header = None
    else:
        self.header = newheader
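# Rough illustration of the cleaning behavior (hypothetical header values, and
# assuming re_nonascii / re_space strip non-ascii characters and replace
# whitespace with underscores): a header like
#   ['Café Name', 'Café Name', '2010', '']
# would come out roughly as
#   ['caf_name', 'caf_name_1', 'n_2010', 'attr3']
# duplicates get a numeric suffix, purely numeric names get an 'n_' prefix,
# and empty names fall back to generic 'attrN' columns.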
def get_data_iter(self):
    # if the first row with data has style and none of the other rows have
    # style, then it's a header row
    sheet = self.s
    nrows = sheet.nrows
    rows = [sheet.row(i) for i in xrange(nrows)]
    if len(rows) <= 1:
        return None

    # skip empty rows
    # empty rows typically have much fewer non-empty columns
    # than the rest of the rows
    idx = 0
    while idx < len(rows):
        # ctype 0 is xlrd's "empty cell" type
        ncontents = len([c for c in rows[idx] if c.ctype != 0])
        if ncontents > 0.3 * nrows or ncontents > max(1, nrows - 2):
            break
        idx += 1
    rows = rows[idx:]

    # header = None
    # if sum(1 for c in rows[0] if c.) > 0.8 * len(rows[0]):
    #     header = [c.value for c in rows[0]]
    #     rows = rows[1:]

    rows = [[to_utf(c.value) for c in r] for r in rows]
    return DataIterator(lambda: iter(rows), fname=self.fname)
def _geocode(self, address, restriction=''):
    """
    Try to pick the best geocoder for the address.
    Google has a low daily limit, so try other geocoders when possible.

    If the address looks like a standard street address (NUMBER WORDS+),
    use Yahoo, Bing, or geocoder.us.  Otherwise, if it is a place name,
    try GeoNames.  If all else fails, fall back to Google.
    """
    geocoder = None
    format_string = self.get_format_string(restriction)
    query = to_utf((format_string % address).lower())
    if not query:
        return []

    # return a cached result if we have already geocoded this query
    if query in self.cache:
        try:
            return pickle.loads(self.cache[query])
        except:
            pass

    if re_addr2.search(address) and restriction:
        # looks like a street address: pick one of the higher-limit geocoders at random
        rand = random.random()
        if rand < 0.4:
            geocoder = geocoders.Yahoo(settings.YAHOO_APPID)
        elif rand < 0.8:
            geocoder = geocoders.Bing(settings.BING_APIKEY)
        else:
            geocoder = geocoders.GeocoderDotUS()
    else:
        geocoder = geocoders.GeoNames()

    try:
        result = geocoder.geocode(query, exactly_one=False)
        self.ncalls += 1
    except:
        # fall back to Google if the chosen geocoder fails
        try:
            geocoder = geocoders.Google()
            result = geocoder.geocode(query, exactly_one=False)
            self.ncalls += 1
        except:
            result = []

    self.cache[query] = pickle.dumps(result)
    return result
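# Hedged usage sketch (the 'MA' restriction and the example address are
# illustrative, not from the source).  With the old geopy API used here,
# geocode(..., exactly_one=False) yields (place, (lat, lon)) tuples, and the
# cache maps the formatted query string to the pickled result list:
#
#   results = self._geocode('77 massachusetts ave', restriction='MA')
#   for place, (lat, lon) in results:
#       print place, lat, lon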
def prepare_row_for_copy(self, row):
    newrow = []
    for col in row:
        if col is None:
            newrow.append('NULL')
        elif isinstance(col, basestring):
            # normalize line breaks and strip tabs, which are the column delimiter
            newrow.append(cr_re.sub('\r', to_utf(col).replace('\t', ' ')))
        else:
            newrow.append(str(col).replace('\t', ' '))

    # pad or truncate the row so it matches the expected number of columns
    if len(newrow) < len(self.types):
        newrow += ['NULL'] * (len(self.types) - len(newrow))
    if len(newrow) > len(self.types):
        newrow = newrow[:len(self.types)]
    return newrow
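# Hedged usage sketch (the '\t'.join and the COPY options are assumptions about
# the caller, which is not shown here): the prepared row is meant to be joined
# into a tab-delimited line for a bulk load where 'NULL' marks missing values,
# roughly:
#
#   line = '\t'.join(self.prepare_row_for_copy(row))
#   # COPY tablename FROM STDIN WITH DELIMITER '\t' NULL 'NULL'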
def populate_shadow_cols(db, tablemd, schema):
    tablename = tablemd.tablename
    colnames = schema.columns.keys()
    arg = ','.join(colnames)
    resproxy = db.execute("""select %s from %s order by id asc""" % (arg, tablename))

    # group the extractors for each annotated column by column name
    annotations = defaultdict(list)
    for anno in tablemd.annotations:
        annotations[anno.name].append((anno.loctype, anno.extractor()))

    def annotate_shadow(shadow_data, loctype, vals):
        for sd, v in zip(shadow_data, vals):
            sd[loctype] = v

    while True:
        rows = resproxy.fetchmany(2000)
        if not rows:
            break

        coldatas = zip(*rows)
        ids = None
        shadow_data = [dict() for row in rows]
        for cn, cd in zip(colnames, coldatas):
            cd = [re_badchar.sub(' ', to_utf(v)).lower().strip() for v in cd if v]
            annos = annotations[cn]
            for loctype, extractor in annos:
                extracted = map(extractor, cd)
                if loctype == 'latlon':
                    # a latlon extractor returns (lat, lon) pairs; store them separately
                    lats, lons = zip(*extracted)
                    annotate_shadow(shadow_data, 'latitude', lats)
                    annotate_shadow(shadow_data, 'longitude', lons)
                else:
                    annotate_shadow(shadow_data, loctype, extracted)
            ids = cd if cn == 'id' else ids

        print 'saving', len(rows)
        save_shadow(db, tablename, ids, shadow_data)

    loctypes = set([anno.loctype for anno in tablemd.annotations])
    if ('latlon' in loctypes or
            ('latitude' in loctypes and 'longitude' in loctypes)):
        return False
    return True
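# Rough illustration (hypothetical values): each fetched row gets a "shadow"
# dict keyed by location type, built from the extractors attached to the
# table's annotations, e.g.
#   {'latitude': '42.36', 'longitude': '-71.09', 'zipcode': '02139'}
# save_shadow() then persists one such dict per row id.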
def clean_header(self):
    """Remove non-ascii characters from the header, fill in any gaps with
    generic attribute names, and rename duplicate attributes.  Gives up if
    the same attribute is present more than three times."""
    header = self.header
    if not header:
        return

    newheader = []
    timesseen = Counter()
    attridx = 0
    for value in header:
        try:
            ret = re_nonasciistart.sub('', re_space.sub('_', re_nonascii.sub('', value.strip()).strip()))
            ret = to_utf(ret)
            if not ret:
                ret = 'attr%d' % attridx
            if re.match(r'\d+', ret):
                ret = 'n_%s' % ret
        except:
            _log.info('clean_header\t%s', value)
            ret = 'attr%d' % attridx
        attridx += 1

        ret = ret.lower()
        if timesseen[ret] > 3:
            # same attribute present more than three times: give up
            break
        elif timesseen[ret] > 0:
            newheader.append('%s_%d' % (ret, timesseen[ret]))
        else:
            newheader.append(ret)
        timesseen[ret] += 1

    # ensure that header doesn't have overlapping values
    if len(set(newheader)) < len(newheader):
        _log.info("duplicate elements in header\t%s", str(newheader))
        self.header = None
    else:
        self.header = newheader
def get_data_iter(self):
    # No header in this type
    sheet = self.s
    nrows = sheet.nrows
    rows = [sheet.row(i) for i in xrange(nrows)]
    if len(rows) <= 1:
        return None

    # skip empty rows
    # empty rows typically have much fewer non-empty columns
    # than the rest of the rows
    idx = 0
    while idx < len(rows):
        ncontents = len([c for c in rows[idx] if c.ctype != 0])
        if ncontents > 0.3 * nrows or ncontents > max(1, nrows - 2):
            break
        idx += 1
    rows = rows[idx:]

    rows = [[to_utf(c.value) for c in r] for r in rows]
    return DataIterator(lambda: iter(rows), fname=self.fname)
def possible_loc(colname, vals):
    def is_ok(new_vals, maxposs=vals, thresh=0.65):
        # count successfully parsed values and require that they make up more
        # than thresh of the candidate values
        n = 0
        for v in new_vals:
            if isinstance(v, (list, tuple)):
                if filter(lambda s: s, v):
                    n += 1
            else:
                if v is not None:
                    n += 1
        if float(n) > thresh * len(maxposs):
            return n
        return False

    vals = [re_badchar.sub(' ', to_utf(v)).lower().strip() for v in vals if v]
    nonempty = [v for v in vals if v]
    colname = colname.lower().strip()
    ret = {}

    if 'lat' in colname:
        lats = map(parse_lat, vals)
        if is_ok(lats, nonempty, thresh=0.8):
            ret['latitude'] = 'parse_lat'
    if 'lon' in colname:
        lons = map(parse_lon, vals)
        if is_ok(lons, nonempty, thresh=0.8):
            ret['longitude'] = 'parse_lon'
    if 'latitude' in ret and 'longitude' in ret:
        return ret
    if is_ok(map(parse_coords, vals), nonempty, thresh=0.5):
        ret['latlon'] = 'parse_coords'
        return ret

    if 'zip' in colname:
        zips = map(parse_zip, vals)
        if is_ok(zips, nonempty):
            return {'zipcode': 'parse_zip'}
    if colname.startswith('st'):
        states = map(parse_state, vals)
        if is_ok(states, nonempty):
            return {'state': 'parse_state'}

    zips = map(parse_per_word_zip, vals)
    if is_ok(zips, nonempty, thresh=0.8):
        ret['zipcode'] = 'parse_per_word_zip'
    states = map(parse_per_word_state, vals)
    if is_ok(states, nonempty, thresh=0.8):
        ret['state'] = 'parse_per_word_state'

    # county codes
    # countries
    # street addresses (number string string suffix)

    # column is not a single attribute, let's look for composite data
    # ok, maybe it's embedded in the text??
    addrs = map(parse_addrs, vals)
    if is_ok(addrs, nonempty, thresh=0.55):
        ret['address'] = 'parse_addrs'
    return ret
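# Rough illustration (hypothetical values, and assuming the parse_* helpers
# behave as their names suggest): the return value maps each detected location
# type to the name of the extractor that recognized it, e.g. a column named
# 'state' whose values mostly parse as US states would yield
#   {'state': 'parse_state'}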