def __init__(self, terms, excep, canonize, weight=0):
    """Store the weight and canonize flag, then compile the match pattern.

    ``self.pattern`` is read as a ``str.format`` template before being
    replaced by the compiled regex (presumably it is a class-level
    template containing a ``{term}`` placeholder -- confirm on the class).

    Args:
        terms: Text substituted for ``{term}`` in the template; passed
            through ``canonical_form`` first when ``canonize`` is truthy.
        excep: Optional text that must NOT immediately precede a match.
            When truthy, a negative lookbehind ``(?<!{excep})`` followed by
            ``\\s*`` is prepended to the template.
        canonize: When truthy, ``terms`` (and ``excep``, if given) are
            normalised with ``canonical_form`` before formatting.
        weight: Value stored as ``int(weight)``; ``None`` is treated as 0.
    """
    self.weight = 0 if weight is None else int(weight)
    self.canonize = canonize
    normalise = self.canonize

    # Work on a local copy of the template; self.pattern is only rebound
    # once, to the compiled regex.
    template = self.pattern
    if normalise:
        terms = canonical_form(terms)
    if excep:
        template = r"(?<!{excep})\s*" + template
        if normalise:
            excep = canonical_form(excep)

    self.pattern = re.compile(template.format(term=terms, excep=excep))
def add_geos(table):
    """Return a copy of *table* with an extra ``geoentity`` column.

    For every row, each string cell is looked up in a ``TermsDB`` (using
    both the raw cell and its ``canonical_form``); all terms found in the
    row are collected into one ``GeoEntity``, which is then geocoded with a
    shared ``Geocoder``. Progress is written to stdout as ``done / total``.

    Args:
        table: A pandas DataFrame; only ``str`` cells are searched.

    Returns:
        A new DataFrame: *table* plus a ``geoentity`` column holding one
        ``GeoEntity`` per row, aligned on *table*'s own index.
    """
    geocoder = Geocoder()
    try:
        terms_db = TermsDB()
        entities = []
        total = len(table)
        # Count rows by position: the index label is not necessarily a
        # 0..n-1 counter, so it is not a reliable progress indicator.
        for position, (_, row) in enumerate(table.iterrows(), start=1):
            row_terms = []
            for cell in row:
                if not isinstance(cell, str):
                    continue
                terms = terms_db.search(cell, canonical_form(cell))
                if terms:
                    row_terms.extend(terms)

            entity = GeoEntity(row_terms)
            entity.geocode(geocoder)
            entities.append(entity)

            # print progress
            sys.stdout.write("\r %s / %s" % (position, total))
            sys.stdout.flush()
        print("")
    finally:
        # Release the geocoder even if a lookup or geocoding call fails.
        geocoder.close()

    # Reuse the table's index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex would misalign rows of a filtered/re-indexed table.
    geo_column = pd.Series(entities, name="geoentity", index=table.index)
    return pd.concat([table, geo_column], axis=1)
def geocode_list(self, strings):
    """Build one geocoded GeoEntity from every term found in *strings*.

    Each string is searched in ``self.terms_db`` together with its
    ``canonical_form``; non-empty results are accumulated in order. The
    combined terms are wrapped in a ``GeoEntity``, which is geocoded with
    this object and returned.
    """
    collected = []
    for text in strings:
        matches = self.terms_db.search(text, canonical_form(text))
        if not matches:
            continue
        collected.extend(matches)

    entity = GeoEntity(collected)
    entity.geocode(self)
    return entity
def load_plans(ensemble_paths):
    """Load pickled plan ensembles and canonicalise every plan.

    Args:
        ensemble_paths: Mapping of ensemble name -> path of a pickle file
            containing an iterable of plans.

    Returns:
        A dict mapping each ensemble name to the list of
        ``canonical_form(plan)`` results, in the order the plans were
        stored. No de-duplication is performed here.
    """
    def _read_canonical(path):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # ensemble files from a trusted source.
        with open(path, 'rb') as handle:
            unpickled = pickle.load(handle)
        return [canonical_form(item) for item in unpickled]

    return {
        name: _read_canonical(location)
        for name, location in ensemble_paths.items()
    }