import codecs
import os

from noaho import NoAho


def load_phrase_list(data_path, phrase_list):
    phrase_size = len(phrase_list)
    phrase_max = max(phrase_size, 1)
    pdict = []
    phrase_to_id = dict()
    for i in range(len(phrase_list)):
        phrase_to_id[phrase_list[i]] = i
        # One trie per phrase list; each phrase is stored lowercased with the
        # list index as its payload.
        pdict.append(NoAho())
        with codecs.open(os.path.join(data_path, phrase_list[i] + ".plist"),
                         "r", "utf8") as f:
            for line in f:
                pdict[i].add(line.strip().lower(), i)
    return pdict, phrase_to_id, phrase_size, phrase_max
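
# A minimal usage sketch for the loader above, under stated assumptions: the
# files ./data/color.plist and ./data/animal.plist are hypothetical (one
# phrase per line), and, as NoAho's own tests show, each trie must be
# compile()d before searching.
if __name__ == '__main__':
    pdict, phrase_to_id, phrase_size, phrase_max = load_phrase_list(
        './data', ['color', 'animal'])
    sentence = 'the quick red fox jumps over the lazy dog'
    for name, i in phrase_to_id.items():
        pdict[i].compile()
        # findall_long yields (start, end, payload) tuples for the longest
        # non-overlapping matches; the payload is the phrase-list id.
        for start, end, phrase_id in pdict[i].findall_long(sentence):
            print('%s: %s (id %d)' % (name, sentence[start:end], phrase_id))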
def process(self, rev, DEBUGGING_MODE=False):
    rev['text'] = clean(rev['text'])
    #print(rev['text'])
    pid = rev['page_id']
    rev['diff'] = list(diff(self.latest_content[pid], rev['text']))
    if pid not in self.pages:
        self.previous_comments[pid] = NoAho()
        self.latest_content[pid] = ""
        updated_page = self.page_creation(rev)
        old_page = updated_page
    else:
        old_page = self.pages[rev['page_id']]
    self.latest_content[pid] = rev['text']
    try:
        actions, updated_page = insert(rev, old_page,
                                       self.previous_comments[pid],
                                       DEBUGGING_MODE)
    except:
        e_type, e_val, tb = sys.exc_info()
        traceback.print_tb(tb)
        traceback.print_exception(e_type, e_val, tb)
        tb_info = traceback.extract_tb(tb)
        filename, line, func, text = tb_info[-1]
        self.save('%s_error_stopped.json' % (rev['rev_id']))
        print('An error occurred on line {} in statement {} when parsing '
              'revision {}'.format(line, text, rev['rev_id']))
        print('Intermediate file has been saved in %s_error_stopped.json, '
              'load from it to continue when ready.' % (rev['rev_id']))
        if self.tracking_file is not None:
            self.tracking_file.close()
        return
    self.pages[pid] = updated_page
    for action in actions:
        action['page_id'] = pid
        action['page_title'] = rev['page_title']
        # if (action['type'] == 'COMMENT_ADDING' or action['type'] == 'COMMENT_MODIFICATION' or action['type'] == 'SECTION_CREATION') and len(action['content']) > self.THERESHOLD:
        if action['type'] == 'COMMENT_REMOVAL' and \
           len(action['content']) > self.THERESHOLD:
            self.previous_comments[pid].add(
                ''.join(action['content']),
                (action['parent_id'], action['indentation']))
            if self.tracking_file is not None:
                self.tracking_file.write(json.dumps(
                    [pid, ''.join(action['content']),
                     (action['parent_id'], action['indentation'])]) + '\n')
    return actions
def load(self, FILENAME, COMMENT_TRACKING_FILE=None):
    BASE_DIR = 'json_dumps'
    with open(os.path.join(BASE_DIR, FILENAME)) as f:
        self.pages, self.THERESHOLD, self.latest_content = json.load(f)
    self.previous_comments = {}
    for pid in self.pages.keys():
        self.previous_comments[pid] = NoAho()
        # JSON turns the integer action keys into strings and the value
        # tuples into lists; convert them back.
        updated_actions = {}
        for act, val in self.pages[pid]['actions'].items():
            updated_actions[int(act)] = tuple(val)
        self.pages[pid]['actions'] = updated_actions
    if COMMENT_TRACKING_FILE is not None:
        # Replay previously tracked removals, then keep appending new ones.
        with open(COMMENT_TRACKING_FILE, "r") as f:
            for line in f:
                pid, key, val = json.loads(line)
                self.previous_comments[pid].add(key, val)
        self.tracking_file = open(COMMENT_TRACKING_FILE, "a")
    else:
        self.tracking_file = None
def load_address_database(self, csv_file):
    reader = csv.reader(csv_file, delimiter=',')
    reader.next()  # skip the header row
    addr_hash = {}
    for idx, row in enumerate(reader):
        row_type = int(row[-1])
        if row_type != 1:
            continue
        street = row[0].strip()
        if not row[1]:
            continue
        num = int(row[1])
        if not num:
            continue
        num2 = row[2]
        if not num2:
            num2 = None
        letter = row[3]
        muni_name = row[10]
        coord_n = int(row[8])
        coord_e = int(row[9])
        if muni_name != "Helsinki":
            continue
        e = {'muni': muni_name, 'street': street, 'num': num,
             'num_end': num2, 'letter': letter,
             'coord_n': coord_n, 'coord_e': coord_e}
        street = street.lower().decode('utf8')
        if street in addr_hash:
            addr_hash[street].append(e)
        else:
            addr_hash[street] = [e]
    self.street_hash = addr_hash
    self.street_tree = NoAho()
    print "%d street names loaded" % len(self.street_hash)
    for street in self.street_hash.keys():
        self.street_tree.add(street)
class Conversation_Constructor:
    def __init__(self):
        # Deleted comments with fewer than COMMENT_LOWERBOUND tokens will not
        # be recorded, and thus not considered in comment restoration actions,
        # to reduce confusion.
        self.COMMENT_LOWERBOUND = 10
        self.COMMENT_UPPERBOUND = 1000
        self.deleted_records = {}

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        return page

    def load(self, deleted_comments):
        """Load the previous page state, deleted comments and other information."""
        self.deleted_records = {}
        self.previous_comments = NoAho()
        for pair in deleted_comments:
            self.previous_comments.add(pair[0], (pair[1], int(pair[2])))
            self.deleted_records[pair[1]] = True
        return

    def convert_diff_format(self, x, a, b):
        ret = x
        if x['name'] == 'insert':
            ret['tokens'] = b[x['b1']:x['b2']]
        if x['name'] == 'delete':
            ret['tokens'] = a[x['a1']:x['a2']]
        return ret

    def mydiff_toDelta(self, diffs):
        """Crush the diff into a list of dictionaries indicating changes from
        one document to another. Operations are dictionary records with a name
        (insert, delete, equal) and offsets (in the original text and the
        resulting text).

        Args:
          diffs: Array of diff tuples.
        Returns:
          Deltas.
        """
        a = 0
        b = 0
        DIFF_DELETE = -1
        DIFF_INSERT = 1
        DIFF_EQUAL = 0
        for (op, data) in diffs:
            if op == DIFF_INSERT:
                yield {"name": "insert", "a1": a, "a2": a,
                       "b1": b, "b2": b + len(data)}
                b += len(data)
            elif op == DIFF_DELETE:
                yield {"name": "delete", "a1": a, "a2": a + len(data),
                       "b1": b, "b2": b}
                a += len(data)
            elif op == DIFF_EQUAL:
                yield {"name": "equal", "a1": a, "a2": a + len(data),
                       "b1": b, "b2": b + len(data)}
                a += len(data)
                b += len(data)

    def clean_dict(self, page, the_dict):
        """We only store the information of currently 'alive' actions.

        Definition of alive:
          - The action was a deletion that happened recently, hence might be
            restored later.
          - The action is still present on the page, hence might be
            modified/removed/replied to.
        """
        keylist = list(the_dict.keys())
        ret = the_dict
        alive_actions = set([action[0] for action in page['actions'].values()])
        for action in keylist:
            if not (action in alive_actions or action in self.deleted_records):
                del ret[action]
        return ret

    def process(self, page_state, latest_content, rev):
        logging.debug("DEBUGGING MODE on REVISION %s" % rev['rev_id'])
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE ANYTHING: %d KB." % memory_usage)
        # Clean the HTML format of the revision.
        rev['text'] = clean_html(rev['text'])
        # Compute the diff between the latest processed revision and the
        # current one.
        dmp = dmp_module.diff_match_patch()
        logging.debug("LENGTH : %d -> %d" %
                      (len(latest_content), len(rev['text'])))
        diff = dmp.diff_main(latest_content, rev['text'], False)
        dmp.diff_cleanupSemantic(diff)
        delta = self.mydiff_toDelta(diff)
        rev['diff'] = sorted([self.convert_diff_format(x, latest_content,
                                                       rev['text'])
                              for x in delta], key=lambda k: k['a1'])
        # Create a new page if this page was never processed before.
        if not page_state:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
            page_state = {'rev_id': int(rev['rev_id']),
                          'timestamp': rev['timestamp'],
                          'page_id': rev['page_id'],
                          'deleted_comments': [],
                          'conversation_id': {},
                          'authors': {},
                          'ancestor_id': {}}
        else:
            page_state['rev_id'] = int(rev['rev_id'])
            page_state['timestamp'] = rev['timestamp']
            old_page = page_state['page_state']
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE BEFORE PROCESSING: %d KB." % memory_usage)
        # Process the revision to get the actions and update the page state.
        actions, updated_page = insert(rev, old_page, self.previous_comments,
                                       self.COMMENT_LOWERBOUND)
        page_state['page_state'] = updated_page
        # Post-process the actions:
        for action in actions:
            # If the action adds new content:
            #  - locate which conversation it belongs to
            #  - record the author into the author list of the comment
            if action['type'] == 'ADDITION' or action['type'] == 'MODIFICATION' \
               or action['type'] == 'CREATION':
                if action['replyTo_id'] is None:
                    page_state['conversation_id'][action['id']] = action['id']
                else:
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['replyTo_id']]
                if action['type'] == 'MODIFICATION':
                    page_state['authors'][action['id']] = \
                        set(page_state['authors'][action['parent_id']])
                    page_state['authors'][action['id']].add(
                        (action['user_id'], action['user_text']))
                    page_state['ancestor_id'][action['id']] = \
                        page_state['ancestor_id'][action['parent_id']]
                else:
                    page_state['authors'][action['id']] = \
                        set([(action['user_id'], action['user_text'])])
                    page_state['ancestor_id'][action['id']] = action['id']
            else:
                page_state['authors'][action['id']] = \
                    set(page_state['authors'][action['parent_id']])
                page_state['ancestor_id'][action['id']] = \
                    page_state['ancestor_id'][action['parent_id']]
            # Removed and restored comments are considered to belong to the
            # same conversation as their original version.
            if action['type'] == 'DELETION':
                page_state['conversation_id'][action['id']] = \
                    page_state['conversation_id'][action['parent_id']]
            if action['type'] == 'RESTORATION':
                page_state['conversation_id'][action['id']] = \
                    page_state['conversation_id'][action['parent_id']]
            action['conversation_id'] = \
                page_state['conversation_id'][action['id']]
            action['authors'] = list(page_state['authors'][action['id']])
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            action['cleaned_content'] = clean(action['content'])
            action['ancestor_id'] = page_state['ancestor_id'][action['id']]
            # If a comment is deleted, it is added to a list used for
            # identifying restoration actions later. Comments deleted more
            # than two weeks ago are removed from the list to keep memory
            # usage bounded, and comments that are too long or too short are
            # ignored here as well.
            if action['type'] == 'DELETION' and \
               len(action['content']) > self.COMMENT_LOWERBOUND and \
               len(action['content']) < self.COMMENT_UPPERBOUND:
                page_state['deleted_comments'].append(
                    (''.join(action['content']), action['parent_id'],
                     action['indentation']))
                self.deleted_records[action['parent_id']] = True
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
        page_state['conversation_id'] = self.clean_dict(
            updated_page, page_state['conversation_id'])
        page_state['authors'] = self.clean_dict(updated_page,
                                                page_state['authors'])
        # Set is not JSON serializable.
        page_state['authors'] = {
            action_id: list(authors)
            for action_id, authors in page_state['authors'].items()
        }
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MEMORY USAGE AFTER POSTPROCESSING: %d KB." %
                      memory_usage)
        return page_state, actions, rev['text']
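
# A sketch of how the class above is meant to be driven, assuming revisions
# arrive in chronological order as dicts carrying at least 'rev_id',
# 'timestamp', 'page_id', 'page_title' and 'text'; the `revisions` iterable
# here is hypothetical. process() threads page_state and the latest text
# through each call, and the 'deleted_comments' it accumulates are what
# load() expects when resuming from a saved state.
if __name__ == '__main__':
    constructor = Conversation_Constructor()
    page_state = None
    latest_content = ""
    for rev in revisions:  # hypothetical source of revision dicts
        page_state, actions, latest_content = constructor.process(
            page_state, latest_content, rev)
        for action in actions:
            print('%s in conversation %s' %
                  (action['type'], action['conversation_id']))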
class Conversation_Constructor:
    def __init__(self):
        self.page = {}
        # A comment with at least THERESHOLD tokens will be recorded.
        self.THERESHOLD = 3
        self.latest_content = ""
        self.NOT_EXISTED = True

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        self.NOT_EXISTED = False
        return page

    def convert_diff_format(self, x, a, b):
        ret = {}
        ret['name'] = x.name
        ret['a1'] = x.a1
        ret['a2'] = x.a2
        ret['b1'] = x.b1
        ret['b2'] = x.b2
        if x.name == 'insert':
            ret['tokens'] = b[x.b1:x.b2]
        if x.name == 'delete':
            ret['tokens'] = a[x.a1:x.a2]
        return ret

    def process(self, rev, DEBUGGING_MODE=False):
        rev['text'] = clean(rev['text'])
        a = text_split.tokenize(self.latest_content)
        b = text_split.tokenize(rev['text'])
        rev['diff'] = sorted([self.convert_diff_format(x, a, b)
                              for x in list(sequence_matcher.diff(a, b))],
                             key=lambda k: k['a1'])
        rev['diff'] = diff_tuning(rev['diff'], a, b)
        rev['diff'] = sorted(rev['diff'], key=lambda k: k['a1'])
        if self.NOT_EXISTED:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
        else:
            old_page = self.page
        self.latest_content = rev['text']
        try:
            actions, updated_page = insert(rev, old_page,
                                           self.previous_comments,
                                           DEBUGGING_MODE)
        except:
            e_type, e_val, tb = sys.exc_info()
            traceback.print_tb(tb)
            traceback.print_exception(e_type, e_val, tb)
            tb_info = traceback.extract_tb(tb)
            filename, line, func, text = tb_info[-1]
            print('An error occurred on line {} in statement {} when parsing '
                  'revision {}'.format(line, text, rev['rev_id']))
            return
        self.page = updated_page
        for action in actions:
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            if action['type'] == 'COMMENT_REMOVAL' and \
               len(action['content']) > self.THERESHOLD:
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
        return actions

    def reinsert_deleted_comments(self, deleted_comments):
        self.previous_comments = NoAho()
        for action in deleted_comments:
            if action['type'] == 'COMMENT_REMOVAL' and \
               len(action['content']) > self.THERESHOLD:
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
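
# A sketch for this stateful variant: state lives on the instance, so
# process() only takes the revision. Recorded COMMENT_REMOVAL actions can be
# replayed into a fresh instance with reinsert_deleted_comments() to resume
# matching restorations. The `revisions` iterable is hypothetical.
if __name__ == '__main__':
    cc = Conversation_Constructor()
    all_actions = []
    for rev in revisions:  # hypothetical source of revision dicts
        actions = cc.process(rev)
        if actions:
            all_actions.extend(actions)
    cc2 = Conversation_Constructor()
    cc2.reinsert_deleted_comments(
        [a for a in all_actions if a['type'] == 'COMMENT_REMOVAL'])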
class AhjoGeocoder(object):
    PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
    PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.no_match_addresses = []
        self.no_match_plans = []
        self.no_match_plan_units = []
        self.plan_map = {}
        self.plan_unit_map = {}
        self.property_map = {}
        self.street_tree = None
        self.matches = 0

    def convert_from_gk25(self, north, east):
        pnt = Point(east, north, srid=GK25_SRID)
        pnt.transform(settings.PROJECTION_SRID)
        return pnt

    def geocode_address(self, text):
        if not self.street_tree:
            return {}
        STREET_SUFFIXES = ('katu', 'tie', 'kuja', 'polku', 'kaari', 'linja',
                           'raitti', 'rinne', 'penger', 'ranta', u'väylä')
        for sfx in STREET_SUFFIXES:
            m = re.search(r'([A-Z]\w+%s)\s+(\d+)' % sfx, text)
            if not m:
                continue
            street_name = m.groups()[0].lower()
            if street_name not in self.street_hash:
                print "Street name not found: %s" % street_name.encode('utf8')
                self.no_match_addresses.append(
                    '%s %s' % (m.groups()[0], m.groups()[1]))
        textl = text.lower()
        ret = [x for x in self.street_tree.findall_long(textl)]
        geometries = {}
        for street_match in ret:
            (start, end) = street_match[0:2]
            street_name = textl[start:end]
            # Check for the address number.
            m = re.match(r'\s*(\d+)', text[end:])
            if not m:
                #print "\tno address: %s" % text[start:]
                continue
            num = int(m.groups()[0])
            e_list = self.street_hash[street_name]
            for e in e_list:
                if num == e['num']:
                    break
                if e['num_end'] and e['num'] < num <= e['num_end']:
                    break
            else:
                self.logger.warning("No match found for '%s %d'" %
                                    (street_name, num))
                s = '%s %d' % (e['street'], num)
                if not s in self.no_match_addresses:
                    self.no_match_addresses.append(s)
                continue
            pnt = self.convert_from_gk25(e['coord_n'], e['coord_e'])
            geom = {'name': '%s %d' % (e['street'], num), 'geometry': pnt,
                    'type': 'address', 'text': text}
            geom_id = "%s/%s" % (geom['type'], geom['name'])
            geometries[geom_id] = geom
        return geometries

    def geocode_plan(self, plan_id):
        plan = self.plan_map.get(plan_id)
        if not plan:
            if plan_id not in self.no_match_plans:
                self.logger.warning("No plan found for plan id %s" % plan_id)
                self.no_match_plans.append(plan_id)
            return
        return {'name': plan_id, 'geometry': plan['geometry'], 'type': 'plan'}

    def geocode_plan_unit(self, text, context):
        # If there is more than one '/' character, it's not a plan unit.
        m = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        if m:
            if text.count('/') > 1:
                return None
            block_id, unit_id, rest = m.groups()
            block_id = int(block_id)
            unit_id = int(unit_id)
            district_id = block_id // 1000
            block_id %= 1000
            # TODO: Code the logic to extract and use unit
            # ids from the rest of the match.
            # if rest:
            #     if rest[0].lower() in ('a', 'b', 'c', 'd', 'e'):
            #         rest = rest[1:]
            #     rest = rest.strip()
            #     if rest and rest[0] == '-':
            #         range_end = int(re.match('-\s?(\d)+', rest).groups()[0])
            #     elif rest.startswith('ja'):
            #         range_end = int(rest[2:])
            #     elif rest.lower().startswith('.a'):  # Ksv notation
            #         pass
            #     elif rest.startswith(':'):  # ???
            #         pass
            # Check for '161/3.A' style.
            if not district_id:
                for l in context['all_text']:
                    m = re.match(r'(\d+)\.ko', l, re.I)
                    if not m:
                        continue
                    district_id = int(m.groups()[0])
                    break
                if not district_id:
                    self.logger.warning("No district id found for '%s'" % text)
                    return None
        else:
            m = re.match(self.PLAN_UNIT_LONG_MATCH, text)
            district_id, block_id, unit_id = [int(x) for x in m.groups()[0:3]]
            rest = m.groups()[3]
        jhs_id = '091%03d%04d%04d' % (district_id, block_id, unit_id)
        name = '91-%d-%d-%d' % (district_id, block_id, unit_id)
        plan_unit = self.plan_unit_map.get(jhs_id, None)
        prop = self.property_map.get(jhs_id, None)
        geometry = None
        if plan_unit:
            geometry = plan_unit['geometry']
        elif prop:
            geometry = prop['geometry']
        else:
            print("No geometry found for '%s'" % jhs_id)
            self.logger.warning("No geometry found for '%s'" % jhs_id)
            self.no_match_plan_units.append([text, jhs_id])
            return None
        self.matches += 1
        return {'name': name, 'type': 'plan_unit', 'geometry': geometry}

    def geocode_district(self, text):
        return

    def geocode_from_text(self, text, context):
        text = text.strip()
        if not isinstance(text, unicode):
            text = unicode(text)
        geometries = {}
        # Check for plan unit IDs.
        m1 = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        m2 = re.match(self.PLAN_UNIT_LONG_MATCH, text)
        if m1 or m2:
            geom = self.geocode_plan_unit(text, context)
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
            return geometries
        m = re.match(r'^(\d{3,5})\.[pP]$', text)
        if m:
            geom = self.geocode_plan(m.groups()[0])
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
        geometries.update(self.geocode_address(text))
        return geometries

    def geocode_from_text_list(self, text_list):
        geometries = {}
        context = {'all_text': text_list}
        for text in text_list:
            g = self.geocode_from_text(text, context)
            geometries.update(g)
        return [geom for geom_id, geom in geometries.iteritems()]

    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {'muni': muni_name, 'street': street, 'num': num,
                 'num_end': num2, 'letter': letter,
                 'coord_n': coord_n, 'coord_e': coord_e}
            street = street.lower().decode('utf8')
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] \
                   and e['letter'] == s['letter']:
                    break
            else:
                num_list.append(e)
        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)

    def _load_mapinfo(self, ds, id_field_name, id_fixer=None):
        geom_map = {}
        lyr = ds[0]
        for idx, feat in enumerate(lyr):
            origin_id = feat[id_field_name].as_string().strip()
            if id_fixer:
                origin_id = id_fixer(origin_id)
            geom = feat.geom
            geom.srid = GK25_SRID
            geom.transform(settings.PROJECTION_SRID)
            if origin_id not in geom_map:
                plan = {'geometry': None}
                geom_map[origin_id] = plan
            else:
                plan = geom_map[origin_id]
            poly = GEOSGeometry(geom.wkb, srid=geom.srid)
            if isinstance(poly, LineString):
                try:
                    ring = LinearRing(poly.tuple)
                except Exception:
                    # If the LineString doesn't form a polygon, skip it.
                    self.logger.error("Skipping plan %s, its LineString "
                                      "doesn't close." % origin_id)
                    continue
                poly = Polygon(ring)
            if plan['geometry']:
                if isinstance(plan['geometry'], Polygon):
                    plan['geometry'] = MultiPolygon(plan['geometry'])
                if isinstance(poly, MultiPolygon):
                    plan['geometry'].extend(poly)
                else:
                    plan['geometry'].append(poly)
            else:
                plan['geometry'] = poly
        for key, e in geom_map.items():
            geom = e['geometry']
            if not geom.valid:
                self.logger.warning("geometry for %s not OK, fixing" % key)
                geom = geom.simplify()
                assert geom.valid
            e['geometry'] = geom
        return geom_map

    def load_plans(self, plan_file, in_effect):
        if getattr(self, 'all_plans_loaded', False):
            return
        if not in_effect:
            # Okay, this is hacky!
            try:
                picklef = open('plans.pickle', 'r')
                self.plan_map = cPickle.load(picklef)
                self.all_plans_loaded = True
                print "%d pickled plans loaded" % len(self.plan_map)
                return
            except IOError:
                pass
        ds = DataSource(plan_file, encoding='iso8859-1')
        plan_map = self._load_mapinfo(ds, 'kaavatunnus')
        print "%d plans imported" % len(plan_map)
        self.plan_map.update(plan_map)
        if in_effect:
            picklef = open('plans.pickle', 'w')
            cPickle.dump(self.plan_map, picklef,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    def load_plan_units(self, plan_unit_file):
        try:
            picklef = open('plan_units.pickle', 'r')
            self.plan_unit_map = cPickle.load(picklef)
            print "%d plan units loaded" % len(self.plan_unit_map)
            return
        except IOError:
            pass
        ds = DataSource(plan_unit_file, encoding='iso8859-1')
        self.plan_unit_map = self._load_mapinfo(ds, 'jhstunnus')
        print "%d plan units imported" % len(self.plan_unit_map)
        picklef = open('plan_units.pickle', 'w')
        cPickle.dump(self.plan_unit_map, picklef,
                     protocol=cPickle.HIGHEST_PROTOCOL)

    def load_properties(self, property_file):
        try:
            picklef = open('geo_properties.pickle', 'r')
            self.property_map = cPickle.load(picklef)
            print "%d properties loaded" % len(self.property_map)
            return
        except IOError:
            pass

        def fix_property_id(s):
            if s[0] != '0':
                return '0' + s
            return s

        ds = DataSource(property_file, encoding='iso8859-1')
        self.property_map = self._load_mapinfo(ds, 'Kiinteistotunnus',
                                               id_fixer=fix_property_id)
        print "%d properties imported" % len(self.property_map)
        picklef = open('geo_properties.pickle', 'w')
        cPickle.dump(self.property_map, picklef,
                     protocol=cPickle.HIGHEST_PROTOCOL)
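
# A sketch of driving the geocoder above, under assumptions: 'osoitteet.csv'
# is a hypothetical file in the column layout load_address_database expects,
# and the input lines mimic Ahjo document text (street address, plan unit id,
# plan id). Python 2, matching the class above.
if __name__ == '__main__':
    geocoder = AhjoGeocoder()
    with open('osoitteet.csv') as f:
        geocoder.load_address_database(f)
    geoms = geocoder.geocode_from_text_list(
        [u'Mannerheimintie 5', u'12345/6', u'1234.P'])
    for geom in geoms:
        print geom['type'], geom['name']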
# 1. reads links and labels from the ./data/link_labels.csv file
# 2. creates a trie for Aho-Corasick string matching
# 3. finds non-overlapping matches by length first
# 4. replaces matches in a text file with links and writes the result to an
#    output file in quasi-html format (see the sketch after make_trie below).

import csv

from noaho import NoAho

# Skip these generic words
skipwords = [
    "frantzösiske", "landet", "staden", "kongen", "konungen", "general",
    "sundet", "printzen", "öfwersten", "slottet", "keysaren"
]
# Valid word boundaries
word_boundaries = ".\n\r\t /:"
trie = NoAho()
text = ""


def make_trie():
    pattern_list = []
    with open('./data/link_labels.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if (row["link"] not in pattern_list
                    and "https://sv.wikipedia.org/wiki/N.N." not in row["link"]
                    and row['label'].lower() not in skipwords
                    and len(row["label"]) > 3):
                pattern_list.append((row['label'], row['link']))
                # f.write(f"""s|{row['label']}|<a href="{row['link']}">{row['label']}</a>|\n""")
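
# The snippet above stops after step 2 (building pattern_list); here is a
# hedged sketch of steps 3 and 4, assuming each (label, link) pair is also
# added to the module-level trie with the link as its payload, e.g.
# trie.add(label, link), and that trie.compile() runs before searching (as
# NoAho's tests require). The helper name linkify is hypothetical.
def linkify(text):
    out = []
    pos = 0
    # findall_long yields the longest non-overlapping matches, left to right,
    # as (start, end, payload) tuples; the payload is the wiki link.
    for start, end, link in trie.findall_long(text):
        before = text[start - 1] if start > 0 else ' '
        after = text[end] if end < len(text) else ' '
        # Only replace matches that sit on valid word boundaries.
        if before not in word_boundaries or after not in word_boundaries:
            continue
        out.append(text[pos:start])
        out.append('<a href="%s">%s</a>' % (link, text[start:end]))
        pos = end
    out.append(text[pos:])
    return ''.join(out)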
import unittest

from noaho import NoAho


class AhoCorasickTest(unittest.TestCase):
    def setUp(self):
        self.tree = NoAho()

    def tearDown(self):
        self.tree = None

    def test_compile_before_use(self):
        self.tree.add('bar')
        self.assertRaises(AssertionError,
                          lambda: self.tree.find_short('xxxbaryyy'))
        self.tree.compile()
        self.tree.find_short('xxxbaryyy')
        self.assertRaises(AssertionError, lambda: self.tree.add('foo'))

    def test_keyword_as_prefix_of_another(self):
        """According to John, there's a problem with the matcher.
        This test case should expose the bug."""
        self.tree.add('foobar')
        self.tree.add('foo')
        self.tree.add('bar')
        self.tree.compile()
        self.assertEqual((3, 6, None), self.tree.find_short('xxxfooyyy'))
        self.assertEqual((0, 3, None), self.tree.find_short('foo'))
        self.assertEqual((3, 6, None), self.tree.find_short('xxxbaryyy'))

    def test_another_find(self):
        """Just to triangulate the search code. We want to make sure
        that the implementation can do more than one search, at least."""
        self.tree.add("Python")
        self.tree.add("PLT Scheme")
        self.tree.compile()
        self.assertEqual((19, 25, None), self.tree.find_short(
            "I am learning both Python and PLT Scheme"))
        self.assertEqual((0, 10, None), self.tree.find_short(
            "PLT Scheme is an interesting language."))

    def test_simple_construction(self):
        self.tree.add("foo")
        self.tree.add("bar")
        self.tree.compile()
        self.assertEqual((10, 13, None),
                         self.tree.find_short("this is a foo message"))
        self.assertEqual(self.tree.children_count(), 6)

    def test_find_longest(self):
        self.tree.add("a")
        self.tree.add("alphabet")
        self.tree.compile()
        self.assertEqual((0, 1, None), self.tree.find_short("alphabet soup"))
        self.assertEqual((0, 8, None), self.tree.find_long("alphabet soup"))
        self.assertEqual((13, 14, None), self.tree.find_long(
            "yummy, I see an alphabet soup bowl"))

    def test_find_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_short(longString))

    def test_find_longest_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_long(longString))

    def test_find_longest_with_no_match(self):
        self.tree.add("foobar")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_long("fooba"))

    def test_with_expected_non_match(self):
        """Check to see that we don't always get a successful match."""
        self.tree.add("wise man")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_short(
            "where fools and wise men fear to tread"))

    def test_reject_empty_key(self):
        self.assertRaises(ValueError, self.tree.add, "")

    def test_empty_construction(self):
        """Make sure that we can safely construct and dealloc a tree
        with no initial keywords. Important because the C implementation
        assumes keywords exist on its dealloc, so we have to do some work
        on the back end to avoid silly segmentation errors."""
        tree = NoAho()
        del tree

    def test_embedded_nulls(self):
        """Check to see if we can accept embedded nulls"""
        self.tree.add("hell\0 world")
        self.tree.compile()
        self.assertEqual((None, None, None),
                         self.tree.find_short("ello\0 world"))
        self.assertEqual((0, 11, None), self.tree.find_short("hell\0 world"))

    def test_embedded_nulls_again(self):
        self.tree.add("\0\0\0")
        self.tree.compile()
        self.assertEqual((0, 3, None),
                         self.tree.find_short("\0\0\0\0\0\0\0\0"))

    def test_findall_and_findall_longest(self):
        self.tree.add("python")
        self.tree.add("perl")
        self.tree.add("scheme")
        self.tree.add("java")
        self.tree.add("pythonperl")
        self.tree.compile()
        self.assertEqual(
            [(0, 6, None), (6, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_short("pythonperlschemejava")))
        self.assertEqual(
            [(0, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_long("pythonperlschemejava")))
        self.assertEqual([], list(self.tree.findall_short("no pascal here")))
        self.assertEqual([], list(self.tree.findall_long("no pascal here")))

    def test_bug2_competing_longests(self):
        """Previously we'd return the /last/ key found; now we look
        forward while there are contiguous candidate keys, and actually
        return the longest."""
        self.tree.add('cisco', 'cisco')
        self.tree.add('em', 'em')
        self.tree.add('cisco systems australia', 'cisco systems')
        self.tree.compile()
        self.assertEqual([(0, 5, 'cisco'), (10, 12, 'em')],
                         list(self.tree.findall_long('cisco systems')))

    def test_bug3_false_terminal_nodes(self):
        self.tree.add('an', None)
        self.tree.add('canal', None)
        self.tree.add('e can oilfield', None)
        self.tree.compile()
        self.assertEqual([(4, 4 + 5, None)],
                         list(self.tree.findall_long('one canal')))

    def test_payload(self):
        class RandomClass(object):
            def __init__(self):
                pass

        obj = RandomClass()
        self.tree.add("python", "yes-python")
        self.tree.add("perl", "")
        self.tree.add("scheme", None)
        self.tree.add("lisp", [1, 2, 3])
        # no payload, comes out None
        self.tree.add("C++")
        self.tree.add("dylan", obj)
        self.tree.compile()
        self.assertEqual((0, 6, "yes-python"), self.tree.find_short("python"))
        self.assertEqual((0, 4, ""), self.tree.find_short("perl"))
        self.assertEqual((0, 6, None), self.tree.find_short("scheme"))
        self.assertEqual((0, 4, [1, 2, 3]), self.tree.find_short("lisp"))
        self.assertEqual((0, 3, None), self.tree.find_short("C++"))
        self.assertEqual((0, 5, obj), self.tree.find_short("dylan"))

    def test_dict_style_get_and_set(self):
        self.tree['foo'] = 5
        self.assertEqual(5, self.tree['foo'])

    def test_dict_style_set_empty_key(self):
        # equivalent to self.tree[''] = None;
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, '', None)

    def test_dict_style_set_nonstring_key(self):
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, 6, None)
        self.assertRaises(ValueError, self.tree.__setitem__, None, None)
        self.assertRaises(ValueError, self.tree.__setitem__, [], None)

    def test_dict_style_get_unseen_key(self):
        # __getitem__ implements this part of the [] protocol
        self.assertRaises(KeyError, self.tree.__getitem__, 'unseen')
        self.assertRaises(KeyError, self.tree.__getitem__, '')

    def test_dict_style_containment(self):
        self.tree['foo'] = 5
        self.assertEqual(True, 'foo' in self.tree)
        self.assertEqual(False, '' in self.tree)
        self.assertEqual(False, 'fo' in self.tree)
        self.assertEqual(False, 'o' in self.tree)
        self.assertEqual(False, 'oo' in self.tree)
        self.assertEqual(False, 'f' in self.tree)

    def test_dict_style_len(self):
        self.tree['a'] = None
        self.tree['b'] = [1, 2]
        self.tree['c'] = 12
        self.assertEqual(3, len(self.tree))

    # reminder that we need to figure out which version we're in, and
    # test Python 2 unicode explicitly
    @unittest.expectedFailure
    def test_unicode_in_python2(self):
        self.assertEqual(True, False)

    # key iteration is unimplemented
    @unittest.expectedFailure
    def test_iteration(self):
        self.tree.add("Harry")
        self.tree.add("Hermione")
        self.tree.add("Ron")
        self.assertEqual(set(["Harry", "Hermione", "Ron"]),
                         set(self.tree.keys()))

    # reminder that we need to implement findall_short
    @unittest.expectedFailure
    def test_subset(self):
        self.tree.add("he")
        self.tree.add("hers")
        self.assertEqual([(0, 2, None), (0, 4, None)],
                         list(self.tree.findall_short("hers")))
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from __future__ import unicode_literals

from collections import Counter, defaultdict

from noaho import NoAho  # multi-pattern matching

trie = NoAho()
trie.add('hehe')
trie.add('py')
trie.add('python')

# Sample text mixing Chinese and ASCII keywords; the Chinese reads roughly
# "Who I am is not important; what matters is that you learn python" and
# "Xiaomi Technology Co., Ltd.".
txt = """
我是谁不重要,重要的是你要学会python, hehe我是谁不重要,重要的是你要学会python
小米科技有限公司
"""
'''
c = defaultdict(int)
words = [txt[k[0]:k[1]] for k in trie.findall_long(txt)]
wc = Counter(words)
for k in trie.findall_long(txt):
    word = txt[k[0]:k[1]]
    c[word] += 1
    #print(k)
    print(txt[k[0]:k[1]])
for k, v in wc.items():
    print k, v
'''
trie.compile()  # the trie must be compiled before any find/findall call
k = trie.find_short(txt)