def reinsert_deleted_comments(self, deleted_comments):
    self.previous_comments = NoAho()
    for action in deleted_comments:
        if action['type'] == 'COMMENT_REMOVAL' and len(
                action['content']) > self.THERESHOLD:
            self.previous_comments.add(
                ''.join(action['content']),
                (action['parent_id'], action['indentation']))
def load(self, deleted_comments):
    """
    Load the previous page state, deleted comments and other information.
    """
    self.deleted_records = {}
    self.previous_comments = NoAho()
    for pair in deleted_comments:
        self.previous_comments.add(pair[0], (pair[1], int(pair[2])))
        self.deleted_records[pair[1]] = True
    return
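
The two loaders above only populate the trie; the matching side is not shown. A minimal sketch of how such a trie is typically queried, assuming the NoAho API exercised by the tests further down (add() with a payload, compile(), and findall_long() yielding (start, end, payload) triples):

from noaho import NoAho

previous_comments = NoAho()
previous_comments.add('some deleted comment text', ('parent_0', 2))
previous_comments.compile()  # NoAho requires compile() before searching

new_text = '... some deleted comment text reappears here ...'
for start, end, payload in previous_comments.findall_long(new_text):
    parent_id, indentation = payload  # the tuple stored at add() time
    # a hit suggests the deleted comment was restored under parent_id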
Example 3
def test_empty_construction(self):
    """Make sure that we can safely construct and dealloc a tree
    with no initial keywords.  Important because the C
    implementation assumes keywords exist on its dealloc, so we
    have to do some work on the back end to avoid silly segmentation
    errors."""
    tree = NoAho()
    del tree
Example 4
    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {
                'muni': muni_name,
                'street': street,
                'num': num,
                'num_end': num2,
                'letter': letter,
                'coord_n': coord_n,
                'coord_e': coord_e
            }
            street = street.lower().decode('utf8')
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] and e[
                        'letter'] == s['letter']:
                    break
            else:
                num_list.append(e)

        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)
    def process(self, rev, DEBUGGING_MODE=False):
        rev['text'] = clean(rev['text'])
        a = text_split.tokenize(self.latest_content)
        b = text_split.tokenize(rev['text'])
        rev['diff'] = sorted([
            self.convert_diff_format(x, a, b)
            for x in list(sequence_matcher.diff(a, b))
        ],
                             key=lambda k: k['a1'])
        rev['diff'] = diff_tuning(rev['diff'], a, b)
        rev['diff'] = sorted(rev['diff'], key=lambda k: k['a1'])
        if self.NOT_EXISTED:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
        else:
            old_page = self.page
        self.latest_content = rev['text']

        try:
            actions, updated_page = insert(rev, old_page,
                                           self.previous_comments,
                                           DEBUGGING_MODE)
        except:
            e_type, e_val, tb = sys.exc_info()
            traceback.print_tb(tb)
            traceback.print_exception(e_type, e_val, tb)
            tb_info = traceback.extract_tb(tb)
            filename, line, func, text = tb_info[-1]
            print(
                'An error occurred on line {} in statement {} when parsing revision {}'
                .format(line, text, rev['rev_id']))
            return

        self.page = updated_page
        for action in actions:
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            if action['type'] == 'COMMENT_REMOVAL' and len(
                    action['content']) > self.THERESHOLD:
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
        return actions
Example 6
import codecs
import os

from noaho import NoAho


def load_phrase_list(data_path, phrase_list):
    phrase_size = len(phrase_list)
    phrase_max = max(phrase_size, 1)
    pdict = []
    phrase_to_id = dict()
    for i in range(len(phrase_list)):
        phrase_to_id[phrase_list[i]] = i
        pdict.append(NoAho())
        with codecs.open(os.path.join(data_path, phrase_list[i] + ".plist"), "r", "utf8") as f:
            for line in f:
                pdict[i].add(line.strip().lower(), i)

    return pdict, phrase_to_id, phrase_size, phrase_max
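
A hedged usage sketch for the returned structures (data directory, phrase-list names, and text are hypothetical; note that each trie still needs compile() before searching):

pdict, phrase_to_id, phrase_size, phrase_max = load_phrase_list(
    "data", ["colors", "animals"])
for trie in pdict:
    trie.compile()

text = "the quick red fox"
hits = [(text[s:e], payload)
        for trie in pdict
        for (s, e, payload) in trie.findall_long(text)]
# each payload is the phrase-list index i stored at add() time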
Example 7
    def process(self, rev, DEBUGGING_MODE=False):
        rev['text'] = clean(rev['text'])
        #print(rev['text'])
        pid = rev['page_id']
        rev['diff'] = list(diff(self.latest_content[pid], rev['text'])) 
        if pid not in self.pages:
            self.previous_comments[pid] = NoAho()
            self.latest_content[pid] = ""
            updated_page = self.page_creation(rev)
            old_page = updated_page
        else:
            old_page = self.pages[rev['page_id']]

        self.latest_content[pid] = rev['text']

        try:
            actions, updated_page = insert(rev, old_page, self.previous_comments[pid], DEBUGGING_MODE)
        except:
            e_type, e_val, tb = sys.exc_info()
            traceback.print_tb(tb) 
            traceback.print_exception(e_type, e_val, tb)
            tb_info = traceback.extract_tb(tb)
            filename, line, func, text = tb_info[-1]
            self.save('%s_error_stopped.json'%(rev['rev_id']))
            print('An error occurred on line {} in statement {} when parsing revision {}'.format(line, text, rev['rev_id']))
            print('Intermediate file has been saved in %s_error_stopped.json, load from it to continue when ready.'%(rev['rev_id']))
            if self.tracking_file is not None:
                self.tracking_file.close()
            return

        self.pages[pid] = updated_page
        for action in actions:
            action['page_id'] = pid
            action['page_title'] = rev['page_title'] 
#            if (action['type'] == 'COMMENT_ADDING' or action['type'] == 'COMMENT_MODIFICATION' or action['type'] == 'SECTION_CREATION') and len(action['content']) > self.THERESHOLD:
            if action['type'] == 'COMMENT_REMOVAL' and len(action['content']) > self.THERESHOLD:
                self.previous_comments[pid].add(''.join(action['content']), (action['parent_id'], action['indentation']))
                if self.tracking_file is not None:
                    self.tracking_file.write(json.dumps(
                        [pid, ''.join(action['content']),
                         (action['parent_id'], action['indentation'])]) + '\n')
        return actions
Example 8
def load(self, FILENAME, COMMENT_TRACKING_FILE=None):
    BASE_DIR = 'json_dumps'
    with open(os.path.join(BASE_DIR, FILENAME)) as f:
        self.pages, self.THERESHOLD, self.latest_content = json.load(f)
    self.previous_comments = {}
    for pid in self.pages.keys():
        print(type(pid))
        self.previous_comments[pid] = NoAho()
        updated_actions = {}
        for act, val in self.pages[pid]['actions'].items():
            updated_actions[int(act)] = tuple(val)
        self.pages[pid]['actions'] = updated_actions
        print(updated_actions)
    if COMMENT_TRACKING_FILE is not None:
        with open(COMMENT_TRACKING_FILE, "r") as f:
            for line in f:
                pid, key, val = json.loads(line)
                # re-add each tracked deleted comment to the per-page trie
                self.previous_comments[pid].add(key, val)
        self.tracking_file = open(COMMENT_TRACKING_FILE, "a")
    else:
        self.tracking_file = None
Example 9
def load_address_database(self, csv_file):
    reader = csv.reader(csv_file, delimiter=',')
    reader.next()  # skip the CSV header row (Python 2 csv reader)
    addr_hash = {}
    for idx, row in enumerate(reader):
        row_type = int(row[-1])
        if row_type != 1:
            continue
        street = row[0].strip()
        if not row[1]:
            continue
        num = int(row[1])
        if not num:
            continue
        num2 = row[2]
        if not num2:
            num2 = None
        letter = row[3]
        muni_name = row[10]
        coord_n = int(row[8])
        coord_e = int(row[9])
        if muni_name != "Helsinki":
            continue
        e = {'muni': muni_name, 'street': street, 'num': num, 'num_end': num2,
             'letter': letter, 'coord_n': coord_n, 'coord_e': coord_e}
        street = street.lower().decode('utf8')
        if street in addr_hash:
            if num2 is None:
                num2s = ''
            else:
                num2s = str(num2)
            addr_hash[street].append(e)
        else:
            addr_hash[street] = [e]
    self.street_hash = addr_hash
    self.street_tree = NoAho()
    print "%d street names loaded" % len(self.street_hash)
    for street in self.street_hash.keys():
        self.street_tree.add(street)
Example 10
    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {'muni': muni_name, 'street': street, 'num': num, 'num_end': num2,
                 'letter': letter, 'coord_n': coord_n, 'coord_e': coord_e}
            street = street.lower().decode('utf8')
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] and e['letter'] == s['letter']:
                    break
            else:
                num_list.append(e)

        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)
class Conversation_Constructor:
    def __init__(self):
        self.COMMENT_LOWERBOUND = 10
        # Deleted comments with fewer than COMMENT_LOWERBOUND tokens will not
        # be recorded, and thus not considered in comment restoration actions,
        # to reduce confusion.
        self.COMMENT_UPPERBOUND = 1000
        self.deleted_records = {}

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        return page

    def load(self, deleted_comments):
        """
          Load the previous page state, deleted comments and other information
        """
        self.deleted_records = {}
        self.previous_comments = NoAho()
        for pair in deleted_comments:
            self.previous_comments.add(pair[0], (pair[1], int(pair[2])))
            self.deleted_records[pair[1]] = True
        return

    def convert_diff_format(self, x, a, b):
        ret = x
        if x['name'] == 'insert':
            ret['tokens'] = b[x['b1']:x['b2']]
        if x['name'] == 'delete':
            ret['tokens'] = a[x['a1']:x['a2']]
        return ret

    def mydiff_toDelta(self, diffs):
        """Crush the diff into a list of dictionaries describing changes
        from one document to another. Each operation is a dictionary
        with a name (insert, delete, or equal) and offsets into the
        original and resulting text.

        Args:
          diffs: Array of diff tuples.
        Yields:
          Delta dictionaries.
        """
        text = []
        a = 0
        b = 0
        DIFF_DELETE = -1
        DIFF_INSERT = 1
        DIFF_EQUAL = 0

        for (op, data) in diffs:
            if op == DIFF_INSERT:
                yield ({
                    "name": "insert",
                    "a1": a,
                    "a2": a,
                    "b1": b,
                    "b2": b + len(data)
                })
                b += len(data)
            elif op == DIFF_DELETE:
                yield ({
                    "name": "delete",
                    "a1": a,
                    "a2": a + len(data),
                    "b1": b,
                    "b2": b
                })
                a += len(data)
            elif op == DIFF_EQUAL:
                yield ({
                    "name": "equal",
                    "a1": a,
                    "a2": a + len(data),
                    "b1": b,
                    "b2": b + len(data)
                })
                a += len(data)
                b += len(data)

    def clean_dict(self, page, the_dict):
        """
          We only store the information of currently 'alive' actions.
          Definition of alive:
             - The action was a deletion that happened recently, hence might be restored later.
             - The action is still present on the page, hence might be modified/removed/replied to.
        """
        keylist = list(the_dict.keys())
        ret = the_dict
        alive_actions = set([action[0] for action in page['actions'].values()])
        for action in keylist:
            if not (action in alive_actions or action in self.deleted_records):
                del ret[action]
        return ret

    def process(self, page_state, latest_content, rev):
        logging.debug("DEBUGGING MODE on REVISION %s" % rev['rev_id'])
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MOMERY USAGE BEFORE ANYTHING: %d KB." % memory_usage)
        # Clean the HTML format of the revision.
        rev['text'] = clean_html(rev['text'])
        # Compute the diff between the latest processed revision and the current
        # one.
        dmp = dmp_module.diff_match_patch()
        logging.debug("LENGTH : %d -> %d" %
                      (len(latest_content), len(rev['text'])))
        diff = dmp.diff_main(latest_content, rev['text'], False)
        dmp.diff_cleanupSemantic(diff)
        delta = self.mydiff_toDelta(diff)
        rev['diff'] = sorted([self.convert_diff_format(x, latest_content, rev['text']) \
                              for x in delta], key=lambda k: k['a1'])
        # Create a new page if this page was never processed before.
        if not page_state:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
            page_state = {'rev_id': int(rev['rev_id']), \
                          'timestamp': rev['timestamp'], \
                          'page_id': rev['page_id'], \
                          'deleted_comments': [], \
                          'conversation_id': {}, \
                          'authors': {},
                          'ancestor_id': {}}
        else:
            page_state['rev_id'] = int(rev['rev_id'])
            page_state['timestamp'] = rev['timestamp']
            old_page = page_state['page_state']
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MOMERY USAGE BEFORE PROCESSING: %d KB." % memory_usage)
        # Process the revision to get the actions and update page state
        actions, updated_page = insert(rev, old_page, self.previous_comments,
                                       self.COMMENT_LOWERBOUND)
        page_state['page_state'] = updated_page
        # Post process of the actions:
        for action in actions:
            # If the action adds new content:
            # - locate which conversation it belongs to
            # - record the author's name in the comment's author list
            if action['type'] == 'ADDITION' or action['type'] == 'MODIFICATION' \
               or action['type'] == 'CREATION':
                if action['replyTo_id'] is None:
                    page_state['conversation_id'][action['id']] = action['id']
                else:
                    page_state['conversation_id'][action['id']] = \
                        page_state['conversation_id'][action['replyTo_id']]
                if action['type'] == 'MODIFICATION':
                    page_state['authors'][action['id']] = \
                        set(page_state['authors'][action['parent_id']])
                    page_state['authors'][action['id']].add(
                        (action['user_id'], action['user_text']))
                    page_state['ancestor_id'][action['id']] = \
                        page_state['ancestor_id'][action['parent_id']]
                else:
                    page_state['authors'][action['id']] = \
                        set([(action['user_id'], action['user_text'])])
                    page_state['ancestor_id'][action['id']] = action['id']
            else:
                page_state['authors'][action['id']] = \
                    set(page_state['authors'][action['parent_id']])
                page_state['ancestor_id'][action['id']] = \
                    page_state['ancestor_id'][action['parent_id']]

            # Removed and restored comments are considered to belong to the
            # same conversation as their original version.
            if action['type'] == 'DELETION':
                page_state['conversation_id'][action['id']] = \
                         page_state['conversation_id'][action['parent_id']]
            if action['type'] == 'RESTORATION':
                page_state['conversation_id'][action['id']] = \
                         page_state['conversation_id'][action['parent_id']]
            action['conversation_id'] = page_state['conversation_id'][
                action['id']]
            action['authors'] = list(page_state['authors'][action['id']])
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            action['cleaned_content'] = clean(action['content'])
            action['ancestor_id'] = page_state['ancestor_id'][action['id']]
            # If a comment is deleted, it is added to a list used for
            # identifying restoration actions later. Note that comments
            # deleted more than two weeks ago are removed from the list to
            # ensure memory efficiency. Comments that are too long or too
            # short are also ignored here.
            if action['type'] == 'DELETION' and\
                len(action['content']) > self.COMMENT_LOWERBOUND and\
                len(action['content']) < self.COMMENT_UPPERBOUND:
                page_state['deleted_comments'].append(
                    (''.join(action['content']), action['parent_id'],
                     action['indentation']))
                self.deleted_records[action['parent_id']] = True
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))

        page_state['conversation_id'] = self.clean_dict(
            updated_page, page_state['conversation_id'])
        page_state['authors'] = self.clean_dict(updated_page,
                                                page_state['authors'])
        # Set is not JSON serializable.
        page_state['authors'] = {
            action_id: list(authors)
            for action_id, authors in page_state['authors'].items()
        }
        memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.debug("MOMERY USAGE AFTER POSTPROCESSING: %d KB." %
                      memory_usage)
        return page_state, actions, rev['text']
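
A hedged driver sketch for this class (the revision stream and its fields are assumptions inferred from the keys used above):

cc = Conversation_Constructor()
cc.load([])  # no previously deleted comments to restore
page_state, latest_content = None, ''
for rev in revisions:  # hypothetical iterable of revision dicts
    page_state, actions, latest_content = cc.process(
        page_state, latest_content, rev)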
Example 13
class Conversation_Constructor:
    def __init__(self):
        self.page = {}
        self.THERESHOLD = 3  # A comment with more than THERESHOLD tokens will be recorded
        self.latest_content = ""
        self.NOT_EXISTED = True

    def page_creation(self, rev):
        page = {}
        page['page_id'] = rev['page_id']
        page['actions'] = {}
        page['page_title'] = rev['page_title']
        page['actions'][0] = (-1, -1)
        self.NOT_EXISTED = False
        return page

    def convert_diff_format(self, x, a, b):
        ret = {}
        ret['name'] = x.name
        ret['a1'] = x.a1
        ret['a2'] = x.a2
        ret['b1'] = x.b1
        ret['b2'] = x.b2
        if x.name == 'insert':
            ret['tokens'] = b[x.b1:x.b2]
        if x.name == 'delete':
            ret['tokens'] = a[x.a1:x.a2]
        return ret

    def process(self, rev, DEBUGGING_MODE=False):
        rev['text'] = clean(rev['text'])
        a = text_split.tokenize(self.latest_content)
        b = text_split.tokenize(rev['text'])
        rev['diff'] = sorted([
            self.convert_diff_format(x, a, b)
            for x in list(sequence_matcher.diff(a, b))
        ],
                             key=lambda k: k['a1'])
        rev['diff'] = diff_tuning(rev['diff'], a, b)
        rev['diff'] = sorted(rev['diff'], key=lambda k: k['a1'])
        if self.NOT_EXISTED:
            self.previous_comments = NoAho()
            old_page = self.page_creation(rev)
        else:
            old_page = self.page
        self.latest_content = rev['text']

        try:
            actions, updated_page = insert(rev, old_page,
                                           self.previous_comments,
                                           DEBUGGING_MODE)
        except:
            e_type, e_val, tb = sys.exc_info()
            traceback.print_tb(tb)
            traceback.print_exception(e_type, e_val, tb)
            tb_info = traceback.extract_tb(tb)
            filename, line, func, text = tb_info[-1]
            print(
                'An error occurred on line {} in statement {} when parsing revision {}'
                .format(line, text, rev['rev_id']))
            return

        self.page = updated_page
        for action in actions:
            action['page_id'] = rev['page_id']
            action['page_title'] = rev['page_title']
            if action['type'] == 'COMMENT_REMOVAL' and len(
                    action['content']) > self.THERESHOLD:
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
        return actions

    def reinsert_deleted_comments(self, deleted_comments):
        self.previous_comments = NoAho()
        for action in deleted_comments:
            if action['type'] == 'COMMENT_REMOVAL' and len(
                    action['content']) > self.THERESHOLD:
                self.previous_comments.add(
                    ''.join(action['content']),
                    (action['parent_id'], action['indentation']))
Example 14
class AhjoGeocoder(object):
    PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
    PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'
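    # Two accepted plan-unit formats (illustrative): the SHORT form
    # '161/3' gives block and unit ids, with the district encoded in the
    # leading digits when 4-5 digits are present ('12034/5' -> district 12,
    # block 34, unit 5); the LONG form '091-12-34-5' spells out the
    # district-block-unit triple explicitly.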

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.no_match_addresses = []
        self.no_match_plans = []
        self.no_match_plan_units = []
        self.plan_map = {}
        self.plan_unit_map = {}
        self.property_map = {}
        self.street_tree = None
        self.matches = 0

    def convert_from_gk25(self, north, east):
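        # GK25 coordinates arrive as (north, east); GEOS Points take
        # (x, y) = (east, north), hence the swapped argument order.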
        pnt = Point(east, north, srid=GK25_SRID)
        pnt.transform(settings.PROJECTION_SRID)
        return pnt

    def geocode_address(self, text):
        if not self.street_tree:
            return {}

        STREET_SUFFIXES = ('katu', 'tie', 'kuja', 'polku', 'kaari', 'linja',
                           'raitti', 'rinne', 'penger', 'ranta', u'väylä')
        for sfx in STREET_SUFFIXES:
            m = re.search(r'([A-Z]\w+%s)\s+(\d+)' % sfx, text)
            if not m:
                continue
            street_name = m.groups()[0].lower()
            if street_name not in self.street_hash:
                print "Street name not found: %s" % street_name.encode('utf8')
                self.no_match_addresses.append('%s %s' %
                                               (m.groups()[0], m.groups()[1]))
        textl = text.lower()
        ret = [x for x in self.street_tree.findall_long(textl)]
        geometries = {}
        for street_match in ret:
            (start, end) = street_match[0:2]
            street_name = textl[start:end]
            # check for the address number
            m = re.match(r'\s*(\d+)', text[end:])
            if not m:
                #print "\tno address: %s" % text[start:]
                continue
            num = int(m.groups()[0])

            e_list = self.street_hash[street_name]
            for e in e_list:
                if num == e['num']:
                    break
                if e['num_end'] and e['num'] < num <= e['num_end']:
                    break
            else:
                self.logger.warning("No match found for '%s %d'" %
                                    (street_name, num))
                s = '%s %d' % (e['street'], num)
                if not s in self.no_match_addresses:
                    self.no_match_addresses.append(s)
                continue

            pnt = self.convert_from_gk25(e['coord_n'], e['coord_e'])
            geom = {
                'name': '%s %d' % (e['street'], num),
                'geometry': pnt,
                'type': 'address',
                'text': text
            }
            geom_id = "%s/%s" % (geom['type'], geom['name'])
            geometries[geom_id] = geom
        return geometries

    def geocode_plan(self, plan_id):
        plan = self.plan_map.get(plan_id)
        if not plan:
            if plan_id not in self.no_match_plans:
                self.logger.warning("No plan found for plan id %s" % plan_id)
                self.no_match_plans.append(plan_id)
            return
        return {'name': plan_id, 'geometry': plan['geometry'], 'type': 'plan'}

    def geocode_plan_unit(self, text, context):
        # If there is more than one '/' character, it's not a plan unit
        m = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        if m:
            if text.count('/') > 1:
                return None
            block_id, unit_id, rest = m.groups()
            block_id = int(block_id)
            unit_id = int(unit_id)
            district_id = block_id // 1000
            block_id %= 1000
            # TODO: Code the logic to extract and use unit
            #       ids from the rest of the match.
            # if rest:
            #     if rest[0].lower() in ('a', 'b', 'c', 'd', 'e'):
            #         rest = rest[1:]
            #     rest = rest.strip()
            #     if rest and rest[0] == '-':
            #         range_end = int(re.match('-\s?(\d)+', rest).groups()[0])
            #     elif rest.startswith('ja'):
            #         range_end = int(rest[2:])
            #     elif rest.lower().startswith('.a'): # Ksv notation
            #         pass
            #     elif rest.startswith(':'): # ???
            #         pass
            # check for '161/3.A' style
            if not district_id:
                for l in context['all_text']:
                    m = re.match(r'(\d+)\.ko', l, re.I)
                    if not m:
                        continue
                    district_id = int(m.groups()[0])
                    break
                if not district_id:
                    self.logger.warning("No district id found for '%s'" % text)
                    return None
        else:
            m = re.match(self.PLAN_UNIT_LONG_MATCH, text)
            district_id, block_id, unit_id = [int(x) for x in m.groups()[0:3]]
            rest = m.groups()[3]

        jhs_id = '091%03d%04d%04d' % (district_id, block_id, unit_id)
        name = '91-%d-%d-%d' % (district_id, block_id, unit_id)
        plan_unit = self.plan_unit_map.get(jhs_id, None)
        prop = self.property_map.get(jhs_id, None)
        geometry = None
        if plan_unit:
            geometry = plan_unit['geometry']
        elif prop:
            geometry = prop['geometry']
        else:
            print("No geometry found for '%s'" % jhs_id)
            self.logger.warning("No geometry found for '%s'" % jhs_id)
            self.no_match_plan_units.append([text, jhs_id])
            return None

        self.matches += 1
        return {'name': name, 'type': 'plan_unit', 'geometry': geometry}

    def geocode_district(self, text):
        return

    def geocode_from_text(self, text, context):
        text = text.strip()
        if not isinstance(text, unicode):
            text = unicode(text)

        geometries = {}

        # Check for plan unit IDs
        m1 = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        m2 = re.match(self.PLAN_UNIT_LONG_MATCH, text)
        if m1 or m2:
            geom = self.geocode_plan_unit(text, context)
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
            return geometries

        m = re.match(r'^(\d{3,5})\.[pP]$', text)
        if m:
            geom = self.geocode_plan(m.groups()[0])
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom

        geometries.update(self.geocode_address(text))

        return geometries

    def geocode_from_text_list(self, text_list):
        geometries = {}
        context = {'all_text': text_list}
        for text in text_list:
            g = self.geocode_from_text(text, context)
            geometries.update(g)
        return [geom for geom_id, geom in geometries.iteritems()]

    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            num2 = row[2]
            if not num2:
                num2 = None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {
                'muni': muni_name,
                'street': street,
                'num': num,
                'num_end': num2,
                'letter': letter,
                'coord_n': coord_n,
                'coord_e': coord_e
            }
            street = street.lower().decode('utf8')
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] and e[
                        'letter'] == s['letter']:
                    break
            else:
                num_list.append(e)

        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)

    def _load_mapinfo(self, ds, id_field_name, id_fixer=None):
        geom_map = {}
        lyr = ds[0]
        for idx, feat in enumerate(lyr):
            origin_id = feat[id_field_name].as_string().strip()
            if id_fixer:
                origin_id = id_fixer(origin_id)
            geom = feat.geom
            geom.srid = GK25_SRID
            geom.transform(settings.PROJECTION_SRID)
            if origin_id not in geom_map:
                plan = {'geometry': None}
                geom_map[origin_id] = plan
            else:
                plan = geom_map[origin_id]
            poly = GEOSGeometry(geom.wkb, srid=geom.srid)
            if isinstance(poly, LineString):
                try:
                    ring = LinearRing(poly.tuple)
                except Exception:
                    self.logger.error(
                        "Skipping plan %s, its LineString doesn't close." %
                        origin_id)
                    # if the LineString doesn't form a polygon, skip it.
                    continue
                poly = Polygon(ring)
            if plan['geometry']:
                if isinstance(plan['geometry'], Polygon):
                    plan['geometry'] = MultiPolygon(plan['geometry'])
                if isinstance(poly, MultiPolygon):
                    plan['geometry'].extend(poly)
                else:
                    plan['geometry'].append(poly)
            else:
                plan['geometry'] = poly

        for key, e in geom_map.items():
            geom = e['geometry']
            if not geom.valid:
                self.logger.warning("geometry for %s not OK, fixing" % key)
                geom = geom.simplify()
                assert geom.valid
                e['geometry'] = geom
        return geom_map

    def load_plans(self, plan_file, in_effect):
        if getattr(self, 'all_plans_loaded', False):
            return
        if not in_effect:  # Okay, this is hacky!
            try:
                picklef = open('plans.pickle', 'r')
                self.plan_map = cPickle.load(picklef)
                self.all_plans_loaded = True
                print "%d pickled plans loaded" % len(self.plan_map)
                return
            except IOError:
                pass

        ds = DataSource(plan_file, encoding='iso8859-1')

        plan_map = self._load_mapinfo(ds, 'kaavatunnus')
        print "%d plans imported" % len(plan_map)
        self.plan_map.update(plan_map)

        if in_effect:
            picklef = open('plans.pickle', 'w')
            cPickle.dump(self.plan_map,
                         picklef,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    def load_plan_units(self, plan_unit_file):
        try:
            picklef = open('plan_units.pickle', 'r')
            self.plan_unit_map = cPickle.load(picklef)
            print "%d plan units loaded" % len(self.plan_unit_map)
            return
        except IOError:
            pass

        ds = DataSource(plan_unit_file, encoding='iso8859-1')

        self.plan_unit_map = self._load_mapinfo(ds, 'jhstunnus')

        print "%d plan units imported" % len(self.plan_unit_map)

        picklef = open('plan_units.pickle', 'w')
        cPickle.dump(self.plan_unit_map,
                     picklef,
                     protocol=cPickle.HIGHEST_PROTOCOL)

    def load_properties(self, property_file):
        try:
            picklef = open('geo_properties.pickle', 'r')
            self.property_map = cPickle.load(picklef)
            print "%d properties loaded" % len(self.property_map)
            return
        except IOError:
            pass

        def fix_property_id(s):
            if s[0] != '0':
                return '0' + s
            return s

        ds = DataSource(property_file, encoding='iso8859-1')

        self.property_map = self._load_mapinfo(ds,
                                               'Kiinteistotunnus',
                                               id_fixer=fix_property_id)

        print "%d properties imported" % len(self.property_map)

        picklef = open('geo_properties.pickle', 'w')
        cPickle.dump(self.property_map,
                     picklef,
                     protocol=cPickle.HIGHEST_PROTOCOL)
Example 15
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from __future__ import unicode_literals
from noaho import NoAho    # multi-pattern matching
from collections import Counter, defaultdict
trie = NoAho()
trie.add('hehe')
trie.add('py')
trie.add('python')


txt = """
我是谁不重要,重要的是你要学会python, hehe我是谁不重要,重要的是你要学会python
小米科技有限公司
"""

'''
c = defaultdict(int)
words = [txt[k[0]:k[1]] for k in trie.findall_long(txt)]
wc = Counter(words)

for k in trie.findall_long(txt):
    word = txt[k[0]:k[1]]
    c[word] += 1
    #print(k)
    print(txt[k[0]:k[1]])


for k, v in wc.items():
    print k, v
'''

trie.compile()  # NoAho requires compile() before searching
k = trie.find_short(txt)
Example 16
# 1. reads links and labels form the ./data/link_labels.csv file
# 2. creates a trie for Aho-Corasick string matching
# 3. finds non-overlapping matches by length first
# 4. replaces matches in a text file with links and writes the result to an
#    output file in quasi-html format.

import csv

from noaho import NoAho

# Skip these generic words
skipwords = [
    "frantzösiske", "landet", "staden", "kongen", "konungen", "general",
    "sundet", "printzen", "öfwersten", "slottet", "keysaren"
]

# Valid word boundaries
word_boundaries = ".\n\r\t /:"

trie = NoAho()
text = ""


def make_trie():
    pattern_list = []
    with open('./data/link_labels.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if (row["link"] not in pattern_list
                    and "https://sv.wikipedia.org/wiki/N.N." not in row["link"]
                    and row['label'].lower() not in skipwords
                    and len(row["label"]) > 3):
                pattern_list.append((row['label'], row['link']))
                # f.write(f"""s|{row['label']}|<a href="{row['link']}">{row['label']}</a>|\n""")
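
Steps 3-4 of the header comment (matching and replacement) are not shown above; here is a minimal sketch, assuming make_trie also adds each (label, link) pair to trie and calls trie.compile() afterwards (the file paths are hypothetical, and the word_boundaries check is omitted for brevity):

def link_text(in_path, out_path):
    with open(in_path) as f:
        text = f.read()
    out, pos = [], 0
    # findall_long yields non-overlapping (start, end, payload) matches,
    # preferring the longest key at each position
    for start, end, link in trie.findall_long(text):
        out.append(text[pos:start])
        out.append(f'<a href="{link}">{text[start:end]}</a>')
        pos = end
    out.append(text[pos:])
    with open(out_path, 'w') as f:
        f.write(''.join(out))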
Example 17
def setUp(self):
    self.tree = NoAho()
Example 18
import unittest

from noaho import NoAho


class AhoCorasickTest(unittest.TestCase):
    def setUp(self):
        self.tree = NoAho()

    def tearDown(self):
        self.tree = None

    def test_compile_before_use(self):
        self.tree.add('bar')
        self.assertRaises(AssertionError,
                          lambda: self.tree.find_short('xxxbaryyy'))
        self.tree.compile()
        self.tree.find_short('xxxbaryyy')
        self.assertRaises(AssertionError, lambda: self.tree.add('foo'))

    def test_keyword_as_prefix_of_another(self):
        """According to John, there's a problem with the matcher.
        this test case should expose the bug."""
        self.tree.add('foobar')
        self.tree.add('foo')
        self.tree.add('bar')
        self.tree.compile()
        self.assertEqual((3, 6, None), self.tree.find_short('xxxfooyyy'))
        self.assertEqual((0, 3, None), self.tree.find_short('foo'))
        self.assertEqual((3, 6, None), self.tree.find_short('xxxbaryyy'))

    def test_another_find(self):
        """Just to triangulate the search code.  We want to make sure
        that the implementation can do more than one search, at
        least."""
        self.tree.add("Python")
        self.tree.add("PLT Scheme")
        self.tree.compile()
        self.assertEqual((19, 25, None), self.tree.find_short(
            "I am learning both Python and PLT Scheme"))
        self.assertEqual((0, 10, None), self.tree.find_short(
            "PLT Scheme is an interesting language."))

    def test_simple_construction(self):
        self.tree.add("foo")
        self.tree.add("bar")
        self.tree.compile()
        self.assertEqual((10, 13, None),
                         self.tree.find_short("this is a foo message"))
        self.assertEqual(self.tree.children_count(), 6)

    def test_find_longest(self):
        self.tree.add("a")
        self.tree.add("alphabet")
        self.tree.compile()
        self.assertEqual((0, 1, None), self.tree.find_short("alphabet soup"))
        self.assertEqual((0, 8, None), self.tree.find_long("alphabet soup"))
        self.assertEqual((13, 14, None), self.tree.find_long(
            "yummy, I see an alphabet soup bowl"))

    def test_find_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_short(longString))

    def test_find_longest_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_long(longString))

    def test_find_longest_with_no_match(self):
        self.tree.add("foobar")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_long("fooba"))

    def test_with_expected_non_match(self):
        """Check to see that we don't always get a successful match."""
        self.tree.add("wise man")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_short(
            "where fools and wise men fear to tread"))

    def test_reject_empty_key(self):
        self.assertRaises(ValueError, self.tree.add, "")

    def test_empty_construction(self):
        """Make sure that we can safely construct and dealloc a tree
        with no initial keywords.  Important because the C
        implementation assumes keywords exist on its dealloc, so we
        have to do some work on the back end to avoid silly segmentation
        errors."""
        tree = NoAho()
        del tree

    def test_embedded_nulls(self):
        """Check to see if we can accept embedded nulls"""
        self.tree.add("hell\0 world")
        self.tree.compile()
        self.assertEqual((None, None, None),
                         self.tree.find_short("ello\0 world"))
        self.assertEqual((0, 11, None), self.tree.find_short("hell\0 world"))

    def test_embedded_nulls_again(self):
        self.tree.add("\0\0\0")
        self.tree.compile()
        self.assertEqual((0, 3, None),
                         self.tree.find_short("\0\0\0\0\0\0\0\0"))

    def test_findall_and_findall_longest(self):
        self.tree.add("python")
        self.tree.add("perl")
        self.tree.add("scheme")
        self.tree.add("java")
        self.tree.add("pythonperl")
        self.tree.compile()
        self.assertEqual(
            [(0, 6, None), (6, 10, None), (10, 16, None), (16, 20, None)],
            list(self.tree.findall_short("pythonperlschemejava")))
        self.assertEqual([(0, 10, None), (10, 16, None), (16, 20, None)],
                         list(self.tree.findall_long("pythonperlschemejava")))
        self.assertEqual([],
                         list(self.tree.findall_short("no pascal here")))
        self.assertEqual([],
                         list(self.tree.findall_long("no pascal here")))

    def test_bug2_competing_longests(self):
        """Previously we'd return the /last/ key found, now we look forward
        while there are contiguous candidate keys, and actually return the
        longest.
        """
        self.tree.add('cisco', 'cisco')
        self.tree.add('em', 'em')
        self.tree.add('cisco systems australia', 'cisco systems')
        self.tree.compile()
        self.assertEqual([(0, 5, 'cisco'), (10, 12, 'em')],
                         list(self.tree.findall_long('cisco systems')))

    def test_bug3_false_terminal_nodes(self):
        self.tree.add('an', None)
        self.tree.add('canal', None)
        self.tree.add('e can oilfield', None)
        self.tree.compile()
        self.assertEqual([(4, 4+5, None)],
                         list(self.tree.findall_long('one canal')))

    def test_payload(self):
        class RandomClass(object):
            def __init__(self):
                pass
        obj = RandomClass()
        self.tree.add("python", "yes-python")
        self.tree.add("perl", "")
        self.tree.add("scheme", None)
        self.tree.add("lisp", [1, 2, 3])
        # no payload, comes out None
        self.tree.add("C++")
        self.tree.add("dylan", obj)
        self.tree.compile()

        self.assertEqual((0, 6, "yes-python"), self.tree.find_short("python"))
        self.assertEqual((0, 4, ""), self.tree.find_short("perl"))
        self.assertEqual((0, 6, None), self.tree.find_short("scheme"))
        self.assertEqual((0, 4, [1, 2, 3]), self.tree.find_short("lisp"))
        self.assertEqual((0, 3, None), self.tree.find_short("C++"))
        self.assertEqual((0, 5, obj), self.tree.find_short("dylan"))

    def test_dict_style_get_and_set(self):
        self.tree['foo'] = 5
        self.assertEqual(5, self.tree['foo'])

    def test_dict_style_set_empty_key(self):
        # equivalent to self.tree[''] = None
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, '', None)

    def test_dict_style_set_nonstring_key(self):
        # equivalent to self.tree[''] = None
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, 6, None)
        self.assertRaises(ValueError, self.tree.__setitem__, None, None)
        self.assertRaises(ValueError, self.tree.__setitem__, [], None)

    def test_dict_style_get_unseen_key(self):
        # __getitem__ implements this part of the [] protocol
        self.assertRaises(KeyError, self.tree.__getitem__, 'unseen')
        self.assertRaises(KeyError, self.tree.__getitem__, '')

    def test_dict_style_containment(self):
        self.tree['foo'] = 5
        self.assertEqual(True, 'foo' in self.tree)
        self.assertEqual(False, '' in self.tree)
        self.assertEqual(False, 'fo' in self.tree)
        self.assertEqual(False, 'o' in self.tree)
        self.assertEqual(False, 'oo' in self.tree)
        self.assertEqual(False, 'f' in self.tree)

    def test_dict_style_len(self):
        self.tree['a'] = None
        self.tree['b'] = [1, 2]
        self.tree['c'] = 12
        self.assertEqual(3, len(self.tree))

    # reminder that we need to figure out which version we're in, and
    # test Python 2 unicode explicitly
    @unittest.expectedFailure
    def test_unicode_in_python2(self):
        self.assertEqual(True, False)

    # key iteration is unimplemented
    @unittest.expectedFailure
    def test_iteration(self):
        self.tree.add("Harry")
        self.tree.add("Hermione")
        self.tree.add("Ron")
        self.assertEqual(set("Harry", "Hermione", "Ron"),
                         set(self.tree.keys()))

    # reminder that we need to implement findall_short
    @unittest.expectedFailure
    def test_subset(self):
        self.tree.add("he")
        self.tree.add("hers")
        self.assertEqual([(0, 2, None), (0, 4, None)],
                         list(self.tree.findall_short("hers")))
Example 19
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from __future__ import unicode_literals
from noaho import NoAho  # multi-pattern matching
from collections import Counter, defaultdict
trie = NoAho()
trie.add('hehe')
trie.add('py')
trie.add('python')

txt = """
我是谁不重要,重要的是你要学会python, hehe我是谁不重要,重要的是你要学会python
小米科技有限公司
"""
'''
c = defaultdict(int)
words = [txt[k[0]:k[1]] for k in trie.findall_long(txt)]
wc = Counter(words)

for k in trie.findall_long(txt):
    word = txt[k[0]:k[1]]
    c[word] += 1
    #print(k)
    print(txt[k[0]:k[1]])


for k, v in wc.items():
    print k, v
'''
trie.compile()  # NoAho requires compile() before searching
k = trie.find_short(txt)
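# With the keys above, find_short reports the earliest-finishing match: the
# 'py' prefix of the first 'python' in txt, as (start, start + 2, None).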
Example 20
import unittest

from noaho import NoAho


class AhoCorasickTest(unittest.TestCase):
    def setUp(self):
        self.tree = NoAho()

    def tearDown(self):
        self.tree = None

    def test_compile_before_use(self):
        self.tree.add('bar')
        self.assertRaises(AssertionError,
                          lambda: self.tree.find_short('xxxbaryyy'))
        self.tree.compile()
        self.tree.find_short('xxxbaryyy')
        self.assertRaises(AssertionError, lambda: self.tree.add('foo'))

    def test_keyword_as_prefix_of_another(self):
        """According to John, there's a problem with the matcher.
        this test case should expose the bug."""
        self.tree.add('foobar')
        self.tree.add('foo')
        self.tree.add('bar')
        self.tree.compile()
        self.assertEqual((3, 6, None), self.tree.find_short('xxxfooyyy'))
        self.assertEqual((0, 3, None), self.tree.find_short('foo'))
        self.assertEqual((3, 6, None), self.tree.find_short('xxxbaryyy'))

    def test_another_find(self):
        """Just to triangulate the search code.  We want to make sure
        that the implementation can do more than one search, at
        least."""
        self.tree.add("Python")
        self.tree.add("PLT Scheme")
        self.tree.compile()
        self.assertEqual(
            (19, 25, None),
            self.tree.find_short("I am learning both Python and PLT Scheme"))
        self.assertEqual(
            (0, 10, None),
            self.tree.find_short("PLT Scheme is an interesting language."))

    def test_simple_construction(self):
        self.tree.add("foo")
        self.tree.add("bar")
        self.tree.compile()
        self.assertEqual((10, 13, None),
                         self.tree.find_short("this is a foo message"))
        self.assertEqual(self.tree.children_count(), 6)

    def test_find_longest(self):
        self.tree.add("a")
        self.tree.add("alphabet")
        self.tree.compile()
        self.assertEqual((0, 1, None), self.tree.find_short("alphabet soup"))
        self.assertEqual((0, 8, None), self.tree.find_long("alphabet soup"))
        self.assertEqual(
            (13, 14, None),
            self.tree.find_long("yummy, I see an alphabet soup bowl"))

    def test_find_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_short(longString))

    def test_find_longest_with_whole_match(self):
        """Make sure that longest search will match the whole string."""
        longString = "supercalifragilisticexpialidocious"
        self.tree.add(longString)
        self.tree.compile()
        self.assertEqual((0, len(longString), None),
                         self.tree.find_long(longString))

    def test_find_longest_with_no_match(self):
        self.tree.add("foobar")
        self.tree.compile()
        self.assertEqual((None, None, None), self.tree.find_long("fooba"))

    def test_with_expected_non_match(self):
        """Check to see that we don't always get a successful match."""
        self.tree.add("wise man")
        self.tree.compile()
        self.assertEqual(
            (None, None, None),
            self.tree.find_short("where fools and wise men fear to tread"))

    def test_reject_empty_key(self):
        self.assertRaises(ValueError, self.tree.add, "")

    def test_empty_construction(self):
        """Make sure that we can safely construct and dealloc a tree
        with no initial keywords.  Important because the C
        implementation assumes keywords exist on its dealloc, so we
        have to do some work on the back end to avoid silly segmentation
        errors."""
        tree = NoAho()
        del tree

    def test_embedded_nulls(self):
        """Check to see if we can accept embedded nulls"""
        self.tree.add("hell\0 world")
        self.tree.compile()
        self.assertEqual((None, None, None),
                         self.tree.find_short("ello\0 world"))
        self.assertEqual((0, 11, None), self.tree.find_short("hell\0 world"))

    def test_embedded_nulls_again(self):
        self.tree.add("\0\0\0")
        self.tree.compile()
        self.assertEqual((0, 3, None),
                         self.tree.find_short("\0\0\0\0\0\0\0\0"))

    def test_findall_and_findall_longest(self):
        self.tree.add("python")
        self.tree.add("perl")
        self.tree.add("scheme")
        self.tree.add("java")
        self.tree.add("pythonperl")
        self.tree.compile()
        self.assertEqual([(0, 6, None), (6, 10, None), (10, 16, None),
                          (16, 20, None)],
                         list(self.tree.findall_short("pythonperlschemejava")))
        self.assertEqual([(0, 10, None), (10, 16, None), (16, 20, None)],
                         list(self.tree.findall_long("pythonperlschemejava")))
        self.assertEqual([], list(self.tree.findall_short("no pascal here")))
        self.assertEqual([], list(self.tree.findall_long("no pascal here")))

    def test_bug2_competing_longests(self):
        """Previously we'd return the /last/ key found, now we look forward
        while there are contiguous candidate keys, and actually return the
        longest.
        """
        self.tree.add('cisco', 'cisco')
        self.tree.add('em', 'em')
        self.tree.add('cisco systems australia', 'cisco systems')
        self.tree.compile()
        self.assertEqual([(0, 5, 'cisco'), (10, 12, 'em')],
                         list(self.tree.findall_long('cisco systems')))

    def test_bug3_false_terminal_nodes(self):
        self.tree.add('an', None)
        self.tree.add('canal', None)
        self.tree.add('e can oilfield', None)
        self.tree.compile()
        self.assertEqual([(4, 4 + 5, None)],
                         list(self.tree.findall_long('one canal')))

    def test_payload(self):
        class RandomClass(object):
            def __init__(self):
                pass

        obj = RandomClass()
        self.tree.add("python", "yes-python")
        self.tree.add("perl", "")
        self.tree.add("scheme", None)
        self.tree.add("lisp", [1, 2, 3])
        # no payload, comes out None
        self.tree.add("C++")
        self.tree.add("dylan", obj)
        self.tree.compile()

        self.assertEqual((0, 6, "yes-python"), self.tree.find_short("python"))
        self.assertEqual((0, 4, ""), self.tree.find_short("perl"))
        self.assertEqual((0, 6, None), self.tree.find_short("scheme"))
        self.assertEqual((0, 4, [1, 2, 3]), self.tree.find_short("lisp"))
        self.assertEqual((0, 3, None), self.tree.find_short("C++"))
        self.assertEqual((0, 5, obj), self.tree.find_short("dylan"))

    def test_dict_style_get_and_set(self):
        self.tree['foo'] = 5
        self.assertEqual(5, self.tree['foo'])

    def test_dict_style_set_empty_key(self):
        # equivalent to self.tree[''] = None
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, '', None)

    def test_dict_style_set_nonstring_key(self):
        # equivalent to self.tree[6] = None, etc.
        # __setitem__ implements this part of the [] protocol
        self.assertRaises(ValueError, self.tree.__setitem__, 6, None)
        self.assertRaises(ValueError, self.tree.__setitem__, None, None)
        self.assertRaises(ValueError, self.tree.__setitem__, [], None)

    def test_dict_style_get_unseen_key(self):
        # __getitem__ implements this part of the [] protocol
        self.assertRaises(KeyError, self.tree.__getitem__, 'unseen')
        self.assertRaises(KeyError, self.tree.__getitem__, '')

    def test_dict_style_containment(self):
        self.tree['foo'] = 5
        self.assertEqual(True, 'foo' in self.tree)
        self.assertEqual(False, '' in self.tree)
        self.assertEqual(False, 'fo' in self.tree)
        self.assertEqual(False, 'o' in self.tree)
        self.assertEqual(False, 'oo' in self.tree)
        self.assertEqual(False, 'f' in self.tree)

    def test_dict_style_len(self):
        self.tree['a'] = None
        self.tree['b'] = [1, 2]
        self.tree['c'] = 12
        self.assertEqual(3, len(self.tree))

    # reminder that we need to figure out which version we're in, and
    # test Python 2 unicode explicitly
    @unittest.expectedFailure
    def test_unicode_in_python2(self):
        self.assertEqual(True, False)

    # key iteration is unimplemented
    @unittest.expectedFailure
    def test_iteration(self):
        self.tree.add("Harry")
        self.tree.add("Hermione")
        self.tree.add("Ron")
        self.assertEqual({"Harry", "Hermione", "Ron"},
                         set(self.tree.keys()))

    # reminder that we need to implement findall_short
    @unittest.expectedFailure
    def test_subset(self):
        self.tree.add("he")
        self.tree.add("hers")
        self.assertEqual([(0, 2, None), (0, 4, None)],
                         list(self.tree.findall_short("hers")))
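
# Taken together, these tests document the core NoAho lifecycle: add keywords
# (optionally with payloads), compile exactly once, then query:
#
#     tree = NoAho()
#     tree.add('foo', 'payload-for-foo')    # payload is optional
#     tree['bar'] = 42                      # dict-style add
#     tree.compile()                        # required before any find*/findall*
#     tree.find_short('xxxfooyyy')          # -> (3, 6, 'payload-for-foo')
#     list(tree.findall_long('foobarfoo'))  # non-overlapping longest matches

if __name__ == '__main__':
    unittest.main()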
Example n. 21
import csv
import cPickle
import logging
import re

from django.conf import settings
from django.contrib.gis.gdal import DataSource
from django.contrib.gis.geos import (GEOSGeometry, LinearRing, LineString,
                                     MultiPolygon, Point, Polygon)
from noaho import NoAho

# Assumed here: GK25 refers to ETRS89 / GK25FIN (EPSG:3879), the Helsinki
# city survey projection.
GK25_SRID = 3879


class AhjoGeocoder(object):
    PLAN_UNIT_SHORT_MATCH = r'^(\d{3,5})/(\d+)(.*)$'
    PLAN_UNIT_LONG_MATCH = r'^0?91-(\d+)-(\d+)-(\d+)(.*)$'
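    # SHORT form matches e.g. '4142/11'; LONG form matches e.g. '91-4-142-11'
    # or '091-4-142-11' (hypothetical ids for illustration).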

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.no_match_addresses = []
        self.no_match_plans = []
        self.no_match_plan_units = []
        self.plan_map = {}
        self.plan_unit_map = {}
        self.property_map = {}
        self.street_tree = None
        self.matches = 0

    def convert_from_gk25(self, north, east):
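        # The address data stores (north, east); GEOS Point takes (x, y),
        # i.e. (east, north).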
        pnt = Point(east, north, srid=GK25_SRID)
        pnt.transform(settings.PROJECTION_SRID)
        return pnt

    def geocode_address(self, text):
        if not self.street_tree:
            return {}

        STREET_SUFFIXES = ('katu', 'tie', 'kuja', 'polku', 'kaari', 'linja',
                           'raitti', 'rinne', 'penger', 'ranta', u'väylä')
        for sfx in STREET_SUFFIXES:
            # pass re.UNICODE so \w also matches ä/ö in Finnish street names
            m = re.search(r'([A-Z]\w+%s)\s+(\d+)' % sfx, text, re.UNICODE)
            if not m:
                continue
            street_name = m.groups()[0].lower()
            if street_name not in self.street_hash:
                print "Street name not found: %s" % street_name.encode('utf8')
                self.no_match_addresses.append('%s %s' % (m.groups()[0], m.groups()[1]))
        textl = text.lower()
        ret = [x for x in self.street_tree.findall_long(textl)]
        geometries = {}
        for street_match in ret:
            (start, end) = street_match[0:2]
            street_name = textl[start:end]
            # check for the address number
            m = re.match(r'\s*(\d+)', text[end:])
            if not m:
                #print "\tno address: %s" % text[start:]
                continue
            num = int(m.groups()[0])

            e_list = self.street_hash[street_name]
            for e in e_list:
                if num == e['num']:
                    break
                if e['num_end'] and e['num'] < num <= e['num_end']:
                    break
            else:
                self.logger.warning("No match found for '%s %d'" % (street_name, num))
                s = '%s %d' % (e['street'], num)
                if s not in self.no_match_addresses:
                    self.no_match_addresses.append(s)
                continue

            pnt = self.convert_from_gk25(e['coord_n'], e['coord_e'])
            geom = {'name': '%s %d' % (e['street'], num), 'geometry': pnt,
                    'type': 'address', 'text': text}
            geom_id = "%s/%s" % (geom['type'], geom['name'])
            geometries[geom_id] = geom
        return geometries

    def geocode_plan(self, plan_id):
        plan = self.plan_map.get(plan_id)
        if not plan:
            if plan_id not in self.no_match_plans:
                self.logger.warning("No plan found for plan id %s" % plan_id)
                self.no_match_plans.append(plan_id)
            return
        return {'name': plan_id, 'geometry': plan['geometry'], 'type': 'plan'}

    def geocode_plan_unit(self, text, context):
        # If there is more than one '/' character, it's not a plan unit.
        m = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        if m:
            if text.count('/') > 1:
                return None
            block_id, unit_id, rest = m.groups()
            block_id = int(block_id)
            unit_id = int(unit_id)
            district_id = block_id // 1000
            block_id %= 1000
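            # e.g. a hypothetical '4142/11' gives district_id 4, block_id 142,
            # unit_id 11 after the // 1000 and %= 1000 split above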
            # TODO: Code the logic to extract and use unit
            #       ids from the rest of the match.
            # if rest:
            #     if rest[0].lower() in ('a', 'b', 'c', 'd', 'e'):
            #         rest = rest[1:]
            #     rest = rest.strip()
            #     if rest and rest[0] == '-':
            #         range_end = int(re.match('-\s?(\d)+', rest).groups()[0])
            #     elif rest.startswith('ja'):
            #         range_end = int(rest[2:])
            #     elif rest.lower().startswith('.a'): # Ksv notation
            #         pass
            #     elif rest.startswith(':'): # ???
            #         pass
            # check for '161/3.A' style
            if not district_id:
                for l in context['all_text']:
                    m = re.match(r'(\d+)\.ko', l, re.I)
                    if not m:
                        continue
                    district_id = int(m.groups()[0])
                    break
                if not district_id:
                    self.logger.warning("No district id found for '%s'" % text)
                    return None
        else:
            m = re.match(self.PLAN_UNIT_LONG_MATCH, text)
            district_id, block_id, unit_id = [int(x) for x in m.groups()[0:3]]
            rest = m.groups()[3]

        jhs_id = '091%03d%04d%04d' % (district_id, block_id, unit_id)
        name = '91-%d-%d-%d' % (district_id, block_id, unit_id)
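        # e.g. district 4, block 142, unit 11 (hypothetical) give
        # jhs_id '09100401420011' and name '91-4-142-11'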
        plan_unit = self.plan_unit_map.get(jhs_id, None)
        prop = self.property_map.get(jhs_id, None)
        geometry = None
        if plan_unit:
            geometry = plan_unit['geometry']
        elif prop:
            geometry = prop['geometry']
        else:
            print("No geometry found for '%s'" % jhs_id)
            self.logger.warning("No geometry found for '%s'" % jhs_id)
            self.no_match_plan_units.append([text, jhs_id])
            return None

        self.matches += 1
        return {'name': name, 'type': 'plan_unit', 'geometry': geometry}

    def geocode_district(self, text):
        # Not implemented; callers treat a None return as "no match".
        return

    def geocode_from_text(self, text, context):
        text = text.strip()
        if not isinstance(text, unicode):
            text = unicode(text)

        geometries = {}

        # Check for plan unit IDs
        m1 = re.match(self.PLAN_UNIT_SHORT_MATCH, text)
        m2 = re.match(self.PLAN_UNIT_LONG_MATCH, text)
        if m1 or m2:
            geom = self.geocode_plan_unit(text, context)
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom
            return geometries

        m = re.match(r'^(\d{3,5})\.[pP]$', text)
        if m:
            geom = self.geocode_plan(m.groups()[0])
            if geom:
                geom['text'] = text
                geom_id = "%s/%s" % (geom['type'], geom['name'])
                geometries[geom_id] = geom

        geometries.update(self.geocode_address(text))

        return geometries

    def geocode_from_text_list(self, text_list):
        geometries = {}
        context = {'all_text': text_list}
        for text in text_list:
            g = self.geocode_from_text(text, context)
            geometries.update(g)
        return geometries.values()

    def load_address_database(self, csv_file):
        reader = csv.reader(csv_file, delimiter=',')
        reader.next()  # skip the CSV header row
        addr_hash = {}
        for idx, row in enumerate(reader):
            row_type = int(row[-2])
            if row_type != 1:
                continue
            street = row[0].strip()
            if not row[1]:
                continue
            num = int(row[1])
            if not num:
                continue
            # convert to int so the range check in geocode_address compares numbers
            num2 = int(row[2]) if row[2] else None
            letter = row[3].strip()
            muni_name = row[10].strip()
            coord_n = int(row[8])
            coord_e = int(row[9])
            if muni_name != "Helsinki":
                continue
            e = {'muni': muni_name, 'street': street, 'num': num, 'num_end': num2,
                 'letter': letter, 'coord_n': coord_n, 'coord_e': coord_e}
            # decode before lower() so non-ASCII letters (ä/ö) lowercase correctly
            street = street.decode('utf8').lower()
            num_list = addr_hash.setdefault(street, [])
            for s in num_list:
                if e['num'] == s['num'] and e['num_end'] == s['num_end'] and e['letter'] == s['letter']:
                    break
            else:
                num_list.append(e)

        self.street_hash = addr_hash
        self.street_tree = NoAho()
        print "%d street names loaded" % len(self.street_hash)
        for street in self.street_hash.keys():
            self.street_tree.add(street)
        self.street_tree.compile()  # required before findall_long in geocode_address

    def _load_mapinfo(self, ds, id_field_name, id_fixer=None):
        geom_map = {}
        lyr = ds[0]
        for idx, feat in enumerate(lyr):
            origin_id = feat[id_field_name].as_string().strip()
            if id_fixer:
                origin_id = id_fixer(origin_id)
            geom = feat.geom
            geom.srid = GK25_SRID
            geom.transform(settings.PROJECTION_SRID)
            if origin_id not in geom_map:
                plan = {'geometry': None}
                geom_map[origin_id] = plan
            else:
                plan = geom_map[origin_id]
            poly = GEOSGeometry(geom.wkb, srid=geom.srid)
            if isinstance(poly, LineString):
                try:
                    ring = LinearRing(poly.tuple)
                except Exception:
                    self.logger.error("Skipping plan %s, its LineString doesn't close." % origin_id)
                    # if the LineString doesn't form a polygon, skip it.
                    continue
                poly = Polygon(ring)
            if plan['geometry']:
                if isinstance(plan['geometry'], Polygon):
                    plan['geometry'] = MultiPolygon(plan['geometry'])
                if isinstance(poly, MultiPolygon):
                    plan['geometry'].extend(poly)
                else:
                    plan['geometry'].append(poly)
            else:
                plan['geometry'] = poly

        for key, e in geom_map.items():
            geom = e['geometry']
            if geom is None:
                # every feature for this id was skipped (e.g. unclosed linestring)
                continue
            if not geom.valid:
                self.logger.warning("geometry for %s not OK, fixing" % key)
                geom = geom.simplify()
                assert geom.valid
                e['geometry'] = geom
        return geom_map

    def load_plans(self, plan_file, in_effect):
        if getattr(self, 'all_plans_loaded', False):
            return
        if not in_effect: # Okay, this is hacky!
            try:
                picklef = open('plans.pickle', 'rb')
                self.plan_map = cPickle.load(picklef)
                self.all_plans_loaded = True
                print "%d pickled plans loaded" % len(self.plan_map)
                return
            except IOError:
                pass

        ds = DataSource(plan_file, encoding='iso8859-1')

        plan_map = self._load_mapinfo(ds, 'kaavatunnus')
        print "%d plans imported" % len(plan_map)
        self.plan_map.update(plan_map)

        if in_effect:
            picklef = open('plans.pickle', 'wb')
            cPickle.dump(self.plan_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_plan_units(self, plan_unit_file):
        try:
            picklef = open('plan_units.pickle', 'rb')
            self.plan_unit_map = cPickle.load(picklef)
            print "%d plan units loaded" % len(self.plan_unit_map)
            return
        except IOError:
            pass

        ds = DataSource(plan_unit_file, encoding='iso8859-1')

        self.plan_unit_map = self._load_mapinfo(ds, 'jhstunnus')

        print "%d plan units imported" % len(self.plan_unit_map)

        picklef = open('plan_units.pickle', 'wb')
        cPickle.dump(self.plan_unit_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)

    def load_properties(self, property_file):
        try:
            picklef = open('geo_properties.pickle', 'rb')
            self.property_map = cPickle.load(picklef)
            print "%d properties loaded" % len(self.property_map)
            return
        except IOError:
            pass

        def fix_property_id(s):
            if s[0] != '0':
                return '0' + s
            return s

        ds = DataSource(property_file, encoding='iso8859-1')

        self.property_map = self._load_mapinfo(ds, 'Kiinteistotunnus', id_fixer=fix_property_id)

        print "%d properties imported" % len(self.property_map)

        picklef = open('geo_properties.pickle', 'wb')
        cPickle.dump(self.property_map, picklef, protocol=cPickle.HIGHEST_PROTOCOL)
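
# A hypothetical driver showing how the class wires together; the file names
# below are assumptions for illustration, not part of the source.
if __name__ == '__main__':
    geocoder = AhjoGeocoder()
    with open('addresses.csv', 'rb') as f:
        geocoder.load_address_database(f)
    geocoder.load_plans('plans.tab', in_effect=True)
    geocoder.load_plan_units('plan_units.tab')
    geocoder.load_properties('properties.tab')
    for geom in geocoder.geocode_from_text_list([u'Mannerheimintie 5',
                                                 u'4142/11']):
        print geom['type'], geom['name']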
Example n. 22
 def setUp(self):
     self.tree = NoAho()