Example #1
    def restore(self, stream):
        """Restore the dictionary from a previously saved stream.

        The first line holds max_edit_distance and longest_word_length
        separated by ':'. Each remaining line holds a term, its corpus
        frequency, and its suggested corrections, also ':' separated.
        """
        self.modify_lock.acquire()
        try:
            ln = stream.readline().strip().split(b':')
            self.max_edit_distance = int(ln[0])
            self.longest_word_length = int(ln[1])
            for line in stream:
                ln = line.strip().split(b':')
                # Entries are (list of suggested corrections, corpus frequency).
                words = [safe_utf8_decode(x) for x in ln[2:] if len(x) != 0]
                self.dictionary[safe_utf8_decode(ln[0])] = (words, int(ln[1]))
        finally:
            self.modify_lock.release()
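For reference, a minimal sketch of the stream format restore() expects, assuming a matching save() writes one ':'-separated line per dictionary entry (the SymSpell class name and the io.BytesIO usage here are illustrative, not the project's API):

import io

# First line: max_edit_distance and longest_word_length.
# Entry lines: term:frequency:suggestion:suggestion:...
data = b'2:9\n' \
       b'file:5:\n' \
       b'fil:0:file\n'
checker = SymSpell()  # hypothetical class exposing restore()
checker.restore(io.BytesIO(data))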
Example #2
def ccg_parse(client, sentence, session_id=DEFAULT_SESSION, timeout=0):
    """Parse the sentence using the specified session.

    Args:
        client: The client end-point stub returned from get_client_transport()
        sentence: The sentence. Can be unicode, utf-8, or ascii.
        session_id: Optional session id.
        timeout: If non-zero, make the call asynchronously with a timeout equal to this value.
            Typically not needed unless the call may time out when run synchronously.

    Returns:
        The response message string.
    """
    isUnicode = isinstance(sentence, unicode)
    if isUnicode:
        # CCG Parser is Java so input must be utf-8 or ascii
        sentence = sentence.encode('utf-8')
    query_input = create_query_input('text', sentence)
    request = Request()
    request.LUCID = session_id
    request.spec.name = 'infer'
    request.spec.content.extend([query_input])
    if timeout <= 0:
        response = client.infer(request)
    else:
        infer_future = client.infer.future(request, timeout)
        # FIXME: Need to add error reporting to Response structure.
        response = infer_future.result()
    if future_string == unicode:
        # The module's native string type is unicode, so always return unicode.
        isUnicode = True
    if isinstance(response.msg, unicode):
        return response.msg if isUnicode else safe_utf8_encode(response.msg)
    return response.msg if not isUnicode else safe_utf8_decode(response.msg)
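A minimal usage sketch, assuming a client stub obtained from get_client_transport() (the host, port, and the form of the returned stub are assumptions for illustration):

# Hypothetical setup; the arguments to get_client_transport() are assumed.
client = get_client_transport('localhost', 8084)
msg = ccg_parse(client, 'The boy wants to believe the girl.')
print(msg)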
Example #3
def strip_apostrophe_s(word):
    """Strip trailing 's from nouns.

    Args:
        word: An ascii or utf-8 string.

    Returns:
        The stripped word.
    """
    # Must support utf-8
    if len(word) > 2:
        if word.endswith("'s"):
            return word[0:-2]
        elif isinstance(word, unicode):
            if word.endswith(u"’s"):
                return word.replace(u"’s", u'')
        else:
            uword = safe_utf8_decode(word)
            if uword.endswith(u"’s"):
                return safe_utf8_encode(uword.replace(u"’s", u''))
    return word
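A quick illustration of the intended behaviour, assuming safe_utf8_decode and safe_utf8_encode are thin UTF-8 codec wrappers:

print(strip_apostrophe_s("dog's"))        # -> 'dog'
print(strip_apostrophe_s(u"dog\u2019s"))  # -> u'dog' (curly apostrophe)
print(strip_apostrophe_s("dog"))          # -> 'dog' (unchanged)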
Example #4
    def create_dictionary_entry(self, w):
        """Add word `w` and its derived deletions to dictionary.

        Remarks:
            Not threadsafe.
        """
        # check if word is already in dictionary
        # dictionary entries are in the form: (list of suggested corrections,
        # frequency of word in corpus)
        w = safe_utf8_decode(w)
        new_real_word_added = False
        if w in self.dictionary:
            # increment frequency of word in corpus
            entry = (self.dictionary[w][0], self.dictionary[w][1] + 1)
        else:
            entry = ([], 1)
            self.longest_word_length = max(self.longest_word_length, len(w))
        self.dictionary[w] = entry

        if entry[1] == 1:
            # first appearance of word in corpus
            # n.b. word may already be in dictionary as a derived word
            # (deleting a character from a real word), but the corpus
            # frequency counter is not incremented in those cases
            new_real_word_added = True
            deletes = self.get_deletes_list(w)
            for item in deletes:
                # If not in dictionary add empty suggestion list and zero frequency
                # then add (correct) word to delete's suggested correction list
                self.dictionary.setdefault(item, ([], 0))[0].append(w)
        return new_real_word_added
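For context, a sketch of the kind of delete generation get_deletes_list() performs in a SymSpell-style dictionary. This is an assumption about the helper, shown only to make the derived-deletions idea concrete; the project's implementation may differ:

def get_deletes_list_sketch(w, max_edit_distance=2):
    """Return all terms derived from `w` by deleting up to
    `max_edit_distance` characters (SymSpell-style)."""
    deletes = set()
    frontier = {w}
    for _ in range(max_edit_distance):
        next_frontier = set()
        for word in frontier:
            if len(word) > 1:
                for c in range(len(word)):
                    shorter = word[:c] + word[c + 1:]
                    if shorter not in deletes:
                        deletes.add(shorter)
                        next_frontier.add(shorter)
        frontier = next_frontier
    return list(deletes)

# e.g. get_deletes_list_sketch('file', 1) -> ['ile', 'fle', 'fie', 'fil'] (in some order)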
Example #5
    def __unicode__(self):
        return safe_utf8_decode(self._get_str())
Example #6
    def __unicode__(self):
        return safe_utf8_decode(self._s)
Example #7
    def get_suggestions(self, string, suggest_mode=BEST_SUGGESTION):
        """Return list of suggested corrections for the potentially incorrectly spelled word.

        # get_suggestions('file', suggest_mode=BEST_SUGGESTION)
          returns 'file'
        # get_suggestions('file', suggest_mode=NBEST_SUGGESTIONS)
          returns ['file', 'five', 'fire', 'fine', ...]
        # get_suggestions('file', suggest_mode=ALL_SUGGESTIONS)
          returns [('file', (5, 0)), ('five', (67, 1)), ('fire', (54, 1)), ('fine', (17, 1))...]
        """
        global BEST_SUGGESTION, NBEST_SUGGESTIONS, ALL_SUGGESTIONS
        if (len(string) - self.longest_word_length) > self.max_edit_distance:
            if not self.silent:
                print("no items in dictionary within maximum edit distance")
            return []

        string = safe_utf8_decode(string)
        suggest_dict = {}
        min_suggest_len = float('inf')

        queue = collections.deque([string])
        q_dictionary = {}  # items other than string that we've checked

        while len(queue) != 0:
            q_item = queue.popleft()

            # early exit
            if suggest_mode < ALL_SUGGESTIONS and len(suggest_dict) > 0 and \
                    (len(string)-len(q_item)) > min_suggest_len:
                break

            # process queue item
            if q_item in self.dictionary and q_item not in suggest_dict:
                q_item_entry = self.dictionary[q_item]
                if q_item_entry[1] > 0:
                    # word is in dictionary, and is a word from the corpus, and
                    # not already in suggestion list so add to suggestion
                    # dictionary, indexed by the word with value (frequency in
                    # corpus, edit distance)
                    # note q_items that are not the input string are shorter
                    # than input string since only deletes are added (unless
                    # manual dictionary corrections are added)
                    assert len(string) >= len(q_item)
                    suggest_dict[q_item] = (q_item_entry[1], len(string) - len(q_item))
                    # early exit
                    if suggest_mode < ALL_SUGGESTIONS and len(string) == len(q_item):
                        break
                    elif (len(string) - len(q_item)) < min_suggest_len:
                        min_suggest_len = len(string) - len(q_item)

                # the suggested corrections for q_item as stored in
                # dictionary (whether or not q_item itself is a valid word
                # or merely a delete) can be valid corrections
                for sc_item in q_item_entry[0]:
                    if sc_item not in suggest_dict:

                        # compute edit distance
                        # suggested items should always be longer
                        # (unless manual corrections are added)
                        assert len(sc_item) > len(q_item)

                        # q_items that are not input should be shorter
                        # than original string
                        # (unless manual corrections added)
                        assert len(q_item) <= len(string)

                        # When q_item equals the input string, the edit
                        # distance is just the number of characters added;
                        # item_dist is recomputed exactly below.
                        if len(q_item) == len(string):
                            assert q_item == string
                            item_dist = len(sc_item) - len(q_item)

                        # item in suggestions list should not be the same as
                        # the string itself
                        assert sc_item != string

                        # calculate edit distance using, for example,
                        # Damerau-Levenshtein distance
                        item_dist = dameraulevenshtein(sc_item, string)

                        # do not add words with greater edit distance if
                        # suggest_mode setting not ALL_SUGGESTIONS
                        if suggest_mode < ALL_SUGGESTIONS and item_dist > min_suggest_len:
                            pass
                        elif item_dist <= self.max_edit_distance:
                            assert sc_item in self.dictionary  # should already be in dictionary if in suggestion list
                            suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                            if item_dist < min_suggest_len:
                                min_suggest_len = item_dist

                        # depending on order words are processed, some words
                        # with different edit distances may be entered into
                        # suggestions; trim suggestion dictionary if suggest_mode
                        # setting not ALL_SUGGESTIONS
                        if suggest_mode < ALL_SUGGESTIONS:
                            suggest_dict = {k:v for k, v in suggest_dict.items() if v[1]<=min_suggest_len}
                            #suggest_dict = dict(filter(lambda x: x[1][1] <= min_suggest_len, suggest_dict.items()))

            # now generate deletes (e.g. a substring of string or of a delete)
            # from the queue item
            # as additional items to check -- add to end of queue
            assert len(string)>=len(q_item)

            # do not add words with greater edit distance if suggest_mode setting
            # is not ALL_SUGGESTIONS
            if suggest_mode < ALL_SUGGESTIONS and (len(string)-len(q_item)) > min_suggest_len:
                pass
            elif (len(string)-len(q_item)) < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)): # character index
                    word_minus_c = q_item[:c] + q_item[c+1:]
                    if word_minus_c not in q_dictionary:
                        queue.append(word_minus_c)
                        q_dictionary[word_minus_c] = None  # arbitrary value, just to identify we checked this

        # queue is now empty: convert suggestions in dictionary to
        # list for output
        if not self.silent and suggest_mode != BEST_SUGGESTION:
            print("number of possible corrections: %i" % len(suggest_dict))
            print("  edit distance for deletions: %i" % self.max_edit_distance)

        # output option NBEST_SUGGESTIONS
        # sort results by ascending order of edit distance and descending
        # order of frequency
        #     and return list of suggested word corrections only:
        # return sorted(suggest_dict, key = lambda x:
        #               (suggest_dict[x][1], -suggest_dict[x][0]))

        # output option ALL_SUGGESTIONS
        # return list of suggestions with (correction,
        #                                  (frequency in corpus, edit distance)):
        as_list = suggest_dict.items()
        # sort by ascending edit distance, then descending frequency
        outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))
        if suggest_mode == BEST_SUGGESTION:
            return outlist[0][0]
        elif suggest_mode == NBEST_SUGGESTIONS:
            return [x[0] for x in outlist]
        return outlist
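For completeness, a compact sketch of the Damerau-Levenshtein (optimal string alignment) distance referenced above. This is an assumption about what the dameraulevenshtein helper computes, included only to make the edit-distance step concrete:

def dameraulevenshtein_sketch(seq1, seq2):
    """Minimum number of insertions, deletions, substitutions and
    adjacent transpositions needed to turn seq1 into seq2."""
    # d[i][j] = distance between the first i chars of seq1 and first j of seq2
    d = [[0] * (len(seq2) + 1) for _ in range(len(seq1) + 1)]
    for i in range(len(seq1) + 1):
        d[i][0] = i
    for j in range(len(seq2) + 1):
        d[0][j] = j
    for i in range(1, len(seq1) + 1):
        for j in range(1, len(seq2) + 1):
            cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
            if (i > 1 and j > 1 and seq1[i - 1] == seq2[j - 2]
                    and seq1[i - 2] == seq2[j - 1]):
                d[i][j] = min(d[i][j], d[i - 2][j - 2] + 1)  # transposition
    return d[len(seq1)][len(seq2)]

# e.g. dameraulevenshtein_sketch('file', 'fiel') -> 1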
Example #8
def make_lexicon(daemon):
    """Build lexicon files for `daemon` from its CCG derivations.

    Reads the ccg_derivation*.txt files under data/ldc/<daemon>/ccgbank and
    writes per-letter predicate usage files to lexicon/az plus per-return-type
    category files to lexicon/rt.
    """
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))

    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')

        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue

            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)

            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                # Count the failure and skip this derivation.
                failed_parse += 1
                continue

            uid = '%s-%04d' % (idx, i)
            try:
                #dictionary[0-25][stem][set([c]), set(uid)]
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                # Report the error and skip this derivation.
                print(e)
                continue

    rtdict = {}
    for idx in range(len(dictionary)):
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(
                            Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
            # Free up memory
            dictionary[idx] = None
            d = None
    for rt, cdict in rtdict.iteritems():
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' %
                         safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')
Example #9
    def __unicode__(self):
        return safe_utf8_decode(self.to_string())