def restore(self, stream):
    self.modify_lock.acquire()
    try:
        # Header line: max_edit_distance:longest_word_length
        ln = stream.readline().strip().split(b':')
        self.max_edit_distance = int(ln[0])
        self.longest_word_length = int(ln[1])
        # Remaining lines: word:frequency:suggestion1:suggestion2:...
        for line in stream:
            ln = line.strip().split(b':')
            words = [safe_utf8_decode(x) for x in ln[2:] if len(x) != 0]
            self.dictionary[safe_utf8_decode(ln[0])] = (words, int(ln[1]))
    finally:
        self.modify_lock.release()
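# A minimal usage sketch for restore(). The owning class name and the file it
# reads are assumptions; the format is inferred from the parsing above: a
# "max_edit_distance:longest_word_length" header, then one
# "word:frequency:delete1:delete2:..." record per line, e.g.
#
#   2:11
#   the:23135851162:
#   te:0:the
#   th:0:the
import io

def load_dictionary(checker, path):
    # checker is any object exposing restore() (hypothetical here);
    # path points at a file saved in the format described above.
    with io.open(path, 'rb') as stream:
        checker.restore(stream)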
def ccg_parse(client, sentence, session_id=DEFAULT_SESSION, timeout=0):
    """Parse the sentence using the specified session.

    Args:
        client: The client end-point stub returned from get_client_transport().
        sentence: The sentence. Can be unicode, utf-8, or ascii.
        session_id: Optional session id.
        timeout: If non-zero, make the call asynchronously with a timeout equal
            to this value. Typically not needed unless the call may time out
            when run synchronously.

    Returns:
        The response message string.
    """
    isUnicode = isinstance(sentence, unicode)
    if isUnicode:
        # CCG Parser is Java so input must be utf-8 or ascii
        sentence = sentence.encode('utf-8')

    query_input = create_query_input('text', sentence)
    request = Request()
    request.LUCID = session_id
    request.spec.name = 'infer'
    request.spec.content.extend([query_input])

    if timeout <= 0:
        response = client.infer(request)
    else:
        infer_future = client.infer.future(request, timeout)
        # FIXME: Need to add error reporting to Response structure.
        response = infer_future.result()

    if future_string == unicode:
        isUnicode = True
    # Return the message in the same string type as the input sentence.
    if isinstance(response.msg, unicode):
        return response.msg if isUnicode else safe_utf8_encode(response.msg)
    return response.msg if not isUnicode else safe_utf8_decode(response.msg)
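# A hedged usage sketch for ccg_parse(). get_client_transport() is referenced
# in the docstring above but its exact signature, return value, and the
# host/port used here are assumptions for illustration only.
def parse_example():
    client, _ = get_client_transport('localhost', 9083)  # assumed helper and port
    # Synchronous call (timeout=0 is the default).
    msg = ccg_parse(client, u'The boy threw the ball.')
    # Asynchronous call with a 60 second timeout.
    msg2 = ccg_parse(client, 'The boy threw the ball.', timeout=60)
    return msg, msg2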
def strip_apostrophe_s(word):
    """Strip trailing 's from nouns.

    Args:
        word: An ascii or utf-8 string.

    Returns:
        The stripped word.
    """
    # Must support utf-8
    if len(word) > 2:
        if word.endswith("'s"):
            return word[0:-2]
        elif isinstance(word, unicode):
            if word.endswith(u"’s"):
                return word.replace(u"’s", u'')
        else:
            uword = safe_utf8_decode(word)
            if uword.endswith(u"’s"):
                return safe_utf8_encode(uword.replace(u"’s", u''))
    return word
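# Expected behaviour of strip_apostrophe_s() for both quote styles, read
# directly from the branches above (inputs are illustrative):
#   strip_apostrophe_s("John's")   -> "John"
#   strip_apostrophe_s(u"John’s")  -> u"John"
#   strip_apostrophe_s("cats")     -> "cats"    (no apostrophe, unchanged)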
def create_dictionary_entry(self, w):
    """Add word `w` and its derived deletions to dictionary.

    Remarks:
        Not threadsafe.
    """
    # check if word is already in dictionary
    # dictionary entries are in the form: (list of suggested corrections,
    # frequency of word in corpus)
    w = safe_utf8_decode(w)
    new_real_word_added = False
    if w in self.dictionary:
        # increment frequency of word in corpus
        entry = (self.dictionary[w][0], self.dictionary[w][1] + 1)
    else:
        entry = ([], 1)
        self.longest_word_length = max(self.longest_word_length, len(w))
    self.dictionary[w] = entry

    if entry[1] == 1:
        # first appearance of word in corpus
        # n.b. word may already be in dictionary as a derived word
        # (deleting character from a real word)
        # but counter of frequency of word in corpus is not incremented
        # in those cases
        new_real_word_added = True
        deletes = self.get_deletes_list(w)
        for item in deletes:
            # If not in dictionary add empty suggestion list and zero frequency,
            # then add (correct) word to delete's suggested correction list
            self.dictionary.setdefault(item, ([], 0))[0].append(w)

    return new_real_word_added
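# A small worked illustration of the delete-derivation idea used above,
# assuming max_edit_distance == 1 so that get_deletes_list('the') yields
# 'he', 'te' and 'th' (the exact output of get_deletes_list is not shown in
# this section). After create_dictionary_entry('the') the dictionary is
# expected to look roughly like:
#
#   'the': ([], 1)        # real corpus word, frequency 1
#   'he' : (['the'], 0)   # derived delete pointing back at 'the'
#   'te' : (['the'], 0)
#   'th' : (['the'], 0)
#
# Repeated calls for the same word only bump its frequency counter.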
def __unicode__(self): return safe_utf8_decode(self._get_str())
def __unicode__(self): return safe_utf8_decode(self._s)
def get_suggestions(self, string, suggest_mode=BEST_SUGGESTION):
    """Return list of suggested corrections for the potentially incorrectly
    spelled word.

    # get_suggestions('file', suggest_mode=BEST_SUGGESTION) returns 'file'
    # get_suggestions('file', suggest_mode=NBEST_SUGGESTIONS) returns
    #     ['file', 'five', 'fire', 'fine', ...]
    # get_suggestions('file', suggest_mode=ALL_SUGGESTIONS) returns
    #     [('file', (5, 0)), ('five', (67, 1)), ('fire', (54, 1)), ('fine', (17, 1)), ...]
    """
    global BEST_SUGGESTION, NBEST_SUGGESTIONS, ALL_SUGGESTIONS
    if (len(string) - self.longest_word_length) > self.max_edit_distance:
        if not self.silent:
            print("no items in dictionary within maximum edit distance")
        return []

    string = safe_utf8_decode(string)
    suggest_dict = {}
    min_suggest_len = float('inf')

    queue = collections.deque([string])
    q_dictionary = {}  # items other than string that we've checked

    while len(queue) != 0:
        q_item = queue.popleft()

        # early exit
        if suggest_mode < ALL_SUGGESTIONS and len(suggest_dict) > 0 and \
                (len(string)-len(q_item)) > min_suggest_len:
            break

        # process queue item
        if q_item in self.dictionary and q_item not in suggest_dict:
            q_item_entry = self.dictionary[q_item]
            if q_item_entry[1] > 0:
                # word is in dictionary, and is a word from the corpus, and
                # not already in suggestion list so add to suggestion
                # dictionary, indexed by the word with value (frequency in
                # corpus, edit distance)
                # note q_items that are not the input string are shorter
                # than input string since only deletes are added (unless
                # manual dictionary corrections are added)
                assert len(string) >= len(q_item)
                suggest_dict[q_item] = (q_item_entry[1], len(string) - len(q_item))
                # early exit
                if suggest_mode < ALL_SUGGESTIONS and len(string) == len(q_item):
                    break
                elif (len(string) - len(q_item)) < min_suggest_len:
                    min_suggest_len = len(string) - len(q_item)

            # the suggested corrections for q_item as stored in
            # dictionary (whether or not q_item itself is a valid word
            # or merely a delete) can be valid corrections
            for sc_item in q_item_entry[0]:
                if sc_item not in suggest_dict:
                    # compute edit distance
                    # suggested items should always be longer
                    # (unless manual corrections are added)
                    assert len(sc_item) > len(q_item)

                    # q_items that are not input should be shorter
                    # than original string
                    # (unless manual corrections added)
                    assert len(q_item) <= len(string)

                    if len(q_item) == len(string):
                        assert q_item == string
                        item_dist = len(sc_item) - len(q_item)

                    # item in suggestions list should not be the same as
                    # the string itself
                    assert sc_item != string
                    # calculate edit distance using, for example,
                    # Damerau-Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)

                    # do not add words with greater edit distance if
                    # suggest_mode setting not ALL_SUGGESTIONS
                    if suggest_mode < ALL_SUGGESTIONS and item_dist > min_suggest_len:
                        pass
                    elif item_dist <= self.max_edit_distance:
                        # should already be in dictionary if in suggestion list
                        assert sc_item in self.dictionary
                        suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                        if item_dist < min_suggest_len:
                            min_suggest_len = item_dist

                    # depending on order words are processed, some words
                    # with different edit distances may be entered into
                    # suggestions; trim suggestion dictionary if suggest_mode
                    # setting not ALL_SUGGESTIONS
                    if suggest_mode < ALL_SUGGESTIONS:
                        suggest_dict = {k: v for k, v in suggest_dict.items()
                                        if v[1] <= min_suggest_len}

        # now generate deletes (e.g. a substring of string or of a delete)
        # from the queue item as additional items to check -- add to end of queue
        assert len(string) >= len(q_item)

        # do not add words with greater edit distance if suggest_mode setting
        # is not ALL_SUGGESTIONS
        if suggest_mode < ALL_SUGGESTIONS and (len(string)-len(q_item)) > min_suggest_len:
            pass
        elif (len(string)-len(q_item)) < self.max_edit_distance and len(q_item) > 1:
            for c in range(len(q_item)):  # character index
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    # arbitrary value, just to identify we checked this
                    q_dictionary[word_minus_c] = None

    # queue is now empty: convert suggestions in dictionary to
    # list for output
    if not self.silent and suggest_mode != BEST_SUGGESTION:
        print("number of possible corrections: %i" % len(suggest_dict))
        print(" edit distance for deletions: %i" % self.max_edit_distance)

    # output option NBEST_SUGGESTIONS
    # sort results by ascending order of edit distance and descending
    # order of frequency
    # and return list of suggested word corrections only:
    # return sorted(suggest_dict, key = lambda x:
    #               (suggest_dict[x][1], -suggest_dict[x][0]))

    # output option ALL_SUGGESTIONS
    # return list of suggestions with (correction,
    # (frequency in corpus, edit distance)):
    as_list = suggest_dict.items()
    outlist = sorted(as_list, key=lambda (term, (freq, dist)): (dist, -freq))

    if suggest_mode == BEST_SUGGESTION:
        return outlist[0][0]
    elif suggest_mode == NBEST_SUGGESTIONS:
        return [x[0] for x in outlist]
    return outlist
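# A minimal end-to-end sketch tying create_dictionary_entry() and
# get_suggestions() together. The owning class and the word list are
# assumptions; the query word and suggest mode are illustrative.
def build_and_query(checker, words):
    for w in words:  # e.g. tokens read from a corpus
        checker.create_dictionary_entry(w)
    return checker.get_suggestions('fil', suggest_mode=NBEST_SUGGESTIONS)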
def make_lexicon(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))

    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None

    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')

        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue

            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)

            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                failed_parse += 1
                raise
                continue

            uid = '%s-%04d' % (idx, i)
            try:
                # dictionary[0-25][stem][set([c]), set(uid)]
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                print(e)
                raise
                continue

    rtdict = {}
    for idx in range(len(dictionary)):
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
        # Free up memory
        dictionary[idx] = None
        d = None

    for rt, cdict in rtdict.iteritems():
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' % safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')
def __unicode__(self): return safe_utf8_decode(self.to_string())