def test_codec():
    for args in LOOKUP_TEST:
        word = args[0]
        try:
            encoded = icepy_encode(word)
        except ValueError:
            print "Could not encode %s" % repr(word)
            continue
        decoded = icepy_decode(encoded)
        assert word == decoded
def expand(self, wordform, category=None, tag=None, single_part=True, allow_morphemes=True):
    """
    Splits the input wordform into segments and returns a list of
    AnalysisMatch objects.

    Each AnalysisMatch object contains a *parts* attribute, which is a list
    of matches for each of the segments. Refer to the AnalysisMatch object
    documentation for information about its properties.

    category and tag can be specified to filter in the same manner as in
    lookup().

    If you specify single_part=False the function will not return or attempt
    to look up the entire word form.

    By default, non-standalone morphemes (category 'm') are included in
    expansions if found. You may specify allow_morphemes=False to disable
    these and only get independent word segments.
    """
    self.expand_count += 1

    if not self._check_input(wordform, category, tag):
        return WordformAnalysis([])

    #special case for nouns, a compound word segment should not be a proper noun
    if tag:
        tag = tag.split('-')[0]

    wordform = wordform.replace('-', '')

    instances = defaultdict(list)
    word_len = len(wordform)
    done = False

    self.timing.start('expand_loop')
    for sublen in range(word_len, 1, -1):
        index = -1
        while index + sublen < word_len:
            index += 1
            word = wordform[index:index+sublen]
            if not single_part and sublen == word_len:
                continue
            if index + sublen == word_len - 1:
                continue
            #make sure last segment matches the specified filter
            if index + sublen == word_len:
                lookup = self._lookup_candidates(word, category, tag)
            else:
                lookup = self._lookup_candidates(word, None, None)
            for match in lookup:
                if not allow_morphemes and match.match_tag[0] == 'm':
                    continue
                instances[index].append(ExpansionPart(index, word, match))
        if done:
            break
    self.timing.end('expand_loop')

    #for index,parts in instances.iteritems():
    #    print index, ''.join([str(p)+' ' for p in parts])

    if not instances:
        return WordformAnalysis([])

    #find all valid matches
    matches = []

    #loop through the "roots" (i.e. instances with index 0)
    if 0 not in instances:
        return WordformAnalysis([])

    self.timing.start('match_traverse')
    for root in instances[0]:
        if root.index > 0:
            break
        candidate = [root]
        #special case of entire word being found on first index
        if root.end == word_len:
            matches.append(ExpansionMatch(candidate))
        #recursively traverse the "tree" of parts to find all sets which add up
        #to the full length of the word
        _traverse_parts(instances, matches, root.end, word_len, candidate, [0])
    self.timing.end('match_traverse')

    if not matches:
        return WordformAnalysis([])

    #filter matches and set attributes for AnalysisMatch objects
    results = []
    for match in matches:
        if not len(match):
            continue
        if not single_part and len(match.parts) < 2:
            continue
        last_tag = match[-1].analysis.match_tag
        if len(match) == 1 and last_tag[0] == 'm':
            continue
        #a compound word should not end with a proper noun. at least that's
        #not a supported case for now.
        if last_tag[0] == 'n' and len(last_tag) > 2 and last_tag[-2] == '-':
            continue

        result_tag = last_tag if last_tag[0] != 'm' else last_tag[3:]

        prefix = ''.join([m.analysis.word for m in match[:-1]])
        lemma = prefix + match[-1].analysis.lemma
        prefix = prefix + match[-1].analysis.prefix
        suffix = icepy_encode(wordform[len(prefix):])

        result = AnalysisMatch(prefix, suffix, 0, lemma,
                               match[-1].analysis.otb_count,
                               match[-1].analysis.suffix_id,
                               result_tag, 'expanded_lookup', match.parts)
        result.tag_count = self.tag_count.get(result_tag, 0)
        result.tag_pattern_count = self.expansion_tag_patterns.get(result.tags(), 0)
        results.append(result)

    return WordformAnalysis(results)
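#Usage sketch (hypothetical): 'analyzer' stands in for an instance of the
#class this method belongs to, the compound wordform is an illustrative
#assumption, and so is iterating the WordformAnalysis directly over its
#matches.
#
#   analysis = analyzer.expand('eldhusbord', single_part=False)
#   for match in analysis:
#       print match.lemma, match.tags(), len(match.parts)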
def inflection_analysis(self, input):
    """
    Generate an InflectionAnalysis object, given an AnalysisMatch object or
    a list/tuple of words.

    The InflectionAnalysis object contains a breakdown of how the given word
    is declined. It includes a stem, a list of stem-variables and
    corresponding lists of tags and inflectional suffixes.
    """
    wordforms = []
    tags = []
    if isinstance(input, AnalysisMatch):
        for suffix, tagset in self.id_suffixes[input.suffix_id].iteritems():
            for tag in tagset:
                wordforms.append(input.prefix + suffix)
                tags.append(tag)
    elif isinstance(input, (list, tuple)):
        for word in input:
            if isinstance(word, str):
                wordforms.append(word)
            elif isinstance(word, unicode):
                wordforms.append(icepy_encode(word))
            else:
                raise ValueError('input list/tuple must only contain strings')
        tags = ['' for w in wordforms]
    else:
        raise ValueError('input object must be AnalysisMatch instance or list/tuple of wordforms')

    wordforms.sort(key=lambda x: len(x))
    base = wordforms[0]

    pattern = r''
    variables = []
    for i in range(len(wordforms)):
        variables.append([])
    running_vars = [None for i in range(len(wordforms))]
    suffixes = [None for i in range(len(wordforms))]
    last_constant = -1

    for x, char in enumerate(base):
        #determine if character at offset X is a constant (that is, appears
        #predictably in all wordforms)
        range_start = last_constant + 1
        range_end = x + 1
        is_constant = True
        for w, word in enumerate(wordforms[1:]):
            index = word[range_start:range_end].find(char)
            #print word, char, range_end, range_start, index
            if index < 0:
                is_constant = False

        if not is_constant:
            for w, word in enumerate(wordforms[1:]):
                if running_vars[w+1] is None:
                    running_vars[w+1] = word[x]
                else:
                    running_vars[w+1] += word[x]
        else:
            for w, word in enumerate(wordforms[1:]):
                index = word[range_start:range_end].find(char)
                suffixes[w+1] = word[range_start+index+1:]

        if is_constant:
            if x > range_start:
                var_set = set()
                for i, v in enumerate(running_vars):
                    variables[i].append(v)
                    if v is not None:
                        var_set.add(v)
                vars = r'|'.join(sorted(var_set, key=lambda x: len(x), reverse=True))
                pattern += r'(%s)' % vars
            pattern += char
            last_constant = x
            running_vars = [None for i in range(len(wordforms))]
        else:
            if running_vars[0] is None:
                running_vars[0] = char
            else:
                running_vars[0] += char

    suffixes[0] = base[last_constant+1:]
    return InflectionAnalysis(pattern, suffixes, variables, tags, wordforms)
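#Usage sketch (hypothetical): 'analyzer' and the wordform list are
#illustrative assumptions; the method accepts either an AnalysisMatch or a
#plain list/tuple of wordforms, as documented above.
#
#   infl = analyzer.inflection_analysis(['hestur', 'hests', 'hesti', 'hestar'])
#   #infl bundles the stem pattern, per-wordform suffixes, stem variables,
#   #tags and the (length-sorted) wordforms, in that constructor order.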
def _lookup_candidates(self, wordform, category=None, tag=None):
    self.lookup_count += 1

    #encode input strings
    try:
        wordform = icepy_encode(wordform.lower())
        if tag:
            tag = icepy_encode(tag.lower())
            category = tag[0]
    except ValueError:
        #print "warning: could not encode word/tag %s/%s" % (repr(wordform),repr(tag))
        return []

    #first check if wordform is in the bloom filter. this is primarily used
    #to increase performance in expansions, as they frequently include a
    #large number of lookups of non-existent words. this may return a
    #false positive but then the prefix lookup will simply return an empty set.
    self.timing.start('bloom_lookup')
    in_bloom = True
    if self.bloom and not self.bloom.InFilter(wordform):
        in_bloom = False
        self.bloom_negatives += 1
    self.timing.end('bloom_lookup')
    if not in_bloom:
        return []

    #look for prefix in prefix map.
    self.timing.start('prefix_lookup')
    prefix = wordform
    candidates = []
    while True:
        if prefix in self.prefix_map:
            #search for the word in the selected prefix's suffix map
            word_suffix = wordform[len(prefix):]
            self.timing.start('lookup_prefix_loop')
            for candidate_id, candidate_suffix_id, wordform_count in self.prefix_map[prefix]:
                suffixmap = self.id_suffixes[candidate_suffix_id]
                tags = suffixmap.get(word_suffix, None)
                if not tags:
                    continue
                lemma, lemma_category, otb_count = self.id_lemma[candidate_id]
                if lemma_category == 'm' and tag and tag[0] != 'm':
                    if tag not in [t[3:] for t in tags]:
                        continue
                    tags = [tags[0][:3] + tag]
                elif tag:
                    if tag not in tags:
                        continue
                    tags = [tag]
                elif category and lemma_category not in (category, 'm'):
                    continue
                candidates.extend([
                    AnalysisMatch(prefix, word_suffix, candidate_id, lemma,
                                  otb_count, candidate_suffix_id,
                                  candidate_tag, 'lookup')
                    for candidate_tag in tags
                ])
            self.timing.end('lookup_prefix_loop')
        if not prefix or len(prefix) < self.stem_minimum_length:
            break
        prefix = prefix[:-1]

    if not candidates:
        self.lookup_negatives += 1
    self.timing.end('prefix_lookup')
    return candidates
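#Shape of the lookup tables traversed above, as implied by the unpacking in
#this method and in _process_raw (only the structure is taken from this
#module; no concrete values are claimed):
#
#   prefix_map:  {prefix: [(lemma_id, suffix_id, wordform_count), ...]}
#   id_suffixes: {suffix_id: {suffix: (tag, ...)}}
#   id_lemma:    {lemma_id: (lemma, category, otb_count)}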
def _process_raw(self):
    suffix_tmp = {}

    #load OTB
    otb = {}
    adverbs = []
    for word, tag, count in corpustools.read_otb():
        otb[word] = count
        #pluck out any adverbs
        if tag[0] == 'a':
            adverbs.append((word, tag, count))

    #load BIN
    lemma_id = 0
    for entries in corpustools.read_bin_grouped(filter=True):
        count = 0
        category = CATEGORY_MAP[entries[0].flokkur]
        lemma = None
        coded_entries = []
        for entry in entries:
            count += otb.get(entry.ordmynd, 0)
            #encode/preprocess entries
            tag = icepy_encode(
                translate_tag(category, entry.flokkur, entry.hluti, entry.greining)
            )
            #add proper noun marker to tag
            if tag[0] == 'n' and entry.lemma[0].isupper() and '-' not in tag:
                if tag[-1] == 'g':
                    tag += 's'
                else:
                    tag += '-s'
            if not lemma:
                lemma = icepy_encode(entry.lemma.lower())
            word = icepy_encode(entry.ordmynd.lower())
            self.tag_count[tag] += 1
            coded_entries.append((word, tag))
        lemma_id += 1
        self.id_lemma[lemma_id] = (lemma, category, count)
        self._prefix_fill(lemma_id, coded_entries, suffix_tmp)

    #inject morphemes
    for lemma, entries in corpustools.read_morphemes_grouped():
        count = 0  #currently no count info available for morphemes
        category = 'm'
        lemma = icepy_encode(lemma)
        entries = [icepy_encode(e) for e in entries]
        for word, tag in entries:
            self.tag_count[tag] += 1
        lemma_id += 1
        self.id_lemma[lemma_id] = (lemma, category, count)
        self._prefix_fill(lemma_id, entries, suffix_tmp)

    #inject adverb tags from OTB
    for word, tag, count in adverbs:
        tag = icepy_encode(tag)
        frozenmap = (('', (tag,)),)
        self.tag_count[tag] += 1
        if frozenmap in suffix_tmp:
            suffix_id = suffix_tmp[frozenmap]
        else:
            suffix_id = len(suffix_tmp)
            suffix_tmp[frozenmap] = suffix_id

    #reverse suffix and tag maps
    for suffixes, suffix_id in suffix_tmp.iteritems():
        self.id_suffixes[suffix_id] = dict(suffixes)

    #inject adverbs from OTB, if they are not already in the maps
    for word, tag, count in adverbs:
        if not self._lookup_candidates(word, tag=tag):
            word = icepy_encode(word)
            lemma_id += 1
            self.id_lemma[lemma_id] = (word, 'a', count)
            frozenmap = (('', (icepy_encode(tag),)),)
            suffix_id = suffix_tmp[frozenmap]
            self.prefix_map[word].append((lemma_id, suffix_id, 1))

    #generate bloom filter
    self._generate_bloom()
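#Sketch of the suffix_tmp keying convention used above: a hashable
#tuple-of-pairs acts as the dict key, and dict(frozenmap) recovers the
#suffix -> tags mapping that ends up in id_suffixes. The suffix and tag
#strings below are illustrative placeholders, not real encoded values from
#this module.
#
#   frozenmap = (('', ('TAG_A',)), ('ur', ('TAG_B',)))
#   dict(frozenmap)    #-> {'': ('TAG_A',), 'ur': ('TAG_B',)}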