def lemma_deduplicate(matches):
    """Deduplicate analysis matches by lemma.

    Accepts an iterable of AnalysisMatch objects and returns a list
    containing one match per distinct ``lemma_id``.  When several matches
    share a lemma, the LAST one seen wins (later entries overwrite earlier
    ones), while the output order follows first appearance of each lemma.

    :param matches: iterable of objects exposing a ``lemma_id`` attribute
    :return: list of deduplicated match objects
    """
    output = OrderedDict()
    for m in matches:
        # Re-assigning an existing key keeps its original position in the
        # OrderedDict but replaces the stored match object.
        output[m.lemma_id] = m
    # Materialize as a list: dict.values() is a view object on Python 3,
    # which would violate the documented "returns a list" contract.
    return list(output.values())
def _prefix_fill(self, lemma_id, entries, suffix_tmp): if len(entries) == 0: return first_word = entries[0] prefix = first_word[0] okay = False while len(prefix) >= self.stem_minimum_length and not okay: okay = True if len(prefix)==0: break for wordform in entries: if not wordform[0].startswith(prefix): okay = False break if not okay: prefix = prefix[:-1] if okay: suffixes = [e[0][len(prefix):] for e in entries] tags = [tag for word,tag in entries] formcount = len(tags) suffixmap = OrderedDict.fromkeys( sorted(list(set(suffixes))) ) for key in suffixmap.iterkeys(): suffixmap[key] = [] for suffix,tag in zip(suffixes,tags): suffixmap[suffix].append(tag) for key,val in suffixmap.iteritems(): suffixmap[key] = tuple(val) frozenmap = tuple(suffixmap.items()) if frozenmap in suffix_tmp: suffix_id = suffix_tmp[frozenmap] else: suffix_id = len(suffix_tmp) suffix_tmp[frozenmap] = suffix_id self.prefix_map[prefix].append( (lemma_id, suffix_id, formcount) ) return prefix else: for word,tag in set(entries): frozenmap = (('', (tag,)),) if frozenmap in suffix_tmp: suffix_id = suffix_tmp[frozenmap] else: suffix_id = len(suffix_tmp) suffix_tmp[frozenmap] = suffix_id self.prefix_map[word].append( (lemma_id, suffix_id, 1) )