Example #1
    def __init__(self, path_prefix=None):
        """
        Initialise a Morph class. When first executed/deployed, the initialiser
        will construct several maps from the raw data included in the
        distribution. The process will take several minutes, but once it has
        been done, the class will load the generated maps from the cache files
        it writes out after processing the raw data.

        By default the data is stored in the data/ subfolder of the distribution
        but you may specify a path_prefix to have the class look for data files
        elsewhere.

        Also note that, while the in-memory maps are quite efficient, they still
        consume a noticeable amount of RAM (approx. 150-200MB), so using more
        than one Morph instance in a single application is to be avoided.

        """
        logging.debug("Starting the IcePy morphological analyser...")

        #data maps
        self.prefix_map  = defaultdict(list)
        self.tag_count   = defaultdict(int)
        self.id_lemma    = {}
        self.id_suffixes = {}
        self.bloom       = None

        #statistics / counters
        self.timing      = Timing()
        self.nofix_count = 0
        self.lookup_count = 0
        self.expand_count = 0
        self.bloom_negatives = 0
        self.lookup_negatives = 0

        self.stem_minimum_length = 0 #setting this to anything higher than 0 breaks the get_wordforms functionality
        
        if not path_prefix:
            path_prefix = os.path.join(BASE_PATH,"data/")

        self.maps = [
            ('prefixmap.cache', self.prefix_map),
            ('lemmamap.cache' , self.id_lemma),
            ('suffixes.cache' , self.id_suffixes),
            ('tagcount.cache' , self.tag_count),
            ('bloom.cache'    , 'bloom')
        ]
        for x, item in enumerate(self.maps):
            self.maps[x] = (path_prefix+item[0],item[1])

        #static maps
        sys.stdout.flush()
        self.expansion_tag_patterns = {}
        self.static_maps = [
            ('expansion_tag_patterns', self.expansion_tag_patterns)
        ]
        self._load_static(path_prefix)

        #if all cache files are available, try to load
        use_cache = True
        for filename in (v[0] for v in self.maps):
            if not os.path.exists(filename):
                use_cache = False
                break

        if use_cache:
            logging.debug("-> loading from cache")

            try:
                self._load_cache()
            except KeyboardInterrupt:
                raise
            except:
                logging.warning("-> loading from cache failed!")
                traceback.print_exc()
            else:
                return

        #else load raw data from BIN file and write new cache
        logging.warning("IcePy Morph: Performing initial processing of BIN corpus (this will take several minutes but \
                        only needs to be done once)")
        self._process_raw()
        self._write_cache()
        logging.debug("-> completed building Morph dictionary")
Example #2
class Morph(object):
    def __init__(self, path_prefix=None):
        """
        Initialise a Morph class. When first executed/deployed, the initialiser
        will construct several maps from the raw data included in the
        distribution. The process will take several minutes, but once it has
        been done, the class will load the generated maps from the cache files
        it writes out after processing the raw data.

        By default the data is stored in the data/ subfolder of the distribution
        but you may specify a path_prefix to have the class look for data files
        elsewhere.

        Also note that, while the in-memory maps are quite efficient, they still
        consume a noticeable amount of RAM (approx. 150-200MB), so using more
        than one Morph instance in a single application is to be avoided.

        """
        logging.debug("Starting the IcePy morphological analyser...")

        #data maps
        self.prefix_map  = defaultdict(list)
        self.tag_count   = defaultdict(int)
        self.id_lemma    = {}
        self.id_suffixes = {}
        self.bloom       = None

        #statistics / counters
        self.timing      = Timing()
        self.nofix_count = 0
        self.lookup_count = 0
        self.expand_count = 0
        self.bloom_negatives = 0
        self.lookup_negatives = 0

        self.stem_minimum_length = 0 #setting this to anything higher than 0 breaks the get_wordforms functionality
        
        if not path_prefix:
            path_prefix = os.path.join(BASE_PATH,"data/")

        self.maps = [
            ('prefixmap.cache', self.prefix_map),
            ('lemmamap.cache' , self.id_lemma),
            ('suffixes.cache' , self.id_suffixes),
            ('tagcount.cache' , self.tag_count),
            ('bloom.cache'    , 'bloom')
        ]
        for x, item in enumerate(self.maps):
            self.maps[x] = (path_prefix+item[0],item[1])

        #static maps
        sys.stdout.flush()
        self.expansion_tag_patterns = {}
        self.static_maps = [
            ('expansion_tag_patterns', self.expansion_tag_patterns)
        ]
        self._load_static(path_prefix)

        #if all cache files are available, try to load
        use_cache = True
        for filename in (v[0] for v in self.maps):
            if not os.path.exists(filename):
                use_cache = False
                break

        if use_cache:
            logging.debug("-> loading from cache")

            try:
                self._load_cache()
            except KeyboardInterrupt:
                raise
            except:
                logging.warning("-> loading from cache failed!")
                traceback.print_exc()
            else:
                return

        #else load raw data from BIN file and write new cache
        logging.warning("IcePy Morph: Performing initial processing of BIN corpus (this will take several minutes but \
                        only needs to be done once)")
        self._process_raw()
        self._write_cache()
        logging.debug("-> completed building Morph dictionary")

    def _process_raw(self):
        suffix_tmp = {}

        #load OTB
        otb = {}
        adverbs = []
        for word, tag, count in corpustools.read_otb():
            otb[word] = count
            #pluck out any adverbs
            if tag[0]=='a': adverbs.append((word,tag,count))

        #load BIN
        lemma_id = 0
        for entries in corpustools.read_bin_grouped(filter=True):
            count = 0
            category = CATEGORY_MAP[entries[0].flokkur]
            lemma = None

            coded_entries = []
            for entry in entries:
                count   += otb.get(entry.ordmynd, 0)

                #encode/preprocess entries
                tag   = icepy_encode(
                            translate_tag(category,entry.flokkur,entry.hluti,entry.greining)
                        )
                #add proper noun marker to tag
                if tag[0]=='n' and entry.lemma[0].isupper() and '-' not in tag:
                    if tag[-1]=='g':
                        tag += 's'
                    else:
                        tag += '-s'

                if not lemma: lemma = icepy_encode(entry.lemma.lower())
                word  = icepy_encode(entry.ordmynd.lower())
                
                self.tag_count[tag] += 1
                coded_entries.append((word,tag))

            lemma_id += 1

            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id,coded_entries,suffix_tmp)

        #inject morphemes
        for lemma,entries in corpustools.read_morphemes_grouped():
            count = 0 #currently no count info available for morphemes
            category = 'm'
            lemma = icepy_encode(lemma)
            entries = [icepy_encode(e) for e in entries]

            for word,tag in entries:
                self.tag_count[tag] += 1

            lemma_id += 1
            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id,entries,suffix_tmp)

        #inject adverb tags from OTB
        for word,tag,count in adverbs:
            tag = icepy_encode(tag)
            frozenmap = (('', (tag,)),)
            self.tag_count[tag] += 1    
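            #re-use an existing suffix_id when this (suffix -> tags) layout has been seen before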
            if frozenmap in suffix_tmp:
                suffix_id = suffix_tmp[frozenmap]
            else:
                suffix_id = len(suffix_tmp)
                suffix_tmp[frozenmap] = suffix_id

        #reverse suffix and tag maps
        for suffixes,suffix_id in suffix_tmp.iteritems():
            self.id_suffixes[suffix_id] = dict(suffixes)

        #inject adverbs from OTB, if they are not already in the maps
        for word,tag,count in adverbs:
            if not self._lookup_candidates(word,tag=tag):
                word = icepy_encode(word)
                lemma_id += 1
                self.id_lemma[lemma_id] = (word, 'a', count)

                frozenmap = (('', (icepy_encode(tag),)),)
                suffix_id = suffix_tmp[frozenmap]

                self.prefix_map[word].append( (lemma_id, suffix_id, 1) )

        #generate bloom filter
        self._generate_bloom()


    def _prefix_fill(self, lemma_id, entries, suffix_tmp):
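        #find the longest prefix shared by every word form of this lexeme, register
        #the (suffix -> tags) pattern under a shared suffix_id, and map the prefix
        #to (lemma_id, suffix_id, form count); if no usable common prefix is found,
        #each word form is stored whole in the else branch below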
        if len(entries) == 0: return
        first_word = entries[0]

        prefix = first_word[0]

        okay = False
        while len(prefix) >= self.stem_minimum_length and not okay:
            okay = True
            if len(prefix)==0:
                break
            for wordform in entries:
                if not wordform[0].startswith(prefix):
                    okay = False
                    break
            if not okay: prefix = prefix[:-1]

        if okay:
            suffixes = [e[0][len(prefix):] for e in entries]
            tags = [tag for word,tag in entries]

            formcount = len(tags)
            suffixmap = OrderedDict.fromkeys( sorted(list(set(suffixes))) )

            for key in suffixmap.iterkeys():
                suffixmap[key] = []

            for suffix,tag in zip(suffixes,tags):
                suffixmap[suffix].append(tag)

            for key,val in suffixmap.iteritems():
                suffixmap[key] = tuple(val)

            frozenmap = tuple(suffixmap.items())
            if frozenmap in suffix_tmp:
                suffix_id = suffix_tmp[frozenmap]
            else:
                suffix_id = len(suffix_tmp)
                suffix_tmp[frozenmap] = suffix_id

            self.prefix_map[prefix].append( (lemma_id, suffix_id, formcount) )
            return prefix

        else:
            for word,tag in set(entries):
                frozenmap = (('', (tag,)),)
                if frozenmap in suffix_tmp:
                    suffix_id = suffix_tmp[frozenmap]
                else:
                    suffix_id = len(suffix_tmp)
                    suffix_tmp[frozenmap] = suffix_id

                self.prefix_map[word].append( (lemma_id, suffix_id, 1) )

    def _count_wordforms(self):
        return sum((p[2] for prefix in self.prefix_map.itervalues() for p in prefix))

    def _delete_cache(self):
        for filename,obj in self.maps:
            try: os.remove(filename)
            except: pass

    def _write_cache(self):
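        #map entries whose target is a string (e.g. 'bloom') name an attribute that
        #is only bound after construction, so it is resolved by name at dump time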
        self._delete_cache()
        for filename,obj in self.maps:
            outfile = open(filename, 'wb')
            if isinstance(obj, str):
                obj = getattr(self, obj)
            pickle.dump(obj, outfile, PICKLE_PROTOCOL)
            outfile.close()

    def _load_cache(self):
        sys.stdout.flush()
        for filename,obj in self.maps:
            outfile = open(filename,'rb')
            if isinstance(obj, dict):
                cache = pickle.load(outfile)
                obj.update(cache)
            elif isinstance(obj, str):
                setattr(self, obj, pickle.load(outfile))
            else:
                obj = pickle.load(outfile)
            outfile.close()
            print '.',
            sys.stdout.flush()

    def _load_static(self, path_prefix):
        for filename,obj in self.static_maps:
            infile = open(path_prefix+filename+'.static','rb')
            cache = pickle.load(infile)
            obj.update(cache)
            infile.close()

    def _generate_bloom(self):
        print "-> preparing bloom filter"

        #initialize bloom filter
        filter_m = self._count_wordforms() * 10 #using 10 bits per wordform
        filter_k = 4
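        #with roughly 10 bits per element and 4 hashes the theoretical false
        #positive rate is on the order of 1-2% (assuming independent hash functions)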
        self.bloom = BloomFilter(filter_m,filter_k)

        #load wordforms into filter
        for word,tag in self.iterate_words(show_status=True, decode=False):
            self.bloom.Insert(word)

    def print_stats(self):
        """ Prints a few statistics about the data in the class along with some
        usage data for the instance.
        """
        stats = {
            'prefixes in maps':     len(self.prefix_map),
            'distinct suffix sets': len(self.id_suffixes),
            'lemmas in maps':       len(self.id_lemma),
            'bloom negatives':      self.bloom_negatives,
            'lookup negatives':     self.lookup_negatives,
            'expansions performed': self.expand_count,
            'lookups performed':    self.lookup_count,
            'total wordforms in maps': self._count_wordforms()
        }
        for id, stat in sorted(stats.items()):
            print "%s\t%s" % (id.ljust(30), stat)

    def iterate_words(self, show_status=False, decode=True):
        """ Returns an iterator over all the word forms in the maps.

        Set show_status=True to have the generator print a progress counter.

        By default the output is a decoded unicode string. Set decode=False if
        you need the IcePy encoded string.
        """
        i = 0
        total = self._count_wordforms()
        for prefix,idlist in self.prefix_map.iteritems():
            for lemma_id, suffix_id, wordform_count in idlist:
                for suffix,tags in self.id_suffixes[suffix_id].iteritems():
                    for tag in tags:
                        i += 1
                        if show_status and i%1000==0:
                            print '%d / %d\r' % (i, total),
                            sys.stdout.flush()
                        if decode:
                            yield icepy_decode(prefix+suffix), icepy_decode(tag)
                        else:
                            yield prefix+suffix, tag

        if show_status: print '%d / %d' % (i, total)

    def iterate_words_grouped(self, show_status=False, decode=True):
        """ Returns an iterator over all the word forms in the maps, grouped by
        lemma. That is, yields a list for every lexeme in the maps.

        Accepts the same options as iterate_words().
        """
        #TODO: refactor this and iterate_words to use more of the same code
        i = 0
        total = len(self.id_lemma)
        for prefix,idlist in self.prefix_map.iteritems():
            for lemma_id, suffix_id, wordform_count in idlist:
                group = []
                for suffix,tags in self.id_suffixes[suffix_id].iteritems():
                    for tag in tags:
                        if decode:
                            try:
                                group.append((icepy_decode(prefix+suffix),icepy_decode(tag)))
                            except KeyError:
                                print "Could not decode word/tag %s/%s!" % (repr(prefix+suffix),repr(tag))
                                raise
                        else:
                            group.append((prefix+suffix,tag))
                i += 1
                if show_status:
                    print '%d / %d\r' % (i, total),
                    sys.stdout.flush()
                yield group

        if show_status: print '%d / %d' % (i, total)

    def _check_input(self, string, category=None, tag=None):
        if tag: category = tag[0]
        return category not in BLACKTAGS and ALPHA_RE.match(string)

    def _lookup_candidates(self, wordform, category=None, tag=None):
        self.lookup_count += 1

        #encode input strings
        try:
            wordform = icepy_encode(wordform.lower())
            if tag:
                tag = icepy_encode(tag.lower())
                category = tag[0]
        except ValueError:
            #print "warning: could not encode word/tag %s/%s" % (repr(wordform),repr(tag))
            return []

        #first check if wordform is in the bloom filter. this is primarily used
        #to increase performance in expansions, as they frequently include a
        #large number of lookups of non-existent words. this may return a
        #false positive but then the prefix lookup will simply return an empty set.
        self.timing.start('bloom_lookup')
        in_bloom = True
        if self.bloom and not self.bloom.InFilter(wordform):
            in_bloom = False
            self.bloom_negatives += 1
        self.timing.end('bloom_lookup')

        if not in_bloom: return []

        #look for prefix in prefix map.
        self.timing.start('prefix_lookup')
        prefix = wordform
        candidates = []
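        #try the full word form first, then successively shorter prefixes; each
        #prefix entry points at lexemes whose suffix map may contain the remainder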
        while True:
            if prefix in self.prefix_map:
                #search for the word in the selected prefixes suffix map
                word_suffix = wordform[len(prefix):]

                self.timing.start('lookup_prefix_loop')
                for candidate_id,candidate_suffix_id,wordform_count in self.prefix_map[prefix]:
                    suffixmap = self.id_suffixes[candidate_suffix_id]

                    tags = suffixmap.get(word_suffix, None)
                    if not tags: continue

                    lemma, lemma_category, otb_count = self.id_lemma[candidate_id]

                    if lemma_category=='m' and tag and tag[0]!='m':
                        if tag not in [t[3:] for t in tags]: continue
                        tags = [tags[0][:3] + tag]
                    elif tag:
                        if tag not in tags: continue
                        tags = [tag]
                    elif category and lemma_category not in (category,'m'):
                        continue

                    candidates.extend([
                        AnalysisMatch(prefix, word_suffix, candidate_id, lemma,
                        otb_count, candidate_suffix_id, candidate_tag, 'lookup')
                        for candidate_tag in tags
                    ])
                self.timing.end('lookup_prefix_loop')
                        
            if not prefix or len(prefix) < self.stem_minimum_length: break
            prefix = prefix[:-1]

        if not candidates:
            self.lookup_negatives += 1

        self.timing.end('prefix_lookup')
        return candidates

    def lookup(self, wordform, category=None, tag=None):
        """ Performs a lookup of the specified word form and returns a list of
        AnalysisMatch objects.

        Specify category as a string of length 1 to filter by word category. For
        example, set category='n' to filter by nouns.

        Specify tag as a string to filter by specific POS tag, as defined in the
        tagset specification used for IceNLP. Note that the tags returned by the
        IceNLP tagger may not always match 100% with those derived from the BÍN
        analysis markup, due to some differences in approach between the two
        projects.
        """
        if not self._check_input(wordform,category,tag): return WordformAnalysis([])

        candidates = self._lookup_candidates(wordform, category, tag)

        #don't return bound morphemes in direct lookup
        candidates = [c for c in candidates if not c.match_tag.startswith(('mb','md'))]
        
        #add tag count information to results
        for c in candidates:
            c.tag_count = self.tag_count.get(c.match_tag)

        return WordformAnalysis(candidates)

    def inflection_analysis(self, input):
        """ Generate a InflectionAnalysis object, given an AnalysisMatch object
        or a list/tuple of words.

        The InflectionAnalysis object contains a breakdown of how the given word
        is declined. It includes a stem, a list of stem-variables
        and corresponding lists of tags and inflectional suffixes.
        """
        wordforms = []
        tags = []
        if isinstance(input, AnalysisMatch):
            for suffix,tagset in self.id_suffixes[input.suffix_id].iteritems():
                for tag in tagset:
                    wordforms.append(input.prefix+suffix)
                    tags.append(tag)
        elif isinstance(input, (list,tuple)):
            for word in input:
                if isinstance(word, str):
                    wordforms.append(word)
                elif isinstance(word, unicode):
                    wordforms.append(icepy_encode(word))
                else:
                    raise ValueError('input list/tuple must only contain strings')
            tags = ['' for w in wordforms]
        else:
            raise ValueError('input object must be AnalysisMatch instance or list/tuple of wordforms')

        wordforms.sort(key=lambda x: len(x))

        base = wordforms[0]
        pattern = r''
        
        variables = []
        for i in range(len(wordforms)): variables.append([])

        running_vars   = [None for i in range(len(wordforms))]
        suffixes       = [None for i in range(len(wordforms))]
        last_constant  = -1
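        #walk the shortest word form character by character: characters that can be
        #aligned in every word form become literal stem characters, and the unaligned
        #runs in between are collected as per-word stem variables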
        for x,char in enumerate(base):
            #determine if character at offset X is a constant (that is, appears
            #predictably in all wordforms)
            range_start = last_constant + 1
            range_end   = x + 1
            is_constant = True

            for w,word in enumerate(wordforms[1:]):
                index = word[range_start:range_end].find(char)
                #print word, char, range_end, range_start, index
                if index < 0:
                    is_constant = False

            if not is_constant:
                for w,word in enumerate(wordforms[1:]):
                    if running_vars[w+1] is None:
                        running_vars[w+1] = word[x]
                    else:
                        running_vars[w+1] += word[x]
            else:
                for w,word in enumerate(wordforms[1:]):
                    index = word[range_start:range_end].find(char)
                    suffixes[w+1] = word[range_start+index+1:]

            if is_constant:
                if x > range_start:
                    var_set = set()
                    for i,v in enumerate(running_vars):
                        variables[i].append(v)
                        if v is not None: var_set.add(v)
                    vars = r'|'.join( sorted(var_set,key=lambda x: len(x),reverse=True) )
                    pattern += r'(%s)' % vars
                pattern += char
                last_constant = x
                running_vars = [None for i in range(len(wordforms))]
            else:
                if running_vars[0] is None:
                    running_vars[0] = char
                else:
                    running_vars[0] += char

        suffixes[0] = base[last_constant+1:]

        return InflectionAnalysis(pattern, suffixes, variables, tags, wordforms)

    def analyse(self, wordform, category=None, tag=None, force_expand=False):
        """
        Uses the available methods (currently lookup() and expand()) to
        analyse the given word form. category and tag can be specified in the
        same manner as in lookup() to narrow the results. The output is a
        WordformAnalysis object.

        By default the function will stop and return the results if a direct
        lookup yields one or more matches, and will only attempt an expanded
        lookup if nothing is found by direct lookup. You can specify
        force_expand=True to override this.

        Note that the AnalysisMatch object's *type* attribute indicates which
        type of lookup returned the match.
        """
        if not self._check_input(wordform,category,tag): return WordformAnalysis([])

        results = []

        wordform = wordform.lower()
        if wordform.endswith(('-',"'")): wordform = wordform[:-1]

        #first do a direct lookup
        results.extend(self._lookup_candidates(wordform, category, tag))

        #if no results are found or force_expand is set, try expanded lookup
        if not results or force_expand:
            expansion = self.expand(wordform, category, tag)
            #not including direct lookups (single part in expansion)
            results.extend([e for e in expansion.matches if len(e.parts)>1])

        return WordformAnalysis(results)

    def expand(self, wordform, category=None, tag=None, single_part=True,
                     allow_morphemes=True):
        """
        Splits the input wordform into segments and returns a list of
        AnalysisMatch objects. Each AnalysisMatch object contains a *parts*
        attribute, which is a list of matches for each of the segments. Refer to
        the AnalysisMatch object documentation for information about its
        properties.

        category and tag can be specified to filter in the same manner as in
        lookup().

        If you specify single_part=False the function will not return or attempt
        to look up the entire word form.

        By default, non-standalone morphemes (category 'm') are included in
        expansions if found. You may specify allow_morphemes=False to disable
        these, to only get independent word segments.
        """
        self.expand_count += 1

        if not self._check_input(wordform,category,tag): return WordformAnalysis([])

        #special case for nouns, a compound word segment should not be a proper noun
        if tag: tag = tag.split('-')[0]

        wordform = wordform.replace('-','')

        instances   = defaultdict(list)
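        #maps start offset within the word form -> ExpansionParts found at that offset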

        word_len = len(wordform)
        done = False
        self.timing.start('expand_loop')
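        #slide a window of decreasing length over the word form and look up every
        #substring; positions that would leave a single trailing character are skipped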
        for sublen in range(word_len,1,-1):
            index = -1
            while index+sublen < word_len:
                index += 1
                word = wordform[index:index+sublen]

                if not single_part and sublen==word_len: continue
                if index+sublen==word_len-1: continue

                #make sure last segment matches the specified filter
                if index+sublen==word_len:
                    lookup = self._lookup_candidates(word, category, tag)
                else:
                    lookup = self._lookup_candidates(word, None, None)

                for match in lookup:
                    if not allow_morphemes and match.match_tag[0]=='m': continue
                    instances[index].append(ExpansionPart(index,word,match))

            if done: break
        self.timing.end('expand_loop')

        #for index,parts in instances.iteritems():
        #    print index, ''.join([str(p)+' ' for p in parts])
        if not instances: return WordformAnalysis([])
        
        """find all valid matches"""
        matches = []
        #loop through the "roots" (i.e. instances with index 0)
        if 0 not in instances: return WordformAnalysis([])
        self.timing.start('match_traverse')
        for root in instances[0]:
            if root.index > 0: break

            candidate = [root]
            #special case of entire word being found on first index
            if root.end == word_len:
                matches.append(ExpansionMatch(candidate))

            #recursively traverse the "tree" of parts to find all sets which add up
            #to the full length of the word
            _traverse_parts(instances, matches, root.end, word_len, candidate, [0])
        self.timing.end('match_traverse')

        if not matches: return WordformAnalysis([])

        #filter matches and set attributes for AnalysisMatch objects
        results = []
        for match in matches:
            if not len(match): continue
            if not single_part and len(match.parts) < 2: continue

            last_tag = match[-1].analysis.match_tag
            if len(match)==1 and last_tag[0]=='m': continue
            
            #a compound word should not end with a proper noun. at least that's
            #not a supported case for now.
            if last_tag[0]=='n' and len(last_tag)>2 and last_tag[-2]=='-': continue

            result_tag = last_tag if last_tag[0]!='m' else last_tag[3:]

            prefix = ''.join([m.analysis.word for m in match[:-1]])
            lemma  = prefix + match[-1].analysis.lemma
            prefix = prefix + match[-1].analysis.prefix
            suffix = icepy_encode(wordform[len(prefix):])

            result = AnalysisMatch(prefix, suffix, 0, lemma, match[-1].analysis.otb_count,
                               match[-1].analysis.suffix_id, result_tag,
                               'expanded_lookup', match.parts)

            result.tag_count = self.tag_count.get(result_tag,0)
            result.tag_pattern_count = self.expansion_tag_patterns.get(result.tags(),0)

            results.append(result)

        return WordformAnalysis(results)

    def wordforms(self, match):
        """ Return list of all wordforms for an AnalysisMatch object's lexeme.
        """
        if not isinstance(match, AnalysisMatch):
            raise ValueError('input object must be AnalysisMatch instance')

        return [icepy_decode(match.prefix+suffix) for suffix in self.id_suffixes[match.suffix_id]]

    def inflections(self, match):
        """ Return list of all inflections for an AnalysisMatch object's lexeme.
        """
        if not isinstance(match, AnalysisMatch):
            raise ValueError('input object must be AnalysisMatch instance')

        results = []
        for suffix,tags in self.id_suffixes[match.suffix_id].iteritems():
            for tag in tags:
                results.append((icepy_decode(match.prefix+suffix),icepy_decode(tag)))
        return results

    def lemmatise(self, wordform, category=None, tag=None, return_original=True):
        """ Return a tuple (lemma,tag) with the lemma of the input wordform,
        along with the assumed tag.

        Basically this function does the same as analyse() but attempts to
        select one best match and return the lemma string for that.

        By default the input is returned back (in a tuple) if nothing is found.
        If return_original=False is set, None is returned instead.

        """
        if self._check_input(wordform,category,tag):
            #first try to lookup the whole word directly
            candidates = self.lookup(wordform,category,tag)
            if candidates:
                #TODO: (vastly) improve selection process
                m = candidates.top_pick()
                return icepy_decode((m.lemma,m.match_tag))

            #else try expanding the word
            candidates = self.expand(wordform,category)
            if candidates:
                #TODO: (vastly) improve selection process
                #prefer matches with fewer parts
                m = candidates.top_pick()
                return icepy_decode((m.lemma,m.match_tag))

        #else return original or none
        if return_original: return (wordform,tag or category or None)
        return None

    def group_lexemes(self, chunks):
        """ Accepts a list of lists of (word,tag) tuples (chunks) and groups the
        chunks together by lexeme. Returns a mapping of
        (lemmatised words) => [tuples]

        Works by lemmatising each of the words and grouping together those chunks
        which have all the same lemmas for each of their words. A bit crude, but
        works reasonably well in many cases.

        """
        map = defaultdict(list)
        for chunk in chunks:
            analysed = [self.analyse(word, tag=tag) for word,tag in chunk]
            lemmas = []
            
            for a in analysed:
                match = a.top_pick()
                if not match:
                    lemmas = []
                    break

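                #direct-lookup matches carry a real lemma_id; expanded matches are
                #created with lemma_id 0, so fall back to the lemma string for those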
                if match.lemma_id:
                    lemmas.append(match.lemma_id)
                else:
                    lemmas.append(match.lemma)

            if lemmas:
                lemma_tuple = tuple(lemmas)
            else:
                lemma_tuple = tuple([word for word,tag in chunk])

            map[lemma_tuple].append(chunk)

        return dict(map)

    def group_lexemes_count(self, chunks):

        map = defaultdict(list)
        for chunk in chunks:
            analysed = [self.analyse(word, category=tag[0]) for word,tag in chunk]

            lemmas = []
            for a in analysed:
                match = a.top_pick()
                if not match:
                    lemmas = []
                    break

                lemmas.append(match.lemma)

            if lemmas:
                lemma_strings = tuple(lemmas)
            else:
                lemma_strings = tuple([word for word,tag in chunk])

            lemma_list = []
            for i in range(len(chunk)):
                lemma_list.append((lemma_strings[i],chunk[i][1][-1]))

            lemma_tuple = tuple(lemma_list)
            map[lemma_tuple].append(chunk)

        return dict(map)

    def unambiguous_expansions(self, category=None, tag=None, silent=False):
        """ Runs through all of the word forms in the database, expands them
        and gathers those that return only one expansion candidate.

        This is useful for making training data and gathering statistics, but be
        warned: It will take many hours to complete!

        Specify category or tag to filter the search (which should make the
        process considerably faster).

        By default the function prints every unambiguous expansion it finds but
        you may set silent=True to avoid this.

        """
        print """Computing all unambiguous word expansions in BÍN.
        Warning: This will take many hours!"""
        
        expansions = []
        #iterate all wordforms
        for words in self.iterate_words_grouped(True):
            for wordform,wordtag in words:
                if tag and tag!=wordtag: continue
                elif category and category!=wordtag[0]: continue
                #expand the wordform
                expansion = self.expand(wordform,tag=wordtag,single_part=False)
                #add to expansion list if it has only one expansion
                if len(expansion)==1:
                    expansions.append(expansion.matches[0])
                    if not silent: print expansion.matches[0]

        if not silent: print "Done!"
        return expansions
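Taken together, lookup(), analyse(), expand() and lemmatise() form the public analysis API of the class above. A hedged usage sketch (the Icelandic example words are illustrative; the .matches, .lemma, .match_tag and .top_pick() names are taken from the listing, everything else is an assumption):

    morph = Morph()

    #direct lookup, optionally filtered by category ('n' = noun) or by a full POS tag
    analysis = morph.lookup(u'hestur', category='n')
    for match in analysis.matches:
        print match.lemma, match.match_tag

    #compound-word expansion and combined analysis (direct lookup first, expansion as fallback)
    expansion = morph.expand(u'hestvagn')
    combined  = morph.analyse(u'hestvagn')
    best      = combined.top_pick()

    #single best (lemma, tag) guess; falls back to the input when nothing is found
    lemma, tag = morph.lemmatise(u'hestinum')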