Example #1
0
def bin_debug_hk():
    """ A short routine created to detect a strange bug in the BÍN corpus where
    certain types of compound words ('hk', 'alm') were incorrectly entered.

    This bug has apparently been fixed but the code may be useful as a reference.
    """
    for entries in read_bin_grouped(silent=True):
        first_entry = entries[0]

        #handle strange bug in BIN, where certain compound words are incorrectly specified
        if first_entry.flokkur=='hk' and first_entry.hluti=='alm':
            proper = [entry for entry in entries if entry.ordmynd[0]==first_entry.ordmynd[0]
                      and not (len(entry.lemma) > 3 and len(entry.ordmynd)<=len(entry.lemma)-2)]
            if len(entries) - len(proper) == 4:
                print first_entry.lemma_id, first_entry.lemma
                continue
Example #2
0
    def _process_raw(self):
        """Populate the lookup structures from the raw OTB, BIN and morpheme data.

        The work is done in this order:

        1. Read OTB, keeping a count per word and setting aside every
           (word, tag, count) whose tag starts with 'a' (the adverbs).
        2. Read the grouped BIN entries. Each group becomes one lemma with a
           fresh id; its entries are encoded into (word, tag) pairs and handed
           to self._prefix_fill together with the shared suffix table.
        3. Read the grouped morphemes and register them the same way, with
           category 'm' and a count of 0.
        4. Make sure the suffix table has an entry for every OTB adverb tag.
        5. Invert the suffix table into self.id_suffixes.
        6. Add each OTB adverb that self._lookup_candidates does not find.
        7. Generate the bloom filter.

        Updates self.tag_count, self.id_lemma, self.id_suffixes and
        self.prefix_map along the way.
        """
        # frozen suffix map -> suffix id; shared with _prefix_fill
        suffix_ids = {}

        # -- OTB: word counts, plus adverbs for later injection
        otb_counts = {}
        otb_adverbs = []
        for otb_word, otb_tag, otb_count in corpustools.read_otb():
            otb_counts[otb_word] = otb_count
            if otb_tag[0] == 'a':
                otb_adverbs.append((otb_word, otb_tag, otb_count))

        # -- BIN: one lemma per group of entries
        last_id = 0
        for group in corpustools.read_bin_grouped(filter=True):
            total = 0
            category = CATEGORY_MAP[group[0].flokkur]
            lemma = None
            encoded = []

            for item in group:
                # the lemma's count is the sum of the OTB counts of its word forms
                total += otb_counts.get(item.ordmynd, 0)

                raw_tag = translate_tag(category, item.flokkur, item.hluti, item.greining)
                tag = icepy_encode(raw_tag)

                # nouns with a capitalised lemma get a proper-noun marker,
                # unless the tag already contains a '-'
                if tag[0] == 'n' and item.lemma[0].isupper() and '-' not in tag:
                    tag += 's' if tag[-1] == 'g' else '-s'

                # the first entry that yields a non-empty lemma decides it
                if not lemma:
                    lemma = icepy_encode(item.lemma.lower())
                form = icepy_encode(item.ordmynd.lower())

                self.tag_count[tag] += 1
                encoded.append((form, tag))

            last_id += 1
            self.id_lemma[last_id] = (lemma, category, total)
            self._prefix_fill(last_id, encoded, suffix_ids)

        # -- morphemes: no count information is available, so the count is 0
        for morph_lemma, morph_entries in corpustools.read_morphemes_grouped():
            morph_lemma = icepy_encode(morph_lemma)
            morph_entries = [icepy_encode(pair) for pair in morph_entries]

            for _, morph_tag in morph_entries:
                self.tag_count[morph_tag] += 1

            last_id += 1
            self.id_lemma[last_id] = (morph_lemma, 'm', 0)
            self._prefix_fill(last_id, morph_entries, suffix_ids)

        # -- register a suffix map (empty suffix -> single tag) per adverb tag
        for _, adv_tag, _ in otb_adverbs:
            adv_tag = icepy_encode(adv_tag)
            self.tag_count[adv_tag] += 1
            key = (('', (adv_tag,)),)
            if key not in suffix_ids:
                suffix_ids[key] = len(suffix_ids)

        # -- invert the suffix table: id -> suffix dict
        for frozen, sid in suffix_ids.iteritems():
            self.id_suffixes[sid] = dict(frozen)

        # -- add the OTB adverbs that the maps do not already provide
        for adv_word, adv_tag, adv_count in otb_adverbs:
            if self._lookup_candidates(adv_word, tag=adv_tag):
                continue

            encoded_word = icepy_encode(adv_word)
            last_id += 1
            self.id_lemma[last_id] = (encoded_word, 'a', adv_count)

            # the key was registered in the adverb-tag pass above
            sid = suffix_ids[(('', (icepy_encode(adv_tag),)),)]
            self.prefix_map[encoded_word].append((last_id, sid, 1))

        # -- bloom filter
        self._generate_bloom()