def bin_debug_hk(groups=None):
    """
    Print the id and lemma of BÍN entry groups showing the 'hk'/'alm' bug.

    A short routine created to detect a strange bug in the BÍN corpus where
    certain types of compound words ('hk', 'alm') were incorrectly entered.
    This bug has apparently been fixed but the code may be useful as a
    reference.

    Only groups whose first entry has flokkur 'hk' and hluti 'alm' are
    examined. Within such a group an entry counts as "proper" when its word
    form starts with the same character as the first entry's word form and
    it is not a truncated form (the lemma is longer than 3 characters while
    the word form is at least 2 characters shorter than the lemma). A group
    is reported when exactly 4 of its entries are not proper.

    groups -- optional iterable of entry groups; each group is a sequence of
              objects with flokkur, hluti, ordmynd, lemma and lemma_id
              attributes. When omitted, read_bin_grouped(silent=True) is
              used, which is what the routine always did before.
    """
    if groups is None:
        groups = read_bin_grouped(silent=True)

    for entries in groups:
        # an empty group has no first entry to classify, so skip it
        if not entries:
            continue

        first_entry = entries[0]
        if first_entry.flokkur != 'hk' or first_entry.hluti != 'alm':
            continue

        initial = first_entry.ordmynd[0]
        proper = [
            entry for entry in entries
            if entry.ordmynd[0] == initial
            and not (len(entry.lemma) > 3
                     and len(entry.ordmynd) <= len(entry.lemma) - 2)
        ]

        # NOTE(review): the original code does not explain why exactly four
        # improper entries mark the bug -- presumably that is how many bogus
        # forms the faulty compounds carried; confirm before reusing.
        if len(entries) - len(proper) == 4:
            # single-argument print() is valid on both Python 2 and Python 3
            print('%s %s' % (first_entry.lemma_id, first_entry.lemma))
def bin_debug_hk(groups=None):
    """
    Print the id and lemma of BÍN entry groups showing the 'hk'/'alm' bug.

    A short routine created to detect a strange bug in the BÍN corpus where
    certain types of compound words ('hk', 'alm') were incorrectly entered.
    This bug has apparently been fixed but the code may be useful as a
    reference.

    Only groups whose first entry has flokkur 'hk' and hluti 'alm' are
    examined. Within such a group an entry counts as "proper" when its word
    form starts with the same character as the first entry's word form and
    it is not a truncated form (the lemma is longer than 3 characters while
    the word form is at least 2 characters shorter than the lemma). A group
    is reported when exactly 4 of its entries are not proper.

    groups -- optional iterable of entry groups; each group is a sequence of
              objects with flokkur, hluti, ordmynd, lemma and lemma_id
              attributes. When omitted, read_bin_grouped(silent=True) is
              used, which is what the routine always did before.
    """
    if groups is None:
        groups = read_bin_grouped(silent=True)

    for entries in groups:
        # an empty group has no first entry to classify, so skip it
        if not entries:
            continue

        first_entry = entries[0]
        if first_entry.flokkur != 'hk' or first_entry.hluti != 'alm':
            continue

        initial = first_entry.ordmynd[0]
        proper = [
            entry for entry in entries
            if entry.ordmynd[0] == initial
            and not (len(entry.lemma) > 3
                     and len(entry.ordmynd) <= len(entry.lemma) - 2)
        ]

        # NOTE(review): the original code does not explain why exactly four
        # improper entries mark the bug -- presumably that is how many bogus
        # forms the faulty compounds carried; confirm before reusing.
        if len(entries) - len(proper) == 4:
            # single-argument print() is valid on both Python 2 and Python 3
            print('%s %s' % (first_entry.lemma_id, first_entry.lemma))
def _process_raw(self):
    """
    Fill the lemma, suffix, prefix and tag-count tables from the raw data.

    The steps run in this order:
      1. read OTB rows into a word -> count map, keeping adverb rows aside
      2. number every BIN entry group as one lemma and hand its encoded
         (word, tag) pairs to self._prefix_fill
      3. do the same for morpheme groups (category 'm', count 0)
      4. make sure every OTB adverb tag has a suffix id
      5. store the suffix maps by id in self.id_suffixes
      6. add OTB adverbs that self._lookup_candidates does not find
      7. call self._generate_bloom()
    """
    suffix_ids = {}

    # --- OTB: word counts, plus the adverb rows for later injection
    otb_counts = {}
    otb_adverbs = []
    for word, tag, count in corpustools.read_otb():
        otb_counts[word] = count
        if tag[0] == 'a':
            otb_adverbs.append((word, tag, count))

    # --- BIN: one lemma id per entry group
    lemma_id = 0
    for group in corpustools.read_bin_grouped(filter=True):
        category = CATEGORY_MAP[group[0].flokkur]
        frequency = 0
        lemma = None
        encoded_forms = []

        for entry in group:
            frequency += otb_counts.get(entry.ordmynd, 0)

            tag = icepy_encode(
                translate_tag(category, entry.flokkur, entry.hluti,
                              entry.greining)
            )
            # proper noun marker: capitalised lemma on a noun tag that does
            # not already contain '-'
            if tag[0] == 'n' and entry.lemma[0].isupper() and '-' not in tag:
                tag += 's' if tag[-1] == 'g' else '-s'

            if not lemma:
                lemma = icepy_encode(entry.lemma.lower())
            form = icepy_encode(entry.ordmynd.lower())

            self.tag_count[tag] += 1
            encoded_forms.append((form, tag))

        lemma_id += 1
        self.id_lemma[lemma_id] = (lemma, category, frequency)
        self._prefix_fill(lemma_id, encoded_forms, suffix_ids)

    # --- morphemes: no count information is available, so count is 0
    for morpheme, pairs in corpustools.read_morphemes_grouped():
        morpheme = icepy_encode(morpheme)
        pairs = [icepy_encode(pair) for pair in pairs]
        for _form, tag in pairs:
            self.tag_count[tag] += 1

        lemma_id += 1
        self.id_lemma[lemma_id] = (morpheme, 'm', 0)
        self._prefix_fill(lemma_id, pairs, suffix_ids)

    # --- register a suffix map (empty suffix -> single tag) per adverb tag
    for _word, tag, _count in otb_adverbs:
        tag = icepy_encode(tag)
        suffix_key = (('', (tag,)),)
        self.tag_count[tag] += 1
        if suffix_key not in suffix_ids:
            suffix_ids[suffix_key] = len(suffix_ids)

    # --- invert: suffix id -> suffix map
    for suffixes, suffix_id in suffix_ids.iteritems():
        self.id_suffixes[suffix_id] = dict(suffixes)

    # --- add the OTB adverbs that are not already in the maps
    for word, tag, count in otb_adverbs:
        if self._lookup_candidates(word, tag=tag):
            continue
        encoded_word = icepy_encode(word)
        lemma_id += 1
        self.id_lemma[lemma_id] = (encoded_word, 'a', count)
        suffix_id = suffix_ids[(('', (icepy_encode(tag),)),)]
        self.prefix_map[encoded_word].append((lemma_id, suffix_id, 1))

    # --- bloom filter
    self._generate_bloom()