Ejemplo n.º 1
0
def letter_frequency():
    """ Tallies how often each letter occurs in the word forms of the BÍN
    corpus.

    Walks every entry yielded by read_bin() and iterates over its ``ordmynd``
    attribute letter by letter.

    Returns a defaultdict(int) mapping letter => number of occurrences.
    """
    counts = defaultdict(int)
    for record in read_bin():
        word_form = record.ordmynd
        for char in word_form:
            counts[char] += 1
    return counts
Ejemplo n.º 2
0
def bin_debug():
    """ Debugging aid: lists the BÍN entries whose word form and lemma start
    with different letters.

    Each entry from read_bin(silent=True) is checked by comparing the first
    element of ``ordmynd`` with the first element of ``lemma``. Entries that
    differ are converted with unicode(), encoded as UTF-8 and printed.
    Nothing is returned.
    """
    for entry in read_bin(silent=True):
        mismatch = entry.ordmynd[0] != entry.lemma[0]
        if not mismatch:
            continue
        # Parenthesised single-argument form; prints the same text as the
        # bare print statement.
        print(unicode(entry).encode('utf-8'))
Ejemplo n.º 3
0
def letter_frequency():
    """ Counts the letters found in the BÍN corpus.

    All ``ordmynd`` values of the entries produced by read_bin() are streamed
    through a single flattened generator, and every letter seen bumps its
    counter by one.

    Returns a defaultdict(int) with letters as keys and counts as values.
    """
    tally = defaultdict(int)
    letters = (ch for item in read_bin() for ch in item.ordmynd)
    for ch in letters:
        tally[ch] += 1
    return tally
Ejemplo n.º 4
0
def bin_debug():
    """ Prints the BÍN entries for which the word form (``ordmynd``) and the
    lemma do not share the same first letter.

    Entries are read with read_bin(silent=True). A matching entry is turned
    into text with unicode() and written out UTF-8 encoded. Intended for
    debugging only; nothing is returned.
    """
    for record in read_bin(silent=True):
        form_initial, lemma_initial = record.ordmynd[0], record.lemma[0]
        if form_initial != lemma_initial:
            line = unicode(record).encode('utf-8')
            print(line)
Ejemplo n.º 5
0
def tag_frequency():
    """ Counts how often each tag combination occurs in the BÍN corpus.

    For every entry from read_bin() the tag is obtained from translate_tag(),
    which is given the CATEGORY_MAP lookup of the entry's ``flokkur`` followed
    by ``flokkur``, ``hluti`` and ``greining`` (translation to IceNLP format).

    Returns a defaultdict(int) keyed by the tuple
    (tag, flokkur, hluti, greining); the value is the number of entries seen
    with that key.
    """
    counts = defaultdict(int)
    for entry in read_bin():
        category = CATEGORY_MAP[entry.flokkur]
        tag = translate_tag(category, entry.flokkur, entry.hluti,
                            entry.greining)
        key = (tag, entry.flokkur, entry.hluti, entry.greining)
        counts[key] += 1
    return counts
Ejemplo n.º 6
0
def tag_frequency():
    """ Builds a frequency table of the tags found in the BÍN corpus.

    Each entry read via read_bin() is translated with translate_tag() (to
    IceNLP format), using CATEGORY_MAP[flokkur] plus the entry's ``flokkur``,
    ``hluti`` and ``greining`` values as arguments.

    Returns a defaultdict(int) whose keys are
    (tag, flokkur, hluti, greining) tuples and whose values are counts.
    """
    tally = defaultdict(int)
    for item in read_bin():
        icenlp_tag = translate_tag(
            CATEGORY_MAP[item.flokkur],
            item.flokkur,
            item.hluti,
            item.greining
        )
        # A comma-separated subscript is the same 4-tuple key.
        tally[icenlp_tag, item.flokkur, item.hluti, item.greining] += 1
    return tally
Ejemplo n.º 7
0
def write_wordlist(fout, encoding='utf8'):
    """ Dumps the word forms of the BÍN database into fout, one per line.

    fout     -- a file or file-like object; only its write() method is used.
    encoding -- codec name handed to encode() for every word form
                (default 'utf8').

    Nothing is returned.
    """
    for entry in read_bin():
        encoded = entry.ordmynd.encode(encoding)
        fout.write(encoded + '\n')
Ejemplo n.º 8
0
def bin_read():
    """ Prints every entry of the BÍN corpus.

    Entries come from read_bin(silent=True). Each one is converted with
    unicode() and encoded as UTF-8 before being printed. Nothing is returned.
    """
    for entry in read_bin(silent=True):
        text = unicode(entry)
        # Parenthesised single-argument form; prints the same text as the
        # bare print statement.
        print(text.encode('utf-8'))
Ejemplo n.º 9
0
def write_wordlist(fout, encoding='utf8'):
    """ Writes a plain word list built from the BÍN database.

    For each entry returned by read_bin() the ``ordmynd`` value is encoded
    with the given encoding, terminated with a newline and passed to
    fout.write(). fout should be a file or file-like object.
    """
    for record in read_bin():
        line = record.ordmynd.encode(encoding) + '\n'
        fout.write(line)
Ejemplo n.º 10
0
def bin_read():
    """ Outputs all entries of the BÍN corpus, one print call per entry.

    Reads with read_bin(silent=True) and prints the UTF-8 encoding of
    unicode(entry) for each entry. Returns nothing.
    """
    for record in read_bin(silent=True):
        encoded = unicode(record).encode('utf-8')
        print(encoded)