def count_lemmas(filename):
    """Count how often each (lemma, category) pair occurs in a text file.

    The file is read as UTF-8 and split on whitespace; leading and trailing
    punctuation is stripped from every token, and tokens that end up empty
    (i.e. made only of punctuation) are discarded. Each remaining word is
    passed to ``lematiza2`` on the Apicultur proxy and the first lemma of
    the answer is counted -- no disambiguation is attempted.

    Progress is printed to stdout: the number of lines, the number of words
    and one ``word => lemma`` line per lemmatized word.

    Args:
        filename: Path of the UTF-8 encoded text file to analyse.

    Returns:
        A ``Counter`` keyed by ``(lemma, category)`` tuples.
    """
    # Read file; the context manager guarantees the handle is closed.
    with open(filename, encoding='utf-8') as source:
        lines = source.readlines()
    print(u"%d lines" % len(lines))

    # Tokenize words. The backslash is escaped explicitly because "\]" is
    # not a valid escape sequence; the resulting character set is unchanged.
    punctuation = u"""¡,.;:!"#$%&'()*+-/<=>¿?@[\\]^_`{|}~"""
    stripped = (token.strip(punctuation) for line in lines for token in line.split())
    # Drop tokens that were pure punctuation: there is nothing to lemmatize.
    words = [word for word in stripped if word]
    print(u"%d words" % len(words))

    # APICULTUR
    apiculture = ApiculturRateLimitSafe(ACCESS_TOKEN, "example")  # create API proxy
    # Optional throttling for rate-limited subscriptions
    # (TODO confirm the meaning of the arguments):
    # apiculture.set_throttle(60, 60)

    # Lemmatize -- one API request per word occurrence
    counter = Counter()
    for word in words:
        lemmas = apiculture.lematiza2(word=word)
        # Skip words with no answer or with an empty list of candidates.
        if not lemmas or not lemmas['lemas']:
            continue
        lema = lemmas['lemas'][0]  # TODO: Disambiguation!
        print(u'%s => %s' % (word, lema['lema']))
        counter[(lema['lema'], lema['categoria'])] += 1
    return counter
def cervantes_level(filename):
    """Accumulate the Cervantes level of the lemmatized words of a text file.

    Works in two steps, both backed by the Apicultur proxy:

    1. Every word of the file is lemmatized with ``lematiza2`` and the first
       lemma returned is counted per ``(lemma, category)`` pair (this
       repeats the processing done by ``count_lemmas``).
    2. For every counted lemma ``damenivel`` is queried; lemmas with no
       answer or with level ``0`` (non categorized words) are ignored.

    Progress is printed to stdout along the way.

    Args:
        filename: Path of the UTF-8 encoded text file to analyse.

    Returns:
        A ``(sum_levels, n_lemmas)`` tuple: the sum of ``level * count``
        over the lemmas that have a non-zero level, and the total number of
        occurrences of those lemmas. The mean is NOT computed here; a
        caller dividing one by the other must handle ``n_lemmas == 0``.
    """
    # Read file; the context manager guarantees the handle is closed.
    with open(filename, encoding='utf-8') as source:
        lines = source.readlines()
    print(u"%d lines" % len(lines))

    # Tokenize words. The backslash is escaped explicitly because "\]" is
    # not a valid escape sequence; the resulting character set is unchanged.
    punctuation = u"""¡,.;:!"#$%&'()*+-/<=>¿?@[\\]^_`{|}~"""
    stripped = (token.strip(punctuation) for line in lines for token in line.split())
    # Drop tokens that were pure punctuation: there is nothing to lemmatize.
    words = [word for word in stripped if word]
    print(u"%d words" % len(words))

    # APICULTUR
    apiculture = ApiculturRateLimitSafe(ACCESS_TOKEN, "example")  # create API proxy

    # Lemmatize -- one API request per word occurrence
    print(u"=== 1) Lematizar")
    counter = Counter()
    for word in words:
        lemmas = apiculture.lematiza2(word=word)
        # Skip words with no answer or with an empty list of candidates.
        if not lemmas or not lemmas['lemas']:
            continue
        lema = lemmas['lemas'][0]  # TODO: Disambiguation!
        print(u'%s => %s' % (word, lema['lema']))
        counter[(lema['lema'], lema['categoria'])] += 1

    # Get Cervantes level for each lemma, most frequent first
    print(u"=== 2) Nivel de cada lema")
    sum_levels = 0
    n_lemmas = 0
    for (lemma, cat), count in counter.most_common():
        level = apiculture.damenivel(word=lemma)
        if not level:
            continue
        level = level['valor']
        if level != 0:  # TODO: level == 0 for non categorized words
            print(u'%s (%s) => level %s' % (lemma, cat, level))
            # Weight the level by the number of occurrences of the lemma.
            sum_levels += level * count
            n_lemmas += count

    # Return the raw totals; the caller derives the mean level from them.
    return sum_levels, n_lemmas