def count_lemmas(filename):
    """Count how often each lemma occurs in a UTF-8 text file.

    The file is split on whitespace, surrounding punctuation is stripped
    from every token, and each remaining word is lemmatized with the
    ``lematiza2`` call of the Apicultur proxy.  Only the first lemma
    returned for a word is used; no disambiguation is attempted.

    The number of lines, the number of words and every ``word => lemma``
    pair are printed to stdout as a side effect.

    Args:
        filename: Path of the text file to analyse (read as UTF-8).

    Returns:
        A ``Counter`` keyed by ``(lema, categoria)`` tuples, taken from the
        first entry of each response, with the number of words that mapped
        to that pair.
    """
    # Read file -- the context manager guarantees the handle is closed.
    with open(filename, encoding='utf-8') as handle:
        lines = handle.readlines()
    print(u"%d lines" % len(lines))

    # Tokenize words.  The backslash is written as an explicit escape so the
    # literal holds the same characters without an invalid "\]" sequence.
    punctuation = u"""¡,.;:!"#$%&'()*+-/<=>¿?@[\\]^_`{|}~"""
    stripped = (token.strip(punctuation) for line in lines for token in line.split())
    # Tokens made only of punctuation strip down to '' and are not words.
    words = [word for word in stripped if word]
    print(u"%d words" % len(words))

    # APICULTUR: create the API proxy.
    # NOTE(review): the second argument looks like an application name -- confirm.
    # Throttling via set_throttle() was left disabled in the original code.
    apiculture = ApiculturRateLimitSafe(ACCESS_TOKEN, "example")

    # Lemmatize -- one API request per word.
    counter = Counter()
    for word in words:
        lemmas = apiculture.lematiza2(word=word)
        if not lemmas:
            continue
        candidates = lemmas['lemas']
        if not candidates:
            # Nothing to pick from; avoid an IndexError on an empty list.
            continue
        lema = candidates[0]  # TODO: Disambiguation!
        print(u'%s => %s' % (word, lema['lema']))
        counter[(lema['lema'], lema['categoria'])] += 1
    return counter
def cervantes_level(filename):
    """Accumulate the Cervantes level of the lemmatized words of a text file.

    The file is read as UTF-8, split on whitespace and stripped of
    surrounding punctuation.  Every word is lemmatized with ``lematiza2``
    (first lemma only, no disambiguation) and the level of each distinct
    lemma is then requested with ``damenivel``.  Progress is printed to
    stdout along the way.

    Args:
        filename: Path of the text file to analyse (read as UTF-8).

    Returns:
        A tuple ``(sum_levels, n_lemmas)``:

        * ``sum_levels`` -- sum of ``level * occurrences`` over every lemma
          whose reported level is non-zero.
        * ``n_lemmas`` -- total occurrences of those lemmas.

        The mean is *not* computed here; a caller dividing the two values
        must handle ``n_lemmas == 0`` itself.
    """
    # Read file -- the context manager guarantees the handle is closed.
    with open(filename, encoding='utf-8') as handle:
        lines = handle.readlines()
    print(u"%d lines" % len(lines))

    # Tokenize words.  The backslash is written as an explicit escape so the
    # literal holds the same characters without an invalid "\]" sequence.
    punctuation = u"""¡,.;:!"#$%&'()*+-/<=>¿?@[\\]^_`{|}~"""
    stripped = (token.strip(punctuation) for line in lines for token in line.split())
    # Tokens made only of punctuation strip down to '' and are not words.
    words = [word for word in stripped if word]
    print(u"%d words" % len(words))

    # APICULTUR: create the API proxy.
    # NOTE(review): the second argument looks like an application name -- confirm.
    apiculture = ApiculturRateLimitSafe(ACCESS_TOKEN, "example")

    # Lemmatize -- one API request per word.
    print(u"=== 1) Lematizar")
    counter = Counter()
    for word in words:
        lemmas = apiculture.lematiza2(word=word)
        if not lemmas:
            continue
        candidates = lemmas['lemas']
        if not candidates:
            # Nothing to pick from; avoid an IndexError on an empty list.
            continue
        lema = candidates[0]  # TODO: Disambiguation!
        print(u'%s => %s' % (word, lema['lema']))
        counter[(lema['lema'], lema['categoria'])] += 1

    # Get Cervantes level for each distinct lemma, most frequent first.
    print(u"=== 2) Nivel de cada lema")
    sum_levels = 0
    n_lemmas = 0
    for (lemma, cat), count in counter.most_common():
        response = apiculture.damenivel(word=lemma)
        if not response:
            continue
        # Assumes 'valor' is numeric, since it is multiplied by the count below.
        level = response['valor']
        if level == 0:
            # TODO: level == 0 for non categorized words
            continue
        print(u'%s (%s) => level %s' % (lemma, cat, level))
        # Weight the level by how many times the lemma occurred in the text.
        sum_levels += level * count
        n_lemmas += count

    # Return the weighted sum and the occurrence count (not the mean itself).
    return sum_levels, n_lemmas