Example #1
import collections

import numpy as np
import scipy.stats

import src.corpus


def process_language(lang):
    corpus = src.corpus.POSCorpus.create_from_pickle(
        data_file_path=args.pkl_dir + '/' + lang + '.pkl')
    # Uncomment to downsample the corpus for faster iteration:
    # corpus.sentences = corpus.sentences[:len(corpus.sentences) // 50]

    # Per-lemma noun/verb usage statistics
    lemma_count_df = corpus.get_per_lemma_stats()

    # Filter: must have at least 30 noun and 30 verb usages,
    # be flexible, and have a valid lemma.
    lemma_count_df = lemma_count_df[(lemma_count_df['noun_count'] >= 30)
                                    & (lemma_count_df['verb_count'] >= 30) &
                                    (lemma_count_df['is_flexible']) &
                                    (lemma_count_df['lemma'] != '_')]
    lemma_count_df = lemma_count_df.sort_values('total_count', ascending=False)
    print('Remaining lemmas:', len(lemma_count_df))
    print('Noun lemmas:',
          len(lemma_count_df[lemma_count_df.majority_tag == 'NOUN']))
    print('Verb lemmas:',
          len(lemma_count_df[lemma_count_df.majority_tag == 'VERB']))

    def paradigm_counters(lemma):
        """Return entropies of the lemma's noun and verb word-form distributions."""
        noun_counter = collections.Counter()
        verb_counter = collections.Counter()
        for sentence in corpus.sentences:
            for tok in sentence:
                if tok['lemma'] == lemma:
                    if tok['pos'] == 'NOUN':
                        noun_counter[tok['word'].lower()] += 1
                    elif tok['pos'] == 'VERB':
                        verb_counter[tok['word'].lower()] += 1

        noun_entropy = scipy.stats.entropy(
            np.array(list(noun_counter.values())))
        verb_entropy = scipy.stats.entropy(
            np.array(list(verb_counter.values())))
        print(lemma, noun_entropy, verb_entropy)
        return noun_entropy, verb_entropy

    lemma_count_df[['noun_entropy', 'verb_entropy']] = lemma_count_df.apply(
        lambda row: paradigm_counters(row.lemma), axis=1, result_type="expand")
    return lemma_count_df
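
# Hypothetical usage sketch: process_language reads a module-level `args`
# object with a `pkl_dir` attribute. The argparse setup, the default path,
# and the 'en' language code below are illustrative assumptions, not part
# of the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--pkl_dir', type=str, default='data/pkl')
    args = parser.parse_args()

    result_df = process_language('en')
    # Lemmas whose noun paradigm is more varied than their verb paradigm:
    print(result_df[result_df.noun_entropy > result_df.verb_entropy].head())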
Example #2
from collections import defaultdict

import matplotlib.pyplot as plt

# Count POS tag frequencies across the corpus (assumes `corpus` was
# loaded in an earlier cell).
pos_counts = defaultdict(int)

for sentence in corpus.sentences:
    for token in sentence:
        pos_tag = token['pos']
        if pos_tag:
            pos_counts[pos_tag] += 1
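
# Design note: collections.Counter would give the same tally more compactly.
# An equivalent sketch (not in the original notebook):
#
#     from collections import Counter
#     pos_counts = Counter(token['pos'] for sentence in corpus.sentences
#                          for token in sentence if token['pos'])
#     print(pos_counts.most_common(5))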

# In[4]:

# Bar chart of POS tag frequencies
plt.figure(figsize=(12, 6))
plt.bar(pos_counts.keys(), pos_counts.values())

# In[5]:

lemma_count_df = corpus.get_per_lemma_stats()
# The 20 most frequent lemmas
lemma_count_df.sort_values('total_count', ascending=False).head(20)

# ## Distribution of lemmas

# In[6]:

plt.figure(figsize=(15, 5))
lemma_count_df['total_count'].hist(bins=range(0, 60))

# ## Syntax flexibility metrics

# In[7]:

# Only consider lemmas with at least 5 usages
lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 5]
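
# A plausible next step (hypothetical, not from the original notebook):
# quantify flexibility as the share of a lemma's noun/verb usages that
# fall in its minority part of speech.
#
#     lemma_count_df['minority_ratio'] = (
#         lemma_count_df[['noun_count', 'verb_count']].min(axis=1)
#         / (lemma_count_df['noun_count'] + lemma_count_df['verb_count']))
#     lemma_count_df.sort_values('minority_ratio', ascending=False).head(10)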