def process_language(lang):
    """Compute noun/verb paradigm entropies for flexible lemmas in one language.

    Loads the pickled POS corpus for *lang*, keeps lemmas attested at least
    30 times as both a noun and a verb (and flagged ``is_flexible``), then
    adds two columns with the Shannon entropy of each lemma's noun and verb
    word-form distributions.

    Relies on module-level names being in scope: ``args`` (for
    ``args.pkl_dir``), ``src.corpus``, ``collections``, ``scipy``, ``np``.

    Returns:
        The filtered per-lemma stats DataFrame with added ``noun_entropy``
        and ``verb_entropy`` columns, sorted by ``total_count`` descending.
    """
    corpus = src.corpus.POSCorpus.create_from_pickle(
        data_file_path=args.pkl_dir + '/' + lang + '.pkl')

    # Cosine similarity between noun and verb usages
    lemma_count_df = corpus.get_per_lemma_stats()

    # Filter: must have at least 30 noun and 30 verb usages, be flexible,
    # and have a real lemma ('_' marks a missing lemma in the data).
    lemma_count_df = lemma_count_df[
        (lemma_count_df['noun_count'] >= 30) &
        (lemma_count_df['verb_count'] >= 30) &
        (lemma_count_df['is_flexible']) &
        (lemma_count_df['lemma'] != '_')]
    lemma_count_df = lemma_count_df.sort_values('total_count', ascending=False)
    print('Remaining lemmas:', len(lemma_count_df))
    print('Noun lemmas:', len(lemma_count_df[lemma_count_df.majority_tag == 'NOUN']))
    print('Verb lemmas:', len(lemma_count_df[lemma_count_df.majority_tag == 'VERB']))

    # PERF: build word-form counters for every kept lemma in ONE pass over
    # the corpus. The previous version rescanned all sentences once per
    # lemma inside DataFrame.apply, i.e. O(#lemmas * corpus size).
    wanted = set(lemma_count_df['lemma'])
    noun_counters = collections.defaultdict(collections.Counter)
    verb_counters = collections.defaultdict(collections.Counter)
    for sentence in corpus.sentences:
        for tok in sentence:
            lemma = tok['lemma']
            if lemma in wanted:
                if tok['pos'] == 'NOUN':
                    noun_counters[lemma][tok['word'].lower()] += 1
                elif tok['pos'] == 'VERB':
                    verb_counters[lemma][tok['word'].lower()] += 1

    def paradigm_counters(lemma):
        """Return (noun_entropy, verb_entropy) of the lemma's form counts."""
        noun_entropy = scipy.stats.entropy(
            np.array(list(noun_counters[lemma].values())))
        verb_entropy = scipy.stats.entropy(
            np.array(list(verb_counters[lemma].values())))
        print(lemma, noun_entropy, verb_entropy)
        return noun_entropy, verb_entropy

    lemma_count_df[['noun_entropy', 'verb_entropy']] = lemma_count_df.apply(
        lambda row: paradigm_counters(row.lemma), axis=1, result_type="expand")
    return lemma_count_df
# Count how often each POS tag occurs across the whole corpus.
# NOTE(review): assumes `corpus`, `defaultdict`, and `plt` were set up in an
# earlier notebook cell — confirm against the rest of the file.
pos_counts = defaultdict(int)
for sentence in corpus.sentences:
    for token in sentence:
        pos_tag = token['pos']
        if pos_tag:  # skip tokens with an empty/missing POS tag
            pos_counts[pos_tag] += 1


# In[4]:

# Bar chart of the POS-tag distribution.
plt.figure(figsize=(12, 6))
plt.bar(pos_counts.keys(), pos_counts.values())


# In[5]:

# Per-lemma usage statistics; display the 20 most frequent lemmas.
lemma_count_df = corpus.get_per_lemma_stats()
lemma_count_df.sort_values('total_count', ascending=False).head(20)


# ## Distribution of lemmas

# In[6]:

# Histogram of per-lemma total counts, bucketed up to 60 occurrences.
plt.figure(figsize=(15, 5))
lemma_count_df['total_count'].hist(bins=range(0, 60))


# ## Syntax flexibility metrics

# In[7]:

# Only consider lemmas with at least 5 usages
# NOTE(review): the statement below is truncated at the edge of the visible
# chunk — the filter condition continues past this view.
lemma_count_df = lemma_count_df[