Ejemplo n.º 1
0
Archivo: lsa.py Proyecto: bsrdrk/NLP
# Accumulators filled by the lemmatisation pass below.
lemma_sentence = []  # one space-joined lemma string per non-empty sentence
lemma_dict = {}  # lemma -> number of occurrences, stored as a float
if __name__ == '__main__':
    ZEMBEREK_PATH: str = join('zemberek-full.jar')

    # Start the JVM with the Zemberek jar on the class path; '-ea' is passed
    # through to the JVM unchanged.
    startJVM(
        getDefaultJVMPath(), '-ea', f'-Djava.class.path={ZEMBEREK_PATH}')

    morphology = JClass(
        'zemberek.morphology.TurkishMorphology').createWithDefaults()
    # NOTE(review): `WordAnalysis` and `lemmas` are not used in this section;
    # they are kept because later parts of the file may still read them.
    WordAnalysis = JClass('zemberek.morphology.analysis.WordAnalysis')
    lemmas = {}
    for sentence in sentence_tokens:
        # Empty strings carry nothing to analyse and add no entry to
        # lemma_sentence.
        if sentence == '':
            continue

        analysis: java.util.ArrayList = (
            morphology.analyzeAndDisambiguate(sentence).bestAnalysis())

        lemma = []
        for word_analysis in analysis:
            # Fetch the first lemma once per word: every getLemmas() call
            # crosses the Python/Java bridge, so the result is reused for
            # all three accumulators instead of being recomputed.
            lemma_text = str(word_analysis.getLemmas()[0])
            lemma.append(lemma_text)
            lemma_dict[lemma_text] = lemma_dict.get(lemma_text, 0.0) + 1.0
            lemmas_list.append(lemma_text)
        lemma_sentence.append(' '.join(lemma))

# %% ngram
# Pair up the collected lemmas (n = 2) and count how often each pair occurs.
# NOTE(review): these two statements sit outside the `__main__` guard, while
# `lemmas_list` is only appended to inside it — confirm that importing this
# module (rather than running it as a script) is not expected to work.
# NOTE(review): if `ngrams` returns a one-shot iterator (as nltk.util.ngrams
# does), `bigram` is already exhausted once `Counter` has consumed it —
# confirm before reusing `bigram` further down.
bigram = ngrams(lemmas_list, 2)
bigram_freq = Counter(bigram)