Example #1
def _print_histogram_agg(histogram_1: Histogram,
                         histogram_2: Histogram) -> None:
    """
    Prints Histogram aggregation examples.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """
    print('\nHistogram 1, Total Count:', histogram_1.totalCount())
    print('Histogram 2, Total Count:', histogram_2.totalCount())

    print('\nHistogram 1, Size:', histogram_1.size())
    print('Histogram 2, Size:', histogram_2.size())

    print(
        '\nHistogram 1, \'apple\' Count:',
        histogram_1.getCount(JString('apple')),
    )
    print('Histogram 2, \'apple\' Count:',
          histogram_2.getCount(JString('apple')))

    print('\nHistogram 1, Max Count:', histogram_1.maxValue())
    print('Histogram 2, Max Count:', histogram_2.maxValue())

    print('\nHistogram 1, Min Count:', histogram_1.minValue())
    print('Histogram 2, Min Count:', histogram_2.minValue())
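This snippet assumes two already-populated Histogram objects. A minimal sketch of how they might be built; the zemberek.core.collections.Histogram class path and its add() method are assumptions, since construction is not part of the original example.

Histogram: JClass = JClass('zemberek.core.collections.Histogram')

histogram_1 = Histogram()
histogram_2 = Histogram()
for word in ['apple', 'apple', 'pear', 'grape']:
    histogram_1.add(JString(word))  # add() is an assumption
for word in ['apple', 'banana', 'grape', 'grape']:
    histogram_2.add(JString(word))

_print_histogram_agg(histogram_1, histogram_2)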
Example #2
    def replace_words_with_lemma(cls, sentence: str) -> str:
        """
        Replaces each word of a labeled sentence with its first lemma,
        keeping the leading label token intact.
        """
        tokens: List[str] = sentence.split()
        label: str = tokens[0]
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return sentence

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence))
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(JString(label))

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()

            if best.isUnknown():
                res.add(word_analysis.getWordAnalysis().getInput())
                continue

            lemmas: java.util.ArrayList = best.getLemmas()
            res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)
Example #3
def run(sentence: str) -> None:
    """
    Spell checking example.

    Args:
        sentence (str): Sentence to check for spelling errors.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)

    words: List[str] = sentence.split(' ')
    fixed_words: List[str] = []

    for word in words:
        if not spell_checker.check(JString(word)):
            print(f'Spelling error: {word}')
            suggestions: java.util.ArrayList = spell_checker.suggestForWord(
                JString(word))
            if suggestions:
                print(f'\nSuggestions for "{word}":')
                for suggestion in suggestions:
                    print(f' | {suggestion}')
                fixed_words.append(str(suggestions[0]))
                continue
            else:
                print(f'No suggestions found for "{word}".')
        fixed_words.append(word)

    print('\nFixed sentence:', ' '.join(fixed_words))
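All of these Zemberek snippets need a running JVM with the Zemberek jar on the class path before any JClass lookup works. A minimal bootstrap sketch, following the startJVM pattern used in Examples #12 and #29 below; the jar location and the test sentence are assumptions.

from jpype import JClass, JString, getDefaultJVMPath, startJVM

startJVM(getDefaultJVMPath(), '-ea',
         '-Djava.class.path=zemberek-full.jar',  # jar path is an assumption
         convertStrings=False)

TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
TurkishSpellChecker: JClass = JClass(
    'zemberek.normalization.TurkishSpellChecker')

run('Bu cumlede yazim hatasi var')  # illustrative misspelled sentence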
Example #4
    def split_words(cls, sentence: str) -> str:
        tokens: List[str] = sentence.split()
        label: java.lang.String = JString(tokens[0])
        del tokens[0]
        sentence = ' '.join(tokens)

        if len(sentence) == 0:
            return JString(sentence)

        analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
            JString(sentence))
        res: java.util.ArrayList = java.util.ArrayList()
        res.add(label)

        for word_analysis in analysis:
            best: SingleAnalysis = word_analysis.getBestAnalysis()
            inp: java.lang.String = word_analysis.getWordAnalysis().getInput()

            if best.isUnknown():
                res.add(inp)
                continue

            lemmas: java.util.ArrayList = best.getLemmas()

            # If the lemma is shorter than the surface form, keep the lemma
            # and mark the split with '_' plus the first character of the
            # remaining suffix.
            if len(lemmas[0]) < len(inp):
                res.add(lemmas[0])
                res.add(JString('_' + str(inp[len(lemmas[0])])))
            else:
                res.add(lemmas[0])

        return java.lang.String.join(JString(' '), res)
Example #5
def _print_histogram_item_comp(histogram_1: Histogram,
                               histogram_2: Histogram) -> None:
    """
    Prints Histogram item comparison examples.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """
    print(
        '\nIntersection of Histogram 1 and 2:',
        histogram_1.getIntersectionOfKeys(histogram_2),
    )

    print(
        '\nHistogram 1, Contains \'grape\':',
        histogram_1.contains(JString('grape')),
    )
    print(
        'Histogram 2, Contains \'grape\':',
        histogram_2.contains(JString('grape')),
    )

    print(
        '\nHistogram 1, Contains \'apple\':',
        histogram_1.contains(JString('apple')),
    )
    print('Histogram 2, Contains \'apple\':',
          histogram_2.contains(JString('apple')))

    print('\nHistogram 1, Top 3:', histogram_1.getTop(JInt(3)))
    print('Histogram 2, Top 3:', histogram_2.getTop(JInt(3)))
Example #6
    def generate_set_tokenized(cls, lines: List[str], tokenized_path: str):
        with open(tokenized_path, 'w', encoding='utf-8') as f:
            for line in lines:
                # Tokenize, re-join with spaces, strip non-words, lowercase.
                tokenized = cls.remove_non_words(
                    java.lang.String.join(
                        JString(' '),
                        TurkishTokenizer.DEFAULT.tokenizeToStrings(
                            JString(line)))).lower()
                f.write(f'{tokenized}\n')
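remove_non_words is not shown in this example. A hypothetical stand-in consistent with how it is called above (the name exists in the snippet, but this body is an assumption):

    @classmethod
    def remove_non_words(cls, sentence) -> str:
        # Hypothetical body: keep only tokens that contain at least one
        # alphabetic character, dropping bare numbers and punctuation.
        return ' '.join(token for token in str(sentence).split()
                        if any(ch.isalpha() for ch in token))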
Example #7
    def update_input_file(self, positions=None, qm_core_fixed=None):
        Double = JClass('java.lang.Double')
        if positions is not None:
            for iposition, position in enumerate(positions):
                atom = self.atoms[iposition]
                atom.setX3(Double(float(position[0])))
                atom.setY3(Double(float(position[1])))
                atom.setZ3(Double(float(position[2])))
        if qm_core_fixed is not None:
            for m in self.molecules:
                m.setId(JString('MM'))
            for q in qm_core_fixed:
                self.molecules[q - 1].setId(JString('QM_CORE_FIXED'))
        self.jaxbFileWriter.write(JString(self.input_file),
                                  self.jaxb_cml.getValue())
Example #8
    def generateNoun(self, noun):  # For generating nouns

        # ***********************This part is from Zemberek***********************
        number: List[JString] = [JString('A3sg'), JString('A3pl')]
        possessives: List[JString] = [
            JString('P1sg'), JString('P2sg'),
            JString('P3sg')
        ]
        cases: List[JString] = [JString('Dat'), JString('Loc'), JString('Abl')]
        TurkishMorphology: JClass = JClass(
            'zemberek.morphology.TurkishMorphology')
        morphology: TurkishMorphology = (
            TurkishMorphology.builder().setLexicon(noun).disableCache().build())
        # ***********************This part is from Zemberek***********************
        item = morphology.getLexicon().getMatchingItems(noun).get(0)

        for number_m in number:
            for possessive_m in possessives:
                for case_m in cases:
                    for result in morphology.getWordGenerator().generate(
                            item, number_m, possessive_m, case_m):
                        # After generating a new noun from the source, check
                        # whether it obeys the rule. recurrence=False prevents
                        # controller() from calling this method again, which
                        # would otherwise recurse without end.
                        self.controller(str(result.surface), recurrence=False)
        return
Example #9
def _generate_nouns(root_word: str) -> None:
    """
    Generates inflections of the given root word using possessive and case
    suffix combinations.

    Args:
        root_word (str): Root word to generate inflections from.
    """

    print('\nGenerating nouns.\n')

    number: List[JString] = [JString('A3sg'), JString('A3pl')]
    possessives: List[JString] = [
        JString('P1sg'),
        JString('P2sg'),
        JString('P3sg'),
    ]
    cases: List[JString] = [JString('Dat'), JString('Loc'), JString('Abl')]

    morphology: TurkishMorphology = (TurkishMorphology.builder().setLexicon(
        root_word).disableCache().build())

    item: DictionaryItem = (
        morphology.getLexicon().getMatchingItems(root_word).get(0))

    for number_m in number:
        for possessive_m in possessives:
            for case_m in cases:
                for result in morphology.getWordGenerator().generate(
                        item, number_m, possessive_m, case_m):
                    print(str(result.surface))
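A possible invocation; setLexicon() treats its argument as a dictionary line, and the root word below is illustrative, not from the original example.

_generate_nouns('armut')  # 'armut' (pear) is an illustrative root word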
Example #10
def rule_based_corrector(text):
    TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
    TurkishSentenceNormalizer: JClass = JClass(
        'zemberek.normalization.TurkishSentenceNormalizer')

    Paths: JClass = JClass('java.nio.file.Paths')

    paragraph = [text]

    # Note: os.path.join() discards everything before an absolute component,
    # so only the absolute paths below were ever used; the dead relative
    # prefixes have been removed.
    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        Paths.get('/home/busra/PycharmProjects/Piton/myenv/'
                  'Zemberek-Python-Examples-master/examples/normalization'),
        Paths.get('/home/busra/PycharmProjects/Piton/myenv/lm.2gram.slm'))

    result = ' '
    for example in paragraph:
        result = result + str(normalizer.normalize(
            JString(example))).capitalize() + ' '

    return result
Example #11
    def correctDocument(self, document):
        '''This function corrects misspelled words in a document.'''
        spell_checker = self.TurkishSpellChecker(self.DefaultMorphology)
        tokens = self.TurkishTokenizer.ALL.tokenize(JString(document))

        corrected_tokens = []

        for token in tokens:
            text = token.content
            if (
                    token.type != self.Token.Type.NewLine
                    and token.type != self.Token.Type.SpaceTab
                    and token.type != self.Token.Type.Punctuation
                    and token.type != self.Token.Type.RomanNumeral
                    and token.type != self.Token.Type.UnknownWord
                    and token.type != self.Token.Type.Unknown
                    and not spell_checker.check(text)
            ):
                suggestions = list(spell_checker.suggestForWord(token.content))
                if suggestions:
                    suggestion: str = str(suggestions[0])
                    print(f'Correction: {token.content} -> {suggestion}.')
                    corrected_tokens.append(suggestion)
                    continue
            corrected_tokens.append(str(token.content))

        correctedDoc = ' '.join(corrected_tokens)
        if self.verbose:
            print('\nCorrected Document:\n', correctedDoc)

        return correctedDoc
Example #12
def Normalize(query):
    ZEMBEREK_PATH: str = join('Zemberek', 'bin', 'zemberek-full.jar')

    startJVM(getDefaultJVMPath(),
             '-ea',
             f'-Djava.class.path={ZEMBEREK_PATH}',
             convertStrings=False)

    TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
    TurkishSentenceNormalizer: JClass = JClass(
        'zemberek.normalization.TurkishSentenceNormalizer')
    Paths: JClass = JClass('java.nio.file.Paths')

    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        Paths.get(join('Zemberek', 'ZemberekData', 'normalization')),
        Paths.get(join('Zemberek', 'ZemberekData', 'lm', 'lm.2gram.slm')))

    normalized = str(normalizer.normalize(JString(query)))

    print(f'\nNoisy : {query}'
          f'\nNormalized : {normalized}\n')

    # Shut down the JVM before returning; in the original this call sat
    # after the return statement and was never reached.
    shutdownJVM()

    return normalized
Example #13
def run(
    source_word: str,
    target_word: str,
) -> None:
    """
    Stem change example.

    Args:
        source_word (str): Word to get stem from.
        target_word (str): Word to apply stem change.
    """
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    new_stem: DictionaryItem = (
        morphology.getLexicon().getMatchingItems(target_word).get(0))

    results: WordAnalysis = morphology.analyze(JString(source_word))

    for result in results:
        generated: java.util.ArrayList = (
            morphology.getWordGenerator().generate(new_stem,
                                                   result.getMorphemes()))
        for gen_word in generated:
            print(f'\nInput Analysis: {str(result.formatLong())}'
                  f'\nAfter Stem Change, Word: {str(gen_word.surface)}'
                  '\nAfter Stem Change, Analysis: '
                  f'{str(gen_word.analysis.formatLong())}')
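A possible call with illustrative words (both arguments are assumptions, not from the original): the morphemes of source_word are re-generated on the stem of target_word.

run('kitabımda', 'defter')  # 'in my book' re-generated on 'notebook'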
Example #14
def generate_wordnet(sent, trie):
    """
    生成词网
    :param sent: 句子
    :param trie: 词典(unigram)
    :return: 词网
    """
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    while searcher.next():
        wordnet.add(
            searcher.begin + 1,
            Vertex(sent[searcher.begin:searcher.begin + searcher.length],
                   searcher.value, searcher.index))
    # Atomic segmentation to keep the lattice connected
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:  # blank row
            j = i + 1
            for j in range(i + 1, len(vertexes) - 1):  # find the first non-blank row j
                if len(vertexes[j]):
                    break
            wordnet.add(i, Vertex.newPunctuationInstance(
                sent[i - 1:j - 1]))  # fill the blank rows in [i, j)
            i = j
        else:
            i += len(vertexes[i][-1].realWord)

    return wordnet
Example #15
def clean_up_sentence(sentence):
    """Normalizes, tokenizes and lemmatizes a sentence; returns unique lemmas."""
    sentence_words = normalizer.normalize(JString(sentence))
    token_iterator = tokenizer.getTokenIterator(JString(sentence_words))
    words_temp = []
    for token in token_iterator:
        words_temp.append(str(token.content))

    sentence_words = []
    for w in words_temp:
        results: WordAnalysis = morphology.analyze(JString(w))
        for result in results:
            for r in result.getLemmas():
                sentence_words.append(str(r))

    sentence_words = list(set(sentence_words))

    return sentence_words
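clean_up_sentence relies on module-level normalizer, tokenizer and morphology objects that are not part of the snippet. A minimal setup sketch following the pattern of the surrounding examples; the data paths are assumptions.

morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
tokenizer: TurkishTokenizer = TurkishTokenizer.DEFAULT
normalizer: TurkishSentenceNormalizer = TurkishSentenceNormalizer(
    morphology,
    Paths.get(join('data', 'normalization')),  # path is an assumption
    Paths.get(join('data', 'lm', 'lm.2gram.slm')))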
Example #16
def run(sentence: str) -> None:
    """
    News classification example. Trains a new model if no model is
    available.

    Args:
        sentence (str): Sentence to classify.
    """
    label_data_path: Path = DATA_PATH.joinpath('classification',
                                               'news-title-category-set')
    model_path: Path = label_data_path.with_suffix('.model')

    if not model_path.is_file():

        print('Could not find a model; training a new one. FastText will'
              ' print some errors. Do not terminate the process!')

        if not label_data_path.is_file():
            raise FileNotFoundError('Could not train a model!'
                                    ' Please include news-title-category-set!')

        subprocess.run(
            [
                str(JAVA_PATH.absolute()),
                '-jar',
                str(ZEMBEREK_PATH.absolute()),
                'TrainClassifier',
                '-i',
                str(label_data_path.absolute()),
                '-o',
                str(model_path.absolute()),
                '--learningRate',
                '0.1',
                '--epochCount',
                '50',
                '--applyQuantization',
                '--cutOff',
                '15000',
            ],
            check=True,
        )

    classifier: FastTextClassifier = FastTextClassifier.load(model_path)

    processed: str = ' '.join([
        str(token) for token in TurkishTokenizer.DEFAULT.tokenizeToStrings(
            JString(sentence))
    ]).lower()

    results: java.util.ArrayList = classifier.predict(processed, 3)

    print(f'Sentence: {sentence}')

    for i, result in enumerate(results):
        print(
            f'\nItem {i + 1}: {result.item}',
            f'\nScore {i + 1}: {result.score}',
        )
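DATA_PATH, JAVA_PATH and ZEMBEREK_PATH are constants defined elsewhere in the examples package. Plausible definitions; the concrete locations are assumptions.

from pathlib import Path

DATA_PATH: Path = Path('..', 'data')                          # assumption
JAVA_PATH: Path = Path('java')                                # java executable
ZEMBEREK_PATH: Path = Path('..', 'bin', 'zemberek-full.jar')  # assumption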
Example #17
    def testCallOverloads(self):
        # build the harness
        h = JPackage("jpype.objectwrapper").Test1()

        o = java.lang.Integer(1)
        assert h.Method1(JObject(o, java.lang.Number)) == 1
        assert h.Method1(o) == 2
        assert h.Method1(JObject(java.lang.Integer(1), java.lang.Object)) == 3
        assert h.Method1(JString("")) == 4
Example #18
    def sentenceTokenization(self, sentence):
        '''This function tokenizes a simple sentence.'''
        token_iterator = self.TurkishTokenizer.DEFAULT.tokenizeToStrings(
            JString(sentence))

        if self.verbose:
            print('\nToken Iterator Example:\n')
            for i, token in enumerate(token_iterator):
                print(f'Token {i} = {token}')

        return list(map(str, token_iterator))
Example #19
    def normalize(self):
        # Normalize the sentence once and store the result in both fields:
        # one that may be modified later and one kept as the normalized copy.
        deneme = self.normalizer.normalize(
            JString(self.sentenceThatWillBeChanged))
        self.sentenceThatWillBeChanged = str(deneme)
        self.sentenceThatWontBeChangedButNormalized = str(deneme)

        return self.sentenceThatWillBeChanged
Example #20
    def tokenize(self, cumle):
        # Turkish identifiers: 'cumle' = sentence, 'kelimeler' = words.
        kelimeler = []

        inp: str = cumle

        for i, token in enumerate(
                self.tokenizer.tokenizeToStrings(JString(inp))):
            kelimeler.append(str(token))

        return kelimeler
Example #21
    def test(cls, test_path: Path, predictions_path: Path,
             model_path: Path) -> None:
        EvaluateClassifier().execute(
            JString('-i'),
            JString(str(test_path)),
            JString('-m'),
            JString(str(model_path)),
            JString('-o'),
            JString(str(predictions_path)),
            JString('-k'),
            JString('1'),
        )
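EvaluateClassifier here is Zemberek's console application driven through JPype. It would have been looked up with something like the following; the exact package path is an assumption.

EvaluateClassifier: JClass = JClass(
    'zemberek.apps.fasttext.EvaluateClassifier')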
Example #22
    def testCallOverloads(self):
        # build the harness
        h = JPackage("jpype.objectwrapper").Test1()

        o = java.lang.Integer(1)
        self.assertEqual(h.Method1(JObject(o, java.lang.Number)), 1)
        self.assertEqual(h.Method1(o), 2)
        self.assertEqual(
            h.Method1(JObject(java.lang.Integer(1), java.lang.Object)), 3)
        self.assertEqual(h.Method1(JString("")), 4)
Example #23
    def test(inp: str, new_item: DictionaryItem):
        print(f'Parses for {inp} before adding {new_item}')
        before: WordAnalysis = morphology.analyze(JString(inp))
        print_results(before)
        morphology.invalidateCache()
        morphology.getMorphotactics().getStemTransitions().addDictionaryItem(
            new_item)
        after: WordAnalysis = morphology.analyze(JString(inp))
        print(f'Parses for {inp} after adding {new_item}')
        print_results(after)
Example #24
    def analyze(self, word):
        '''This function analyzes the given word.'''
        results = self.DefaultMorphology.analyze(JString(word))
        if self.verbose:
            for result in results:
                print(
                    f'\nLexical and Surface: {str(result.formatLong())}'
                    f'\nOnly Lexical: {str(result.formatLexical())}'
                    '\nOflazer Style: '
                    f'{str(self.AnalysisFormatters.OFLAZER_STYLE.format(result))}'
                )
        return results, str(results)
Example #25
def run() -> None:
    """
    Dictionary item addition tests.
    """

    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    print('\nProper Noun Test - 1:\n')
    _test(
        morphology,
        'Meydan\'a',
        DictionaryItem(
            JString('Meydan'),
            JString('meydan'),
            JString('meydan'),
            PrimaryPos.Noun,
            SecondaryPos.ProperNoun,
        ),
    )

    print('\nProper Noun Test - 2:\n')
    _test(
        morphology,
        'Meeeydan\'a',
        DictionaryItem(
            JString('Meeeydan'),
            JString('meeeydan'),
            JString('meeeydan'),
            PrimaryPos.Noun,
            SecondaryPos.ProperNoun,
        ),
    )

    print('\nVerb Test:\n')
    _test(
        morphology,
        'tweetleyeyazdım',
        DictionaryItem(
            JString('tweetlemek'),
            JString('tweetle'),
            JString('tivitle'),
            PrimaryPos.Verb,
            SecondaryPos.None_,
        ),
    )
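The _test helper used above is not shown. A minimal sketch modeled on Example #23's before/after pattern; the exact body is an assumption.

def _test(morphology: TurkishMorphology, inp: str,
          new_item: DictionaryItem) -> None:
    # Sketch: analyze before and after registering the new dictionary item.
    print(f'Parses for {inp} before adding {new_item}')
    for result in morphology.analyze(JString(inp)):
        print(result.formatLong())
    morphology.invalidateCache()
    morphology.getMorphotactics().getStemTransitions().addDictionaryItem(
        new_item)
    print(f'Parses for {inp} after adding {new_item}')
    for result in morphology.analyze(JString(inp)):
        print(result.formatLong())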
Example #26
    def word_Tokenize(self, inp):
        Token: JClass = JClass('zemberek.tokenization.Token')
        TurkishTokenizer: JClass = JClass(
            'zemberek.tokenization.TurkishTokenizer')

        # Build a tokenizer that skips punctuation and whitespace tokens.
        # (The original also assigned TurkishTokenizer.DEFAULT first, but
        # that value was immediately overwritten and has been removed.)
        tokenizer: TurkishTokenizer = TurkishTokenizer.builder().ignoreTypes(
            Token.Type.Punctuation, Token.Type.NewLine,
            Token.Type.SpaceTab).build()

        print(f'Input = {inp} ')
        for i, token in enumerate(tokenizer.tokenize(JString(inp))):
            print(f' | Token {i} = {token.getText()}')
Example #27
    def __init__(self, lang='zh'):
        if lang == 'zh':
            try:
                self.zh = ZHConvert('http://localhost:9998/pos?wsdl',
                                    'http://localhost:9999/seg?wsdl')
                self.zh.tw_postag(u'今天天氣真好')
            except Exception:
                # Fall back to the default ZHConvert endpoints.
                self.zh = ZHConvert()
        class_path = dirname(__file__)
        startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=" + class_path)
        Parser = JPackage('service').jpype.ParserJPype
        self.parser = Parser()
        self.parser.init(JString(lang))
Example #28
    def normalizeDocument(self,document):
        '''This function normalizes a given document.'''
        Paths: JClass = JClass('java.nio.file.Paths')
        path1 = Paths.get(os.path.join('.', 'req_data'))
        path2 = Paths.get(os.path.join('.', 'req_data', 'lm.2gram.slm'))
        normalizer = self.TurkishSentenceNormalizer(
            self.TurkishMorphology.createWithDefaults(), path1, path2)

        normalizedDoc = normalizer.normalize(JString(document))
        if self.verbose:
            print(f'\nNoisy : {document}')
            print(f'\nNormalized : {normalizedDoc}')

        return str(normalizedDoc)
Example #29
    def normalize_comments(unformatted_file, normalized_file):
        """
        Normalizes the comments from a csv file and writes them to a new file
        under the raw_data directory.

        Parameters
        ----------
        unformatted_file : str
            The name of the input csv file under the raw_data directory
        normalized_file : str
            The name of the output csv file under the raw_data directory
        """
        ZEMBEREK_PATH: str = join('..', 'bin', 'zemberek-full.jar')

        print(ZEMBEREK_PATH)

        startJVM(getDefaultJVMPath(),
                 '-ea',
                 f'-Djava.class.path={ZEMBEREK_PATH}',
                 convertStrings=False)

        TurkishMorphology: JClass = JClass(
            'zemberek.morphology.TurkishMorphology')
        TurkishSentenceNormalizer: JClass = JClass(
            'zemberek.normalization.TurkishSentenceNormalizer')
        Paths: JClass = JClass('java.nio.file.Paths')

        normalizer = TurkishSentenceNormalizer(
            TurkishMorphology.createWithDefaults(),
            Paths.get(join('..', 'data', 'normalization')),
            Paths.get(join('..', 'data', 'lm', 'lm.2gram.slm')))

        comments = AtlasFormatter.get_comments(unformatted_file)
        scores = AtlasFormatter.get_scores(unformatted_file)

        final = []
        for i, comment in enumerate(comments):
            if len(comment) > 1:
                # Keep the normalized text; the original discarded the
                # normalizer's return value.
                normalized = str(normalizer.normalize(JString(comment)))
                final.append(
                    normalized.strip("\n") + "|" + str(format(scores[i], ".2f")))

        shutdownJVM()

        pretrained = []
        pretrained.append("Comment|Score\n")
        for i in range(len(final) - 1):
            if final[i] != final[i + 1]:
                pretrained.append(final[i] + "\n")
        if final:
            # The pairwise comparison above always skips the last comment.
            pretrained.append(final[-1] + "\n")

        AtlasFormatter.list_to_file(pretrained, normalized_file)
Example #30
def replace_stropwords_text(text, replacement, trie):
    """Replaces every stopword found in text (via the trie) with replacement."""
    searcher = trie.getLongestSearcher(JString(text), 0)
    offset = 0
    result = ''
    while searcher.next():
        begin = searcher.begin
        end = begin + searcher.length
        if begin > offset:
            # Copy the text between the previous match and this one.
            result += text[offset:begin]
        result += replacement
        offset = end
    if offset < len(text):
        result += text[offset:]  # keep any trailing text after the last match
    return result