def _print_histogram_agg(histogram_1: Histogram, histogram_2: Histogram) -> None:
    """
    Prints aggregate statistics of two Histograms side by side.

    For each statistic (total count, size, count of 'apple', maximum count
    and minimum count) the value of the first Histogram is printed, then
    the value of the second one.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """
    statistics = (
        ('Total Count:', lambda hist: hist.totalCount()),
        ('Size:', lambda hist: hist.size()),
        ("'apple' Count:", lambda hist: hist.getCount(JString('apple'))),
        ('Max Count:', lambda hist: hist.maxValue()),
        ('Min Count:', lambda hist: hist.minValue()),
    )
    for label, measure in statistics:
        # The leading newline visually separates one statistic from the next.
        print(f'\nHistogram 1, {label}', measure(histogram_1))
        print(f'Histogram 2, {label}', measure(histogram_2))
def replace_words_with_lemma(cls, sentence: str) -> str:
    """
    Replaces every word that follows the leading label with its lemma.

    The first whitespace-separated token is treated as the label and is
    passed through unchanged. The remaining text is analyzed with
    ``cls.morphology``; each word is replaced by the first lemma of its
    best analysis, or kept as typed when the best analysis is unknown.

    Args:
        sentence (str): Label followed by the words to lemmatize.

    Returns:
        str: An empty string when the input is blank or contains only a
            label; otherwise the label and the lemmas joined by single
            spaces (the value produced by ``java.lang.String.join``).
    """
    tokens: List[str] = sentence.split()
    if not tokens:
        # A blank line has no label to peel off; indexing tokens[0] would
        # raise IndexError, so treat it like a label-only line.
        return ''
    label, *words = tokens
    sentence = ' '.join(words)
    if not sentence:
        return sentence
    analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
        JString(sentence))
    res: java.util.ArrayList = java.util.ArrayList()
    res.add(JString(label))
    for word_analysis in analysis:
        best: SingleAnalysis = word_analysis.getBestAnalysis()
        if best.isUnknown():
            # No usable analysis: keep the word exactly as it was typed.
            res.add(word_analysis.getWordAnalysis().getInput())
            continue
        res.add(best.getLemmas()[0])
    return java.lang.String.join(JString(' '), res)
def run(sentence: str) -> None:
    """
    Spell checking example.

    Checks each space-separated word, prints the suggestions for every
    misspelled word and finally prints the sentence with each misspelled
    word replaced by its first suggestion (words without suggestions are
    kept as they are).

    Args:
        sentence (str): Sentence to check for spelling errors.
    """
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    spell_checker: TurkishSpellChecker = TurkishSpellChecker(morphology)
    fixed_words: List[str] = []
    for word in sentence.split(' '):
        if spell_checker.check(JString(word)):
            # Correctly spelled: keep as is.
            fixed_words.append(word)
            continue
        print(f'Spelling error: {word}')
        suggestions: java.util.ArrayList = spell_checker.suggestForWord(
            JString(word))
        if not suggestions:
            print(f'No suggestions found for "{word}".')
            fixed_words.append(word)
            continue
        print(f'\nSuggestions for "{word}":')
        for suggestion in suggestions:
            print(f' | {suggestion}')
        # The first suggestion is used as the correction.
        fixed_words.append(str(suggestions[0]))
    print('\nFixed sentence:', ' '.join(fixed_words))
def split_words(cls, sentence: str) -> str:
    """
    Splits every word that follows the leading label into lemma and suffix.

    The first whitespace-separated token is treated as the label and is
    passed through unchanged. For each remaining word the first lemma of
    its best analysis is emitted; when the lemma is shorter than the word,
    the rest of the word is emitted as an extra token prefixed with ``_``.
    Words whose best analysis is unknown are kept as typed.

    Args:
        sentence (str): Label followed by the words to split.

    Returns:
        str: An empty Java string when the input is blank or contains only
            a label; otherwise the tokens joined by single spaces (the
            value produced by ``java.lang.String.join``).
    """
    tokens: List[str] = sentence.split()
    if not tokens:
        # A blank line has no label; indexing tokens[0] would raise
        # IndexError, so treat it like a label-only line.
        return JString('')
    label: java.lang.String = JString(tokens[0])
    sentence = ' '.join(tokens[1:])
    if not sentence:
        return JString(sentence)
    analysis: SentenceAnalysis = cls.morphology.analyzeAndDisambiguate(
        JString(sentence))
    res: java.util.ArrayList = java.util.ArrayList()
    res.add(label)
    for word_analysis in analysis:
        best: SingleAnalysis = word_analysis.getBestAnalysis()
        inp: java.lang.String = word_analysis.getWordAnalysis().getInput()
        if best.isUnknown():
            res.add(inp)
            continue
        lemma = best.getLemmas()[0]
        res.add(lemma)
        if len(lemma) < len(inp):
            # Emit the whole remainder after the lemma. Indexing with a
            # single position would yield only its first character.
            suffix = str(inp)[len(str(lemma)):]
            res.add(JString('_' + suffix))
    return java.lang.String.join(JString(' '), res)
def _print_histogram_item_comp(histogram_1: Histogram,
                               histogram_2: Histogram) -> None:
    """
    Prints item-level comparisons of two Histograms.

    Shows the key intersection of both Histograms, whether each one
    contains 'grape' and 'apple', and the result of ``getTop(3)`` for each.

    Args:
        histogram_1 (Histogram): First example Histogram.
        histogram_2 (Histogram): Second example Histogram.
    """

    def show_both(label, query) -> None:
        # Print the same query for Histogram 1 and then for Histogram 2.
        print(f'\nHistogram 1, {label}', query(histogram_1))
        print(f'Histogram 2, {label}', query(histogram_2))

    print(
        '\nIntersection of Histogram 1 and 2:',
        histogram_1.getIntersectionOfKeys(histogram_2),
    )
    show_both("Contains 'grape':",
              lambda hist: hist.contains(JString('grape')))
    show_both('Contains Apple:',
              lambda hist: hist.contains(JString('apple')))
    show_both('Top 3:', lambda hist: hist.getTop(JInt(3)))
def generate_set_tokenized(cls, lines: List[str], tokenized_path: str):
    """
    Writes a tokenized, cleaned and lower-cased copy of the given lines.

    Each line is tokenized with ``TurkishTokenizer.DEFAULT``, its tokens
    are joined by single spaces, passed through ``cls.remove_non_words``
    and lower-cased. All lines are processed before anything is written.

    Args:
        lines (List[str]): Lines to process.
        tokenized_path (str): Path of the UTF-8 output file; overwritten.
    """
    with open(tokenized_path, 'w', encoding='utf-8') as f:
        processed = []
        for raw_line in lines:
            tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(
                JString(raw_line))
            joined = java.lang.String.join(JString(' '), tokens)
            processed.append(cls.remove_non_words(joined).lower())
        f.writelines(f'{entry}\n' for entry in processed)
def update_input_file(self, positions=None, qm_core_fixed=None):
    """
    Update atom coordinates and/or molecule ids, then rewrite the input file.

    Args:
        positions: Optional sequence of (x, y, z) triples; entry ``i`` is
            written to ``self.atoms[i]``. Skipped when ``None``.
        qm_core_fixed: Optional iterable of 1-based molecule numbers. When
            given, every molecule id is first reset to ``"MM"`` and the
            listed molecules are then set to ``"QM_CORE_FIXED"``. Skipped
            when ``None``.
    """
    Double = JClass("java.lang.Double")
    if positions is not None:
        for atom_index, position in enumerate(positions):
            atom = self.atoms[atom_index]
            atom.setX3(Double(float(position[0])))
            atom.setY3(Double(float(position[1])))
            atom.setZ3(Double(float(position[2])))
    # Identity test: ``!= None`` calls the argument's own comparison, which
    # for array-like inputs does not yield a plain bool.
    if qm_core_fixed is not None:
        for molecule in self.molecules:
            molecule.setId(JString("MM"))
        for molecule_number in qm_core_fixed:
            # Molecule numbers are 1-based, self.molecules is 0-based.
            self.molecules[molecule_number - 1].setId(
                JString("QM_CORE_FIXED"))
    self.jaxbFileWriter.write(JString(self.input_file),
                              self.jaxb_cml.getValue())
def generateNoun(self, noun):
    """
    Generate inflections of ``noun`` and pass each one to the controller.

    Every combination of number (A3sg, A3pl), possessive (P1sg, P2sg,
    P3sg) and case (Dat, Loc, Abl) is generated, and the surface form of
    every result is handed to ``self.controller``.
    """
    # The generation recipe below follows the Zemberek noun example.
    number_tags = [JString(tag) for tag in ('A3sg', 'A3pl')]
    possessive_tags = [JString(tag) for tag in ('P1sg', 'P2sg', 'P3sg')]
    case_tags = [JString(tag) for tag in ('Dat', 'Loc', 'Abl')]
    morphology_class = JClass('zemberek.morphology.TurkishMorphology')
    builder = morphology_class.builder().setLexicon(noun).disableCache()
    morphology = builder.build()
    item = morphology.getLexicon().getMatchingItems(noun).get(0)

    combinations = [(number_tag, possessive_tag, case_tag)
                    for number_tag in number_tags
                    for possessive_tag in possessive_tags
                    for case_tag in case_tags]
    for number_tag, possessive_tag, case_tag in combinations:
        generated = morphology.getWordGenerator().generate(
            item, number_tag, possessive_tag, case_tag)
        for result in generated:
            # Check whether the generated noun obeys the rule.
            # recurrence=False is required: without it the controller
            # would enter a recurrence that never ends.
            self.controller(str(result.surface), recurrence=False)
    return
def _generate_nouns(root_word: str) -> None:
    """
    Prints inflections of the given root word.

    Every combination of number (A3sg, A3pl), possessive (P1sg, P2sg,
    P3sg) and case (Dat, Loc, Abl) suffixes is generated and the surface
    form of each result is printed.

    Args:
        root_word (str): Root word to generate inflections from.
    """
    print('\nGenerating nouns.\n')
    number_tags = [JString(tag) for tag in ('A3sg', 'A3pl')]
    possessive_tags = [JString(tag) for tag in ('P1sg', 'P2sg', 'P3sg')]
    case_tags = [JString(tag) for tag in ('Dat', 'Loc', 'Abl')]

    builder = TurkishMorphology.builder().setLexicon(root_word)
    morphology: TurkishMorphology = builder.disableCache().build()
    lexicon = morphology.getLexicon()
    item: DictionaryItem = lexicon.getMatchingItems(root_word).get(0)

    combinations = [(number_tag, possessive_tag, case_tag)
                    for number_tag in number_tags
                    for possessive_tag in possessive_tags
                    for case_tag in case_tags]
    for number_tag, possessive_tag, case_tag in combinations:
        generated = morphology.getWordGenerator().generate(
            item, number_tag, possessive_tag, case_tag)
        for result in generated:
            print(str(result.surface))
def rule_based_corrector(
        text,
        normalization_dir='/home/busra/PycharmProjects/Piton/myenv/Zemberek-Python-Examples-master/examples/normalization',
        lm_path='/home/busra/PycharmProjects/Piton/myenv/lm.2gram.slm'):
    """
    Normalize ``text`` with Zemberek's sentence normalizer.

    Args:
        text: Text to normalize.
        normalization_dir: Directory with the normalization data. Defaults
            to the location the function has always used. The previous
            ``join('..', '..', 'data', <absolute path>)`` call resolved to
            exactly this absolute path, because ``join`` discards every
            component that precedes an absolute one.
        lm_path: Path of the 2-gram language model file. Defaults to the
            location the function has always used.

    Returns:
        str: The normalized text with its first character upper-cased and
            the rest lower-cased (``str.capitalize``), wrapped in one
            leading and one trailing space.
    """
    TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
    TurkishSentenceNormalizer: JClass = JClass(
        'zemberek.normalization.TurkishSentenceNormalizer')
    Paths: JClass = JClass('java.nio.file.Paths')
    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        Paths.get(normalization_dir),
        Paths.get(lm_path))
    normalized = str(normalizer.normalize(JString(text))).capitalize()
    # Callers receive the result padded with a space on both sides.
    return " " + normalized + " "
def correctDocument(self,document):
    '''Return the document with misspelled words replaced.

    The document is tokenized with the ALL tokenizer. Tokens of type
    NewLine, SpaceTab, Punctuation, RomanNumeral, UnknownWord and Unknown
    are never spell-checked. Any other token that fails the spell check
    is replaced by its first suggestion when one exists. All token texts
    are finally joined with single spaces.
    '''
    spell_checker = self.TurkishSpellChecker(self.DefaultMorphology)
    token_kinds = self.Token.Type
    unchecked_kinds = (
        token_kinds.NewLine,
        token_kinds.SpaceTab,
        token_kinds.Punctuation,
        token_kinds.RomanNumeral,
        token_kinds.UnknownWord,
        token_kinds.Unknown,
    )
    corrected_tokens = []
    for token in self.TurkishTokenizer.ALL.tokenize(JString(document)):
        replacement = None
        checkable = all(token.type != kind for kind in unchecked_kinds)
        if checkable and not spell_checker.check(token.content):
            suggestions = list(spell_checker.suggestForWord(token.content))
            if suggestions:
                replacement = str(suggestions[0])
                print(f'Correction: {token.content} -> {replacement}.')
        if replacement is None:
            # Not checked, spelled correctly, or no suggestion available.
            replacement = str(token.content)
        corrected_tokens.append(replacement)
    correctedDoc = ' '.join(corrected_tokens)
    if self.verbose:
        print('\nCorrected Document:\n', correctedDoc)
    return correctedDoc
def Normalize(query):
    """
    Normalize a noisy query with Zemberek and print the before/after pair.

    The JVM is started on first use and deliberately left running: JPype
    cannot start a JVM again after it has been shut down, so shutting it
    down here would break every later call.

    Args:
        query: Text to normalize.

    Returns:
        The normalized text as returned by
        ``TurkishSentenceNormalizer.normalize``.
    """
    from jpype import isJVMStarted

    ZEMBEREK_PATH: str = join('Zemberek', 'bin', 'zemberek-full.jar')
    if not isJVMStarted():
        # Starting an already running JVM raises, so only start it once.
        startJVM(getDefaultJVMPath(),
                 '-ea',
                 f'-Djava.class.path={ZEMBEREK_PATH}',
                 convertStrings=False)
    TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
    TurkishSentenceNormalizer: JClass = JClass(
        'zemberek.normalization.TurkishSentenceNormalizer')
    Paths: JClass = JClass('java.nio.file.Paths')
    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        Paths.get(join('Zemberek', 'ZemberekData', 'normalization')),
        Paths.get(join('Zemberek', 'ZemberekData', 'lm', 'lm.2gram.slm')))
    # Normalize once and reuse the result for both the report and the
    # return value.
    norm = normalizer.normalize(JString(query))
    print((f'\nNoisy : {query}'
           f'\nNormalized : {norm}\n'))
    return norm
def run(
    source_word: str,
    target_word: str,
) -> None:
    """
    Stem change example.

    Analyzes ``source_word`` and, for every analysis, regenerates the word
    with the morphemes of that analysis on top of a new stem. The new stem
    is the first dictionary item matching ``target_word``.

    Args:
        source_word (str): Word that is analyzed; the morphemes of its
            analyses are reused.
        target_word (str): Word looked up in the lexicon; its first
            matching dictionary item becomes the new stem.
    """
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()
    lexicon = morphology.getLexicon()
    new_stem: DictionaryItem = lexicon.getMatchingItems(target_word).get(0)

    for result in morphology.analyze(JString(source_word)):
        generator = morphology.getWordGenerator()
        for gen_word in generator.generate(new_stem, result.getMorphemes()):
            report = (f'\nInput Analysis: {str(result.formatLong())}'
                      f'\nAfter Stem Change, Word: {str(gen_word.surface)}'
                      '\nAfter Stem Change, Analysis:'
                      f'{str(gen_word.analysis.formatLong())}')
            print(report)
def generate_wordnet(sent, trie):
    """
    Generate the word net (word lattice) of a sentence.

    :param sent: the sentence
    :param trie: the dictionary (unigram)
    :return: the word net
    """
    # Add every match reported by the dictionary searcher as a vertex. The
    # row used is the match's begin offset shifted by one.
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    while searcher.next():
        wordnet.add(
            searcher.begin + 1,
            Vertex(sent[searcher.begin:searcher.begin + searcher.length],
                   searcher.value, searcher.index))
    # Atomic segmentation, to keep the graph connected
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:  # empty row
            # j keeps the value i + 1 when the range below is empty, and
            # the last value tried when no non-empty row is found.
            j = i + 1
            for j in range(i + 1,
                           len(vertexes) - 1):  # find the first non-empty row j
                if len(vertexes[j]):
                    break
            wordnet.add(i, Vertex.newPunctuationInstance(
                sent[i - 1:j - 1]))  # fill the empty rows in [i, j)
            i = j
        else:
            # Advance by the length of realWord of the last vertex stored
            # in this row.
            i += len(vertexes[i][-1].realWord)
    return wordnet
def clean_up_sentence(sentence):
    """
    Return the distinct lemmas found in a sentence.

    The sentence is normalized, tokenized, and every token is analyzed;
    the lemmas of all analyses are collected and de-duplicated. The order
    of the returned list is whatever ``set`` iteration yields.
    """
    normalized = normalizer.normalize(JString(sentence))
    # Collect all token texts before starting the analysis.
    surface_forms = [
        str(token.content)
        for token in tokenizer.getTokenIterator(JString(normalized))
    ]
    lemmas = []
    for surface_form in surface_forms:
        for analysis in morphology.analyze(JString(surface_form)):
            lemmas.extend(str(lemma) for lemma in analysis.getLemmas())
    return list(set(lemmas))
def run(sentence: str) -> None:
    """
    News classification example.

    Loads the FastText model stored next to the labelled data set and
    prints the top three predictions for the sentence. When the model
    file does not exist yet, it is first trained by running Zemberek's
    ``TrainClassifier`` in a subprocess.

    Args:
        sentence (str): Sentence to classify.

    Raises:
        FileNotFoundError: If neither the model nor the labelled data set
            exists.
        subprocess.CalledProcessError: If the training process fails.
    """
    label_data_path: Path = DATA_PATH.joinpath('classification',
                                               'news-title-category-set')
    model_path: Path = label_data_path.with_suffix('.model')

    if not model_path.is_file():
        print('Could not find a model, training a new one. FastText will print'
              ' some errors, do not terminate the process!')
        if not label_data_path.is_file():
            raise FileNotFoundError('Could not train a model!'
                                    ' Please include news-title-category-set!')
        training_command = [
            str(JAVA_PATH.absolute()),
            '-jar',
            str(ZEMBEREK_PATH.absolute()),
            'TrainClassifier',
            '-i',
            str(label_data_path.absolute()),
            '-o',
            str(model_path.absolute()),
            '--learningRate',
            '0.1',
            '--epochCount',
            '50',
            '--applyQuantization',
            '--cutOff',
            '15000',
        ]
        subprocess.run(training_command, check=True)

    classifier: FastTextClassifier = FastTextClassifier.load(model_path)
    # The sentence is tokenized and lower-cased before prediction.
    tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(JString(sentence))
    processed: str = ' '.join(map(str, tokens)).lower()
    results: java.util.ArrayList = classifier.predict(processed, 3)

    print(f'Sentence: {sentence}')
    for rank, result in enumerate(results, start=1):
        print(
            f'\nItem {rank}: {result.item}',
            f'\nScore {rank}: {result.score}',
        )
def testCallOverloads(self):
    """
    Check which ``Method1`` result is produced for differently typed arguments.

    Expected results: 1 for an Integer wrapped as Number, 2 for a plain
    Integer, 3 for an Integer wrapped as Object and 4 for an empty JString.
    """
    # build the harness
    harness = JPackage("jpype.objectwrapper").Test1()
    boxed_one = java.lang.Integer(1)

    as_number = JObject(boxed_one, java.lang.Number)
    assert harness.Method1(as_number) == 1

    assert harness.Method1(boxed_one) == 2

    as_object = JObject(java.lang.Integer(1), java.lang.Object)
    assert harness.Method1(as_object) == 3

    empty_string = JString("")
    assert harness.Method1(empty_string) == 4
def sentenceTokenization(self,sentence):
    '''Tokenize a sentence with the DEFAULT tokenizer.

    When ``self.verbose`` is set, every token is also printed with its
    position. Returns the tokens as a list of Python strings.
    '''
    tokens = self.TurkishTokenizer.DEFAULT.tokenizeToStrings(JString(sentence))
    if self.verbose:
        print('\nToken Iterator Example:\n')
        for position, token in enumerate(tokens):
            print(f'Token {position} = {token}')
    return [str(token) for token in tokens]
def normalize(self):
    """
    Normalize ``self.sentenceThatWillBeChanged`` in place.

    The normalized text is stored both in ``sentenceThatWillBeChanged``
    and in ``sentenceThatWontBeChangedButNormalized``, and is returned.
    """
    normalized = str(
        self.normalizer.normalize(JString(self.sentenceThatWillBeChanged)))
    self.sentenceThatWillBeChanged = normalized
    self.sentenceThatWontBeChangedButNormalized = normalized
    return self.sentenceThatWillBeChanged
def tokenize(self, cumle):
    """Return the tokens of the sentence ``cumle`` as a list of Python strings."""
    java_tokens = self.tokenizer.tokenizeToStrings(JString(cumle))
    return [str(token) for token in java_tokens]
def test(cls, test_path: Path, predictions_path: Path,
         model_path: Path) -> None:
    """
    Run Zemberek's ``EvaluateClassifier`` on a test set.

    Args:
        test_path (Path): Passed as the value of ``-i``.
        predictions_path (Path): Passed as the value of ``-o``.
        model_path (Path): Passed as the value of ``-m``.
    """
    # '-k' is always passed as '1' (presumably the number of predictions
    # per item; confirm against the EvaluateClassifier documentation).
    options = (
        '-i', str(test_path),
        '-m', str(model_path),
        '-o', str(predictions_path),
        '-k', '1',
    )
    EvaluateClassifier().execute(*[JString(option) for option in options])
def testCallOverloads(self):
    """
    Check which ``Method1`` result is produced for differently typed arguments.

    Expected results: 1 for an Integer wrapped as Number, 2 for a plain
    Integer, 3 for an Integer wrapped as Object and 4 for an empty JString.
    """
    # build the harness
    harness = JPackage("jpype.objectwrapper").Test1()
    boxed_one = java.lang.Integer(1)
    cases = (
        (JObject(boxed_one, java.lang.Number), 1),
        (boxed_one, 2),
        (JObject(java.lang.Integer(1), java.lang.Object), 3),
        (JString(""), 4),
    )
    for argument, expected in cases:
        self.assertEqual(harness.Method1(argument), expected)
def test(inp: str, new_item: DictionaryItem):
    """
    Show the parses of a word before and after adding a dictionary item.

    Args:
        inp (str): Word to analyze.
        new_item (DictionaryItem): Item added to the stem transitions of
            the module-level ``morphology`` between the two analyses.
    """
    # Both analyses receive the same explicit Java string, so the
    # before/after comparison goes through an identical call.
    word = JString(inp)
    print(f'Parses for {inp} before adding {new_item}')
    before: WordAnalysis = morphology.analyze(word)
    print_results(before)
    # Drop cached analyses so the second run can see the new item.
    morphology.invalidateCache()
    morphology.getMorphotactics().getStemTransitions().addDictionaryItem(
        new_item
    )
    after: WordAnalysis = morphology.analyze(word)
    print(f'Parses for {inp} after adding {new_item}')
    print_results(after)
def analyze(self,word):
    '''Analyze a word with the default morphology.

    When ``self.verbose`` is set, each analysis is printed in long,
    lexical and Oflazer-style format. Returns a tuple of the analysis
    results and their string representation.
    '''
    results = self.DefaultMorphology.analyze(JString(word))
    if self.verbose:
        for result in results:
            long_form = str(result.formatLong())
            lexical_form = str(result.formatLexical())
            oflazer_form = str(
                self.AnalysisFormatters.OFLAZER_STYLE.format(result))
            print(f'\nLexical and Surface: {long_form}'
                  f'\nOnly Lexical: {lexical_form}'
                  '\nOflazer Style:'
                  f'{oflazer_form}')
    return results, str(results)
def run() -> None:
    """
    Dictionary item addition tests.

    Runs ``_test`` for two proper nouns and one verb. Each case prints a
    title, builds the dictionary item and passes it with the word to
    analyze.
    """
    morphology: TurkishMorphology = TurkishMorphology.createWithDefaults()

    # (title, word to analyze, three string arguments of DictionaryItem,
    #  primary POS, secondary POS)
    cases = (
        ('\nProper Noun Test - 1:\n', "Meydan'a",
         ('Meydan', 'meydan', 'meydan'),
         PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ('\nProper Noun Test - 2:\n', "Meeeydan'a",
         ('Meeeydan', 'meeeydan', 'meeeydan'),
         PrimaryPos.Noun, SecondaryPos.ProperNoun),
        ('\nVerb Test:\n', 'tweetleyeyazdım',
         ('tweetlemek', 'tweetle', 'tivitle'),
         PrimaryPos.Verb, SecondaryPos.None_),
    )
    for title, word, item_strings, primary_pos, secondary_pos in cases:
        print(title)
        first, second, third = (JString(text) for text in item_strings)
        new_item = DictionaryItem(
            first,
            second,
            third,
            primary_pos,
            secondary_pos,
        )
        _test(morphology, word, new_item)
def word_Tokenize(self, inp):
    """
    Print the tokens of ``inp``, skipping punctuation and whitespace.

    The tokenizer is built to ignore Punctuation, NewLine and SpaceTab
    tokens; every remaining token is printed with its position.

    Args:
        inp: Text to tokenize.
    """
    Token: JClass = JClass('zemberek.tokenization.Token')
    TurkishTokenizer: JClass = JClass(
        'zemberek.tokenization.TurkishTokenizer')
    # The earlier assignment of TurkishTokenizer.DEFAULT was overwritten
    # before use, so only the customized tokenizer is built.
    tokenizer: TurkishTokenizer = TurkishTokenizer.builder().ignoreTypes(
        Token.Type.Punctuation, Token.Type.NewLine,
        Token.Type.SpaceTab).build()
    print(f'Input = {inp} ')
    for i, token in enumerate(tokenizer.tokenize(JString(inp))):
        print(f' | Token {i} = {token.getText()}')
def __init__(self, lang='zh'):
    """
    Set up the converter (Chinese only) and the Java parser.

    For ``lang == 'zh'`` a ``ZHConvert`` bound to the local POS and
    segmentation services is tried first and probed with a sample
    sentence; if that fails, a default ``ZHConvert()`` is used instead.
    The JVM is then started with this module's directory as class path
    and the parser is initialised for ``lang``.

    Args:
        lang: Language code handed to the parser. Defaults to 'zh'.
    """
    if lang == 'zh':
        try:
            self.zh = ZHConvert('http://localhost:9998/pos?wsdl',
                                'http://localhost:9999/seg?wsdl')
            # Probe call: fails when the local services are unreachable.
            self.zh.tw_postag(u'今天天氣真好')
        except Exception:
            # Best-effort fallback. ``Exception`` instead of a bare except
            # lets KeyboardInterrupt and SystemExit propagate.
            self.zh = ZHConvert()
    class_path = dirname(__file__)
    startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=" + class_path)
    Parser = JPackage('service').jpype.ParserJPype
    self.parser = Parser()
    self.parser.init(JString(lang))
def normalizeDocument(self,document):
    '''Normalize a document with Zemberek's sentence normalizer.

    The normalizer reads its data from ``./req_data`` and its language
    model from ``./req_data/lm.2gram.slm``. When ``self.verbose`` is set,
    the noisy and the normalized text are printed. Returns the normalized
    text as a Python string.
    '''
    Paths: JClass = JClass('java.nio.file.Paths')
    data_dir = Paths.get(os.path.join('.', 'req_data'))
    language_model = Paths.get(os.path.join('.', 'req_data', 'lm.2gram.slm'))
    morphology = self.TurkishMorphology.createWithDefaults()
    normalizer = self.TurkishSentenceNormalizer(
        morphology, data_dir, language_model)
    normalizedDoc = normalizer.normalize(JString(document))
    if self.verbose:
        print(f'\nNoisy : {document}')
        print(f'\nNormalized : {normalizedDoc}')
    return str(normalizedDoc)
def normalize_comments(unformatted_file, normalized_file):
    """
    Normalize the comments from a csv file and write them to a new file
    under the raw_data directory.

    Each comment longer than one character is normalized with Zemberek and
    written as ``<normalized comment>|<score with two decimals>``.
    Consecutive identical rows are collapsed into one. The output starts
    with the header row ``Comment|Score``.

    Parameters
    ----------
    unformatted_file : str
        The name of the input csv file under the raw_data directory
    normalized_file : str
        The name of the output csv file under the raw_data directory
    """
    ZEMBEREK_PATH: str = join('..', 'bin', 'zemberek-full.jar')
    print(ZEMBEREK_PATH)
    startJVM(getDefaultJVMPath(),
             '-ea',
             f'-Djava.class.path={ZEMBEREK_PATH}',
             convertStrings=False)
    try:
        TurkishMorphology: JClass = JClass(
            'zemberek.morphology.TurkishMorphology')
        TurkishSentenceNormalizer: JClass = JClass(
            'zemberek.normalization.TurkishSentenceNormalizer')
        Paths: JClass = JClass('java.nio.file.Paths')
        normalizer = TurkishSentenceNormalizer(
            TurkishMorphology.createWithDefaults(),
            Paths.get(join('..', 'data', 'normalization')),
            Paths.get(join('..', 'data', 'lm', 'lm.2gram.slm')))
        comments = AtlasFormatter.get_comments(unformatted_file)
        scores = AtlasFormatter.get_scores(unformatted_file)
        final = []
        for i, comment in enumerate(comments):
            if len(comment) > 1:
                # Keep the normalizer's output; previously it was thrown
                # away and the raw comment was written instead.
                normalized = str(normalizer.normalize(JString(comment)))
                final.append(
                    normalized.strip("\n") + "|" + format(scores[i], ".2f"))
    finally:
        # Release the JVM even when reading or normalizing fails.
        shutdownJVM()

    pretrained = ["Comment|Score\n"]
    previous = None
    for row in final:
        # Collapse runs of identical rows. Comparing with the previous
        # row (not the next one) keeps the final row in the output.
        if row != previous:
            pretrained.append(row + "\n")
        previous = row
    AtlasFormatter.list_to_file(pretrained, normalized_file)
def replace_stropwords_text(text, replacement, trie):
    """
    Replace every dictionary match in ``text`` with ``replacement``.

    Matches come from ``trie.getLongestSearcher``. Text between matches
    and after the last match is copied through unchanged.

    :param text: the text to process
    :param replacement: string inserted in place of each match
    :param trie: dictionary providing ``getLongestSearcher``
    :return: the text with all matches replaced
    """
    searcher = trie.getLongestSearcher(JString(text), 0)
    pieces = []
    cursor = 0  # index in text up to which output has been produced
    while searcher.next():
        match_start = searcher.begin
        if match_start > cursor:
            # Copy the unmatched gap before this match.
            pieces.append(text[cursor:match_start])
        pieces.append(replacement)
        cursor = match_start + searcher.length
    if cursor < len(text):
        pieces.append(text[cursor:])
    return ''.join(pieces)