def __init__(self, lang_code: str, ids: Set[str]):
    """Connect to a local LanguageTool server for *lang_code*.

    Args:
        lang_code: Short language code — one of ``"en"``, ``"de"``, ``"es"``.
            Any other value raises ``KeyError``, as before.
        ids: Identifier set carried by the caller.
            NOTE(review): not used in this constructor as far as this chunk
            shows — confirm whether it is consumed elsewhere in the class.
    """
    # Translate the short code into the locale name LanguageTool expects.
    locales = {"en": "en_US", "de": "de_DE", "es": "es_ES"}
    self.tool = language_tool_python.LanguageTool(
        locales[lang_code], remote_server="http://localhost:8081/")

    # Rules we never want, grouped for readability; the union below is the
    # exact same set the previous flat literal produced.
    generic_rules = {
        "COMMA_PARENTHESIS_WHITESPACE",
        "DOUBLE_PUNCTUATION",
        "UPPERCASE_SENTENCE_START",
        "WHITESPACE_RULE",
        "SENTENCE_WHITESPACE",
        "WHITESPACE_PARAGRAPH",
        "WHITESPACE_PARAGRAPH_BEGIN",
        "EMPTY_LINE",
        "TOO_LONG_SENTENCE",
        "TOO_LONG_PARAGRAPH",
        "PARAGRAPH_REPEAT_BEGINNING_RULE",
        "PUNCTUATION_PARAGRAPH_END",
        "PUNCTUATION_PARAGRAPH_END2",
        "STYLE_REPEATED_SHORT_SENTENCES",
        "STYLE_REPEATED_SENTENCE_BEGINNING",
    }
    english_rules = {
        "MORFOLOGIK_RULE_EN_US",
        "EN_SPECIFIC_CASE",
        "EN_UNPAIRED_BRACKETS",
        "ENGLISH_WORD_REPEAT_RULE",
        "EN_A_VS_AN",
        "ENGLISH_WORD_REPEAT_BEGINNING_RULE",
        "EN_COMPOUNDS",
        "EN_CONTRACTION_SPELLING",
        "ENGLISH_WRONG_WORD_IN_CONTEXT",
        "EN_DASH_RULE",
        "EN_WORD_COHERENCY",
        "EN_DIACRITICS_REPLACE",
        "EN_PLAIN_ENGLISH_REPLACE",
        "EN_REDUNDANCY_REPLACE",
        "EN_SIMPLE_REPLACE",
        "READABILITY_RULE_SIMPLE",
        "READABILITY_RULE_DIFFICULT",
    }
    german_rules = {
        "GERMAN_SPELLER_RULE",
        "DE_SIMPLE_REPLACE",
        "OLD_SPELLING",
        "DE_SENTENCE_WHITESPACE",
        "DE_DOUBLE_PUNCTUATION",
        "MISSING_VERB",
        "GERMAN_WORD_REPEAT_RULE",
        "GERMAN_WORD_REPEAT_BEGINNING_RULE",
        "GERMAN_WRONG_WORD_IN_CONTEXT",
        "DE_AGREEMENT",
        "DE_AGREEMENT2",
        "DE_CASE",
        "DE_DASH",
        "DE_VERBAGREEMENT",
        "DE_SUBJECT_VERB_AGREEMENT",
        "DE_WORD_COHERENCY",
        "DE_SIMILAR_NAMES",
        "DE_WIEDER_VS_WIDER",
        "STYLE_REPEATED_WORD_RULE_DE",
        "DE_COMPOUND_COHERENCY",
        "TOO_LONG_SENTENCE_DE",
        "FILLER_WORDS_DE",
        "GERMAN_PARAGRAPH_REPEAT_BEGINNING_RULE",
        "DE_DU_UPPER_LOWER",
        "EINHEITEN_METRISCH",
        "COMMA_BEHIND_RELATIVE_CLAUSE",
        "COMMA_IN_FRONT_RELATIVE_CLAUSE",
        "READABILITY_RULE_SIMPLE_DE",
        "READABILITY_RULE_DIFFICULT_DE",
        "COMPOUND_INFINITIV_RULE",
    }
    self.tool.disabled_rules = generic_rules | english_rules | german_rules
def __init__(self):
    """Initialise the wrapper with an American-English LanguageTool backend."""
    # Imported lazily so the heavy dependency loads only when an instance
    # is actually constructed (matches the original behaviour).
    import language_tool_python

    self.language_tool = language_tool_python.LanguageTool('en-US')
def grammar_tool(text, error_count):
    """Auto-correct *text* with LanguageTool and advance the running counter.

    Returns:
        Tuple of ``(corrected_text, error_count + 1)``.  Note the counter is
        incremented once per call, not once per detected error.
    """
    checker = language_tool_python.LanguageTool('en-US')
    return checker.correct(text), error_count + 1
def grammar_check_score(text: str) -> int:
    """Return how many British-English LanguageTool issues *text* contains."""
    checker = language_tool_python.LanguageTool('en-GB')
    return len(checker.check(text))
# Count LanguageTool error categories over the 'text' column of a CSV of
# Russian comments.
# NOTE(review): `filename` is defined elsewhere in the file, and this chunk
# appears truncated — `whitespace`, `capital_names`, `talk`, `other` and
# `csv_list` are initialised but the code that fills/writes them is not
# visible here.
import language_tool_python
import csv

tool = language_tool_python.LanguageTool('ru')
# collect the comments into a list
comments = []
with open(filename, encoding='utf-8') as csvfile:
    text = csv.DictReader(csvfile)
    for line in text:
        comments.append(line['text'])
ortho_counter = 0   # spelling errors
sentence_start = 0  # sentence starting with a lowercase letter
whitespace = 0      # space before a comma, or before/after a bracket
capital_names = 0   # proper name written with a lowercase letter
talk = 0            # colloquialisms
other = 0           # other rules (880 rules in total; 1. everything besides the
# above occurs very rarely in internet speech, and 2. their names contain
# nothing that would help to group them)
csv_list = [[
    'Орфографические ошибки', 'Предложение с маленькой буквы',
    'Пробел перед запятой или скобкой', 'Имя с маленькой буквы',
    'Просторечные слова', 'Другие ошибки'
]]
for line in comments:
    matches = tool.check(line)
    for match in matches:
        if match.ruleId == 'MORFOLOGIK_RULE_RU_RU':
            ortho_counter += 1
        elif match.ruleId == 'UPPERCASE_SENTENCE_START':
            sentence_start += 1
# Imports and shared state for a German comment-analysis script.
# NOTE(review): `nltk` and `pd` (pandas) are used below but not imported in
# this chunk — confirm they are imported elsewhere in the file.
from collections import Counter
import textstat
import numpy as np
import scipy.stats as stats
import language_tool_python
import string
from germansentiment import sentimentmodel
from textblob_de import TextBlobDE

# sys.stdout = open('result.txt', 'w', encoding='utf-8')
textstat.set_lang("de")  # readability metrics are computed for German text
nltk.download('averaged_perceptron_tagger')  # POS-tagger model for nltk
tool = language_tool_python.LanguageTool('de')  # German grammar checker
data = pd.read_csv("Comments.csv")
contentList = data["content"].values.tolist()
# Per-comment statistics, filled by the analysis loop (not visible here).
numberOfWordsPerComm = []
tokenList = []
charList = []
digitList = []
upperList = []
lowerList = []
syllablesTuple = ()
def test_langtool_languages():
    """LanguageTool must advertise exactly the expected set of locale codes."""
    import language_tool_python

    lang_tool = language_tool_python.LanguageTool("en-US")
    expected = {
        'ta-IN', 'en-CA', 'da', 'eo', 'pt-AO', 'de', 'gl', 'ru-RU', 'de-DE',
        'en', 'br', 'en-ZA', 'pt-MZ', 'ast-ES', 'sk-SK', 'en-AU', 'ta', 'ga',
        'be', 'pl', 'tl-PH', 'sl', 'ar', 'es', 'sl-SI', 'en-NZ', 'el',
        'el-GR', 'ru', 'zh-CN', 'en-GB', 'be-BY', 'pl-PL', 'km-KH', 'pt',
        'uk-UA', 'ca', 'de-DE-x-simple-language', 'ro', 'ca-ES', 'de-CH',
        'ja-JP', 'tl', 'pt-PT', 'gl-ES', 'pt-BR', 'km', 'ga-IE', 'ja', 'sv',
        'sk', 'en-US', 'de-AT', 'ca-ES-valencia', 'uk', 'it', 'zh', 'br-FR',
        'da-DK', 'ast', 'fr', 'fa', 'nl', 'ro-RO',
    }
    # _get_languages() is a private helper of the library; this test pins
    # its full output as a set (order-insensitive).
    assert lang_tool._get_languages() == expected
import pycorrector
import language_tool_python
from utils import contain_english

print("[Corrector]")
# Character-level error detection is switched off for pycorrector.
pycorrector.enable_char_error(enable=False)
english_corrector = language_tool_python.LanguageTool('en-US')
print("[Corrector] Init end")


def correct(st):
    """Correct *st*, routing to the English or Chinese corrector."""
    if contain_english(st):
        return correct_english(st)
    return correct_chinese(st)


def correct_chinese(st):
    """Return corrected Chinese text, or None when nothing was changed."""
    corrected, detail = pycorrector.correct(st)
    return corrected if detail else None


def correct_english(st):
    """Return corrected English text, or None when only letter case differs."""
    corrected = english_corrector.correct(st)
    return None if st.lower() == corrected.lower() else corrected
def findError(author, msgList, questions):
    """Grade one student's submissions with LanguageTool and write a report.

    Writes ``<author>.txt``, accumulates the same report text in ``str1``,
    and finally hands everything to ``Individual`` (defined elsewhere in
    this file).

    NOTE(review): indentation reconstructed from a collapsed source — in
    particular, whether the ``errorCategory`` tally sits inside or outside
    the GRAMMAR/TYPOS/CASING filter should be confirmed against the
    original file.
    """
    str1 = ""
    # NOTE(review): file is closed manually at the end; a `with` block would
    # be safer if an exception occurs mid-report.
    f = open(author + ".txt", "w+")
    f.write("Submission by: " + author)
    str1 = str1 + "Submission by: " + author
    # Parse attempted assignment numbers from messages shaped like
    # "A<number> <answer text>".
    ans = []
    for i in msgList:
        temp1 = i.split(" ", 1)
        temp2 = temp1[0].split("A")
        temp3 = temp2[1]
        ans.append(int(temp3))
    temp = "\n\nTotal Number of Assignments = " + str(
        len(questions)) + "\nNumber of Assignments Attempted = " + str(
            len(ans))
    f.write(temp)
    str1 = str1 + temp
    cnt = 0
    accuracy = []       # per-attempted-assignment accuracy percentages
    accuracy_no = []    # assignment numbers aligned with `accuracy`
    errorCategory = {}  # LanguageTool category -> occurrence count
    indErr = 0          # total counted mistakes across all answers
    for s in range(len(questions)):
        # NOTE(review): `string` shadows the stdlib module name.
        string = "\n\nAssignment " + str(s + 1) + ":" + questions[s].split(
            " ", 1)[1]
        str1 = str1 + string
        f.write(string)
        if s + 1 in ans:
            line = msgList[cnt].split(" ", 1)[1]
            cnt += 1
            # NOTE(review): constructing a LanguageTool per assignment is
            # expensive; one shared instance would behave the same.
            tool = language_tool_python.LanguageTool('en-US')
            i = 0
            c = 0
            errors = {}
            matches = tool.check(line)
            count_simple = 0
            i = 0  # NOTE(review): redundant re-initialisation, kept as-is
            for mistake in matches:
                # Only grammar, typo and casing mistakes are itemised.
                if (mistake.category == "GRAMMAR"
                        or mistake.category == "TYPOS"
                        or mistake.category == "CASING"):
                    c += 1
                    count_simple += 1
                    errors[c] = {
                        'Error': mistake.ruleId,
                        'Suggestion': mistake.replacements,
                        'Message': mistake.message,
                        'Actual': mistake.matchedText
                    }
                if mistake.category in errorCategory.keys():
                    errorCategory[mistake.category] = errorCategory[
                        mistake.category] + 1
                else:
                    errorCategory[mistake.category] = 1
            i = i + count_simple
            indErr += i
            # Accuracy = share of words not flagged by any rule.
            print(100 - 100 * (len(matches) / len(line.split())))
            accuracy.append(100 - 100 * (len(matches) / len(line.split())))
            accuracy_no.append(s + 1)
            f.write("\n\nYour Submission: " + line)
            str1 = str1 + "\n\nYour Submission: " + line
            f.write("\n\nNo. of mistakes found in submission is " + str(i))
            str1 = str1 + "\n\nNo. of mistakes found in submission is " + str(
                i)
            for j in errors:
                temp = "\n\nError message: " + str(
                    errors[j]['Message']) + "\nSuggestion is: " + str(
                        errors[j]['Suggestion']
                    ) + "\nMistake found in: " + str(errors[j]['Actual'])
                f.write(temp)
                str1 = str1 + temp
        else:
            f.write("\n\nYou did not attempt this assignment.")
            str1 = str1 + "\n\nYou did not attempt this assignment."
        f.write("\n----------x----------x---------")
        str1 = str1 + "\n----------x----------x---------"
    f.close()
    Individual(author, msgList, questions, ans, accuracy, accuracy_no,
               errorCategory, indErr, str1)
def grammar_tool(text):
    """Return *text* with LanguageTool's automatic en-US corrections applied."""
    checker = language_tool_python.LanguageTool('en-US')
    return checker.correct(text)
def main(fileName):
    """End-to-end pipeline: media file -> transcript -> punctuation ->
    grammar correction -> summary (written to ``<name>.txt``) -> keywords.

    NOTE(review): indentation reconstructed from a collapsed source; the
    final ``r.record`` call is assumed to be inside the ``with`` block,
    since it reads from ``source``.
    """
    # NOTE(review): breaks when the path contains more than one dot —
    # rsplit('.', 1) or os.path.splitext would be safer.  Also rebinds the
    # parameter to the extension-less base name.
    fileName, fileExt = fileName.split('.')
    print(fileName, fileExt)
    # os.system(f"ffmpeg -i C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.{fileExt} -ab 160k -ac 2 -ar 44100 -vn C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.wav")
    # ipFile = ffmpeg.input(fileName + fileExt)
    # opFile = ffmpeg.output(ipFile, fileName + ".wav")
    # Extract the audio track to 16-bit PCM WAV for speech recognition.
    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(f"{fileName}.wav", codec='pcm_s16le')
    f = sf.SoundFile(f'{fileName}.wav')
    audio_dur = len(f) / f.samplerate  # duration in seconds (frames / rate)
    r = sr.Recognizer()
    text = ""
    rec_dur = 25  # transcribe in 25-second chunks
    with sr.AudioFile(f'{fileName}.wav') as source:
        for x in range(0, int(audio_dur / rec_dur)):
            audio = r.record(source, duration=rec_dur)
            try:
                new_txt = r.recognize_google(audio)
                text = text + new_txt
            except:  # NOTE(review): bare except silently drops failed chunks
                pass
        # Remainder after the last full chunk.
        # NOTE(review): this duration expression looks wrong — presumably it
        # should be audio_dur - rec_dur * int(audio_dur / rec_dur); confirm.
        audio = r.record(source,
                         duration=(audio_dur - int(audio_dur / rec_dur)))
        try:
            new_txt = r.recognize_google(audio)
            text = text + new_txt
        except:
            pass
    print("Done")
    # Restore punctuation, then fix grammar with LanguageTool.
    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    print(len(matches))
    for lab in range(len(matches)):
        print(lab)
        print(matches[lab].ruleId, matches[lab].replacements)
    text_new = tool.correct(text)
    print(text_new)
    nltk.download('punkt')
    nltk.download('stopwords')
    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()
    # Text Summarization
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new,
                                         reduction_ratio=0.80,
                                         preserve_order=True)
    print("\n --- Summarized Text ---\n")
    print(construct_sentences_from_ranking(summarised_content))
    with open(f"{fileName}.txt", "w+") as file:
        file.write(construct_sentences_from_ranking(summarised_content))
    # Text Keyword Extraction
    preprocessor = TextPreProcessor(NLTKTokenizer(),
                                    NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)
    print("\n --- Keywords ---\n")
    print(keywords)
def grammarCheck(self, language="en-US"):
    """Run LanguageTool over ``self.string``.

    Args:
        language: LanguageTool locale code (defaults to American English).

    Returns:
        Tuple of ``(matches, match_count)``.
    """
    matches = language_tool_python.LanguageTool(language).check(self.string)
    return (matches, len(matches))
def scoreTextForGrammaticalCorrectness(article):
    """Score *article* by grammar: 0 is perfect, each issue subtracts one.

    The higher the score, the better (never greater than 0).
    """
    checker = lt.LanguageTool('en-US')
    issues = checker.check(article)
    return -len(issues)
def __init__(self):
    """Set up the grammar checker and the sentence-embedding model."""
    # en-US LanguageTool instance plus a BERT sentence encoder
    # (NLI-trained, mean-token pooling).
    grammar = language_tool_python.LanguageTool('en-US')
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    self.lang_tool = grammar
    self.bert_model = encoder
from flask import Flask
from flask import jsonify
from flask import request
import language_tool_python as ltp

# Application plus a single LanguageTool instance shared by all requests.
app = Flask(__name__)
tool = ltp.LanguageTool('en-US')


# Base route
@app.route('/', methods=['GET'])
def hello_world():
    return jsonify({'message': "Hello, World!"})


# Grammar Check route
@app.route('/check/<string:text>')
def check_grammar(text):
    """Return the auto-corrected form of *text* as JSON."""
    fixed = tool.correct(text)
    return jsonify({'corrected': fixed})


if __name__ == "__main__":
    app.run(debug=True)
def test_langtool_load():
    """Pin the exact Match reprs LanguageTool produces for a known phrase."""
    import language_tool_python

    lang_tool = language_tool_python.LanguageTool("en-US")
    matches = lang_tool.check('ain\'t nothin but a thang')
    expected = """[Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter.', 'replacements': ['Ain'], 'offsetInContext': 0, 'context': "ain't nothin but a thang", 'offset': 0, 'errorLength': 3, 'category': 'CASING', 'ruleIssueType': 'typographical', 'sentence': "ain't nothin but a thang"}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['nothing', 'no thin'], 'offsetInContext': 6, 'context': "ain't nothin but a thang", 'offset': 6, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': "ain't nothin but a thang"}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['than', 'thing', 'hang', 'thank', 'Chang', 'tang', 'thong', 'twang', 'Thant', 'thane', 'Thanh', 't hang', 'Shang', 'Zhang'], 'offsetInContext': 19, 'context': "ain't nothin but a thang", 'offset': 19, 'errorLength': 5, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': "ain't nothin but a thang"})]"""
    # Comparing str() of the whole list pins rule ids, offsets and
    # replacement candidates in a single assertion.
    assert str(matches) == expected
# Text-quality metrics built on NLTK and LanguageTool.
# NOTE(review): `nltk` and `stopwords` are imported twice below — harmless,
# but could be consolidated.  `tense` appears truncated in this chunk (no
# return statement visible).
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.models.keyedvectors as word2vec
import math
import language_tool_python
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Shared en-US checker reused by accuracy().
language_tool = language_tool_python.LanguageTool('en-US')


def accuracy(content):
    """Percentage of words in *content* not flagged by LanguageTool.

    NOTE(review): raises ZeroDivisionError for empty content.
    """
    matches = language_tool.check(content)
    incorrect = len(matches)
    wc = word_count(content)
    return ((wc - incorrect) / wc) * 100


def word_count(content):
    """Number of NLTK word tokens in *content*."""
    return len(nltk.word_tokenize(content))


def tense(content):
    # POS-tag on a naive whitespace split, then count tag frequencies.
    tagged = nltk.pos_tag(content.split())
    counts = Counter(tag for word, tag in tagged)
    # Past tense = simple past (VBD) + past participle (VBN).
    past = counts["VBD"] + counts["VBN"]
def __init__(self):
    """Create one LanguageTool backend per supported language."""
    make_tool = langtool.LanguageTool
    self._tool_en = make_tool('en-US')  # American English
    self._tool_de = make_tool('de')     # German