Example #1
 def __init__(self, lang_code: str, ids: Set[str]):
     # Map short language codes to full LanguageTool locale codes.
     lt_code = {"en": "en_US", "de": "de_DE", "es": "es_ES"}[lang_code]
     # Talk to a LanguageTool server already running on localhost.
     self.tool = language_tool_python.LanguageTool(
         lt_code, remote_server="http://localhost:8081/")
     # Suppress spelling, whitespace, style and agreement rules
     # for English and German:
     self.tool.disabled_rules = {
         "MORFOLOGIK_RULE_EN_US",
         "GERMAN_SPELLER_RULE",
         "COMMA_PARENTHESIS_WHITESPACE",
         "DOUBLE_PUNCTUATION",
         "UPPERCASE_SENTENCE_START",
         "WHITESPACE_RULE",
         "SENTENCE_WHITESPACE",
         "WHITESPACE_PARAGRAPH",
         "WHITESPACE_PARAGRAPH_BEGIN",
         "EMPTY_LINE",
         "TOO_LONG_SENTENCE",
         "TOO_LONG_PARAGRAPH",
         "PARAGRAPH_REPEAT_BEGINNING_RULE",
         "PUNCTUATION_PARAGRAPH_END",
         "PUNCTUATION_PARAGRAPH_END2",
         "EN_SPECIFIC_CASE",
         "EN_UNPAIRED_BRACKETS",
         "ENGLISH_WORD_REPEAT_RULE",
         "EN_A_VS_AN",
         "ENGLISH_WORD_REPEAT_BEGINNING_RULE",
         "EN_COMPOUNDS",
         "EN_CONTRACTION_SPELLING",
         "ENGLISH_WRONG_WORD_IN_CONTEXT",
         "EN_DASH_RULE",
         "EN_WORD_COHERENCY",
         "EN_DIACRITICS_REPLACE",
         "EN_PLAIN_ENGLISH_REPLACE",
         "EN_REDUNDANCY_REPLACE",
         "EN_SIMPLE_REPLACE",
         "READABILITY_RULE_SIMPLE",
         "READABILITY_RULE_DIFFICULT",
         "DE_SIMPLE_REPLACE",
         "OLD_SPELLING",
         "DE_SENTENCE_WHITESPACE",
         "DE_DOUBLE_PUNCTUATION",
         "MISSING_VERB",
         "GERMAN_WORD_REPEAT_RULE",
         "GERMAN_WORD_REPEAT_BEGINNING_RULE",
         "GERMAN_WRONG_WORD_IN_CONTEXT",
         "DE_AGREEMENT",
         "DE_AGREEMENT2",
         "DE_CASE",
         "DE_DASH",
         "DE_VERBAGREEMENT",
         "DE_SUBJECT_VERB_AGREEMENT",
         "DE_WORD_COHERENCY",
         "DE_SIMILAR_NAMES",
         "DE_WIEDER_VS_WIDER",
         "STYLE_REPEATED_WORD_RULE_DE",
         "DE_COMPOUND_COHERENCY",
         "TOO_LONG_SENTENCE_DE",
         "FILLER_WORDS_DE",
         "GERMAN_PARAGRAPH_REPEAT_BEGINNING_RULE",
         "DE_DU_UPPER_LOWER",
         "EINHEITEN_METRISCH",
         "COMMA_BEHIND_RELATIVE_CLAUSE",
         "COMMA_IN_FRONT_RELATIVE_CLAUSE",
         "READABILITY_RULE_SIMPLE_DE",
         "READABILITY_RULE_DIFFICULT_DE",
         "COMPOUND_INFINITIV_RULE",
         "STYLE_REPEATED_SHORT_SENTENCES",
         "STYLE_REPEATED_SENTENCE_BEGINNING",
     }
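A minimal usage sketch for the wrapper above (the enclosing class name GrammarChecker is hypothetical, and a LanguageTool server is assumed to be listening on http://localhost:8081/):

# Hypothetical driver for the __init__ above; the class name is assumed.
checker = GrammarChecker("en", ids=set())
for match in checker.tool.check("She go to school yesterday."):
    # Disabled rules (spelling, whitespace, style) are never reported.
    print(match.ruleId, match.message)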
Example #2
 def __init__(self):
     import language_tool_python
     self.language_tool = language_tool_python.LanguageTool('en-US')
Example #3
import language_tool_python

def grammar_tool(text, error_count):
    # Apply LanguageTool's automatic corrections and increment the
    # caller's running counter by one for this pass.
    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    error_count += 1
    return corrected_text, error_count
Example #4
import language_tool_python

def grammar_check_score(text: str) -> int:
    # Score a text by the number of rule matches LanguageTool reports.
    tool = language_tool_python.LanguageTool('en-GB')
    matches = tool.check(text)
    return len(matches)
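A quick way to exercise grammar_check_score (exact match counts vary with the LanguageTool version, so treat the number as indicative):

# Sanity check: a sentence with an agreement error and a typo
# should produce a nonzero score.
print(grammar_check_score("This sentence are containing a mistakke."))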
Example #5
import language_tool_python
import csv

tool = language_tool_python.LanguageTool('ru')

# Collect the comments into a list; 'filename' is assumed to point
# at a CSV file that has a 'text' column.
comments = []
with open(filename, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        comments.append(line['text'])

ortho_counter = 0  # spelling errors
sentence_start = 0  # sentence starting with a lowercase letter
whitespace = 0  # space before a comma, or before/after a bracket
capital_names = 0  # proper name starting with a lowercase letter
talk = 0  # colloquialisms
other = 0  # all other rules (there are ~880 rules in total; everything beyond
# the above is (1) very rare in internet speech and (2) named in ways that
# give no basis for grouping)
csv_list = [[
    'Spelling errors', 'Sentence starting lowercase',
    'Space before comma or bracket', 'Name starting lowercase',
    'Colloquial words', 'Other errors'
]]
for line in comments:
    matches = tool.check(line)
    for match in matches:
        if match.ruleId == 'MORFOLOGIK_RULE_RU_RU':
            ortho_counter += 1
        elif match.ruleId == 'UPPERCASE_SENTENCE_START':
            sentence_start += 1
Example #6
from collections import Counter
import textstat
import numpy as np
import pandas as pd  # needed for pd.read_csv below
import nltk  # needed for nltk.download below
import scipy.stats as stats
import language_tool_python
import string
from germansentiment import sentimentmodel
from textblob_de import TextBlobDE

# sys.stdout = open('result.txt', 'w', encoding='utf-8')

textstat.set_lang("de")

nltk.download('averaged_perceptron_tagger')

tool = language_tool_python.LanguageTool('de')

data = pd.read_csv("Comments.csv")

contentList = data["content"].values.tolist()

numberOfWordsPerComm = []

tokenList = []

charList = []
digitList = []
upperList = []
lowerList = []

syllablesTuple = ()
Example #7
def test_langtool_languages():
	import language_tool_python
	lang_tool = language_tool_python.LanguageTool("en-US")
	assert lang_tool._get_languages() == {'ta-IN', 'en-CA', 'da', 'eo', 'pt-AO', 'de', 'gl', 'ru-RU', 'de-DE', 'en', 'br', 'en-ZA', 'pt-MZ', 'ast-ES', 'sk-SK', 'en-AU', 'ta', 'ga', 'be', 'pl', 'tl-PH', 'sl', 'ar', 'es', 'sl-SI', 'en-NZ', 'el', 'el-GR', 'ru', 'zh-CN', 'en-GB', 'be-BY', 'pl-PL', 'km-KH', 'pt', 'uk-UA', 'ca', 'de-DE-x-simple-language', 'ro', 'ca-ES', 'de-CH', 'ja-JP', 'tl', 'pt-PT', 'gl-ES', 'pt-BR', 'km', 'ga-IE', 'ja', 'sv', 'sk', 'en-US', 'de-AT', 'ca-ES-valencia', 'uk', 'it', 'zh', 'br-FR', 'da-DK', 'ast', 'fr', 'fa', 'nl', 'ro-RO'}
Example #8
import pycorrector
import language_tool_python
from utils import contain_english
print("[Corrector]")
# Word-level Chinese correction only: skip char-level error detection.
pycorrector.enable_char_error(enable=False)
english_corrector = language_tool_python.LanguageTool('en-US')
print("[Corrector] Init end")


def correct(st):
    if contain_english(st):
        return correct_english(st)
    else:
        return correct_chinese(st)


def correct_chinese(st):
    corrected, detail = pycorrector.correct(st)
    if len(detail) == 0:
        return None
    else:
        return corrected


def correct_english(st):
    corrected = english_corrector.correct(st)
    if st.lower() == corrected.lower():
        return None
    else:
        return corrected
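A short usage sketch for the dispatcher above (it assumes contain_english returns True for strings holding Latin letters; both calls return None when the input needed no correction):

# English input is routed to LanguageTool, Chinese input to pycorrector.
print(correct("He go to school."))
print(correct("少先队员因该为老人让坐"))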
Example #9
def findError(author, msgList, questions):
    str1 = ""
    f = open(author + ".txt", "w+")
    f.write("Submission by: " + author)
    str1 = str1 + "Submission by: " + author
    ans = []
    for i in msgList:
        temp1 = i.split(" ", 1)
        temp2 = temp1[0].split("A")
        temp3 = temp2[1]
        ans.append(int(temp3))
    temp = "\n\nTotal Number of Assignments = " + str(
        len(questions)) + "\nNumber of Assignments Attempted = " + str(
            len(ans))
    f.write(temp)
    str1 = str1 + temp
    cnt = 0
    accuracy = []
    accuracy_no = []
    errorCategory = {}
    indErr = 0
    for s in range(len(questions)):
        string = "\n\nAssignment " + str(s + 1) + ":" + questions[s].split(
            " ", 1)[1]
        str1 = str1 + string
        f.write(string)
        if s + 1 in ans:
            line = msgList[cnt].split(" ", 1)[1]
            cnt += 1
            # Check this submission with LanguageTool.
            tool = language_tool_python.LanguageTool('en-US')
            i = 0
            c = 0
            errors = {}
            matches = tool.check(line)
            count_simple = 0
            for mistake in matches:
                if (mistake.category == "GRAMMAR"
                        or mistake.category == "TYPOS"
                        or mistake.category == "CASING"):
                    c += 1
                    count_simple += 1
                    errors[c] = {
                        'Error': mistake.ruleId,
                        'Suggestion': mistake.replacements,
                        'Message': mistake.message,
                        'Actual': mistake.matchedText
                    }
                    # Tally mistakes per category.
                    errorCategory[mistake.category] = errorCategory.get(
                        mistake.category, 0) + 1
            i = i + count_simple
            indErr += i
            print(100 - 100 * (len(matches) / len(line.split())))
            accuracy.append(100 - 100 * (len(matches) / len(line.split())))
            accuracy_no.append(s + 1)
            f.write("\n\nYour Submission: " + line)
            str1 = str1 + "\n\nYour Submission: " + line
            f.write("\n\nNo. of mistakes found in submission is " + str(i))
            str1 = str1 + "\n\nNo. of mistakes found in submission is " + str(
                i)
            for j in errors:
                temp = "\n\nError message: " + str(
                    errors[j]['Message']) + "\nSuggestion is: " + str(
                        errors[j]['Suggestion']
                    ) + "\nMistake found in: " + str(errors[j]['Actual'])
                f.write(temp)
                str1 = str1 + temp

        else:
            f.write("\n\nYou did not attempt this assignment.")
            str1 = str1 + "\n\nYou did not attempt this assignment."
        f.write("\n----------x----------x---------")
        str1 = str1 + "\n----------x----------x---------"
    f.close()
    Individual(author, msgList, questions, ans, accuracy, accuracy_no,
               errorCategory, indErr, str1)
Example #10
import language_tool_python

def grammar_tool(text):
    # Return the text with LanguageTool's automatic corrections applied.
    tool = language_tool_python.LanguageTool('en-US')
    corrected_text = tool.correct(text)
    return corrected_text
Example #11
def main(fileName):
    fileName, fileExt = fileName.split('.')
    print(fileName, fileExt)
    # os.system(f"ffmpeg -i C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.{fileExt} -ab 160k -ac 2 -ar 44100 -vn C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.wav")

    # ipFile = ffmpeg.input(fileName + fileExt)
    # opFile = ffmpeg.output(ipFile, fileName + ".wav")

    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(f"{fileName}.wav", codec='pcm_s16le')

    f = sf.SoundFile(f'{fileName}.wav')
    audio_dur = len(f) / f.samplerate

    r = sr.Recognizer()
    text = ""
    rec_dur = 25

    with sr.AudioFile(f'{fileName}.wav') as source:
        for x in range(0, int(audio_dur / rec_dur)):
            audio = r.record(source, duration=rec_dur)
            try:
                new_txt = r.recognize_google(audio)
                # Keep a space between recognized chunks.
                text = text + " " + new_txt
            except (sr.UnknownValueError, sr.RequestError):
                pass

        # Transcribe whatever audio remains after the fixed-length chunks.
        audio = r.record(source,
                         duration=audio_dur - rec_dur * int(audio_dur / rec_dur))
        try:
            new_txt = r.recognize_google(audio)
            text = text + " " + new_txt
        except (sr.UnknownValueError, sr.RequestError):
            pass

        print("Done")

    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)

    tool = language_tool_python.LanguageTool('en-US')

    matches = tool.check(text)
    print(len(matches))

    for lab in range(len(matches)):
        print(lab)
        print(matches[lab].ruleId, matches[lab].replacements)

    text_new = tool.correct(text)

    print(text_new)

    nltk.download('punkt')
    nltk.download('stopwords')

    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()

    # Text Summarization
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new,
                                         reduction_ratio=0.80,
                                         preserve_order=True)

    print("\n --- Summarized Text ---\n")
    print(construct_sentences_from_ranking(summarised_content))

    with open(f"{fileName}.txt", "w+") as file:
        file.write(construct_sentences_from_ranking(summarised_content))

    # Text Keyword Extraction
    preprocessor = TextPreProcessor(NLTKTokenizer(),
                                    NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)

    print("\n --- Keywords ---\n")
    print(keywords)
Example #12
 def grammarCheck(self, language="en-US"):
     tool = language_tool_python.LanguageTool(language)
     errors = tool.check(self.string)
     return (errors, len(errors))
Example #13
import language_tool_python as lt

def scoreTextForGrammaticalCorrectness(article):
    # The higher the score, the better (although never greater than 0).
    tool = lt.LanguageTool('en-US')
    return -1 * len(tool.check(article))
Example #14
 def __init__(self):
     self.lang_tool = language_tool_python.LanguageTool('en-US')
     self.bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
Example #15
from flask import Flask
from flask import jsonify
from flask import request
import language_tool_python as ltp

# make a new flask app
app = Flask(__name__)

# make a new language tool locally
tool = ltp.LanguageTool('en-US')


# Base route
@app.route('/', methods=['GET'])
def hello_world():
    return jsonify({'message': "Hello, World!"})


# Grammar Check route
@app.route('/check/<string:text>')
def check_grammar(text):
    corrected = tool.correct(text)
    return jsonify({'corrected': corrected})


if __name__ == "__main__":
    app.run(debug=True)
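Once the server is running (Flask's development default is http://127.0.0.1:5000), the route can be exercised with any HTTP client; a sketch using requests, with a hypothetical test string:

import requests

# The text travels in a URL path segment, so it must be percent-encoded;
# slashes or question marks in the input would break the route.
resp = requests.get("http://127.0.0.1:5000/check/this%20are%20a%20test")
print(resp.json())  # {'corrected': ...}

Passing free text in the path is fragile for exactly that reason; accepting it in a POST body would be more robust.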
Example #16
def test_langtool_load():
	import language_tool_python
	lang_tool = language_tool_python.LanguageTool("en-US")
	matches = lang_tool.check('ain\'t nothin but a thang')
	assert str(matches) == """[Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter.', 'replacements': ['Ain'], 'offsetInContext': 0, 'context': "ain't nothin but a thang", 'offset': 0, 'errorLength': 3, 'category': 'CASING', 'ruleIssueType': 'typographical', 'sentence': "ain't nothin but a thang"}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['nothing', 'no thin'], 'offsetInContext': 6, 'context': "ain't nothin but a thang", 'offset': 6, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': "ain't nothin but a thang"}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['than', 'thing', 'hang', 'thank', 'Chang', 'tang', 'thong', 'twang', 'Thant', 'thane', 'Thanh', 't hang', 'Shang', 'Zhang'], 'offsetInContext': 19, 'context': "ain't nothin but a thang", 'offset': 19, 'errorLength': 5, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': "ain't nothin but a thang"})]"""
Example #17
import numpy as np
import nltk
import re
import math
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.models.keyedvectors as word2vec

import language_tool_python
from collections import Counter

language_tool = language_tool_python.LanguageTool('en-US')


def accuracy(content):
    matches = language_tool.check(content)
    incorrect = len(matches)
    wc = word_count(content)
    return ((wc - incorrect) / wc) * 100


def word_count(content):
    return len(nltk.word_tokenize(content))


def tense(content):
    tagged = nltk.pos_tag(content.split())
    counts = Counter(tag for word, tag in tagged)
    past = counts["VBD"] + counts["VBN"]
Example #18
 def __init__(self):
     # 'langtool' is assumed to be: import language_tool_python as langtool
     self._tool_en = langtool.LanguageTool('en-US')
     self._tool_de = langtool.LanguageTool('de')
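A minimal sketch of how such a bilingual wrapper might route text to the right tool (the check method below is hypothetical, not part of the original class):

 # Hypothetical companion method: pick the tool by language code
 # and return the number of rule matches.
 def check(self, text: str, lang: str = "en") -> int:
     tool = self._tool_en if lang == "en" else self._tool_de
     return len(tool.check(text))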