Example 1
    def features(self):
        """Extract baseline features"""
        sp_en = spacy.load("en")
        sp_de = spacy.load("de")
        en_checker = language_check.LanguageTool("en-GB")
        ge_checker = language_check.LanguageTool("de-DE")

        ft = self.df.copy()
        # Sentences without punctuation
        ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(lambda x: x.lower(
        ).translate(str.maketrans("", "", string.punctuation)))
        # Number of tokens
        ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
        ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))
        count = lambda l1, l2: sum([1 for x in l1 if x in l2])
        # Number of non alphanumeric characters
        ft["src_#punc"] = ft["src"].apply(
            lambda x: count(x, set(string.punctuation)))
        ft["tgt_#punc"] = ft["tgt"].apply(
            lambda x: count(x, set(string.punctuation)))
        # Sentiment analysis (TBD/TBE appear to be TextBlob-style sentiment
        # analyzers for the German target and English source, defined elsewhere)
        ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
        ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
        ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()
        # Spacy encoding
        ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
        ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))
        # Proofread errors
        ft["sp_pos_diff"] = [
            spacy_parser(x, y, "pos_")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["sp_ent_diff"] = [
            spacy_parser(x, y, "ents")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["src_gram_err"] = ft["src"].apply(
            lambda x: len(en_checker.check(x)))
        ft["tgt_gram_err"] = ft["tgt"].apply(
            lambda x: len(ge_checker.check(x)))
        # Features of interest
        foi = [
            "src_len",
            "tgt_len",
            "src_#punc",
            "tgt_#punc",
            "tgt_polar",
            "src_polar",
            "src_gram_err",
            "tgt_gram_err",
            "sp_pos_diff",
            "sp_ent_diff",
        ]

        features = ft[foi].values
        normalized_features = MinMaxScaler().fit_transform(features)

        return normalized_features
Example 2
def correct(text, reps):
    corrected = []
    wrongN = 0
    sentenceN = 0
    mistakesN = 0
    rulesApplied = []
    replacements = []
    types = []
    noMistakes = []
    tool = language_check.LanguageTool('en-US')
    for sentence in text:
        matches = tool.check(sentence)
        if len(matches) > 0:
            corrected.append(language_check.correct(sentence, matches))
            wrongN += 1
            for rule in matches:
                mistakesN += 1
                rulesApplied.append(rule.ruleId)
                types.append(rule.category)
                new = rule.replacements
                old = sentence[rule.fromx:rule.tox]
                if reps:
                    replacements.append((old, new, sentenceN))
        else:
            noMistakes.append(sentenceN)

        if sentenceN % 100 == 0:
            print("sentence " + str(sentenceN) + "was corrected")
        sentenceN += 1
    stats = [wrongN, mistakesN, rulesApplied, types, replacements, noMistakes]
    return corrected, stats
Example 3
async def grammar_check(to_fix):
    if not to_fix.text[0].isalpha() and to_fix.text[0] not in ("/", "#", "@",
                                                               "!"):
        reply = False
        textx = await to_fix.get_reply_message()
        if textx:
            message = str(textx.message)
            reply = True
        elif to_fix.pattern_match.group(2):
            message = to_fix.pattern_match.group(2)
        else:
            await to_fix.edit(
                "```Give a text to fix!\nReplying to your message will fix "
                "and edit it, while giving an inline text will output the "
                "fixed version of it.```")
            return

        tool = language_check.LanguageTool('en-GB')
        matches = tool.check(message)
        result = language_check.correct(message, matches)

        if reply:
            me = await bot.get_me()

            if textx.from_id == me.id:
                await textx.edit(result)
                await to_fix.delete()
            else:
                await to_fix.edit("Did you mean? \n\n`" + result)
        else:
            await to_fix.edit(result)
Example 4
    def compromise(self):
        classCompromise = Compromise()
        qArry = classCompromise.qArry()
        aArry = classCompromise.aArry()
        random_index = randrange(0, len(qArry))
        questionStr = qArry[random_index]
        answerStr = aArry[random_index]

        #here we validate the created questions
        tool = language_check.LanguageTool('en-US')
        matchesQuestion = tool.check(questionStr)
        matchesAnswer = tool.check(answerStr)
        questionStr = language_check.correct(questionStr, matchesQuestion)
        answerStr = language_check.correct(answerStr, matchesAnswer)

        # url = "http://api.meaningcloud.com/stilus-1.2"
        #
        # payload = classCompromise.payload()
        # headers = {'content-type': 'application/x-www-form-urlencoded'}
        #
        # response = requests.request("POST", url, data=payload, headers=headers)
        #
        # print(response.text)
        #
        return "Question: " + questionStr + " </br> answer: " + answerStr
Example 5
def clean_output_text(output_text, use_language_tool=False):
    '''
    Post-processing to clean up the output returned by the
    text generation program
    
    This uses the rule-based grammar checking of Language Tool
    to correct minor capitalization and tense issues in the 
    outputted text
    
    Parameters
    ----------
    
    output_text : str
    
    use_language_tool : bool
        Whether to use LanguageTool to automatically clean up
        the output text
    
    '''

    swappairs = zip(replacements, to_replace)
    for member in swappairs:
        output_text = output_text.replace(member[0], member[1])

    for member in rep_pairs:
        output_text = output_text.replace(member[0], member[1])

    if has_language_tool and use_language_tool:
        tool = language_check.LanguageTool('en-US')
        matches = tool.check(output_text)
        output_text = language_check.correct(output_text, matches)
        output_text = str(output_text)

    return output_text
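The docstring above refers to module-level replacement lists and a has_language_tool flag without showing them; a minimal usage sketch, under the assumption that clean_output_text and these globals live in the same module (the sample values are invented):

# Hypothetical module-level names assumed by clean_output_text
replacements = [" ,", " ."]
to_replace = [",", "."]
rep_pairs = [("  ", " ")]
has_language_tool = True  # set False if language_check is not installed

generated = "the model write a short summary of the document ."
print(clean_output_text(generated, use_language_tool=True))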
Example 6
 def correct_grammar(self, poem):
     tool = language_check.LanguageTool('en-US')
     matches = tool.check(poem)
     for match in matches:
         print(match)
     new_poem = language_check.correct(poem, matches)
     return new_poem
Example 7
def mutate_synonym(member: str, nlp):
    # nltk.download('wordnet')
    words = member.split(' ')

    # Pick a random word in the text
    locus = rand.randrange(0, len(words))
    rand_word = words[locus]

    # Get a unique list of synonyms to rand_word
    synonyms = list(
        set([
            l.name() for syn in wordnet.synsets(rand_word)
            for l in syn.lemmas()
        ]))
    if synonyms:
        words[locus] = synonyms[rand.randrange(0, len(synonyms))]

    # Grammar Checking
    text = ' '.join(words)
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(text)
    corrected_text = language_check.correct(text, matches)
    if text != corrected_text:
        print(f'Corrected Text: {corrected_text} | Original Text: {text}')

    return text
Example 8
def error_stats(inputpath, lang, output_path):
    """
    Creates three text files with information of different errors in input texts.
    :param inputpath: path to folder with input data
    :param lang: string with name of language, e.g. 'de'
    :param output_path: path to the output text files
    """
    files = os.listdir(inputpath)  # input files
    checker = language_check.LanguageTool(lang)
    rules = {}
    locqualityissuetypes = {}
    categories = {}

    for file in files:
        if file.endswith(".txt"):
            text = open(os.path.join(inputpath, file)).read()
            matches = checker.check(text)
            for match in matches:
                rule = match.ruleId
                loc = match.locqualityissuetype
                cat = match.category
                rules[rule] = rules.get(rule, 0) + 1
                locqualityissuetypes[loc] = locqualityissuetypes.get(loc,
                                                                     0) + 1
                categories[cat] = categories.get(cat, 0) + 1

    write_featurelist(output_path + lang + "-rules.txt", sorted(rules.keys()))
    write_featurelist(output_path + lang + "-locquality.txt",
                      sorted(locqualityissuetypes.keys()))
    write_featurelist(output_path + lang + "-errorcats.txt",
                      sorted(categories.keys()))
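error_stats writes its results through a write_featurelist helper that is not part of this snippet; a minimal sketch of what it presumably does (an assumption, not the original implementation):

def write_featurelist(path, features):
    # Hypothetical helper: one feature name (rule ID, issue type, or category) per line.
    with open(path, "w", encoding="utf-8") as out:
        for feature in features:
            out.write(str(feature) + "\n")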
Example 9
 def grammar_check(self, data):
     tool = lan.LanguageTool(self.in_lan)
     matches = tool.check(data)
     if matches:
         data = lan.correct(data, matches)
     if self.in_lan in self.sc_lang:
         data = SpellChecker(language=self.in_lan).correction(data)
     return data
Example 10
    def process_data(self, tweets_and_date, dialect):
        """
        Data processing function
        """

        scanner = language_check.LanguageTool(dialect)
        print "Chosen language/dialect: " + str(dialect)

        for tweet in tweets_and_date:
            matches = scanner.check(tweet['text'].encode(
                'ascii', 'ignore').decode('ascii'))

            for i, match in enumerate(matches):
                print("----------------")
                print("Context: ")
                print(match.context.encode('ascii', 'ignore').decode('ascii'))
                print("Rule Id: " + str(match.ruleId))
                print("Category: " + match.category)
                print("Based upon language/grammar user may have meant: ")
                did_you_mean = ""
                if match.replacements:
                    for m in match.replacements:
                        did_you_mean = did_you_mean + m.encode(
                            'ascii', 'ignore').decode('ascii') + ' ,'
                print(did_you_mean)
Example 11
def depermute_input(mrs, sents, predictions, num_permutes):
    new_mr = []
    new_sent = []
    new_pred = []
    x = 0
    tool = language_check.LanguageTool('en-UK')

    base = max(int(len(predictions) * .1), 1)
    benchmarks = [base * i for i in range(1, 11)]

    while x < len(predictions):
        if x in benchmarks:
            curr_state = x / base
            print("Depermute processing is " + str(10 * curr_state) +
                  "% done.")

        scores = {}
        for i in range(0, num_permutes):
            scores[x + i] = score_output(mrs[x // num_permutes],
                                         sents[x // num_permutes],
                                         predictions[x + i],
                                         tool,
                                         correction=False)

        top_score = max(scores.keys(), key=(lambda key: scores[key]))
        new_mr.append(mrs[top_score // num_permutes])
        new_sent.append(sents[top_score // num_permutes])
        new_pred.append(predictions[top_score])
        x += num_permutes

    return new_mr, new_sent, new_pred
Example 12
def check_grammar(line, max_errors=2):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(line)
    if len(matches) > max_errors:
        return False
    else:
        return True
Example 13
def langcheck(input_path, output_path, replace_what, replace_with):
    tool = language_check.LanguageTool('en-UK')
    path = input_path
    files = glob.glob(path)

    for file in files:
        f1 = open(file, 'r')
        filename = os.path.basename(file)
        outputpath = output_path
        filename = replace_last(filename, 'e', 'l')
        text = f1.read()
        # print filename
        text.encode('utf8', 'ignore')
        #text.encode('\"', 'ignore')
        matches = tool.check(text)
        print(len(matches))
        # q = len(matches)
        # for x in range(0,q):
        # 	matches[x].fromy, matches[x].fromx
        # 	print (matches[x])
        # print (matches[1])
        outputpath += filename
        temp = language_check.correct(text, matches)
        f2 = open(outputpath, 'w', encoding='utf8')
        f2.write(temp)
        f2.close()
        print(filename, "is created")
        f1.close()
Example 14
def create_ppdb_dataset(dataset_file, all_sentences_file, output_file):
    tool = language_check.LanguageTool('en-US')
    all_sentences = open(all_sentences_file).readlines()
    with open(output_file, 'w') as file_writer:
        with open(dataset_file) as file_reader:
            for line in file_reader:
                info = line.split("|||")
                if info[0].rstrip() == "[S]" and info[5].strip().lower(
                ) == "equivalence":
                    sentence1 = info[1]
                    sentence2 = info[2]
                    random_sentence = np.random.choice(all_sentences, 1)[0]
                    if '-' in sentence1:
                        sentence1 = sentence1.replace("-", "")
                        sentence1 = sentence1.strip()

                    if '-' in sentence2:
                        sentence2 = sentence2.replace("-", "")
                        sentence2 = sentence2.strip()

                    if '-' in random_sentence:
                        random_sentence = random_sentence.replace("-", "")
                        random_sentence = random_sentence.strip()
                    sentence1 = tool.correct(sentence1)
                    sentence2 = tool.correct(sentence2)
                    random_sentence = tool.correct(random_sentence)
                    file_writer.write(sentence1.strip() + "\t" +
                                      sentence2.strip() + "\t" +
                                      random_sentence.strip() + "\t0\n")
Example 15
def error_stats(inputpath, lang, output_path):
    files = os.listdir(inputpath)
    checker = language_check.LanguageTool(lang)
    rules = {}
    locqualityissuetypes = {}
    categories = {}

    for file in files:
        if file.endswith(".txt"):
            text = open(os.path.join(inputpath, file)).read()
            matches = checker.check(text)
            for match in matches:
                rule = match.ruleId
                loc = match.locqualityissuetype
                cat = match.category
                rules[rule] = rules.get(rule, 0) + 1
                locqualityissuetypes[loc] = locqualityissuetypes.get(loc,
                                                                     0) + 1
                categories[cat] = categories.get(cat, 0) + 1

    write_featurelist(output_path + lang + "-rules.txt", sorted(rules.keys()))
    write_featurelist(output_path + lang + "-locquality.txt",
                      sorted(locqualityissuetypes.keys()))
    write_featurelist(output_path + lang + "-errorcats.txt",
                      sorted(categories.keys()))
Example 16
def correct_statement(statement):
    if len(statement.strip()) == 0:
        return ""
    global _tool
    try:
        matches = _tool.check(statement)
    except:
        try:
            logging.error("Problem with LanguageTools for " + statement)
            time.sleep(60)
            try:
                del _tool
            except NameError:
                pass
            time.sleep(60)
            call(["killall", "java"])
            time.sleep(60)
            try:
                _tool = language_check.LanguageTool('en-US')
            except:
                raise
            return correct_statement(statement)
        except:
            logging.error("Problem with LanguageTools for " + statement)
            raise
    return language_check.correct(statement, matches).lower()
Example 17
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous reading scores and methods. Also calculates
    other factors such as the number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the computed readability scores and error ratios
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading":
        textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid":
        textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau":
        textstat.coleman_liau_index(article_contents),
        "typos_to_words":
        len(tool.check(article_contents)) /
        textstat.lexicon_count(article_contents),
        "percent_difficult_words":
        textstat.difficult_words(article_contents) /
        textstat.lexicon_count(article_contents),
    }

    return language_analysis_dict
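A brief usage sketch for get_text_features (the article text is invented; non-empty text is assumed because the ratios divide by textstat.lexicon_count):

article = ("The committee has published it's final report. "
           "The findings was broadly consistent with earlier studies.")
metrics = get_text_features(article)
for name, value in metrics.items():
    print(name, round(value, 3))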
Example 18
def get_all_features(dirpath, lang, categories, locqualityissuetypes, rules):
    """
    Get features and language levels for a language
    :param dirpath: path to the folder with input files
    :param lang: string with name of language, e.g. 'de'
    :param categories: dictionary of categories
    :param locqualityissuetypes: dictionary of issue types
    :param rules: dictionary of rules
    :return: two lists: with language levels and with features of all files in the folder
    """
    files = os.listdir(dirpath)
    all_feats = []  # list of feature dictionaries of all files in the folder
    all_cats = []  # list of language levels of all files in the folder
    checker = language_check.LanguageTool(lang)
    for f in files:
        if f.endswith(".txt"):
            text = open(os.path.join(dirpath, f)).read()
            matches = checker.check(text)
            for match in matches:
                rule = match.ruleId
                loc = match.locqualityissuetype
                cat = match.category
                rules[rule] = rules.get(rule, 0) + 1
                locqualityissuetypes[loc] = locqualityissuetypes.get(loc, 0) + 1
                categories[cat] = categories.get(cat, 0) + 1
            all_feats.append(get_file_feats(rules, locqualityissuetypes, categories))
            all_cats.append(f.split(".txt")[0][-2:])

            # reset all values in the dictionaries
            rules = reset_dict(rules)
            categories = reset_dict(categories)
            locqualityissuetypes = reset_dict(locqualityissuetypes)

    return all_cats, all_feats
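get_all_features leans on two helpers, reset_dict and get_file_feats, that are defined elsewhere; plausible minimal sketches under that assumption (not the original implementations):

def reset_dict(counts):
    # Hypothetical helper: keep the keys but zero the counts before the next file.
    return {key: 0 for key in counts}


def get_file_feats(rules, locqualityissuetypes, categories):
    # Hypothetical helper: flatten the three per-file count dictionaries
    # into one feature dictionary.
    feats = {}
    feats.update(rules)
    feats.update(locqualityissuetypes)
    feats.update(categories)
    return feats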
Example 19
def grammar(data):
    tool = language_check.LanguageTool('en-US')
    for d in data:
        text=d[2]
        matches = tool.check(text)
        d.append(len(matches))
    return data
Example 20
def main(file):
    #file = 'substinces.PNG'
    # opening an image from the source path
    img = Image.open(file)
    dot = file.index(".")

    name = file[0:dot]

    tool = language_check.LanguageTool('en-US')

    # path where the tesseract module is installed
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'
    # converts the image to result and saves it into result variable
    result = pytesseract.image_to_string(img)
    # write text in a text file and save it to source path
    with open(name + '.txt', mode='w') as file:
        file.write(result)

    with open(name + '.txt', mode='r') as file:
        orig = file.read()

    matches = tool.check(orig)
    orig = language_check.correct(orig, matches)

    with open(name + '.txt', mode='w') as file:
        file.write(orig)

    return str(len(matches)) + " mistakes were found."
Example 21
def errorCount(essay):
    tool = language_check.LanguageTool('en-US')
    checker = []
    matches = tool.check(essay)
    for i in matches:
        checker.append(i.fromx)
    return (len(set(checker)))
Example 22
def long_suggestion_grammar_check(text):
    """
    This method returns a longer list of suggestions, which can then be looped
    through if desired.

    Parameters
    ----------
        text : str
            The text that is inputted. Should be unformatted.

    Returns
    -------
        list of str
            Returns a list of suggestions.
    """
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(text)
    for i, match in enumerate(matches):
        fromy = match.fromy + 1
        fromx = match.fromx + 1
        ruleId = match.ruleId
        # Guard against rules that offer no replacement suggestions
        replacement = match.replacements[0] if match.replacements else ""
        matches[i] = (
            "Line {}, column {}, Rule ID: {}[{}]\n"
            "Message: Did you mean '{}'?\nSuggestion: {}"
        ).format(fromy, fromx, ruleId, i, replacement, replacement)
    return matches
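A quick usage sketch for long_suggestion_grammar_check (the sentence is invented, and the exact rule IDs and suggestions depend on the installed LanguageTool version):

for suggestion in long_suggestion_grammar_check("He go to school every days."):
    print(suggestion)
    print("-" * 20)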
Example 23
 def __init__(self, tracker, random_seed=None):
     self.tracker = tracker
     if random_seed is None:
         random_seed = random.randint(1, 2**31 - 1)
     self.seed = random_seed
     random.seed(random_seed)
     self.tool = language_check.LanguageTool("en-US")
Example 24
def QuestionConfirmView(request):

    if request.method == 'POST':

        # Receive data from client
        question = request.POST.get('myquestion')
        print("question = ", question)
        tool = language_check.LanguageTool('en-US')
        texts = question
        matches = tool.check(texts)
        confirm_ques = language_check.correct(texts, matches)
        print("question_match = ", confirm_ques)
        context = {'confirm_ques': confirm_ques}
        # return JsonResponse({'sepal_length': sepal_length,
        # })
        return render(request, 'question/question_confirm.html', context)
        # return render(request,"question/questioncreate.html",{'sepal_length':sepal_length})
    else:
        print("this is post method in viesw")
        print("request.POST", request)
        # confirm_question = request.POST.get['final_que']
        confirm_question = request.GET.get("confirm_ques")
        print("confirm_question", confirm_question)

        context = {'ram': 'ram'}
        return render(request, 'question/questioncreate.html', context)
Example 25
def check(data):

    tool = language_check.LanguageTool('en-GB')
    matches = tool.check(data)
    print(matches)
    print(language_check.correct(data, matches))
    return len(matches)
Example 26
 def n_grammar_errors(string):
     tool = language_check.LanguageTool('en-US')
     matches = tool.check(string)
     count = 0
     for m in matches:
         if m.category == "Grammar":
             count = count + 1
     return count
Example 27
File: cred.py Project: trvon/Shrink
def language_check(text):
    # LChecker is presumably the language_check module imported under an alias,
    # since this function shadows the module name.
    tool = LChecker.LanguageTool('en-US')
    errors = tool.check(text)
    # We need to determine how many errors is too many
    num = len(errors)
    if num > 100:
        num = num % 100
    return num
Example 28
def check_grammar_of_str(string):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(string)
    score = 0
    for match in matches:
        if match.ruleId != "MORFOLOGIK_RULE_EN_US":  # do not count spelling errors
            score += 1
    return score
Example 29
 def __init__(self):
   self.reddit = praw.Reddit(user_agent='Comment History Parser',
                 client_id='nkVxbwp1RsHHCA',
                 client_secret='SlzWUhAhV5nIXPy4_1PTJSOaLrA')
   self.sid = SentimentIntensityAnalyzer()
   self.tool = language_check.LanguageTool('en-US')
   self.LDA = LDA_predict()
   self.txc = toxicity_classifier()
Example 30
def add_percent_typos_to_words_column(fake_news_df):
    language_tool = language_check.LanguageTool("en-US")

    fake_news_df["percent_typos_to_words"] = fake_news_df["article_text"].apply(
        lambda x: typos_to_words(language_tool, x)
    )

    return fake_news_df
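add_percent_typos_to_words_column relies on a typos_to_words helper that is not shown; a minimal sketch (an assumption, mirroring the typos-to-words ratio computed in Example 17) could look like this:

import textstat

def typos_to_words(language_tool, text):
    # Hypothetical helper: LanguageTool matches per word, guarding against empty text.
    words = textstat.lexicon_count(text)
    if words == 0:
        return 0.0
    return len(language_tool.check(text)) / words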