def features(self):
    """Extract baseline features"""
    sp_en = spacy.load("en")
    sp_de = spacy.load("de")
    en_checker = language_check.LanguageTool("en-GB")
    ge_checker = language_check.LanguageTool("de-DE")
    ft = self.df.copy()
    # Sentences without punctuation
    ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(
        lambda x: x.lower().translate(str.maketrans("", "", string.punctuation)))
    # Number of tokens
    ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
    ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))
    count = lambda l1, l2: sum(1 for x in l1 if x in l2)
    # Number of punctuation characters
    ft["src_#punc"] = ft["src"].apply(
        lambda x: count(x, set(string.punctuation)))
    ft["tgt_#punc"] = ft["tgt"].apply(
        lambda x: count(x, set(string.punctuation)))
    # Sentiment analysis (TBD/TBE are assumed to be TextBlob-style analyzers
    # for the target and source languages, aliased at import time)
    ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
    ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
    ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()
    # spaCy encoding
    ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
    ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))
    # POS and entity differences between source and target
    ft["sp_pos_diff"] = [
        spacy_parser(x, y, "pos_") for x, y in zip(ft["src_sp"], ft["tgt_sp"])
    ]
    ft["sp_ent_diff"] = [
        spacy_parser(x, y, "ents") for x, y in zip(ft["src_sp"], ft["tgt_sp"])
    ]
    # Proofread errors
    ft["src_gram_err"] = ft["src"].apply(
        lambda x: len(en_checker.check(x)))
    ft["tgt_gram_err"] = ft["tgt"].apply(
        lambda x: len(ge_checker.check(x)))
    # Features of interest
    foi = [
        "src_len",
        "tgt_len",
        "src_#punc",
        "tgt_#punc",
        "tgt_polar",
        "src_polar",
        "src_gram_err",
        "tgt_gram_err",
        "sp_pos_diff",
        "sp_ent_diff",
    ]
    features = ft[foi].values
    # Scale each feature to [0, 1] and return the scaled matrix
    normalized_features = MinMaxScaler().fit_transform(features)
    return normalized_features
def correct(text, reps):
    corrected = []
    wrongN = 0
    sentenceN = 0
    mistakesN = 0
    rulesApplied = []
    replacements = []
    types = []
    noMistakes = []
    tool = language_check.LanguageTool('en-US')
    for sentence in text:
        matches = tool.check(sentence)
        if len(matches) > 0:
            corrected.append(language_check.correct(sentence, matches))
            wrongN += 1
            for rule in matches:
                mistakesN += 1
                rulesApplied.append(rule.ruleId)
                types.append(rule.category)
                if reps:
                    new = rule.replacements
                    old = sentence[rule.fromx:rule.tox]
                    replacements.append((old, new, sentenceN))
        else:
            noMistakes.append(sentenceN)
        if sentenceN % 100 == 0:
            print("sentence " + str(sentenceN) + " was corrected")
        sentenceN += 1
    stats = [wrongN, mistakesN, rulesApplied, types, replacements, noMistakes]
    return corrected, stats
async def grammar_check(to_fix):
    if not to_fix.text[0].isalpha() and to_fix.text[0] not in ("/", "#", "@", "!"):
        reply = False
        textx = await to_fix.get_reply_message()
        if textx:
            message = str(textx.message)
            reply = True
        elif to_fix.pattern_match.group(2):
            message = to_fix.pattern_match.group(2)
        else:
            await to_fix.edit(
                "```Give a text to fix!\nReplying to your message will fix "
                "and edit it, while giving an inline text will output the "
                "fixed version of it.```")
            return  # nothing to check
        tool = language_check.LanguageTool('en-GB')
        matches = tool.check(message)
        result = language_check.correct(message, matches)
        if reply:
            me = await bot.get_me()
            if textx.from_id == me.id:
                await textx.edit(result)
                await to_fix.delete()
            else:
                await to_fix.edit("Did you mean? \n\n`" + result + "`")
        else:
            await to_fix.edit(result)
def compromise(self):
    classCompromise = Compromise()
    qArry = classCompromise.qArry()
    aArry = classCompromise.aArry()
    random_index = randrange(0, len(qArry))
    questionStr = qArry[random_index]
    answerStr = aArry[random_index]
    # Validate the generated question/answer pair with LanguageTool
    tool = language_check.LanguageTool('en-US')
    matchesQuestion = tool.check(questionStr)
    matchesAnswer = tool.check(answerStr)
    questionStr = language_check.correct(questionStr, matchesQuestion)
    answerStr = language_check.correct(answerStr, matchesAnswer)
    return "Question: " + questionStr + " </br> answer: " + answerStr
def clean_output_text(output_text, use_language_tool=False):
    '''
    Post-processing to clean up the output returned by the text generation
    program. This uses the rule-based grammar checking of LanguageTool to
    correct minor capitalization and tense issues in the output text.

    Parameters
    ----------
    output_text : str
        The raw generated text.
    use_language_tool : bool
        Whether to use LanguageTool to automatically clean up the output text.

    Returns
    -------
    str
        The cleaned-up text.
    '''
    # `replacements`, `to_replace`, `rep_pairs`, and `has_language_tool`
    # are module-level names defined elsewhere in the original project
    swappairs = zip(replacements, to_replace)
    for member in swappairs:
        output_text = output_text.replace(member[0], member[1])
    for member in rep_pairs:
        output_text = output_text.replace(member[0], member[1])
    if has_language_tool and use_language_tool:
        tool = language_check.LanguageTool('en-US')
        matches = tool.check(output_text)
        output_text = language_check.correct(output_text, matches)
    output_text = str(output_text)
    return output_text
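# clean_output_text relies on module-level names defined elsewhere in the
# original project. A minimal sketch of plausible definitions, purely an
# assumption so the snippet can run standalone (the real tables differ):
replacements = ["<unk>"]      # hypothetical tokens emitted by the generator
to_replace = [""]             # what each token is swapped to
rep_pairs = [("  ", " ")]     # hypothetical (old, new) cleanup pairs
try:
    import language_check
    has_language_tool = True
except ImportError:
    has_language_tool = False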
def correct_grammar(self, poem):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(poem)
    # iterate over the Match objects directly; indexing the list with a
    # Match (as the original did) raises a TypeError
    for match in matches:
        print(match)
    new_poem = language_check.correct(poem, matches)
    return new_poem
def mutate_synonym(member: str, nlp):
    # nltk.download('wordnet')
    words = member.split(' ')
    # Pick a random word in the text
    locus = rand.randrange(0, len(words))
    rand_word = words[locus]
    # Get a unique list of synonyms to rand_word
    synonyms = list(
        set(lemma.name() for syn in wordnet.synsets(rand_word)
            for lemma in syn.lemmas()))
    if synonyms:
        words[locus] = synonyms[rand.randrange(0, len(synonyms))]
    # Grammar checking: repair any damage the substitution introduced
    text = ' '.join(words)
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(text)
    corrected_text = language_check.correct(text, matches)
    if text != corrected_text:
        print(f'Corrected Text: {corrected_text} | Original Text: {text}')
    # return the corrected text so the grammar pass actually takes effect
    return corrected_text
def error_stats(inputpath, lang, output_path):
    """
    Creates three text files with information of different errors in input texts.
    :param inputpath: path to folder with input data
    :param lang: string with name of language, e.g. 'de'
    :param output_path: path to the output text files
    """
    files = os.listdir(inputpath)  # input files
    checker = language_check.LanguageTool(lang)
    rules = {}
    locqualityissuetypes = {}
    categories = {}
    for file in files:
        if file.endswith(".txt"):
            with open(os.path.join(inputpath, file)) as fh:
                text = fh.read()
            matches = checker.check(text)
            for match in matches:
                rule = match.ruleId
                loc = match.locqualityissuetype
                cat = match.category
                rules[rule] = rules.get(rule, 0) + 1
                locqualityissuetypes[loc] = locqualityissuetypes.get(loc, 0) + 1
                categories[cat] = categories.get(cat, 0) + 1
    write_featurelist(output_path + lang + "-rules.txt", sorted(rules.keys()))
    write_featurelist(output_path + lang + "-locquality.txt",
                      sorted(locqualityissuetypes.keys()))
    write_featurelist(output_path + lang + "-errorcats.txt",
                      sorted(categories.keys()))
def grammar_check(self, data): tool = lan.LanguageTool(self.in_lan) matches = tool.check(data) if len(matches) > 0: data = lan.correct(data, matches) if self.in_lan in self.sc_lang: data = SpellChecker(language=self.in_lan).correction(data) return data
def process_data(self, tweets_and_date, dialect):
    """Scan tweets with LanguageTool and print the rule matches."""
    scanner = language_check.LanguageTool(dialect)
    print("Chosen language/dialect: " + str(dialect))
    for tweet in tweets_and_date:
        # strip non-ASCII characters before checking
        matches = scanner.check(tweet['text'].encode(
            'ascii', 'ignore').decode('ascii'))
        for match in matches:
            print("----------------")
            print("Context: ")
            print(match.context)
            print("Rule Id:" + str(match.ruleId))
            print("Category: " + match.category)
            print("Based upon language/grammar user may have meant: ")
            did_you_mean = ""
            if match.replacements:
                for m in match.replacements:
                    did_you_mean = did_you_mean + m + ' ,'
            print(did_you_mean)
def depermute_input(mrs, sents, predictions, num_permutes):
    new_mr = []
    new_sent = []
    new_pred = []
    x = 0
    # 'en-GB' is the LanguageTool code for British English ('en-UK' is not)
    tool = language_check.LanguageTool('en-GB')
    base = max(int(len(predictions) * .1), 1)
    benchmarks = [base * i for i in range(1, 11)]
    while x < len(predictions):
        if x in benchmarks:
            curr_state = x // base
            print("Depermute processing is " + str(10 * curr_state) + "% done.")
        scores = {}
        for i in range(0, num_permutes):
            scores[x + i] = score_output(mrs[x // num_permutes],
                                         sents[x // num_permutes],
                                         predictions[x + i],
                                         tool,
                                         correction=False)
        top_score = max(scores.keys(), key=(lambda key: scores[key]))
        new_mr.append(mrs[top_score // num_permutes])
        new_sent.append(sents[top_score // num_permutes])
        new_pred.append(predictions[top_score])
        x += num_permutes
    return new_mr, new_sent, new_pred
def check_grammar(line, max_errors=2):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(line)
    return len(matches) <= max_errors
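# check_grammar builds a new LanguageTool instance (and hence a Java
# backend) on every call, which dominates runtime for short inputs. A
# minimal sketch of a cached variant; the helper name is hypothetical and
# not part of the original code:
import language_check

_SHARED_TOOL = language_check.LanguageTool('en-US')

def check_grammar_cached(line, max_errors=2):
    """Same predicate as check_grammar, reusing one checker instance."""
    return len(_SHARED_TOOL.check(line)) <= max_errors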
def langcheck(input_path, output_path, replace_what, replace_with):
    # 'en-GB' is the LanguageTool code for British English ('en-UK' is not)
    tool = language_check.LanguageTool('en-GB')
    files = glob.glob(input_path)
    for file in files:
        f1 = open(file, 'r')
        filename = os.path.basename(file)
        outputpath = output_path
        # note: replace_what/replace_with are currently unused; 'e'/'l' are hardcoded
        filename = replace_last(filename, 'e', 'l')
        text = f1.read()
        matches = tool.check(text)
        print(len(matches))
        outputpath += filename
        temp = language_check.correct(text, matches)
        f2 = open(outputpath, 'w')
        f2.write(temp)
        f2.close()
        print(filename, "is created")
        f1.close()
def create_ppdb_dataset(dataset_file, all_sentences_file, output_file):
    tool = language_check.LanguageTool('en-US')
    all_sentences = open(all_sentences_file).readlines()
    with open(output_file, 'w') as file_writer, open(dataset_file) as file_reader:
        for line in file_reader:
            info = line.split("|||")
            if info[0].rstrip() == "[S]" and info[5].strip().lower() == "equivalence":
                # strip hyphens and surrounding whitespace from every sentence
                sentence1 = info[1].replace("-", "").strip()
                sentence2 = info[2].replace("-", "").strip()
                random_sentence = np.random.choice(all_sentences, 1)[0]
                random_sentence = random_sentence.replace("-", "").strip()
                sentence1 = tool.correct(sentence1)
                sentence2 = tool.correct(sentence2)
                random_sentence = tool.correct(random_sentence)
                file_writer.write(sentence1.strip() + "\t" + sentence2.strip() +
                                  "\t" + random_sentence.strip() + "\t0\n")
def correct_statement(statement):
    if len(statement.strip()) == 0:
        return ""
    global _tool
    try:
        matches = _tool.check(statement)
    except Exception:
        # LanguageTool's Java backend occasionally wedges; tear it down,
        # restart it, and retry the statement once the new instance is up.
        try:
            logging.error("Problem with LanguageTools for " + statement)
            time.sleep(60)
            try:
                del _tool
            except NameError:
                pass
            time.sleep(60)
            call(["killall", "java"])
            time.sleep(60)
            try:
                _tool = language_check.LanguageTool('en-US')
            except Exception:
                raise
            return correct_statement(statement)
        except Exception:
            logging.error("Problem with LanguageTools for " + statement)
            raise
    return language_check.correct(statement, matches).lower()
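# correct_statement assumes a module-level checker. A minimal sketch of the
# setup the original module presumably performs near its imports:
import logging
import time
from subprocess import call

import language_check

_tool = language_check.LanguageTool('en-US')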
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous
    reading scores and methods. Also calculates other factors such as the
    number of typos.
    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the
        readability scores and the typo and difficult-word ratios listed below
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading":
        textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid":
        textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau":
        textstat.coleman_liau_index(article_contents),
        "typos_to_words":
        len(tool.check(article_contents)) /
        textstat.lexicon_count(article_contents),
        "percent_difficult_words":
        textstat.difficult_words(article_contents) /
        textstat.lexicon_count(article_contents),
    }
    return language_analysis_dict
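# Hypothetical usage of get_text_features; the sample text is illustrative
# and the exact scores depend on the textstat and LanguageTool versions:
features = get_text_features(
    "The quick brown fox jumps over the lazy dog. It was a sunny day.")
print(features["flesch_reading"], features["typos_to_words"])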
def get_all_features(dirpath, lang, categories, locqualityissuetypes, rules):
    """
    Get features and language levels for a language
    :param dirpath: path to the folder with input files
    :param lang: string with name of language, e.g. 'de'
    :param categories: dictionary of categories
    :param locqualityissuetypes: dictionary of issue types
    :param rules: dictionary of rules
    :return: two lists: the language levels and the features of all files
        in the folder
    """
    files = os.listdir(dirpath)
    all_feats = []  # list of feature dictionaries of all files in the folder
    all_cats = []  # list of language levels of all files in the folder
    checker = language_check.LanguageTool(lang)
    for f in files:
        if f.endswith(".txt"):
            text = open(os.path.join(dirpath, f)).read()
            matches = checker.check(text)
            for match in matches:
                rule = match.ruleId
                loc = match.locqualityissuetype
                cat = match.category
                rules[rule] = rules.get(rule, 0) + 1
                locqualityissuetypes[loc] = locqualityissuetypes.get(loc, 0) + 1
                categories[cat] = categories.get(cat, 0) + 1
            all_feats.append(get_file_feats(rules, locqualityissuetypes, categories))
            all_cats.append(f.split(".txt")[0][-2:])
            # reset all values in the dictionaries
            rules = reset_dict(rules)
            categories = reset_dict(categories)
            locqualityissuetypes = reset_dict(locqualityissuetypes)
    return all_cats, all_feats
def grammar(data):
    tool = language_check.LanguageTool('en-US')
    for d in data:
        text = d[2]
        matches = tool.check(text)
        d.append(len(matches))
    return data
def main(file):
    # file = 'substinces.PNG'
    # opening an image from the source path
    img = Image.open(file)
    dot = file.index(".")
    name = file[0:dot]
    tool = language_check.LanguageTool('en-US')
    # path where the tesseract module is installed
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'
    # converts the image to text and saves it into the result variable
    result = pytesseract.image_to_string(img)
    # write text in a text file and save it to source path
    with open(name + '.txt', mode='w') as f:
        f.write(result)
    with open(name + '.txt', mode='r') as f:
        orig = f.read()
    matches = tool.check(orig)
    orig = language_check.correct(orig, matches)
    with open(name + '.txt', mode='w') as f:
        f.write(orig)
    return str(len(matches)) + " mistakes were found."
def errorCount(essay):
    tool = language_check.LanguageTool('en-US')
    checker = []
    matches = tool.check(essay)
    for match in matches:
        checker.append(match.fromx)
    # count distinct error positions
    return len(set(checker))
def long_suggestion_grammar_check(text):
    """
    This method returns a longer list of suggestions, which can then be
    looped through if desired.

    Parameters
    ----------
    text : str
        The text that is inputted. Should be unformatted.

    Returns
    -------
    list of str
        Returns a list of suggestions.
    """
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(text)
    for i, match in enumerate(matches):
        fromy = match.fromy + 1
        fromx = match.fromx + 1
        ruleId = match.ruleId
        # guard against rules that offer no replacement
        replacement = match.replacements[0] if match.replacements else ""
        matches[i] = ("Line {}, column {}, Rule ID: {}[{}]\n"
                      "Message: Did you mean '{}'?\n"
                      "Suggestion: {}").format(fromy, fromx, ruleId, i,
                                               replacement, replacement)
    return matches
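# Hypothetical usage of long_suggestion_grammar_check; the sample sentence
# is illustrative and the exact suggestions depend on the rule set:
for suggestion in long_suggestion_grammar_check("This are an test."):
    print(suggestion)
    print("----")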
def __init__(self, tracker, random_seed=None): self.tracker = tracker if random_seed is None: random_seed = random.randint(1, 2**31 - 1) self.seed = random_seed random.seed(random_seed) self.tool = language_check.LanguageTool("en-US")
def QuestionConfirmView(request):
    if request.method == 'POST':
        # Receive data from client
        question = request.POST.get('myquestion')
        print("question = ", question)
        tool = language_check.LanguageTool('en-US')
        texts = question
        matches = tool.check(texts)
        confirm_ques = language_check.correct(texts, matches)
        print("question_match = ", confirm_ques)
        context = {'confirm_ques': confirm_ques}
        return render(request, 'question/question_confirm.html', context)
    else:
        print("this is the GET branch of the view")
        print("request.POST", request)
        confirm_question = request.GET.get("confirm_ques")
        print("confirm_question", confirm_question)
        context = {'ram': 'ram'}
        return render(request, 'question/questioncreate.html', context)
def check(data): tool = language_check.LanguageTool('en-GB') matches = tool.check(data) print(matches) print(language_check.correct(data, matches)) return len(matches)
def n_grammar_errors(text):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(text)
    count = 0
    for m in matches:
        # count only matches LanguageTool files under the Grammar category
        if m.category == "Grammar":
            count += 1
    return count
def language_check(text):
    tool = LChecker.LanguageTool('en-US')
    errors = tool.check(text)
    # We need to determine how many errors is too many
    num = len(errors)
    if num > 100:
        num = num % 100
    return num
def check_grammar_of_str(string): tool = language_check.LanguageTool('en-US') matches = tool.check(string) score = 0 for match in matches: if not match.ruleId == "MORFOLOGIK_RULE_EN_US": # do not count any spell errors score += 1 return score
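# Hypothetical usage of check_grammar_of_str: the misspelling below triggers
# MORFOLOGIK_RULE_EN_US and is ignored, so only non-spelling matches (here,
# the subject-verb disagreement) contribute to the score. Exact counts
# depend on the LanguageTool version and rule set.
print(check_grammar_of_str("He go to the storre every day."))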
def __init__(self): self.reddit = praw.Reddit(user_agent='Comment History Parser', client_id='nkVxbwp1RsHHCA', client_secret='SlzWUhAhV5nIXPy4_1PTJSOaLrA') self.sid = SentimentIntensityAnalyzer() self.tool = language_check.LanguageTool('en-US') self.LDA = LDA_predict() self.txc = toxicity_classifier()
def add_percent_typos_to_words_column(fake_news_df): language_tool = language_check.LanguageTool("en-US") fake_news_df["percent_typos_to_words"] = fake_news_df["article_text"].apply( lambda x: typos_to_words(language_tool, x) ) return fake_news_df
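# Hypothetical usage of add_percent_typos_to_words_column, assuming the
# original module's typos_to_words helper (which presumably divides the
# number of LanguageTool matches by the word count) is in scope:
import pandas as pd

fake_news_df = pd.DataFrame(
    {"article_text": ["Thiss sentence has a typo.", "This one is clean."]})
fake_news_df = add_percent_typos_to_words_column(fake_news_df)
print(fake_news_df["percent_typos_to_words"])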