def statistics(self, text):
    self.asl = textstat.avg_sentence_length(text)
    self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
    self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
    self.difficult_words = textstat.difficult_words(text)
    self.lexicon_count = textstat.lexicon_count(text)
    self.polysyllable_count = textstat.polysyllabcount(text)
    self.sentence_count = textstat.sentence_count(text)
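# A hypothetical usage sketch for the method above: the same textstat calls
# applied directly to a sample passage (the passage itself is an assumption,
# not part of the original code), showing the kind of values each attribute
# would hold.
import textstat

sample = ("Readability formulas estimate how hard a text is to read. "
          "Most of them rely on sentence length and syllable counts.")
print(textstat.avg_sentence_length(sample))     # average words per sentence
print(textstat.avg_sentence_per_word(sample))   # sentences per word
print(textstat.avg_syllables_per_word(sample))  # average syllables per word
print(textstat.difficult_words(sample))         # words outside the easy-word list
print(textstat.lexicon_count(sample))           # word count
print(textstat.polysyllabcount(sample))         # words with three or more syllables
print(textstat.sentence_count(sample))          # sentence count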
def test_avg_syllables_per_word(self):
    avg = textstat.avg_syllables_per_word(self.long_test)
    self.assertEqual(1.4, avg)
def test_avg_syllables_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_syllables_per_word(long_test)
    assert avg == 1.4
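# Context for the assertions above: in textstat, avg_syllables_per_word is
# essentially syllable_count divided by lexicon_count, rounded to one decimal
# place. A small self-check sketch (the sample sentence is an assumption):
import textstat

def manual_avg_syllables_per_word(text):
    # Recompute the ratio from the two primitive counters.
    return round(textstat.syllable_count(text) / textstat.lexicon_count(text), 1)

sample = "The quick brown fox jumps over the lazy dog."
print(manual_avg_syllables_per_word(sample))
print(textstat.avg_syllables_per_word(sample))  # expected to match the manual value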
from collections import Counter
from typing import Optional

import nltk
import textstat
from nltk import tokenize


def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)
    unique_word_count = compute_unique_word_count(f_dist.most_common())
    paragraph_count = max(len(data_text.split('\n')), len(data_text.split('\r\n')))
    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(
        fleschKincaid=textstat.flesch_kincaid_grade(data_text),
        gunningFog=textstat.gunning_fog(data_text),
        colemanLiau=textstat.coleman_liau_index(data_text),
        smog=textstat.smog_index(data_text),
        ari=textstat.automated_readability_index(data_text),
        rix=textstat.rix(data_text),
        # Approximation, needs checking: the published FORCAST formula is
        # 20 - (monosyllabic words per 150-word sample) / 10.
        forecastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2),
        # Needs checking: the published Powers-Sumner-Kearl formula is
        # 0.0778 * ASL + 0.0455 * (syllables per 100 words) + 2.7971.
        powersSumnerKearlGrade=round(
            textstat.avg_sentence_length(data_text)
            + textstat.avg_syllables_per_word(data_text) + 2.7971, 2),
        raygorReadability=count_raygor_readability(divided_text),
        fryReadability=count_fry_readability(divided_text),  # needs checking
        flesch=textstat.flesch_reading_ease(data_text),
    )

    # Readability Scores
    readability_scores = dict(
        readableRating=count_average_grade_levels(readability_grade_levels),
        fleschReadingEase=textstat.flesch_reading_ease(data_text),
        cefrLevel=count_cefr_levels(readability_grade_levels),
        ieltsLevel=count_ielts_levels(readability_grade_levels),
        spacheScore=round(textstat.spache_readability(data_text), 2),
        newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text),
        lixReadability=textstat.lix(data_text),
        lensearWrite=textstat.linsear_write_formula(data_text),
    )

    # Text Statistics
    text_statistics = dict(
        characterCount=textstat.char_count(data_text),
        syllableCount=textstat.syllable_count(data_text),
        wordCount=textstat.lexicon_count(data_text),
        uniqueWordCount=unique_word_count,
        sentenceCount=textstat.sentence_count(data_text),
        paragraphCount=paragraph_count,
    )

    # Timings
    timings_statistics = dict(
        readingTime=reading_time(textstat.lexicon_count(data_text)),
        speakingTime=speaking_time(textstat.lexicon_count(data_text)),
    )

    # Text Composition (grouped by Penn Treebank POS tags)
    text_composition = dict(
        adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0),
        adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0),
        conjunctions=counts.get('CC', 0),
        determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0),
        interjections=counts.get('UH', 0),
        nouns=counts.get('NN', 0) + counts.get('NNS', 0),
        verbs=(counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0)
               + counts.get('VBN', 0) + counts.get('VBP', 0) + counts.get('VBZ', 0)),
        properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0),
        prepositions=counts.get('IN', 0),
        pronouns=(counts.get('PRP', 0) + counts.get('PRP$', 0)
                  + counts.get('WP', 0) + counts.get('WP$', 0)),
        qualifiers=counts.get('RB', 0),
        unrecognised=counts.get(None, 0),
        nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0),
    )

    # Readability Issues
    text_readability_issues = dict(
        sentences30SyllablesCount=0, sentences20SyllablesCount=0,
        sentences30Syllables=[], sentences20Syllables=[],
        words4SyllablesCount=0, words12LettersCount=0,
        words4Syllables=[], words12Letters=[],
    )
    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = \
        count_sentences_syllables(divided_text)
    sentences_30_syllables = find_limit_offcet(
        data_text, sentences_30_syllables,
        "sentences_30_syllables", "sentences_30_syllables",
        "This sentence has more than 30 syllables. Consider rewriting it to be "
        "shorter or splitting it into smaller sentences.",
        "Readability Issues")
    sentences_20_syllables = find_limit_offcet(
        data_text, sentences_20_syllables,
        "sentences_20_syllables", "sentences_20_syllables",
        "This sentence has more than 20 syllables. Consider rewriting it to be "
        "shorter or splitting it into smaller sentences.",
        "Readability Issues")
    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)

    words_12_letters, words_12_count, words_4_syllables, words_4_count = \
        words_sentence_syllables(divided_text)
    words_12_letters = find_limit_offcet(
        data_text, words_12_letters, "words_12_letters", "words_12_letters",
        "This word is longer than 12 letters.", "Readability Issues")
    words_4_syllables = find_limit_offcet(
        data_text, words_4_syllables, "words_4_syllables", "words_4_syllables",
        "This word has more than 4 syllables.", "Readability Issues")
    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voices_return = find_passives(divided_text)
    passive_voices_return = find_limit_offcet(
        data_text, passive_voices_return, "passive_voises", "passive_voises",
        "Too much use of the passive voice.", "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(
        data_text, adverbs_return, "adverbs", "adverbs",
        "Too many adverbs.", "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voices_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(
        charactersPerWord=textstat.avg_character_per_word(data_text),
        syllablesPerWord=textstat.avg_syllables_per_word(data_text),
        wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
        wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraph_count, 2),
        sentencesPerParagraph=round(len(divided_text) / paragraph_count, 2),
    )

    # Language Issues (the spelling/grammar counts are filled in elsewhere)
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = (sentences_20_syllables + sentences_30_syllables
                            + words_4_syllables + words_12_letters
                            + passive_voices_return + adverbs_return)

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
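# A hypothetical invocation sketch for get_redability_assessments: it assumes
# the NLTK tokenizer and tagger data are downloaded and that the module-level
# helpers used above (TAGGER, find_limit_offcet, count_cefr_levels, and so on)
# are defined; the sample text is an assumption for illustration.
if __name__ == "__main__":
    sample = ("Readability assessment combines several classic formulas. "
              "Each one weighs sentence length and word complexity differently.")
    report = get_redability_assessments(sample)
    if report is not None:
        print(report["readabilityGradeLevels"]["fleschKincaid"])
        print(report["readabilityScores"]["fleschReadingEase"])
        print(report["textStatistics"]["wordCount"])
        print(len(report["matches"]))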
import os
import pickle
from glob import glob
from os.path import dirname
from xml.dom import minidom

import docx2txt
import glob2
import pandas as pd
import spacy
import textstat
from tqdm import tqdm

# Project helpers (convert_to_docx, writing_preprocessed_paragraphs,
# Grammar_and_Language, get_tokens_lengths, metadata_xml, title_ext, abst_ext,
# Topic_noun_verb, word_count, BASE_DIR_PATH) are assumed importable from
# this package.


def Getting_Score_Result(folder_path, abs_file):
    ######
    # Preprocessing
    ######
    path = abs_file
    # Convert legacy .doc files to .docx first, if any.
    if path.endswith(".doc"):
        files = glob2.glob(path)
        path = convert_to_docx(files)

    files = glob2.glob(path)
    preprocessed_flag = 0
    for file in files:
        if file.find("preprocessed") != -1:
            # Preprocessing has already been carried out.
            preprocessed_flag = 1
            break

    if preprocessed_flag == 0:
        for file in files:
            if file.find('preprocessed') == -1:  # skip already-preprocessed files
                file_name = file.split("/")[-1]
                try:
                    writing_preprocessed_paragraphs(file)
                except Exception as e:
                    print("Bad Word Document. The specific error is ", e)
                    result = [(file, file_name, 0, 0, 0, 0, 0, 0, 0,
                               "Bad word formatting")]
                    result_df = pd.DataFrame(
                        result,
                        columns=['File Path', 'File Name', 'Lang/Grammar', 'Topic',
                                 'No of words', 'Journal code', 'author nation',
                                 'article type', 'CE', 'Obtained Level'])
                    file_topic = [(file, file_name, 0, 0, "Bad word formatting")]
                    file_topic_df = pd.DataFrame(
                        file_topic,
                        columns=['File Path', 'File Name', 'Title', 'Abstract',
                                 'obtained Level'])
                    return result_df, file_topic_df

    preprocess_path = path + "__preprocessed.docx"
    files = glob2.glob(preprocess_path)
    nlp = spacy.load('en_core_web_sm')
    grmrerr_df = pd.DataFrame()
    file_topic_df = pd.DataFrame()
    result_df = pd.DataFrame()
    result = []
    file_grammar_and_lang = []
    file_topic = []
    file_grammar_and_lang_df = pd.DataFrame()
    total_file = 0

    # Load the passive-sentence tagger from the bundled ispassive module.
    import importlib.machinery
    loader = importlib.machinery.SourceFileLoader(
        'report', os.path.join(BASE_DIR_PATH, 'ispassive-master', 'ispassive.py'))
    handle = loader.load_module('report')
    t = handle.Tagger()

    # Iterate through the files and compute the level of each one.
    for file in tqdm(files):
        file_name = file.split("/")[-1].split("__")[0]
        file_path = file
        total_file = total_file + 1
        text = docx2txt.process(file)
        doc = nlp(text)
        sents = list(doc.sents)
        total_no_sent = len(sents)

        # a1 - grammar and language check
        gl = Grammar_and_Language(file)
        # Number of grammar errors, averaged over sentences.
        tmp_grmrerr_df, no_grmr_error = gl.grammar()
        grmrerr_df = grmrerr_df.append(tmp_grmrerr_df)  # pandas < 2.0 API
        avg_grmr_error = no_grmr_error / total_no_sent
        # Average number of syllables per word.
        avg_syllabel_per_word = textstat.avg_syllables_per_word(text)
        # Parse-tree height statistics for the document.
        avg_prse_tree_height, percnt_len_perse_less_than_3, \
            percnt_len_perse_between_3_6, percnt_len__perse_between_7_9, \
            percnt_len_perse_greater_than_9 = gl.parse_tree(doc)

        len_sent = 0
        sent_len_lt_ten = 0
        sent_len_gt_forty = 0
        all_distances = 0
        list_all_distances = []
        passive_sent = []
        # Sentence length and distance between subject and root verb.
        for i in range(len(sents)):
            s = str(sents[i])
            passive_sent.append(t.is_passive(s))
            len_s = get_tokens_lengths(s)
            len_sent = len_sent + len_s
            if len_s < 10:
                sent_len_lt_ten = sent_len_lt_ten + 1
            elif len_s > 40:
                sent_len_gt_forty = sent_len_gt_forty + 1
            # Distance between subject and ROOT.
            dist = gl.distnce_sub_root(s)
            all_distances = all_distances + dist
            list_all_distances.append(dist)
        avg_len_noun_verb = all_distances / len(sents)

        # Proportion of passive sentences.
        no_of_passive_sent = 0
        for p in passive_sent:
            if str(p) == "True":
                no_of_passive_sent = no_of_passive_sent + 1
        avg_passive_sent = no_of_passive_sent / total_no_sent
        avg_word_per_sent = len_sent / total_no_sent
        per_sent_len_lt_ten = (sent_len_lt_ten / total_no_sent) * 100
        per_sent_len_gt_forty = (sent_len_gt_forty / total_no_sent) * 100

        # Distribution of subject-to-root distances.
        df3_distance = pd.DataFrame({"dist": list_all_distances})
        dist_1 = len(df3_distance[df3_distance["dist"] <= 1])
        dist_2 = len(df3_distance[df3_distance["dist"] == 2])
        dist_3 = len(df3_distance[df3_distance["dist"] == 3])
        dist_4_6 = len(df3_distance[(df3_distance["dist"] >= 4)
                                    & (df3_distance["dist"] <= 6)])
        dist_greater_than_6 = len(df3_distance[df3_distance["dist"] > 6])
        perc_dist_1 = round(dist_1 / len(df3_distance) * 100, 2)
        perc_dist_2 = round(dist_2 / len(df3_distance) * 100, 2)
        perc_dist_3 = round(dist_3 / len(df3_distance) * 100, 2)
        perc_dist_4_6 = round(dist_4_6 / len(df3_distance) * 100, 2)
        perc_dist_greater_than_6 = round(
            dist_greater_than_6 / len(df3_distance) * 100, 2)

        tmp_results = [
            avg_grmr_error, avg_syllabel_per_word, avg_word_per_sent,
            avg_prse_tree_height, percnt_len_perse_less_than_3,
            percnt_len_perse_between_3_6, percnt_len__perse_between_7_9,
            percnt_len_perse_greater_than_9, avg_len_noun_verb,
            perc_dist_1, perc_dist_2, perc_dist_3, perc_dist_4_6,
            perc_dist_greater_than_6, avg_passive_sent
        ]
        # Score the language features with the pre-trained decision tree.
        with open(os.path.join(BASE_DIR_PATH, 'decision_tree_model',
                               'dt_set1_set2_set3_depth9.pkl'), 'rb') as fid:
            dt_loaded = pickle.load(fid)
        a1 = dt_loaded.predict([tmp_results])

        grammar_and_lang2 = [
            file_path, file_name, avg_grmr_error, avg_syllabel_per_word,
            avg_word_per_sent, avg_prse_tree_height, percnt_len_perse_less_than_3,
            percnt_len_perse_between_3_6, percnt_len__perse_between_7_9,
            percnt_len_perse_greater_than_9, avg_len_noun_verb, perc_dist_1,
            perc_dist_2, perc_dist_3, perc_dist_4_6, perc_dist_greater_than_6,
            avg_passive_sent, a1[0]
        ]
        file_grammar_and_lang.append(grammar_and_lang2)

        # b - number of words in the unprocessed document
        pos = file.rfind("__preprocessed.docx")
        unprocessed_file = file[0:pos]
        full_text = docx2txt.process(unprocessed_file)
        b = word_count(full_text)

        # c, d, e - journal code, author nation and article type from the
        # metadata XML next to the document.
        x = metadata_xml()
        try:
            xml_path = glob(os.path.join(dirname(file), '*.xml'))
            mydoc = minidom.parse(xml_path[0])
            c = x.journal_code(mydoc)
            d = x.authors_nation(mydoc)
            e = x.type_article(mydoc)
            title = x.article_title(mydoc)
            print(" Score based on metadata extraction for ", file, c, d, e)
        except Exception:
            c = 2
            d = 2
            e = 2
            title = title_ext(file)

        # a2 - topic detection: score the document on the nouns and verbs
        # in its title and abstract.
        abstract = abst_ext(file)
        title_and_abs = title + abstract
        doc = nlp(title_and_abs)
        topic = Topic_noun_verb()
        a2 = topic.term_extraction(doc)
        topic_list = [file_path, file_name, title, abstract, a2]
        file_topic.append(topic_list)

        # Weighted overall score and level thresholds.
        ce = ((0.2 * a1[0]) + (0.2 * a2) + (0.25 * b) + (0.1 * c)
              + (0.15 * d) + (0.1 * e))
        if ce <= 1.66:
            obtained_level = 'EASY'
        elif ce <= 2.33:
            obtained_level = 'INTERMEDIATE'
        else:
            obtained_level = 'DIFFICULT'
        result.append(
            (file_path, file_name, a1[0], a2, b, c, d, e, ce, obtained_level))

    print(result)
    result_df = pd.DataFrame(
        result,
        columns=['File Path', 'File Name', 'Lang/Grammar', 'Topic', 'No of words',
                 'Journal code', 'author nation', 'article type', 'CE',
                 'Obtained Level'])
    file_grammar_and_lang_df = pd.DataFrame(
        file_grammar_and_lang,
        columns=['File Path', 'File Name', 'avg_grmr_error', 'avg_syllabel_per_word',
                 'avg_word_per_sent', 'avg_parse_tree_length',
                 'percnt_len_perse_less_than_3', 'percnt_len_perse_between_3_6',
                 'percnt_len__perse_between_7_9', 'percnt_len_perse_greater_than_9',
                 'avg_len_noun_verb', 'perc_dist_1', 'perc_dist_2', 'perc_dist_3',
                 'perc_dist_4_6', 'perc_dist_greater_than_6', 'avg_passive_sent',
                 'obtained Level'])
    file_topic_df = pd.DataFrame(
        file_topic,
        columns=['File Path', 'File Name', 'Title', 'Abstract', 'obtained Level'])

    # Write the three result sheets to an Excel workbook.
    path = folder_path + "/" + "test.xlsx"
    writer = pd.ExcelWriter(path, engine='xlsxwriter')
    result_df.to_excel(writer, 'Result')
    file_topic_df.to_excel(writer, 'Result_topic')
    file_grammar_and_lang_df.to_excel(writer, 'Result_lang&grammar')
    writer.save()

    # Clean up intermediate files and hide the result workbook by renaming it
    # with a leading dot.
    try:
        fold_path = os.listdir(folder_path + "/")
        for item in fold_path:
            if item.endswith("__preprocessed.docx"):
                os.remove(os.path.join(folder_path + "/", item))
            elif item.endswith("st.xlsx"):
                os.rename(os.path.join(folder_path + "/", item),
                          folder_path + "/." + item)
    except Exception:
        print("problem in delete, hidden conversion")
    print("folder path -->", folder_path)
    print("normal file path -->", path)
    return result_df, file_topic_df
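# A hypothetical driver sketch for Getting_Score_Result: the paths are
# placeholders, and it assumes the en_core_web_sm spaCy model, the decision
# tree pickle and the bundled ispassive module are present where the function
# expects them.
if __name__ == "__main__":
    folder = "/path/to/manuscripts"        # placeholder folder
    doc_path = folder + "/example.docx"    # placeholder document
    result_df, file_topic_df = Getting_Score_Result(folder, doc_path)
    print(result_df[['File Name', 'CE', 'Obtained Level']])
    print(file_topic_df[['File Name', 'Title']])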