Example 1
 def statistics(self, text):
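     # Collect basic readability statistics for the given text using textstat.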
     self.asl = textstat.avg_sentence_length(text)
     self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
     self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
     self.difficult_words = textstat.difficult_words(text)
     self.lexicon_count = textstat.lexicon_count(text)
     self.polysyllable_count = textstat.polysyllabcount(text)
     self.sentence_count = textstat.sentence_count(text)
Example 2
    def test_avg_syllables_per_word(self):
        avg = textstat.avg_syllables_per_word(self.long_test)

        self.assertEqual(1.4, avg)
Example 3
def test_avg_syllables_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_syllables_per_word(long_test)

    assert avg == 1.4
Example 4
from collections import Counter
from typing import Optional

import nltk
import textstat
from nltk import tokenize

# TAGGER and the helper functions used below (compute_unique_word_count,
# count_raygor_readability, find_limit_offcet, etc.) are defined elsewhere
# in the source project.
def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)

    uniqueWordCount = compute_unique_word_count(f_dist.most_common())

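    # Estimate the paragraph count as the number of lines (split on '\n' or '\r\n').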
    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))

    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0, smog=0,
                                    ari=0, forecastGradeLevel=0, powersSumnerKearlGrade=0, rix=0,
                                    raygorReadability=0, fryReadability=0, flesch=0)

    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))

    # need to check
    readability_grade_levels.update(forecastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))

    readability_grade_levels.update(powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) +
                                                                 textstat.avg_syllables_per_word(data_text) +
                                                                 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check
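    # Note: the two calculations above are rough approximations, which is why the
    # original author bracketed them with "need to check". The published FORCAST
    # formula is grade = 20 - (N / 10), where N is the number of single-syllable
    # words in a 150-word sample, and Powers-Sumner-Kearl weights average sentence
    # length and syllables per 100 words with small coefficients rather than
    # summing them directly.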

    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='', ieltsLevel='', spacheScore=0,
                              newDaleChallScore=0, lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0, uniqueWordCount=0,
                           sentenceCount=0, paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0, interjections=0, nouns=0, verbs=0,
                            properNouns=0, prepositions=0, pronouns=0, qualifiers=0, unrecognised=0, nonWords=0)

    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get(
            'VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    text_composition.update(qualifiers=counts.get('RB', 0))
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))
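    # The buckets above follow the standard Penn Treebank tagset used by NLTK:
    # JJ/JJR/JJS = adjectives, RB/RBR/RBS = adverbs, CC = coordinating conjunctions,
    # DT/PDT/WDT = determiners, UH = interjections, NN/NNS = common nouns,
    # VB* = verb forms, NNP/NNPS = proper nouns, IN = prepositions/subordinating
    # conjunctions, PRP/PRP$/WP/WP$ = pronouns, and None/punctuation tags are
    # counted as unrecognised words and non-words respectively.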

    # Readability Issues
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0,
                                   sentences30Syllables=[], sentences20Syllables=[],
                                   words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])

    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = count_sentences_syllables(
        divided_text)

    sentences_30_syllables = find_limit_offcet(data_text, sentences_30_syllables,
                                               "sentences_30_syllables",
                                               "sentences_30_syllables",
                                               "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    sentences_20_syllables = find_limit_offcet(data_text, sentences_20_syllables,
                                               "sentences_20_syllables",
                                               "sentences_20_syllables",
                                               "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")

    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)

    words_12_letters, words_12_count, words_4_syllables, words_4_count = words_sentence_syllables(divided_text)

    words_12_letters = find_limit_offcet(data_text, words_12_letters,
                                         "words_12_letters",
                                         "words_12_letters",
                                         "This word is more than 12 letters",
                                         "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables,
                                          "words_4_syllables",
                                          "words_4_syllables",
                                          "This word is more than 4 syllables",
                                          "Readability Issues")

    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return,
                                              "passive_voises",
                                              "passive_voises",
                                              "Too much of using passive voises",
                                              "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return,
                                       "adverbs",  # writing_style_issues
                                       "adverbs",
                                       "Too much of using adverbs",
                                       "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0, wordsPerSentence=0,
                               wordsPerParagraph=0, sentencesPerParagraph=0)

    text_density_issues.update(charactersPerWord=textstat.avg_character_per_word(data_text),
                               syllablesPerWord=textstat.avg_syllables_per_word(data_text),
                               wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
                               wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
                               sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + words_4_syllables + words_12_letters + \
                           passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
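A minimal usage sketch (the helper functions and TAGGER referenced above come from the surrounding project, and the NLTK "punkt" and "averaged_perceptron_tagger" resources must already be downloaded):

report = get_redability_assessments("Playing games has always been considered "
                                    "important for the development of creative children.")
print(report["readabilityGradeLevels"]["fleschKincaid"])
print(report["textStatistics"]["wordCount"])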
Example 5
def test_avg_syllables_per_word():
    avg = textstat.avg_syllables_per_word(long_test)

    assert avg == 1.4
Example 6
def Getting_Score_Result(folder_path, abs_file):
    ######
    # Preprocessing
    ######
    # path="D:\\PycharmProjects\\CE_RSC_client\\sample files for CE\\Difficult\\*\\*.doc"
    # path=folder_path+"C8EN00688N.doc"

    # print("here folder_path -->", folder_path)
    # print("fname -->", abs_file)

    path = abs_file
    if path.endswith(".doc"):
        files = glob2.glob(path)
        # files="/home/shantakumar/Projects/context_copy_edit_RAC/sample_watcher/2/C8EN00688A.docx"
        # files = glob(os.path.join("D:\\","PycharmProjects", "CE_RSC_client","sample files for CE","DIFFICULT","*","*.doc"))
        # print ("!!!!!!!!!!! Converting doc files to docx files, if any")
        path = convert_to_docx(files)

    # path = "D:\\PycharmProjects\\CE_RSC_client\\sample files for CE 1\\EASY\\C8CE00717A\\C8CE00717A.docx"
    # path = "D:\\PycharmProjects\\CE_RSC_client\\sample files for CE 1\\Difficult\\*.docx"
    # files = glob2.glob(path)
    # files = glob(os.path.join("D:\\","PycharmProjects", "CE_RSC_client","sample files for CE","DIFFICULT","*","*.docx"))

    ######
    # Preprocessing
    ######
    # path=folder_path+"C8EN00688N.docx"
    files = glob2.glob(path)
    preprocessed_flag = 0
    for file in files:
        if (file.find("preprocessed") != -1):
            # print("Preprocessing is already carried out!!")
            preprocessed_flag = 1
            break

    if (preprocessed_flag == 0):
        for file in files:
            # print(file)
            flag = file.find('preprocessed')
            if (flag == -1):  # to eliminate preprocessed file
                # print("Preprocessing ", file)
                file_name = file.split("/")[-1]
                try:
                    writing_preprocessed_paragraphs(file)
                except Exception as e:
                    print("Bad Word Document. The specific error is ", e)

                    result = [(file, file_name, 0, 0, 0, 0, 0, 0, 0,
                               "Bad word formatting")]
                    result_df = pd.DataFrame(
                        result,
                        columns=[
                            'File Path', 'File Name', 'Lang/Grammar', 'Topic',
                            'No of words', 'Journal code', 'author nation',
                            'article type', 'CE', 'Obtained Level'
                        ]
                    )  # columns=['File Path','File Name', 'Lang/Grammar', 'Topic', 'No of words', 'Journal code',\
                    #      'author nation', 'article type', 'CE', 'Obtained Level'])

                    file_topic = [(file, file_name, 0, 0,
                                   "Bad word formatting")]
                    # columns=['File Path','File Name', 'Title', 'Abstract', 'obtained Level'])
                    file_topic_df = pd.DataFrame(file_topic,
                                                 columns=[
                                                     'File Path', 'File Name',
                                                     'Title', 'Abstract',
                                                     'obtained Level'
                                                 ])
                    return result_df, file_topic_df

    # path="D:\\PycharmProjects\\CE_RSC_client\\sample files for CE 1\\Difficult\\*preprocessed.docx"
    # path = "D:\\PycharmProjects\\CE_RSC_client\\sample files for CE\\EASY\\C8CE00717A\\C8CE00717A.docx__preprocessed.docx"
    # files = glob2.glob(path)
    # files = glob(os.path.join("D:\\","PycharmProjects", "CE_RSC_client","sample files for CE","DIFFICULT","*","*preprocessed.docx"))

    preprocess_path = path + "__preprocessed.docx"
    files = glob2.glob(preprocess_path)

    nlp = spacy.load('en_core_web_sm')
    grmrerr_df = pd.DataFrame()
    file_topic_df = pd.DataFrame()
    result_df = pd.DataFrame()
    result = []
    file_grammar_and_lang = []
    file_topic = []
    file_grammar_and_lang_df = pd.DataFrame()

    total_file = 0

    # for computing passive sentences
    import importlib.machinery
    loader = importlib.machinery.SourceFileLoader(
        'report',
        os.path.join(BASE_DIR_PATH, 'ispassive-master', 'ispassive.py'))
    handle = loader.load_module('report')
    t = handle.Tagger()

    # to iterate through and compute level of each file
    for file in tqdm(files):
        # print("result file -->", file)

        file_name = file.split("/")[-1].split("__")[0]  # file_name_ext(file)
        file_path = file
        total_file = total_file + 1

        text = docx2txt.process(file)
        doc = nlp(text)
        sents = list(doc.sents)
        total_no_sent = len(sents)

        # a1 - grammar and language check

        gl = Grammar_and_Language(file)

        # computing the number of grammar errors
        tmp_grmrerr_df, no_grmr_error = gl.grammar()
        grmrerr_df = pd.concat([grmrerr_df, tmp_grmrerr_df])  # DataFrame.append was removed in pandas 2.x
        avg_grmr_error = no_grmr_error / total_no_sent

        # computing the average number of syllables per word
        avg_syllabel_per_word = textstat.avg_syllables_per_word(text)

        # computing the parse tree height of the given doc
        avg_prse_tree_height, percnt_len_perse_less_than_3, percnt_len_perse_between_3_6, percnt_len__perse_between_7_9,\
            percnt_len_perse_greater_than_9 = gl.parse_tree(doc)

        len_sent = 0
        sent_len_lt_ten = 0
        sent_len_gt_forty = 0
        all_distances = 0
        list_all_distances = []
        passive_sent = []

        # print("Computing sentence length & dist bw subject and root verb ...")
        for i in range(len(sents)):
            s = str(sents[i])
            passive_sent.append(t.is_passive(s))

            len_s = get_tokens_lengths(s)
            len_sent = len_sent + (len_s)

            if (len_s < 10):
                # print('10', len_s, s)
                sent_len_lt_ten = sent_len_lt_ten + 1
            elif (len_s > 40):
                # print('40', len_s, s)
                sent_len_gt_forty = sent_len_gt_forty + 1
            '''
            Get all Distance between subject & ROOT
            '''
            dist = gl.distnce_sub_root(s)
            all_distances = all_distances + (dist)
            list_all_distances.append(dist)

        avg_len_noun_verb = all_distances / len(sents)

        # computing the average number of passive sentences
        no_of_passive_sent = 0
        for p in passive_sent:
            if str(p) == "True":
                no_of_passive_sent = no_of_passive_sent + 1

        avg_passive_sent = no_of_passive_sent / total_no_sent

        avg_word_per_sent = len_sent / total_no_sent
        per_sent_len_lt_ten = (sent_len_lt_ten / total_no_sent) * 100
        per_sent_len_gt_forty = (sent_len_gt_forty / total_no_sent) * 100

        df3_distance = pd.DataFrame({"dist": list_all_distances})

        dist_1 = len(df3_distance[df3_distance["dist"] <= 1])
        dist_2 = len(df3_distance[df3_distance["dist"] == 2])
        dist_3 = len(df3_distance[df3_distance["dist"] == 3])
        dist_4_6 = len(df3_distance[(df3_distance["dist"] >= 4)
                                    & (df3_distance["dist"] <= 6)])
        dist_greater_than_6 = len(df3_distance[df3_distance["dist"] > 6])

        perc_dist_1 = round(dist_1 / len(df3_distance) * 100, 2)
        perc_dist_2 = round(dist_2 / len(df3_distance) * 100, 2)
        perc_dist_3 = round(dist_3 / len(df3_distance) * 100, 2)
        perc_dist_4_6 = round(dist_4_6 / len(df3_distance) * 100, 2)
        perc_dist_greater_than_6 = round(
            dist_greater_than_6 / len(df3_distance) * 100, 2)

        tmp_results = [
            avg_grmr_error, avg_syllabel_per_word, avg_word_per_sent,
            avg_prse_tree_height, percnt_len_perse_less_than_3,
            percnt_len_perse_between_3_6, percnt_len__perse_between_7_9,
            percnt_len_perse_greater_than_9, avg_len_noun_verb, perc_dist_1,
            perc_dist_2, perc_dist_3, perc_dist_4_6, perc_dist_greater_than_6,
            avg_passive_sent
        ]

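        # Load the pre-trained decision tree and use it to predict the
        # language/grammar difficulty score (a1) from the 15 features above.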
        with open(
                os.path.join(BASE_DIR_PATH, 'decision_tree_model',
                             'dt_set1_set2_set3_depth9.pkl'), 'rb') as fid:
            dt_loaded = pickle.load(fid)
        a1 = dt_loaded.predict([tmp_results])
        # print("Score based on Grammar and language check for ", file, a1[0])

        grammar_and_lang2 = [
            file_path, file_name, avg_grmr_error, avg_syllabel_per_word,
            avg_word_per_sent, avg_prse_tree_height,
            percnt_len_perse_less_than_3, percnt_len_perse_between_3_6,
            percnt_len__perse_between_7_9, percnt_len_perse_greater_than_9,
            avg_len_noun_verb, perc_dist_1, perc_dist_2, perc_dist_3,
            perc_dist_4_6, perc_dist_greater_than_6, avg_passive_sent, a1[0]
        ]

        file_grammar_and_lang.append(grammar_and_lang2)

        # b -no.of words

        pos = file.rfind("__preprocessed.docx")
        unprocessed_file = file[0:pos]
        # print("result file -->", unprocessed_file)
        full_text = docx2txt.process(unprocessed_file)
        b = word_count(full_text)
        # print(" Score based on no.of words extraction for ", file, b)

        x = metadata_xml()

        # tmp_doc_file_path = file
        # pos = tmp_doc_file_path.rfind('\\')
        # xml_file_path = tmp_doc_file_path[0:(pos+11)]+"_metadata.xml"
        #

        try:
            xml_path = glob(os.path.join(dirname(file), '*.xml'))
            mydoc = minidom.parse(xml_path[0])

            c = x.journal_code(mydoc)
            d = x.authors_nation(mydoc)
            e = x.type_article(mydoc)
            title = x.article_title(mydoc)
            print(" Score based on metadata extraction for ", file, c, d, e)

        except Exception:
            # fall back to default values when the metadata XML is missing or unparsable
            c = 2
            d = 2
            e = 2
            title = title_ext(file)

        # a2 - topic detection
        abstract = abst_ext(file)
        title_and_abs = title + abstract

        doc = nlp(title_and_abs)
        # score the document based on the nouns and verbs in it
        topic = Topic_noun_verb()
        a2 = topic.term_extraction(doc)

        # print(" Score based on topic detection for ", file, a2)

        topic_list = [file_path, file_name, title, abstract, a2]
        file_topic.append(topic_list)

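        # Weighted combination of the individual scores: grammar/language (a1),
        # topic (a2), word count (b), journal code (c), author nation (d) and
        # article type (e); the weights sum to 1.0.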
        ce = ((0.2 * a1[0]) + (0.2 * a2) + (.25 * b) + (0.1 * c) + (.15 * d) +
              (.1 * e))
        # ce=((0.30 * a1[0])+(0.3 * a2) + (.2 * b) + (.2 * d))

        if (ce <= 1.66):
            obtained_level = 'EASY'
        elif (ce <= 2.33):
            obtained_level = 'INTERMEDIATE'
        else:
            obtained_level = 'DIFFICULT'

        result.append(
            (file_path, file_name, a1[0], a2, b, c, d, e, ce, obtained_level))
        # print("Overall score", file, ce, obtained_level)

        print(result)

    result_df = pd.DataFrame(result,
                             columns=[
                                 'File Path', 'File Name', 'Lang/Grammar',
                                 'Topic', 'No of words', 'Journal code',
                                 'author nation', 'article type', 'CE',
                                 'Obtained Level'
                             ])
    file_grammar_and_lang_df = pd.DataFrame(
        file_grammar_and_lang,
        columns=[
            'File Path', 'File Name', 'avg_grmr_error',
            'avg_syllabel_per_word', 'avg_word_per_sent',
            'avg_parse_tree_length', 'percnt_len_perse_less_than_3',
            'percnt_len_perse_between_3_6', 'percnt_len__perse_between_7_9',
            'percnt_len_perse_greater_than_9', 'avg_len_noun_verb',
            'perc_dist_1', 'perc_dist_2', 'perc_dist_3', 'perc_dist_4_6',
            'perc_dist_greater_than_6', 'avg_passive_sent', 'obtained Level'
        ])
    file_topic_df = pd.DataFrame(file_topic,
                                 columns=[
                                     'File Path', 'File Name', 'Title',
                                     'Abstract', 'obtained Level'
                                 ])

    path = folder_path + "/" + "test.xlsx"
    writer = pd.ExcelWriter(path, engine='xlsxwriter')
    result_df.to_excel(writer, 'Result')
    file_topic_df.to_excel(writer, 'Result_topic')
    file_grammar_and_lang_df.to_excel(writer, 'Result_lang&grammar')
    writer.close()  # ExcelWriter.save() was removed in pandas 2.x; close() writes the file

    try:
        # p = os.popen('attrib +h ' + path)
        # t = p.read()
        # p.close()
        fold_path = os.listdir(folder_path + "/")

        for item in fold_path:
            if item.endswith("__preprocessed.docx"):
                os.remove(os.path.join(folder_path + "/", item))
            elif item.endswith("st.xlsx"):
                os.rename(os.path.join(folder_path + "/", item),
                          folder_path + "/." + item)
            else:
                pass

    except:
        print("problem in delete, hidden conversion")
        print("folder path -->", folder_path)
        print("normal file path -->", path)

    return result_df, file_topic_df
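A minimal usage sketch (the folder and file paths are hypothetical, and the helper classes used inside the function come from the surrounding project):

result_df, file_topic_df = Getting_Score_Result("/data/articles", "/data/articles/sample.docx")
print(result_df[["File Name", "CE", "Obtained Level"]])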