Example #1
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using several
    readability scores, along with other factors such as the ratio of
    typos to words.

    @param article_contents: a string containing the contents of an article
    @return language_analysis_dict: a dictionary mapping each readability
        metric to its score for the article
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        "typos_to_words": len(tool.check(article_contents)) /
        textstat.lexicon_count(article_contents),
        "percent_difficult_words": textstat.difficult_words(article_contents) /
        textstat.lexicon_count(article_contents),
    }

    return language_analysis_dict
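Both ratio entries above divide by textstat.lexicon_count(), which returns 0 for empty or punctuation-only input. A minimal defensive sketch (an editorial addition, not part of the original example; assumes the modern "import textstat" entry point):

import textstat

def safe_ratio(numerator, text):
    """Divide by the text's word count, returning 0.0 for empty text."""
    words = textstat.lexicon_count(text)
    return numerator / words if words else 0.0

# Hypothetical usage:
# safe_ratio(textstat.difficult_words(article), article)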
Example #2
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")
Example #3
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_reading_ease(x))
        df['smog_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(
            lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words{}'.format(i)] = df[col].apply(
            lambda x: textstat.difficult_words(x))
        df['linsear_write_formula{}'.format(i)] = df[col].apply(
            lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog{}'.format(i)] = df[col].apply(
            lambda x: textstat.gunning_fog(x))
        df['text_standard{}'.format(i)] = df[col].apply(
            lambda x: textstat.text_standard(x))
    return df
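A short usage sketch for the helper above (the column names and sample strings are hypothetical; assumes textstat is importable as in the example):

import pandas as pd

df = pd.DataFrame({
    "headline": ["Stocks rally on strong earnings.", "Rain expected tomorrow."],
    "views": [120, 85],  # non-object column, ignored by get_readability
})
scored = get_readability(df)
print(scored.filter(like="flesch_reading_ease"))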
Example #4
    def features_tokenized_difficultword_title(cls, row):
        data = row['review_title']
        token = cls.tokenizer.tokenize(data)
        if token is None or len(token) == 0:
            return -1
        else:
            val = textstat.difficult_words(row['review_title'])
            return val
Example #5
def vecify(v):
    return [ts.flesch_reading_ease(v),
            # ts.smog_index(v),
            ts.flesch_kincaid_grade(v),
            ts.coleman_liau_index(v),
            ts.automated_readability_index(v),
            ts.dale_chall_readability_score(v),
            ts.difficult_words(v),
            ts.linsear_write_formula(v),
            ts.gunning_fog(v)]
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
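textstat_analysis() above calls textstat.readability_consensus(), which exists only in older textstat releases; newer versions expose the same composite grade as text_standard(). A hedged compatibility shim (an editorial addition):

import textstat

def consensus(text):
    """Return the composite grade estimate under either textstat API."""
    fn = getattr(textstat, "readability_consensus", None)
    if fn is None:  # newer releases renamed the method to text_standard
        fn = textstat.text_standard
    return fn(text)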
Example #7
    def reading_difficulty(self):
        diff_words = textstat.difficult_words(self.text) / self.nword
        flesch_kincaid = textstat.flesch_kincaid_grade(self.text)
        coleman_liau = textstat.coleman_liau_index(self.text)
        ari = textstat.automated_readability_index(self.text)
        dale_chall = textstat.dale_chall_readability_score(self.text)
        linsear = textstat.linsear_write_formula(self.text)
        gunning_fog = textstat.gunning_fog(self.text) - 6
        smog = textstat.smog_index(self.text)
        avg_grade = max(
            math.ceil((flesch_kincaid + coleman_liau + ari + dale_chall +
                       linsear + gunning_fog + smog) / 7), 12)
        return avg_grade, diff_words
def get_readability(contents):
    readability = []
    readability.append(textstat.flesch_reading_ease(contents))
    readability.append(textstat.smog_index(contents))
    readability.append(textstat.flesch_kincaid_grade(contents))
    readability.append(textstat.automated_readability_index(contents))
    readability.append(textstat.dale_chall_readability_score(contents))
    readability.append(textstat.difficult_words(contents))
    readability.append(textstat.linsear_write_formula(contents))
    readability.append(textstat.gunning_fog(contents))
    readability.append(textstat.coleman_liau_index(contents))
    readability.append(textstat.text_standard(contents))

    return readability
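The list returned by get_readability() above is positional; a small hedged wrapper (an editorial addition) pairs each value with its metric name, in the same order as the appends:

READABILITY_METRICS = [
    "flesch_reading_ease", "smog_index", "flesch_kincaid_grade",
    "automated_readability_index", "dale_chall_readability_score",
    "difficult_words", "linsear_write_formula", "gunning_fog",
    "coleman_liau_index", "text_standard",
]

def get_readability_named(contents):
    """Same scores as get_readability(), keyed by metric name."""
    return dict(zip(READABILITY_METRICS, get_readability(contents)))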
Example #9
    def features_linsear_title(cls, row):
        data = row['review_title']
        token = cls.tokenizer.tokenize(data)
        if token is None or len(token) == 0:
            return 0
        # val = textstat.smog_index(row['review_title']) # #-0.00216594769872
        # val = textstat.dale_chall_readability_score(row['review_title'])  # #  0.025131347883
        val = textstat.difficult_words(row['review_title']) #   # 0.0363778452286
        # val = textstat.linsear_write_formula(row['review_title'])  # #-0.00557553587525
        # val = textstat.gunning_fog(row['review_title'])  # 0.0202643684371

        # val = textstat.flesch_reading_ease(row['review_title'])  # -0.0207657385707
        # val = textstat.automated_readability_index(row['review_title']) ##0.0142244017091
        if val is not None:
            return val
        else:
            return 0
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()

        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
Example #11
    def features_difficultword_content(cls, row):
        data = row['review_content']
        token = cls.tokenizer.tokenize(data)
        if token is None or len(token) == 0:
            return 0
        # val = textstat.smog_index(row['review_content'])  # 0.0970957994444
        # val = textstat.dale_chall_readability_score(row['review_content'])  # 0.0655506963852
        val = textstat.difficult_words(row['review_content'])  # 0.119689698366
        # val = textstat.linsear_write_formula(row['review_content'])  # 0.0393165149095
        # val = textstat.gunning_fog(row['review_content'])  # 0.064893772836

        # val = textstat.flesch_reading_ease(row['review_content'])  # -0.000962802863895
        # val = textstat.automated_readability_index(row['review_content']) #0.0206780263383
        if val is not None:
            return val
        else:
            return 0
Example #12
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
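A fourteen-element positional tuple is easy to misorder at the call site; one hedged alternative (an editorial sketch, not the original author's interface) wraps the same values in a namedtuple:

from collections import namedtuple

OriScores = namedtuple("OriScores", [
    "char_count", "lexicon_count", "syllable_count", "sentence_count",
    "avg_sentence_length", "avg_syllables_per_word", "avg_letter_per_word",
    "avg_sentence_per_word", "flesch_kincaid_grade", "smog_index",
    "gunning_fog", "difficult_words", "dale_chall", "polysyllabcount",
])

# scores = OriScores(*scores_cal_ori(text))
# scores.smog_index  # addressed by name instead of position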
Example #13
def run_textstat(text):
    #text = """Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply. I believe that playing games is every bit as important for adults as for children. Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension."""

    ts_flesch_reading_ease = textstat.flesch_reading_ease(text)
    ts_smog_index = textstat.smog_index(text)
    ts_flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    ts_coleman_liau_index = textstat.coleman_liau_index(text)
    ts_automated_readability_index = textstat.automated_readability_index(text)
    ts_dale_chall_readability_score = textstat.dale_chall_readability_score(
        text)
    ts_difficult_words = textstat.difficult_words(text)
    ts_linsear_write_formula = textstat.linsear_write_formula(text)
    ts_gunning_fog = textstat.gunning_fog(text)
    ts_text_standard = textstat.text_standard(text)

    return (ts_flesch_reading_ease, ts_smog_index, ts_flesch_kincaid_grade,
            ts_coleman_liau_index, ts_automated_readability_index,
            ts_dale_chall_readability_score, ts_difficult_words,
            ts_linsear_write_formula, ts_gunning_fog, ts_text_standard)
Example #14
def lambda_handler(event, context):

    text = event['text']

    response = {}
    response['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    response['smog_index'] = textstat.smog_index(text)
    response['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    response['coleman_liau_index'] = textstat.coleman_liau_index(text)
    response[
        'automated_readability_index'] = textstat.automated_readability_index(
            text)
    response[
        'dale_chall_readability_score'] = textstat.dale_chall_readability_score(
            text)
    response['difficult_words'] = textstat.difficult_words(text)
    response['linsear_write_formula'] = textstat.linsear_write_formula(text)
    response['gunning_fog'] = textstat.gunning_fog(text)
    response['text_standard'] = textstat.text_standard(text)

    return respond(None, response)
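A quick local smoke test for the handler above (the event shape mirrors the handler's event['text'] access; respond() is defined elsewhere in the original Lambda module and is assumed here to echo its second argument):

if __name__ == "__main__":
    fake_event = {"text": "The quick brown fox jumps over the lazy dog."}
    print(lambda_handler(fake_event, context=None))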
Example #15
    def get_feat_readability_metrics(self):
        # https://github.com/shivam5992/textstat

        try:
            test_data = self.webscrap.get_body()
            out = []
            out.append(textstat.flesch_reading_ease(test_data))
            out.append(textstat.smog_index(test_data))
            out.append(textstat.flesch_kincaid_grade(test_data))
            out.append(textstat.coleman_liau_index(test_data))
            out.append(textstat.automated_readability_index(test_data))
            out.append(textstat.dale_chall_readability_score(test_data))
            out.append(textstat.difficult_words(test_data))
            out.append(textstat.linsear_write_formula(test_data))
            out.append(textstat.gunning_fog(test_data))
            #out.append(textstat.text_standard(test_data))
            return out, False

        except Exception as e:
            config.logger.error(repr(e))
            return MISSING_FEATURE * 9, True
Example #16
def feature_readability(essay):
    syllable_count = textstat.syllable_count(essay)
    # number of syllables in the essay
    flesch_reading_ease = textstat.flesch_reading_ease(essay)
    # readability of the document as a score between 0 and 100
    smog_index = textstat.smog_index(essay)
    # SMOG index: reflects how readable the document is; more precise
    # and easier to calculate
    flesch_kincaid_index = textstat.flesch_kincaid_grade(essay)
    # grade score (U.S. school grade level)
    coleman_liau_index = textstat.coleman_liau_index(essay)
    # returns the grade level of the text
    automated_readability_index = textstat.automated_readability_index(essay)
    # automated readability index: approximates the grade level needed
    # to understand the text
    dale_chall_readability_score = textstat.dale_chall_readability_score(essay)
    # returns a grade level, based on the most common English words
    difficult_words = textstat.difficult_words(essay)

    linsear_write_formula = textstat.linsear_write_formula(essay)
    # returns the grade level of the text
    gunning_fog = textstat.gunning_fog(essay)
    # fog index: reflects how difficult the text is to read
    return (syllable_count, flesch_reading_ease, smog_index,
            flesch_kincaid_index, coleman_liau_index,
            automated_readability_index, dale_chall_readability_score,
            difficult_words, linsear_write_formula, gunning_fog)
Example #17
def calculate_readability_measures(id):
    """ Compute readability measures for the doc content and update the document. """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # count = len(source['content'].split())
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            'consensus': textstat.readability_consensus(source['content']),
        }

        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        pass
Example #18
def process(data):
    res = np.array([])
    cleaned = data.lower().strip()
    original = data.strip()
    fea1 = numOfWords(cleaned)
    # fea1 = fea1 / 10
    fea2 = numOfChar(cleaned)
    # fea2 = fea2 / 100
    fea3 = count(cleaned, string.punctuation)
    fea5 = numOfContUpperCase(original)
    fea4 = textstat.gunning_fog(data)
    fea6 = textstat.automated_readability_index(data)
    fea7 = textstat.linsear_write_formula(data)
    fea8 = textstat.difficult_words(data)
    fea9 = textstat.dale_chall_readability_score(data)
    fea10 = data.count("\'") + data.count(".") + data.count("\"") + data.count(",") + data.count(
        "’") + data.count("‘") + data.count("”") + data.count("“")
    fea10 = (fea10 / len(data)) * 1000
    fea11 = data.count("1") + data.count("2") + data.count("3") + data.count("4") + data.count(
        "5") + data.count("6") + data.count("7") + data.count("8") + data.count("9") + data.count("0")
    fea12 = data.count("?") + data.count("!") + data.count("@") + data.count("#") + data.count(
        "$") + data.count("%") + data.count("&")
    fea13 = data.count(":") + data.count(";")
    fea14 = data.count("—") + data.count("-") + data.count("_")
    fea15 = (fea10 / len(data)) * 100
    fea16 = data.count("(") + data.count(")") + data.count("[") + data.count("]") + data.count(
        "{") + data.count("}")

    fea17 = data.count("*") + data.count("/")
    fea18 = data.count("?")
    fea19 = fea10 + fea11 + fea12 + fea13 + fea14 + fea15 + fea16 + fea17 + fea18
    res = np.array([[fea1, fea2, fea3, fea5, fea4, fea6, fea7, fea8, fea9, fea10, fea11, fea12, fea13, fea14,
                     fea15, fea16, fea17, fea18, fea19]])


    return res
    def get_readability(self, corpus, type='ari'):
        readability = None
        if type == 'ari':
            readability = textstat.automated_readability_index(corpus)
        elif type == 'flesch':
            readability = textstat.flesch_reading_ease(corpus)
        elif type == 'smog':
            readability = textstat.smog_index(corpus)
        elif type == 'flesch_kincaid':
            readability = textstat.flesch_kincaid_grade(corpus)
        elif type == 'coleman':
            readability = textstat.coleman_liau_index(corpus)
        elif type == 'dale_chall':
            readability = textstat.dale_chall_readability_score(corpus)
        elif type == 'difficult_words':
            readability = textstat.difficult_words(corpus)
        elif type == 'linsear':
            readability = textstat.linsear_write_formula(corpus)
        elif type == 'gunning_fog':
            readability = textstat.gunning_fog(corpus)
        elif type == 'readability_consensus':
            readability = textstat.readability_consensus(corpus)

        return readability
Example #21
    def stats(self, text):
        test_data = text
        stats = {}
        stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
        stats['smog'] = textstat.smog_index(test_data)
        stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
        stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
        stats['automated'] = textstat.automated_readability_index(test_data)
        stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
        stats['difficult'] = textstat.difficult_words(test_data)
        stats['linsear'] = textstat.linsear_write_formula(test_data)
        stats['gunning_fog'] = textstat.gunning_fog(test_data)
        stats['standard'] = textstat.text_standard(test_data)
        stats['charcount'] = textstat.char_count(test_data)
        stats['lexicon count'] = textstat.lexicon_count(test_data)
        stats['syllable count'] = textstat.syllable_count(test_data)
        stats['sentence count'] = textstat.sentence_count(test_data)
        stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
        stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(test_data)
        stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
        stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(test_data)
        return stats
Example #22
    def __init__(self, path):
        """
        Create document instance for analysis.

        Opens and reads document to string raw_text.
        Textract interprets the document format and
        opens to plain text string (docx, pdf, odt, txt)

        Args:
            path (str): path to file to open, analyze, close


        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  tbd
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regex expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of sentences found in list.
        -passive_sentences:  list of passive sentences identified by the
        passive module.
        -passive_sentence_count:  count of the passive_sentences list.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -be_verb_analysis:  (int) sum number of occurrences of each to be verb
        (am, is, are, was, were, be, being been).
        -be_verb_count: tbd
        -be_verb_analysis: tbd
        -weak_sentences_all:  (int) sum of be verb analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -word_tokens:  list of discrete words in text that breaks
        contractions up (default nltk tokenizer).
        -word_tokens_no_punct:  list of all words in text including
        contractions but otherwise no punctuation.
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  uses white-space tokenizer to create a list
        of all words.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm made by textstat module.
        -readability_flesch_kincaid_grade:  (int)  Flesch-Kincaid grade level
        of reader made by textstat module.
        -readability_coleman_liau_index:  (int) grade level of reader as made by
        textstat module.
        -readability_ari:  (int) grade level of reader determined by the
        automated readability index algorithm implemented by textstat.
        -readability_linsear_write:  grade level as determined by the
        Linsear Write algorithm implemented by textstat.
        -readability_dale_chall:  (int) grade level based on Dale-Chall
        readability as determined by textstat.
        -readability_standard:  composite grade level based on readability
        algorithms.
        -flesch_re_key:  list for interpreting Flesch RE Score.
        -word_count:  word count of document based on white-space tokenizer;
        this word count should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) word, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this is the second time this attribute is defined).
        -freq_words:  word frequency count not standardized based on the
        correct word tokenizer (not ratio, just count).
        modal_dist:  count of auxiliary verbs based on word_tokens_no_punct.
        sentence_count (int): Count the sentence tokens
        passive_sentences (list): List of all sentences identified as passive
        passive_sentence_count (int): count of items in passive_sentences
        be_verb_count (int): count "to be" verbs in text
        word_tokens_no_punct (list): words separated, stripped of punctuation,
         made lower case
        flesch_re_key (str): reading ease score to description
        freq_words (list or dict): frequency distribution of all words
        modal_dist (list): frequency distribution of aux verbs
        """
        self.user = ""
        self.path = path
        self.abs_path = os.path.abspath(self.path)
        if os.path.isfile(self.path):
            self.time_stamp = self.timestamp()
            self.file_name = os.path.basename(path)
            self.mime = MimeTypes()
            self.guessed_type = self.mime.guess_type(self.path)
            self.file_type = self.guessed_type[0]
            self.raw_text = textract.process(self.path, encoding="ascii")
            self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
            self.ptext = re.sub(u"\u2014", "--", self.ptext)
            self.ptext = re.sub(",", ",", self.ptext)
            self.ptext = re.sub("—", "--", self.ptext)
            self.ptext = re.sub("…", "...", self.ptext)
            self.text_no_feed = self.clean_new_lines(self.ptext)
            self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
            self.sentence_count = len(self.sentence_tokens)
            self.passive_sentences = passive(self.text_no_feed)
            self.passive_sentence_count = len(self.passive_sentences)
            self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                           float(self.sentence_count)))
            self.percent_passive_round = round(self.percent_passive, 2)

            self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
            self.be_verb_count = self.be_verb_analysis[0]
            self.weak_sentences_all = self.be_verb_analysis[1]
            self.weak_sentences_set = set(self.weak_sentences_all)
            self.weak_sentences_count = len(self.weak_sentences_set)
            self.weak_verbs_to_sentences = 100 * float(
                self.weak_sentences_count) / float(self.sentence_count)
            self.weak_verbs_to_sentences_round = round(
                self.weak_verbs_to_sentences, 2)
            self.word_tokens = self.word_tokenize(self.text_no_feed)
            self.word_tokens_no_punct = \
                self.word_tokenize_no_punct(self.text_no_feed)
            self.no_punct = self.strip_punctuation(self.text_no_feed)
            # use this! It make lower and strips symbols
            self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)


            self.readability_flesch_re = \
                textstat.flesch_reading_ease(self.text_no_feed)
            self.readability_smog_index = \
                textstat.smog_index(self.text_no_feed)
            self.readability_flesch_kincaid_grade = \
                textstat.flesch_kincaid_grade(self.text_no_feed)
            self.readability_coleman_liau_index = \
                textstat.coleman_liau_index(self.text_no_feed)
            self.readability_ari = \
                textstat.automated_readability_index(self.text_no_feed)
            self.readability_linsear_write = \
                textstat.linsear_write_formula(self.text_no_feed)
            self.readability_dale_chall = \
                textstat.dale_chall_readability_score(self.text_no_feed)
            self.readability_standard = \
                textstat.text_standard(self.text_no_feed)

            self.flesch_re_desc_str = self.flesch_re_desc(
                int(textstat.flesch_reading_ease(self.text_no_feed)))
            self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
            self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
            self.avg_syllables_per_word = textstat.avg_syllables_per_word(
                self.text_no_feed)
            self.avg_sentence_per_word = textstat.avg_sentence_per_word(
                self.text_no_feed)
            self.avg_sentence_length = textstat.avg_sentence_length(
                self.text_no_feed)
            self.avg_letter_per_word = textstat.avg_letter_per_word(
                self.text_no_feed)
            self.difficult_words = textstat.difficult_words(self.text_no_feed)
            self.rand_passive = self.select_random(self.passive_sentence_count,
                                                   self.passive_sentences)
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences_all), self.weak_sentences_all)
            if self.word_tokens_no_punct:
                self.word_count = len(self.word_tokens_no_punct)
                self.page_length = float(self.word_count) / float(250)
                self.paper_count = int(math.ceil(self.page_length))
                self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
                self.pos_counts = Counter(
                    tag for word, tag in self.parts_of_speech)
                self.pos_total = sum(self.pos_counts.values())
                self.pos_freq = dict(
                    (word, float(count) / self.pos_total)
                    for word, count in self.pos_counts.items())
                self.doc_pages = float(float(self.word_count) / float(250))
                self.freq_words = \
                    self.word_frequency(self.word_tokens_no_punct)
                self.modal_dist = self.modal_count(self.word_tokens_no_punct)
                # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
                self.pos_count_dict = self.pos_counts.items()

            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
sf['FRE_text'] = sf['content'].apply(lambda x: textstat.flesch_reading_ease(x))
sf['FRE_tagged_text'] = sf['FRE_text'].apply(
    lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
    if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
    if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
sf['FK_text'] = sf['content'].apply(
    lambda x: int(textstat.flesch_kincaid_grade(x)))
sf['GFI_text'] = sf['content'].apply(lambda x: textstat.gunning_fog(x))
sf['SMI_text'] = sf['content'].apply(lambda x: textstat.smog_index(x))
sf['CLI_text'] = sf['content'].apply(lambda x: textstat.coleman_liau_index(x))
sf['ARI_text'] = sf['content'].apply(
    lambda x: int(textstat.automated_readability_index(x)))
sf['DC_text'] = sf['content'].apply(
    lambda x: textstat.dale_chall_readability_score(x))
sf['Difficult_text_wc'] = sf['content'].apply(
    lambda x: textstat.difficult_words(x))

#Hand-picked quantitative features - # of percentage occurrences
percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
sf['Percent_occurrences'] = sf['content'].apply(
    lambda x: len(percent_pattern.findall(x)))

#Polarity feature extraction from news headlines
sf['Polarity_head'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
sf['Subjectivity_head'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
sf['Positive_head_wc'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Positive'])
sf['Negative_head_wc'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Negative'])
def predict_relevance(df):

    #Loading data into SFrame
    df[[a for a in df.columns.values]] = df[[a for a in df.columns.values
                                             ]].astype(str)
    tf = gl.SFrame(data=df)
    tf = tf.unique()

    #Loading LDA model for topic modeling, pysentiment module for financial sentiment analysis and the relevance prediction model
    lda = models.ldamodel.LdaModel.load('lda1.model')
    lm = py.LM()
    model = gl.load_model('relevance_model_64feat')

    #Building the LDA model using news articles
    tf['tokens'] = tf['content'].apply(lambda x: dc.tokenize_doc(x, 'STEM'))
    tokens_text = [
        unicode('|'.join(i), errors='replace').split('|') for i in tf['tokens']
    ]
    dictionary = corpora.Dictionary(tokens_text)
    corpus = [dictionary.doc2bow(text) for text in tokens_text]
    ldamat = lda[corpus]

    #Building LDA topic arrays per topic
    topic_arrays = np.zeros((30, len(ldamat)))
    for i, x in enumerate(ldamat):
        for topic_no, contrib in x:
            topic_arrays[topic_no, i] = contrib

    #Adding LDA topic arrays as feature columns as 'Tx'
    for i, x in enumerate(topic_arrays):
        tf['T' + str(i)] = gl.SArray(data=x, dtype=float)

    #Polarity feature extraction from content of news articles
    tf['Polarity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_text_wc'] = tf['content'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_text_rate'] = tf['Negative_text_wc'] / tf['Total_text_wc']
    tf['Positive_text_rate'] = tf['Positive_text_wc'] / tf['Total_text_wc']
    tf['Max_Polarity'] = tf['content'].apply(lambda x: max(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Min_Polarity'] = tf['content'].apply(lambda x: min(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Sentences_wc'] = tf['content'].apply(lambda x: len(sent_tokenize(x)))
    tf['Positive_sentrate'] = tf['Positive_text_wc'] / tf['Sentences_wc']
    tf['Negative_sentrate'] = tf['Negative_text_wc'] / tf['Sentences_wc']

    #Readability feature extraction from content of news articles
    tf['FRE_text'] = tf['content'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_text'] = tf['FRE_text'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_text'] = tf['content'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_text'] = tf['content'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_text'] = tf['content'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_text'] = tf['content'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_text'] = tf['content'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_text'] = tf['content'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_text_wc'] = tf['content'].apply(
        lambda x: textstat.difficult_words(x))

    #Hand-picked quantitative features - # of percentage occurrences
    percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
    tf['Percent_occurrences'] = tf['content'].apply(
        lambda x: len(percent_pattern.findall(x)))

    #Polarity feature extraction from news headlines
    tf['Polarity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_head_wc'] = tf['title'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_head_rate'] = tf['Negative_head_wc'] / tf['Total_head_wc']
    tf['Positive_head_rate'] = tf['Positive_head_wc'] / tf['Total_head_wc']

    #Readability feature extraction from news headlines
    tf['FRE_head'] = tf['title'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_head'] = tf['FRE_head'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_head'] = tf['title'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_head'] = tf['title'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_head'] = tf['title'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_head'] = tf['title'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_head'] = tf['title'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_head'] = tf['title'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_head_wc'] = tf['title'].apply(
        lambda x: textstat.difficult_words(x))

    #Predicting relevance class using these features in sorted order of confidence
    tf = tf.add_row_number()
    pred = model.classify(tf)
    pred = pred.add_row_number()
    relevant = pred[pred['class'] == 1]
    non_relevant = pred[pred['class'] == 0]
    if relevant.num_rows() > 10:
        relevant_news_out = tf.join(relevant).sort('probability',
                                                   ascending=False)[:10]
    else:
        relevant_news = relevant.sort('probability', ascending=False)
        req_num_non_relevant_news = 10 - relevant.num_rows()
        non_relevant_news = non_relevant.sort(
            'probability')[:req_num_non_relevant_news]
        relevant_news = relevant_news.append(non_relevant_news)
        relevant_news_out = tf.join(relevant_news)

    return relevant_news_out
Example #25
    training_essays_df.set_value(
        i, "flesch_kincaid_grade",
        textstat.flesch_kincaid_grade(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "coleman_liau_index",
        textstat.coleman_liau_index(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "automated_readability_index",
        textstat.automated_readability_index(
            training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "dale_chall_readability_score",
        textstat.dale_chall_readability_score(
            training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "difficult_words",
        textstat.difficult_words(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "linsear_write_formula",
        textstat.linsear_write_formula(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "gunning_fog",
        textstat.gunning_fog(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "text_standard",
        textstat.text_standard(training_essays_df.iloc[i]['essay']))

writer = pd.ExcelWriter(sys.argv[2])
training_essays_df.to_excel(writer, sheet_name="results", index=False)
writer.save()
Example #26
                    # Build Dataset
                    try:
                        cur = {
                            "title": title,
                            "artist": artist,
                            "year": year,
                            "pos": pos,
                            "lyrics": lyrics,
                            "tags": get_tags(artist),
                            "sentiment":
                            sent_analyzer.polarity_scores(lyrics_repl),
                            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
                            "flesch_index":
                            ts.flesch_reading_ease(lyrics_repl),
                            "fog_index": ts.gunning_fog(lyrics_repl),
                            "difficult_words": ts.difficult_words(lyrics_repl),
                            "num_syllables": ts.syllable_count(lyrics_repl),
                            "num_words": ts.lexicon_count(lyrics_repl, True),
                            "num_lines": ts.sentence_count(lyrics_repl),
                            "num_dupes": count_dupes(lyrics)
                        }
                        # print cur
                        dataset.append(cur)
                    except Exception, e:
                        print e

            except Exception, e:
                print "Exception occurred for " + artist + ' - ' + title
                print e

    outfile = "years/" + str(year) + '.txt'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 15:25:10 2016

@author: megan
"""

from textstat.textstat import textstat as ts

fname = 'actbacFB.txt'

with open(fname, 'r', encoding='utf-8') as f:
    data = f.read().replace('\n', '')

total = ts.lexicon_count(data)
difficult = ts.difficult_words(data)
fkre = ts.flesch_reading_ease(data)
grade = ts.flesch_kincaid_grade(data)
overall = ts.text_standard(data)

print("Total words:", total)
print("Difficult words:", difficult)
print("FKRE:", fkre)
print("Grade:", grade)
print("Overall readability", overall)
#             "great booking book a hotel in ibiza."
# test_data = "refund hotel in ewr of george area area nearby. 3-star belfast hotel in"
test_data = "great rates. book at western blue casino hotel, bangkok. no reservation costs. great"

print(
    "-------------------------Text Statistic-----------------------------------"
)
print("Returns the number of syllables present in the given text.")
print(textstat.syllable_count(test_data, lang='en_US'))
print(
    "Calculates the number of words present in the text - punctuation removed")
print(textstat.lexicon_count(test_data, removepunct=True))
print("Returns the number of sentences present in the given text.")
print(textstat.sentence_count(test_data))
print("difficult words")
print(textstat.difficult_words(test_data))
print(
    "-------------------------Readability Formula------------------------------"
)
print("The Flesch Reading Ease Score")
print(textstat.flesch_reading_ease(test_data))
print("The SMOG Index")
print("Texts of fewer than 30 sentences are statistically invalid, "
      "because the SMOG formula was normed on 30-sentence samples.")
print("textstat requires atleast 3 sentences for a result.")
print(textstat.smog_index(test_data))
print("The Flesch-Kincaid Grade")
print(textstat.flesch_kincaid_grade(test_data))
print("The Coleman-Liau Index")
print(textstat.coleman_liau_index(test_data))
print("Automated Readability Index (ARI)")
Example #29
0
    def test_difficult_words(self):
        result = textstat.difficult_words(self.long_test)

        self.assertEqual(62, result)
Example #30
        # print(textstat.syllable_count(test_data, lang='en_US'))
        num_syllables = textstat.syllable_count(test_data, lang='en_US')
        print(num_syllables)
        print(
            "Calculates the number of words present in the text - punctuation removed"
        )
        # print(textstat.lexicon_count(test_data, removepunct=True))
        num_words = textstat.lexicon_count(test_data, removepunct=True)
        print(num_words)
        print("Returns the number of sentences present in the given text.")
        # print(textstat.sentence_count(test_data))
        num_sentences = textstat.sentence_count(test_data)
        print(num_sentences)
        print("difficult words")
        # print(textstat.difficult_words(test_data))
        num_difficult_words = textstat.difficult_words(test_data)
        print(num_difficult_words)

        print(
            "-------------------------Difficulty------------------------------"
        )
        print("The Flesch Reading Ease Score")
        # print(textstat.flesch_reading_ease(test_data))
        difficulty_score = textstat.flesch_reading_ease(test_data)
        print(difficulty_score)

        if 0 <= difficulty_score < 30:
            difficulty_label = "Confusing"
        elif 30 <= difficulty_score < 50:
            difficulty_label = "Difficult"
        elif 50 <= difficulty_score < 60:
            difficulty_label = "Fairly Difficult"
#!/bin/python

import sys, string, os
from textstat.textstat import textstat

inputfile = ''
test_data = ""

script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	test_data="".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 + ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' + var12 + ',' + var13)
Example #33
import re

from textstat.textstat import textstat

raw_input("Please copy the lyrics to the two text files song1 and song 2. \nWhen complete hit enter to analyze.")
print ""

try:
    f = open('song1.txt')
    f_read = str(f.read())
    cleaned = re.sub("[\(\[].*?[\)\]]", "", f_read)
    if textstat.dale_chall_readability_score(cleaned) < 5:
        print "Song #1 | Dale Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | " + "Easily understood by 4th-grade students or lower."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 6:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 5th-grade and 6th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 7:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 7th-grade and 8th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 8:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 9th-grade and 10th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 9:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
Example #34
        if os.path.isdir(FILE_OR_DIR):
            FILES = [os.path.join(FILE_OR_DIR, fn) for fn in os.listdir(FILE_OR_DIR)]
        else:
            FILES = [FILE_OR_DIR]

        for FILE in FILES:
            print 'Processing', FILE
            TEXT = read_file(FILE)

            print 'Flesh reading ease', textstat.flesch_reading_ease(TEXT)
            print 'Smog index', textstat.smog_index(TEXT)
            print 'Flesch Kincaid grade', textstat.flesch_kincaid_grade(TEXT)
            print 'Coleman Liau', textstat.coleman_liau_index(TEXT)
            print 'Automated readability index', textstat.automated_readability_index(TEXT)
            print 'Dale Chall readability score', textstat.dale_chall_readability_score(TEXT)
            print 'Difficult words', textstat.difficult_words(TEXT)
            print 'Linsear write formula', textstat.linsear_write_formula(TEXT)
            print 'Gunning fog', textstat.gunning_fog(TEXT)
            print 'Text standard', textstat.text_standard(TEXT)

            print '\nWords'
            WORDS = get_words(TEXT)
            get_word_stats(WORDS)

            print '\nWords no Stop Words'
            WORDS_NO_STOP = [w for w in WORDS if w not in stop]
            get_word_stats(WORDS_NO_STOP)

            print '\nSentences'
            SENTENCES = get_sentences(TEXT)
            get_sentence_stats(SENTENCES)
Example #35
def percent_difficult_words(article):
    if textstat.lexicon_count(article) == 0:
        return 0

    return textstat.difficult_words(article) / textstat.lexicon_count(article)
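A brief usage note for the guard above (hypothetical inputs; assumes textstat is imported as in the surrounding examples):

print(percent_difficult_words(""))  # -> 0, no ZeroDivisionError
print(percent_difficult_words("Quantitative easing perplexes many readers."))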
Example #36
def oov_words_diff(q1, q2):
    return textstat.difficult_words(q1) - textstat.difficult_words(q2)
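And a similar sketch for oov_words_diff(); the question pair is hypothetical, in the style of a duplicate-question dataset:

q1 = "What is the photosynthesis mechanism in desert plants?"
q2 = "How do plants make food?"
# A positive result means q1 contains more difficult words than q2.
print(oov_words_diff(q1, q2))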
Example #37
    '../output_text/trump_out.txt', '../output_text/shakespeare_out.txt',
    '../output_text/drseuss_out.txt'
]

# input_file_names = ['../data_parsed/trump.txt',
input_file_names = [
    '../data_parsed/shakespeare.txt', '../data_parsed/drseuss.txt'
]

for i in range(0, len(input_file_names)):
    input_file_name = input_file_names[i]
    print(input_file_name)
    with open(input_file_name, 'r') as myfile:
        test_data = myfile.read().replace('\n', '')

    print "flesch_reading_ease: " + str(
        textstat.flesch_reading_ease(test_data))
    print "smog_index: " + str(textstat.smog_index(test_data))
    print "flesch_kincaid_grade: " + str(
        textstat.flesch_kincaid_grade(test_data))
    print "coleman_liau_index: " + str(textstat.coleman_liau_index(test_data))
    print "automated_readability_index: " + str(
        textstat.automated_readability_index(test_data))
    print "dale_chall_readability_score: " + str(
        textstat.dale_chall_readability_score(test_data))
    print "difficult_words: " + str(textstat.difficult_words(test_data))
    print "linsear_write_formula: " + str(
        textstat.linsear_write_formula(test_data))
    print "gunning_fog: " + str(textstat.gunning_fog(test_data))
    print "text_standard: " + str(textstat.text_standard(test_data))