Example #1
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous reading scores and methods. Also calculates
    other factors such as the number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the readability scores, the typo-to-word ratio, and the share of difficult words
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        "typos_to_words": (len(tool.check(article_contents)) /
                           textstat.lexicon_count(article_contents)),
        "percent_difficult_words": (textstat.difficult_words(article_contents) /
                                    textstat.lexicon_count(article_contents)),
    }

    return language_analysis_dict
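A minimal usage sketch for get_text_features above (not part of the original source); it assumes textstat and language_check are importable as in the snippet, and the sample article is purely illustrative:

if __name__ == "__main__":
    sample_article = (
        "The committee approved the new policy on Tuesday. "
        "Critics argue that the change was rushed and poorly explained."
    )
    # Print each readability feature on its own line.
    for name, value in get_text_features(sample_article).items():
        print("{}: {:.3f}".format(name, value))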
def _get_base_textstats(no_code_text):
    """
    Find basic text statistics
    :param no_code_text: Text we are analyzing
    :return: list: List of results
    """
    results = []
    group_by = 'Basic Text Statistics'
    num_chars = len(no_code_text)
    num_lower = sum(1 for c in no_code_text if c.islower())
    num_upper = sum(1 for c in no_code_text if c.isupper())
    num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by))

    results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by))
    results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by))
    results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by))
    results.append(TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(TextFeature('Number of upper case characters', num_upper, group_by))
    return results
Example #3
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            song[
                "num_difficult_words"] = textstat.dale_chall_readability_score(
                    song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
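A rough usage sketch for calculate_statistics above (illustrative only, not from the original repository); it assumes tqdm and logging are imported as in the source module, and the song dictionary is a hypothetical stand-in:

lyrics = [{"title": "Example Song",
           "text_raw": "Hello darkness, my old friend. I've come to talk with you again."}]
annotated = calculate_statistics(lyrics)
print(annotated[0]["num_words"], annotated[0]["flesch_score"])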
Example #4
def main():
    csv_file2 = open(sys.argv[2], 'w', encoding="utf8")
    writer = csv.writer(csv_file2, delimiter=',')
    doc_id = 1
    writer.writerow(["ID", "URL", "text", "impact-score", "readability", "grade-level", "smog-index", "total-words", "total-sentences"])
    with open(sys.argv[1], 'r',  encoding="utf8", errors='ignore') as csv_file1:
        reader = csv.reader(csv_file1)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)

            writer.writerow([doc_id, url, text, impact, read_ease, grade, smog, words, sentences])
            doc_id += 1
    csv_file2.close()

    print('Summary statistics complete!')
Example #5
    def _calculate_scores(self, docs):
        docs_scores = []

        for doc in docs:
            scores = {}
            scores['chars'] = ts.char_count(doc)
            scores['words'] = ts.lexicon_count(doc)
            scores['sents'] = ts.sentence_count(doc)
            #scores['syllables'] = ts.syllable_count(doc)
            scores['avg_sent_length'] = ts.avg_sentence_length(doc)
            scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
            scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
            scores['flesch'] = ts.flesch_reading_ease(doc)
            #scores['smog'] = ts.smog_index(doc)
            #scores['coleman_liau'] = ts.coleman_liau_index(doc)
            scores['automated_readability'] = ts.automated_readability_index(
                doc)
            #scores['linsear'] = ts.linsear_write_formula(doc)
            #scores['difficult_words'] = ts.difficult_words(doc)
            scores['dale_chall'] = ts.dale_chall_readability_score(doc)
            #scores['gunning_fog'] = ts.gunning_fog(doc)
            scores['lix'] = ts.lix(doc)
            docs_scores.append(scores)

        return docs_scores
Example #6
def main():
    """
    Evaluate and print Readability scores
    """

    if len(sys.argv) > 1:
        inf = open(sys.argv[1], 'r')
    else:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()

    text = inf.read()
    inf.close()

    lexcount = textstat.lexicon_count(text)

    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))
    
    # reading time in minutes
    # assumes 180 WPM plus some offset
    tread = (lexcount + 250) / 180.

    sys.stdout.write('Estimating reading time: {0:1.1f} minutes.\n'.format(tread))

    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)

    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
Example #7
def split_pages(text, page_words=WORDS_PAGE):

    paragraphs = text.split("\n\n")

    pages = []
    working = ''
    for para in paragraphs:
        working = working + para
        if ts.lexicon_count(working) >= page_words:
            pages.append(working)
            working = ''

    if not ts.lexicon_count(working) == 0:
        pages.append(working)

    return pages
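A small, hypothetical usage sketch for split_pages; it assumes ts (textstat) and the module-level WORDS_PAGE constant are defined as in the original file, and passes a tiny page size so the split is visible:

sample = "One two three four five six.\n\nSeven eight nine ten eleven twelve."
for i, page in enumerate(split_pages(sample, page_words=5), start=1):
    print("Page", i, "->", ts.lexicon_count(page), "words")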
Example #8
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
     ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
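A quick way to exercise do_text_stats outside its class (an assumption on my part, since the original host class is not shown). Because self is never used in the body, passing None is enough for a standalone check:

sample_text = "Readability is easy to measure. It is much harder to improve."
stats = do_text_stats(None, sample_text)  # self is unused, so None suffices here
print(stats["flesch_reading_ease"], stats["text_standard"])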
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example #10
def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text) #word count
        sent = textstat.sentence_count(text) #sentence count
        syll = textstat.syllable_count(text) #syllable count
        flesch = textstat.flesch_reading_ease(text) #flesch score
        smog = textstat.smog_index(text) #SMOG index
        fog = textstat.gunning_fog(text) #FOG index
        dale = textstat.dale_chall_readability_score(text) #grade level
        ari = textstat.automated_readability_index(text) #grade level
        cl = textstat.coleman_liau_index(text) #grade level

        flesch1 = lexicon*flesch
        flesch2 = sent*flesch
        flesch3 = syll*flesch
        smog1 = lexicon*smog
        smog2 = sent*smog
        smog3 = syll*smog
        fog1 = lexicon*fog
        fog2 = sent*fog
        fog3 = syll*fog
        dale1 = lexicon*dale
        dale2 = sent*dale
        dale3 = syll*dale
        ari1 = lexicon*ari
        ari2 = sent*ari
        ari3 = syll*ari
        cl1 = lexicon*cl
        cl2 = sent*cl
        cl3 = syll*cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3,
             ari1, ari2, ari3, cl1, cl2, cl3]
        return x
    return []  # no sentences found, so no statistics to report
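A hypothetical call to text_analytics above, just to show the shape of the returned feature vector (27 values when the text has at least one sentence):

features = text_analytics("Reading is fun. Reading also builds vocabulary over time.")
print(len(features), features[:3])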
Example #11
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
Example #12
def analyse_plain_text(test_data):
    text_stats = TextStats()

    # Do some simple analysis.
    from textblob import TextBlob
    zen = TextBlob(test_data)
    text_stats.word_count = len(zen.words)
    text_stats.sentence_count = len(zen.sentences)
    text_stats.polarity = zen.sentiment.polarity
    text_stats.subjectivity = zen.sentiment.subjectivity

    # Easy to read, this?
    from textstat.textstat import textstat
    text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data)

    # Words per sentence count.
    text_stats.word_per_sentence_count = (
        textstat.lexicon_count(test_data, False) /
        textstat.sentence_count(test_data))

    # Convert all to lower.
    test_data = test_data.lower()

    # Tokenise.
    from nltk.tokenize import word_tokenize
    words = word_tokenize(test_data)

    # Tokenise stemmed text.
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    test_data_stemmed = ''
    for w in words:
        test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w)
    stemmed_words = word_tokenize(test_data_stemmed)

    # Remove non-words.
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in stemmed_words if nonPunct.match(w)]

    # Remove stopwords:
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    extra_stopwords = set([
        'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get'
    ])
    filtered = [
        w for w in filtered if w not in stopwords and w not in extra_stopwords
    ]

    # How many unique words?
    from collections import Counter
    counts = Counter(filtered)
    text_stats.unique_word_count = len(counts)

    # Words sorted by most common.
    text_stats.counts = counts

    return text_stats
Example #13
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d \nLexicon count : %d \nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))
Example #14
def _get_base_textstats(no_code_text):
    """
    Find basic text statistics
    :param no_code_text: Text we are analyzing
    :return: list: List of results
    """
    results = []
    group_by = 'Basic Text Statistics'
    num_chars = len(no_code_text)
    num_lower = sum(1 for c in no_code_text if c.islower())
    num_upper = sum(1 for c in no_code_text if c.isupper())
    num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(
        TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(
                TextFeature(
                    'Character count for "{}"'.format(
                        c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(
                TextFeature('Character count for "{}"'.format(c[0]), c[1],
                            group_by))

    results.append(
        TextFeature('Number of syllables',
                    textstat.syllable_count(no_code_text), group_by))
    results.append(
        TextFeature('Lexicon Count (without punctuation)',
                    textstat.lexicon_count(no_code_text, True), group_by))
    results.append(
        TextFeature('Lexicon Count (with punctuation)',
                    textstat.lexicon_count(no_code_text, False), group_by))
    results.append(
        TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(
        TextFeature('Number of upper case characters', num_upper, group_by))
    return results
Example #15
    def get_statistics(self, f, content):
        content = content.lower()

        reading_level = textstat.flesch_kincaid_grade(content)
        word_count = textstat.lexicon_count(content)
        keyword_frequency = [kw[1] for kw in self.get_keyword_frequency(content)]
        sentiment = DocumentStatistics.get_sentiment(content)
        return [f, reading_level, word_count] + keyword_frequency + sentiment
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
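A sketch of calling textstat_analysis (not from the original source); note that readability_consensus comes from older textstat releases, where newer versions expose text_standard instead:

profile = "I enjoy long walks, strong coffee, and well-documented code."
fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count = textstat_analysis(profile)
print("Flesch reading ease:", fre, "| word count:", word_count)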
def get_stats(sentence):
	syllables = textstat.syllable_count(sentence)
	words = textstat.lexicon_count(sentence, True)
	sentence_count = textstat.sentence_count(sentence)

	if sentence_count > 0:
		text_standard = textstat.text_standard(sentence)
	else:
		text_standard = EMPTY_TEXT_STANDARD

	text_standard = fix_grammar_errors(text_standard)

	return combine(syllables, words, sentence_count, text_standard)
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
     print 'Parse ', len(sentences), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length
     self.sentences = sentences
     self.tokens = []
     [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()

        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
Example #20
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
Example #21
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module

    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count','lexicon_count',
                                         'sentence_count',
                                         'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                      index=[])

    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']


        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type']=='witness':
                witness =  witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {'html_file_location': trscrpt['html_file_location'],
                                  'witness_name': p,
                                  'syllable_count': textstat.syllable_count(witness_text),
                                  'lexicon_count': textstat.lexicon_count(witness_text),
                                  'sentence_count': textstat.sentence_count(witness_text),
                                  'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                                  'gunning_fog': textstat.gunning_fog(witness_text),
                                  'smog_index': textstat.smog_index(witness_text),
                                  'text_standard': textstat.text_standard(witness_text)}
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p

    return df_witnesses
Example #22
def count_partsofspeech(article_contents: str) -> dict:
    """
    Returns the parts-of-speech breakdown of a given string, with each tag's
    count expressed as a fraction of the article's word count.

    @param article_contents, a string containing a news article
    @return pos_dict, which contains the parts of speech breakdown of an article
    """
    pos_dict = {}
    text = nltk.word_tokenize(article_contents)

    for word in nltk.pos_tag(text):
        if word[1] in pos_dict:
            pos_dict[word[1]] += 1

        else:
            pos_dict[word[1]] = 1

    for item in pos_dict:
        pos_dict[item] = pos_dict[item] / textstat.lexicon_count(
            article_contents)
    return pos_dict
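An illustrative call to count_partsofspeech; it assumes nltk and textstat are imported as in the original module and that the punkt and averaged_perceptron_tagger NLTK data have already been downloaded:

breakdown = count_partsofspeech("The quick brown fox jumps over the lazy dog.")
print(breakdown)  # tag -> fraction of the word count, e.g. {'DT': ..., 'JJ': ..., 'NN': ..., ...}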
Example #23
def analyseText():
    values = request.get_json()
    required = [ 'inputText' ]
    if not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }

    return jsonify(result), 200
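A hedged sketch of exercising the analyseText view with Flask's test client; app and the '/analyse' route are assumptions here, since the snippet does not show how the view is registered:

with app.test_client() as client:
    resp = client.post("/analyse",
                       json={"inputText": "Short sentences read easily. Long ones do not."})
    print(resp.status_code, resp.get_json()["flesch_reading_ease"])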
Example #24
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename),
                      'r',
                      encoding='utf8',
                      errors='ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(
         data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([
         len(w) for s in sentences for w in s.split(' ')
         if w not in stopwords.words('english')
     ])
     print 'Parse ', len(
         sentences
     ), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length
     self.sentences = sentences
     self.tokens = []
     [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
Example #25
 def stats(self, text):
     test_data = text
     stats = {}
     stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
     stats['smog'] = textstat.smog_index(test_data)
     stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
     stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
     stats['automated'] = textstat.automated_readability_index(test_data)
     stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
     stats['difficult'] = textstat.difficult_words(test_data)
     stats['linsear'] = textstat.linsear_write_formula(test_data)
     stats['gunning_fog'] = textstat.gunning_fog(test_data)
     stats['standard'] = textstat.text_standard(test_data)
     stats['charcount'] = textstat.char_count(test_data)
     stats['lexicon count'] = textstat.lexicon_count(test_data)
     stats['syllable count'] = textstat.syllable_count(test_data)
     stats['sentence count'] = textstat.sentence_count(test_data)
     stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
     stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(
         test_data)
     stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
     stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(
         test_data)
     return stats
Example #26
    def parse_HtmlResponse(self, response):
        item = SeedItem()
        item['url'] = response.url
        item['response_code'] = response.status
        item['response_type'] = 'HTML'

        soup = BeautifulSoup(response.body, "lxml")
        if soup.html.has_attr('lang'):
            lang = soup.html['lang']
            item['declared_language'] = lang

        stripped_text = soup.get_text()
        item['num_words'] = str(textstat.lexicon_count(stripped_text))
        item['fk_grade'] = textstat.flesch_kincaid_grade(stripped_text)

        Detector(stripped_text, True)
        language = Detector(stripped_text)

        if language.reliable:
            lang = language.language.code
            #print "detected language: " + language.language.code + " confidence:" + str(language.language.confidence)
            item['detected_language'] = lang

        num_links = 0
        for href in response.css('a::attr(href)').extract():
            url = response.urljoin(href)
            if self.language_train.is_url_predicted_in_accepted_lang(url):
                #print "link is predicted to be acceptable, keeping: " + url
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=self.parse)
            else:
                print "Skipping url because of predicted language:" + url
            num_links += 1
        item['num_links'] = num_links

        yield item
Example #27
def text_proc(corpus, urlDat={}, WORD_LIM=30, verbose=False):
    if isinstance(corpus, str) and "Redirecting" not in corpus:  # and not str("privacy policy") in corpus:

        if str("some error has occurred while processing your request"
               ) in corpus:
            return {}
        if str("We apologize for the inconvenience...") in corpus:
            return {}
        # if np.mean([len(w) for w in corpus]) > 35:
        # 	return {}

        corpus = corpus.replace("/",
                                " ")  # remove characters that nltk can't read
        corpus = corpus.lower()
        corpus = corpus.replace(u"\xa0", u" ")
        corpus = corpus.replace(u"\\", u" ")
        corpus, this_is_science = extract_science_block(corpus)
        if "semantic" in urlDat.keys():
            if urlDat["semantic"]:
                this_is_science = True
        urlDat["science"] = this_is_science

        # print(corpus)
        # print(this_is_science, "this_is_science")
        urlDat["big_words"] = [word for word in corpus if len(word) > 16]
        ignoreSingleSentences = 1

        corpus = cleanup_pretagger_all(corpus)
        if verbose:
            st.text("pretagger all")
            st.text(type(corpus))

        tokens = word_tokenize(corpus)
        if verbose:
            st.text("token input")
            st.text(tokens)
        tokens = [t for t in tokens if t not in not_want_list]
        # if np.mean([len(t) for t in tokens]) > 50:
        # 	return {}
        # tokens = [t for t in tokens if len(t) < 50]
        # if verbose:
        # 	st.text("token input")
        # 	st.text(tokens)
        wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
            tokens, ignoreSingleSentences=1)

        if len(tokens) < WORD_LIM:
            return {}
        if len(tokens) >= WORD_LIM:

            remainingText = " ".join(remainingText)
            remainingText = remainingText.lower()
            urlDat["standard"] = textstat.text_standard(remainingText,
                                                        float_output=True)
            # st.markdown(urlDat["standard"])
            if wc > 0 and sc > 0:
                if "semantic" in urlDat.keys() or urlDat["standard"] > 95:

                    # else:
                    #    urlDat["hard_snippet"] = None
                    urlDat["fre_unbiased"] = freeAlongtheText(corpus,
                                                              chunk_length=512)
                    fre = FRE(wc, sc, sylCount)

                    if "semantic" in urlDat.keys():
                        if urlDat["semantic"]:
                            ndc = NDC(
                                remainingText, wc, sc
                            )  # calc NDC Index and Perctage Diff Words                                         #calc NDC index
                # if not "fre_unbiased" in urlDat.keys() and urlDat["standard"]>100:
                meanv, total, hard_snippet = complexityAlongtheText(
                    corpus, chunk_length=256)
                urlDat["standard_unbiased"] = meanv
                # urlDat["standard"] = total
                # if this_is_science:
                if "semantic" in urlDat.keys():
                    urlDat["hard_snippet"] = hard_snippet

                    # urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
                    # urlDat["standard"] = ndc[0]
                # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
            # print(urlDat["standard"])
            """
            if "fre_unbiased" in urlDat.keys():
                if (
                    urlDat["fre_unbiased"] < urlDat["standard"]
                    and urlDat["fre_unbiased"] > 0
                ):
                    urlDat["standard"] = urlDat["fre_unbiased"]
                if urlDat["standard"] == 0 and urlDat["fre_unbiased"] > 0:
                    urlDat["standard"] = urlDat["fre_unbiased"]
            """
            # if (
            #    urlDat["standard_unbiased"] < urlDat["standard"]
            #    and urlDat["standard_unbiased"] > 0
            # ):
            #    urlDat["standard"] = urlDat["standard_unbiased"]
            # if fre<urlDat["standard"] and fre>0:
            #    urlDat["standard"] = fre
            # if urlDat["standard"] > 60 and ndc[0]>0 and ndc[0]<60:
            #    urlDat["standard"] = ndc[0]

            # urlDat["concensus"] = np.mean(
            # 	[
            # 		np.mean(fre),
            # 		np.mean(urlDat["standard_unbiased"]),
            # 	]
            # )
            tokens = [w.lower() for w in tokens if w.isalpha()]
            tokens = [w.lower() for w in tokens]  # make everything lower case
            urlDat["wcount"] = textstat.lexicon_count(str(tokens))
            word_lim = bool(urlDat["wcount"] > WORD_LIM)
            # print(urlDat["tokens"])

            if len(tokens):
                if "semantic" in urlDat.keys():
                    urlDat["tokens"] = tokens

                lexicon = textstat.lexicon_count(corpus, True)
                urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
                urlDat["unique_words"] = len(set(tokens))

                # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a' will dominate.
                # Big deltas mean redundancy / sparse information density.

                testimonial = TextBlob(corpus)
                urlDat["sp"] = testimonial.sentiment.polarity
                urlDat["ss"] = testimonial.sentiment.subjectivity
                urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
                urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
                urlDat["gf"] = textstat.gunning_fog(corpus)
    if "standard" in urlDat.keys():
        if urlDat["standard"] == 0:
            if verbose:
                st.text("gets here")
            # return {}

    return urlDat
Example #28
#main script
if __name__ == '__main__':

	print "TextStat Comparison Script"
	print "--------------------------"
	
	#read in text from the command line
	#This needs to be fixed to deal/escape special characters
	textToCheck = raw_input("Please enter the text you would like to analyse: ") 
	
	#read in text from a file- but what format?
	
	print "\n\n"
	print "Results"
	print "=============================================="
	print "==============================================\n"
	
	print "Syllable Count: " + str(textstat.syllable_count(textToCheck))
	print "Lexicon Count: " + str(textstat.lexicon_count(textToCheck)) #TRUE is default and removes punctuation before counting
	print "Sentence Count: " + str(textstat.sentence_count(textToCheck))
	print "Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck))
	print "Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck))
	print "Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck))
	print "SMOG Index: " + str(textstat.smog_index(textToCheck))
	print "Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck))
	print "Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck))
	print "Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck))
	print "Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck))
	print "--------------------------------------------------------------"
	print "Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck))
	print "\n\n"
Example #29
with open(data_file, "r", encoding='UTF-8') as file:
    for test_data in file:
        test_data = test_data.replace("\n", "")
        print(test_data)
        print(
            "-------------------------Text Statistic-----------------------------------"
        )
        print("Returns the number of syllables present in the given text.")
        # print(textstat.syllable_count(test_data, lang='en_US'))
        num_syllables = textstat.syllable_count(test_data, lang='en_US')
        print(num_syllables)
        print(
            "Calculates the number of words present in the text - punctuation removed"
        )
        # print(textstat.lexicon_count(test_data, removepunct=True))
        num_words = textstat.lexicon_count(test_data, removepunct=True)
        print(num_words)
        print("Returns the number of sentences present in the given text.")
        # print(textstat.sentence_count(test_data))
        num_sentences = textstat.sentence_count(test_data)
        print(num_sentences)
        print("difficult words")
        # print(textstat.difficult_words(test_data))
        num_difficult_words = textstat.difficult_words(test_data)
        print(num_difficult_words)

        print(
            "-------------------------Difficulty------------------------------"
        )
        print("The Flesch Reading Ease Score")
        # print(textstat.flesch_reading_ease(test_data))
Example #30
def getReadTimeNewUser(content):
    avg_murica = 200 #wpm
    return textstat.lexicon_count(content) / (avg_murica / 60)
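A one-line sanity check for getReadTimeNewUser (illustrative only): roughly 400 words at 200 wpm should come out to about 120 seconds:

print(getReadTimeNewUser("word " * 400))  # ~120.0 (seconds)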
strip_nums = str.maketrans('', '', digits)
df['text'] = df['text'].apply(lambda x: x.translate(strip_nums))

# Print pro/con text of one review.
#df.loc[6,'pros']
#df.loc[6,'cons']
#df.loc[7,'text']

# Create features for grade level, reading ease, word counts, sentence count, and paragraphs.
# Source: https://pypi.python.org/pypi/textstat/
# Note: \r = paragraph break. \n = white space.
df['read_ease_grade'] = df['text'].apply(
    lambda x: textstat.flesch_kincaid_grade(x))
df['sentence_count'] = df['text'].apply(lambda x: textstat.sentence_count(x))
df['word_count'] = df['text'].apply(lambda x: textstat.lexicon_count(x))
df['word_count_squared'] = (df['word_count'])**2
df['paragraph'] = df['pros'].apply(lambda x: x.count('\r')) + df['cons'].apply(
    lambda x: x.count('\r'))
df['text_ratio'] = (df['textLengthPro'] - df['textLengthCon']) / (
    (df['textLengthPro'] + df['textLengthCon']))

################################################################
#### Stop words, tokenize, stemming.
################################################################

#### Tokenize text.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['text'].apply(lambda x: tokenizer.tokenize(x))

#### Stem text.
Example #32
# comma_count
try:
  comma_count = count_comma(AB) 
except:
  warning_message = 1

# num_syllables
try: 
  num_syllables = textstat.syllable_count(AB)
except: 
  warning_message = 1

# word_count
try:
  word_count = textstat.lexicon_count(AB) 
except: 
  warning_message = 1

# avg_word_len
try: 
  avg_word_len = avg_word_length(AB) 
except:
  warning_message = 1
  
# flesch_score
try: 
  flesch_score = textstat.flesch_reading_ease(AB) 
except:
  warning_message = 1
                    # Build Dataset
                    try:
                        cur = {
                            "title": title,
                            "artist": artist,
                            "year": year,
                            "pos": pos,
                            "lyrics": lyrics,
                            "tags": get_tags(artist),
                            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
                            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
                            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
                            "fog_index": ts.gunning_fog(lyrics_repl),
                            "difficult_words": ts.difficult_words(lyrics_repl),
                            "num_syllables": ts.syllable_count(lyrics_repl),
                            "num_words": ts.lexicon_count(lyrics_repl, True),
                            "num_lines": ts.sentence_count(lyrics_repl),
                            "num_dupes": count_dupes(lyrics)
                        }
                        # print cur
                        dataset.append(cur)
                    except Exception, e:
                        print e

            except Exception, e:
                print "Exception occurred for " + artist + ' - ' + title
                print e

    outfile = "years/" + str(year) + '.txt'
    dir = os.path.dirname(outfile)
    if not os.path.exists(dir):
Example #34
def get_word_count(string):
    return textstat.lexicon_count(string, False)
Example #35
from textstat.textstat import textstat
import pandas as pd

key = 'adbbd909ff7241929e6a6c6a5e938f3f'
archive = ArchiveAPI(key)
data = []
for year in range(1950, 2016):
    for month in range(1, 13):
        contents = archive.query(year, month)
        date = str(year) + '-' + str(month)
        print date
        headlines = []
        total = 0.0
        count = 0.0
        length = 0.0
        for articles in contents['response']['docs']:
            #print articles
            count = count + 1
            length = length + textstat.lexicon_count(str(articles['headline']))
            total = total + textstat.flesch_reading_ease(
                str(articles['headline']))
        data.append((date, total / count, length / count))
        print count
print data
labels = ['date', 'flesch_reading_ease', 'average_length']
df = pd.DataFrame.from_records(data, columns=labels)
df.to_csv('headlines.csv')
print df
#print data['1950-12']

#if articles['news_desk'] == 'National Desk'or articles['news_desk'] == None:
Example #36
if __name__ == '__main__':

    # prompt user for file and open it
    user_input = input("Enter file name to open: ")
    input_string = open(user_input).read()
    user_input = input("Enter file name to write to: ")

    # declare/initialize lists
    copy_string = input_string.split()
    words_with_synonyms = []
    the_synonyms = []

    # get number of syllables, words, sentences, and FK score for the file
    num_syllables = textstat.syllable_count(input_string)
    num_words = textstat.lexicon_count(input_string)
    num_sentences = textstat.sentence_count(input_string)
    fk_score = 206.835 - float(1.015 * (num_words / num_sentences)) - float(
        84.6 * (num_syllables / num_words))

    # print number of syllables, words, and sentences in the file
    print("\nNumber of syllables: ", num_syllables)
    print("Number of words: ", num_words)
    print("Number of sentences: ", num_sentences)

    output = synonym_replacement(input_string, copy_string)
    #output = remove_adjective(output, copy_string)
    initial_grade = check_reading_level(input_string)
    new_grade = check_reading_level(output)

    new_num_syllables = textstat.syllable_count(output)
#!/bin/python

import sys, string, os
from textstat.textstat import textstat

inputfile = ''
test_data = ""

script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	test_data="".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 + ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' + var12 + ',' + var13)
def calculate_number_of_lexicons(review):
    if len(review) > 0:
        return math.sqrt(math.sqrt(textstat.lexicon_count(review)))
    else:
        return 0
        response = opener.open(qqll)
        rt = response
        raw_html = response.read()
        g = goose.Goose()
        a = g.extract(raw_html=raw_html)
        htext = a.cleaned_text
        opinion = TextBlob(htext)
        pol = opinion.sentiment.polarity
        sub = opinion.sentiment.subjectivity
        rt = requests.get(qqll).elapsed.total_seconds()
        kw = str(keywords(htext, lemmatize=True))
        kw = kw.replace('\r', ' ').replace('\n', ' ')
        keyw = ' '.join(kw.split()[:3])
        sbody = htext.replace(',', '')
        fkg = textstat.flesch_kincaid_grade(htext)
        wc = textstat.lexicon_count(htext)
        sc = textstat.sentence_count(htext)
        fre = textstat.flesch_reading_ease(htext)
        sinsite = [
            'response time', 'subjective', 'polarity', 'fgrade', 'fscore',
            'words.counts', 'sentence.count', 'keywords', 'title', 'link',
            'text'
        ]
        wr.writerow(sinsite)
        insite = [rt, sub, pol, fkg, fre, wc, sc, keyw, a.title, qqll]
        wr.writerow(insite)

    rec = re.compile(r"https?://(www\.)?")
    zz = rec.sub('', qqll).strip().strip('/')
    with open('rowTwittersite.csv', 'w') as tsout:
        wr = csv.writer(tsout, quoting=csv.QUOTE_ALL)
Example #40
def lexicon_count_diff(q1, q2):
    return textstat.lexicon_count(q1) - textstat.lexicon_count(q2)
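A tiny illustrative check for lexicon_count_diff; it assumes textstat is imported as in the original module (the exact count depends on textstat's tokenization):

print(lexicon_count_diff("How do I learn Python quickly?", "How can I learn Python?"))  # 6 - 5 -> 1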
Example #41
    def __init__(self, path):
        """
        Create document instance for analysis.

        Opens and reads document to string raw_text.
        Textract interprets the document format and
        opens to plain text string (docx, pdf, odt, txt)

        Args:
            path (str): path to file to open, analyze, close


        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  tbd
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regex expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of sentences found in list.
        -passive_sentences:  list of passive sentences identified by the
        passive module.
        -passive_sentence_count:  count of the passive_sentences list.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -be_verb_analysis:  (int) sum number of occurrences of each to be verb
        (am, is, are, was, were, be, being been).
        -be_verb_count: tbd
        -be_verb_analysis: tbd
        -weak_sentences_all:  (int) sum of be verb analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -word_tokens:  list of discreet words in text that breaks
        contractions up (default nltk tokenizer).
        -word_tokens_no_punct:  list of all words in text including
        contractions but otherwise no punctuation.
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  uses white-space tokenizer to create a list
        of all words.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm implemented by textstat.
        -readability_flesch_kincaid_grade:  (int) Flesch-Kincaid grade level
        of the reader as determined by textstat.
        -readability_coleman_liau_index:  (int) grade level of the reader as
        determined by textstat.
        -readability_ari:  (int) grade level of the reader determined by the
        Automated Readability Index algorithm implemented by textstat.
        -readability_linser_write:  (int) grade level as determined by the
        Linsear Write algorithm implemented by textstat.
        -readability_dale_chall:  (int) grade level based on the Dale-Chall
        readability formula as determined by textstat.
        -readability_standard:  composite grade level based on all of the
        readability algorithms above.
        -flesch_re_key:  list for interpreting Flesch RE Score.
        -word_count:  word count of the document based on the white-space
        tokenizer; this is the word count that should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) word, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this is the second time this attribute is defined).
        -freq_words:  word frequency count (not standardized) based on the
        correct word tokenizer (a raw count, not a ratio).
        modal_dist:  count of auxiliary verbs based on word_tokens_no_punct.
        sentence_count (int): Count the sentence tokens
        passive_sentences (list): List of all sentences identified as passive
        passive_sentence_count (int): count of items in passive_sentences
        be_verb_count (int): count "to be" verbs in text
        word_tokens_no_punct (list): words separated, stripped of punctuation,
         made lower case
        flesch_re_key (str): reading ease score to description
        freq_words (list or dict): frequency distribution of all words
        modal_dist (list): frequency distribution of aux verbs
        """
        self.user = ""
        self.path = path
        self.abs_path = os.path.abspath(self.path)
        if os.path.isfile(self.path):
            self.time_stamp = self.timestamp()
            self.file_name = os.path.basename(path)
            self.mime = MimeTypes()
            self.guessed_type = self.mime.guess_type(self.path)
            self.file_type = self.guessed_type[0]
            self.raw_text = textract.process(self.path, encoding="ascii")
            self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
            self.ptext = re.sub(u"\u2014", "--", self.ptext)
            self.ptext = re.sub(",", ",", self.ptext)
            self.ptext = re.sub("—", "--", self.ptext)
            self.ptext = re.sub("…", "...", self.ptext)
            self.text_no_feed = self.clean_new_lines(self.ptext)
            self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
            self.sentence_count = len(self.sentence_tokens)
            self.passive_sentences = passive(self.text_no_feed)
            self.passive_sentence_count = len(self.passive_sentences)
            self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                           float(self.sentence_count)))
            self.percent_passive_round = round(self.percent_passive, 2)

            self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
            self.be_verb_count = self.be_verb_analysis[0]
            self.weak_sentences_all = self.be_verb_analysis[1]
            self.weak_sentences_set = set(self.weak_sentences_all)
            self.weak_sentences_count = len(self.weak_sentences_set)
            self.weak_verbs_to_sentences = 100 * float(
                self.weak_sentences_count) / float(self.sentence_count)
            self.weak_verbs_to_sentences_round = round(
                self.weak_verbs_to_sentences, 2)
            self.word_tokens = self.word_tokenize(self.text_no_feed)
            self.word_tokens_no_punct = \
                self.word_tokenize_no_punct(self.text_no_feed)
            self.no_punct = self.strip_punctuation(self.text_no_feed)
            # use this! It make lower and strips symbols
            self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)


            self.readability_flesch_re = \
                textstat.flesch_reading_ease(self.text_no_feed)
            self.readability_smog_index = \
                textstat.smog_index(self.text_no_feed)
            self.readability_flesch_kincaid_grade = \
                textstat.flesch_kincaid_grade(self.text_no_feed)
            self.readability_coleman_liau_index = \
                textstat.coleman_liau_index(self.text_no_feed)
            self.readability_ari = \
                textstat.automated_readability_index(self.text_no_feed)
            self.readability_linser_write = \
                textstat.linsear_write_formula(self.text_no_feed)
            self.readability_dale_chall = \
                textstat.dale_chall_readability_score(self.text_no_feed)
            self.readability_standard = \
                textstat.text_standard(self.text_no_feed)

            self.flesch_re_desc_str = self.flesch_re_desc(
                int(textstat.flesch_reading_ease(self.text_no_feed)))
            self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
            self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
            self.avg_syllables_per_word = textstat.avg_syllables_per_word(
                self.text_no_feed)
            self.avg_sentence_per_word = textstat.avg_sentence_per_word(
                self.text_no_feed)
            self.avg_sentence_length = textstat.avg_sentence_length(
                self.text_no_feed)
            self.avg_letter_per_word = textstat.avg_letter_per_word(
                self.text_no_feed)
            self.difficult_words = textstat.difficult_words(self.text_no_feed)
            self.rand_passive = self.select_random(self.passive_sentence_count,
                                                   self.passive_sentences)
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences_all), self.weak_sentences_all)
            if self.word_tokens_no_punct:
                self.word_count = len(self.word_tokens_no_punct)
                self.page_length = float(self.word_count) / float(250)
                self.paper_count = int(math.ceil(self.page_length))
                self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
                self.pos_counts = Counter(
                    tag for word, tag in self.parts_of_speech)
                self.pos_total = sum(self.pos_counts.values())
                self.pos_freq = dict(
                    (word, float(count) / self.pos_total)
                    for word, count in self.pos_counts.items())
                self.doc_pages = float(float(self.word_count) / float(250))
                self.freq_words = \
                    self.word_frequency(self.word_tokens_no_punct)
                self.modal_dist = self.modal_count(self.word_tokens_no_punct)
                # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
                self.pos_count_dict = self.pos_counts.items()

            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)