Example #1
def check_reading_level(input_string):
    level = [
        "college graduate", "college", "12th grade", "11th grade",
        "10th grade", "9th grade", "8th grade", "7th grade", "6th grade",
        "5th grade"
    ]
    grade = []
    fk_grade = textstat.flesch_kincaid_grade(input_string)
    if fk_grade <= 5:
        grade.append(level[9])
    elif fk_grade <= 6:
        grade.append(level[8])
    elif fk_grade <= 7:
        grade.append(level[7])
    elif fk_grade <= 8:
        grade.append(level[6])
    elif fk_grade <= 9:
        grade.append(level[5])
    elif fk_grade <= 10:
        grade.append(level[4])
    elif fk_grade <= 11:
        grade.append(level[3])
    elif fk_grade <= 12:
        grade.append(level[2])
    elif fk_grade <= 13:
        grade.append(level[1])
    else:
        grade.append(level[0])

    grade_string = " ".join(grade)
    return grade_string
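A minimal usage sketch for check_reading_level, assuming the textstat package is installed and imported as in the other snippets in this listing; the sample string and the expected label are illustrative only:

from textstat.textstat import textstat

sample = "The quick brown fox jumps over the lazy dog. It was a bright, cold day."
print(check_reading_level(sample))  # prints a single label, e.g. "5th grade"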
Example #2
def other_features(tweet):
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet),3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet),3)
    gunning_fog = round(textstat.gunning_fog(tweet),3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    #Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [sentiment['compound'],flesch, flesch_kincaid,
                gunning_fog, length, num_terms,
                hashtag_count, mention_count,
                url_count, retweet]
    return features
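other_features relies on two names this snippet does not define: textstat and VS. A plausible setup sketch, assuming VS is VADER's polarity_scores function; the imports below are assumptions, not shown in the original:

from textstat.textstat import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

VS = SentimentIntensityAnalyzer().polarity_scores  # assumed binding for VS

print(other_features("RT @user: readability scores for tweets http://example.com #nlp"))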
Example #3
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
     ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
Example #4
 def analysis(self):
     # Find political issues in document
     existing_issues, trigger_terms = self._find_issues()
     self.document.issues = json.dumps(existing_issues)
     #  Get text frequency words and basic document stats
     content_words = get_lowercase(get_words(self.document.content))
     self.document.word_count = len(content_words)
     self.document.unique_word_count = len(set(content_words))
     content_words_without_stopwords = remove_stop_words(content_words)
     self.document.word_frequency = json.dumps(nltk.FreqDist(content_words_without_stopwords).most_common(10))
     # Run Textblob's document analysis to get sentiment and noun phrases
     analysis = TextBlob(self.document.content)
     self.document.blob_keywords = json.dumps(analysis.noun_phrases)
     self.document.blob_sentiment = str(analysis.sentiment)
     #3: RAKE keywords for each page
     rake_results = self.rake.run(self.document.content)
     self.document.rake_keywords = json.dumps(rake_results[:5])
     # 4 Sentiment
     self.document.nltk_sentiment = self._check_sentiment()
     # 5 Readability
     if self.document.content.strip():
         self.document.readability = textstat.flesch_kincaid_grade(self.document.content)
     # Update summary object with issue information ('summary' is a module-level
     # dict defined elsewhere in the original module)
     for issue in existing_issues:
         key = issue.get('key')
         if key not in summary['issues']:
             summary['issues'][key] = { 'examples': [], 'content': ''}
         summary['issues'][key]['content'] += (' ' + self.document.content)
         summary['issues'][key]['examples'].append({
             'title': self.document.title,
             'word_count': self.document.word_count,
             'unique_word_count': self.document.unique_word_count,
             'sentiment': self.document.nltk_sentiment,
             'readability': self.document.readability
         })
Example #5
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous reading scores and methods. Also calculates
    other factors such as the number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the readability
        scores, the typo-to-word ratio, and the share of difficult words
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading":
        textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid":
        textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau":
        textstat.coleman_liau_index(article_contents),
        "typos_to_words":
        len(tool.check(article_contents)) /
        textstat.lexicon_count(article_contents),
        "percent_difficult_words":
        textstat.difficult_words(article_contents) /
        textstat.lexicon_count(article_contents),
    }

    return language_analysis_dict
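A usage sketch for get_text_features (assumes the language-check and textstat packages are installed; LanguageTool needs a Java runtime, and an empty article would make both ratio features divide by zero):

article = "This articel has one obvious typo and several genuinely difficult words."
features = get_text_features(article)
print(features["flesch_kincaid"], features["typos_to_words"])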
Example #6
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of details
    """
    group_by = 'Reading Level Analysis '
    results = []
    results.append(TextFeature('Flesch Reading Ease', textstat.flesch_reading_ease(no_code_text), group_by))        # higher is better, scale 0 to 100
    results.append(TextFeature('Flesch-Kincaid Grade Level', textstat.flesch_kincaid_grade(no_code_text), group_by))
    try:
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', textstat.gunning_fog(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', "Undetermined", group_by))
    try:
        results.append(TextFeature('The SMOG Index', textstat.smog_index(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The SMOG Index', "Undetermined", group_by))
    results.append(TextFeature('Automated Readability Index', textstat.automated_readability_index(no_code_text), group_by))
    results.append(TextFeature('The Coleman-Liau Index', textstat.coleman_liau_index(no_code_text), group_by))
    try:
        results.append(TextFeature('Linsear Write Formula', textstat.linsear_write_formula(no_code_text), group_by))
    except IndexError:
        results.append(TextFeature('Linsear Write Formula', "Undetermined", group_by))
    try:
        results.append(TextFeature('Dale Chall Readability Score', textstat.dale_chall_readability_score(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('Dale Chall Readability Score', "Undetermined", group_by))

    try:
        results.append(TextFeature('Readability Consensus', textstat.readability_consensus(no_code_text), group_by))
    except (TypeError, IndexError):
        results.append(TextFeature('Readability Consensus', "Undetermined; One of the tests above failed.", group_by))
    return results
Example #7
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
Example #8
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")


    # 'adjectives' is computed earlier in the original module (not shown in this snippet)
    return len(adjectives)
Example #9
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            song["num_difficult_words"] = textstat.difficult_words(
                song["text_raw"])
        except Exception as e:
            logging.error(
                "Something went wrong with the current song! Skipping it...\n{}"
                .format(song))
            logging.exception(e)
    return lyrics
Example #10
def readability(text, file):
    fog = textstat.gunning_fog(text)
    fres = textstat.flesch_reading_ease(text)
    fkgl = textstat.flesch_kincaid_grade(text)
    file.write(
        '\nGunning Fog Index: %.2f \nFlesch Reading Ease: %.2f \nFlesch-Kincaid Grade: %.2f'
        % (fog, fres, fkgl))
Example #11
    def decide_if_assigned(self, person):
        reading_levels = {}
        for my_product in person.all_products:
            text = ""
            if my_product.title:
                text += u" " + my_product.title
            if my_product.get_abstract_using_mendeley():
                text += u" " + my_product.get_abstract_using_mendeley()

            # only do if at least three words between periods,
            # otherwise textstat library prints too many Not Enough Words error messages
            if text:
                sentences = text.split(".")
                if any([len(sentence.split()) > 3 for sentence in sentences]):
                    try:
                        grade_level = textstat.flesch_kincaid_grade(text)
                        # print u"grade level is {} for {}; text: {}".format(grade_level, my_product.doi, text)
                        if grade_level > 0:
                            # is sometimes negative, strangely.  examples in ethan's profile
                            reading_levels[my_product.doi] = grade_level
                    except TypeError:  #if text is too short it throws this
                        pass

        if reading_levels.values():
            average_reading_level = sum(reading_levels.values()) / float(
                len(reading_levels))
            if average_reading_level <= 14:
                self.candidate_badge.value = average_reading_level
                self.assigned = True
Example #12
def main():
    csv_file2 = open(sys.argv[2], 'w', encoding="utf8")
    writer = csv.writer(csv_file2, delimiter=',')
    doc_id = 1
    writer.writerow(["ID", "URL", "text", "impact-score", "readability", "grade-level", "smog-index", "total-words", "total-sentences"])
    with open(sys.argv[1], 'r',  encoding="utf8", errors='ignore') as csv_file1:
        reader = csv.reader(csv_file1)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)

            writer.writerow([doc_id]+[url]+[text]+[impact]+[read_ease]+[grade]+[smog]+[words]+[sentences])
            doc_id = doc_id+1
    csv_file1.close()
    csv_file2.close()

    print('Summary statistics complete!')
Example #13
def compareContents():
    if request.method == "POST":
        line = request.form['poem']
        poem1 = request.form['poem1']
        # ---------Metrics comparison logic goes here; keep the results in session attributes---------
        session['line'] = line
        fre = textstat.flesch_reading_ease(line)
        session['fre'] = fre
        smog = textstat.smog_index(line)
        session['smog'] = smog
        fkg = textstat.flesch_kincaid_grade(line)
        session['fkg'] = fkg
        dcr = textstat.dale_chall_readability_score(line)
        session['dcr'] = dcr
        gf = textstat.gunning_fog(line)
        session['gf'] = gf
        metrics = True
        return render_template('compareContents.html', metrics=metrics, line=line,
                               fre=fre, smog=smog, fkg=fkg, dcr=dcr, gf=gf)
    return render_template('compareContents.html')
Example #14
def getReadabilityStats(text):

    # get scores
    fleschGrade = textstat.flesch_kincaid_grade(text)

    # store
    return {'fleschGrade': fleschGrade}
Example #15
def flesch_grade_score():
    df.drop(['BodyFleschKinkaidGradeLevel'],
            inplace=True,
            axis=1,
            errors='ignore')
    print(df.shape, "dropped a m**********r")
    tokenizer = RegexpTokenizer(r'\w+')
    final_flesch_kincaid_grade_score = []
    for index, row in df.iterrows():
        valid_words = []
        body_only = re.sub('<code>[^>]+</code>', '', row['Body'])
        soup = BeautifulSoup(body_only, "lxml")
        word_tokens = tokenizer.tokenize(soup.text)
        for word in word_tokens:
            if not_punctuation(word):
                valid_words.append(word)
        word_count = len(valid_words)
        print "word_count of ", index, " - ", word_count
        tag_removed_text = soup.text
        tag_removed_text = tag_removed_text.replace("\n", "")
        # syllables_count = get_syllables_count(valid_words)
        # print "inside flesch for loop - ",index
        # sentence_token = sent_tokenize(tag_removed_text)
        # sentences_count = len(sentence_token)
        if word_count != 0:
            flesch_kincaid_grade_score = textstat.flesch_kincaid_grade(
                tag_removed_text)
        else:
            flesch_kincaid_grade_score = 0
        print "flesch_grade_score of ", index, " - ", flesch_kincaid_grade_score
        final_flesch_kincaid_grade_score.append(flesch_kincaid_grade_score)

    df['BodyFleschKinkaidGradeLevel'] = final_flesch_kincaid_grade_score
    df.to_csv("combined.csv")
Example #16
def validate_readability_english(d):
    # Run the supplied string through the Flesch-Kincaid readability grade test
    score = textstat.flesch_kincaid_grade(d)
    if score <= 8:
        return 1
    else:
        return 0
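A quick sketch of both branches; the sample strings are illustrative and the labels hedged, since scores vary slightly across textstat versions:

print(validate_readability_english("The cat sat on the mat."))  # likely 1: at or below 8th grade
print(validate_readability_english(
    "Notwithstanding the aforementioned considerations, the ramifications remain indeterminate."))  # likely 0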
Example #17
def f():
    print("hello")
    book = xlwt.Workbook()
    worksheet = book.add_sheet('ReadabilityScore')
    worksheet.write(0, 0, "Gen_sent")
    worksheet.write(0, 1, "flesch_reading_ease")
    worksheet.write(0, 2, "flesch_kincaid_grade")
    worksheet.write(0, 3, "dale_chall_readability_score")
    worksheet.write(0, 4, "gunning_fog")

    f = open('abc.txt')  #, encoding='utf-8')
    row = 1
    for line in iter(f):
        #print("i am in row : ",row)
        #print "Tagline :", line
        worksheet.write(row, 0, line)
        #print("no of words= ",len(line.split()))
        #line1 = line.lstrip('0123456789.- ,')
        #print "flesch_reading_ease = ",textstat.flesch_reading_ease(line)
        fre = textstat.flesch_reading_ease(line)
        worksheet.write(row, 1, fre)
        #print "smog_index = ",textstat.smog_index(line)
        smog = textstat.smog_index(line)
        #print "flesch_kincaid_grade = ",textstat.flesch_kincaid_grade(line)
        fkg = textstat.flesch_kincaid_grade(line)
        worksheet.write(row, 2, fkg)
        #print "dale_chall_readability_score = ", textstat.dale_chall_readability_score(line)
        dcr = textstat.dale_chall_readability_score(line)
        worksheet.write(row, 3, dcr)
        #print "gunning_fog = ",textstat.gunning_fog(line)
        gf = textstat.gunning_fog(line)
        worksheet.write(row, 4, gf)
        row += 1
    book.save('Readability_Scores.xls')
Example #18
def other_features(tweet):
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    #Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [
        sentiment['compound'], flesch, flesch_kincaid, gunning_fog, length,
        num_terms, hashtag_count, mention_count, url_count, retweet
    ]
    return features
Example #19
def main():
    """
    Evaluate and print Readability scores
    """

    if len(sys.argv) > 1:
        inf = open(sys.argv[1], 'r')
    else:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()

    text = inf.read()
    inf.close()

    lexcount = textstat.lexicon_count(text)

    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))
    
    # reading time in minutes
    # assumes 180 WPM plus some offset
    tread = (lexcount + 250) / 180.

    sys.stdout.write('Estimated reading time: {0:1.1f} minutes.\n'.format(tread))

    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)

    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
Example #20
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example #21
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_reading_ease(x))
        df['smog_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(
            lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words{}'.format(i)] = df[col].apply(
            lambda x: textstat.difficult_words(x))
        df['linsear_write_formula{}'.format(i)] = df[col].apply(
            lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog{}'.format(i)] = df[col].apply(
            lambda x: textstat.gunning_fog(x))
        df['text_standard{}'.format(i)] = df[col].apply(
            lambda x: textstat.text_standard(x))
    return df
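A usage sketch for get_readability, assuming textstat is imported as in the other examples and that every object-dtype column holds text; the DataFrame contents are illustrative:

import pandas as pd

df = pd.DataFrame({"text": ["Short sentence.",
                            "A considerably more elaborate and polysyllabic construction follows."]})
scored = get_readability(df)
print(scored.filter(like="flesch").head())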
Example #22
    def decide_if_assigned_threshold(self, person, threshold):
        reading_levels = {}
        for my_product in person.all_products:
            text = ""
            if my_product.title:
                text += u" " + my_product.title
            if my_product.get_abstract_using_mendeley():
                text += u" " + my_product.get_abstract_using_mendeley()

            # only do if at least three words between periods,
            # otherwise textstat library prints too many Not Enough Words error messages
            if text:
                sentences = text.split(".")
                if any([len(sentence.split())>3 for sentence in sentences]):
                    try:
                        grade_level = textstat.flesch_kincaid_grade(text)
                        # print u"grade level is {} for {}; text: {}".format(grade_level, my_product.doi, text)
                        if grade_level > 0:
                            # is sometimes negative, strangely.  examples in ethan's profile
                            reading_levels[my_product.doi] = grade_level
                    except TypeError:  #if text is too short it throws this
                        pass

        if reading_levels.values():
            average_reading_level = sum(reading_levels.values()) / float(len(reading_levels))
            if average_reading_level <= 14:
                self.candidate_badge.value = average_reading_level
                self.assigned = True
Example #23
def text_analysis(x):
    for result in x:
        blob = TextBlob(result['summary'])
        for text in blob.sentences:
            result['pola'] = '%.2f' % (abs(text.sentiment.polarity * 10) / 2)  # polarity in [-1, 1], scaled to [0, 5]
            result['subj'] = '%.2f' % (abs(text.sentiment.subjectivity * 10) / 2)  # subjectivity in [0, 1], scaled to [0, 5]
            result['reada'] = '%.2f' % textstat.flesch_kincaid_grade(result['summary'])  # U.S. grade level
    return x
Example #24
def textConfidence(fname):
    with PyTessBaseAPI() as api:
        #for image in images:
        api.SetImageFile(fname)
        text = api.GetUTF8Text()
        #print api.AllWordConfidences()
        print(textstat.flesch_kincaid_grade(text))

        print(textstat.flesch_reading_ease(text))

        print("90-100 : Very Easy")
        print("80-89 : Easy")
        print("70-79 : Fairly Easy")
        print("60-69 : Standard")
        print("50-59 : Fairly Difficult")
        print("30-49 : Difficult")
        print("0-29 : Very Confusing")
Example #25
    def get_statistics(self, f, content):
        content = content.lower()

        reading_level = textstat.flesch_kincaid_grade(content)
        word_count = textstat.lexicon_count(content)
        keyword_frequency = list(map(lambda x: x[1],
                                     self.get_keyword_frequency(content)))
        sentiment = DocumentStatistics.get_sentiment(content)
        return [f, reading_level, word_count] + keyword_frequency + sentiment
Example #26
def vecify(v):
    return [ts.flesch_reading_ease(v),
            # ts.smog_index(v),
            ts.flesch_kincaid_grade(v),
            ts.coleman_liau_index(v),
            ts.automated_readability_index(v),
            ts.dale_chall_readability_score(v),
            ts.difficult_words(v),
            ts.linsear_write_formula(v),
            ts.gunning_fog(v)]
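vecify assumes textstat is bound to the name ts; a minimal sketch with that binding:

from textstat.textstat import textstat as ts

print(vecify("Readability metrics reduce a text to a small numeric feature vector."))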
Example #27
def add_reading_levels(df):
    for row, body in enumerate(df['body']):
        df.loc[row, 'flesch_kincaid'] = textstat.flesch_kincaid_grade(body)
        df.loc[row, 'fk_score'] = textstat.flesch_reading_ease(body)
        #df.loc[row,'smog_index']=textstat.smog_index(body)
        df.loc[row, 'gunning_fog'] = textstat.gunning_fog(body)
        #df.loc[row,'difficult_words']=textstat.difficult_words(body)
        #df.loc[row,'text_standard']=textstat.text_standard(body)
    return df
Example #28
def complex_str_pipeline(s):
    sh_ent = renyi_entropy(s, alpha=1)  # shannon
    col_ent = renyi_entropy(s, alpha=2)  # collision
    sh_delta = shannon_ideal(s,
                             logbase=2) - sh_ent  # distance to ideal encoding
    f_ease = ts.flesch_reading_ease(s)  # Flesch reading ease
    fk_grade = ts.flesch_kincaid_grade(s)  # Flesch–Kincaid grade level
    lix_scr = lix(s)
    #return {'Shannon': sh_ent,'Collision': col_ent, 'Delta': sh_delta,
    #'Flesch_ease': f_ease, 'Flesch_Kincaid': fk_grade, 'LIX': lix_scr}
    return [sh_ent, col_ent, sh_delta, f_ease, fk_grade, lix_scr]
Example #29
def all_trad_scores(text):
    fre = textstat.flesch_reading_ease(text)
    fkg = textstat.flesch_kincaid_grade(text)
    smog = textstat.smog_index(text)
    cole = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dale = textstat.dale_chall_readability_score(text)
    linsear = textstat.linsear_write_formula(text)
    gunning = textstat.gunning_fog(text)

    return [fre, fkg, smog, cole, ari, dale, linsear, gunning]
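A usage sketch unpacking the eight traditional scores in order; the input string is illustrative:

fre, fkg, smog, cole, ari, dale, linsear, gunning = all_trad_scores(
    "Readability formulas compress a whole text into a single number.")
print(fkg, gunning)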
Example #30
def reading_level(lyrics):
    r = textstat.flesch_reading_ease(lyrics)  # the thresholds below match the 0-100 reading-ease scale, not the grade scale
    if r >= 90:
        return '5th Grade'
    elif r >= 65:
        return 'Middle School'
    elif r >= 50:
        return 'High School'
    elif r >= 30:
        return 'College'
    else:
        return 'College Graduate'
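A short usage sketch; since the thresholds follow textstat's 0-100 reading-ease scale, easy text maps to '5th Grade' and dense text to 'College Graduate' (labels hedged, as scores vary across textstat versions):

print(reading_level("See Spot run. Run, Spot, run!"))  # likely '5th Grade'
print(reading_level(
    "Epistemological considerations notwithstanding, the methodology remains contested."))  # likely 'College Graduate'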
Example #31
    def score_text(self, txt):
        ease = float('%.2f' % (100 - textstat.flesch_reading_ease(txt)))
        grade_raw = float('%.2f' % textstat.flesch_kincaid_grade(txt))
        grade = grade_raw if grade_raw <= 12 else '12+ (%s)' % grade_raw

        self.file.config(text='Filename: %s' % self.filename)
        self.ease.config(text='Flesch Reading Ease scale score: %s' % ease)
        self.grade.config(text='Flesch-Kincaid Grade scale score: %s' % grade)
        if self.filename not in self.files:
            self.files.append(self.filename[:self.filename.find('.')])
            self.easeScores.append(ease)
            self.gradeScores.append(grade_raw)
Example #32
    def predict_trust(self, profile, strip_html=True):
        """Predicts the trustworthiness of a profile.

        Segments the input with sentence-level granularity, returning the
        probability that the profile represented by the input is perceived to
        be more trustworthy compared to other profiles of similar length.

        Args:
            profile: An Airbnb host profile, as a string.
            strip_html: Whether HTML tags in the input should be stripped. True
                by default, but can be disabled for speed if the input is known
                to be sanitized.

        Returns:
            An AirProfile.Prediction object for trustworthiness of the profile.

        Raises:
            ValueError: If the input is an invalid or empty string.
            IOError: If the LIWC trie is not available.
        """
        if not (profile and profile.strip()):
            raise ValueError

        if not (self.__liwc_path.exists() and self.__liwc_path.is_file()):
            raise IOError

        sentence_tokens = self.__preprocess(profile, strip_html)
        liwc_features = self.__liwc.summarize(profile, sentence_tokens)

        word_count = liwc_features['WC']
        liwc_features['wc_log'] = np.log(word_count)
        liwc_features['readability'] = ts.flesch_kincaid_grade(
            profile.decode('utf-8'))

        prediction_agg = np.zeros(len(self.__classifier_cat))  # zeros, not empty: values are accumulated below
        for sent in sentence_tokens:
            prediction_agg += np.array(
                [c.predict for c in self.__classify_sentence(sent)])

        for idx, cat in enumerate(FEAT_WC_CATEGORIES):
            liwc_features[cat] = prediction_agg[idx]

        feats = [
            liwc_features[f]
            for f in AirProfile.__get_trust_model_feat_cols(word_count)
        ]
        feats_shape = np.array(feats).reshape(1, -1)
        model = self.__get_classifier_trust(
            AirProfile.__get_trust_model_fname(word_count))

        return self.Prediction(
            np.round(model.predict_proba(feats_shape)[0][1], 2),
            model.predict(feats_shape)[0])
Example #33
 def reading_difficulty(self):
     diff_words = textstat.difficult_words(self.text) / self.nword
     flesch_kincaid = textstat.flesch_kincaid_grade(self.text)
     coleman_liau = textstat.coleman_liau_index(self.text)
     ari = textstat.automated_readability_index(self.text)
     dale_chall = textstat.dale_chall_readability_score(self.text)
     linsear = textstat.linsear_write_formula(self.text)
     gunning_fog = textstat.gunning_fog(self.text) - 6
     smog = textstat.smog_index(self.text)
     avg_grade = max(
         math.ceil((flesch_kincaid + coleman_liau + ari + dale_chall +
                    linsear + gunning_fog + smog) / 7), 12)
     return avg_grade, diff_words
Example #34
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
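Note that readability_consensus only exists in older textstat releases; newer versions renamed it to text_standard. A guarded call, sketched as a compatibility shim:

# Hedged compatibility shim: fall back to text_standard on newer textstat releases.
try:
    rc = textstat.readability_consensus(profile_text)
except AttributeError:
    rc = textstat.text_standard(profile_text)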
Example #35
def c():
    resp = table.scan(FilterExpression=Attr("fips").exists())
    sorted_by_fips = {}
    sorted_by_fips['items'] = []
    for item in resp['Items']:
        concat = " ".join(item['tweet'])
        grade = textstat.flesch_kincaid_grade(concat)

        sorted_by_fips['items'].append({
            'id': str(item['fips']),
            'rate': grade
        })

    return json.dumps(sorted_by_fips)
Example #36
def get_readability(contents):
    readability = []
    readability.append(textstat.flesch_reading_ease(contents))
    readability.append(textstat.smog_index(contents))
    readability.append(textstat.flesch_kincaid_grade(contents))
    readability.append(textstat.automated_readability_index(contents))
    readability.append(textstat.dale_chall_readability_score(contents))
    readability.append(textstat.difficult_words(contents))
    readability.append(textstat.linsear_write_formula(contents))
    readability.append(textstat.gunning_fog(contents))
    readability.append(textstat.coleman_liau_index(contents))
    readability.append(textstat.text_standard(contents))

    return readability
Example #37
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
     print('Parsed', len(sentences), 'sentences, average sentence length', self.avg_sentence_length, ', average word length', self.avg_word_length)
     self.sentences = sentences
     self.tokens = []
     for sentence in sentences:
         self.tokens.extend(text_tokenize(sentence))
Example #38
def age_feature(text, feature_vect):
    """
    Extract age features
    :param text:
    :param feature_vect: contains a bag of words
    :return:a dictionary which contains the feature and its computed value
    """
    tokens = word_tokenize(text.lower())

    features = {}
    for word in feature_vect:
        features['contains(%s)' % word] = (word in set(tokens))
    return dict(features, **dict({'FRE': textstat.flesch_reading_ease(text),
                                  'FKGL': textstat.flesch_kincaid_grade(text)}))
Example #39
def flesch_from_list(frequent_word_list):
    '''
    If we want advanced text, this function allows us to run Flesch Kincaid
    on a list of frequent words that we turn back into a string to process.

    Input:
      frequent_word_list: a list of frequent words

    Returns:
      This functions returns the Flesch Kincaid grade of the most
      frequent words.
    '''

    freq_words_string = " ".join(frequent_word_list)
    return textstat.flesch_kincaid_grade(freq_words_string)
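A usage sketch; joining isolated frequent words into one artificial "sentence" makes the grade reflect word length rather than real syntax:

print(flesch_from_list(["university", "analysis", "frequent", "vocabulary"]))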
Example #40
    def analyze_one(self, email):
        """ Analyzes a single email and stores results. """

        sents = tstat.sentence_count(email)
        self.sent_count.append(sents if sents > 0 else 1)

        if email and len(email) > 0:
            self.flesch_kincaid_grade.append(tstat.flesch_kincaid_grade(email))
            self.automated_readability_index.append(
                tstat.automated_readability_index(email))
            self.coleman_liau_index.append(tstat.coleman_liau_index(email))
            self.linsear_write_formula.append(
                tstat.linsear_write_formula(email))
            self.dale_chall_readability_score.append(
                tstat.dale_chall_readability_score(email))
Example #41
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()

        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
def calculate2FormulaFromFile(inputFile, isTEI=1):
    inputData = extractText.extractTextTEI(inputFile, isTEI)
    inputData = re.sub('_', ' ', inputData)
    # r1 = textstat.flesch_kincaid_grade(inputData)
    # r2 = textstat.dale_chall_readability_score(inputData)
    # import pdb; pdb.set_trace()
    try:
        r1 = textstat.flesch_kincaid_grade(inputData)
    except Exception:
        print('ERROR: cannot calculate flesch_kincaid_grade for ', inputFile)
        r1 = -1
    try:
        r2 = textstat.dale_chall_readability_score(inputData)
    except Exception:
        print('ERROR: cannot calculate dale_chall_readability_score for ', inputFile)
        r2 = -1
    print('processing file', inputFile, 'complete')
    return (inputFile, r1, r2)
Example #43
def analyze1(text):
    
    # Automatically reject if no input
    if text.isspace():
        return -1.0
    if text.startswith('http'):
        return -1.0
    
    # Analyze text
    try:
        x = textstat.flesch_kincaid_grade(text)
    except Exception:
        return -1.0
    
    # Keep outputs valid
    if not isinstance(x, float):
        return -1.0
    if x < 0:
        return -1.0
    
    return x
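A quick sketch of analyze1's guard rails; the inputs are illustrative:

for t in ["   ", "http://example.com", "A plain declarative sentence."]:
    print(analyze1(t))  # -1.0, -1.0, then (typically) a non-negative grade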
Example #44
def calculate_readability_measures(id):
    """ Count the words in doc and update the document. """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # count = len(source['content'].split())
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            'consensus': textstat.readability_consensus(source['content']),
        }

        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        pass
Example #45
def fic2text(ident):
   textsegs = Loader.get_field(data['fics'],ident,'fic') 
   rtags = Loader.get_field(data['base'],ident,'tags')
   rtext = ""

   for line in textsegs:
      line = line.replace(u'\xa0', ' ')
      line = re.sub(r'([.,!?()])', r' \1 ', line)   # pad punctuation with spaces
      line = re.sub(r'\s{2,}', ' ', line)           # collapse repeated whitespace
      line = line.encode('ascii', 'ignore').decode('ascii')
      rtext += line + " "

   tags = []
   for genre in rtags:
      for el in rtags[genre]:
         tname = el["name"]
         tags.append(tname)

   reading_ease =  textstat.flesch_reading_ease(rtext)
   reading_level = textstat.flesch_kincaid_grade(rtext)
   print(ident,reading_ease,reading_level)
   #tokens = nltk.word_tokenize(rtext)
   return tags,rtext
Example #46
import json

from nltk.corpus import stopwords
from textstat.textstat import textstat

filename = 'usertimeline.json'
READ = 'rb'
TEXT = 1

tweets = json.load(open(filename, READ))
#Identify retweets
retweets = [word for tweet in tweets for word in tweet['text'][TEXT] if 'RT' in word]

print(retweets)
#identify replies

#Word count
print([tweet['analysis']['word-count'] for tweet in tweets])

#How would you do a character count?

#Lexical diversity
lex_div = lambda text: len(text.split()) / float(len(set(text.split())))
print([lex_div(tweet['text'][TEXT]) for tweet in tweets])

#F-K
print([textstat.flesch_kincaid_grade(tweet['text'][TEXT]) for tweet in tweets])

#remove stopwords
print([[word for word in tweet['text'][TEXT].split() if word not in stopwords.words('english')] for tweet in tweets])
#What's another way to filter out stopwords?
#How to handle punctuation?
Example #47
def engineer_NLP_features(doc):

    """
    Generate NLP features (related to language and sentiment)
    for mashable articles to be used in predicting no. of
    shares

    Arguments:
    doc: mongoDB document containing article content data

    Output:
    Stores NLP features results in MongoDB for Document
    """

    # get article headline and article content from Mongo DB document

    headline = doc['title']
    content = doc['content'].encode('utf-8')

    # generate headline features

    # number of words in title
    n_tokens_title = len(headline.split())

    # subjectivity
    title_subjectivity = TextBlob(headline).subjectivity

    # polarity
    title_sentiment_polarity = TextBlob(headline).polarity

    # absolute value polarity
    title_sentiment_abs_polarity = abs(title_sentiment_polarity)

    # average word length
    average_token_length_title = np.mean([len(w) for w 
                                          in "".join(c for c in headline 
                                                     if c not in string.punctuation).split()])

    #generate content features

    # number of words
    n_tokens_content = len([w for w in content.split()])

    # rate of unique words
    r_unique_tokens = len(set([w.lower().decode('utf-8')
                               for w 
                               in "".join(c for c in content 
                                          if c not in string.punctuation).split()]))/n_tokens_content

    # rate of non-stop word
    r_non_stop_words = len([w.lower().decode('utf-8') 
                            for w in "".join(c for c in content 
                                             if c not in string.punctuation).split() 
                            if w.decode('utf-8') 
                            not in stop])/n_tokens_content

    # rate of unique non-stop word
    r_non_stop_unique_tokens = len(set([w.lower().decode('utf-8') 
                               for w in "".join(c for c in content 
                                                if c not in string.punctuation).split() 
                               if w.decode('utf-8')
                               not in stop]))/n_tokens_content

    # average word length
    average_token_length_content = np.mean([len(w) for w 
                                            in "".join(c for c in content
                                                       if c not in string.punctuation).split()])

    # subjectivity
    global_subjectivity = TextBlob(content.decode('utf-8')).subjectivity

    # polarity
    global_sentiment_polarity = TextBlob(content.decode('utf-8')).polarity

    # absolute polarity
    global_sentiment_abs_polarity = abs(global_sentiment_polarity)

    # get polarity by word
    polarity_list = [(w.decode('utf-8'), TextBlob(w.decode('utf-8')).polarity) 
                     for w in "".join(c for c in content 
                                      if c not in string.punctuation).split()]

    # global positive word rate
    global_rate_positive_words = len([(w,p) 
                                      for (w,p) 
                                      in polarity_list 
                                      if p > 0])/len(polarity_list)

    # global negative word rate
    global_rate_negative_words = len([(w,p) 
                                      for (w,p) 
                                      in polarity_list 
                                      if p < 0])/len(polarity_list)

    # positive word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        rate_positive_words = len([(w,p) 
                                   for (w,p) 
                                   in polarity_list 
                                   if p > 0])/len([(w,p) 
                                                   for (w,p) 
                                                   in polarity_list 
                                                   if p != 0])
    else:
        rate_positive_words = 0

    # negative word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        rate_negative_words = len([(w,p) 
                                   for (w,p) 
                                   in polarity_list 
                                   if p < 0])/len([(w,p) 
                                                   for (w,p) 
                                                   in polarity_list 
                                                   if p != 0])

    else:
        rate_negative_words = 0

    # average polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        avg_positive_polarity = np.mean([p for (w,p) 
                                         in polarity_list 
                                         if p > 0])
    else:
        avg_positive_polarity = 0

    # minimum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        min_positive_polarity = min([p for (w,p) 
                                     in polarity_list 
                                     if p > 0])
    else:
        min_positive_polarity = 0

    # maximum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        max_positive_polarity = max([p for (w,p) 
                                     in polarity_list 
                                     if p > 0])
    else: 
        max_positive_polarity = 0

    # average polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        avg_negative_polarity = np.mean([p for (w,p) 
                                         in polarity_list 
                                         if p < 0])
    else:
        avg_negative_polarity = 0

    # minimum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        min_negative_polarity = min([p for (w,p) 
                                     in polarity_list 
                                     if p < 0])
    else:
        min_negative_polarity = 0

    # maximum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        max_negative_polarity = max([p for (w,p) 
                                 in polarity_list 
                                 if p < 0])
    else:
        max_negative_polarity = 0

    # abs maximum polarity, sum of abs of max positive and abs of min negative polarity
    max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)

    # Flesch Reading Ease
    global_reading_ease = textstat.flesch_reading_ease(content.decode('utf-8'))

    # Flesch Kincaid Grade Level
    global_grade_level = textstat.flesch_kincaid_grade(content.decode('utf-8'))

    collection.update_one({"_id": doc["_id"]}, 
                          {"$set": {"n_tokens_title": n_tokens_title, 
                                    "title_subjectivity": title_subjectivity,
                                    "title_sentiment_polarity": title_sentiment_polarity,
                                    "title_sentiment_abs_polarity": title_sentiment_abs_polarity,
                                    "average_token_length_title": average_token_length_title,
                                    "n_tokens_content": n_tokens_content,
                                    "r_unique_tokens": r_unique_tokens,
                                    "r_non_stop_words": r_non_stop_words,
                                    "r_non_stop_unique_tokens": r_non_stop_unique_tokens,
                                    "average_token_length_content": average_token_length_content,
                                    "global_subjectivity": global_subjectivity,
                                    "global_sentiment_polarity": global_sentiment_polarity,
                                    "global_sentiment_abs_polarity": global_sentiment_abs_polarity,
                                    "global_rate_positive_words": global_rate_positive_words,
                                    "global_rate_negative_words": global_rate_negative_words,
                                    "rate_positive_words": rate_positive_words,
                                    "rate_negative_words": rate_negative_words,
                                    "avg_positive_polarity": avg_positive_polarity,
                                    "min_positive_polarity": min_positive_polarity,
                                    "max_positive_polarity": max_positive_polarity,
                                    "avg_negative_polarity": avg_negative_polarity,
                                    "min_negative_polarity": min_negative_polarity,
                                    "max_negative_polarity": max_negative_polarity,
                                    "max_abs_polarity": max_abs_polarity,
                                    "global_reading_ease": global_reading_ease,
                                    "global_grade_level": global_grade_level}})
                    target = open(writename, 'w')
                    target.truncate()
                    target.write(lyrics)
                    target.close()

                    # Build Dataset
                    try:
                        cur = {
                            "title": title,
                            "artist": artist,
                            "year": year,
                            "pos": pos,
                            "lyrics": lyrics,
                            "tags": get_tags(artist),
                            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
                            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
                            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
                            "fog_index": ts.gunning_fog(lyrics_repl),
                            "difficult_words": ts.difficult_words(lyrics_repl),
                            "num_syllables": ts.syllable_count(lyrics_repl),
                            "num_words": ts.lexicon_count(lyrics_repl, True),
                            "num_lines": ts.sentence_count(lyrics_repl),
                            "num_dupes": count_dupes(lyrics)
                        }
                        # print cur
                        dataset.append(cur)
                    except Exception as e:
                        print(e)

            except Exception as e:
                print("Exception occurred for " + artist + ' - ' + title)
Example #49
tweets = json.load(open(filename, READ))
#Identify retweets
retweets = [word for tweet in tweets for word in tweet['text'][TEXT] if 'RT' in word]

print(retweets)
#identify replies

#Word count
print([tweet['analysis']['word-count'] for tweet in tweets])

#How would you do a character count?

#Lexical diversity
lex_div = lambda text: len(text.split()) / float(len(set(text.split())))
print([lex_div(tweet['text'][TEXT]) for tweet in tweets])

#F-K

FK = []
for tweet in tweets:
	print(tweet['text'])
	try:
		FK.append(textstat.flesch_kincaid_grade(tweet['text']))
	except Exception:
		FK.append(None)

print('FK:', FK)
#remove stopwords
print('Removed stopwords:', [[word for word in tweet['text'].split() if word not in stopwords] for tweet in tweets])
#What's another way to filter out stopwords?
#How to handle punctuation?
Example #50
def extract_features_sub(text, dialogue = True):
	## aggregate all dialogue, action
	#scenes = format_script(file_name)
	if len(text) > 0:
		try:
			language_complexity = {'flesch_reading_ease': textstat.flesch_reading_ease(text), 'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text), 'automated_readability_index': textstat.automated_readability_index(text)}
		except Exception:
			language_complexity = {'flesch_reading_ease': None, 'flesch_kincaid_grade': None, 'automated_readability_index': None}
	else:
		#badD.write(movie_name + "\n")
		language_complexity = {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0, 'automated_readability_index': 0}
	lexical_diversity = find_lex_d(text)
	sentiment = extract_senti_wordnet(text)
	#print sentiment
	inquirer_features = general_inquirer_features(text)
	final_features = {}
	final_features.update(language_complexity)
	final_features.update(lexical_diversity)
	final_features.update(sentiment)
	final_features.update(inquirer_features)
	curr_keys = [feature for feature in final_features]
	if dialogue:
		new_keys = [feature + "_" + "dialogue" for feature in final_features]
	else:
		new_keys = [feature + "_" + "action" for feature in final_features]
	#print final_features
	"""
	if dialogue: 
		for feature in final_features:
			final_features[feature + "_dialogue"] = final_features.pop(feature)
	else:
		for feature in final_features:
			final_features[feature + "_action"] = final_features.pop(feature)		
	#final_features = language_complexity + lexical_diversity + sentiment + inquirer_features
	"""
	return convert(final_features, dict(zip(curr_keys, new_keys)))
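# A minimal usage sketch (assumes find_lex_d, extract_senti_wordnet,
# general_inquirer_features, and convert are defined elsewhere in this project):
# features = extract_features_sub("Some dialogue text goes here.", dialogue=True)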
Example #51
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from textstat.textstat import textstat as ts

# `reviews` (a pandas DataFrame) and `compute_score` are assumed to be defined earlier.
print(len(reviews))

reviews['scores'] = reviews['helpful'].apply(compute_score)
print(reviews['scores'].head(n=10))
y = reviews['scores']
Text = reviews['reviewText']
del reviews

X = np.zeros((len(Text), 4))

for idx, review in enumerate(Text):
    if review == '':
        continue
    try:
        X[idx][0] = ts.flesch_reading_ease(review)
        X[idx][1] = ts.flesch_kincaid_grade(review)
        X[idx][2] = ts.gunning_fog(review)
        X[idx][3] = ts.smog_index(review)
    except Exception as e:
        print(review)
        print(e)

X = StandardScaler().fit_transform(X)
print('Computed X')
print(X[0])

model = SVR(verbose=True)
params = {'C': [0.1, 0.5]}
# scikit-learn now spells this scorer 'neg_mean_squared_error' (larger is better)
grid = GridSearchCV(model, params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
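# The selected hyper-parameters can be inspected too:
print(grid.best_params_)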
Example #52
	# (earlier in this function the flesch_kincaid_*, gunning_fog_*, smog_index_*,
	#  and ar_index_* accumulators are initialized in the same pattern as below)
	# Coleman-Liau index: goo.gl/8sE0m1
	cl_index_grades = []
	cl_index_total_grade = 0
	# Linsear Write Formula: goo.gl/GuOZ8B
	lwf_grades = []
	lwf_total_grade = 0
	# Dale-Chall Readability Score: goo.gl/dvmXmx
	dcr_grades = []
	dcr_total_grade = 0

	num_tweets = 0
	for tweet in cleanest_tweets:
		# skip tweets that are not just context-based text
		if textstat.sentence_count(tweet) < 1:
			continue
		flesch_kincaid_grade = textstat.flesch_kincaid_grade(tweet)
		flesch_kincaid_grades.append(flesch_kincaid_grade)
		flesch_kincaid_total_grade += flesch_kincaid_grade

		gunning_fog_grade = textstat.gunning_fog(tweet)
		gunning_fog_grades.append(gunning_fog_grade)
		gunning_fog_total_grade += gunning_fog_grade

		smog_index_grade = textstat.smog_index(tweet)
		smog_index_grades.append(smog_index_grade)
		smog_index_total_grade += smog_index_grade

		ar_index_grade = textstat.automated_readability_index(tweet)
		ar_index_grades.append(ar_index_grade)
		ar_index_total_grade += ar_index_grade

		# count only tweets that were actually scored
		num_tweets += 1
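	# A follow-up sketch (an assumption, not from the original snippet):
	# flesch_kincaid_avg = flesch_kincaid_total_grade / num_tweets if num_tweets else 0.0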
Example #53
def reading_level(raw_text):
    print("Flesch Reading Ease: ", textstat.flesch_reading_ease(raw_text))
    print("Flesch-Kincaid Grade Level: ", textstat.flesch_kincaid_grade(raw_text))
    print("Average Sentence Length: ", textstat.avg_sentence_length(raw_text))
    print("Average Word Length: ", textstat.avg_letter_per_word(raw_text))
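# A quick smoke test for the helper above (assumes textstat is imported as in the other examples):
reading_level("The quick brown fox jumps over the lazy dog.")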
Example #54
import csv
import matplotlib.pyplot as plt
from matplotlib import rcParams
from textstat.textstat import textstat

rcParams['text.usetex'] = True

data = list(csv.DictReader(open('comments.csv', 'r')))
names = set([entry['Name'] for entry in data])
TAB = '\t'
# text mode, not binary: Python 3 csv and string I/O want 'w'/'r'
WRITE = 'w'
READ = 'r'
filename = 'FK-calculated'

with open(filename,WRITE) as out:
	for name in names: 
		#This measurement is confounded by lengths of the text
		text = ' '.join([entry['Student Comment'] for entry in data if entry['Name'] == name and entry['Student Comment'] != 'None'])
		try:
			grade_level = textstat.flesch_kincaid_grade(text)
		except Exception:
			grade_level = -1
		try:
			lex_div = len(text.split()) / float(len(set(text.split())))
		except ZeroDivisionError:
			lex_div = -1
		print('%s \t %.02f \t %.02f' % (name, grade_level, lex_div), file=out)


names, grade_levels, lex_div= zip(*[line.split('\t') for line in open(filename,READ).read().splitlines()])
grade_levels = list(map(float, grade_levels))
lex_div = list(map(float, lex_div))

fig,(ax,ax2) = plt.subplots(nrows=1,ncols=2,sharey=True)
ax.hist(grade_levels, bins=10,color='k')
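# A hedged completion: the second axis was presumably meant for lexical diversity.
ax2.hist(lex_div, bins=10, color='k')
plt.show()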
Example #55

import string
import numpy as np
from textblob import TextBlob
from textstat.textstat import textstat

# `stop` (a stop-word collection, e.g. NLTK's English stopwords) is assumed
# to be defined at module level in the original project.
def create_NLP_features(data, headline, content):

    """
    Add NLP features to a DataFrame or dictionary of data to be input
    to mashable models for prediction.

    Arguments:
    data: DataFrame or dictionary
    headline: string containing the article headline
    content: string containing the article content
    """

    # number of words in title
    data['n_tokens_title'] = len(headline.split())

    # subjectivity
    data['title_subjectivity'] = TextBlob(headline).subjectivity

    # polarity
    data['title_sentiment_polarity'] = round(TextBlob(headline).polarity,2)

    # absolute value of polarity
    data['title_sentiment_abs_polarity'] = abs(data['title_sentiment_polarity'])

    # average word length
    data['average_token_length_title'] = np.mean([len(w) for w 
                                          in "".join(c for c in headline 
                                                     if c not in string.punctuation).split()])

    # generate content features
    # (the original Python 2 .decode('utf-8') calls are unnecessary on Python 3 strings)

    # number of words
    data['n_tokens_content'] = len(content.split())

    # content tokens with punctuation stripped (reused by several features below)
    stripped = "".join(c for c in content if c not in string.punctuation).split()

    # rate of unique words
    data['r_unique_tokens'] = round(len(set(w.lower() for w in stripped))
                                    / data['n_tokens_content'], 2)

    # rate of non-stop words
    data['r_non_stop_words'] = (len([w for w in stripped if w.lower() not in stop])
                                / data['n_tokens_content'])

    # rate of unique non-stop words
    data['r_non_stop_unique_tokens'] = (len(set(w.lower() for w in stripped
                                                if w.lower() not in stop))
                                        / data['n_tokens_content'])

    # average word length
    data['average_token_length_content'] = np.mean([len(w) for w in stripped])

    # subjectivity
    data['global_subjectivity'] = TextBlob(content).subjectivity

    # polarity
    data['global_sentiment_polarity'] = round(TextBlob(content).polarity, 2)

    # absolute polarity
    data['global_sentiment_abs_polarity'] = abs(data['global_sentiment_polarity'])

    # polarity of each punctuation-stripped word
    polarity_list = [(w, TextBlob(w).polarity) for w in stripped]

    # global positive word rate
    data['global_rate_positive_words'] = len([(w,p) 
                                      for (w,p) 
                                      in polarity_list 
                                      if p > 0])/len(polarity_list)

    # global negative word rate
    data['global_rate_negative_words'] = len([(w,p) 
                                      for (w,p) 
                                      in polarity_list 
                                      if p < 0])/len(polarity_list)

    # positive word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        data['rate_positive_words'] = len([(w,p) 
                                   for (w,p) 
                                   in polarity_list 
                                   if p > 0])/len([(w,p) 
                                                   for (w,p) 
                                                   in polarity_list 
                                                   if p != 0])
    else:
        data['rate_positive_words'] = 0

    # negative word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        data['rate_negative_words'] = len([(w,p) 
                                   for (w,p) 
                                   in polarity_list 
                                   if p < 0])/len([(w,p) 
                                                   for (w,p) 
                                                   in polarity_list 
                                                   if p != 0])

    else:
        data['rate_negative_words'] = 0 

    # average polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        data['avg_positive_polarity'] = np.mean([p for (w,p) 
                                         in polarity_list 
                                         if p > 0])
    else:
        data['avg_positive_polarity'] = 0

    # minimum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        data['min_positive_polarity'] = min([p for (w,p) 
                                     in polarity_list 
                                     if p > 0])
    else:
        data['min_positive_polarity'] = 0

    # maximum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        data['max_positive_polarity'] = max([p for (w,p) 
                                     in polarity_list 
                                     if p > 0])
    else: 
        data['max_positive_polarity'] = 0

    # average polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['avg_negative_polarity'] = np.mean([p for (w,p) 
                                         in polarity_list 
                                         if p < 0])
    else:
        data['avg_negative_polarity'] = 0

    # minimum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['min_negative_polarity'] = min([p for (w,p) 
                                     in polarity_list 
                                     if p < 0])
    else:
        data['min_negative_polarity'] = 0

    # maximum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['max_negative_polarity'] = max([p for (w,p) 
                                 in polarity_list 
                                 if p < 0])
    else:
        data['max_negative_polarity'] = 0

    # abs maximum polarity, sum of abs of max positive and abs of min negative polarity
    data['max_abs_polarity'] = data['max_positive_polarity'] + abs(data['min_negative_polarity'])

    # Flesch Reading Ease
    data['global_reading_ease'] = textstat.flesch_reading_ease(content)

    # Flesch-Kincaid Grade Level
    data['global_grade_level'] = textstat.flesch_kincaid_grade(content)
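# A minimal usage sketch with hypothetical inputs (not from the original project):
# features = {}
# create_NLP_features(features, "A Sample Headline", "Some sample article content.")
# print(features['n_tokens_title'], features['global_grade_level'])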
Example #56
import csv
from textstat.textstat import textstat

# `inFile` and `outfile` (input/output .csv paths) are assumed to be defined elsewhere.

#location of full .csv
with open(inFile, "r") as source:
    rdr = csv.reader(source, delimiter=',')

    #change path below to update output file location
    with open(outfile, "w", newline='') as result:
        wtr = csv.writer(result, delimiter=',')
        i = 0
        for r in rdr:
            #get the description from each field
            description = r[2]

            #try/catch to calculate grade level
            try:
                gradeLevel = textstat.flesch_kincaid_grade(description)
            except Exception:
                gradeLevel = 'Unable to calculate!'

            #append grade level calculation to list
            r.append(gradeLevel)
            wtr.writerow(r)
            i += 1

            #print obs to track execution in console
            print(i)

Example #57
#!/usr/bin/env python

import sys
from textstat.textstat import textstat


script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	test_data="".join(line.rstrip() for line in myfile)

# compute each textstat metric once and emit them as a single CSV row
# (in newer textstat releases, readability_consensus() is named text_standard())
metrics = [
	textstat.flesch_reading_ease(test_data),
	textstat.smog_index(test_data),
	textstat.flesch_kincaid_grade(test_data),
	textstat.coleman_liau_index(test_data),
	textstat.automated_readability_index(test_data),
	textstat.dale_chall_readability_score(test_data),
	textstat.difficult_words(test_data),
	textstat.linsear_write_formula(test_data),
	textstat.gunning_fog(test_data),
	textstat.readability_consensus(test_data),
	textstat.syllable_count(test_data),
	textstat.lexicon_count(test_data, 1),
	textstat.sentence_count(test_data),
]

print(','.join(str(m) for m in metrics))
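# Usage sketch (hypothetical file names): python this_script.py some_text_file.txt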