Ejemplo n.º 1
0
 def do_text_stats(self, text):
     """Gather basic counts and readability scores for ``text``.

     The metrics are evaluated in the order they appear in the returned
     dictionary. Flesch Reading Ease, Flesch-Kincaid Grade, the
     Coleman-Liau Index and the consensus ``text_standard`` fall back to
     ``None`` when the textstat call raises ``TypeError``; every other
     metric propagates whatever its call raises.

     Flesch Reading Ease scale:
         90-100 : Very Easy
         80-89  : Easy
         70-79  : Fairly Easy
         60-69  : Standard
         50-59  : Fairly Difficult
         30-49  : Difficult
         0-29   : Very Confusing

     :param text: the text to analyse
     :return: dict mapping each metric name to its value (or ``None``)
     """
     def _score_or_none(metric):
         # A TypeError from the metric is reported as None instead of
         # being propagated to the caller.
         try:
             return metric(text)
         except TypeError:
             return None

     return {
         "syllable_count": textstat.syllable_count(text),
         # True is passed as lexicon_count's second positional argument.
         "lexicon_count": textstat.lexicon_count(text, True),
         "sentence_count": textstat.sentence_count(text),
         "flesch_reading_ease": _score_or_none(textstat.flesch_reading_ease),
         "flesch_kincaid_grade": _score_or_none(
             textstat.flesch_kincaid_grade),
         "gunning_fog": textstat.gunning_fog(text),
         "smog_index": textstat.smog_index(text),
         "automated_readability_index":
             textstat.automated_readability_index(text),
         "coleman_liau_index": _score_or_none(textstat.coleman_liau_index),
         "linsear_write_formula": textstat.linsear_write_formula(text),
         "dale_chall_readability_score":
             textstat.dale_chall_readability_score(text),
         # Readability consensus based upon all the above tests.
         "text_standard": _score_or_none(textstat.text_standard),
     }
Ejemplo n.º 2
0
def get_text_features(article_contents: str) -> dict:
    """
    Analyze an article's complexity with several readability scores and
    compute its typo and difficult-word ratios.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary with the keys
        "flesch_reading", "flesch_kincaid", "coleman_liau",
        "typos_to_words" and "percent_difficult_words"; both ratios are 0.0
        when the article contains no words
    """
    tool = language_check.LanguageTool('en-US')
    # Count the words once; it is the denominator of both ratios below.
    word_count = textstat.lexicon_count(article_contents)

    def per_word(amount):
        # An article without any words would otherwise raise
        # ZeroDivisionError, so report a ratio of 0.0 instead.
        return amount / word_count if word_count else 0.0

    language_analysis_dict = {
        "flesch_reading":
        textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid":
        textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau":
        textstat.coleman_liau_index(article_contents),
        # Number of issues reported by the language checker, per word.
        "typos_to_words":
        per_word(len(tool.check(article_contents))),
        "percent_difficult_words":
        per_word(textstat.difficult_words(article_contents)),
    }

    return language_analysis_dict
Ejemplo n.º 3
0
def get_readability(df2):
    """Return a copy of ``df2`` extended with readability columns.

    For every object-dtype column (numbered 0, 1, ... in column order) ten
    textstat metrics are applied to each cell and stored in new columns
    named ``<metric><number>``, e.g. ``smog_index0``. The input frame is
    not modified.
    """
    metric_names = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
    )
    augmented = df2.copy()
    # The object columns are captured before any metric column is added, so
    # the columns created below are never scored themselves.
    text_columns = augmented.select_dtypes(include=['object']).columns.values
    for position, column in enumerate(text_columns):
        for metric_name in metric_names:
            scorer = getattr(textstat, metric_name)
            augmented[metric_name + str(position)] = (
                augmented[column].apply(scorer))
    return augmented
Ejemplo n.º 4
0
def text_analytics(text):
    """Build a 27-element readability feature vector for ``text``.

    Layout of the returned list:
      * word count, sentence count, syllable count
      * Flesch reading ease, SMOG, Gunning fog, Dale-Chall, ARI,
        Coleman-Liau
      * for each of those six scores, in the same order, its product with
        the word count, the sentence count and the syllable count

    When textstat reports zero sentences the vector is all zeros. The
    previous implementation left the result unassigned in that case and
    failed with UnboundLocalError.
    """
    sent = textstat.sentence_count(text)  # sentence count
    if sent == 0:
        # 9 base features + 6 scores x 3 counts = 27 entries.
        return [0] * 27

    lexicon = textstat.lexicon_count(text)  # word count
    syll = textstat.syllable_count(text)  # syllable count
    scores = [
        textstat.flesch_reading_ease(text),
        textstat.smog_index(text),
        textstat.gunning_fog(text),
        textstat.dale_chall_readability_score(text),
        textstat.automated_readability_index(text),
        textstat.coleman_liau_index(text),
    ]
    counts = (lexicon, sent, syll)

    # Interaction terms: each score multiplied by each count, score-major.
    interactions = [count * score for score in scores for count in counts]
    return [lexicon, sent, syll] + scores + interactions
Ejemplo n.º 5
0
def readability(text):
    """Print a readability report for ``text`` to standard output.

    Each section is a header followed by the value of one textstat metric.
    Every ``print`` is written as a single-argument call so the function
    parses on Python 3 and prints the same text on Python 2.
    """
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")

    # NOTE(review): `adjectives` is not defined in this function; presumably
    # it is a module-level name -- confirm, otherwise this raises NameError.
    return len(adjectives)
Ejemplo n.º 6
0
def get_special_metrics(text):
    """Return text statistics, difficulty scores and sentiment for ``text``.

    The result is a dict with three sub-dicts: "statistics" (raw counts and
    averages from textstat), "difficulty" (readability scores from
    textstat) and "sentiments" (polarity and subjectivity from TextBlob).
    """
    blob = TextBlob(text)

    statistics = {
        "syllables": textstat.syllable_count(text),
        "words": textstat.lexicon_count(text),
        "characters": textstat.char_count(text),
        "polysyllables": textstat.polysyllabcount(text),
        "average letter per word": textstat.avg_letter_per_word(text),
        "average sentence length": textstat.avg_sentence_length(text),
        "average sentence per word": textstat.avg_sentence_per_word(text),
        "sentences": textstat.sentence_count(text),
    }

    # Automated readability index, Dale-Chall, difficult words and Linsear
    # Write are not part of the difficulty section.
    difficulty = {
        "flesch reading ease": textstat.flesch_reading_ease(text),
        "smog index": textstat.smog_index(text),
        "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
        "coleman liau index": textstat.coleman_liau_index(text),
        "gunning fog": textstat.gunning_fog(text),
    }

    sentiments = {
        "polarity": blob.sentiment.polarity,
        "subjectivity": blob.sentiment.subjectivity,
    }

    return {
        "statistics": statistics,
        "difficulty": difficulty,
        "sentiments": sentiments,
    }
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of TextFeature objects, one per readability test, in a
             fixed order; a test that raises one of its tolerated
             exceptions is reported with a placeholder string instead
    """
    group_by = 'Reading Level Analysis '
    undetermined = "Undetermined"
    # (label, textstat function name, tolerated exceptions, placeholder).
    # An empty exception tuple means any failure propagates to the caller.
    plan = (
        # Flesch Reading Ease: higher is better, scale 0 to 100.
        ('Flesch Reading Ease', 'flesch_reading_ease', (), None),
        ('Flesch-Kincaid Grade Level', 'flesch_kincaid_grade', (), None),
        # Several tests below sometimes raise IndexError for reasons that
        # are not understood; those are reported as undetermined.
        ('The Fog Scale (Gunning FOG formula)', 'gunning_fog',
         (IndexError,), undetermined),
        ('The SMOG Index', 'smog_index', (IndexError,), undetermined),
        ('Automated Readability Index', 'automated_readability_index',
         (), None),
        ('The Coleman-Liau Index', 'coleman_liau_index', (), None),
        ('Linsear Write Formula', 'linsear_write_formula',
         (IndexError,), undetermined),
        ('Dale Chall Readability Score', 'dale_chall_readability_score',
         (IndexError,), undetermined),
        ('Readability Consensus', 'readability_consensus',
         (TypeError, IndexError),
         "Undetermined; One of the tests above failed."),
    )

    results = []
    for label, metric_name, tolerated, placeholder in plan:
        try:
            score = getattr(textstat, metric_name)(no_code_text)
            feature = TextFeature(label, score, group_by)
        except tolerated:
            feature = TextFeature(label, placeholder, group_by)
        results.append(feature)
    return results
Ejemplo n.º 8
0
def get_special_metrics(text):
    """Summarise ``text`` as statistics, difficulty scores and sentiment.

    Returns a dict with the keys 'statistics' and 'difficulty' (both filled
    from textstat) and 'sentiments' (polarity and subjectivity taken from a
    TextBlob built on the same text).
    """
    blob = TextBlob(text)

    main = {}
    main['statistics'] = dict([
        ('syllables', textstat.syllable_count(text)),
        ('words', textstat.lexicon_count(text)),
        ('characters', textstat.char_count(text)),
        ('polysyllables', textstat.polysyllabcount(text)),
        ('average letter per word', textstat.avg_letter_per_word(text)),
        ('average sentence length', textstat.avg_sentence_length(text)),
        ('average sentence per word', textstat.avg_sentence_per_word(text)),
        ('sentences', textstat.sentence_count(text)),
    ])
    # The difficulty section leaves out the automated readability index,
    # Dale-Chall score, difficult-word count and Linsear Write formula.
    main['difficulty'] = dict([
        ('flesch reading ease', textstat.flesch_reading_ease(text)),
        ('smog index', textstat.smog_index(text)),
        ('flesch kincaid grade', textstat.flesch_kincaid_grade(text)),
        ('coleman liau index', textstat.coleman_liau_index(text)),
        ('gunning fog', textstat.gunning_fog(text)),
    ])
    main['sentiments'] = dict([
        ('polarity', blob.sentiment.polarity),
        ('subjectivity', blob.sentiment.subjectivity),
    ])

    return main
Ejemplo n.º 9
0
def coleman_liau_index(text):
    """
    Compute the Coleman-Liau index of a Text object.

    :type text: Text
    :param text: The object whose ``text`` attribute is analysed
    :rtype float
    :returns Coleman liau index as reported by textstat
    """
    raw_text = text.text
    return textstat.coleman_liau_index(raw_text)
Ejemplo n.º 10
0
def vecify(v):
    """Return a list of eight readability scores for the text ``v``.

    Order: Flesch reading ease, Flesch-Kincaid grade, Coleman-Liau index,
    automated readability index, Dale-Chall score, difficult-word count,
    Linsear Write formula, Gunning fog. The SMOG index is not included.
    """
    features = []
    features.append(ts.flesch_reading_ease(v))
    features.append(ts.flesch_kincaid_grade(v))
    features.append(ts.coleman_liau_index(v))
    features.append(ts.automated_readability_index(v))
    features.append(ts.dale_chall_readability_score(v))
    features.append(ts.difficult_words(v))
    features.append(ts.linsear_write_formula(v))
    features.append(ts.gunning_fog(v))
    return features
    def assign_grade_score_for_songs(self):
        """Store a Coleman-Liau grade level on every song that has lyrics.

        For each ``Song`` whose ``song_lyrics`` is non-empty, the index of
        its first lyrics entry is written to ``lyric_stats['grade_level']``
        and the song is saved. Songs without lyrics are left untouched.
        """
        for song in Song.objects.all():
            lyrics = song.song_lyrics
            if len(lyrics) <= 0:
                continue

            # Only the first lyrics entry is scored.
            grade = textstat.coleman_liau_index(lyrics[0])
            song.lyric_stats['grade_level'] = grade
            song.save()
Ejemplo n.º 12
0
def all_trad_scores(text):
    """Return eight traditional readability scores for ``text`` as a list.

    Order: Flesch reading ease, Flesch-Kincaid grade, SMOG index,
    Coleman-Liau index, automated readability index, Dale-Chall score,
    Linsear Write formula, Gunning fog.
    """
    return [
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.smog_index(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text),
        textstat.dale_chall_readability_score(text),
        textstat.linsear_write_formula(text),
        textstat.gunning_fog(text),
    ]
Ejemplo n.º 13
0
 def reading_difficulty(self):
     """Return ``(avg_grade, diff_words)`` for ``self.text``.

     ``diff_words`` is the difficult-word count divided by ``self.nword``.
     ``avg_grade`` is the mean of seven grade-level scores (with 6
     subtracted from the Gunning fog score), rounded up and then combined
     with 12 through ``max``.
     """
     body = self.text
     difficult_ratio = textstat.difficult_words(body) / self.nword

     kincaid = textstat.flesch_kincaid_grade(body)
     coleman = textstat.coleman_liau_index(body)
     readability_index = textstat.automated_readability_index(body)
     dale = textstat.dale_chall_readability_score(body)
     linsear_write = textstat.linsear_write_formula(body)
     # The fog score is shifted down by 6 before it enters the mean.
     fog = textstat.gunning_fog(body) - 6
     smog_grade = textstat.smog_index(body)

     grade_total = (kincaid + coleman + readability_index + dale +
                    linsear_write + fog + smog_grade)
     mean_grade = math.ceil(grade_total / 7)
     # NOTE(review): max() makes 12 a lower bound, so the result is never
     # below 12 -- confirm this was not meant to be a cap via min().
     return max(mean_grade, 12), difficult_ratio
def textstat_analysis(profile_text):
    """Return an 11-tuple of textstat results for ``profile_text``.

    Order: Flesch reading ease, SMOG index, Flesch-Kincaid grade,
    Coleman-Liau index, automated readability index, Dale-Chall score,
    difficult-word count, Linsear Write formula, Gunning fog, readability
    consensus, word count.
    """
    return (
        textstat.flesch_reading_ease(profile_text),
        textstat.smog_index(profile_text),
        textstat.flesch_kincaid_grade(profile_text),
        textstat.coleman_liau_index(profile_text),
        textstat.automated_readability_index(profile_text),
        textstat.dale_chall_readability_score(profile_text),
        textstat.difficult_words(profile_text),
        textstat.linsear_write_formula(profile_text),
        textstat.gunning_fog(profile_text),
        textstat.readability_consensus(profile_text),
        textstat.lexicon_count(profile_text),
    )
Ejemplo n.º 15
0
def get_readability(contents):
    """Return a list of ten textstat results for ``contents``.

    Order: Flesch reading ease, SMOG index, Flesch-Kincaid grade,
    automated readability index, Dale-Chall score, difficult-word count,
    Linsear Write formula, Gunning fog, Coleman-Liau index, text standard.
    """
    return [
        textstat.flesch_reading_ease(contents),
        textstat.smog_index(contents),
        textstat.flesch_kincaid_grade(contents),
        textstat.automated_readability_index(contents),
        textstat.dale_chall_readability_score(contents),
        textstat.difficult_words(contents),
        textstat.linsear_write_formula(contents),
        textstat.gunning_fog(contents),
        textstat.coleman_liau_index(contents),
        textstat.text_standard(contents),
    ]
Ejemplo n.º 16
0
    def analyze_one(self, email):
        """ Analyzes a single email and stores results.

        A sentence count (never less than 1) is always appended to
        ``self.sent_count``. The five grade-level scores are appended to
        their lists only when ``email`` is non-empty.

        An empty or ``None`` email is no longer passed to the sentence
        counter: it used to be counted before the emptiness check ran, so a
        ``None`` email failed before the check could protect it.
        """
        if not email:
            # Nothing to score; record the minimum sentence count only.
            self.sent_count.append(1)
            return

        sents = tstat.sentence_count(email)
        self.sent_count.append(sents if sents > 0 else 1)

        self.flesch_kincaid_grade.append(tstat.flesch_kincaid_grade(email))
        self.automated_readability_index.append(
            tstat.automated_readability_index(email))
        self.coleman_liau_index.append(tstat.coleman_liau_index(email))
        self.linsear_write_formula.append(
            tstat.linsear_write_formula(email))
        self.dale_chall_readability_score.append(
            tstat.dale_chall_readability_score(email))
Ejemplo n.º 17
0
def main():
    """Write a readability report next to every file named on the command line.

    For each path in ``sys.argv[1:]`` the file is read and a sibling file
    ``<path>.readability.snip`` is written with one ``name : value`` line
    per textstat metric.
    """
    metric_names = (
        'syllable_count',
        'lexicon_count',
        'sentence_count',
        'difficult_words',
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'smog_index',
        'automated_readability_index',
        'coleman_liau_index',
        'linsear_write_formula',
        'dale_chall_readability_score',
    )
    for path in sys.argv[1:]:
        with open(path) as source:
            text = source.read()

        with open(path + '.readability.snip', 'w') as report:
            # Each metric is computed and written in turn.
            for metric_name in metric_names:
                score = getattr(textstat, metric_name)(text)
                report.write("%s : %s\n" % (metric_name, score))
Ejemplo n.º 18
0
def run_textstat(text):
    """Return a 10-tuple of textstat results for ``text``.

    Order: Flesch reading ease, SMOG index, Flesch-Kincaid grade,
    Coleman-Liau index, automated readability index, Dale-Chall score,
    difficult-word count, Linsear Write formula, Gunning fog, text
    standard.
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.smog_index(text),
        textstat.flesch_kincaid_grade(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text),
        textstat.dale_chall_readability_score(text),
        textstat.difficult_words(text),
        textstat.linsear_write_formula(text),
        textstat.gunning_fog(text),
        textstat.text_standard(text),
    )
Ejemplo n.º 19
0
def predict_readability_level(text):
  """Predict a readability class for ``text`` with a pickled model.

  Builds a one-row feature frame from three sources -- the output of
  ``text_features``, eight readability scores, and the output of
  ``text_param`` minus two dropped columns -- then passes it to the model
  loaded from "Readability_model.pkl" and returns the first prediction as
  a string.

  NOTE(review): the columns are created in a fixed order; presumably this
  has to match the layout the model was trained on -- confirm before
  reordering anything here.
  """
  # Wrap the input in a one-row DataFrame; the value lands in column 0.
  data = pd.DataFrame([text])
  # Empty frame that collects the features.
  df = pd.DataFrame()

  # Expand whatever text_features returns into these named columns.
  df[['commas_number' , 'pronouns_number' , 'modal_verbs_number' , \
      'personal_pronouns_number' , 'wh_pronouns_number' , 'function_words_number' , \
      'VB_tags_number' , 'VBD_tags_number' , 'VBG_tags_number' , 'VBN_tags_number' , \
      'VBP_tags_number' , 'nouns_number' , 'proper_nouns_number' , 'conjunctions_number' , \
      'adjectives_number' , 'non_modal_verbs_number' , 'interjections_number' , \
      'adverbs_number' , 'determiners_number' ]] \
      = data[0].apply(lambda x:pd.Series(text_features(x)))

  # Compute every readability formula for each row.
  # NOTE(review): flesch_reading_ease, gunning_fog, smog_index and
  # dale_chall_readability_score are called as bare names, unlike the
  # textstat.* calls next to them -- presumably wrappers defined elsewhere;
  # confirm.
  df["Flesch_Reading_Ease_score"] = data[0].apply(lambda x:flesch_reading_ease(x))
  df["Flesch_Kincaid_Grade_Level"] = data[0].apply(lambda x:textstat.flesch_kincaid_grade(x))
  df["Fog_Scale"] = data[0].apply(lambda x:gunning_fog(x))
  df["SMOG_Index"] = data[0].apply(lambda x:smog_index(x))
  df["Automated_Readability_Index"] = data[0].apply(lambda x:textstat.automated_readability_index(x))
  df["Coleman_Liau_Index"] = data[0].apply(lambda x:textstat.coleman_liau_index(x))
  df["Linsear_Write_Formula"] = data[0].apply(lambda x:textstat.linsear_write_formula(x))
  df["Dale_Chall_Readability_Score"] = data[0].apply(lambda x:dale_chall_readability_score(x))

  # Expand whatever text_param returns into these named columns.
  df[['Word_count' , 'Sentence_count' , 'Average_Sentence_length' , \
      'Syllable_Count' , 'Average_syllables_per_words' , 'poly_syllable_count' , \
      'Lexical_Count' , 'average_poly_syllable' , 'long_word' , 'average_long_word' , \
      'average_word_length']]  = data[0].apply(lambda x:pd.Series(text_param(x)))

  # Drop the two features considered bad.
  df = df.drop('long_word',axis=1)
  df = df.drop('average_long_word',axis=1)

  # Load the readability model used for prediction.
  Readability_model=load_pkl("Readability_model.pkl")

  # Let the model predict the class from the feature row.
  result = Readability_model.predict(df)

  # Only one row was built, so the first prediction is the answer.
  return str(result[0])
def coleman_liau_index(text):
    """Map the Coleman-Liau index of ``text`` onto a level from 0 to 7.

    Scores that are not greater than 0 give level 0. Positive scores are
    bucketed by their exclusive upper bound: below 6 -> 1, below 8 -> 2,
    below 10 -> 3, below 11 -> 4, below 12 -> 5, below 13 -> 6, and 13 or
    above -> 7.
    """
    score = textstat.coleman_liau_index(text)
    if not score > 0:
        return 0

    # (exclusive upper bound, level), in ascending order.
    buckets = ((6, 1), (8, 2), (10, 3), (11, 4), (12, 5), (13, 6))
    for upper_bound, bucket_level in buckets:
        if score < upper_bound:
            return bucket_level
    return 7
Ejemplo n.º 21
0
def lambda_handler(event, context):
    """Compute ten textstat metrics for ``event['text']`` and respond.

    The response maps each metric name to its value and is handed to
    ``respond`` together with ``None`` as the first argument. ``context``
    is not used.
    """
    metric_names = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
    )
    text = event['text']

    # Each key is also the name of the textstat function that fills it.
    response = {}
    for metric_name in metric_names:
        response[metric_name] = getattr(textstat, metric_name)(text)

    return respond(None, response)
Ejemplo n.º 22
0
    def get_feat_readability_metrics(self):
        """Return ``(features, failed)`` for the scraped page body.

        On success ``features`` is a list of nine textstat results and
        ``failed`` is ``False``. If fetching the body or any metric raises,
        the error is logged and ``(MISSING_FEATURE * 9, True)`` is returned.
        The text standard is not part of the feature list.
        """
        # https://github.com/shivam5992/textstat
        try:
            body = self.webscrap.get_body()
            scores = [
                textstat.flesch_reading_ease(body),
                textstat.smog_index(body),
                textstat.flesch_kincaid_grade(body),
                textstat.coleman_liau_index(body),
                textstat.automated_readability_index(body),
                textstat.dale_chall_readability_score(body),
                textstat.difficult_words(body),
                textstat.linsear_write_formula(body),
                textstat.gunning_fog(body),
            ]
        except Exception as e:
            config.logger.error(repr(e))
            return MISSING_FEATURE * 9, True

        return scores, False
Ejemplo n.º 23
0
def feature_readability(essay):
    """Return a 10-tuple of readability features for ``essay``.

    Order: syllable count, Flesch reading ease, SMOG index, Flesch-Kincaid
    grade, Coleman-Liau index, automated readability index, Dale-Chall
    score, difficult-word count, Linsear Write formula, Gunning fog.
    """
    return (
        # Number of syllables.
        textstat.syllable_count(essay),
        # Readability of the document as a score between 0 and 100.
        textstat.flesch_reading_ease(essay),
        # SMOG index: reflects how readable the document is; more accurate
        # and easier to compute.
        textstat.smog_index(essay),
        # Grade score / grade level.
        textstat.flesch_kincaid_grade(essay),
        # Grade level of the text.
        textstat.coleman_liau_index(essay),
        # Automated readability index: approximates the grade needed to
        # understand the text.
        textstat.automated_readability_index(essay),
        # Grade level, based on the most common English words.
        textstat.dale_chall_readability_score(essay),
        textstat.difficult_words(essay),
        # Grade level of the text.
        textstat.linsear_write_formula(essay),
        # Fog index: reflects how difficult the text is to read.
        textstat.gunning_fog(essay),
    )
Ejemplo n.º 24
0
def analyseText():
    """Return textstat metrics for the ``inputText`` field of a JSON request.

    Responds with ``('Missing values', 400)`` when the request body is not
    a JSON object or lacks ``inputText``; otherwise with the JSON-encoded
    metrics and status 200.
    """
    values = request.get_json()
    required = ['inputText']
    # get_json() may yield None or a non-object JSON value (list, string,
    # number); reject those here instead of failing on the lookups below.
    if not isinstance(values, dict) or not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
    }

    return jsonify(result), 200
Ejemplo n.º 25
0
def calculate_readability_measures(id):
    """ Compute readability measures for a page and store them on the document.

    Fetches the source of the 'page' document ``id`` from the 'beek' index,
    runs the textstat metrics on its 'content' field and writes them back
    under the 'measures' key with an immediate refresh.

    Scoring and updating stay best-effort: a failure there is logged with
    its traceback instead of being silently discarded, and the function
    still returns normally. Errors from fetching the source propagate.
    """
    # Imported locally so this function does not rely on a module-level
    # import being present.
    import logging

    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    try:
        content = source['content']
        measures = {
            'flesch':
            textstat.flesch_reading_ease(content),
            'smog':
            textstat.smog_index(content),
            'flesch_kincaid':
            textstat.flesch_kincaid_grade(content),
            'coleman_liau':
            textstat.coleman_liau_index(content),
            'readability':
            textstat.automated_readability_index(content),
            'dale_chall':
            textstat.dale_chall_readability_score(content),
            'difficult_words':
            textstat.difficult_words(content),
            'linsear_write_formula':
            textstat.linsear_write_formula(content),
            'gunning_fog':
            textstat.gunning_fog(content),
            'consensus':
            textstat.readability_consensus(content),
        }

        es.update(index='beek',
                  doc_type='page',
                  id=id,
                  body={'doc': {
                      'measures': measures
                  }},
                  refresh=True)
    except Exception:
        logging.getLogger(__name__).exception(
            "Could not update readability measures for page %r", id)
Ejemplo n.º 26
0
def calculate_readability_measures(id):
    """ Compute readability measures for a page and store them on the document.

    The source of the 'page' document ``id`` is read from the 'beek' index,
    its 'content' field is scored with textstat, and the scores are written
    back under 'measures' with an immediate refresh.

    Scoring and updating are best-effort: any exception raised there is
    logged together with its traceback rather than dropped without a trace.
    Errors from fetching the source propagate to the caller.
    """
    # Local import keeps this function independent of the module's imports.
    import logging

    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    try:
        content = source['content']
        measures = {
            'flesch': textstat.flesch_reading_ease(content),
            'smog': textstat.smog_index(content),
            'flesch_kincaid': textstat.flesch_kincaid_grade(content),
            'coleman_liau': textstat.coleman_liau_index(content),
            'readability': textstat.automated_readability_index(content),
            'dale_chall': textstat.dale_chall_readability_score(content),
            'difficult_words': textstat.difficult_words(content),
            'linsear_write_formula': textstat.linsear_write_formula(content),
            'gunning_fog': textstat.gunning_fog(content),
            'consensus': textstat.readability_consensus(content),
        }

        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception:
        logging.getLogger(__name__).exception(
            "Could not update readability measures for page %r", id)
Ejemplo n.º 27
0
 def stats(self, text):
     """Return a dict of textstat readability scores and counts for ``text``.

     The keys are the short labels used below; some contain spaces and
     others underscores.
     """
     return {
         'flesch_reading_ease': textstat.flesch_reading_ease(text),
         'smog': textstat.smog_index(text),
         'flesch kincaid': textstat.flesch_kincaid_grade(text),
         'coleman Liau': textstat.coleman_liau_index(text),
         'automated': textstat.automated_readability_index(text),
         'dale chall': textstat.dale_chall_readability_score(text),
         'difficult': textstat.difficult_words(text),
         'linsear': textstat.linsear_write_formula(text),
         'gunning_fog': textstat.gunning_fog(text),
         'standard': textstat.text_standard(text),
         'charcount': textstat.char_count(text),
         'lexicon count': textstat.lexicon_count(text),
         'syllable count': textstat.syllable_count(text),
         'sentence count': textstat.sentence_count(text),
         'avg sentence length': textstat.avg_sentence_length(text),
         'avg_syllables_per_word': textstat.avg_syllables_per_word(text),
         'avg_letter_per_word': textstat.avg_letter_per_word(text),
         'avg_sentence_per_word': textstat.avg_sentence_per_word(text),
     }
Ejemplo n.º 28
0
    def get_readability(self, corpus, type='ari'):
        """Return one readability score for ``corpus``, selected by ``type``.

        Supported ``type`` values: 'ari', 'flesch', 'smog',
        'flesch_kincaid', 'coleman', 'dale_chall', 'difficult_words',
        'linsear', 'gunning_fog' and 'readability_consensus'. The historic
        misspellings 'flesch_kinciad' and 'readability_conensus' are still
        accepted so existing callers keep working.

        :param corpus: the text to score
        :param type: name of the metric to compute (defaults to 'ari')
        :return: the metric value, or ``None`` for an unknown ``type``
        """
        # Selector -> name of the textstat function. The function is looked
        # up only after a selector matched, so a metric that the installed
        # textstat lacks affects only the calls that request it.
        metric_names = {
            'ari': 'automated_readability_index',
            'flesch': 'flesch_reading_ease',
            'smog': 'smog_index',
            'flesch_kincaid': 'flesch_kincaid_grade',
            'flesch_kinciad': 'flesch_kincaid_grade',  # legacy spelling
            'coleman': 'coleman_liau_index',
            'dale_chall': 'dale_chall_readability_score',
            'difficult_words': 'difficult_words',
            'linsear': 'linsear_write_formula',
            'gunning_fog': 'gunning_fog',
            'readability_consensus': 'readability_consensus',
            'readability_conensus': 'readability_consensus',  # legacy spelling
        }
        metric_name = metric_names.get(type)
        if metric_name is None:
            return None
        return getattr(textstat, metric_name)(corpus)
def coleman_liau(string):
    """Return the Coleman-Liau index that ``ts`` computes for ``string``."""
    return ts.coleman_liau_index(string)
Ejemplo n.º 30
0
negHeadCountFile.close()
posParseLengthFile.close()
negParseLengthFile.close()
posPOSCountFile.close()
negPOSCountFile.close()
posFile.close()
negFile.close()
print len(contents)

for index, content in enumerate(contents):
    temp = []
    words = simpleTokenize(content)
    twLen = float(len(words))
    sentiScore = afinn.score(stemContent(content))
    # posProb, negProb = utilities.classifySentiment(words, happy_log_probs, sad_log_probs)
    readScore = textstat.coleman_liau_index(content)

    temp.append(twLen)

    # temp.append(content.count('URRL'))
    if content.count('URRL') > 0:
        temp.append(1)
    else:
        temp.append(0)
    # temp.append(content.count('HHTTG'))
    if content.count('HHTTG') > 0:
        temp.append(1)
    else:
        temp.append(0)
    # temp.append(content.count('USSERNM'))
    if content.count('USSERNM') > 0:
Ejemplo n.º 31
0
			flesch_kincaid_grades.append(flesch_kincaid_grade)
			flesch_kincaid_total_grade += flesch_kincaid_grade

			gunning_fog_grade = textstat.gunning_fog(tweet)	
			gunning_fog_grades.append(gunning_fog_grade)
			gunning_fog_total_grade += gunning_fog_grade

			smog_index_grade = textstat.smog_index(tweet)	
			smog_index_grades.append(smog_index_grade)
			smog_index_total_grade += smog_index_grade

			ar_index_grade = textstat.automated_readability_index(tweet)	
			ar_index_grades.append(ar_index_grade)
			ar_index_total_grade += ar_index_grade
			
			cl_index_grade = textstat.coleman_liau_index(tweet)	
			cl_index_grades.append(cl_index_grade)
			cl_index_total_grade += cl_index_grade				

			lwf_grade = textstat.linsear_write_formula(tweet)	
			lwf_grades.append(lwf_grade)
			lwf_total_grade += lwf_grade

			dcr_grade = textstat.dale_chall_readability_score(tweet)	
			dcr_grades.append(dcr_grade)
			dcr_total_grade += dcr_grade

			num_tweets += 1


Ejemplo n.º 32
0
# Main script: interactive readability report for one piece of text.
if __name__ == '__main__':

    print("TextStat Comparison Script")
    print("--------------------------")

    # Read the text from the command line.
    # TODO: special characters still need to be handled/escaped.
    textToCheck = raw_input("Please enter the text you would like to analyse: ")

    # Reading the text from a file is still open (which format?).

    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")

    # (label, textstat method name) pairs, reported in this order. The
    # method is looked up and called only when its line is printed.
    report_lines = (
        ("Syllable Count: ", "syllable_count"),
        # lexicon_count defaults to removing punctuation before counting.
        ("Lexicon Count: ", "lexicon_count"),
        ("Sentence Count: ", "sentence_count"),
        ("Flesch Reading Ease formula: ", "flesch_reading_ease"),
        ("Flesch-Kincaid Grade Level: ", "flesch_kincaid_grade"),
        ("Fog Scale (Gunning FOG Formula): ", "gunning_fog"),
        ("SMOG Index: ", "smog_index"),
        ("Automated Readability Index: ", "automated_readability_index"),
        ("Coleman-Liau Index: ", "coleman_liau_index"),
        ("Linsear Write Formula: ", "linsear_write_formula"),
        ("Dale-Chall Readability Score: ", "dale_chall_readability_score"),
    )
    for label, method_name in report_lines:
        print(label + str(getattr(textstat, method_name)(textToCheck)))

    print("--------------------------------------------------------------")
    consensus = textstat.text_standard(textToCheck)
    print("Readability Consensus based upon all the above tests: " + str(consensus))
    print("\n\n")
Ejemplo n.º 33
0
 print(difficulty_label)
 print(
     "-------------------------Readability Formula------------------------------"
 )
 # print("The SMOG Index")
 # print("Texts of fewer than 30 sentences are statistically invalid, "
 #       "because the SMOG formula was normed on 30-sentence samples.")
 # print("textstat requires atleast 3 sentences for a result.")
 # print(textstat.smog_index(test_data))
 print("The Flesch-Kincaid Grade")
 # print(textstat.flesch_kincaid_grade(test_data))
 flesch_kincaid_grade = textstat.flesch_kincaid_grade(test_data)
 print(flesch_kincaid_grade)
 print("The Coleman-Liau Index")
 # print(textstat.coleman_liau_index(test_data))
 coleman_liau_index = textstat.coleman_liau_index(test_data)
 print(coleman_liau_index)
 print("Automated Readability Index (ARI)")
 # print(textstat.automated_readability_index(test_data))
 automated_readability_index = textstat.automated_readability_index(
     test_data)
 print(automated_readability_index)
 # print("Dale-Chall Readability Score")
 # print(textstat.dale_chall_readability_score(test_data))
 print("Linsear Write Formula")
 # print(textstat.linsear_write_formula(test_data))
 linsear_write_formula = textstat.linsear_write_formula(test_data)
 print(linsear_write_formula)
 print("The Fog Scale (Gunning FOG Formula)")
 # print(textstat.gunning_fog(test_data))
 gunning_fog = textstat.gunning_fog(test_data)
Ejemplo n.º 34
0
# Best-effort feature extraction for the abstract text AB: each feature is
# computed in its own try block and any failure only raises the
# warning_message flag. `except Exception` is used instead of a bare
# `except` so that KeyboardInterrupt / SystemExit still propagate.
# NOTE(review): when a computation fails, its feature variable is not
# assigned here -- confirm the code that follows checks warning_message
# before reading the feature.

# avg_word_len
try:
  avg_word_len = avg_word_length(AB)
except Exception:
  warning_message = 1

# flesch_score
try:
  flesch_score = textstat.flesch_reading_ease(AB)
except Exception:
  warning_message = 1

# coleman_liau_score
try:
  coleman_liau_score = textstat.coleman_liau_index(AB)
except Exception:
  warning_message = 1

# num_stopwords: count of English stopwords among the lower-cased,
# letters-only tokens of the abstract
try:
  clean_abstract = re.sub("[^a-zA-Z]", " ", AB)
  clean_abstract = clean_abstract.lower()
  split_words = clean_abstract.split()
  stops = set(stopwords.words("english"))
  num_stopwords = len([w for w in split_words if w in stops])
except Exception:
  warning_message = 1

# num_unique_nonstop_words
try:
Ejemplo n.º 35
0
    def __init__(self, path):
        """
        Create document instance for analysis.

        Opens and reads the document into the string raw_text.
        Textract interprets the document format and
        opens it as a plain text string (docx, pdf, odt, txt).

        When path is not an existing file, only user, path and abs_path
        are set; none of the analysis attributes below are created.

        Args:
            path (str): path to file to open, analyze, close


        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) path to document, as passed in.
        -abs_path: (str) the absolute path to the document.
        -time_stamp:  value returned by self.timestamp().
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  MimeTypes instance used to guess the document type.
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regular expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of the sentence tokens.
        -passive_sentences:  (list) all sentences identified as passive by
        the passive module.
        -passive_sentence_count:  (int) count of items in passive_sentences.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -percent_passive_round:  percent_passive rounded to 2 places.
        -be_verb_analysis:  value returned by count_be_verbs for the sentence
        tokens; index 0 is stored as be_verb_count and index 1 as
        weak_sentences_all.
        -be_verb_count:  (int) count of "to be" verbs in text
        (am, is, are, was, were, be, being, been).
        -weak_sentences_all:  index 1 of be_verb_analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -weak_verbs_to_sentences_round:  weak_verbs_to_sentences rounded to
        2 places.
        -word_tokens:  list of discrete words in text that breaks
        contractions up (default nltk tokenizer).
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  (list) words separated by the white-space
        tokenizer, stripped of punctuation, made lower case.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm made by textstat module.
        -readability_flesch_kincaid_grade:  (int)  Flesch-Kincaid grade level
        of reader made by textstat module.
        -readability_coleman_liau_index:  (int) grade level of reader as made by
        textstat module.
        -readability_ari:  (int) grade level of reader determined by
        automated readability index algorithm implemented by textstat.
        -readability_linser_write:  (attribute name is misspelled, FIX
        SPELLING) grade level as determined by Linsear Write algorithm
        implemented by textstat.
        -readability_dale_chall:  (int) grade level based on Dale-Chall
        readability as determined by textstat.
        -readability_standard:  composite grade level based on readability
        algorithms.
        -flesch_re_desc_str:  (str) reading ease score to description.
        -word_count:  word count of document based on white space tokenizer,
        this word count should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) tag, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this duplicates page_length).
        -freq_words:  (list or dict) frequency distribution of all words, not
        standardized (not ratio, just count).
        -modal_dist:  (list) frequency distribution of auxiliary verbs based
        on word_tokens_no_punct.
        """
        self.user = ""
        self.path = path
        self.abs_path = os.path.abspath(self.path)
        # Everything below is only computed when path points at a real file.
        if os.path.isfile(self.path):
            self.time_stamp = self.timestamp()
            self.file_name = os.path.basename(path)
            self.mime = MimeTypes()
            self.guessed_type = self.mime.guess_type(self.path)
            self.file_type = self.guessed_type[0]
            self.raw_text = textract.process(self.path, encoding="ascii")
            # Replace typographic quotes, dashes and ellipses with plain
            # ASCII equivalents.
            self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
            self.ptext = re.sub(u"\u2014", "--", self.ptext)
            # NOTE(review): as written this substitutes a comma with an
            # identical comma (a no-op); possibly a non-ASCII comma was lost
            # in an encoding round trip -- confirm.
            self.ptext = re.sub(",", ",", self.ptext)
            self.ptext = re.sub("—", "--", self.ptext)
            self.ptext = re.sub("…", "...", self.ptext)
            self.text_no_feed = self.clean_new_lines(self.ptext)
            self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
            self.sentence_count = len(self.sentence_tokens)
            self.passive_sentences = passive(self.text_no_feed)
            self.passive_sentence_count = len(self.passive_sentences)
            # NOTE(review): this division (and the one for
            # weak_verbs_to_sentences below) raises ZeroDivisionError when
            # no sentences were tokenized.
            self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                           float(self.sentence_count)))
            self.percent_passive_round = round(self.percent_passive, 2)

            # count_be_verbs result: [0] is kept as the count, [1] as the
            # collection of weak sentences.
            self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
            self.be_verb_count = self.be_verb_analysis[0]
            self.weak_sentences_all = self.be_verb_analysis[1]
            self.weak_sentences_set = set(self.weak_sentences_all)
            self.weak_sentences_count = len(self.weak_sentences_set)
            self.weak_verbs_to_sentences = 100 * float(
                self.weak_sentences_count) / float(self.sentence_count)
            self.weak_verbs_to_sentences_round = round(
                self.weak_verbs_to_sentences, 2)
            self.word_tokens = self.word_tokenize(self.text_no_feed)
            # This first value of word_tokens_no_punct is overwritten a few
            # lines below by the white-space tokenizer result.
            self.word_tokens_no_punct = \
                self.word_tokenize_no_punct(self.text_no_feed)
            self.no_punct = self.strip_punctuation(self.text_no_feed)
            # use this! It make lower and strips symbols
            self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)


            # Readability scores, all computed by textstat on text_no_feed.
            self.readability_flesch_re = \
                textstat.flesch_reading_ease(self.text_no_feed)
            self.readability_smog_index = \
                textstat.smog_index(self.text_no_feed)
            self.readability_flesch_kincaid_grade = \
                textstat.flesch_kincaid_grade(self.text_no_feed)
            self.readability_coleman_liau_index = \
                textstat.coleman_liau_index(self.text_no_feed)
            self.readability_ari = \
                textstat.automated_readability_index(self.text_no_feed)
            self.readability_linser_write = \
                textstat.linsear_write_formula(self.text_no_feed)
            self.readability_dale_chall = \
                textstat.dale_chall_readability_score(self.text_no_feed)
            self.readability_standard = \
                textstat.text_standard(self.text_no_feed)

            # The Flesch Reading Ease score is recomputed here and truncated
            # to int before being mapped to a description.
            self.flesch_re_desc_str = self.flesch_re_desc(
                int(textstat.flesch_reading_ease(self.text_no_feed)))
            self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
            self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
            self.avg_syllables_per_word = textstat.avg_syllables_per_word(
                self.text_no_feed)
            self.avg_sentence_per_word = textstat.avg_sentence_per_word(
                self.text_no_feed)
            self.avg_sentence_length = textstat.avg_sentence_length(
                self.text_no_feed)
            self.avg_letter_per_word = textstat.avg_letter_per_word(
                self.text_no_feed)
            self.difficult_words = textstat.difficult_words(self.text_no_feed)
            self.rand_passive = self.select_random(self.passive_sentence_count,
                                                   self.passive_sentences)
            # NOTE(review): self.weak_sentences is not assigned anywhere in
            # this method -- confirm it is defined elsewhere on the class;
            # otherwise weak_sentences_all or weak_sentences_set was
            # probably intended.
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences), self.weak_sentences)
            # Word-level statistics are only computed when at least one word
            # token exists.
            if self.word_tokens_no_punct:
                self.word_count = len(self.word_tokens_no_punct)
                self.page_length = float(self.word_count) / float(250)
                self.paper_count = int(math.ceil(self.page_length))
                self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
                self.pos_counts = Counter(
                    tag for word, tag in self.parts_of_speech)
                self.pos_total = sum(self.pos_counts.values())
                self.pos_freq = dict(
                    (word, float(count) / self.pos_total)
                    for word, count in self.pos_counts.items())
                # Same value as page_length.
                self.doc_pages = float(float(self.word_count) / float(250))
                self.freq_words = \
                    self.word_frequency(self.word_tokens_no_punct)
                self.modal_dist = self.modal_count(self.word_tokens_no_punct)
                # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
                self.pos_count_dict = self.pos_counts.items()

            # NOTE(review): pos_count_dict is only assigned inside the `if`
            # above, so the lookups below depend on that branch having run
            # (unless the attribute is also defined elsewhere) -- confirm.
            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            # Punctuation statistics plus one randomly selected example
            # sentence for each mark.
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            # Linting runs on the raw extracted text, not the cleaned text.
            self.lint_suggestions = lint(self.raw_text)
Ejemplo n.º 36
0
    [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
# Sentence count per article and sentiment word counts per sentence.
sf['Sentences_wc'] = sf['content'].apply(lambda x: len(sent_tokenize(x)))
sf['Positive_sentrate'] = sf['Positive_text_wc'] / sf['Sentences_wc']
sf['Negative_sentrate'] = sf['Negative_text_wc'] / sf['Sentences_wc']

# Readability feature extraction from content of news articles.
# The lambdas are kept inline on purpose: they are handed to `apply`, which
# has to ship them to its workers.
sf['FRE_text'] = sf['content'].apply(lambda x: textstat.flesch_reading_ease(x))
# Bucket the Flesch Reading Ease score: 1 = [90, 100), 2 = [80, 90),
# 3 = [70, 80), 4 = [60, 70), 5 = [50, 60), 6 = [30, 50), 7 = anything else
# (below 30, but also 100 and above).
sf['FRE_tagged_text'] = sf['FRE_text'].apply(
    lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
    if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
    if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
sf['FK_text'] = sf['content'].apply(
    lambda x: int(textstat.flesch_kincaid_grade(x)))
sf['GFI_text'] = sf['content'].apply(lambda x: textstat.gunning_fog(x))
sf['SMI_text'] = sf['content'].apply(lambda x: textstat.smog_index(x))
sf['CLI_text'] = sf['content'].apply(lambda x: textstat.coleman_liau_index(x))
sf['ARI_text'] = sf['content'].apply(
    lambda x: int(textstat.automated_readability_index(x)))
sf['DC_text'] = sf['content'].apply(
    lambda x: textstat.dale_chall_readability_score(x))
sf['Difficult_text_wc'] = sf['content'].apply(
    lambda x: textstat.difficult_words(x))

# Hand-picked quantitative features - number of percentage occurrences.
# Raw string so that \d and \. reach the regex engine without relying on
# Python passing unknown string escapes through unchanged.
percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
sf['Percent_occurrences'] = sf['content'].apply(
    lambda x: len(percent_pattern.findall(x)))

# Polarity feature extraction from news headlines
sf['Polarity_head'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
Ejemplo n.º 37
0
def predict_relevance(df):
    """Score news articles with the saved relevance model and pick a top set.

    The articles are loaded into an SFrame, enriched with LDA topic weights,
    sentiment features and readability features for both the article body
    and the headline, and then classified with the model stored in
    'relevance_model_64feat'.

    Args:
        df: table of news articles exposing ``columns.values`` and
            ``astype`` (presumably a pandas DataFrame -- confirm against
            the caller). It must contain the columns 'content' and 'title'.
            Every column is cast to ``str`` in place.

    Returns:
        The feature SFrame joined with the selected predictions. When more
        than 10 rows are classified as relevant, the 10 with the highest
        probability are returned. Otherwise all relevant rows are returned,
        topped up towards 10 rows with the non-relevant rows that have the
        lowest probability.
    """

    #Loading data into SFrame (all columns are converted to strings first)
    columns = list(df.columns.values)
    df[columns] = df[columns].astype(str)
    tf = gl.SFrame(data=df)
    tf = tf.unique()

    #Loading LDA model for topic modeling, pysentiment module for financial sentiment analysis and the relevance prediction model
    lda = models.ldamodel.LdaModel.load('lda1.model')
    lm = py.LM()
    model = gl.load_model('relevance_model_64feat')

    #Building the LDA model using news articles
    tf['tokens'] = tf['content'].apply(lambda x: dc.tokenize_doc(x, 'STEM'))
    tokens_text = [
        unicode('|'.join(i), errors='replace').split('|') for i in tf['tokens']
    ]
    dictionary = corpora.Dictionary(tokens_text)
    corpus = [dictionary.doc2bow(text) for text in tokens_text]
    ldamat = lda[corpus]

    #Building LDA topic arrays per topic
    # NOTE(review): 30 is presumably the number of topics in 'lda1.model'
    # -- confirm before swapping the model file.
    topic_arrays = np.zeros((30, len(ldamat)))
    for i, x in enumerate(ldamat):
        for topic_no, contrib in x:
            topic_arrays[topic_no, i] = contrib

    #Adding LDA topic arrays as feature columns as 'Tx'
    for i, x in enumerate(topic_arrays):
        tf['T' + str(i)] = gl.SArray(data=x, dtype=float)

    # The feature lambdas below are deliberately left exactly as they were:
    # the saved model expects these feature columns, and the lambdas are
    # handed to `apply` as-is.

    #Polarity feature extraction from content of news articles
    tf['Polarity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_text_wc'] = tf['content'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_text_rate'] = tf['Negative_text_wc'] / tf['Total_text_wc']
    tf['Positive_text_rate'] = tf['Positive_text_wc'] / tf['Total_text_wc']
    tf['Max_Polarity'] = tf['content'].apply(lambda x: max(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Min_Polarity'] = tf['content'].apply(lambda x: min(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Sentences_wc'] = tf['content'].apply(lambda x: len(sent_tokenize(x)))
    tf['Positive_sentrate'] = tf['Positive_text_wc'] / tf['Sentences_wc']
    tf['Negative_sentrate'] = tf['Negative_text_wc'] / tf['Sentences_wc']

    #Readability feature extraction from content of news articles
    tf['FRE_text'] = tf['content'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    # Buckets: 1 = [90, 100), 2 = [80, 90), 3 = [70, 80), 4 = [60, 70),
    # 5 = [50, 60), 6 = [30, 50), 7 = anything else (below 30, but also 100
    # and above).
    tf['FRE_tagged_text'] = tf['FRE_text'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_text'] = tf['content'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_text'] = tf['content'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_text'] = tf['content'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_text'] = tf['content'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_text'] = tf['content'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_text'] = tf['content'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_text_wc'] = tf['content'].apply(
        lambda x: textstat.difficult_words(x))

    #Hand-picked quantitative features - # of percentage occurrences
    # Raw string so that \d and \. reach the regex engine without relying on
    # Python passing unknown string escapes through unchanged.
    percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
    tf['Percent_occurrences'] = tf['content'].apply(
        lambda x: len(percent_pattern.findall(x)))

    #Polarity feature extraction from news headlines
    tf['Polarity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_head_wc'] = tf['title'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_head_rate'] = tf['Negative_head_wc'] / tf['Total_head_wc']
    tf['Positive_head_rate'] = tf['Positive_head_wc'] / tf['Total_head_wc']

    #Readability feature extraction from news headlines
    tf['FRE_head'] = tf['title'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_head'] = tf['FRE_head'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_head'] = tf['title'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_head'] = tf['title'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_head'] = tf['title'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_head'] = tf['title'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_head'] = tf['title'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_head'] = tf['title'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_head_wc'] = tf['title'].apply(
        lambda x: textstat.difficult_words(x))

    #Predicting relevance class using these features in sorted order of confidence
    tf = tf.add_row_number()
    pred = model.classify(tf)
    pred = pred.add_row_number()
    # Split the predictions by class. (An earlier top-10 sort whose result
    # was overwritten on the very next line has been dropped.)
    relevant = pred[pred['class'] == 1]
    non_relevant = pred[pred['class'] == 0]
    if relevant.num_rows() > 10:
        relevant_news_out = tf.join(relevant).sort('probability',
                                                   ascending=False)[:10]
    else:
        # Not enough relevant rows: top up with the non-relevant rows that
        # have the lowest probability.
        relevant_news = relevant.sort('probability', ascending=False)
        req_num_non_relevant_news = 10 - relevant.num_rows()
        non_relevant_news = non_relevant.sort(
            'probability')[:req_num_non_relevant_news]
        relevant_news = relevant_news.append(non_relevant_news)
        relevant_news_out = tf.join(relevant_news)

    return relevant_news_out
Ejemplo n.º 38
0
#!/bin/python

import sys, string, os
from textstat.textstat import textstat

# Usage: <script> <inputfile>
# Prints one comma-separated row of readability statistics for the text
# stored in <inputfile>.
script_name = sys.argv[0]
if len(sys.argv) < 2:
	sys.exit("usage: " + script_name + " <inputfile>")
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	# Join the lines with a space: joining with "" glued the last word of
	# each line to the first word of the next one, which skews every
	# word-based statistic.
	test_data = " ".join(line.rstrip() for line in myfile)

# Column order of the output row; keep it stable for downstream consumers.
# NOTE(review): `readability_consensus` may not exist in every textstat
# release (the consensus score is called `text_standard` elsewhere) --
# confirm against the installed version.
stats = [
	textstat.flesch_reading_ease(test_data),
	textstat.smog_index(test_data),
	textstat.flesch_kincaid_grade(test_data),
	textstat.coleman_liau_index(test_data),
	textstat.automated_readability_index(test_data),
	textstat.dale_chall_readability_score(test_data),
	textstat.difficult_words(test_data),
	textstat.linsear_write_formula(test_data),
	textstat.gunning_fog(test_data),
	textstat.readability_consensus(test_data),
	textstat.syllable_count(test_data),
	textstat.lexicon_count(test_data, 1),
	textstat.sentence_count(test_data),
]

print(','.join(str(value) for value in stats))
Ejemplo n.º 39
0
    '../output_text/trump_out.txt', '../output_text/shakespeare_out.txt',
    '../output_text/drseuss_out.txt'
]

# input_file_names = ['../data_parsed/trump.txt',
# Corpora to score.
input_file_names = [
    '../data_parsed/shakespeare.txt',
    '../data_parsed/drseuss.txt',
]

# (label, textstat method name) pairs, printed in this order for each file.
# The method is looked up and called only when its line is printed.
report_lines = (
    ("flesch_reading_ease: ", "flesch_reading_ease"),
    ("smog_index: ", "smog_index"),
    ("flesch_kincaid_grade: ", "flesch_kincaid_grade"),
    ("coleman_liau_index: ", "coleman_liau_index"),
    ("automated_readability_index: ", "automated_readability_index"),
    ("dale_chall_readability_score: ", "dale_chall_readability_score"),
    ("difficult_words: ", "difficult_words"),
    ("linsear_write_formula: ", "linsear_write_formula"),
    ("gunning_fog: ", "gunning_fog"),
    ("text_standard: ", "text_standard"),
)

for i, input_file_name in enumerate(input_file_names):
    print(input_file_name)
    # Newlines are removed outright (not replaced by spaces) before scoring.
    with open(input_file_name, 'r') as myfile:
        test_data = myfile.read().replace('\n', '')

    for label, method_name in report_lines:
        print(label + str(getattr(textstat, method_name)(test_data)))
Ejemplo n.º 40
0
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=25)
pprint(ldamodel.print_topics(num_topics=10, num_words=10))

# grade level
data = []
for k in tqdm(cr.keys()):
    try:
        v = cr[k]
        gl = []
        for s in tqdm(v):
            if (gl == []):
                gl.append(textstat.flesch_kincaid_grade(s) / len(v))
                gl.append(textstat.smog_index(s) / len(v))
                gl.append(textstat.automated_readability_index(s) / len(v))
                gl.append(textstat.dale_chall_readability_score(s) / len(v))
                gl.append(textstat.coleman_liau_index(s) / len(v))
                gl.append(textstat.linsear_write_formula(s) / len(v))
                gl.append(textstat.gunning_fog(s) / len(v))
            else:
                gl[0] += textstat.flesch_kincaid_grade(s) / len(v)
                gl[1] += textstat.smog_index(s) / len(v)
                gl[2] += textstat.automated_readability_index(s) / len(v)
                gl[3] += textstat.dale_chall_readability_score(s) / len(v)
                gl[4] += textstat.coleman_liau_index(s) / len(v)
                gl[5] += textstat.linsear_write_formula(s) / len(v)
                gl[6] += textstat.gunning_fog(s) / len(v)
        t = ""
        for s in v:
            t += s
        gl.append(textstat.text_standard(t))
        data.append([k] + gl)
Ejemplo n.º 41
0
    def updateData(self):
        """Append sentiment, sentence and readability rows to ``self.data``.

        Eleven lists are appended, in this order:

        0.  text sentiment: compound, neg, pos, neu polarity scores
        1.  sentence info: sentence count, average sentence length,
            syllable count, overall word count (``len(self.splList)``),
            ``char_count(text, False)``, ``char_count(text, True)``,
            average letters per word, average syllables per word
        2.  Flesch Reading Ease (clamped to 0 - 100)
        3.  Flesch-Kincaid Grade
        4.  Gunning FOG Index
        5.  SMOG Index
        6.  Automated Readability Index (clamped to 0 - 14)
        7.  Coleman-Liau Index
        8.  Linsear Write Index
        9.  Dale-Chall Readability Score (clamped to 0 - 10)
        10. overall score from ``text_standard`` (clamped to 0 - 20)

        Rows 2 - 10 hold [score rounded to 3 places, value from the matching
        grade helper, score rescaled to a 0 - 100 ratio]. For row 2 the third
        entry is the distance of the score from 100 instead. Rows 3, 4, 5,
        7 and 8 pass the raw score through ``self.adjustScore`` and use the
        0 - 18 scale. The raw and adjusted scores are also stored as
        instance attributes (``freRaw``/``freStat``, ``fkgRaw``/``fkgStat``,
        ...).

        Returns:
            ``self.data`` (the list is not reset here, so rows from earlier
            calls remain in it).
        """

        def clamp(score, upper):
            # Restrict score to the range 0 .. upper.
            return min(max(score, 0), upper)

        def grade_row(stat):
            # Row layout shared by every index reported on the 0 - 18 scale.
            return [round(stat, 3), self.grade(stat), round(stat / 0.18, 2)]

        # Full set of polarity scores for the text.
        self.polscore = self.sid.polarity_scores(self.text)

        # ----- data[0]: text sentiment -----
        sentiment_row = [
            self.polscore[key] for key in ('compound', 'neg', 'pos', 'neu')
        ]
        self.data.append(sentiment_row)

        # ----- data[1]: sentence info -----
        sentence_row = [
            textstat.sentence_count(self.text),
            textstat.avg_sentence_length(self.text),
            textstat.syllable_count(self.text),
            len(self.splList),
            textstat.char_count(self.text, False),
            textstat.char_count(self.text, True),
            textstat.avg_letter_per_word(self.text),
            textstat.avg_syllables_per_word(self.text),
        ]
        self.data.append(sentence_row)

        # ----- data[2]: Flesch Reading Ease, scale 0 - 100 -----
        self.freRaw = textstat.flesch_reading_ease(self.text)
        self.freStat = clamp(self.freRaw, 100)
        fre_row = [
            round(self.freStat, 3),
            self.freGrade(self.freStat),
            round(abs(self.freStat - 100), 2),
        ]
        self.data.append(fre_row)

        # ----- data[3]: Flesch-Kincaid Grade, scale 0 - 18 -----
        self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
        self.fkgStat = self.adjustScore(self.fkgRaw)
        self.data.append(grade_row(self.fkgStat))

        # ----- data[4]: Gunning FOG Index, scale 0 - 18 -----
        self.fogRaw = textstat.gunning_fog(self.text)
        self.fogStat = self.adjustScore(self.fogRaw)
        self.data.append(grade_row(self.fogStat))

        # ----- data[5]: SMOG Index, scale 0 - 18 -----
        self.smogRaw = textstat.smog_index(self.text)
        self.smogStat = self.adjustScore(self.smogRaw)
        self.data.append(grade_row(self.smogStat))

        # ----- data[6]: Automated Readability Index, scale 0 - 14 -----
        self.ariRaw = textstat.automated_readability_index(self.text)
        self.ariStat = clamp(self.ariRaw, 14)
        ari_row = [
            round(self.ariStat, 3),
            self.ariGrade(ceil(self.ariStat)),
            round(self.ariStat / 0.14, 2),
        ]
        self.data.append(ari_row)

        # ----- data[7]: Coleman-Liau Index, scale 0 - 18 -----
        self.cliRaw = textstat.coleman_liau_index(self.text)
        self.cliStat = self.adjustScore(self.cliRaw)
        self.data.append(grade_row(self.cliStat))

        # ----- data[8]: Linsear Write Index, scale 0 - 18 -----
        self.lwiRaw = textstat.linsear_write_formula(self.text)
        self.lwiStat = self.adjustScore(self.lwiRaw)
        self.data.append(grade_row(self.lwiStat))

        # ----- data[9]: Dale-Chall Readability Score, scale 0 - 10 -----
        self.dcrRaw = textstat.dale_chall_readability_score(self.text)
        self.dcrStat = clamp(self.dcrRaw, 10)
        dcr_row = [
            round(self.dcrStat, 3),
            self.daleChallGrade(self.dcrStat),
            round(self.dcrStat / 0.1, 2),
        ]
        self.data.append(dcr_row)

        # ----- data[10]: overall score, scale 0 - 20 -----
        # text_standard is called twice: once with the extra True argument
        # (the result is clamped like a number), once without (the result is
        # only passed through to txtGrade).
        self.txtRaw = textstat.text_standard(self.text, True)
        self.txtStd = clamp(self.txtRaw, 20)
        self.txtInfo = textstat.text_standard(self.text)
        overall_row = [
            round(self.txtStd, 3),
            self.txtGrade(self.txtStd, self.txtInfo),
            round(self.txtStd / 0.2, 2),
        ]
        self.data.append(overall_row)

        return self.data
Ejemplo n.º 42
0
def _load_tweets(path):
    """Parse one ' :: '-delimited tweet file.

    Returns a list of (text, day, hour, usernames) tuples, one per line, where
    text is field 3, day is dayMapper[field 1], hour is hourMapper(field 2)
    and usernames is field 7 split on ';'.
    """
    tweets = []
    with open(path, 'r') as tweetFile:
        for line in tweetFile:
            seg = line.strip().split(' :: ')
            tweets.append((seg[3], dayMapper[seg[1]], hourMapper(seg[2]), seg[7].split(';')))
    return tweets


def _load_leading_ints(path):
    """Return the integer held in the first ' :: '-delimited field of each line.

    The previous code used ``line.strip(' :: ')[0]``, which strips a *set of
    characters* and then keeps only the first character, so a value such as
    ``12`` was read as ``1``.  Stray leading/trailing spaces and colons are
    still tolerated, as they were before.
    """
    with open(path, 'r') as countFile:
        return [int(line.strip().strip(' :').split(' :: ')[0]) for line in countFile]


def run(groupSize, groupTitle, vectorMode, featureMode, outputFile='result.output'):
    """Train and evaluate an MLP classifier on every tweet group.

    For each group index in ``range(groupSize)`` the positive/negative tweet
    files, parser-length files and parser-head-count files are read from
    ``adData/analysis/groups/<groupTitle>/``, features are built, and the
    classifier is fitted and scored five times.  The averaged precision,
    recall, accuracy and F1 are printed and appended to ``outputFile``.

    Parameters:
        groupSize   -- number of groups to process (indices 0..groupSize-1).
        groupTitle  -- group directory name; also written as the report header.
        vectorMode  -- 1: TF-IDF word vectors, 2: binary word counts,
                       anything else: no vector features.
        featureMode -- 0: semantic features only, 1: vector features only,
                       anything else: both, stacked side by side.
        outputFile  -- path of the report file; opened in append mode.

    Raises:
        ValueError -- if featureMode needs vector features but vectorMode
                      selected none (previously an unhelpful NameError).
    """
    with open(outputFile, 'a') as resultFile:
        mentionMapper = mapMention('adData/analysis/ranked/mention.json')

        print(groupTitle)
        resultFile.write(groupTitle + '\n')

        # The sentiment list does not depend on the group, so read it once.
        happy_log_probs, sad_log_probs = utilities.readSentimentList('twitter_sentiment_list.csv')
        groupDir = 'adData/analysis/groups/' + groupTitle + '/'

        for group in range(groupSize):
            print('group: ' + str(group))
            resultFile.write('group: ' + str(group) + '\n')
            suffix = str(group)

            print('loading...')

            # Positive tweets come first, then negative ones; every parallel
            # list below (labels, parseLength, headCount) follows that order.
            posTweets = _load_tweets(groupDir + 'group' + suffix + '.pos')
            negTweets = _load_tweets(groupDir + 'group' + suffix + '.neg')
            tweets = posTweets + negTweets
            labels = [1] * len(posTweets) + [0] * len(negTweets)
            contents = [tweet[0] for tweet in tweets]

            vectorMatrix = None
            if vectorMode == 1:
                resultFile.write('tfidf \n')
                vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=2, stop_words='english')
                vectorMatrix = vectorizer.fit_transform(contents)
            elif vectorMode == 2:
                resultFile.write('binary count \n')
                vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=2, stop_words='english',
                                             binary=True)
                vectorMatrix = vectorizer.fit_transform(contents)
                print(vectorMatrix.shape)
            else:
                resultFile.write('no vector features \n')

            parseLength = (_load_leading_ints(groupDir + 'parserLength' + suffix + '.pos') +
                           _load_leading_ints(groupDir + 'parserLength' + suffix + '.neg'))
            headCount = (_load_leading_ints(groupDir + 'parserHeadCount' + suffix + '.pos') +
                         _load_leading_ints(groupDir + 'parserHeadCount' + suffix + '.neg'))

            semanticFeatures = []
            for index, (content, day, hour, mentioned) in enumerate(tweets):
                tokens = simpleTokenize(content)
                twLen = len(tokens)
                posProb, negProb = utilities.classifySentiment(tokens, happy_log_probs, sad_log_probs)
                readScore = textstat.coleman_liau_index(content)

                # Flag is set when any mentioned user has mentionMapper[user][0] == 1;
                # the followers figure sums mentionMapper[user][1] over known users.
                mentionFlag = 0
                mentionFollowers = 0
                for user in mentioned:
                    if user in mentionMapper:
                        if mentionMapper[user][0] == 1:
                            mentionFlag = 1
                        mentionFollowers += mentionMapper[user][1]

                # NOTE: a content.count('emmoj') feature was commented out in
                # the original and is still not part of the feature vector.
                semanticFeatures.append(numpy.array([
                    content.count('urrl'),
                    content.count('hhttg'),
                    content.count('ussernm'),
                    twLen,
                    posProb,
                    readScore,
                    parseLength[index],
                    headCount[index],
                    day,
                    hour,
                    mentionFlag,
                    mentionFollowers,
                ]))

            if featureMode != 0 and vectorMatrix is None:
                raise ValueError('featureMode %r requires vector features, but vectorMode %r produced none'
                                 % (featureMode, vectorMode))

            if featureMode == 0:
                resultFile.write('semantic features only \n')
                features = csr_matrix(numpy.array(semanticFeatures))
            elif featureMode == 1:
                resultFile.write('vector features only \n')
                features = vectorMatrix
            else:
                resultFile.write('both features \n')
                features = hstack((vectorMatrix, csr_matrix(numpy.array(semanticFeatures))), format='csr')

            # initialize the MLP
            model = Classifier(layers=[Layer("Sigmoid", units=100), Layer("Softmax")], learning_rate=0.02, n_iter=25)

            # Densify once; the matrix is the same for every iteration below.
            denseFeatures = features.todense()

            precisionSum = 0.0
            recallSum = 0.0
            accuracySum = 0.0
            resultFile.flush()
            print('running 5-fold CV...')
            # NOTE(review): random_state=0 yields the identical 80/20 split on
            # every iteration and the same model object is re-fitted each time,
            # so this is not a true 5-fold CV.  Left unchanged so reported
            # numbers stay comparable -- confirm the intended protocol.
            for i in range(5):
                print('case ' + str(i))
                feature_train, feature_test, label_train, label_test = cross_validation.train_test_split(
                    denseFeatures, labels, test_size=0.2, random_state=0)

                X_train = numpy.array(feature_train)
                Y_train = numpy.array(label_train)
                X_test = numpy.array(feature_test)
                Y_test = numpy.array(label_test)

                model.fit(X_train, Y_train)
                predictions = model.predict(X_test)

                correctCount = 0.0
                totalCount = 0.0
                if len(predictions) != len(label_test):
                    print('inference error!')
                    resultFile.write('inferece error!\n')
                for index, label in enumerate(predictions):
                    if label == 1:
                        if label_test[index] == 1:
                            correctCount += 1
                        totalCount += 1
                if totalCount == 0:
                    precision = 0
                else:
                    precision = correctCount / totalCount
                # Guard recall the same way as precision: a test split without
                # positive examples used to raise ZeroDivisionError.
                positiveCount = label_test.count(1)
                if positiveCount == 0:
                    recall = 0
                else:
                    recall = correctCount / positiveCount
                accuracy = model.score(X_test, Y_test)

                precisionSum += precision
                recallSum += recall
                accuracySum += accuracy
                resultFile.flush()

            outputPrecision = precisionSum / 5
            outputRecall = recallSum / 5
            outputAccuracy = accuracySum / 5
            if (outputRecall + outputPrecision) == 0:
                outputF1 = 0.0
            else:
                outputF1 = 2 * outputRecall * outputPrecision / (outputRecall + outputPrecision)

            print(outputPrecision)
            print(outputRecall)
            print(outputAccuracy)
            print(outputF1)
            print('')
            resultFile.write(str(outputPrecision) + '\n')
            resultFile.write(str(outputRecall) + '\n')
            resultFile.write(str(outputAccuracy) + '\n')
            resultFile.write(str(outputF1) + '\n')
            resultFile.write('\n')
            resultFile.flush()