Example #1
def get_readability_scores(text):
    scores = {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
        'text_standard': textstat.text_standard(text, float_output=True),
        # share of difficult words rather than the raw count
        'difficult_words': textstat.difficult_words(text) / len(text.split()),
    }
    return scores
def analyze_vocab(text):
    return {
        'num_words': textstat.lexicon_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text, float_output=True)
    }
    def score_text(self, test_data):
        score = {}
        score['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
        score['smog_index'] = textstat.smog_index(test_data)
        score['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(test_data)
        score['coleman_liau_index'] = textstat.coleman_liau_index(test_data)
        score['automated_readability_index'] = textstat.automated_readability_index(test_data)
        score['dale_chall_readability_score'] = textstat.dale_chall_readability_score(test_data)
        score['difficult_words'] = textstat.difficult_words(test_data)
        score['linsear_write_formula'] = textstat.linsear_write_formula(test_data)
        score['gunning_fog'] = textstat.gunning_fog(test_data)
        score['text_standard'] = textstat.text_standard(test_data)
        return score
Example #4
    def _extract_readability_scores(self, text: Text, scores=None) -> Dict:

        output = {}
        if scores is None or 'flesch_reading_ease' in scores:
            output['flesch_reading_ease'] = textstat.flesch_reading_ease(text)

        if scores is None or 'smog_index' in scores:
            output['smog_index'] = textstat.smog_index(text)

        if scores is None or 'flesch_kincaid_grade' in scores:
            output['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)

        if scores is None or 'coleman_liau_index' in scores:
            output['coleman_liau_index'] = textstat.coleman_liau_index(text)

        if scores is None or 'automated_readability_index' in scores:
            output['automated_readability_index'] = textstat.automated_readability_index(text)

        if scores is None or 'dale_chall_readability_score' in scores:
            output['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)

        if scores is None or 'difficult_words' in scores:
            output['difficult_words'] = textstat.difficult_words(text)

        if scores is None or 'linsear_write_formula' in scores:
            output['linsear_write_formula'] = textstat.linsear_write_formula(text)

        if scores is None or 'gunning_fog' in scores:
            output['gunning_fog'] = textstat.gunning_fog(text)

        if scores is None or 'text_standard' in scores:
            output['text_standard'] = textstat.text_standard(text, float_output=True)

        return output
Example #5
def text_analysis(test_data):
    # flesch_reading_ease: higher scores indicate material that is easier to read; aim for >60.0
    print('flesch_reading_ease: ' + str(textstat.flesch_reading_ease(test_data)))
    # smog_index: calculates a US grade level
    print('smog_index: ' + str(textstat.smog_index(test_data)))
    # flesch_kincaid_grade: calculates a US grade level
    print('flesch_kincaid_grade: ' + str(textstat.flesch_kincaid_grade(test_data)))
    # Coleman-Liau: calculates a US grade level
    print('coleman_liau_index: ' + str(textstat.coleman_liau_index(test_data)))
    # automated_readability_index: calculates a US grade level
    print('automated_readability_index: ' + str(textstat.automated_readability_index(test_data)))
    # Dale-Chall readability score: 0.1579 * (difficult words / words * 100) + 0.0496 * (words / sentences)
    print('dale_chall_readability_score: ' + str(textstat.dale_chall_readability_score(test_data)))
    # number of difficult words
    print('difficult_words: ' + str(textstat.difficult_words(test_data)))
    # Linsear Write: calculates the US grade level of a text sample based on sentence length and the number of words with three or more syllables
    print('linsear_write_formula: ' + str(textstat.linsear_write_formula(test_data)))
    # gunning_fog: the text can be understood by someone who left full-time education at a later age than the index
    print('gunning_fog: ' + str(textstat.gunning_fog(test_data)))
    # text_standard: calculates a US grade level
    print('text_standard: ' + str(textstat.text_standard(test_data)))
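The Dale-Chall comment above compresses the published formula. As a cross-check, here is a minimal sketch of the raw-score arithmetic built from textstat's own counters; dale_chall_raw is a hypothetical helper, not part of the snippet, and textstat's bundled implementation may count words and sentences slightly differently.

import textstat

def dale_chall_raw(text):
    # raw score = 0.1579 * (difficult words / words * 100) + 0.0496 * (words / sentences)
    words = textstat.lexicon_count(text)
    sentences = textstat.sentence_count(text)
    pct_difficult = textstat.difficult_words(text) / words * 100
    raw = 0.1579 * pct_difficult + 0.0496 * (words / sentences)
    # published adjustment: add 3.6365 when more than 5% of words are difficult
    if pct_difficult > 5:
        raw += 3.6365
    return raw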
def print_readability(text_to_analyse, option='short'):
    if option == 'all':
        print(
            "flesch (0-29: confusing, 30-59: Difficult, 60-69: Standard, 70-100: Easy): ",
            textstat.flesch_reading_ease(text_to_analyse))
        print("smog (years of education required): ",
              textstat.smog_index(text_to_analyse))
        print(
            "flesch kincaid grade (US school grade level required to understand the text): ",
            textstat.flesch_kincaid_grade(text_to_analyse))
        print("coleman liau: ", textstat.coleman_liau_index(text_to_analyse))
        print(
            "auto read (1-4: 5-10 years age; 5-8: 10-14 y; 9-12: 14-18 y; 13-14: 18+): ",
            textstat.automated_readability_index(text_to_analyse))
        print("dale chall (< 5: kid; 5-8: scholar; 9-10: college): ",
              textstat.dale_chall_readability_score(text_to_analyse))
        print("difficult words: ", textstat.difficult_words(text_to_analyse))
        print("linsear write: ",
              textstat.linsear_write_formula(text_to_analyse))
        print("gunning fog (9-12: High-school; 13-17: College): ",
              textstat.gunning_fog(text_to_analyse))

    print("text standard (estimated school grade level): ",
          textstat.text_standard(text_to_analyse))
def readability():
    # assumes: import os, textstat; from flask import jsonify

    text = "I am some really difficult text to read because I use obnoxiously large words."

    all_files = os.listdir("data/Job Bulletins")
    all_files.sort()

    counter = 0
    average = 0

    for file_name in all_files:
        # read each bulletin from disk: textstat scores strings, not file paths
        test_data_name = f"data/Job Bulletins/{file_name}"
        with open(test_data_name, encoding="utf-8", errors="ignore") as fh:
            test_data = fh.read()

        a = textstat.flesch_reading_ease(test_data)
        '''Score	Difficulty
        90-100	Very Easy
        80-89	Easy
        70-79	Fairly Easy
        60-69	Standard
        50-59	Fairly Difficult
        30-49	Difficult
        0-29	Very Confusing
        '''

        b = textstat.smog_index(test_data)
        c = textstat.flesch_kincaid_grade(test_data)
        d = textstat.coleman_liau_index(test_data)
        e = textstat.automated_readability_index(test_data)

        f = textstat.dale_chall_readability_score(test_data)
        '''Score	Understood by
        4.9 or lower	average 4th-grade student or lower
        5.0–5.9	average 5th or 6th-grade student
        6.0–6.9	average 7th or 8th-grade student
        7.0–7.9	average 9th or 10th-grade student
        8.0–8.9	average 11th or 12th-grade student
        9.0–9.9	average 13th to 15th-grade (college) student
        '''

        g = textstat.difficult_words(test_data)
        h = textstat.linsear_write_formula(test_data)
        i = textstat.gunning_fog(test_data)
        j = textstat.text_standard(test_data)
        k = textstat.syllable_count(text, lang='en_US')
        l = textstat.lexicon_count(text, removepunct=True)
        m = textstat.gunning_fog(text)
        n = textstat.text_standard(text, float_output=False)

        counter += a

    average = counter / len(all_files)

    # print(average)

    my_list = [average]

    return jsonify(my_list)
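The Flesch score bands quoted in the docstring above map directly to labels. A small illustrative helper (fre_band is not part of the original snippet) that turns a flesch_reading_ease score into its difficulty label:

def fre_band(score):
    # difficulty bands from the table quoted above
    bands = [(90, "Very Easy"), (80, "Easy"), (70, "Fairly Easy"),
             (60, "Standard"), (50, "Fairly Difficult"), (30, "Difficult")]
    for floor, label in bands:
        if score >= floor:
            return label
    return "Very Confusing"

# usage: fre_band(textstat.flesch_reading_ease(test_data))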
Example #8
    print(len(words))
    # remove the stopwords
    words = [word for word in words if word not in stw]
    lyrics_no_sw = [
        word for word in words if word not in stopwords.words('english')
    ]

    # Calculate the total number of words
    ttl_words = len(words)

    # Calculate the total number of sentences
    docReader = nltk.corpus.PlaintextCorpusReader('./', artist + '.txt')
    sentences = len(docReader.sents())

    # Calculate the total number of difficult words
    diff_words_count = textstat.difficult_words(raw_text)

    # Calculate readability-- Gunning Fog
    dif_words = (diff_words_count / ttl_words * 100)
    gf_read = 0.4 * (float(ttl_words / sentences) + dif_words)

    # Calculate readability-- SMOG
    poly_syl = 0
    for word in words:
        syl_count = textstatistics().syllable_count(word)
        if syl_count >= 3:
            poly_syl += 1
    SMOG = (1.043 * (30 * (poly_syl / sentences))**0.5) + 3.1291
    smog_read = legacy_round(SMOG, 1)

    # Calculate readability-- Linsear Write
        cap_words = word_tokenize(review_captial)
        cap_words = [
            w for w in cap_words
            if w not in ['.', ',', ';', '?', ':', '!', '"', "'", '#']
        ]
        for w in cap_words:
            if w[0].isupper():
                cnt += 1
        capital_count.append(cnt / len(cap_words))

        # obtaining readability features
        reviews[i] = reviews[i].strip().lower().replace("\'", '')
        kingrade.append(textstat.flesch_kincaid_grade(reviews[i]))
        gunning.append(textstat.gunning_fog(reviews[i]))
        flesch_reading_ease1.append(textstat.flesch_reading_ease(reviews[i]))
        difficult_words1.append(textstat.difficult_words(reviews[i]))
        smog_index1.append(textstat.smog_index(reviews[i]))
        automated_readability_index1.append(
            textstat.automated_readability_index(reviews[i]))
        coleman_liau_index1.append(textstat.coleman_liau_index(reviews[i]))
        linsear_write_formula1.append(
            textstat.linsear_write_formula(reviews[i]))
        dale_chall_readability_score1.append(
            textstat.dale_chall_readability_score(reviews[i]))
        word_freq = []

        #obtaining punctuation count
        words = word_tokenize(reviews[i])
        punct = [w for w in words if w in ['.', ',', ';', '?', ':', '!']]
        punct_count.append(len(punct) / len(words))
Example #10
def download(request):
    global tweetsList

    response = HttpResponse(content_type='application/x-download')
    response['Content-Disposition'] = 'attachment; filename="tweets.csv"'

    #set headers of csv
    fieldnames = ['datetime', 'last updated', 'original username', 'original screen name',
                  'original user location', 'original user verified', 'retweet', 'retweeter username',
                  'retweeter screen name', 'retweeter location', 'retweeter verified', 'text', 'comment',
                  # 'hashtags', 'urls', '#retweets','#favorites', '#retweets of retweet',
                  'hashtags', 'urls', '#retweets', '#favorites',
                  '#favorites of retweet', 'original syllable count', 'original lexicon count',
                  'original sentence count', 'original flesch reading ease score', 'original flesch-kincaid grade level',
                  'original fog scale', 'original smog index', 'original automated readability index', 'original coleman-liau index',
                  'original linsear write level', 'original dale-chall readability score', 'original difficult words',
                  'original readability consensus', 'original neg sentiment', 'original neu sentiment', 'original pos sentiment',
                  'original overall sentiment', 'comment syllable count', 'comment lexicon count',
                  'comment sentence count', 'comment flesch reading ease score', 'comment flesch-kincaid grade level',
                  'comment fog scale', 'comment smog index', 'comment automated readability index', 'comment coleman-liau index',
                  'comment linsear write level', 'comment dale-chall readability score', 'comment difficult words',
                  'comment readability consensus', 'comment neg sentiment', 'comment neu sentiment', 'comment pos sentiment',
                  'comment overall sentiment', 'combined syllable count', 'combined lexicon count',
                  'combined sentence count', 'combined flesch reading ease score', 'combined flesch-kincaid grade level',
                  'combined fog scale', 'combined smog index', 'combined automated readability index', 'combined coleman-liau index',
                  'combined linsear write level', 'combined dale-chall readability score', 'combined difficult words',
                  'combined readability consensus', 'combined neg sentiment', 'combined neu sentiment', 'combined pos sentiment',
                  'combined overall sentiment', 'twitter users query', 'twitter excluded users query', 'twitter hashtags query', 'twitter keywords query',
                  'twitter from date query', 'twitter to date query']

    writer = csv.writer(response, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(fieldnames)

    for tweet in tweetsList:
        #combine hashtags of tweet into string separated by commas
        hashtagString = ""
        tweetHashtags = HashtagLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetHashtags)):
            if i == 0:
                hashtagString += tweetHashtags[i].hashtag.hashtagText
            else:
                hashtagString += ", " + tweetHashtags[i].hashtag.hashtagText

        #combine urls of tweet into string separated by commas
        urlString = ""
        tweetUrls = UrlLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetUrls)):
            if i == 0:
                urlString += tweetUrls[i].url.urlText
            else:
                urlString += ", " + tweetUrls[i].url.urlText

        #display yes or no in verified column for original user
        if tweet.originalUser.isVerified:
            originalVerifiedString = "yes"
        else:
            originalVerifiedString = "no"

        #if not a retweet, new user fields should be empty
        newUsername = None
        newScreenName = None
        newLocation = None
        newVerifiedString = None

        #if retweet:
        #display yes or no in verified column for new user
        if tweet.newUser:
            if tweet.newUser.isVerified:
                newVerifiedString = "yes"
            else:
                newVerifiedString = "no"

            #set retweet fields
            newUsername = tweet.newUser.username
            newScreenName = tweet.newUser.screenName
            newLocation = tweet.newUser.location

        #display yes or no in retweet column
        if tweet.isRetweet:
            isRetweetString = "yes"
        else:
            isRetweetString = "no"

        #get sentiment scores of original text
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict_original = sid_obj.polarity_scores(tweet.originalText)

        #combine comment text and original text and get sentiment scores for the combination
        commentText = ""
        if tweet.commentText:
            commentText = tweet.commentText
        sentiment_dict_combined = sid_obj.polarity_scores(tweet.originalText + commentText)

        #initialize all comment word processing to empty strings in case there is no comment text
        cSyllableCount = ""
        cLexiconCount = ""
        cSentenceCount = ""
        cFleschReadingEase = ""
        cFleschKincaidGrade = ""
        cGunningFog = ""
        cSmogIndex = ""
        cAutomatedReadabilityIndex = ""
        cColemanLiauIndex = ""
        cLinsearWriteFormula = ""
        cDaleChallReadabilityScore = ""
        cDifficultWords = ""
        cTextStandard = ""

        #if there is comment text, get language processing stats for comment text
        if tweet.commentText is not None:
            cSyllableCount = textstat.syllable_count(tweet.commentText, lang='en_US')
            cLexiconCount = textstat.lexicon_count(tweet.commentText, removepunct=True)
            cSentenceCount = textstat.sentence_count(tweet.commentText)
            cFleschReadingEase = textstat.flesch_reading_ease(tweet.commentText)
            cFleschKincaidGrade = textstat.flesch_kincaid_grade(tweet.commentText)
            cGunningFog = textstat.gunning_fog(tweet.commentText)
            cSmogIndex = textstat.smog_index(tweet.commentText)
            cAutomatedReadabilityIndex = textstat.automated_readability_index(tweet.commentText)
            cColemanLiauIndex = textstat.coleman_liau_index(tweet.commentText)
            cLinsearWriteFormula = textstat.linsear_write_formula(tweet.commentText)
            cDaleChallReadabilityScore = textstat.dale_chall_readability_score(tweet.commentText)
            cDifficultWords = textstat.difficult_words(tweet.commentText)
            cTextStandard = textstat.text_standard(tweet.commentText, float_output=False)

        #get sentiment scores for comment text
        cNegSent = ""
        cNeuSent = ""
        cPosSent = ""
        cCompoundSent = ""
        if tweet.commentText:
            sentiment_dict_comment = sid_obj.polarity_scores(tweet.commentText)
            cNegSent = sentiment_dict_comment['neg']
            cNeuSent = sentiment_dict_comment['neu']
            cPosSent = sentiment_dict_comment['pos']
            cCompoundSent = sentiment_dict_comment['compound']

        #write all information about the tweet, and its language processing stats to row in csv
        writer.writerow(
            [tweet.createdAt, tweet.lastUpdated, tweet.originalUser.username,
             tweet.originalUser.screenName, tweet.originalUser.location, originalVerifiedString,
             isRetweetString, newUsername, newScreenName, newLocation, newVerifiedString,
             tweet.originalText, tweet.commentText, hashtagString, urlString, tweet.numRetweetsOriginal,
             # tweet.numFavoritesOriginal, tweet.numRetweetsNew, tweet.numFavoritesNew,
             tweet.numFavoritesOriginal, tweet.numFavoritesNew,
             textstat.syllable_count(tweet.originalText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText, removepunct=True),
             textstat.sentence_count(tweet.originalText),
             textstat.flesch_reading_ease(tweet.originalText),
             textstat.flesch_kincaid_grade(tweet.originalText),
             textstat.gunning_fog(tweet.originalText),
             textstat.smog_index(tweet.originalText),
             textstat.automated_readability_index(tweet.originalText),
             textstat.coleman_liau_index(tweet.originalText),
             textstat.linsear_write_formula(tweet.originalText),
             textstat.dale_chall_readability_score(tweet.originalText),
             textstat.difficult_words(tweet.originalText),
             textstat.text_standard(tweet.originalText, float_output=False),
             sentiment_dict_original['neg'], sentiment_dict_original['neu'],
             sentiment_dict_original['pos'], sentiment_dict_original['compound'], cSyllableCount,
             cLexiconCount, cSentenceCount, cFleschReadingEase, cFleschKincaidGrade, cGunningFog,
             cSmogIndex, cAutomatedReadabilityIndex, cColemanLiauIndex, cLinsearWriteFormula, cDaleChallReadabilityScore,
             cDifficultWords, cTextStandard, cNegSent, cNeuSent, cPosSent, cCompoundSent,
             textstat.syllable_count(tweet.originalText + commentText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText + commentText, removepunct=True),
             textstat.sentence_count(tweet.originalText + commentText),
             textstat.flesch_reading_ease(tweet.originalText + commentText),
             textstat.flesch_kincaid_grade(tweet.originalText + commentText),
             textstat.gunning_fog(tweet.originalText + commentText),
             textstat.smog_index(tweet.originalText + commentText),
             textstat.automated_readability_index(tweet.originalText + commentText),
             textstat.coleman_liau_index(tweet.originalText + commentText),
             textstat.linsear_write_formula(tweet.originalText + commentText),
             textstat.dale_chall_readability_score(tweet.originalText + commentText),
             textstat.difficult_words(tweet.originalText + commentText),
             textstat.text_standard(tweet.originalText + commentText, float_output=False),
             sentiment_dict_combined['neg'], sentiment_dict_combined['neu'],
             sentiment_dict_combined['pos'], sentiment_dict_combined['compound'],
             tweet.twitterQueryUsers, tweet.twitterQueryNotUsers,
             tweet.twitterQueryHashtags, tweet.twitterQueryKeywords,
             tweet.twitterQueryFromDate, tweet.twitterQueryToDate]
        )

    return response
Example #11
def ReadabilityFeatureGenerator(df):
    """
    Computes various readability features of news content.

    Input: DataFrame
    Returns list of readability features
    """

    t0 = time()
    print("\n---Generating Readability Features:---")

    def lexical_diversity(text):
        # word-level diversity: total tokens per unique token
        words = text.split()
        word_count = len(words)
        vocab_size = len(set(words))
        diversity_score = word_count / vocab_size
        return diversity_score

    def get_counts(text, word_list):
        words = nltk.tokenize.word_tokenize(text.lower())
        count = 0
        for word in words:
            if word in word_list:
                count += 1
        return count

    df['flesch_reading_ease'] = df['articleBody'].map(
        lambda x: textstat.flesch_reading_ease(x))
    df['smog_index'] = df['articleBody'].map(lambda x: textstat.smog_index(x))
    df['flesch_kincaid_grade'] = df['articleBody'].map(
        lambda x: textstat.flesch_kincaid_grade(x))
    df['coleman_liau_index'] = df['articleBody'].map(
        lambda x: textstat.coleman_liau_index(x))
    df['automated_readability_index'] = df['articleBody'].map(
        lambda x: textstat.automated_readability_index(x))
    df['dale_chall_readability_score'] = df['articleBody'].map(
        lambda x: textstat.dale_chall_readability_score(x))
    df['difficult_words'] = df['articleBody'].map(
        lambda x: textstat.difficult_words(x))
    df['linsear_write_formula'] = df['articleBody'].map(
        lambda x: textstat.linsear_write_formula(x))
    df['gunning_fog'] = df['articleBody'].map(
        lambda x: textstat.gunning_fog(x))
    df['i_me_myself'] = df['articleBody'].apply(get_counts,
                                                args=(['i', 'me', 'myself'], ))
    df['punct'] = df['articleBody'].apply(get_counts,
                                          args=([',', '.', '!', '?'], ))
    df['lexical_diversity'] = df['articleBody'].apply(lexical_diversity)

    feats = [
        'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index',
        'dale_chall_readability_score', 'difficult_words',
        'linsear_write_formula', 'gunning_fog', 'i_me_myself', 'punct',
        'lexical_diversity'
    ]

    xReadable = df[feats].values
    print('xReadable.shape: ', xReadable.shape)

    print('---Readability Features is complete---')
    print("Time taken {} seconds\n".format(time() - t0))
    return [xReadable]
import textstat

# reference: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
# reference: https://pypi.org/project/textstat/

# url = https://www.omio.com/trains/paris/london
text = "Trains in Europe are a convenient way of traveling between cities, with a number of train companies offering domestic and international routes. Trains from Paris to London are incredibly fast and convenient, with high-speed Eurostar trains traveling between the cities in about 2.5 hours. Eurostar trains depart from Paris frequently during the day from early morning until evening, providing plenty of travel options. Paris to London train travel time ranges between 2 hours 15 minutes and 2 hours 30 minutes, depending on the chosen departure time. Eurostar's Paris to London trains leave from Paris Gare du Nord station and travel directly to London St Pancras International. Peak times tend to be in the morning around 8:30 a.m. and 4:30 p.m. in the afternoon."
print "Flesch Reading Ease: " + str(textstat.flesch_reading_ease(text))
print "Flesch Kincaid Grade: " + str(textstat.flesch_kincaid_grade(text))
print "Smog Index: " + str(textstat.smog_index(text))
print "Coleman Liau Index: " + str(textstat.coleman_liau_index(text))
print "Automated Readability Index: " + str(
    textstat.automated_readability_index(text))
print "Dale Chall Readability Score: " + str(
    textstat.dale_chall_readability_score(text))
print "Difficult Words: " + str(textstat.difficult_words(text))
print "Linsear Write Formula: " + str(textstat.linsear_write_formula(text))
print "Gunning Fog: " + str(textstat.gunning_fog(text))
print "Text Standard: " + str(textstat.text_standard(text))
print "==========\n"

# url = https://de.omio.com/bahn/berlin/amsterdam-rdudx
textstat.set_lang('de_DE')
text = "Bahn von Berlin nach Amsterdam \
Tagtäglich verbinden mehrere Schnellzüge der Deutschen Bahn Berlin mit Amsterdam auf direktem Wege: Einfach am Hauptbahnhof einsteigen, entspannt zurücklehnen und nach 6,5 Stunden im Herzen der niederländischen Metropole aussteigen. Zusätzlich bestehen zahlreiche genauso schnelle Verbindungen mit einmaligem Umsteigen in Hannover oder Duisburg. Einfacher und komfortabler geht es kaum! \
Welche Bahngesellschaften fahren von von Berlin nach Amsterdam? \
Alle Verbindungen mit der Bahn von Berlin nach Amsterdam werden von der Deutschen Bahn angeboten. Fast täglich fahren 7 ICs der DB direkt vom Berliner Hauptbahnhof nach Amsterdam Centraal. Der erste IC verlässt Berlin bereits in den frühen Morgenstunden und erreicht mittags Amsterdam. Der letzte IC fährt am späten Nachmittag in Berlin ab und kommt noch vor Mitternacht in Amsterdam an. \
Daneben besteht die Möglichkeit, in Hannover oder Duisburg in ICs oder ICEs nach Amsterdam umzusteigen. Wer die längere Dauer und das mehrmalige Umsteigen nicht scheut, kann auch die landschaftliche attraktive Route über Ostfriesland wählen: Dabei führt die Fahrt mit der Bahn von Berlin nach Amsterdam über Hamburg, Bremen, Leer/Ostfriesland, Groningen und Almere. \
\
Wie lange dauert die Bahnfahrt von Berlin nach Amsterdam? \
Die direkte Fahrt mit der Bahn von Berlin nach Amsterdam dauert exakt 6,5 Stunden. Doch auch die Umsteigeverbindungen über Hannover oder Duisburg benötigen nicht länger für die gesamte Strecke, da die Bahn bis Hannover bzw. Duisburg weniger Zwischenhalte einlegt. \
Example #13
test_data = (
    "SUNDAY, JANUARY 13, 2019. Additional instructions will be sent via e-mail. "
    "Candidates who fail to complete the advisory essay as instructed may be disqualified."
    "The multiple-choice test will be proctored and administered on-line during a single session. "
    "Candidates invited to participate in the on-line multiple-choice test will be able to take the test "
    "as instructed from a remote location using a computer with a webcam and a reliable internet connection. "
    "Candidates will receive an e-mail from the City of Los Angeles outlining the dates and "
    "specific steps on how to take the multiple-choice test and advisory essay on-line"
)

textstat.flesch_reading_ease(test_data)
textstat.smog_index(test_data)
textstat.flesch_kincaid_grade(test_data)
textstat.coleman_liau_index(test_data)
textstat.automated_readability_index(test_data)
textstat.dale_chall_readability_score(test_data)
textstat.difficult_words(test_data)
textstat.linsear_write_formula(test_data)
textstat.gunning_fog(test_data)
textstat.text_standard(test_data)



# Let's take another sample


df_opening_pdfs.head()

# Clean up the pdf opening names

df_openings = pd.DataFrame()
Example #14
import textstat
#import readtime

test_data = "Hello world. Welcome to my home!"

print("Flesch Reading Ease : " + str(textstat.flesch_reading_ease(test_data)))
print("Smog Index : " + str(textstat.smog_index(test_data)))
print("Flesch Kincaid Grade : " +
      str(textstat.flesch_kincaid_grade(test_data)))
print("Coleman Liau Index : " + str(textstat.coleman_liau_index(test_data)))
print("Automated Readibility Index : " +
      str(textstat.automated_readability_index(test_data)))
print("Dale Chall Readability Score : " +
      str(textstat.dale_chall_readability_score(test_data)))
print("Difficult Words : " + str(textstat.difficult_words(test_data)))
print("Linsear Write Formula : " +
      str(textstat.linsear_write_formula(test_data)))
print("Gunning Fog : " + str(textstat.gunning_fog(test_data)))
print("Text Stamdard : " + str(textstat.text_standard(test_data)))

res = len(test_data.split())
print("Word Count : " + str(res))
"""
Average reading speed of an adult - roughly 256 words per minute


256 words can be read in 60 seconds

1 word can be read in (60/256) seconds

n words can be read in (60/256) * n seconds
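Applying that arithmetic to the word count computed above (256 words per minute is the assumption stated in the comment):

reading_time_seconds = (60 / 256) * res
print("Estimated Reading Time : " + str(round(reading_time_seconds, 2)) + " seconds")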

def get_difficult_words(text):
    # `text` is expected to expose its raw string as `.text`
    # (e.g. a spaCy Doc or a similar wrapper object)
    return textstat.difficult_words(text.text)
Example #16
def test_difficult_words():
    textstat.set_lang("en_US")
    result = textstat.difficult_words(long_test)

    assert result == 49
Example #17
def test_difficult_words():
    result = textstat.difficult_words(long_test)

    assert result == 49
Example #18
    def process(self, df):

        t0 = time()
        print("\n---Generating Readability Features:---\n")

        def lexical_diversity(text):
            # word-level diversity: total tokens per unique token
            words = text.split()
            word_count = len(words)
            vocab_size = len(set(words))
            diversity_score = word_count / vocab_size
            return diversity_score

        def get_counts(text, word_list):
            words = nltk.tokenize.word_tokenize(text.lower())
            count = 0
            for word in words:
                if word in word_list:
                    count += 1
            return count

        df['flesch_reading_ease'] = df['articleBody'].astype(str).map(
            lambda x: textstat.flesch_reading_ease(x))
        print('flesch_reading_ease done!')
        df['smog_index'] = df['articleBody'].astype(str).map(
            lambda x: textstat.smog_index(x))
        print('smog_index done!')
        df['flesch_kincaid_grade'] = df['articleBody'].astype(str).map(
            lambda x: textstat.flesch_kincaid_grade(x))
        print('flesch_kincaid_grade done!')
        df['coleman_liau_index'] = df['articleBody'].astype(str).map(
            lambda x: textstat.coleman_liau_index(x))
        print('coleman_liau_index done!')
        df['automated_readability_index'] = df['articleBody'].astype(str).map(
            lambda x: textstat.automated_readability_index(x))
        print('automated_readability_index done!')
        df['dale_chall_readability_score'] = df['articleBody'].astype(str).map(
            lambda x: textstat.dale_chall_readability_score(x))
        print('dale_chall_readability_score done!')
        df['difficult_words'] = df['articleBody'].astype(str).map(
            lambda x: textstat.difficult_words(x))
        print('difficult_words done!')
        df['linsear_write_formula'] = df['articleBody'].astype(str).map(
            lambda x: textstat.linsear_write_formula(x))
        print('linsear_write_formula done!')
        df['gunning_fog'] = df['articleBody'].astype(str).map(
            lambda x: textstat.gunning_fog(x))
        print('gunning_fog done!')
        df['i_me_myself'] = df['articleBody'].astype(str).apply(
            get_counts, args=(['i', 'me', 'myself'], ))
        print('i_me_myself done!')
        df['punct'] = df['articleBody'].astype(str).apply(
            get_counts, args=([',', '.', '!', '?'], ))
        print('punct done!')
        df['lexical_diversity'] = df['articleBody'].astype(str).apply(
            lexical_diversity)
        print('lexical_diversity done!')

        feats = [
            'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
            'coleman_liau_index', 'automated_readability_index',
            'dale_chall_readability_score', 'difficult_words',
            'linsear_write_formula', 'gunning_fog', 'i_me_myself', 'punct',
            'lexical_diversity'
        ]

        outfilename_xReadable = df[feats].values

        with open('../saved_data/kaggle/read.pkl', 'wb') as outfile:
            pickle.dump(feats, outfile, -1)
            pickle.dump(outfilename_xReadable, outfile, -1)

        print('readable features saved in read.pkl')

        print('\n---Readability Features is complete---')
        print("Time taken {} seconds\n".format(time() - t0))

        return 1
Example #19
def get_article_features(title, text, nlp):
    '''
    Returns a dict of features for the given article.
    Takes three parameters: the title, the article text, and a loaded
    spaCy model (nlp).
    '''
    def preprocess(sentence):
        sentence = sentence.lower()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        return " ".join(tokens)

    def lexical_diversity(text):
        '''
        Returns the diversity of the string.
        '''
        tokens = word_tokenize(text)
        word_count = len(tokens)
        vocab_size = len(set(tokens))
        diversity_score = vocab_size / word_count
        return (diversity_score * 100)

    def freq_dist_sentence(text, stop_flag=False):
        '''
        Returns word count for Each Sentence
        '''
        text = preprocess(text)
        tokenized_word = word_tokenize(text)
        # with Stop Flag enabled
        if stop_flag:
            stop_words = set(stopwords.words("english"))
            tokenized_word = [
                x for x in tokenized_word
                if (x not in stop_words and x.isalpha())
            ]
        fdist = FreqDist(tokenized_word)
        return fdist

    def polarity_sc(text):
        # Returns dictionary of Polarity Score. Vader Intensity Analyzer
        sid = SentimentIntensityAnalyzer()
        scores = sid.polarity_scores(text)
        return scores

    def reading_standard(text):
        # text_standard (with float_output=False, the default) returns a
        # string such as "8th and 9th grade"; the regex extracts the grades
        x = textstat.text_standard(text)
        match = re.search(r'(.?\d+)th(\s\w{3}\s((.?\d+)))?', x)
        r_stan = []
        if match:
            r_stan.append(match.group(1))
            r_stan.append(match.group(3))
        return r_stan

    def spacy_vizualizer(title, text, nlp):
        '''
        Returns Graphs of NER and Dependency Parse.
        Return Format is HTML
        '''
        text = nlp(text)
        title = nlp(title)
        html_dep = displacy.render(title, style='dep', page=True)
        html_ent = displacy.render(text, style='ent', minify=True)
        dep = html_dep
        ent = html_ent
        print(dep)
        return (dep, ent)

    result = {}
    #Result = []
    Text = preprocess(text)
    Title = preprocess(title)
    result['difficult_words'] = textstat.difficult_words(Text)
    result['word_count'] = len(word_tokenize(Text))
    result['lexical_diversity'] = lexical_diversity(Text)
    result['word_dist'] = dict(freq_dist_sentence(Text).most_common())
    result['word_dist_without_stopwords'] = dict(
        freq_dist_sentence(Text, stop_flag=True).most_common())
    result['polarity_title_pos'] = polarity_sc(Title)['pos'] * 100
    result['polarity_title_neg'] = polarity_sc(Title)['neg'] * 100
    result['polarity_title_neu'] = polarity_sc(Title)['neu'] * 100
    result['reading_standard'] = reading_standard(text)
    result['dependency_html'], result['ner_html'] = spacy_vizualizer(
        title, text, nlp)

    return result
Example #20
    lambda x: x.count("?") / len(x.split()),
    lambda x: x.count("-") / len(x.split()),
    lambda x: x.count(",") / len(x.split()),
    lambda x: x.count("$") / len(x.split()),
    lambda x: x.count("(") / len(x.split()),
    lambda x: len(x) / (x.count(" ") + 1),
    lambda x: x.count(" ") / (x.count(".") + 1),
    lambda x: len(re.findall("\d", x)),
    lambda x: len(re.findall("[A-Z]", x)),
    lambda x: textstat.flesch_reading_ease(x),
    lambda x: textstat.smog_index(x),
    lambda x: textstat.flesch_kincaid_grade(x),
    lambda x: textstat.coleman_liau_index(x),
    lambda x: textstat.automated_readability_index(x),
    lambda x: textstat.dale_chall_readability_score(x),
    lambda x: textstat.difficult_words(x),
    lambda x: textstat.linsear_write_formula(x),
    lambda x: textstat.gunning_fog(x),
]

# Apply each function and put the results into a list.
columns = []
for func in transform_functions:
    columns.append(df["text"].apply(func))

# Convert the meta features to a numpy array.
meta = np.asarray(columns).T

##features = np.hstack([ meta,chi_matrix.todense()])
features = np.hstack([meta, tfidf.todense()])
##features=tfidf
Example #22
    def __init__(self):
        test_data = (
            '''Playing games has always been thought to be important to
            the development of well-balanced and creative children; 
            however, what part, if any, they should play in the lives 
            of adults has never been researched that deeply. I believe 
            that playing games is every bit as important for adults 
            as for children. Not only is taking time out to play games 
            with our children and other adults valuable to building 
            interpersonal relationships but is also a wonderful way 
            to release built up tension.
            ''')

        print(
            'flesch_reading_ease',
            textstat.flesch_reading_ease(test_data),
        )
        print(
            'flesch_kincaid_grade',
            textstat.flesch_kincaid_grade(test_data),
        )

        print(
            'difficult_words',
            textstat.difficult_words(test_data),
        )
        print(
            'automated_readability_index',
            textstat.automated_readability_index(test_data),
        )
        print(
            'text_standard',
            textstat.text_standard(test_data),
        )

        print(
            'smog_index',
            textstat.smog_index(test_data),
        )
        print(
            'gunning_fog',
            textstat.gunning_fog(test_data),
        )

        print(
            'coleman_liau_index',
            textstat.coleman_liau_index(test_data),
        )
        print(
            'dale_chall_readability_score',
            textstat.dale_chall_readability_score(test_data),
        )

        print(
            'linsear_write_formula',
            textstat.linsear_write_formula(test_data),
        )

        # the next four formulas (Fernandez-Huerta, Szigriszt-Pazos,
        # Gutierrez de Polini, Crawford) are readability measures
        # designed for Spanish-language text
        print(
            'fernandez_huerta',
            textstat.fernandez_huerta(test_data),
        )
        print(
            'szigriszt_pazos',
            textstat.szigriszt_pazos(test_data),
        )
        print(
            'gutierrez_polini',
            textstat.gutierrez_polini(test_data),
        )
        print(
            'crawford',
            textstat.crawford(test_data),
        )

        print('=========')
        blob = TextBlob(test_data)

        for sentence in blob.sentences:
            print(sentence, sentence.sentiment.polarity)
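Since the last four scores printed above are Spanish readability formulas, here is a short hedged sketch of how they would typically be used on Spanish text; the sample sentence is made up for illustration.

import textstat

textstat.set_lang("es")
texto = ("Los juegos siempre se han considerado importantes para el "
         "desarrollo de los ninos.")
print("fernandez_huerta", textstat.fernandez_huerta(texto))
print("szigriszt_pazos", textstat.szigriszt_pazos(texto))
print("gutierrez_polini", textstat.gutierrez_polini(texto))
print("crawford", textstat.crawford(texto))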
Example #23
def getneurograde():

    composition = textbox.get("1.0", END)

    neuro_flesch_reading_ease = str(textstat.flesch_reading_ease(composition))
    print('Flesch Reading Ease : ' + neuro_flesch_reading_ease)
    output1 = tk.Label(root,
                       text='Flesch Reading Ease : ' +
                       neuro_flesch_reading_ease,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 500, window=output1)

    neuro_flesch_kincaid_grade = str(
        textstat.flesch_kincaid_grade(composition))
    print('Flesch Kincaid Grade : ' + neuro_flesch_kincaid_grade)
    output2 = tk.Label(root,
                       text='Flesch Kincaid Grade : ' +
                       neuro_flesch_kincaid_grade,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 520, window=output2)

    neuro_smog_index = str(textstat.smog_index(composition))
    print('Smog Index : ' + neuro_smog_index)
    output3 = tk.Label(root,
                       text='Smog Index : ' + neuro_smog_index,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 540, window=output3)

    neuro_coleman_liau_index = str(textstat.coleman_liau_index(composition))
    print('Coleman Liau Index : ' + neuro_coleman_liau_index)
    output4 = tk.Label(root,
                       text='Coleman Liau Index : ' + neuro_coleman_liau_index,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 560, window=output4)

    neuro_automated_readability_index = str(
        textstat.automated_readability_index(composition))
    print('Automated Readability Index : ' + neuro_automated_readability_index)
    output5 = tk.Label(root,
                       text='Automated Readability Index : ' +
                       neuro_automated_readability_index,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 580, window=output5)

    neuro_dale_chall_readability_score = str(
        textstat.dale_chall_readability_score(composition))
    print('Dale Chall Readability Score : ' +
          neuro_dale_chall_readability_score)
    output6 = tk.Label(root,
                       text='Dale Chall Readability Score : ' +
                       neuro_dale_chall_readability_score,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 600, window=output6)

    neuro_difficult_words = str(textstat.difficult_words(composition))
    print('Difficult Words : ' + neuro_difficult_words)
    output7 = tk.Label(root,
                       text='Difficult Words : ' + neuro_difficult_words,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 620, window=output7)

    neuro_linsear_write_formula = str(
        textstat.linsear_write_formula(composition))
    print('Linsear Write Formula : ' + neuro_linsear_write_formula)
    output8 = tk.Label(root,
                       text='Linsear Write Formula : ' +
                       neuro_linsear_write_formula,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 640, window=output8)

    neuro_gunning_fog = str(textstat.gunning_fog(composition))
    print('Gunning Fog : ' + neuro_gunning_fog)
    output9 = tk.Label(root,
                       text='Gunning Fog : ' + neuro_gunning_fog,
                       font=('helvetica', 10),
                       bg="#a4de02")
    canvas1.create_window(400, 660, window=output9)

    neuro_text_standard = str(textstat.text_standard(composition))
    print('Text Standard : ' + neuro_text_standard)
    output10 = tk.Label(root,
                        text='Text Standard : ' + neuro_text_standard,
                        font=('helvetica', 10),
                        bg="#a4de02")
    canvas1.create_window(400, 680, window=output10)

    word_count = len(composition.split())
    neuro_word_count = str(word_count)
    output11 = tk.Label(root,
                        text='Word Count : ' + neuro_word_count,
                        font=('helvetica', 10),
                        bg="#a4de02")
    canvas1.create_window(400, 700, window=output11)

    reading_time = (60 / 256) * word_count
    neuro_reading_time = str(reading_time)
    output12 = tk.Label(root,
                        text='Average Reading Time : ' + neuro_reading_time +
                        ' seconds',
                        font=('helvetica', 10),
                        bg="#a4de02")
    canvas1.create_window(400, 720, window=output12)
Example #24
    def test_difficult_words(self):
        result = textstat.difficult_words(self.long_test)

        self.assertEqual(49, result)
Example #25
def testing_tfidf_fe(s):
    s = clean_text(s)
    df = pd.DataFrame(columns=['text'])
    df.loc[0] = [s]

    f = open("glove.6B.100d.txt", "r", encoding="utf8")

    glove = []
    r = f.readlines()
    stop_words = stopwords.words("english")

    for i in r:
        if (i.split()[0] not in stop_words):
            glove.append(i.split()[0])

    f.close()

    transform_functions = [
        lambda x: x.count(" ") / len(x.split()),
        lambda x: x.count(".") / len(x.split()),
        lambda x: x.count("!") / len(x.split()),
        lambda x: x.count("?") / len(x.split()),
        lambda x: x.count("-") / len(x.split()),
        lambda x: x.count(",") / len(x.split()),
        lambda x: x.count("$") / len(x.split()),
        lambda x: x.count("(") / len(x.split()),
        lambda x: len(x) / (x.count(" ") + 1),
        lambda x: x.count(" ") / (x.count(".") + 1),
        lambda x: len(re.findall("\d", x)),
        lambda x: len(re.findall("[A-Z]", x)),
        lambda x: textstat.flesch_reading_ease(x),
        lambda x: textstat.smog_index(x),
        lambda x: textstat.flesch_kincaid_grade(x),
        lambda x: textstat.coleman_liau_index(x),
        lambda x: textstat.automated_readability_index(x),
        lambda x: textstat.dale_chall_readability_score(x),
        lambda x: textstat.difficult_words(x),
        lambda x: textstat.linsear_write_formula(x),
        lambda x: textstat.gunning_fog(x),
    ]

    transformer = TfidfTransformer(smooth_idf=True)
    count_vectorizer = CountVectorizer(ngram_range=(2, 3), vocabulary=glove)
    counts = count_vectorizer.fit_transform(df['text'].values)
    tfidf = transformer.fit_transform(counts)

    columns = []

    for func in transform_functions:
        columns.append(df["text"].apply(func))

    meta = np.asarray(columns).T

    features = np.hstack([meta, tfidf.todense()])
    with open('my_dumped_classifier.pkl', 'rb') as fid:
        gnb_loaded = cPickle.load(fid)
        ans = gnb_loaded.predict(features)[0]
        if ans == 0:
            print("Article is legitimate according to the classifier")
        else:
            print("Article is fake according to the classifier")
        return ans