Code Example #1
def extractfeaturevalueslime(title, body, tags):
    # Assumes module-level imports/globals defined elsewhere in the project:
    # re, numpy as np, time, Textatistic, punct_clean, dummerdict (weekday name -> code)
    # and top20 (the most popular tags).
    tagnum = len(tags)
    titleuppers = len(re.findall(r'[A-Z]', title))
    titlelength = len(title)
    titleqmarks = len(re.findall(r'\?', title))
    snippetslist = body.split("code>")[1::2]
    cleansnippets = [code.replace("</", "") for code in snippetslist]
    nsnippets = len(cleansnippets)
    bodychunks = body.split("code>")[0::2]
    cleanbodychunks = [
        re.sub('(<[^>]+>)|(\\n)|(\\r)|(<)', '', chunk) for chunk in bodychunks
    ]
    conbodylength = len(" ".join(cleanbodychunks))
    # get readability score for body
    try:
        clean = punct_clean(",".join(cleanbodychunks) + ".")
        read = Textatistic(clean).flesch_score
    except:
        read = -1000
    snippetlength = len("".join(cleansnippets))
    today = time.strftime("%A")
    creationday = dummerdict[today]
    popcount = np.sum([t in top20 for t in tags])
    bodyqmarks = len(re.findall(r'\?', (",".join(cleanbodychunks))))
    values = np.array([
        tagnum, titleuppers, titlelength, titleqmarks, nsnippets,
        conbodylength, read, snippetlength, creationday, popcount, bodyqmarks
    ]).astype(float)
    return values
Code Example #2
def text_statistics(text):
  word_count = get_word_count(text)
  sent_count = get_sent_count(text)
  s = Textatistic(text)
  syllable_count = s.sybl_count
  #moallen=list(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
  #syllable_count = sum(moallen)
  return word_count, sent_count, syllable_count
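get_word_count and get_sent_count are defined elsewhere in that project; a minimal stand-in (an assumption on my part, not the original helpers) so the function above can run on its own:

from textatistic import Textatistic

def get_word_count(text):
    # stand-in: reuse Textatistic's own word count
    return Textatistic(text).word_count

def get_sent_count(text):
    # stand-in: reuse Textatistic's own sentence count
    return Textatistic(text).sent_count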
Code Example #3
def read_score(string):
    # Compute the readability scores
    readability_scores = Textatistic(string).scores
    # Extract the Flesch reading ease score
    try:
        flesch = readability_scores['flesch_score']
    except:
        flesch = 0
    return flesch
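For reference, a minimal sketch (my addition, assuming only that textatistic is installed) showing that the .scores dictionary exposes both metrics under separate keys, so the Flesch reading ease and the Gunning fog index are not interchangeable:

from textatistic import Textatistic

sample_text = "This is a short sample. It exists only to show the two score keys."
scores = Textatistic(sample_text).scores
# Higher Flesch score = easier to read; higher Gunning fog index = harder to read.
print(scores['flesch_score'], scores['gunningfog_score'])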
Code Example #4
File: app.py Project: davideceolin/qupid
def f2(q, url):
    try:
        #print("Start: %s" % time.ctime())
        vals = requests.get(url, timeout=4,
                            allow_redirects=False).elapsed.total_seconds()
        g = Goose()
        article = g.extract(url=url)
        text = article.cleaned_text
        blob = TextBlob(text)
        taal = blob.detect_language()
        if taal == ('en'):
            try:
                s = Textatistic(text)
                cols = {
                    'wordcount': [s.word_count],
                    'reponsetime': [vals],
                    'subjectivity': [blob.sentiment.subjectivity],
                    'polarity': [blob.sentiment.polarity],
                    'fleschscore': [s.flesch_score],
                    # 'kw': [ kw ] ,
                    'url': [str(url)]
                }
                dfa = pd.DataFrame.from_dict(cols)
                #print(dfa)
                #print("Start: %s" % time.ctime())
                q.put(dfa)
            except:
                cols = {
                    'wordcount': [str('err')],
                    'reponsetime': [str('err')],
                    'subjectivity': [str('err')],
                    'polarity': [str('err')],
                    'fleschscore': [str('err')],
                    # 'kw': [ kw ] ,
                    'url': [str(url)]
                }
                dfa = pd.DataFrame.from_dict(cols)
                # print(dfa)
                # print("Start: %s" % time.ctime())
                q.put(dfa)
    except:
        #s = Textatistic(text)
        cols = {
            'wordcount': [str('err')],
            'reponsetime': [str('err')],
            'subjectivity': [str('err')],
            'polarity': [str('err')],
            'fleschscore': [str('err')],
            # 'kw': [ kw ] ,
            'url': [str(url)]
        }
        dfa = pd.DataFrame.from_dict(cols)
        #print(dfa)
        #print("Start: %s" % time.ctime())
        q.put(dfa)
Code Example #5
def reading_scores(cleaned_text):
    flesh_reading_scores = []
    gunning_fog_scores = []
    for article in cleaned_text:
        # Compute the readability scores
        try:
            readability_scores = Textatistic(article).scores
            flesch = readability_scores['flesch_score']
            gunning_fog = readability_scores['gunningfog_score']
            flesh_reading_scores.append(flesch)
            gunning_fog_scores.append(gunning_fog)
        except:
            continue

    return flesh_reading_scores, gunning_fog_scores
Code Example #6
def create_f_k_dict():
    global current_directory
    file_name = create_f_k_name_var.get()
    path_name = create_f_k_path_var.get()
    file_path = path_name + "/" + file_name + ".pickle"
    if current_directory == "":
        msg.showwarning("Utility Warning", "No Gutenberg corpus specified.")
        return ""
    if file_name == "":
        msg.showwarning("Utility Warning", "Provide a filename.")
        return ""
    if path_name == "":
        msg.showwarning("Utility Warning",
                        "Provide a path for Flesch-Kincaid dictionary.")
        return ""
    output_dict = {}
    progress_max = get_count(current_directory)
    progress_bar["maximum"] = progress_max
    i = 0
    print(i)
    for root, dirs, files in os.walk(current_directory, topdown=False):
        for name in files:
            text_loc = os.path.join(root, name)
            if checkpath(name):
                try:
                    text0 = open(text_loc, "r").read()
                    text1 = strip_headers(open(text_loc, "r").read())
                    if detect(text1) == "en":
                        text2 = Textatistic(text1)
                        output_dict[text_loc] = text2.fleschkincaid_score
                        print(text_loc, output_dict[text_loc])
                        i += 1
                        progress_bar["value"] = i
                        progress_bar.update()
                except:
                    continue
    print(output_dict)
    with open(file_path, 'wb') as handle:
        pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    f_k_dict_file_name.config(state="normal")
    f_k_dict_file_name.delete(0, tk.END)
    f_k_dict_file_name.insert(tk.INSERT, file_path)
    f_k_dict_file_name.config(state="disabled")
    sleep(0.75)
    progress_bar["value"] = 0
Code Example #7
def f2(q, url):
    try:
        #print("Start: %s" % time.ctime())
        #vals = requests.get(url, timeout=4, allow_redirects=False).elapsed.total_seconds()
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
        #afb = len(article.images)
        blob = TextBlob(text)

        # taal = blob.detect_language()
        # if taal == ('en'):
        #     try:
        s = Textatistic(text)
        cols = {
            'words': [s.word_count],
            'pictures': [len(article.images)],
            'subjectivity': [blob.sentiment.subjectivity],
            'polarity': [blob.sentiment.polarity],
            'readable': [s.flesch_score],
            'text': [str(text)],
            # 'kw': [ kw ] ,
            'url': [str(url)]
        }
        dfa = pd.DataFrame.from_dict(cols)
        #print(dfa)
        #print("Start: %s" % time.ctime())
        q.put(dfa)
        # except:
    except:
        #s = Textatistic(text)
        cols = {
            'words': [str('err')],
            #                    'latency': [str('err')],
            'subjectivity': [str('err')],
            'polarity': [str('err')],
            'readable': [str('err')],
            # 'kw': [ kw ] ,
            'url': [str(url)]
        }
        dfa = pd.DataFrame.from_dict(cols)
        #print(dfa)
        #print("Start: %s" % time.ctime())
        q.put(dfa)
Code Example #8
def text_readability(text):
    """Creates a Textatistic Object that contains various readability scores. Then extracts 2 of those scores: 
        1)Flesch reading ease
            greater average sentence length - harder to read; 
            greater avg num of syllables harder to read;
            higher the score - greater the readability (easier to understand)
            
        2)Gunning fog index
            Also utilizes average sentence length
            Greater % of complex words - harder to read
            higher the score - lesser the readability (harder to understand) """

    try:
        readability_scores = Textatistic(text).scores
        flesch = readability_scores['flesch_score']
        gunningfog = readability_scores['gunningfog_score']
        return flesch, gunningfog
    except:
        return np.nan, np.nan
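A quick usage sketch for the function above (my addition; the sample string is made up, and Textatistic plus numpy as np are assumed to be imported, as the function already requires):

sample = ("The cat sat on the mat. "
          "Notwithstanding considerable meteorological uncertainty, the expedition proceeded regardless.")
flesch, gunningfog = text_readability(sample)
print("Flesch reading ease:", flesch)
print("Gunning fog index:", gunningfog)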
Code Example #9
def get_readability_features(text):
    """get FK easiness readability score from a text
    calculated according to https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    Args:
        text (str): text string of the transcript

    Returns:
        easiness (float): FK ease readability score for the text.

    """
    from textatistic import Textatistic

    try:
        text_score_obj = Textatistic(text)
        easiness = text_score_obj.flesch_score
    except ZeroDivisionError:
        easiness = 100.0

    return easiness
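A small sketch of the fallback behaviour described in the docstring (my addition): a transcript with no sentences makes Textatistic divide by zero, so the function should fall back to the default easiness of 100.0.

print(get_readability_features("A transcript with a few ordinary sentences. Nothing fancy here."))
print(get_readability_features(""))  # no sentences: ZeroDivisionError is caught, so this should print 100.0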
Code Example #10
def flesch_kincaidizer(gutenberg_path, pickle_dump_path):
    output_dict = {}
    i = 0  # progress counter (progress_bar is assumed to be defined at module level)
    for root, dirs, files in os.walk(gutenberg_path, topdown=False):
        for name in files:
            text_loc = os.path.join(root, name)
            if name[-4:] == ".txt" and name[-6:] != "-8.txt" and name[
                    -6:] != "-0.txt" and "old" not in text_loc:
                i += 1
                progress_bar["value"] = i
                progress_bar.update()
                try:
                    text0 = open(text_loc, "r").read()
                    text1 = strip_headers(open(text_loc, "r").read())
                    if detect(text1) == "en":
                        text2 = Textatistic(text1)
                        output_dict[text_loc] = text2.fleschkincaid_score
                        print(text_loc, output_dict[text_loc])
                except:
                    continue
    os.chdir(pickle_dump_path)
    with open('readability_dictionary.pickle', 'wb') as handle:
        pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sleep(0.75)
    progress_bar["value"] = 0
Code Example #11
File: app.py Project: robinkumarsharma/qupid2
def get_score():
    if request.method == 'POST':
        tag = request.form['query']
        url = tag
        g = Goose()
        article = g.extract(url=url)
        text = article.cleaned_text
        blob = TextBlob(text)
        s = Textatistic(text)
        vals = requests.get(url, timeout=4,
                            allow_redirects=False).elapsed.total_seconds()
        st = "/&callback=process&key=57bf606e01a24537ac906a86dc27891f94a0f587"
        # zz = urlopen ( url )
        quez = 'http://api.mywot.com/0.4/xpublic_link_json2?hosts=' + url + st
        stt = urllib.request.urlopen(quez).read()
        stt = str(stt)
        wot = re.findall(r'\d+', stt)
        ##z=[[conv(s) for s in line.split()] for line in wot]
        z = [conv(s) for s in wot]
        high = (z[1])
        low = (z[2])
        #print ( high , low )
        # WAYBACK
        zz = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
        zurlz = "https://web.archive.org/web/0/" + str(zz)
        r = requests.get(zurlz, allow_redirects=False)
        data = r.content
        years = re.findall(r'\d+', str(data))
        years = [conv(s) for s in years]
        years = (years[0])
        years = int(str(years)[:4])
        cols = {
            'yeararchive': [years],
            'lowwot': [low],
            'highwot': [high],
            'reponsetime': [vals],
            'wordcount': [s.word_count],
            'subjectivity': [blob.sentiment.subjectivity],
            'polarity': [blob.sentiment.polarity],
            'fleschscore': [s.flesch_score],
            #'kw': [ kw ] ,
            'url': [url]
        }
        dfeat = pd.DataFrame.from_dict(cols)
        #df.to_csv ( 'ft.csv' , index=False , sep=',' , encoding='utf-8' )
        del dfeat['url']
        #print (df)
        newX = dfeat.values
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        result = pickle_model.predict(newX)  #print (result)
        px2 = result.reshape((-1, 8))
        dfres = pd.DataFrame({
            'OverallQuality': px2[:, 0],
            'accuracy': px2[:, 1],
            'completeness': px2[:, 2],
            'neutrality': px2[:, 3],
            'relevance': px2[:, 4],
            'trustworthiness': px2[:, 5],
            'readability': px2[:, 6],
            'precision': px2[:, 7]
        })
        tp = str(keywords(text, words=2))
        # comm = re.compile ( r"https?://(www\.)?" )
        # new_url = comm.sub ( '' , url ).strip ( ).strip ( '/' )
        # print (new_url)
        twtext = list()
        polar = list()
        datum = list()
        for tweet in query_tweets(tp, 10):
            try:
                txt = tweet.text
                txt = re.sub(r"http\S+", "", txt)
                dat = tweet.timestamp
                tblob = TextBlob(txt)
                tpol = tblob.sentiment.polarity
                tal = tblob.detect_language()
                if tal == ('en'):
                    twtext.append(txt)
                    polar.append(tpol)
                    datum.append(dat)
                else:
                    pass
            except:
                pass

        df = pd.DataFrame({
            'tweet': twtext,
            'timestamp': datum,
            'polarity': polar
        })
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        oldest = df['timestamp'].min()
        newest = df['timestamp'].max()
        total = (oldest - newest).total_seconds()
        gem = total / len(df.index)
        #df.to_csv ( 'sentiment.csv' , index=False , sep=',' , encoding='utf-8' )
        tmean = df["polarity"].mean()
        tsd = df["polarity"].std()
        tkur = df["polarity"].kurtosis()
        #topics
        # compile sample documents into a list
        tokenizer = RegexpTokenizer(r'\w+')
        stop = set(stopwords.words('english'))
        p_stemmer = PorterStemmer()
        doc_set = twtext
        texts = []

        for i in doc_set:
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in stop]
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens)
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                                   num_topics=1,
                                                   id2word=dictionary,
                                                   minimum_phi_value=0.05)
        topic = ldamodel.print_topics(num_topics=1, num_words=1)
        ctweets = {
            'meansentiment': [tmean],
            'sdpolarity': [tsd],
            'kurtosispolarity': [tkur],
            'tweetrate': [gem],
            'tweetcount': [len(df.index)],
            'topic': [topic],
            'url': [url]
        }
        dftwit = pd.DataFrame.from_dict(ctweets)
        #entit
        my_sent = article.cleaned_text
        parse_tree = nltk.ne_chunk(nltk.tag.pos_tag(my_sent.split()),
                                   binary=True)  # POS tagging before chunking!
        named_entities = []
        for t in parse_tree.subtrees():
            if t.label() == 'NE':
                named_entities.append(t)
        z = named_entities
        my_count = pd.Series(z).value_counts()
        df = pd.DataFrame(my_count)
        df.columns = ['Count']
        df['entity'] = df.index
        za = df.assign(
            entity=[', '.join([x[0] for x in r]) for r in df.entity])
        df['entities'] = pd.DataFrame(za['entity'])
        del df['entity']
        var_input = article.cleaned_text
        var_input = re.sub(r'[\W\s\d]', ' ', var_input)
        input_tokenized = word_tokenize(var_input, "english")
        filtered_words = [
            word for word in input_tokenized
            if word not in stopwords.words('english')
        ]

        emotion_count = []

        for i in range(0, len(filtered_words)):
            with open('em.txt') as f:
                for line in f:
                    finaline = line.strip()
                    keym = re.search("'" + filtered_words[i] + r"':\s'",
                                     finaline)
                    if keym:
                        # print(keym)
                        valuem = re.findall(r":\s'.*", finaline)
                        newstr = str(valuem)
                        finalvalue = re.sub(r'[\W\s]', ' ', newstr)
                        emotion_count.append(finalvalue.strip())

        emo = most_common(emotion_count)
        # tp = str ( keywords ( var_input , words=2 ) )
        tijd = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        col2 = {
            'emotions': [emo],
            'topics': [tp],
            'title': [article.title],
            'published': [article.publish_date],
            'authors': [article.authors],
            'timestamp(gmtime)': [tijd],
            'url': [url]
        }
        df2 = pd.DataFrame.from_dict(col2)
    return render_template('tabs.html',
                           dataframe=dfeat.to_html(index=False),
                           res=dfres.to_html(index=False),
                           twit=dftwit.to_html(index=False),
                           ent=df.to_html(index=False),
                           des=df2.to_html(index=False))
Code Example #12
def compute_stats_for_pages_in_course(course_id):
    list_of_all_pages = []
    page_stats = []

    # Use the Canvas API to get the list of pages for this course
    #GET /api/v1/courses/:course_id/pages

    url = "{0}/courses/{1}/pages".format(baseUrl, course_id)
    if Verbose_Flag:
        print("url: " + url)

    r = requests.get(url, headers=header)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
    else:
        print("No pages for course_id: {}".format(course_id))
        return False

    for p_response in page_response:
        list_of_all_pages.append(p_response)

    # the following is needed when the reponse has been paginated
    # i.e., when the response is split into pieces - each returning only some of the list of modules
    while r.links.get('next', False):
        r = requests.get(r.links['next']['url'], headers=header)
        page_response = r.json()
        for p_response in page_response:
            list_of_all_pages.append(p_response)

    for p in list_of_all_pages:
        print("title is '{0}' with url {1}".format(p['title'], p['url']))
        # Use the Canvas API to GET the page
        #GET /api/v1/courses/:course_id/pages/:url

        url = "{0}/courses/{1}/pages/{2}".format(baseUrl, course_id, p["url"])
        if Verbose_Flag:
            print(url)
        payload = {}
        r = requests.get(url, headers=header, data=payload)
        if r.status_code == requests.codes.ok:
            page_response = r.json()
            if Verbose_Flag:
                print("body: {}".format(page_response["body"]))

            body = page_response["body"]
            if isinstance(body, str) and len(body) > 0:
                document = html.document_fromstring(body)
                raw_text = document.text_content()
            else:  # nothing to process
                continue

            if Verbose_Flag:
                print("raw_text: {}".format(raw_text))

        else:
            print("No pages for course_id: {}".format(course_id))
            return False

        # see http://www.erinhengel.com/software/textatistic/
        try:
            fixed_title = page_response["title"].replace(',', '_comma_')
            fixed_title = fixed_title.replace('"', '_doublequote_')
            fixed_title = fixed_title.replace("'", '_singlequote_')
            page_entry = {
                "url": url,
                "page_name": fixed_title,
                "Textatistic.statistics": Textatistic(raw_text).dict()
            }
        except ZeroDivisionError:
            # if there are zero sentences, then some of the scores cannot be computed
            if Verbose_Flag:
                print("no sentences in page {}".format(url))
            continue
        except ValueError:
            # if there is code on the page, for example a json structure, then the hyphenation package cannot handle this
            if Verbose_Flag:
                print("there is likely code on page {}".format(url))
            continue

        if page_entry:
            page_stats.append(page_entry)

    return page_stats
Code Example #13
"""

import os

import pandas as pd
from gutenberg.cleanup import strip_headers
from langdetect import detect
from textatistic import Textatistic, fleschkincaid_score

output_dict = {}

for root, dirs, files in os.walk("E://gutenberg", topdown=False):
    for name in files:
        text_loc = os.path.join(root, name)
        # skip non-text files, alternate "-8.txt" encodings, and files already scored
        if (name.endswith(".txt") and not name.endswith("-8.txt")
                and text_loc not in output_dict):
            try:
                text0 = open(text_loc, "r").read()
                text1 = strip_headers(open(text_loc, "r").read())
                if detect(text1) == "en":
                    text2 = Textatistic(text1)
                    output_dict[text_loc] = text2.fleschkincaid_score
                    print(text_loc, output_dict[text_loc])
            except:
                continue

output_dataframe = pd.DataFrame.from_dict(output_dict, orient="index")
output_dataframe.to_csv("output.csv")
Code Example #14
# imports assumed by this snippet
from datetime import datetime, timedelta

import textatistic
from textatistic import Abbreviations, Textatistic

abbr = Abbreviations(append=[['dog', 'cat'], ['mouse', 'elephant']],
                     modify=[['i.e.', 'XXX'], ['cf.', 'YYY']],
                     remove=[['U. N.', 'United Nations']])
abbr.list[0][1] == "XXX"
abbr.list[-1][0] == "mouse"
try:
    abbr.list.index(['U. N.', 'United Nations'])
    print("Found U.N.")
except ValueError:
    pass

text_sample = 'There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer—than-crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever (who would have thought?!)—The Jacksonian Five ate a cake. We also ate a cake (and that suprised me!!). Here is my co-author. This is a decimal 0.835.'

iterate = 1000
suma = 0
for i in range(iterate):
    start = datetime.now()
    Textatistic(text_sample)
    end = datetime.now()
    delta = end - start
    suma += timedelta.total_seconds(delta)

print(str(iterate) + " Textatistic iterations")
print(str(round(suma, 4)) + " seconds\n\n")

print("punct_clean text")
print(textatistic.punct_clean(text_sample) + "\n\n")

print("word_array list")
print(textatistic.word_array(text_sample))
Code Example #15
plt.show()



#
#Readability of 'The Myth of Sisyphus'
#
#In this exercise, you will compute the Flesch reading ease score for Albert Camus' famous essay The Myth of Sisyphus. We will then interpret the value of this score as explained in the video and try to determine the reading level of the essay.
#
#The entire essay is in the form of a string and is available as sisyphus_essay.

# Import Textatistic
from textatistic import Textatistic

# Compute the readability scores 
readability_scores = Textatistic(sisyphus_essay).scores

# Print the flesch reading ease score
flesch = readability_scores['flesch_score']
print("The Flesch Reading Ease is %.2f" % (flesch))



#Readability of various publications
#
#In this exercise, you have been given excerpts of articles from four publications. Your task is to compute the readability of these excerpts using the Gunning fog index and consequently, determine the relative difficulty of reading these publications.
#
#The excerpts are available as the following strings:
#
#    forbes- An excerpt from an article from Forbes magazine on the Chinese social credit score system.
#    harvard_law- An excerpt from a book review published in Harvard Law Review.
Code Example #16
def get_measures(snippets):
    """Given a list of snippets for a conversation, return a dictionary of metrics."""
    duration_sum = 0.0
    word_length_sum = 0
    last_end_time = 0.0
    inter_speaker_silence = 0.0
    num_speaker_transitions = 0
    num_interruptions = 0
    word_count = 0

    all_content = []
    all_words = []
    speaker_to_duration_sum_map = {}  # speaker_id -> seconds

    # Split data into speaker turns and accumulate stats for each turn
    for speaker_turn_snippets in generate_speaker_turns(snippets):
        speaker_duration = sum(x["audio_end_offset"] - x["audio_start_offset"] for x in speaker_turn_snippets)
        speaker_id = speaker_turn_snippets[0]["speaker_id"]
        speaker_to_duration_sum_map[speaker_id] = (
            speaker_to_duration_sum_map.get(speaker_id, 0.0) + speaker_duration
        )
        speaker_content, is_crosstalk = snippets_to_content_string(speaker_turn_snippets)
        words = speaker_content.split()
        all_words += words
        duration_sum += speaker_duration
        word_count += len(words)
        word_length_sum += len(speaker_content) - len(words) + 1

        inter_speaker_gap = speaker_turn_snippets[0]["audio_start_offset"] - last_end_time
        if not last_end_time or inter_speaker_gap < 0 or inter_speaker_gap > 20:
            # Beginning of clip or very long pause.  Ignore as transition
            pass
        else:
            inter_speaker_silence += inter_speaker_gap
            num_speaker_transitions += 1
            if inter_speaker_gap < 0.0001:
                num_interruptions += 1
            elif is_crosstalk:
                num_interruptions += 1
        last_end_time = speaker_turn_snippets[-1]["audio_end_offset"]
        all_content.append(speaker_content)

    if word_count:
        grade_level = Textatistic(" ".join(all_content)).fleschkincaid_score
    else:
        grade_level = 0.0
    mattr_score = mattr_metric(all_words)

    x = {
        "duration_sum": duration_sum,
        "num_snippets": len(snippets),
        "num_words": word_count,
        "num_interruptions": num_interruptions,
        "word_length_sum": word_length_sum,
        "grade_level": grade_level,
        "turn_taking_balance": turn_taking_balance_metric(speaker_to_duration_sum_map),
        "num_speakers": len(speaker_to_duration_sum_map),
        "inter_speaker_silence": inter_speaker_silence,
        "speaker_transitions": num_speaker_transitions
    }
    if mattr_score:
        x["mattr_score"] = mattr_score

    return x
Code Example #17
def augment_entries(course_id, moduleItems, module_name, module_position,
                    options):
    newModuleitems = []
    for mi in moduleItems:
        mn = {'module_name': module_name, 'module_position': module_position}
        mn.update(mi)
        mi = mn
        publishedP = mi['published']
        if not options.unpublished and not publishedP:  # If not published do not process it further, but add it to the list to return
            newModuleitems.append(mi)
            continue
        mi_type = mi['type']
        if mi_type == 'Page':
            url = mi['url']
            if Verbose_Flag:
                print(url)
            page_entry = None
            payload = {}
            r = requests.get(url, headers=header, data=payload)
            if r.status_code == requests.codes.ok:
                page_response = r.json()
                if Verbose_Flag:
                    print("body: {}".format(page_response["body"]))
                body = page_response.get("body") if page_response else None
                if body and isinstance(body, str) and len(body) > 0:
                    document = html.document_fromstring(body)

                    elements_to_remove = ['img', 'code', 'pre']
                    for el in elements_to_remove:
                        el_path = "//{}".format(el)
                        for bad in document.xpath(el_path):
                            bad.getparent().remove(bad)

                    #  remove anything in one of the following languages
                    languages_to_remove = [
                        'sv', 'sv-SE', 'fr', 'fr-FR', 'de', 'de-DE', 'nb-NO',
                        'nn-NO', 'da-DK', 'zh-Hans', 'es', 'es-ES', 'nl',
                        'nl-NL', 'it', 'it-IT', 'X-NONE', 'x-western'
                    ]
                    for l in languages_to_remove:
                        lang_path = "//*[@lang=\'{0}\']".format(l)
                        for bad in document.xpath(lang_path):
                            bad.getparent().remove(bad)

                    expected_languages = ['en', 'en-US', 'en-GB', 'en-UK']
                    for el in document.xpath('//*[@lang]'):
                        lang = el.get('lang')
                        if lang not in expected_languages:
                            print("Unexpected language={0}, url={1}".format(
                                lang, url))

                    raw_text = document.text_content()
                    if Verbose_Flag:
                        print("raw_text: {}".format(raw_text))

                    if len(raw_text) > 0:
                        # see http://www.erinhengel.com/software/textatistic/
                        try:
                            page_entry = Textatistic(raw_text).dict()
                        except ZeroDivisionError:
                            # if there are zero sentences, then some of the scores cannot be computed
                            if Verbose_Flag:
                                print("no sentences in page {0}, raw_text={1}".
                                      format(url, raw_text))
                            page_entry = {
                                'text_stats_note': 'no sentences on page'
                            }
                        except ValueError:
                            # if there is code on the page, for example a json structure, then the hyphenation package cannot handle this
                            if Verbose_Flag:
                                print(
                                    "there is likely code on page {0}, raw_text={1}"
                                    .format(url, raw_text))
                            page_entry = {
                                'text_stats_note':
                                'likely there is code on the page'
                            }
                    else:
                        page_entry = {
                            'text_stats_note':
                            'no text left after filtering on the page'
                        }

            # augment the module item if there were statistics
            if page_entry:
                mi.update(page_entry)
            else:
                page_entry = {
                    'text_stats_note':
                    'No results for Textatistic on this page'
                }
                mi.update(page_entry)

        # add module item to list to return all module items
        newModuleitems.append(mi)

    return newModuleitems
Code Example #18
"""
import requests
from textatistic import Textatistic
from bs4 import BeautifulSoup

# scrape the Internet for news articles
news_fox = requests.get('https://www.foxnews.com/world/explosion-lebanon-capital-beirut')
news_thesun = requests.get('https://timesofindia.indiatimes.com/world/middle-east/massive-beirut-blast-kills-more-than-70-injures-thousands/articleshow/77360097.cms')
news_aljazeera = requests.get('https://www.aljazeera.com/news/2020/8/4/dozens-killed-as-huge-explosion-rips-through-lebanons-beirut')

# create BeautifulSoup objects for the news articles
soup_fox = BeautifulSoup(news_fox.content, 'html.parser')
soup_thesun = BeautifulSoup(news_thesun.content, 'html.parser')
soup_aljazeera = BeautifulSoup(news_aljazeera.content, 'html.parser')

# get the text from the html pages in Beautiful Soup
text_fox = soup_fox.get_text(separator=' ', strip=True)
text_thesun = soup_thesun.get_text(separator=' ', strip=True)
text_aljazeera = soup_aljazeera.get_text(separator=' ', strip=True)

print(text_thesun)

# Compute Textatistic statistics for each article
readability_fox = Textatistic(text_fox)
readability_thesun = Textatistic(text_thesun)
readability_aljazeera = Textatistic(text_aljazeera)

# notdalechall_count is the number of words not on the Dale-Chall familiar-word list
# (a proxy for difficult vocabulary), not a composite readability score.
print(f'Words not on the Dale-Chall list for Fox News: {readability_fox.notdalechall_count}')
print(f'Words not on the Dale-Chall list for The Sun News: {readability_thesun.notdalechall_count}')
print(f'Words not on the Dale-Chall list for Aljazeera News: {readability_aljazeera.notdalechall_count}')
Code Example #19
# The excerpts are available as the following strings:

# forbes- An excerpt from an article from Forbes magazine on the Chinese social credit score system.
# harvard_law- An excerpt from a book review published in Harvard Law Review.
# r_digest- An excerpt from a Reader's Digest article on flight turbulence.
# time_kids - An excerpt from an article on the ill effects of salt consumption published in TIME for Kids.
# Instructions
# 100 XP
# Import the Textatistic class from textatistic.
# Compute the readability_scores dictionary for each excerpt using Textatistic.
# Select the Gunning fog index from the readability_scores dictionary for each excerpt and append it to gunning_fog_scores.
# Print the list of Gunning fog indices.


# Import Textatistic
from textatistic import Textatistic

# List of excerpts
excerpts = [forbes, harvard_law, r_digest, time_kids]

# Loop through excerpts and compute gunning fog index
gunning_fog_scores = []
for excerpt in excerpts:
  readability_scores = Textatistic(excerpt).scores
  gunning_fog = readability_scores['gunningfog_score']
  gunning_fog_scores.append(gunning_fog)

# Print the gunning fog indices
print(gunning_fog_scores)
Code Example #20
def compute_stats_for_pages_in_course(course_id):
    list_of_all_pages = []
    page_stats = []

    # Use the Canvas API to get the list of pages for this course
    #GET /api/v1/courses/:course_id/pages

    url = baseUrl + '%s/pages' % (course_id)
    if Verbose_Flag:
        print("url: " + url)

    r = requests.get(url, headers=header)
    if Verbose_Flag:
        write_to_log("result of getting pages: " + r.text)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
    else:
        print("No pages for course_id: {}".format(course_id))
        return False

    for p_response in page_response:
        list_of_all_pages.append(p_response)

    # the following is needed when the reponse has been paginated
    # i.e., when the response is split into pieces - each returning only some of the list of modules
    # see "Handling Pagination" - Discussion created by [email protected] on Apr 27, 2015, https://community.canvaslms.com/thread/1500
    while r.links.get('next', False):
        r = requests.get(r.links['next']['url'], headers=header)
        page_response = r.json()
        for p_response in page_response:
            list_of_all_pages.append(p_response)

    for p in list_of_all_pages:
        print("{}".format(p["title"]))
        # Use the Canvas API to GET the page
        #GET /api/v1/courses/:course_id/pages/:url

        url = baseUrl + '%s/pages/%s' % (course_id, p["url"])
        if Verbose_Flag:
            print(url)
        payload = {}
        r = requests.get(url, headers=header, data=payload)
        if r.status_code == requests.codes.ok:
            page_response = r.json()
            if Verbose_Flag:
                print("body: {}".format(page_response["body"]))

            document = html.document_fromstring(page_response["body"])
            raw_text = document.text_content()
            if Verbose_Flag:
                print("raw_text: {}".format(raw_text))
        else:
            print("No pages for course_id: {}".format(course_id))
            return False

        # see http://www.erinhengel.com/software/textatistic/
        try:
            fixed_title = page_response["title"].replace(',', '_comma_')
            fixed_title = fixed_title.replace('"', '_doublequote_')
            fixed_title = fixed_title.replace("'", '_singlequote_')
            page_entry = {
                "url": url,
                "page_name": fixed_title,
                "Textatistic.statistics": Textatistic(raw_text).dict()
            }
        except ZeroDivisionError:
            # if there are zero sentences, then some of the scores cannot be computed
            if Verbose_Flag:
                print("no sentences in page {}".format(url))
            continue
        except ValueError:
            # if there is code on the page, for example a json structure, then the hyphenation package cannot handle this
            if Verbose_Flag:
                print("there is likely code on page {}".format(url))
            continue

        if page_entry:
            page_stats.append(page_entry)

    return page_stats
Code Example #21
        # Set delimiter for making a list of sentence.
        auto_abstractor.delimiter_list = ["。", "\n"]
        # Object of abstracting and filtering document.
        abstractable_doc = TopNRankAbstractor()
        # Summarize document.
        result_dict = auto_abstractor.summarize(document, abstractable_doc)
        # Output result.
        print("==========================")
        print("Summary of the text")
        print("----")
        for sentence in result_dict["summarize_result"]:
            print(sentence)
    print(" ")
    print("==========================")        

    s = Textatistic(document)
    print(s.counts)
    print(s.sent_count)
    levsco = s.flesch_score
    levsco = abs(levsco)
    if levsco > 100:
        levsco = levsco / 10
        print("your ease of readability:   ", round(levsco))
        if levsco >= 90:
            print("It sounds like 5th grade writing")
        elif levsco >= 80:
            print("It sounds like 6th grade writing")
        elif levsco >= 70:
            print("It sounds like 7th grade writing")
        elif levsco >= 60:
            print("It sounds like 8th or 9th grade writing")
Code Example #22
File: textStats.py Project: Prashant-SK/nlp
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from textatistic import Textatistic
import json
import string
import re

data = "BREAKING: All work and NO play makes JaCK dull boy. All work and no play makes jack a dull boy!?"
s = Textatistic(data)
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)

wordsFiltered = []
stopWordsInText = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
    else:
        stopWordsInText.append(w)

percentStopwords = (len(stopWordsInText) / len(wordsFiltered)) * 100

text = word_tokenize(data)
tagged = nltk.pos_tag(text)

counts = Counter(tag for word, tag in tagged)
nounCount = counts['NN']
verbCounts = Counter(tag for word, tag in tagged if tag == 'VBP' or tag == 'VB')
Code Example #23
File: 11_04.py Project: gvc2000/curso
# Section 11.4 snippets
# NOTE: This section's self check snippets are included in this file 
# because the interactive session continues into the self check.

# Calculating Statistics and Readability Scores
from pathlib import Path

text = Path('RomeoAndJuliet.txt').read_text()

from textatistic import Textatistic

readability = Textatistic(text)

%precision 3

readability.dict()





##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their     #
# best efforts in preparing the book. These efforts include the          #
# development, research, and testing of the theories and programs        #
# to determine their effectiveness. The authors and publisher make       #
# no warranty of any kind, expressed or implied, with regard to these    #
# programs or to the documentation contained in these books. The authors #
Code Example #24
def update_page_info_module(course_id, page_name):
    # Use the Canvas API to GET the page
    #GET /api/v1/courses/:course_id/pages/:url

    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {}
    r = requests.get(url, headers=header, data=payload)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
        if Verbose_Flag:
            print("body: {}".format(page_response["body"]))

        document = html.document_fromstring(page_response["body"])
        raw_text = document.text_content()
        print("raw_text: {}".format(raw_text))

        title = page_response["title"]
    else:
        print("No page {}".format(page_name))
        return False

    # transform page

    GQMContent = document.xpath('//p[@class="GQMContent"]')
    if len(GQMContent) > 0:
        text_of_GQMContent = GQMContent[0].text
        print("Existing information as text is {}".format(text_of_GQMContent))

        information_for_on_page = json.loads(text_of_GQMContent)
        print("Existing information is {}".format(information_for_on_page))

        document2 = deepcopy(document)
        # trim off GQMContent paragraph before processing the raw_text
        for elem in document2.xpath('//p[@class="GQMContent"]'):
            elem.getparent().remove(elem)

        raw_text = document2.text_content()
        print("raw_text: {}".format(raw_text))

    information_for_on_page["Words"] = len(raw_text.split())
    information_for_on_page["Characters"] = len(raw_text)
    # see http://www.erinhengel.com/software/textatistic/
    information_for_on_page["Textatistic.counts"] = Textatistic(
        raw_text).counts
    information_for_on_page["Textatistic.statistics"] = Textatistic(
        raw_text).dict()

    if len(GQMContent) == 0:
        #no GQMContent found on this page so add some
        print("No GQMContent found - adding some")
        body = document.find('.//body')
        if body is None:
            print("page has no <body>")
        else:
            GQMContent_string = '<p class="GQMContent">' + json.dumps(
                information_for_on_page) + "</p>"
            body.append(html.etree.XML(GQMContent_string))
            print("initial updated document {}".format(html.tostring(document)))
    else:
        GQMContent[0].text = json.dumps(information_for_on_page)
        print("updated document {}".format(html.tostring(document)))

    # Use the Canvas API to insert the page
    #PUT /api/v1/courses/:course_id/pages/:uid
    #    wiki_page[title]
    #    wiki_page[published]
    #    wiki_page[body]

    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {
        'wiki_page[title]':
        title,
        'wiki_page[published]':
        False,
        'wiki_page[body]':
        str(html.tostring(document, pretty_print=True, method="html"), 'utf-8')
    }
    r = requests.put(url, headers=header, data=payload)
    write_to_log(r.text)
    print("status code {}".format(r.status_code))
    if r.status_code == requests.codes.ok:
        return True
    else:
        print("Unable to update page {}".format(page_name))
        return False
Code Example #25
    text = x[i]
    wordcount.append(word_count(text))
    sentencecount = sentence_count(text)
    feature_set[i].append(sentencecount)

    avg_syl = avg_syllables_per_word(text)
    feature_set[i].append(avg_syl)

    avg_sen_len = avg_sentence_length(text, wordcount[i], feature_set[i][0])
    feature_set[i].append(avg_sen_len)

    flesch = flesch_kincaid(text, feature_set[i][2], feature_set[i][1])
    feature_set[i].append(flesch)
    #text=unicodedata.normalize('NFKD', text).encode('ascii','ignore')
    try:
        s = Textatistic(text)
        gf = s.gunningfog_score
        feature_set[i].append(gf)
    except ZeroDivisionError:
        feature_set[i].append(-1)
    num_char_w = len(text)
    feature_set[i].append(num_char_w)

    num_char = 0
    for j in range(0, len(text)):
        if text[j] != ' ':
            num_char = num_char + 1
    feature_set[i].append(num_char)

tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()
Code Example #26
File: nlp_tools.py Project: stbamb/DataMining
def getBookResults(book_data):
    book = Textatistic(book_data[2])
    fres_score = book.flesch_score
    school_level = determineSchoolLevel(fres_score)
    return "Title: {}\nAuthor: {}\nFlesch reading-ease score: {}\nSchool level: {}". \
        format(book_data[0], book_data[1], fres_score, school_level)
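determineSchoolLevel is defined elsewhere in that project; here is a plausible sketch based on the conventional Flesch reading-ease bands (my assumption, not the project's actual mapping):

def determineSchoolLevel(fres_score):
    # Hypothetical mapping using the standard Flesch reading-ease bands.
    if fres_score >= 90:
        return "5th grade"
    elif fres_score >= 80:
        return "6th grade"
    elif fres_score >= 70:
        return "7th grade"
    elif fres_score >= 60:
        return "8th-9th grade"
    elif fres_score >= 50:
        return "10th-12th grade"
    elif fres_score >= 30:
        return "College"
    else:
        return "College graduate"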
Code Example #27
    # Don't forget to show the final image

    plt.show()
    print('---------------------------------')

"""## Reading Scores (Flesh and Gunning)"""

# Import Textatistic

flesh_reading_scores = []
gunning_fog_scores = []
 
for article in data["cleaned_body_text"]:
    # Compute the readability scores
    try:
        readability_scores = Textatistic(article).scores
        flesch = readability_scores['flesch_score']
        gunning_fog = readability_scores['gunningfog_score']
    except:
        print('Error has occurred')
        continue
    
    flesh_reading_scores.append(flesch)
    gunning_fog_scores.append(gunning_fog)

data["flesh_reading_scores"] = pd.Series(flesh_reading_scores)
 

data["gunning_fog_scores"] = pd.Series(gunning_fog_scores)