def extractfeaturevalueslime(title, body, tags):
    tagnum = len(tags)
    titleuppers = len(re.findall(r'[A-Z]', title))
    titlelength = len(title)
    titleqmarks = len(re.findall(r'\?', title))
    snippetslist = body.split("code>")[1::2]
    cleansnippets = [code.replace("</", "") for code in snippetslist]
    nsnippets = len(cleansnippets)
    bodychunks = body.split("code>")[0::2]
    cleanbodychunks = [
        re.sub('(<[^>]+>)|(\\n)|(\\r)|(<)', '', chunk) for chunk in bodychunks
    ]
    conbodylength = len(" ".join(cleanbodychunks))
    # get readability score for body
    try:
        clean = punct_clean(",".join(cleanbodychunks) + ".")
        read = Textatistic(clean).flesch_score
    except:
        read = -1000
    snippetlength = len("".join(cleansnippets))
    today = time.strftime("%A")
    creationday = dummerdict[today]
    popcount = np.sum([t in top20 for t in tags])
    bodyqmarks = len(re.findall(r'\?', (",".join(cleanbodychunks))))
    values = np.array([
        tagnum, titleuppers, titlelength, titleqmarks, nsnippets,
        conbodylength, read, snippetlength, creationday, popcount, bodyqmarks
    ]).astype(float)
    return values
def text_statistics(text):
    word_count = get_word_count(text)
    sent_count = get_sent_count(text)
    s = Textatistic(text)
    syllable_count = s.sybl_count
    # moallen = list(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
    # syllable_count = sum(moallen)
    return word_count, sent_count, syllable_count
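# get_word_count and get_sent_count are not defined in this snippet; a minimal sketch,
# assuming they simply wrap the corresponding Textatistic counts:
from textatistic import Textatistic

def get_word_count(text):
    # Word count as reported by Textatistic (assumed implementation).
    return Textatistic(text).word_count

def get_sent_count(text):
    # Sentence count as reported by Textatistic (assumed implementation).
    return Textatistic(text).sent_count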
def read_score(string):
    # Compute the readability scores
    readability_scores = Textatistic(string).scores
    # Look up the Gunning fog score (note: the variable is named flesch,
    # but the value retrieved is 'gunningfog_score')
    try:
        flesch = readability_scores['gunningfog_score']
    except KeyError:
        flesch = 0
    return flesch
def f2(q, url): try: #print("Start: %s" % time.ctime()) vals = requests.get(url, timeout=4, allow_redirects=False).elapsed.total_seconds() g = Goose() article = g.extract(url=url) text = article.cleaned_text blob = TextBlob(text) taal = blob.detect_language() if taal == ('en'): try: s = Textatistic(text) cols = { 'wordcount': [s.word_count], 'reponsetime': [vals], 'subjectivity': [blob.sentiment.subjectivity], 'polarity': [blob.sentiment.polarity], 'fleschscore': [s.flesch_score], # 'kw': [ kw ] , 'url': [str(url)] } dfa = pd.DataFrame.from_dict(cols) #print(dfa) #print("Start: %s" % time.ctime()) q.put(dfa) except: cols = { 'wordcount': [str('err')], 'reponsetime': [str('err')], 'subjectivity': [str('err')], 'polarity': [str('err')], 'fleschscore': [str('err')], # 'kw': [ kw ] , 'url': [str(url)] } dfa = pd.DataFrame.from_dict(cols) # print(dfa) # print("Start: %s" % time.ctime()) q.put(dfa) except: #s = Textatistic(text) cols = { 'wordcount': [str('err')], 'reponsetime': [str('err')], 'subjectivity': [str('err')], 'polarity': [str('err')], 'fleschscore': [str('err')], # 'kw': [ kw ] , 'url': [str(url)] } dfa = pd.DataFrame.from_dict(cols) #print(dfa) #print("Start: %s" % time.ctime()) q.put(dfa)
def reading_scores(cleaned_text):
    # accumulate per-article scores (initialized locally rather than relying on globals)
    flesh_reading_scores = []
    gunning_fog_scores = []
    for article in cleaned_text:
        # Compute the readability scores
        try:
            readability_scores = Textatistic(article).scores
            flesch = readability_scores['flesch_score']
            gunning_fog = readability_scores['gunningfog_score']
            flesh_reading_scores.append(flesch)
            gunning_fog_scores.append(gunning_fog)
        except:
            continue
    return flesh_reading_scores, gunning_fog_scores
def create_f_k_dict():
    global current_directory
    file_name = create_f_k_name_var.get()
    path_name = create_f_k_path_var.get()
    file_path = path_name + "/" + file_name + ".pickle"
    if current_directory == "":
        msg.showwarning("Utility Warning", "No Gutenberg corpus specified.")
        return ""
    if file_name == "":
        msg.showwarning("Utility Warning", "Provide a filename.")
        return ""
    if path_name == "":
        msg.showwarning("Utility Warning", "Provide a path for Flesch-Kincaid dictionary.")
        return ""
    output_dict = {}
    progress_max = get_count(current_directory)
    progress_bar["maximum"] = progress_max
    i = 0
    print(i)
    for root, dirs, files in os.walk(current_directory, topdown=False):
        for name in files:
            text_loc = os.path.join(root, name)
            if checkpath(name):
                try:
                    text0 = open(text_loc, "r").read()
                    text1 = strip_headers(open(text_loc, "r").read())
                    if detect(text1) == "en":
                        text2 = Textatistic(text1)
                        output_dict[text_loc] = text2.fleschkincaid_score
                        print(text_loc, output_dict[text_loc])
                    i += 1
                    progress_bar["value"] = i
                    progress_bar.update()
                except:
                    continue
    print(output_dict)
    with open(file_path, 'wb') as handle:
        pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    f_k_dict_file_name.config(state="normal")
    f_k_dict_file_name.delete(0, tk.END)
    f_k_dict_file_name.insert(tk.INSERT, file_path)
    f_k_dict_file_name.config(state="disabled")
    sleep(0.75)
    progress_bar["value"] = 0
def f2(q, url): try: #print("Start: %s" % time.ctime()) #vals = requests.get(url, timeout=4, allow_redirects=False).elapsed.total_seconds() article = Article(url) article.download() article.parse() text = article.text #afb = len(article.images) blob = TextBlob(text) # taal = blob.detect_language() # if taal == ('en'): # try: s = Textatistic(text) cols = { 'words': [s.word_count], 'pictures': [len(article.images)], 'subjectivity': [blob.sentiment.subjectivity], 'polarity': [blob.sentiment.polarity], 'readable': [s.flesch_score], 'text': [str(text)], # 'kw': [ kw ] , 'url': [str(url)] } dfa = pd.DataFrame.from_dict(cols) #print(dfa) #print("Start: %s" % time.ctime()) q.put(dfa) # except: except: #s = Textatistic(text) cols = { 'words': [str('err')], # 'latency': [str('err')], 'subjectivity': [str('err')], 'polarity': [str('err')], 'readable': [str('err')], # 'kw': [ kw ] , 'url': [str(url)] } dfa = pd.DataFrame.from_dict(cols) #print(dfa) #print("Start: %s" % time.ctime()) q.put(dfa)
def text_readability(text):
    """Creates a Textatistic object that contains various readability scores,
    then extracts two of those scores:

    1) Flesch reading ease
       Greater average sentence length - harder to read;
       greater average number of syllables per word - harder to read;
       higher score - greater readability (easier to understand).

    2) Gunning fog index
       Also uses average sentence length;
       greater percentage of complex words - harder to read;
       higher score - lower readability (harder to understand).
    """
    try:
        readability_scores = Textatistic(text).scores
        flesch = readability_scores['flesch_score']
        gunningfog = readability_scores['gunningfog_score']
        return flesch, gunningfog
    except:
        return np.nan, np.nan
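# A minimal usage sketch for text_readability; the sample string and printout are
# illustrative additions, not part of the original snippet (the function itself needs
# numpy as np and Textatistic in scope):
import numpy as np
from textatistic import Textatistic

sample = ("The quick brown fox jumps over the lazy dog. "
          "Readability formulas reward short sentences and short words.")
flesch, gunningfog = text_readability(sample)
print("Flesch: {:.1f}, Gunning fog: {:.1f}".format(flesch, gunningfog))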
def get_readability_features(text):
    """Get the Flesch reading-ease score for a text, calculated according to
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    Args:
        text (str): text string of the transcript

    Returns:
        easiness (float): Flesch reading-ease score for the text.
    """
    from textatistic import Textatistic
    try:
        text_score_obj = Textatistic(text)
        easiness = text_score_obj.flesch_score
    except ZeroDivisionError:
        easiness = 100.0
    return easiness
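# For reference, the Flesch reading-ease score returned by flesch_score is
# 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words). A sketch recomputing it
# directly from Textatistic's own counts (illustrative; it should closely match the
# library's attribute):
from textatistic import Textatistic

def flesch_from_counts(text):
    # Recompute Flesch reading ease from raw counts.
    t = Textatistic(text)
    words_per_sentence = t.word_count / t.sent_count
    syllables_per_word = t.sybl_count / t.word_count
    return 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word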
def flesch_kincaidizer(gutenberg_path, pickle_dump_path):
    # progress_bar is expected to be a module-level tkinter widget
    output_dict = {}
    i = 0
    for root, dirs, files in os.walk(gutenberg_path, topdown=False):
        for name in files:
            text_loc = os.path.join(root, name)
            if (name[-4:] == ".txt" and name[-6:] != "-8.txt"
                    and name[-6:] != "-0.txt" and "old" not in text_loc):
                i += 1
                progress_bar["value"] = i
                progress_bar.update()
                try:
                    text0 = open(text_loc, "r").read()
                    text1 = strip_headers(open(text_loc, "r").read())
                    if detect(text1) == "en":
                        text2 = Textatistic(text1)
                        output_dict[text_loc] = text2.fleschkincaid_score
                        print(text_loc, output_dict[text_loc])
                except:
                    continue
    os.chdir(pickle_dump_path)
    with open('readability_dictionary.pickle', 'wb') as handle:
        pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sleep(0.75)
    progress_bar["value"] = 0
def get_score():
    if request.method == 'POST':
        tag = request.form['query']
        url = tag
        g = Goose()
        article = g.extract(url=url)
        text = article.cleaned_text
        blob = TextBlob(text)
        s = Textatistic(text)
        vals = requests.get(url, timeout=4, allow_redirects=False).elapsed.total_seconds()

        # Web of Trust (WOT) reputation lookup
        st = "/&callback=process&key=57bf606e01a24537ac906a86dc27891f94a0f587"
        # zz = urlopen(url)
        quez = 'http://api.mywot.com/0.4/xpublic_link_json2?hosts=' + url + st
        stt = urllib.request.urlopen(quez).read()
        stt = str(stt)
        wot = re.findall('\d+', stt)
        # z = [[conv(s) for s in line.split()] for line in wot]
        z = [conv(s) for s in wot]
        high = z[1]
        low = z[2]
        # print(high, low)

        # Wayback Machine: year of first archived snapshot
        zz = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
        zurlz = "https://web.archive.org/web/0/" + str(zz)
        r = requests.get(zurlz, allow_redirects=False)
        data = r.content
        years = re.findall('\d+', str(data))
        years = [conv(s) for s in years]
        years = years[0]
        years = int(str(years)[:4])

        cols = {
            'yeararchive': [years],
            'lowwot': [low],
            'highwot': [high],
            'reponsetime': [vals],
            'wordcount': [s.word_count],
            'subjectivity': [blob.sentiment.subjectivity],
            'polarity': [blob.sentiment.polarity],
            'fleschscore': [s.flesch_score],
            # 'kw': [kw],
            'url': [url]
        }
        dfeat = pd.DataFrame.from_dict(cols)
        # dfeat.to_csv('ft.csv', index=False, sep=',', encoding='utf-8')
        del dfeat['url']
        # print(dfeat)

        # Predict quality scores with the pickled model
        newX = dfeat.values
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        result = pickle_model.predict(newX)
        # print(result)
        px2 = result.reshape((-1, 8))
        dfres = pd.DataFrame({
            'OverallQuality': px2[:, 0],
            'accuracy': px2[:, 1],
            'completeness': px2[:, 2],
            'neutrality': px2[:, 3],
            'relevance': px2[:, 4],
            'trustworthiness': px2[:, 5],
            'readability': px2[:, 6],
            'precision': px2[:, 7]
        })

        tp = str(keywords(text, words=2))
        # comm = re.compile(r"https?://(www\.)?")
        # new_url = comm.sub('', url).strip().strip('/')
        # print(new_url)

        # Collect tweets about the extracted keywords
        twtext = list()
        polar = list()
        datum = list()
        for tweet in query_tweets(tp, 10):
            try:
                txt = tweet.text
                txt = re.sub(r"http\S+", "", txt)
                dat = tweet.timestamp
                tblob = TextBlob(txt)
                tpol = tblob.sentiment.polarity
                tal = tblob.detect_language()
                if tal == 'en':
                    twtext.append(txt)
                    polar.append(tpol)
                    datum.append(dat)
                else:
                    pass
            except:
                pass
        df = pd.DataFrame({
            'tweet': twtext,
            'timestamp': datum,
            'polarity': polar
        })
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        oldest = df['timestamp'].min()
        newest = df['timestamp'].max()
        total = (oldest - newest).total_seconds()
        gem = total / len(df.index)
        # df.to_csv('sentiment.csv', index=False, sep=',', encoding='utf-8')
        tmean = df["polarity"].mean()
        tsd = df["polarity"].std()
        tkur = df["polarity"].kurtosis()

        # topics: compile sample documents into a list
        tokenizer = RegexpTokenizer(r'\w+')
        stop = set(stopwords.words('english'))
        p_stemmer = PorterStemmer()
        doc_set = twtext
        texts = []
        for i in doc_set:
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in stop]
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens)
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                                   num_topics=1,
                                                   id2word=dictionary,
                                                   minimum_phi_value=0.05)
        topic = ldamodel.print_topics(num_topics=1, num_words=1)
        ctweets = {
            'meansentiment': [tmean],
            'sdpolarity': [tsd],
            'kurtosispolarity': [tkur],
            'tweetrate': [gem],
            'tweetcount': [len(df.index)],
            'topic': [topic],
            'url': [url]
        }
        dftwit = pd.DataFrame.from_dict(ctweets)

        # named entities
        my_sent = article.cleaned_text
        parse_tree = nltk.ne_chunk(nltk.tag.pos_tag(my_sent.split()),
                                   binary=True)  # POS tagging before chunking!
        named_entities = []
        for t in parse_tree.subtrees():
            if t.label() == 'NE':
                named_entities.append(t)
        z = named_entities
        my_count = pd.Series(z).value_counts()
        df = pd.DataFrame(my_count)
        df.columns = ['Count']
        df['entity'] = df.index
        za = df.assign(
            entity=[', '.join([x[0] for x in r]) for r in df.entity])
        df['entities'] = pd.DataFrame(za['entity'])
        del df['entity']

        # emotion lexicon lookup
        var_input = article.cleaned_text
        var_input = re.sub(r'[\W\s\d]', ' ', var_input)
        input_tokenized = word_tokenize(var_input, "english")
        filtered_words = [
            word for word in input_tokenized
            if word not in stopwords.words('english')
        ]
        emotion_count = []
        for i in range(0, len(filtered_words)):
            with open('em.txt') as f:
                for line in f:
                    finaline = line.strip()
                    keym = re.search("'" + filtered_words[i] + "':\s'", finaline)
                    if keym:
                        # print(keym)
                        valuem = re.findall(":\s'.*", finaline)
                        newstr = str(valuem)
                        finalvalue = re.sub(r'[\W\s]', ' ', newstr)
                        emotion_count.append(finalvalue.strip())
        emo = most_common(emotion_count)
        # tp = str(keywords(var_input, words=2))

        tijd = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        col2 = {
            'emotions': [emo],
            'topics': [tp],
            'tittle': [article.title],
            'published': [article.publish_date],
            'authors': [article.authors],
            'timestamp(gmtime)': [tijd],
            'url': [url]
        }
        df2 = pd.DataFrame.from_dict(col2)

        return render_template('tabs.html',
                               dataframe=dfeat.to_html(index=False),
                               res=dfres.to_html(index=False),
                               twit=dftwit.to_html(index=False),
                               ent=df.to_html(index=False),
                               des=df2.to_html(index=False))
def compute_stats_for_pages_in_course(course_id):
    list_of_all_pages = []
    page_stats = []

    # Use the Canvas API to get the list of pages for this course
    # GET /api/v1/courses/:course_id/pages
    url = "{0}/courses/{1}/pages".format(baseUrl, course_id)
    if Verbose_Flag:
        print("url: " + url)
    r = requests.get(url, headers=header)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
    else:
        print("No pages for course_id: {}".format(course_id))
        return False

    for p_response in page_response:
        list_of_all_pages.append(p_response)

    # the following is needed when the response has been paginated
    # i.e., when the response is split into pieces - each returning only some of the list of modules
    while r.links.get('next', False):
        r = requests.get(r.links['next']['url'], headers=header)
        page_response = r.json()
        for p_response in page_response:
            list_of_all_pages.append(p_response)

    for p in list_of_all_pages:
        print("title is '{0}' with url {1}".format(p['title'], p['url']))

        # Use the Canvas API to GET the page
        # GET /api/v1/courses/:course_id/pages/:url
        url = "{0}/courses/{1}/pages/{2}".format(baseUrl, course_id, p["url"])
        if Verbose_Flag:
            print(url)
        payload = {}
        r = requests.get(url, headers=header, data=payload)
        if r.status_code == requests.codes.ok:
            page_response = r.json()
            if Verbose_Flag:
                print("body: {}".format(page_response["body"]))
            body = page_response["body"]
            if isinstance(body, str) and len(body) > 0:
                document = html.document_fromstring(body)
                raw_text = document.text_content()
            else:
                # nothing to process
                continue
            if Verbose_Flag:
                print("raw_text: {}".format(raw_text))
        else:
            print("No pages for course_id: {}".format(course_id))
            return False

        # see http://www.erinhengel.com/software/textatistic/
        try:
            fixed_title = page_response["title"].replace(',', '_comma_')
            fixed_title = fixed_title.replace('"', '_doublequote_')
            fixed_title = fixed_title.replace("'", '_singlequote_')
            page_entry = {
                "url": url,
                "page_name": fixed_title,
                "Textatistic.statistics": Textatistic(raw_text).dict()
            }
        except ZeroDivisionError:
            # if there are zero sentences, then some of the scores cannot be computed
            if Verbose_Flag:
                print("no sentences in page {}".format(url))
            continue
        except ValueError:
            # if there is code on the page, for example a json structure,
            # then the hyphenation package cannot handle this
            if Verbose_Flag:
                print("there is likely code on page {}".format(url))
            continue

        if page_entry:
            page_stats.append(page_entry)

    return page_stats
""" import os from textatistic import Textatistic, fleschkincaid_score from gutenberg.cleanup import strip_headers from langdetect import detect import pandas as pd output_dict = {} for root, dirs, files in os.walk("E://gutenberg", topdown=False): for name in files: if name[-4:] == ".txt" and name[ -6:] != "-8.txt" and name not in output_dict.keys(): text_loc = os.path.join(root, name) try: text0 = open(text_loc, "r").read() text1 = strip_headers(open(text_loc, "r").read()) if detect(text1) == "en": text2 = Textatistic(text1) output_dict[text_loc] = text2.fleschkincaid_score print(text_loc, output_dict[text_loc]) except: continue output_dataframe = pd.DataFrame.from_dict(output_dict, orient="index") output_dataframe.to_csv("output.csv")
abbr = Abbreviations(append=[['dog', 'cat'], ['mouse', 'elephant']],
                     modify=[['i.e.', 'XXX'], ['cf.', 'YYY']],
                     remove=[['U. N.', 'United Nations']])
abbr.list[0][1] == "XXX"
abbr.list[-1][0] == "mouse"
try:
    abbr.list.index(['U. N.', 'United Nations'])
    print("Found U.N.")
except ValueError:
    pass

text_sample = 'There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer—than-crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever (who would have thought?!)—The Jacksonian Five ate a cake. We also ate a cake (and that suprised me!!). Here is my co-author. This is a decimal 0.835.'

iterate = 1000
suma = 0
for i in range(iterate):
    start = datetime.now()
    Textatistic(text_sample)
    end = datetime.now()
    delta = end - start
    suma += timedelta.total_seconds(delta)
print(str(iterate) + " Textatistic iterations")
print(str(round(suma, 4)) + " seconds\n\n")

print("punct_clean text")
print(textatistic.punct_clean(text_sample) + "\n\n")
print("word_array list")
print(textatistic.word_array(text_sample))
plt.show()

# Readability of 'The Myth of Sisyphus'
#
# In this exercise, you will compute the Flesch reading ease score for Albert Camus'
# famous essay The Myth of Sisyphus. We will then interpret the value of this score as
# explained in the video and try to determine the reading level of the essay.
#
# The entire essay is in the form of a string and is available as sisyphus_essay.

# Import Textatistic
from textatistic import Textatistic

# Compute the readability scores
readability_scores = Textatistic(sisyphus_essay).scores

# Print the Flesch reading ease score
flesch = readability_scores['flesch_score']
print("The Flesch Reading Ease is %.2f" % (flesch))

# Readability of various publications
#
# In this exercise, you have been given excerpts of articles from four publications.
# Your task is to compute the readability of these excerpts using the Gunning fog index
# and consequently, determine the relative difficulty of reading these publications.
#
# The excerpts are available as the following strings:
#
#   forbes - An excerpt from an article from Forbes magazine on the Chinese social credit score system.
#   harvard_law - An excerpt from a book review published in Harvard Law Review.
def get_measures(snippets):
    """Given a list of snippets for a conversation, return a dictionary of metrics."""
    duration_sum = 0.0
    word_length_sum = 0
    last_end_time = 0.0
    inter_speaker_silence = 0.0
    num_speaker_transitions = 0
    num_interruptions = 0
    word_count = 0
    all_content = []
    all_words = []
    speaker_to_duration_sum_map = {}  # speaker_id -> seconds

    # Split data into speaker turns and accumulate stats for each turn
    for speaker_turn_snippets in generate_speaker_turns(snippets):
        speaker_duration = sum(x["audio_end_offset"] - x["audio_start_offset"]
                               for x in speaker_turn_snippets)
        speaker_id = speaker_turn_snippets[0]["speaker_id"]
        speaker_to_duration_sum_map[speaker_id] = (
            speaker_to_duration_sum_map.get(speaker_id, 0.0) + speaker_duration
        )
        speaker_content, is_crosstalk = snippets_to_content_string(speaker_turn_snippets)
        words = speaker_content.split()
        all_words += words
        duration_sum += speaker_duration
        word_count += len(words)
        word_length_sum += len(speaker_content) - len(words) + 1
        inter_speaker_gap = speaker_turn_snippets[0]["audio_start_offset"] - last_end_time
        if not last_end_time or inter_speaker_gap < 0 or inter_speaker_gap > 20:
            # Beginning of clip or very long pause. Ignore as transition
            pass
        else:
            inter_speaker_silence += inter_speaker_gap
            num_speaker_transitions += 1
            if inter_speaker_gap < 0.0001:
                num_interruptions += 1
            elif is_crosstalk:
                num_interruptions += 1
        last_end_time = speaker_turn_snippets[-1]["audio_end_offset"]
        all_content.append(speaker_content)

    if word_count:
        grade_level = Textatistic(" ".join(all_content)).fleschkincaid_score
    else:
        grade_level = 0.0
    mattr_score = mattr_metric(all_words)

    x = {
        "duration_sum": duration_sum,
        "num_snippets": len(snippets),
        "num_words": word_count,
        "num_interruptions": num_interruptions,
        "word_length_sum": word_length_sum,
        "grade_level": grade_level,
        "turn_taking_balance": turn_taking_balance_metric(speaker_to_duration_sum_map),
        "num_speakers": len(speaker_to_duration_sum_map),
        "inter_speaker_silence": inter_speaker_silence,
        "speaker_transitions": num_speaker_transitions
    }
    if mattr_score:
        x["mattr_score"] = mattr_score
    return x
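# For reference, the fleschkincaid_score used above as "grade_level" implements the
# Flesch-Kincaid grade-level formula. A minimal sketch recomputing it from Textatistic's
# own counts (illustrative only; it should closely match the library's attribute):
from textatistic import Textatistic

def fk_grade_from_counts(text):
    # Flesch-Kincaid grade = 0.39*(words/sentences) + 11.8*(syllables/word) - 15.59
    t = Textatistic(text)
    words_per_sentence = t.word_count / t.sent_count
    syllables_per_word = t.sybl_count / t.word_count
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59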
def augment_entries(course_id, moduleItems, module_name, module_position, options):
    newModuleitems = []
    for mi in moduleItems:
        mn = {'module_name': module_name, 'module_position': module_position}
        mn.update(mi)
        mi = mn
        publishedP = mi['published']
        if not options.unpublished and not publishedP:
            # If not published, do not process it further,
            # but add it to the list to return
            newModuleitems.append(mi)
            continue
        mi_type = mi['type']
        if mi_type == 'Page':
            url = mi['url']
            if Verbose_Flag:
                print(url)
            page_entry = None
            payload = {}
            r = requests.get(url, headers=header, data=payload)
            if r.status_code == requests.codes.ok:
                page_response = r.json()
                if Verbose_Flag:
                    print("body: {}".format(page_response["body"]))
                if page_response:
                    body = page_response.get("body", None)
                    if body and isinstance(body, str) and len(body) > 0:
                        document = html.document_fromstring(body)
                        elements_to_remove = ['img', 'code', 'pre']
                        for el in elements_to_remove:
                            el_path = "//{}".format(el)
                            for bad in document.xpath(el_path):
                                bad.getparent().remove(bad)

                        # remove anything in one of the following languages
                        languages_to_remove = [
                            'sv', 'sv-SE', 'fr', 'fr-FR', 'de', 'de-DE',
                            'nb-NO', 'nn-NO', 'da-DK', 'zh-Hans', 'es',
                            'es-ES', 'nl', 'nl-NL', 'it', 'it-IT', 'X-NONE',
                            'x-western'
                        ]
                        for l in languages_to_remove:
                            lang_path = "//*[@lang=\'{0}\']".format(l)
                            for bad in document.xpath(lang_path):
                                bad.getparent().remove(bad)

                        expected_languages = ['en', 'en-US', 'en-GB', 'en-UK']
                        for el in document.xpath('//*[@lang]'):
                            lang = el.get('lang')
                            if lang not in expected_languages:
                                print("Unexpected language={0}, url={1}".format(lang, url))

                        raw_text = document.text_content()
                        if Verbose_Flag:
                            print("raw_text: {}".format(raw_text))
                        if len(raw_text) > 0:
                            # see http://www.erinhengel.com/software/textatistic/
                            try:
                                page_entry = Textatistic(raw_text).dict()
                            except ZeroDivisionError:
                                # if there are zero sentences, then some of the scores cannot be computed
                                if Verbose_Flag:
                                    print("no sentences in page {0}, raw_text={1}".format(url, raw_text))
                                page_entry = {'text_stats_note': 'no sentences on page'}
                            except ValueError:
                                # if there is code on the page, for example a json structure,
                                # then the hyphenation package cannot handle this
                                if Verbose_Flag:
                                    print("there is likely code on page {0}, raw_text={1}".format(url, raw_text))
                                page_entry = {'text_stats_note': 'likely there is code on the page'}
                        else:
                            page_entry = {'text_stats_note': 'no text left after filtering on the page'}

            # augment the module item if there were statistics
            if page_entry:
                mi.update(page_entry)
            else:
                page_entry = {'text_stats_note': 'No results for Textatistic on this page'}
                mi.update(page_entry)

        # add module item to list to return all module items
        newModuleitems.append(mi)
    return newModuleitems
""" import requests from textatistic import Textatistic from bs4 import BeautifulSoup # scrape the Internet for news articles news_fox = requests.get('https://www.foxnews.com/world/explosion-lebanon-capital-beirut') news_thesun = requests.get('https://timesofindia.indiatimes.com/world/middle-east/massive-beirut-blast-kills-more-than-70-injures-thousands/articleshow/77360097.cms') news_aljazeera = requests.get('https://www.aljazeera.com/news/2020/8/4/dozens-killed-as-huge-explosion-rips-through-lebanons-beirut') # create BeautifulSoup objects for the news articles soup_fox = BeautifulSoup(news_fox.content, 'html.parser') soup_thesun = BeautifulSoup(news_thesun.content, 'html.parser') soup_aljazeera = BeautifulSoup(news_aljazeera.content, 'html.parser') # get the text from the html pages in Beautiful Soup text_fox = soup_fox.get_text(separator=' ', strip=True) text_thesun = soup_thesun.get_text(separator=' ', strip=True) text_aljazeera = soup_aljazeera.get_text(separator=' ', strip=True) print(text_thesun) # Check the readability using Textatistics readability_fox = Textatistic(text_fox) readability_thesun = Textatistic(text_thesun) readability_aljazeera = Textatistic(text_aljazeera) print(f'The readability for Fox News is: {readability_fox.notdalechall_count}') print(f'The readability for The Sun News is: {readability_thesun.notdalechall_count}') print(f'The readability for Aljazeera News is: {readability_aljazeera.notdalechall_count}')
# The excerpts are available as the following strings:
#
#   forbes - An excerpt from an article from Forbes magazine on the Chinese social credit score system.
#   harvard_law - An excerpt from a book review published in Harvard Law Review.
#   r_digest - An excerpt from a Reader's Digest article on flight turbulence.
#   time_kids - An excerpt from an article on the ill effects of salt consumption published in TIME for Kids.

# Instructions
# 100 XP
# Import the Textatistic class from textatistic.
# Compute the readability_scores dictionary for each excerpt using Textatistic.
# Select the Gunning fog index from the readability_scores dictionary for each excerpt
# and append it to gunning_fog_scores.
# Print the list of Gunning fog indices.

# Import Textatistic
from textatistic import Textatistic

# List of excerpts
excerpts = [forbes, harvard_law, r_digest, time_kids]

# Loop through excerpts and compute gunning fog index
gunning_fog_scores = []
for excerpt in excerpts:
    readability_scores = Textatistic(excerpt).scores
    gunning_fog = readability_scores['gunningfog_score']
    gunning_fog_scores.append(gunning_fog)

# Print the gunning fog indices
print(gunning_fog_scores)
def compute_stats_for_pages_in_course(course_id):
    list_of_all_pages = []
    page_stats = []

    # Use the Canvas API to get the list of pages for this course
    # GET /api/v1/courses/:course_id/pages
    url = baseUrl + '%s/pages' % (course_id)
    if Verbose_Flag:
        print("url: " + url)
    r = requests.get(url, headers=header)
    if Verbose_Flag:
        write_to_log("result of getting pages: " + r.text)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
    else:
        print("No pages for course_id: {}".format(course_id))
        return False

    for p_response in page_response:
        list_of_all_pages.append(p_response)

    # the following is needed when the response has been paginated
    # i.e., when the response is split into pieces - each returning only some of the list of modules
    # see "Handling Pagination" - Discussion created by [email protected] on Apr 27, 2015,
    # https://community.canvaslms.com/thread/1500
    while r.links['current']['url'] != r.links['last']['url']:
        r = requests.get(r.links['next']['url'], headers=header)
        page_response = r.json()
        for p_response in page_response:
            list_of_all_pages.append(p_response)

    for p in list_of_all_pages:
        print("{}".format(p["title"]))

        # Use the Canvas API to GET the page
        # GET /api/v1/courses/:course_id/pages/:url
        url = baseUrl + '%s/pages/%s' % (course_id, p["url"])
        if Verbose_Flag:
            print(url)
        payload = {}
        r = requests.get(url, headers=header, data=payload)
        if r.status_code == requests.codes.ok:
            page_response = r.json()
            if Verbose_Flag:
                print("body: {}".format(page_response["body"]))
            document = html.document_fromstring(page_response["body"])
            raw_text = document.text_content()
            if Verbose_Flag:
                print("raw_text: {}".format(raw_text))
        else:
            print("No pages for course_id: {}".format(course_id))
            return False

        # see http://www.erinhengel.com/software/textatistic/
        try:
            fixed_title = page_response["title"].replace(',', '_comma_')
            fixed_title = fixed_title.replace('"', '_doublequote_')
            fixed_title = fixed_title.replace("'", '_singlequote_')
            page_entry = {
                "url": url,
                "page_name": fixed_title,
                "Textatistic.statistics": Textatistic(raw_text).dict()
            }
        except ZeroDivisionError:
            # if there are zero sentences, then some of the scores cannot be computed
            if Verbose_Flag:
                print("no sentences in page {}".format(url))
            continue
        except ValueError:
            # if there is code on the page, for example a json structure,
            # then the hyphenation package cannot handle this
            if Verbose_Flag:
                print("there is likely code on page {}".format(url))
            continue

        if page_entry:
            page_stats.append(page_entry)

    return page_stats
# Set delimiter for making a list of sentences.
auto_abstractor.delimiter_list = ["。", "\n"]
# Object for abstracting and filtering the document.
abstractable_doc = TopNRankAbstractor()
# Summarize the document.
result_dict = auto_abstractor.summarize(document, abstractable_doc)

# Output the result.
print("==========================")
print("Summary of the text")
print("----")
for sentence in result_dict["summarize_result"]:
    print(sentence)
print(" ")
print("==========================")

s = Textatistic(document)
print(s.counts)
print(s.sent_count)

levsco = s.flesch_score
levsco = abs(levsco)
if levsco > 100:
    levsco = levsco / 10
print("your ease of readability: ", round(levsco))
if levsco >= 90:
    print("It sounds like 5th-grade writing")
elif levsco >= 80 and levsco < 90:
    print("It sounds like 6th-grade writing")
elif levsco >= 70 and levsco < 80:
    print("It sounds like 7th-grade writing")
elif levsco >= 60 and levsco < 70:
    print("It sounds like 8th- or 9th-grade writing")
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from textatistic import Textatistic
import json
import string
import re

data = "BREAKING: All work and NO play makes JaCK dull boy. All work and no play makes jack a dull boy!?"
s = Textatistic(data)

stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
stopWordsInText = []
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
    else:
        stopWordsInText.append(w)
percentStopwords = (len(stopWordsInText) / len(wordsFiltered)) * 100

text = word_tokenize(data)
tagged = nltk.pos_tag(text)
counts = Counter(tag for word, tag in tagged)
nounCount = counts['NN']
verbCounts = Counter(tag for word, tag in tagged if tag == 'VBP' or tag == 'VB')
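# The Textatistic object s is created above but never queried in this fragment; an
# illustrative follow-up (not part of the original snippet), using the standard count
# and score attributes:
print("words:", s.word_count)
print("sentences:", s.sent_count)
print("Flesch reading ease:", s.flesch_score)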
# Section 11.4 snippets
# NOTE: This section's self check snippets are included in this file
# because the interactive session continues into the self check.

# Calculating Statistics and Readability Scores
from pathlib import Path

text = Path('RomeoAndJuliet.txt').read_text()

from textatistic import Textatistic

readability = Textatistic(text)

%precision 3
readability.dict()

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                   #
# Pearson Education, Inc. All Rights Reserved.                          #
#                                                                       #
# DISCLAIMER: The authors and publisher of this book have used their    #
# best efforts in preparing the book. These efforts include the         #
# development, research, and testing of the theories and programs       #
# to determine their effectiveness. The authors and publisher make      #
# no warranty of any kind, expressed or implied, with regard to these   #
# programs or to the documentation contained in these books. The authors #
def update_page_info_module(course_id, page_name):
    # Use the Canvas API to GET the page
    # GET /api/v1/courses/:course_id/pages/:url
    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {}
    r = requests.get(url, headers=header, data=payload)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
        if Verbose_Flag:
            print("body: {}".format(page_response["body"]))
        document = html.document_fromstring(page_response["body"])
        raw_text = document.text_content()
        print("raw_text: {}".format(raw_text))
        title = page_response["title"]
    else:
        print("No page {}".format(page_name))
        return False

    # transform page
    # start with an empty dict in case the page has no GQMContent yet
    information_for_on_page = {}
    GQMContent = document.xpath('//p[@class="GQMContent"]')
    if len(GQMContent) > 0:
        text_of_GQMContent = GQMContent[0].text
        print("Existing information as text is {}".format(text_of_GQMContent))
        information_for_on_page = json.loads(text_of_GQMContent)
        print("Existing information is {}".format(information_for_on_page))
        document2 = deepcopy(document)
        # trim off GQMContent paragraph before processing the raw_text
        for elem in document2.xpath('//p[@class="GQMContent"]'):
            elem.getparent().remove(elem)
        raw_text = document2.text_content()
        print("raw_text: {}".format(raw_text))

    information_for_on_page["Words"] = len(raw_text.split())
    information_for_on_page["Characters"] = len(raw_text)
    # see http://www.erinhengel.com/software/textatistic/
    information_for_on_page["Textatistic.counts"] = Textatistic(raw_text).counts
    information_for_on_page["Textatistic.statistics"] = Textatistic(raw_text).dict()

    if len(GQMContent) == 0:
        # no GQMContent found on this page so add some
        print("No GQMContent found - adding some")
        body = document.find('.//body')
        if body == None:
            print("page has no <body>")
        else:
            GQMContent_string = ('<p class="GQMContent">'
                                 + json.dumps(information_for_on_page) + "</p>")
            body.append(html.etree.XML(GQMContent_string))
            print("initial updated document {}".format(html.tostring(document)))
    else:
        GQMContent[0].text = json.dumps(information_for_on_page)
        print("updated document {}".format(html.tostring(document)))

    # Use the Canvas API to insert the page
    # PUT /api/v1/courses/:course_id/pages/:uid
    #   wiki_page[title]
    #   wiki_page[published]
    #   wiki_page[body]
    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {
        'wiki_page[title]': title,
        'wiki_page[published]': False,
        'wiki_page[body]': str(html.tostring(document, pretty_print=True, method="html"), 'utf-8')
    }
    r = requests.put(url, headers=header, data=payload)
    write_to_log(r.text)
    print("status code {}".format(r.status_code))
    if r.status_code == requests.codes.ok:
        return True
    else:
        print("Unable to update page {}".format(page_name))
        return False
# fragment from inside a loop over documents x[i]
text = x[i]
wordcount.append(word_count(text))
sentencecount = sentence_count(text)
feature_set[i].append(sentencecount)
avg_syl = avg_syllables_per_word(text)
feature_set[i].append(avg_syl)
avg_sen_len = avg_sentence_length(text, wordcount[i], feature_set[i][0])
feature_set[i].append(avg_sen_len)
flesch = flesch_kincaid(text, feature_set[i][2], feature_set[i][1])
feature_set[i].append(flesch)
# text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
try:
    s = Textatistic(text)
    gf = s.gunningfog_score
    feature_set[i].append(gf)
except ZeroDivisionError:
    feature_set[i].append(-1)
num_char_w = len(text)
feature_set[i].append(num_char_w)
num_char = 0
for j in range(0, len(text)):
    if text[j] != ' ':
        num_char = num_char + 1
feature_set[i].append(num_char)
tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()
def getBookResults(book_data):
    book = Textatistic(book_data[2])
    fres_score = book.flesch_score
    school_level = determineSchoolLevel(fres_score)
    return "Title: {}\nAuthor: {}\nFlesch reading-ease score: {}\nSchool level: {}". \
        format(book_data[0], book_data[1], fres_score, school_level)
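# determineSchoolLevel is not shown in this snippet; a minimal sketch, assuming it maps
# the Flesch reading-ease score onto the standard interpretation bands (higher = easier):
def determineSchoolLevel(fres_score):
    if fres_score >= 90:
        return "5th grade"
    if fres_score >= 80:
        return "6th grade"
    if fres_score >= 70:
        return "7th grade"
    if fres_score >= 60:
        return "8th-9th grade"
    if fres_score >= 50:
        return "10th-12th grade"
    if fres_score >= 30:
        return "College"
    return "College graduate"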
# Don't forget to show the final image
plt.show()
print('---------------------------------')

"""## Reading Scores (Flesch and Gunning)"""

# Import Textatistic
flesh_reading_scores = []
gunning_fog_scores = []
for article in data["cleaned_body_text"]:
    # Compute the readability scores
    try:
        readability_scores = Textatistic(article).scores
        flesch = readability_scores['flesch_score']
        gunning_fog = readability_scores['gunningfog_score']
    except:
        print('Error has occurred')
        continue
    flesh_reading_scores.append(flesch)
    gunning_fog_scores.append(gunning_fog)

data["flesh_reading_scores"] = pd.Series(flesh_reading_scores)
# Loop through excerpts and compute gunning fog index
data["gunning_fog_scores"] = pd.Series(gunning_fog_scores)