def __init__(
    self,
    url,
    consent_elem: selenium.webdriver.remote.webelement.WebElement,
):
    self.elem = consent_elem

    # Raw HTML and visible text of the consent element.
    self.html = self.elem.get_attribute("outerHTML")
    self.text = self.elem.text

    self.hasDenyAll = False
    self.hasAcceptAll = False
    self.readabilityARI = None
    self.readabilityFLESH = None
    self.scrn = None

    # Check if we have deny-all / accept-all buttons.
    self.hasDenyAll = self._find_deny_all()
    self.hasAcceptAll = self._find_appr_all()

    # Compute readability by ARI and Flesch.
    r = Readability(self.text)
    self.readabilityARI = r.ari().__dict__
    self.readabilityFLESH = r.flesch().__dict__

    # See if we can find any checkboxes, and how many are pre-checked.
    self.totalCheckboxes = 0
    self.checkedCheckboxes = 0
    total_checkboxes = self.elem.find_elements_by_css_selector(
        "input[type='checkbox']")
    checked_checkboxes = self.elem.find_elements_by_css_selector(
        "input[type='checkbox']:checked")
    self.totalCheckboxes = len(total_checkboxes)
    self.checkedCheckboxes = len(checked_checkboxes)
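# Hedged usage sketch for the constructor above, built from a live Selenium
# session. The driver setup, URL, CSS selector, and the ConsentElement class
# name are assumptions for illustration; the snippet itself uses the
# Selenium 3-style find_elements_by_css_selector API, so the same style is
# used here.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://example.com")
banner = driver.find_element_by_css_selector("div.consent-banner")  # hypothetical selector
consent = ConsentElement("https://example.com", banner)  # hypothetical class name
print(consent.hasAcceptAll, consent.totalCheckboxes, consent.checkedCheckboxes)
driver.quit()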
def get_scores(text):
    """Return a dict of readability scores for the given text."""
    analysis = Readability(text)
    results = {}
    results['ari'] = analysis.ARI()                       # Automated Readability Index
    results['fkgl'] = analysis.FleschKincaidGradeLevel()  # Flesch-Kincaid Grade Level
    results['cli'] = analysis.ColemanLiauIndex()          # Coleman-Liau Index
    results['fre'] = analysis.FleschReadingEase()         # Flesch Reading Ease
    results['gfi'] = analysis.GunningFogIndex()           # Gunning Fog Index
    results['lix'] = analysis.LIX()                       # Läsbarhetsindex
    results['rix'] = analysis.RIX()                       # RIX (simplified LIX)
    results['smog'] = analysis.SMOGIndex()                # SMOG grade
    return results
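# Hedged usage sketch for get_scores, assuming the mmautner/readability
# package cited elsewhere in this collection (from readability import
# Readability). The sample text is illustrative only; these formulas are
# more meaningful on longer passages.
from readability import Readability

sample = ("The quick brown fox jumps over the lazy dog. "
          "Readability formulas need a few sentences to be meaningful.")
for name, score in get_scores(sample).items():
    print(name, score)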
def extract_user_reading_levels(
        input_file_name=DEFAULT_RAW_REVIEWS_FILE_NAME,
        output_file_name=DEFAULT_READING_LEVELS_FILE_NAME,
        reviews_to_analyze_per_user=float('inf')):
    """
    Given a Yelp dataset reviews file, builds a file:

        user_1_ID user_1_reading_level
        .
        .
        .
        user_N_ID user_N_reading_level

    WARNING: This function is computationally expensive. The amount of
    computation can be limited by setting reviews_to_analyze_per_user, the
    maximum number of reviews to analyze per user. On a 2011 MacBook Air,
    1000 reviews take 2-3 seconds to analyze.
    """
    # Maps each user ID --> [running sum of review reading levels,
    #                        running number of reviews]
    total_reading_level_and_review_count_for_user = defaultdict(lambda: [0, 0])

    # Compute the above mapping.
    with open(raw_data_absolute_path(input_file_name)) as reviews_file:
        for review_JSON in reviews_file:
            review = json.loads(review_JSON)
            # Skip reviews from users who have already been analyzed up to
            # the desired maximum.
            if total_reading_level_and_review_count_for_user[
                    review['user_id']][1] >= reviews_to_analyze_per_user:
                continue
            # TODO: Try other reading level metrics
            try:
                total_reading_level_and_review_count_for_user[
                    review['user_id']][0] += Readability(
                        review['text']).SMOGIndex()
                total_reading_level_and_review_count_for_user[
                    review['user_id']][1] += 1
            except UnicodeEncodeError:
                pass

    # Compute each user's average reading level.
    # Note: the minimum SMOG index is 3.0, but users without reviews are
    # assigned 0.
    average_reading_level_for_user = {
        user_ID: safe_divide(total_reading_level, review_count)
        for user_ID, [total_reading_level, review_count]
        in total_reading_level_and_review_count_for_user.items()  # .iteritems() is Python 2 only
    }
    write_single_user_attribute(average_reading_level_for_user,
                                output_file_name)
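# Hedged usage sketch: capping the per-user workload as the docstring's
# WARNING suggests. The default file-name constants come from the
# surrounding module; the cap of 50 reviews per user is an illustrative
# choice, not a recommendation.
extract_user_reading_levels(reviews_to_analyze_per_user=50)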
def setup(self):
    self.words = []
    self.nouns = {}
    self.verbs = {}
    self.similarity = -1
    self.active_words = []
    self.passive_words = []
    self.direct_words = []
    self.indirect_words = []
    self.positive_words = []
    self.negative_words = []
    self.line_break = False

    # Readability metrics computed from the instance's text.
    rd = Readability(self.text)
    self.FleschReadingEase = rd.FleschReadingEase()
    self.FleschKincaidGradeLevel = rd.FleschKincaidGradeLevel()
    self.GunningFogIndex = rd.GunningFogIndex()
    self.SMOGIndex = rd.SMOGIndex()
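# Hedged sketch of how setup() might be exercised, assuming only that the
# host instance has a .text attribute before setup() runs. The TextBlock
# class name is hypothetical; in the original codebase setup() presumably
# lives on a larger class.
class TextBlock:
    setup = setup  # bind the module-level function above as a method

    def __init__(self, text):
        self.text = text
        self.setup()


block = TextBlock("One short sentence here. And another short one there.")
print(block.FleschReadingEase, block.SMOGIndex)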
def getReadabilities(string):
    read = Readability(string)
    return read.FleschReadingEase(), read.FleschKincaidGradeLevel()
def assess_readability(text):
    '''Assess the readability of text with the Flesch-Kincaid Grade Level
    test, as implemented in Python here:
    https://github.com/mmautner/readability'''
    # Assess grade level
    return Readability(text).FleschKincaidGradeLevel()
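# Hedged usage sketch for the two helpers above, assuming both are in scope
# together and that Readability comes from the mmautner/readability package
# cited in the docstring. The sample text is illustrative.
from readability import Readability

text = ("Reading ease and grade level often move in opposite directions. "
        "Longer words and longer sentences lower the ease score.")
ease, grade = getReadabilities(text)
print("Flesch Reading Ease:", ease)
print("Flesch-Kincaid Grade Level:", grade)
print("assess_readability:", assess_readability(text))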
class Article(ArticleText):
    def __init__(self, url, raw=None):
        if not raw:
            raw = requests.get(url).content
        soup = BeautifulSoup(raw, 'lxml')
        self.url = url
        self.metadata = self.getMeta(soup)
        if self.metadata['content']:
            content = self.metadata['content'].find("section")
            if not content:
                content = self.metadata['content']
            content = " ".join(list(content.stripped_strings))
        else:
            raise Exception(
                "No content found for", url,
                "\nPlease add custom constraints [if any] in attributes_list.py"
            )
        self.rd = Readability(content)
        super().__init__(self.metadata['title'], content)

    def iterTillHit(self, soup, arglist, target=None):
        # Try each find() argument tuple in turn and return the first hit:
        # the tag itself when no target attribute is requested, otherwise
        # the tag's text (if any) or the requested attribute.
        for arg in arglist:
            cont = soup.find(*arg)
            if cont:
                if not target:
                    return cont
                elif cont.text:
                    return cont.text
                else:
                    return cont[target]
        return None

    def getMeta(self, soup):
        # Title, Keywords, Description, Author, Published
        attr_d = {}
        attr_d['title'] = self.iterTillHit(soup, TITLE_L, 'content')
        attr_d['keyword'] = self.iterTillHit(soup, KEYWORD_L, 'content')
        attr_d['desc'] = self.iterTillHit(soup, DESC_L, 'content')
        attr_d['author'] = self.iterTillHit(soup, AUTHOR_L, 'content')
        attr_d['published'] = self.iterTillHit(soup, PUBLISHED_L, 'content')
        attr_d['content'] = self.iterTillHit(soup, CONTENT_L)
        return attr_d

    def num_hrefs(self):
        return len(self.metadata['content'].findAll("a", href=True))

    def num_self_hrefs(self):
        site = urlparse(self.url)[1]
        return sum([
            1 for href in self.metadata['content'].findAll("a", href=True)
            if site in href['href']
        ])

    def num_imgs(self):
        return len(self.metadata['content'].findAll("img"))

    def num_videos(self):
        return len(self.metadata['content'].findAll("iframe"))

    def num_keywords(self):
        return (len(self.metadata['keyword'].split(","))
                if self.metadata['keyword'] else 0)

    def daystuff(self):
        weekday_dict = [["weekday_is_monday", 0], ["weekday_is_tuesday", 0],
                        ["weekday_is_wednesday", 0], ["weekday_is_thursday", 0],
                        ["weekday_is_friday", 0], ["weekday_is_saturday", 0],
                        ["weekday_is_sunday", 0], ["is_weekend", 0]]
        try:
            weekday = dateparser.parse(self.metadata['published']).weekday()
            weekday_dict[weekday][1] = 1
            weekday_dict[-1][1] = 1 if weekday > 4 else 0
        except TypeError:
            pass
        finally:
            # Note: a return in finally also swallows any uncaught exception
            # raised above, so the flags default to all zeros on failure.
            return dict(weekday_dict)

    def lda(self):
        lda_dict = getLDA(self.metadata['title'])[0]
        lda_dict = {"LDA_%.2d" % index: val for index, val in lda_dict}
        return lda_dict

    def readability(self):
        readability_dict = {
            'ARI': self.rd.ARI(),
            'FleschReadingEase': self.rd.FleschReadingEase(),
            'FleschKincaidGradeLevel': self.rd.FleschKincaidGradeLevel(),
            'GunningFogIndex': self.rd.GunningFogIndex(),
            'SMOGIndex': self.rd.SMOGIndex(),
            'ColemanLiauIndex': self.rd.ColemanLiauIndex(),
            'LIX': self.rd.LIX(),
            'RIX': self.rd.RIX()
        }
        return readability_dict

    def stats(self):
        attributes = [
            'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
            'num_keywords'
        ]
        meta_dict = super().stats()
        meta_dict.update({func: getattr(self, func)() for func in attributes})
        meta_dict.update(self.daystuff())
        meta_dict.update(self.lda())
        meta_dict.update(self.readability())
        return meta_dict
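# Hedged usage sketch: building an Article and collecting its feature
# dictionary. It assumes the module-level selector lists (TITLE_L, KEYWORD_L,
# DESC_L, AUTHOR_L, PUBLISHED_L, CONTENT_L), the getLDA helper, and the
# ArticleText base class are importable from the surrounding project; the
# URL is illustrative.
article = Article("https://example.com/some-story")
features = article.stats()
print(features['FleschKincaidGradeLevel'], features['num_hrefs'])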