def return_data(self, **kwargs) -> dict:
    """Return the Alexa Rank of the company's webpages.

    The raw rank is bucketed via ``AlexaRank.BINS`` into a 0-4 scale:
        0 - high, 1 - moderate, 2 - low, 3 - very low, 4 - not indexed

    Returns:
        dict with:
            "AlexaRank"      -- 0-4 bucket (best bucket across all webpages)
            "AlexaRankScore" -- best (lowest) raw rank found, -1 when no
                                page is indexed at all.
    """
    # Serve from cache when this company was already looked up.
    if self.company_name in self.cache.index:
        cached = self.cache.loc[self.company_name].values[0]
        # BUGFIX: the old cache-hit path returned the raw cached rank under
        # the "AlexaRank" key, while the fresh path returns the 0-4 bucket
        # there.  Re-bucket so both paths agree on the contract.
        digit = 4 if cached == -1 else int(np.digitize(cached, AlexaRank.BINS))
        return {"AlexaRank": digit, "AlexaRankScore": cached}

    found = []       # 0-4 buckets, one per webpage
    found_full = []  # raw ranks, -1 sentinel when the page is not indexed
    for webpage in self.webpages:
        page = WebpageResolver.get_html(AlexaRank.ALEXA_ROOT + webpage, stash=False)
        try:
            soup = bs4.BeautifulSoup(page, features="lxml")
            rank = soup.find_all("div", class_="rankmini-rank")[0].text.strip()
            rank = int(rank.lstrip("#").replace(",", ""))
            found.append(np.digitize(rank, AlexaRank.BINS))
            found_full.append(rank)
        except IndexError:
            # The page is so small that it's not even indexed in Alexa.
            found.append(4)
            found_full.append(-1)

    if not found:
        # BUGFIX: an empty self.webpages used to crash min() with ValueError;
        # treat "nothing to check" as not indexed.
        rank_digit, rank = 4, -1
    else:
        rank_digit = min(found)  # lower bucket == better rank
        # BUGFIX: ignore the -1 sentinels; previously a single unindexed
        # page made min(found_full) return -1 even when real ranks existed.
        indexed = [r for r in found_full if r != -1]
        rank = min(indexed) if indexed else -1

    self.cache.loc[self.company_name] = rank
    self.cache.to_csv(AlexaRank.LOC + "cache.tsv", sep='\t')
    return {"AlexaRank": rank_digit, "AlexaRankScore": rank}
def check_if_polish_text(self, website):
    """Return True if any of ``self.websites`` looks like Polish text.

    A site counts as Polish when language detection assigns Polish ('pl')
    a share greater than 0.25 of its visible text.  Sites that fail to
    download or parse are skipped (best-effort check).

    NOTE(review): the ``website`` parameter is unused — the check iterates
    ``self.websites`` instead.  It is kept for interface compatibility.
    """
    def tag_visible(element):
        # Drop text that the browser would not render: contents of
        # non-visual tags and HTML comments.
        if element.parent.name in [
                'style', 'script', 'head', 'title', 'meta', '[document]'
        ]:
            return False
        if isinstance(element, bs4.element.Comment):
            return False
        return True

    def text_from_html(body):
        # Extract only the human-visible text from an HTML document.
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts)

    # BUGFIX: the loop variable used to shadow the `website` parameter;
    # renamed to `site` for clarity.
    for site in self.websites:
        try:
            text = text_from_html(WebpageResolver.get_html(site))
            langs = LanguageDetection().return_data(text=text)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Best-effort: skip bad sites.
            continue
        if 'pl' in langs and langs['pl'] > 0.25:
            return True
    # BUGFIX: removed a duplicated, unreachable `return False`.
    return False