def create_instance(index: int, block: ResultSet):
    try:
        # Extract the result title and target URL from a Google result block.
        title = block.find('div', {'class': 'BNeawe vvjwJb AP7Wnd'}).text
        link = unquote(
            re.search(r'(https?:\/\/[^&]*)',
                      block.find('a').attrs.get('href'))[0])
        return GoogleResult(index, title, link)
    except (AttributeError, TypeError):
        # The block does not have the expected structure; skip it.
        return None
def scrape_tag_contents(tags, html):
    tag_list = copy.copy(tags)
    if isinstance(html, Tag):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    results = []
    # The last (tag, attr) pair describes the content to extract; the pairs
    # before it describe the nesting path to walk down to it.
    content_tag, content_attr = tag_list.pop()
    if not tag_list:
        return list(soup.findAll(name=content_tag, attrs=content_attr))
    first_tag, first_attr = tag_list.pop(0)
    element_list = soup.findAll(name=first_tag, attrs=first_attr)
    for tag, attr in tag_list:
        temp = ResultSet([], ())
        for element in element_list:
            if isinstance(attr, dict):
                temp += element.findAll(name=tag, attrs=attr)
            elif isinstance(attr, str):
                if element.has_attr(attr):
                    temp.append(element[attr])
        element_list = temp
    for element in element_list:
        if content_tag == "regex":
            # The attribute slot holds a regular expression to match
            # against the element's text.
            pattern = content_attr
            text = element
            if not isinstance(text, str):
                text = element.text
            if text:
                match = re.findall(pattern, text)
                if match:
                    results.append(match[0])
        elif content_attr is None or content_attr == "":
            if content_tag is None or content_tag == "":
                text = element
            else:
                text = element.find(content_tag)
            if text:
                results.append(text.text)
        elif content_tag is None or content_tag == "":
            if element.has_attr(content_attr):
                results.append(element[content_attr])
        else:
            info_container = element.findAll(name=content_tag)
            for container in info_container:
                if isinstance(content_attr, dict):
                    results.append(container)
                elif container.has_attr(content_attr):
                    results.append(container[content_attr])
    return results
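# A minimal usage sketch for scrape_tag_contents (the markup and tag list
# below are invented for illustration): the leading pair selects the
# container divs, and the final ("a", "href") pair pulls the href attribute
# from each anchor found inside them.
if __name__ == "__main__":
    html = ('<div class="entry"><a href="https://example.com/1">one</a></div>'
            '<div class="entry"><a href="https://example.com/2">two</a></div>')
    tags = [("div", {"class": "entry"}), ("a", "href")]
    print(scrape_tag_contents(tags, html))
    # ['https://example.com/1', 'https://example.com/2']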
def parse(self):
    """Method to parse results.

    Returns:
        (list): list of Merchant instances.
    """
    parse = dateparser.parse
    results = ResultSet([])
    url_parts = urlparse(self.link)
    url_splitted_path = list(filter(None, url_parts.path.split('/')))
    stop_ends = ['complaints', 'customer-reviews', 'print']
    if url_splitted_path[-1] in stop_ends:
        url_splitted_path.pop()
    url_path = '/'.join(url_splitted_path + ['print'])
    url = urlunparse((url_parts.scheme, url_parts.netloc, url_path,
                      None, None, None))
    try:
        html_doc = request.urlopen(url)
    except urllib_error.HTTPError:
        logger.error('BBB parser failed with {0}'.format(self.link))
        raise
    soup = BeautifulSoup(html_doc, 'html.parser')
    try:
        results += soup.find('table', 'cmpldetail').find_all(
            'tr', re.compile('odd|even'))
    except AttributeError:
        pass
    try:
        results += soup.find('div', 'customer-complaint-summary').find_all(
            'tr', re.compile('odd|even'))
    except AttributeError:
        pass
    for result in results:
        try:
            date_ = parse(result.find('td', 'date').text)
            text = result.find('p').text or ''
            text = re.sub(r'^Complaint:|Complaint', '', text).strip()
            self.results.append(self.create_mention(text=text, date=date_))
        except AttributeError:
            continue
    return self.results
def alternateRowRemover(rows: ResultSet) -> tuple:
    counter = 0
    multiple = 0
    while counter < len(rows):
        try:
            # Drop any row whose second cell is a percentage, restart the
            # scan, and count how many rows were removed.
            if "%" in rows[counter].find_all("td")[1].string:
                rows.remove(rows[counter])
                counter = 0
                multiple += 1
                continue
        except (IndexError, TypeError):
            # Row has fewer than two cells, or the cell has no direct string.
            pass
        counter += 1
    return rows, multiple
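# A quick illustrative check for alternateRowRemover, using made-up markup:
# rows whose second cell contains "%" are removed and counted.
if __name__ == "__main__":
    from bs4 import BeautifulSoup
    table = BeautifulSoup(
        "<table><tr><td>a</td><td>50%</td></tr>"
        "<tr><td>b</td><td>12</td></tr>"
        "<tr><td>c</td><td>7%</td></tr></table>", "html.parser")
    kept, removed = alternateRowRemover(table.find_all("tr"))
    print(len(kept), removed)  # 1 2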
def parse_row(r: ResultSet) -> Tuple[str, List[str]]:
    """Break up a villager's table row into the individual parts

    Arguments:
        r {ResultSet} -- The row to be parsed

    Returns:
        Tuple[str, List[str]] -- A villager's name and their attributes
    """
    bad_vals = re.compile(r'\W')
    cols = [re.sub(bad_vals, '', val.text) for val in r.find_all('td')]
    return (cols[0], cols[2:])
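# Example input for parse_row, with an invented villager row; column 1 (the
# image cell) is skipped, and non-word characters are stripped from each cell.
if __name__ == "__main__":
    from bs4 import BeautifulSoup
    row = BeautifulSoup(
        "<tr><td>Apollo</td><td><img/></td><td>Eagle</td><td>Cranky</td></tr>",
        "html.parser").find("tr")
    print(parse_row(row))  # ('Apollo', ['Eagle', 'Cranky'])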
def links(self, character_page):
    try:
        page = urllib2.urlopen(character_page)
    except (urllib2.URLError, ValueError):
        # Page could not be fetched; return an empty result set.
        return ResultSet([])
    soup = BeautifulSoup(page, 'html.parser')
    character_list = soup.findAll(
        'a', attrs={'class': 'category-page__member-link'})
    nextpage = soup.find(
        'a', attrs={'class': 'category-page__pagination-next'})
    self.nextPage = None if nextpage is None else nextpage.get('href')
    return character_list
def helper_remove_tags(element: ResultSet,
                       tags_to_drop: Optional[List[TagInfo]] = None,
                       debug=False):
    if not tags_to_drop:
        return
    for tag_to_drop in tags_to_drop:
        if debug:
            print(f"Drop tag '{tag_to_drop}'")
        for s in element.select(tag_to_drop[0]):
            decomposed = False
            if len(tag_to_drop[1]) > 0:
                # Drop the tag unless one of its attribute rules spares it.
                for attribute in tag_to_drop[1]:
                    if not s.has_attr(attribute[0]):
                        decomposed = True
                        s.decompose()
                        break
                    elif (len(attribute[1]) == 0
                          or s[attribute[0]] not in attribute[1]):
                        decomposed = True
                        s.decompose()
                        break
            else:
                # No attribute rules: drop every match of the selector.
                decomposed = True
                s.decompose()
            if debug and decomposed:
                print(f"Drop element '{s}' in element '{element}'")
            if not decomposed and len(s.select(tag_to_drop[0])) != 0:
                helper_remove_tags(s, tags_to_drop)
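# Illustrative call for helper_remove_tags, assuming each TagInfo is a
# (css_selector, attribute_rules) pair as used above: an empty rule list
# drops every match of the selector, while a rule like ("id", ["keep"])
# spares tags whose attribute value is whitelisted.
if __name__ == "__main__":
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(
        '<div><script>x()</script><p id="keep">a</p><p>b</p></div>',
        "html.parser")
    helper_remove_tags(soup, [("script", []), ("p", [("id", ["keep"])])])
    print(soup)  # <div><p id="keep">a</p></div>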
def get_chapter_instance_from_li(
        li: ResultSet) -> Optional[WebToonChapter]:
    anchor = li.find('a')
    image = li.find('img')
    if anchor is None or image is None:
        # Malformed list item; honour the Optional return type.
        return None
    url = anchor['href']
    episode_pretty_name = image['alt'].strip()
    return WebToonChapter.from_url(url, episode_pretty_name)
def _get_href(self, tag: ResultSet) -> str:
    return tag.get("href")
def helper_rename_tags(element: ResultSet,
                       tags_to_rename: Optional[List[TagRenameInfo]] = None,
                       debug=False):
    if not tags_to_rename:
        return
    for tag_to_rename in tags_to_rename:
        # Rename every match of the old tag name to the new one, in place.
        for tag in element.find_all(tag_to_rename[0]):
            tag.name = tag_to_rename[1]
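# Sketch of helper_rename_tags in use, assuming each TagRenameInfo is an
# (old_name, new_name) pair; <b> tags are renamed to <strong> in place.
if __name__ == "__main__":
    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<p><b>bold</b> text</p>", "html.parser")
    helper_rename_tags(soup, [("b", "strong")])
    print(soup)  # <p><strong>bold</strong> text</p>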