def parse(self):
    """Parse BBB complaint listings for ``self.link``.

    Builds the "print" view URL for the complaint listing, fetches it,
    collects complaint rows from either the legacy ``cmpldetail`` table
    or the newer ``customer-complaint-summary`` block, and appends one
    mention per complaint to ``self.results``.

    Returns:
        (list): list of Merchant instances (``self.results``).

    Raises:
        urllib_error.HTTPError: when the print page cannot be fetched;
            the failure is logged before re-raising.
    """
    parse = dateparser.parse
    results = ResultSet([])
    url_parts = urlparse(self.link)
    url_splitted_path = list(filter(None, url_parts.path.split('/')))
    # Strip a trailing view-selector segment so 'print' can be appended.
    stop_ends = ['complaints', 'customer-reviews', 'print']
    # Guard against an empty path (e.g. a bare-domain link) — the
    # original indexed [len(...)-1] and raised IndexError in that case.
    if url_splitted_path and url_splitted_path[-1] in stop_ends:
        url_splitted_path.pop()
    url_path = '/'.join(url_splitted_path + ['print'])
    url = urlunparse((url_parts.scheme, url_parts.netloc, url_path,
                      None, None, None))
    try:
        html_doc = request.urlopen(url)
    except urllib_error.HTTPError:
        logger.error('BBB parser failed with {0}'.format(self.link))
        raise
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Two page layouts exist; harvest rows from whichever is present.
    # A missing container makes find() return None, raising
    # AttributeError on .find_all — deliberately ignored.
    try:
        results += soup.find(
            'table', 'cmpldetail'
        ).find_all(
            'tr', re.compile('odd|even')
        )
    except AttributeError:
        pass
    try:
        results += soup.find(
            'div', 'customer-complaint-summary'
        ).find_all(
            'tr', re.compile('odd|even')
        )
    except AttributeError:
        pass
    for result in results:
        try:
            date_ = parse(result.find('td', 'date').text)
            text = result.find('p').text or ''
            # NOTE(review): the unanchored 'Complaint' alternative strips
            # the word anywhere in the text, not only a leading label —
            # confirm this is intentional before tightening the pattern.
            text = re.sub(r'^Complaint:|Complaint', '', text).strip()
            self.results.append(
                self.create_mention(text=text, date=date_)
            )
        except AttributeError:
            # Rows without a date cell or a paragraph are skipped.
            continue
    return self.results
def scrape_tag_contents(tags, html):
    """Walk a chain of ``(tag, attrs)`` selectors and extract content.

    All pairs except the last narrow the candidate element set; the
    last pair describes what to extract from each surviving element:

      * ``("regex", pattern)`` -- first regex match on the element text
      * ``(tag, None or "")``  -- text of child ``tag`` (or the element)
      * ``(None or "", attr)`` -- value of attribute ``attr``
      * ``(tag, attr)``        -- attribute ``attr`` of each child ``tag``

    Args:
        tags: list of ``(tag_name, attrs)`` pairs (not mutated).
        html: raw markup string or an already-parsed ``Tag``.

    Returns:
        list: extracted strings/elements; empty when nothing matches.
    """
    tag_list = copy.copy(tags)  # don't mutate the caller's selector list
    if isinstance(html, Tag):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    results = []
    content_tag, content_attr = tag_list.pop()
    if not len(tag_list):
        return list(soup.findAll(name=content_tag, attrs=content_attr))
    # BUG FIX: the original tested ``isinstance(attr, unicode)``, which
    # raises NameError on Python 3. Build a version-safe tuple instead
    # (still matches unicode on Python 2).
    try:
        string_types = (str, unicode)  # noqa: F821 -- Python 2 only
    except NameError:
        string_types = (str,)
    first_tag, first_attr = tag_list.pop(0)
    element_list = soup.findAll(name=first_tag, attrs=first_attr)
    # Narrow the candidate set through each intermediate selector.
    for tag, attr in tag_list:
        temp = ResultSet([], ())
        for element in element_list:
            if isinstance(attr, dict):
                temp += element.findAll(name=tag, attrs=attr)
            elif isinstance(attr, string_types):
                if element.has_attr(attr):
                    temp.append(element[attr])
        element_list = temp
    for element in element_list:
        if content_tag == "regex":
            # Final selector is a regex applied to the element's text.
            pattern = content_attr
            text = element
            if not isinstance(text, str):
                text = element.text
            if text:
                match = re.findall(pattern, text)
                if match:
                    results.append(match[0])
        elif content_attr is None or content_attr == "":
            # Extract visible text (of a child tag, or the element itself).
            if content_tag is None or content_tag == "":
                text = element
            else:
                text = element.find(content_tag)
            if text:
                results.append(text.text)
        elif content_tag is None or content_tag == "":
            # Extract an attribute value from the element itself.
            if element.has_attr(content_attr):
                results.append(element[content_attr])
        else:
            # Extract an attribute from each matching child tag.
            info_container = element.findAll(name=content_tag)
            for container in info_container:
                if isinstance(content_attr, dict):
                    results.append(container)
                # BUG FIX: the original called
                # ``info_container.has_attr(content_attr)`` on the
                # ResultSet (a list subclass with no such method),
                # which always raised AttributeError; the individual
                # container element was clearly intended.
                elif container.has_attr(content_attr):
                    results.append(container[content_attr])
    return results
def links(self, character_page):
    """Fetch a wiki category page and return its character link tags.

    As a side effect, records the pagination "next" href (or ``None``)
    on ``self.nextPage``.

    Args:
        character_page: URL of the category page to scrape.

    Returns:
        ResultSet: ``<a class="category-page__member-link">`` tags;
        empty when the page cannot be fetched.
    """
    try:
        page = urllib2.urlopen(character_page)
    # BUG FIX: the original bare ``except:`` also swallowed SystemExit
    # and KeyboardInterrupt; keep the best-effort empty result, but
    # only for genuine runtime errors.
    except Exception:
        return ResultSet([])
    soup = BeautifulSoup(page, 'html.parser')
    character_list = soup.findAll(
        'a', attrs={'class': 'category-page__member-link'})
    nextpage = soup.find(
        'a', attrs={'class': 'category-page__pagination-next'})
    self.nextPage = None if nextpage is None else nextpage.get('href')
    return character_list