def parse_search_page(self, url):
    url_list = []
    for _ in range(self.page_count):
        res = self.browser.get(fix_url(url))
        html_text = res.text
        soap_page = BeautifulSoup(html_text)
        # Collect every product link on the current result page.
        for prod_element in soap_page.find("ul", {
                "id": re.compile(r"list-items")
        }).find_all("li"):
            url_list.append(
                fix_url(
                    prod_element.find(
                        "a", {"href": re.compile("aliexpress.com/item")})["href"]))
        # Follow the "next page" link; stop when pagination runs out.
        try:
            url = soap_page.find("div", {
                "class": "ui-pagination-navi"
            }).find("a", {
                "class": "page-next"
            }).attrs["href"]
        except Exception as e:
            logger.debug(e)
            break
    return url_list
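# `fix_url` is used throughout this file but defined elsewhere. A minimal sketch
# of the assumed behaviour (strip whitespace, add a default scheme to
# protocol-relative links); the real helper may differ:
def fix_url(url):
    if url is None:
        return None
    url = url.strip()
    if url.startswith('//'):
        return 'http:' + url
    return url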
def parse_sale_page(self, url):
    res = self.browser.get(fix_url(url))
    html_text = res.text
    soap_page = BeautifulSoup(html_text)
    # The sale widget configuration is embedded in the page as inline JSON.
    var = soap_page(text=re.compile(r'data_widgety5zzyn'))
    json_data = json.loads(var[0][var[0].index('{'):])
    products_url = json_data["source"]["url"]
    # The product list endpoint answers with JSONP; strip the callback wrapper
    # before decoding.
    res = self.browser.get(fix_url(products_url))
    json_data = json.loads(
        res.text.lstrip("onJSONPCallback(").rstrip(");"))
    nodeList = json_data['content']['nodeList'][0]
    name = nodeList['name']  # sale section title (currently unused)
    return [item['detailUrl'] for item in nodeList['nodeData']['dataList']]
def parse_details(self):
    # The full description is served from a separate URL referenced in the
    # page's inline script.
    details_url = fix_url(
        re.search(r'window.runParams.descUrl="(.*?)";',
                  self.main_page_soap.text).group(1))
    response = self.browser.get(details_url)
    soup = BS(response.text)
    only_text = soup.getText().replace("window.productDescription=", "").strip(" ")
    self.save_param('details', only_text)
def parse_feedbacks(self):
    feedback_url = fix_url(
        self.main_page_soap.find(id="feedback").iframe['thesrc'])
    comments = []
    last_page_count = None
    for page_count in range(1, 10000):
        feedback_r = self.browser.post(feedback_url, {"page": page_count})
        feedback_soap = BS(feedback_r.text)
        if not last_page_count:
            # The last page number is the second-to-last link in the pagination bar.
            try:
                a_tags = feedback_soap.find(
                    "div", {"class": "ui-pagination-navi util-left"}).find_all("a")
                last_page_count = int(a_tags[-2].text)
            except Exception:
                pass
        elif last_page_count < page_count:
            break
        for comment_div in feedback_soap.find_all(
                'div', {'class': 'feedback-item'}):
            try:
                comment = {}
                user_data = comment_div.find('div', {'class': 'fb-user-info'})
                # Anonymous buyers have no profile link, only a plain span.
                try:
                    user_name = user_data.span.a.text
                except AttributeError:
                    user_name = user_data.span.text
                comment['user_name'] = user_name
                comment['country'] = user_data.b.text
                comment['comment'] = comment_div.find(
                    'dt', {'class': 'buyer-feedback'}).span.text
                comment['posted_time'] = comment_div.find(
                    'dd', {"class": "r-time"}).text
                # The rating is encoded as a CSS width percentage on the star bar.
                star_css = comment_div.find(
                    'span', {"class": "star-view"}).span["style"]
                comment["rating"] = star_css[star_css.find(":") + 1:].strip("%")
                comments.append(comment)
            except Exception as e:
                logger.debug(e)
        if self.max_comments < len(comments):
            logger.info("Stopped fetching comments: reached max_comments")
            break
    self.save_param('comments', comments)
def __init__(self, browser, detail_url, max_comments=100, max_transactions=100):
    self.browser = browser
    self.max_comments = max_comments
    self.max_transactions = max_transactions
    self.product_id = get_product_id_from_url(detail_url)
    self.detail_url = detail_url
    # Fetch and parse the main product page once; the parse_* methods reuse it.
    res = browser.get(fix_url(detail_url))
    self.main_page_soap = BS(res.text)
    self.item = {}
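# Usage sketch (illustrative only): the class name, example URL, and the use of
# `requests.Session` as the browser are assumptions, not taken from the source.
import requests

browser = requests.Session()
parser = AliProductParser(  # hypothetical name for the product parser class above
    browser, 'https://www.aliexpress.com/item/1005001234567890.html',
    max_comments=50)
parser.parse_details()
parser.parse_feedbacks()  # results end up in parser.item via save_param (assumed)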
def parse_syllabus(session, page, reverse=False):
    """
    Parses a Coursera course listing/syllabus page.
    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(attrs={'class':
                                    re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        section_name = clean_filename(stag.contents[0].contents[1])
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                title = clean_filename(a.get('title', ''))
                fmt = get_anchor_format(href)
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warn(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('Probably bad cookies file (or wrong class name)')

    return sections
def parse_syllabus(session, page, reverse=False, intact_fnames=False):
    """
    Parses a Coursera course listing/syllabus page.
    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(
            attrs={'class': re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warn('Could not get resource: %s',
                                     lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
def get_error_rate(self, response):
    # Track which domains were reached and count every completed request.
    self.out_domains.add(get_domain(response.request.url))
    self.crawler.stats.inc_value("no_requests")
    if not self.domain.check_request_url(response.request.url):
        self.crawler.stats.inc_value('no_new_posts')
    self.sum_download_time += response.meta['request_time']
    # Follow every resolvable link on the page and keep measuring errors.
    urls = [response.urljoin(url.strip())
            for url in response.xpath("//a/@href").getall()
            if fix_url(url)]
    for url in urls:
        yield Request(url=url,
                      callback=self.get_error_rate,
                      errback=self.check_error_back_rate)
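# `check_error_back_rate` is referenced above but not shown here. A minimal
# sketch of what such an errback might look like (method name and stats key are
# assumptions); Scrapy passes a twisted Failure object to errbacks:
def check_error_back_rate(self, failure):
    # Count failed requests so an error rate can be derived from the stats.
    self.crawler.stats.inc_value("no_failed_requests")
    logger.debug(repr(failure))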
def parse_old_style_syllabus(session, page, reverse=False, intact_fnames=False,
                             subtitle_language="en"):
    """
    Parse an old style Coursera course listing/syllabus page.
    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={"class":
                                re.compile("^course-item-list-header")})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll("li"):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info(" %s", vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll("a"):
                href = fix_url(a["href"])
                untouched_fname = a.get("title", "")
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                if fmt in ("srt", "txt") and subtitle_language != "en":
                    title = title.replace("_en&format",
                                          "_" + subtitle_language + "&format")
                    href = href.replace("_en&format",
                                        "_" + subtitle_language + "&format")
                logging.debug(" %s %s", fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture["mp4"] = lecture.get("mp4", [])
                        lecture["mp4"].append((fix_url(href), ""))
                    except TypeError:
                        logging.warn("Could not get resource: %s",
                                     lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if "mp4" not in lecture:
                for a in vtag.findAll("a"):
                    if a.get("data-modal-iframe"):
                        href = grab_hidden_video_url(session,
                                                     a["data-modal-iframe"])
                        href = fix_url(href)
                        fmt = "mp4"
                        logging.debug(" %s %s", fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ""))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], "")
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], "{0:d}_{1}".format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info("Found %d sections and %d lectures on this page",
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error("The cookies file may be invalid, "
                      "please re-run with the `--clear-cache` option.")

    return sections
def parse_old_style_syllabus(session, page, reverse=False,
                             unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.
    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class':
                                re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, unrestricted_filenames)
                fmt = get_anchor_format(href)
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format',
                                          '_' + subtitle_language + '&format')
                    href = href.replace('_en&format',
                                        '_' + subtitle_language + '&format')
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
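# Usage sketch (illustrative only): assumes an authenticated requests session
# and a locally saved copy of the syllabus page; the file name is hypothetical.
import requests

session = requests.Session()
with open('syllabus.html') as handle:
    page = handle.read()
sections = parse_old_style_syllabus(session, page, reverse=False)
for section_name, lectures in sections:
    print(section_name, len(lectures))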