def main():
    """Register one customer account per row of the input file.

    Each row must provide FirstName, Lastname, Email, Password and
    Password_confirm. Success is detected by the post-submit redirect to
    the account index page; successful rows are appended to
    ``valid_reg.txt`` and all others to ``checkagain_reg.txt``.
    """
    for row in read_file():
        # Fresh browser per row so each registration gets its own session.
        browser = Browser(debug=False, use_debug_proxy=False)
        create_post_url = "https://www.123saunas.com/customer/account/createpost/"
        # first request call is necessary to get cookies enabled
        browser.get(create_post_url)
        res = browser.get(create_post_url)
        soap_page = BeautifulSoup(res.text)
        # Magento embeds a per-session CSRF token in a hidden input.
        form_key = soap_page.find('input', {'name': 'form_key'}).get('value')
        res2 = browser.post(
            # Reuse the constant instead of repeating the URL literal.
            create_post_url,
            {
                "success_url": "",
                "error_url": "",
                "form_key": form_key,
                "firstname": row['FirstName'],
                "middlename": "",
                "lastname": row['Lastname'],
                "email": row['Email'],
                "password": row['Password'],
                "confirmation": row['Password_confirm'],
                "persistent_remember_me": "on",
            },
            headers={'Content-Type': 'application/x-www-form-urlencoded'})
        if res2.url == "https://www.123saunas.com/customer/account/index/":
            save_to_file("valid_reg.txt", row)
        else:
            save_to_file("checkagain_reg.txt", row)
        # Throttle between rows to avoid hammering the server.
        time.sleep(10)
def _extract_asset_tags(self, text):
    """
    Extract asset tags from text into a convenient form.

    @param text: Text to extract asset tags from. This text contains HTML
        code that is parsed by BeautifulSoup.
    @type text: str

    @return: Asset map keyed by asset id, each value holding the asset's
        name and extension.
    @rtype: {
        '<id>': {
            'name': '<name>',
            'extension': '<extension>'
        },
        ...
    }
    """
    soup = BeautifulSoup(text)
    return {
        tag['id']: {'name': tag['name'], 'extension': tag['extension']}
        for tag in soup.find_all('asset')
    }
def _extract_asset_tags(self, text):
    """
    Build a mapping of <asset> tags found in the given markup.

    @param text: HTML-ish text to scan; parsed with BeautifulSoup.
    @type text: str

    @return: Asset map.
    @rtype: {
        '<id>': {
            'name': '<name>',
            'extension': '<extension>'
        },
        ...
    }
    """
    assets = {}
    for node in BeautifulSoup(text).find_all('asset'):
        entry = {'name': node['name'], 'extension': node['extension']}
        assets[node['id']] = entry
    return assets
def parse_search_page(self, url):
    """Collect product detail URLs from up to ``self.page_count`` result pages.

    Starts at *url* and follows the "next page" pagination link; stops
    early when no such link exists.
    """
    product_urls = []
    for _ in range(self.page_count):
        page = BeautifulSoup(self.browser.get(fix_url(url)).text)
        listing = page.find("ul", {"id": re.compile(r"list-items")})
        for entry in listing.find_all("li"):
            link = entry.find(
                "a", {"href": re.compile("aliexpress.com/item")})["href"]
            product_urls.append(fix_url(link))
        try:
            navi = page.find("div", {"class": "ui-pagination-navi"})
            url = navi.find("a", {"class": "page-next"}).attrs["href"]
        except Exception as e:
            # No next-page link on the last results page.
            logger.debug(e)
            break
    return product_urls
def get_old_style_video(session, url):
    """
    Parse a old style Coursera video page.
    """
    soup = BeautifulSoup(get_page(session, url))
    video_tag = soup.find(attrs={'type': re.compile('^video/mp4')})
    return video_tag['src']
def parse_details(self):
    """Fetch the external description document and store its plain text."""
    match = re.search(r'window.runParams.descUrl="(.*?)";',
                      self.main_page_soap.text)
    details_url = fix_url(match.group(1))
    details_page = BS(self.browser.get(details_url).text)
    plain_text = details_page.getText()
    plain_text = plain_text.replace("window.productDescription=", "").strip(" ")
    self.save_param('details', plain_text)
def parse_feedbacks(self):
    """Fetch and parse buyer feedback pages, storing them under 'comments'.

    POSTs to the feedback iframe endpoint page by page. The total page
    count is read from the next-to-last pagination link of the first
    response; fetching also stops once ``self.max_comments`` is exceeded.
    """
    feedback_url = fix_url(
        self.main_page_soap.find(id="feedback").iframe['thesrc'])
    comments = []
    last_page_count = None
    for page_count in range(1, 10000):
        feedback_r = self.browser.post(feedback_url, {"page": page_count})
        feedback_soup = BS(feedback_r.text)
        if not last_page_count:
            try:
                # Next-to-last <a> of the pager holds the last page number.
                a_tags = feedback_soup.find(
                    "div", {"class": "ui-pagination-navi util-left"}
                ).find_all("a")
                last_page_count = int(a_tags[len(a_tags) - 2].text)
            except Exception as e:
                # Single-page feedback has no pager; log instead of
                # swallowing silently (was `pass`).
                logger.debug(e)
        elif last_page_count < page_count:
            break
        for comment_div in feedback_soup.find_all(
                'div', {'class': 'feedback-item'}):
            try:
                comment = {}
                user_data = comment_div.find('div', {'class': 'fb-user-info'})
                try:
                    user_name = user_data.span.a.text
                except AttributeError:
                    # Anonymous buyers have no profile link.
                    user_name = user_data.span.text
                comment['user_name'] = user_name
                comment['country'] = user_data.b.text
                comment['comment'] = comment_div.find(
                    'dt', {'class': 'buyer-feedback'}).span.text
                comment['posted_time'] = comment_div.find(
                    'dd', {"class": "r-time"}).text
                # Rating is encoded as an inline CSS width, e.g. "width:80%".
                star_css = comment_div.find(
                    'span', {"class": "star-view"}).span["style"]
                # BUG FIX: the old slice started AT the ':' so the stored
                # rating was ":80"; skip past it and strip '%'/spaces.
                comment["rating"] = star_css[star_css.find(":") + 1:].strip("% ")
                comments.append(comment)
            except Exception as e:
                logger.debug(e)
        if self.max_comments < len(comments):
            # BUG FIX: message previously referred to max_transactions.
            logger.info("Stopped comments fetching by max_comments")
            break
    self.save_param('comments', comments)
def _extract_links_from_a_tags_in_text(self, text):
    """
    Extract supplement links from the html text that contains <a> tags
    with href attribute.

    @param text: HTML text.
    @type text: str

    @return: Dictionary with supplement links grouped by extension.
    @rtype: {
        '<extension1>': [
            ('<link1>', '<title1>'),
            ('<link2>', '<title2')
        ],
        'extension2': [
            ('<link3>', '<title3>'),
            ('<link4>', '<title4>')
        ]
    }
    """
    soup = BeautifulSoup(text)
    links = [
        item['href'].strip()
        for item in soup.find_all('a') if 'href' in item.attrs
    ]
    links = sorted(set(links))

    supplement_links = {}
    for link in links:
        filename, extension = os.path.splitext(clean_url(link))
        # Some courses put links to sites in supplement section, e.g.:
        # http://pandas.pydata.org/
        # BUG FIX: was `extension is ''` — an identity comparison that only
        # works through CPython string interning; test emptiness instead.
        if not extension:
            continue

        # Make lowercase and cut the leading/trailing dot
        extension = clean_filename(extension.lower().strip('.').strip(),
                                   self._unrestricted_filenames)
        basename = clean_filename(os.path.basename(filename),
                                  self._unrestricted_filenames)
        if extension not in supplement_links:
            supplement_links[extension] = []
        # Putting basename into the second slot of the tuple is important
        # because that will allow to download many supplements within a
        # single lecture, e.g.:
        # 01_slides-presented-in-this-module.pdf
        # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
        # 01_slides-presented-in-this-module_LM-3dtexton.pdf
        supplement_links[extension].append((link, basename))

    return supplement_links
def _extract_links_from_a_tags_in_text(self, text):
    """
    Extract supplement links from the html text that contains <a> tags
    with href attribute.

    @param text: HTML text.
    @type text: str

    @return: Dictionary with supplement links grouped by extension.
    @rtype: {
        '<extension1>': [
            ('<link1>', '<title1>'),
            ('<link2>', '<title2')
        ],
        'extension2': [
            ('<link3>', '<title3>'),
            ('<link4>', '<title4>')
        ]
    }
    """
    soup = BeautifulSoup(text)
    links = [item['href'].strip()
             for item in soup.find_all('a') if 'href' in item.attrs]
    # De-duplicate and give the result a stable order.
    links = sorted(set(links))

    supplement_links = {}
    for link in links:
        filename, extension = os.path.splitext(clean_url(link))
        # Some courses put links to sites in supplement section, e.g.:
        # http://pandas.pydata.org/
        # BUG FIX: replaced `extension is ''` (identity check on a string
        # literal, reliable only by interning accident) with a truth test.
        if not extension:
            continue

        # Make lowercase and cut the leading/trailing dot
        extension = clean_filename(
            extension.lower().strip('.').strip(),
            self._unrestricted_filenames)
        basename = clean_filename(
            os.path.basename(filename),
            self._unrestricted_filenames)
        # Putting basename into the second slot of the tuple is important
        # because that will allow to download many supplements within a
        # single lecture, e.g.:
        # 01_slides-presented-in-this-module.pdf
        # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
        # 01_slides-presented-in-this-module_LM-3dtexton.pdf
        supplement_links.setdefault(extension, []).append((link, basename))

    return supplement_links
def _prettify_instructions(self, text):
    """
    Prettify instructions text to make it more suitable
    for offline reading.

    @param text: HTML (kinda) text to prettify.
    @type text: str

    @return: Prettified HTML with several markup tags replaced with
        HTML equivalents.
    @rtype: str
    """
    document = BeautifulSoup(text)
    for transform in (self._convert_instructions_basic,
                      self._convert_instructions_images):
        transform(document)
    return document.prettify()
def _convert_markup_basic(self, soup):
    """
    Perform basic conversion of instructions markup. This includes
    replacement of several textual markup tags with their HTML equivalents.

    @param soup: BeautifulSoup instance.
    @type soup: BeautifulSoup
    """
    # Charset declaration goes first in the document.
    meta = soup.new_tag('meta', charset='UTF-8')
    soup.insert(0, meta)

    # Inject the basic CSS style block.
    css_soup = BeautifulSoup(INSTRUCTIONS_HTML_INJECTION)
    soup.append(css_soup)

    # Straight one-to-one renames: <text> -> <p>, <code> -> <pre>.
    for markup_tag, html_tag in (('text', 'p'), ('code', 'pre')):
        while soup.find(markup_tag):
            soup.find(markup_tag).name = html_tag

    # <heading level="N"> becomes <hN>; default level is 1.
    while soup.find('heading'):
        heading = soup.find('heading')
        heading.name = 'h%s' % heading.attrs.get('level', '1')

    # <list bullettype="numbers"> -> <ol>, anything else -> <ul>.
    while soup.find('list'):
        bullet_list = soup.find('list')
        bullettype = bullet_list.attrs.get('bullettype', 'numbers')
        bullet_list.name = 'ol' if bullettype == 'numbers' else 'ul'
def __call__(self, markup):
    """
    Convert instructions markup to make it more suitable for
    offline reading.

    @param markup: HTML (kinda) markup to prettify.
    @type markup: str

    @return: Prettified HTML with several markup tags replaced with
        HTML equivalents.
    @rtype: str
    """
    soup = BeautifulSoup(markup)
    for convert in (self._convert_markup_basic,
                    self._convert_markup_images,
                    self._convert_markup_audios):
        convert(soup)
    return soup.prettify()
def grab_hidden_video_url(session, href):
    """
    Follow some extra redirects to grab hidden video URLs. The first of
    these "hidden" videos were seen in courses from the University of
    Washington, but others appeared after that (like in the course Social
    Psychology).

    Returns the video src URL, or None when the page cannot be fetched or
    contains no mp4 <source> tag.
    """
    try:
        page = get_page(session, href)
    except requests.exceptions.HTTPError:
        return None

    source_tag = BeautifulSoup(page).find('source',
                                          attrs={'type': 'video/mp4'})
    return source_tag['src'] if source_tag is not None else None
def __init__(self, browser, detail_url, max_comments=100, max_transactions=100):
    """Load the product detail page and prepare the result container.

    ``browser`` is the HTTP client used for all requests; ``detail_url``
    is the product page URL. The caps bound how many comments and
    transaction records later parsing will collect.
    """
    self.browser = browser
    self.detail_url = detail_url
    self.max_comments = max_comments
    self.max_transactions = max_transactions
    self.product_id = get_product_id_from_url(detail_url)
    # Parsed values accumulate here, keyed by parameter name.
    self.item = {}
    response = browser.get(fix_url(detail_url))
    self.main_page_soap = BS(response.text)
def parse_sale_page(self, url):
    """Return the product detail URLs listed on a sale/promotion page.

    The page embeds a widget config (``data_widgety5zzyn``) whose JSON
    points at a JSONP endpoint; that endpoint's first node list holds
    the products.
    """
    res = self.browser.get(fix_url(url))
    soap_page = BeautifulSoup(res.text)
    # The widget config lives in a text node; JSON starts at the first '{'.
    var = soap_page(text=re.compile(r'data_widgety5zzyn'))
    json_data = json.loads(var[0][var[0].index('{'):])
    products_url = json_data["source"]["url"]

    res = self.browser.get(fix_url(products_url))
    # Response is JSONP: onJSONPCallback({...}); . Note lstrip/rstrip strip
    # a *character set*, not a prefix/suffix — safe here only because the
    # payload starts with '{' and ends with '}'.
    # BUG FIX: removed a dead statement that computed this same strip and
    # discarded the result, and an unused `name` local.
    json_data = json.loads(res.text.lstrip("onJSONPCallback(").rstrip(");"))
    nodeList = json_data['content']['nodeList'][0]
    return [item['detailUrl'] for item in nodeList['nodeData']['dataList']]
class AliexpressPageParser:
    # Scrapes one AliExpress product page into a plain dict (self.item).
    # Each parse_* method extracts one facet (title/prices, description,
    # details, images, feedback comments, transaction history); run()
    # invokes them all and returns the accumulated dict.

    def __init__(self, browser, detail_url, max_comments=100, max_transactions=100):
        # browser: HTTP client (project type) used for all requests.
        self.browser = browser
        # Upper bounds on how much feedback/transaction data to collect.
        self.max_comments = max_comments
        self.max_transactions = max_transactions
        self.product_id = get_product_id_from_url(detail_url)
        self.detail_url = detail_url
        # Fetch and parse the main product page once, up front.
        res = browser.get(fix_url(detail_url))
        self.main_page_soap = BS(res.text)
        # Parsed values accumulate here, keyed by parameter name.
        self.item = {}

    def run(self):
        """Invoke every parse_* method and return the collected item dict.

        Methods are discovered via dir(), so they execute in alphabetical
        order — position is important! Failures in one parser are logged
        and do not stop the others.
        """
        methods = [
            getattr(self, m) for m in dir(self) if m.startswith("parse_")
        ]
        for method in methods:
            try:
                method()
            except Exception as e:
                logger.exception(e)
        return self.item

    def save_param(self, key, value):
        # Record one parsed value on the result dict.
        self.item[key] = value

    def get_data(self, tag, attrs, val_type="str"):
        """Get a value from the main page.

        :param tag: tag name, e.g. "h2"
        :param attrs: attribute filter dict, e.g. {"class": "class_name"}
        :param val_type: "str" returns the raw text; any other value
            extracts the first number (int or float) found in the text.
        :return: str ("" when the tag is missing)
        """
        text = ""
        try:
            text = self.main_page_soap.find(tag, attrs).text
            if val_type != "str":
                # Pull the first numeric token out of the text.
                m = re.search(r"[-+]?\d*\.\d+|\d+", text)
                if m:
                    text = m.group()
        except Exception as e:
            # Missing element: log and fall through to the "" default.
            logger.debug("PASS: tag: %s, attrs: %s", tag, attrs)
        return text

    def parse_commond_data(self):
        # Common header data: title, ratings, prices, promo info.
        # (Method name typo "commond" kept — run() discovers it by prefix.)
        self.save_param('detailUrl', self.detail_url)
        self.save_param('ali_id', self.product_id)
        self.save_param('title', self.get_data("h1", {"class": "product-name"}))
        self.save_param('avgStar', self.get_data("span", {"class": "percent-num"}))
        self.save_param('discount', self.get_data("span", {"class": "p-discount-rate"}))
        self.save_param('minPrice', self.get_data("span", {"id": "j-sku-price"}))
        self.save_param('minMobPromPrice', self.get_data("span", {"id": "j-sku-discount-price"}))
        self.save_param('promLeft', self.get_data("span", {"class": "p-eventtime-left"}))
        self.save_param('orderNum', self.get_data("span", {"id": "j-order-num"}, "int"))
        self.save_param('rantingsNum', self.get_data("span", {"id": "rantings-num"}, "int"))

    def parse_description(self):
        # Product property list -> list of single-entry {label: value} dicts.
        descriptions = []
        for li in self.main_page_soap.find('ul', {
                'class': 'product-property-list'
        }).find_all('li'):
            description = {}
            try:
                # Each <li> holds exactly two <span>s: label and value.
                key, val = li.find_all("span")
                description[key.text.strip(":")] = val.text
                descriptions.append(description)
            except Exception as e:
                logger.debug(e)
        self.save_param('description', descriptions)

    def parse_details(self):
        # The long description lives in a separate document referenced
        # from an inline script (window.runParams.descUrl).
        details_url = fix_url(
            re.search(r'window.runParams.descUrl="(.*?)";',
                      self.main_page_soap.text).group(1))
        response = self.browser.get(details_url)
        soup = BS(response.text)
        only_text = soup.getText().replace("window.productDescription=",
                                           "").strip(" ")
        self.save_param('details', only_text)

    def parse_images(self):
        # Gallery thumbnails -> full-size image URLs via origin_image().
        images = []
        for image in self.main_page_soap.find_all('span',
                                                  {'class': 'img-thumb-item'}):
            origin_image_path = origin_image(image.img['src'])
            images.append(origin_image_path)
        self.save_param('images', images)

    def parse_feedbacks(self):
        # Buyer feedback is served by an iframe endpoint, paginated via POST.
        feedback_url = fix_url(
            self.main_page_soap.find(id="feedback").iframe['thesrc'])
        comments = []
        last_page_count = None
        for page_count in range(1, 10000):
            feedback_r = self.browser.post(feedback_url, {"page": page_count})
            feddback_soap = BS(feedback_r.text)
            if not last_page_count:
                # Next-to-last pagination link holds the total page count.
                try:
                    a_tags = feddback_soap.find(
                        "div", {
                            "class": "ui-pagination-navi util-left"
                        }).find_all("a")
                    last_page_count = int(a_tags[len(a_tags) - 2].text)
                except Exception as e:
                    # Single-page feedback has no pager; best-effort.
                    pass
            elif last_page_count < page_count:
                break
            for comment_div in feddback_soap.find_all(
                    'div', {'class': 'feedback-item'}):
                try:
                    comment = {}
                    user_data = comment_div.find('div',
                                                 {'class': 'fb-user-info'})
                    try:
                        user_name = user_data.span.a.text
                    except AttributeError:
                        # Anonymous buyers have no profile link.
                        user_name = user_data.span.text
                    comment['user_name'] = user_name
                    comment['country'] = user_data.b.text
                    comment['comment'] = comment_div.find(
                        'dt', {
                            'class': 'buyer-feedback'
                        }).span.text
                    comment['posted_time'] = comment_div.find(
                        'dd', {
                            "class": "r-time"
                        }).text
                    # Rating encoded as CSS width, e.g. "width:80%".
                    start_css = comment_div.find('span', {
                        "class": "star-view"
                    }).span["style"]
                    # NOTE(review): this slice starts AT the ':' so the
                    # stored rating keeps a leading colon (":80") — looks
                    # like an off-by-one; confirm against consumers.
                    comment["rating"] = start_css[start_css.find(":"):].strip(
                        "%")
                    comments.append(comment)
                except Exception as e:
                    logger.debug(e)
            if self.max_comments < len(comments):
                # NOTE(review): message says max_transactions but the cap
                # checked here is max_comments.
                logger.info("Stopped comments fetching by max_transactions")
                break
        self.save_param('comments', comments)

    def parse_history_transactions(self):
        # Transaction history comes from a JSON AJAX endpoint, paginated
        # via a ?page= query parameter.
        history_transaction = "https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?" \
                              "productId=%s&type=default" % self.product_id
        transactions = []
        last_page = None
        for page_count in range(1, 100000):
            transaction_r = self.browser.get(history_transaction,
                                             {'page': page_count})
            transaction_json = transaction_r.json()
            if not last_page:
                # Total page count is reported by the first response.
                last_page = int(transaction_json['page']['total'])
            elif last_page < page_count:
                break
            for records in transaction_json['records']:
                transactions.append(records)
            if self.max_transactions < len(transactions):
                logger.info(
                    "Stopped transactions fetching by max_transactions")
                break
        self.save_param('transaction', transactions)
def parse_old_style_syllabus(session, page, reverse=False,
                             unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.
    Each section is a week of classes.

    @param session: HTTP session used for follow-up video-page requests.
    @param page: Syllabus page HTML.
    @param reverse: When True, return the sections in reverse order.
    @param unrestricted_filenames: Passed through to clean_filename.
    @param subtitle_language: Subtitle language code substituted into
        srt/txt resource URLs (default 'en').
    @return: List of (section_name, [(lecture_name, {fmt: [(url, title)]})]).
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class':
                                re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname,
                                       unrestricted_filenames)
                fmt = get_anchor_format(href)
                # Swap the default English subtitle URL for the requested
                # language by rewriting the query-string fragment.
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format',
                                          '_' + subtitle_language + '&format')
                    href = href.replace('_en&format',
                                        '_' + subtitle_language + '&format')
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        # get_old_style_video found no usable tag.
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            # Post-process titles so multiple resources of one format
            # within a lecture get unique filenames.
            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
def _replace_tag(self, text, initial_tag, target_tag):
    """Rename every *initial_tag* element in *text* to *target_tag*.

    Returns the prettified HTML of the converted document.
    """
    soup = BeautifulSoup(text)
    tag = soup.find(initial_tag)
    while tag is not None:
        tag.name = target_tag
        tag = soup.find(initial_tag)
    return soup.prettify()