def get_desc(cls, value):
    """Fetch the description page for ``value`` and return its text.

    ``value`` is interpolated into ``ingram.desc_api`` to build the URL.
    The page is downloaded, wrapped in an ``HtmlResponse`` and a list of
    XPath expressions is tried in order; the first one for which
    ``html_to_text`` yields a truthy result wins.

    Returns the extracted description, or ``None`` when nothing matched or
    when any step failed (the lookup is best-effort by design).
    """
    # Ordered from the most specific expression to the catch-all.
    all_xpath = [
        "substring-before(//div[@class='training_details_content'],'Language')",
        "substring-before(//div[@class='training_details_content'],'Please bring your')",
        "//div[@class='training_details_content']/p/text()",
        "//div[@class='training_details_content']/div[1]/text()",
        "//div[@class='training_details_content']/span/text()",
        "//div[@class='training_details_content']//text()",
    ]
    try:
        # Step 1: build the URL. The one-element tuple keeps the formatting
        # correct even if ``value`` itself happens to be a tuple.
        url_desc = ingram.desc_api % (value,)

        # Step 2: download the page, always releasing the connection.
        handle = urllib.urlopen(url_desc)
        try:
            body = handle.read()
        finally:
            handle.close()

        response = HtmlResponse(url=url_desc, body=body)
        for xpath in all_xpath:
            desc = html_to_text(response.xpath(xpath).extract())
            if desc:
                return desc
    except Exception:
        # Best-effort: a missing description must not abort the crawl, but
        # the failure is reported instead of being silently discarded.
        import traceback
        traceback.print_exc()
    return None
def parse_course(cls, response):
    """Build a product from the JSON body of a course response.

    Reads ``courseGroups`` (first entry: image, title, slug), ``courseTabs``
    (description / audience / toc, selected by tab label) and ``courses``
    (one ``ProductEvent`` per course that ``is_expired`` does not reject).
    When no such event was produced but courses exist, a single
    price-only event in USD is emitted from the last course.

    Returns the populated product object.
    Raises whatever ``json.loads`` raises on a malformed body, and
    ``IndexError``/``KeyError``/``TypeError`` when ``courseGroups`` is
    absent or empty.
    """
    print(response.url)
    json_data = json.loads(response.body)
    from scrapy_balloons.spiders.balloon import balloon_spider
    output = balloon_spider.create_new_product()

    # get course_groups
    info = get_attr(json_data, 'courseGroups')[0]
    output['product_image_url'] = info['asset']
    output['name'] = info['title']
    output['product_url'] = urljoin('https://www.writersonlineworkshops.com/courses/', info['slug'])

    # get courseTabs; tolerate a missing/empty value instead of failing
    # on iteration.
    for tab in get_attr(json_data, 'courseTabs') or []:
        label = tab['label']
        if 'Overview' in label:
            output['description'] = html_to_text(tab['body'])
        elif 'Who Should Take' in label:
            output['audience'] = html_to_text(tab['body'])
        elif 'Course Outline' in label:
            # toc is stored as raw markup, not converted to text
            output['toc'] = tab['body']

    # get courses
    courses = get_attr(json_data, 'courses') or []
    product_events = []
    for course in courses:
        if is_expired(course['courseStartDate']):
            continue
        event = ProductEvent()
        event['language'] = 'eng'
        event['start_date_local'] = convert_date(course['courseStartDate'])
        event['end_date_local'] = convert_date(course['courseEndDate'])
        # Divide by a float: under Python 2, ``cents / 100`` on an int
        # silently drops the cents (19999 -> 199 instead of 199.99).
        event['price_display_float'] = course['priceInCents'] / 100.0
        product_events.append(event)

    if not product_events and courses:
        # Every course was rejected: still expose a price. The last course
        # is used explicitly (previously this relied on the loop variable
        # leaking out of the ``for`` statement).
        fallback = ProductEvent()
        fallback['price_display_float'] = courses[-1]['priceInCents'] / 100.0
        fallback['price_currency'] = 'USD'
        product_events.append(fallback)

    output['product_events'] = product_events
    return output
def get_instructors(self, response, source=None):
    """Collect instructors from a page, in table or plain-text layout.

    If the page contains any ``<table>``, one ``Instructor`` is created per
    ``<td>`` found under the first-row selection: the image comes from the
    first row and the name from the second row. Otherwise names are read
    from ``<b>`` siblings following a "Speakers"/"Presenter" (or
    "PRESENTERS") heading, keeping every other entry.

    ``source`` is accepted but not used here.
    Returns the list of instructors, or ``None`` when none were found.
    """

    def pick_first_text(candidates):
        # First entry that is not empty/whitespace-only, else None.
        if not (candidates and isinstance(candidates, list)):
            return None
        for text in candidates:
            if text and len(text.strip()) > 0:
                return text
        return None

    found = []
    tables = response.xpath("//table")
    if tables:
        # case 1 : data as table format
        image_row = tables.xpath("//tr[1]")
        name_row = tables.xpath("//tr[2]")
        column_count = len(image_row.xpath(".//td"))
        for offset in range(column_count):
            column = offset + 1  # XPath positions are 1-based
            person = Instructor()
            try:
                person["image"] = image_row.xpath("td[%s]//img/@src" % column).extract()[0]
            except:
                # no image in this cell: leave the field unset
                pass
            try:
                label = pick_first_text(name_row.xpath("td[%s]//text()" % column).extract())
                if label is None or len(label.strip()) == 0:
                    label = name_row.xpath("td[%s]//strong/text()" % column).extract()[0]
                person["name"] = html_to_text(label)
            except:
                # no usable name in this cell: leave the field unset
                pass
            found.append(person)
    else:
        # case 2 : data as text format
        bold_texts = response.xpath(
            "//div[@id='article-content']//b[contains(.,'Speakers')]/following-sibling::b/text() | //div[@id='article-content']//b[contains(.,'Presenter')]/following-sibling::b/text()"
        ).extract()
        if not bold_texts:
            bold_texts = response.xpath(
                "//div[@id='article-content']//b[contains(.,'PRESENTERS')]/following-sibling::b/text()"
            ).extract()
        # Drop a trailing unpaired entry, then keep the first of each pair.
        paired_length = len(bold_texts) - len(bold_texts) % 2
        for text in bold_texts[0:paired_length:2]:
            person = Instructor()
            person["name"] = html_to_text(text)
            found.append(person)

    return found or None
def events(selector):
    """Build a ``ProductEvent`` from one table-row selector.

    Reads the price from the 7th cell (``'0'`` when no digit is found),
    the location from the first text node of the 5th cell and the start
    date from the text in the 1st/5th cells up to the first ``'-'``.

    Raises ``IndexError`` when the 5th cell has no text node, because the
    first extracted value is indexed directly.
    """
    event = ProductEvent()
    # Plain string: the original trailing comma stored the tuple ('eng',).
    event['language'] = 'eng'
    event['price_currency'] = 'EUR'

    price = selector.xpath("./td[7]//text()").re("\\d.*")
    event['price_display_float'] = html_to_text(price[0]) if price else '0'

    location = selector.xpath("./td[5]/text()[1]").extract()[0]
    event['location_display'] = html_to_text(location)

    start_date = selector.xpath(
        "substring-before(concat(./td[1]/text()[2],' ',./td[5]/i/text()),'-')").extract()[0]
    event['start_date_local'] = convert_date(start_date)
    return event
def convert_products(self, origin_products):
    """Convert raw product dicts into product objects.

    Each entry is converted independently; an entry that raises is reported
    with ``traceback.print_exc()`` and left out of the result.

    Returns the list of successfully converted products.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    def to_instructor(presenter):
        # One Instructor per entry of the 'presenters' list.
        person = Instructor()
        person['name'] = presenter['name']
        person['image'] = presenter['avatar']['xl']
        person['bio'] = html_to_text(presenter['bio'])
        person['link'] = urljoin("https://www.evisors.com/", presenter['expert']['url'])
        return person

    converted = []
    for raw in origin_products:
        try:
            item = balloon_spider.create_new_product()
            item['name'] = raw['title']
            item['product_image_url'] = raw['image']['l']
            try:
                video_id = raw['freeChapter']['videoID']
                item['product_video_url'] = (
                    urljoin("https://www.youtube.com/embed/", video_id) if video_id else None
                )
            except:
                # the video is optional: leave the field unset on any failure
                pass
            item['description'] = html_to_text(raw['description'])
            item['published_date'] = raw['date']['date']
            item['tz'] = raw['date']['timezone']
            item['product_url'] = urljoin("https://www.evisors.com/", raw['url'])
            item['instructors'] = [to_instructor(presenter) for presenter in raw['presenters']]
            item['price_currency'] = 'USD'
            item['price_display_float'] = get_price_float(raw['price'])
            converted.append(item)
        except:
            traceback.print_exc()
    return converted
def convert_products(self, origin_products):
    """Convert raw video entries into product objects.

    Each entry is converted independently; an entry that raises is reported
    with ``traceback.print_exc()`` and left out of the result.

    Returns the list of successfully converted products.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    converted = []
    for entry in origin_products:
        try:
            item = balloon_spider.create_new_product()
            item['name'] = entry['title']
            item['product_image_url'] = entry['video_thumbnail']
            first_body = entry['body'][0]
            item['description'] = html_to_text(first_body['value'])
            item['product_url'] = urljoin("https://www.elance.com/", entry['url'])
            # The same labeled duration feeds both the display and filter fields.
            labeled_duration = entry['video_duration']['hms_labeled']
            item['duration_display'] = labeled_duration
            item['duration_filter'] = duration_filter(entry['video_duration']['hms_labeled'])
        except:
            traceback.print_exc()
        else:
            converted.append(item)
    return converted
def get_prerequisites(cls, response):
    """Extract the prerequisites section as a list of text fragments.

    ``response`` is either a ``Response`` (its text is taken from
    ``compuworks.get_text_from_cache``) or markup passed through
    ``html_to_text``. Several "Prerequisites: ... <next heading>" patterns
    are tried case-insensitively and the first match is used.

    When the captured text contains ``**`` the asterisks are removed and
    the text is split on ``'.'``; otherwise it is split on ``' '``.
    Returns the non-blank fragments, or ``None`` when no pattern matches.
    """
    text = (
        compuworks.get_text_from_cache(response)
        if isinstance(response, Response)
        else html_to_text(response)
    )

    section_patterns = (
        "Prerequisites:(.*)Course Outline",
        "Prerequisites:(.*)(Unit 1:|Module 1:|Course Length)",
        "Prerequisites:(.*)Designing and Deploy",
    )
    for pattern in section_patterns:
        match = re.search(pattern, text, re.I)
        if not match:
            continue
        captured = match.group(1)
        if '**' in captured:
            without_stars = captured.replace('*', '')
            body = re.sub("To ensure your success.*:", '', without_stars)
            fragments = body.strip().split('.')
        else:
            body = re.sub("Before attending[^:]*:", '', captured)
            # NOTE(review): the separator is a single space as found in the
            # source; confirm that is the intended delimiter.
            fragments = body.strip().split(' ')
        return [fragment for fragment in fragments if fragment.strip()]
    return None
def parse_course(cls, response):
    """Build a product from the JSON body of a course response.

    Reads ``courseGroups`` (image and instructor name), ``courseTabs``
    (description and toc), ``profileBlocks`` (instructor bio), ``users``
    (instructor image) and ``courses`` (url, name, dates and price taken
    from the first course).

    Returns the populated product, or ``None`` when ``courseGroups`` is
    empty. Raises ``KeyError`` when the ``courseGroups`` key is absent.
    """

    def to_local_datetime(iso_stamp):
        # "<date>T<time>.<fraction>" -> "<date> <time>"
        parts = iso_stamp.split('T')
        return parts[0] + ' ' + parts[1].split('.')[0]

    def cents_to_price(cents):
        # Arithmetic instead of string slicing: the slicing version raised
        # IndexError for amounts below 10 cents.
        return float(cents) / 100.0

    json_data = json.loads(response.body)
    if not json_data['courseGroups']:
        return None

    from scrapy_balloons.spiders.balloon import balloon_spider
    output = balloon_spider.create_new_product()

    # Defaults, so a missing section can never leave these names unbound.
    instructor_name = None
    instructor_bio = None
    instructor_image = None

    # get course_groups
    course_groups = get_attr(json_data, 'courseGroups')
    if course_groups:
        group = course_groups[0]
        # Test the asset itself; the previous check wrapped it in a list,
        # which is always truthy.
        # NOTE(review): the value is kept as a one-element list as before;
        # confirm whether consumers expect a plain string here.
        if group['asset']:
            output['product_image_url'] = [group['asset']]
        # Only an 'Instructor' category may set the name. Previously any
        # later category with another label reset it to None.
        for category in group['categories'] or []:
            if category['label'] == 'Instructor':
                instructor_name = category['value']

    # get courseTabs
    coursetabs = get_attr(json_data, 'courseTabs')
    if coursetabs:
        if coursetabs[0]['body']:
            output['description'] = html_to_text(coursetabs[0]['body'])
        # NOTE(review): the third tab is only used when more than three
        # tabs exist; threshold kept as found - confirm it is intended.
        if len(coursetabs) > 3 and coursetabs[2]['body']:
            output['toc'] = coursetabs[2]['body']

    # get profileBlocks
    profileblocks = get_attr(json_data, 'profileBlocks')
    if profileblocks and profileblocks[0]['bio']:
        instructor_bio = html_to_text(profileblocks[0]['bio'])

    # get users: as before, the last user decides the image
    users = get_attr(json_data, 'users')
    if users:
        instructor_image = users[-1]['asset'] or None

    # get courses
    courses = get_attr(json_data, 'courses')
    if courses:
        course = courses[0]
        output['product_url'] = urljoin('https://www.howdesignuniversity.com/courses/', course['slug'])
        output['name'] = course['title']

        instructor = Instructor()
        instructor['name'] = instructor_name
        instructor['bio'] = instructor_bio
        instructor['image'] = instructor_image
        instructor['link'] = None
        instructors = [instructor]

        product_events = []
        if course['courseEndDate']:
            # Dated course: price and instructors live on the event.
            event = ProductEvent()
            event['language'] = 'eng'
            start_stamp = course['courseStartDate'] or course['enrollmentStartDate']
            if start_stamp:
                event['start_date_local'] = to_local_datetime(start_stamp)
            # courseEndDate is known to be truthy in this branch, so the
            # former enrollmentEndDate fallback could never run.
            event['end_date_local'] = to_local_datetime(course['courseEndDate'])
            if course['priceInCents']:
                event['price_display_float'] = cents_to_price(course['priceInCents'])
            event['instructors'] = instructors
            product_events.append(event)
        elif course['priceInCents']:
            # Undated course: the price goes on the product itself.
            output['price_display_float'] = cents_to_price(course['priceInCents'])

        if instructor_image is None and instructor_bio is None:
            output['instructors'] = None
        else:
            output['instructors'] = instructors
        output['product_events'] = product_events
    return output
def parse(cls, response):
    """Turn the rows of the training table into products.

    Each ``tr[@bgcolor]`` row of ``table.training_table`` describes one
    session. The first row seen for a title creates the product (price,
    start date and location are copied from that session); later rows with
    the same title only add a session to that product's
    ``product_events`` list.

    A row that fails to convert is reported with ``traceback.print_exc()``
    and skipped. Returns the list of products in order of first appearance.
    Raises ``IndexError`` when a row has no title text.
    """
    import traceback
    from scrapy_balloons.spiders.balloon import balloon_spider

    def events(selector):
        # One ProductEvent per table row.
        event = ProductEvent()
        # Plain string: the original trailing comma stored the tuple ('eng',).
        event['language'] = 'eng'
        event['price_currency'] = 'EUR'
        price = selector.xpath("./td[7]//text()").re("\\d.*")
        event['price_display_float'] = html_to_text(price[0]) if price else '0'
        location = selector.xpath("./td[5]/text()[1]").extract()[0]
        event['location_display'] = html_to_text(location)
        start_date = selector.xpath(
            "substring-before(concat(./td[1]/text()[2],' ',./td[5]/i/text()),'-')").extract()[0]
        event['start_date_local'] = convert_date(start_date)
        return event

    rows = response.xpath("//table[@class='training_table']//tr[@bgcolor]")
    products_by_name = {}
    results = []
    for d in rows:
        name = d.xpath("./td[@class='trainingtitle']//text()").extract()[0]
        print(name)
        try:
            if name in products_by_name:
                # Exact-title match (the old substring test could attach a
                # session to a different course). Always extend the flat
                # list; previously a third session produced a nested list.
                known = products_by_name[name]
                known['product_events'] = list(known['product_events']) + [events(d)]
                continue

            output = balloon_spider.create_new_product()
            output['name'] = name
            description = d.xpath("./td[@class='trainingtitle']/span/@rel-tid").extract()
            output['description'] = ingram_micro.get_desc(description[0])
            # The product-level fields are the same values as the first
            # session's, so they are read from it rather than re-extracted.
            first_event = events(d)
            output['price_display_float'] = first_event['price_display_float']
            output['start_date_local'] = first_event['start_date_local']
            output['location_display'] = first_event['location_display']
            # Always a list, so later sessions can be appended uniformly.
            output['product_events'] = [first_event]
            output['product_url'] = "%s#%s" % (response.url, len(results) + 1)
        except Exception:
            # Best-effort per row: report and move on. A title whose first
            # row failed stays unregistered, so a later row may create it.
            traceback.print_exc()
            continue
        products_by_name[name] = output
        results.append(output)
    return results
def build_course(cls):
    """Yield one product for every course whose languages include 'en'.

    Courses come from ``coursera.courses_data``, truncated to
    ``balloon_spider.limit`` entries unless the limit is ``-1``. Each
    product carries its instructors and either a list of session events
    or, when the course has no sessions, a duration derived from the
    course's ``workload`` string.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    if balloon_spider.limit == -1:
        selected = coursera.courses_data
    else:
        selected = coursera.courses_data[:balloon_spider.limit]

    for course in selected:
        if 'en' not in course['primaryLanguages']:
            continue

        output = balloon_spider.create_new_product()
        output['name'] = html_to_text(course['name'])
        output['product_url'] = 'https://www.coursera.org/course/' + course['slug']
        output['description'] = html_to_text(course['description'])
        output['product_image_url'] = course['photoUrl']
        workload = course['workload']

        # instructors
        teachers = []
        if 'instructorIds' in course:
            for person in coursera.find_instructors(course['instructorIds']):
                teacher = Instructor()
                first_name = get_attr(person, 'firstName')
                middle_name = get_attr(person, 'middleName')
                last_name = get_attr(person, 'lastName')
                teacher['name'] = "%s %s %s" % (first_name, middle_name, last_name)
                teacher['bio'] = html_to_text(get_attr(person, 'bio'))
                teacher['image'] = get_attr(person, 'photo')
                if contains(person, 'profileId'):
                    teacher['link'] = 'https://www.coursera.org/instructor/~' + person['profileId']
                else:
                    teacher['link'] = None
                teachers.append(teacher)

        # product events
        sessions = coursera.get_events_by_courseid(course['id'])
        if sessions:
            session_events = []
            for session in sessions:
                event = ProductEvent()
                event['language'] = 'eng'
                start_time = "%s %s %s" % (
                    get_attr(session, 'startDay'),
                    get_attr(session, 'startMonth'),
                    get_attr(session, 'startYear'))
                event['start_date_local'] = convert_date(start_time)
                if contains(session, 'dbEndDate'):
                    # dbEndDate is divided by 1000 before conversion
                    event['end_date_local'] = epoch_time_to_date(session['dbEndDate'] / 1000)
                event['duration_display'] = session['durationString']
                event['duration_filter'] = duration_filter(session['durationString'])
                session_events.append(event)
            output['product_events'] = session_events
        else:
            # No sessions: describe the duration from the workload text.
            output['language'] = 'eng'
            output['duration_display'] = workload.encode('ascii', 'ignore')
            display = output['duration_display']
            has_range = '-' in display
            if 'hours/week' in display:
                if has_range:
                    # "a-b hours/week": keep the lower bound
                    basis = display.split('-')[0] + ' hours'
                else:
                    basis = display.split('/')[0]
                output['duration_filter'] = duration_filter(basis)
            elif ' hours ' in display:
                hours_part = display.split(' hours ')[0]
                if has_range:
                    basis = hours_part.split('-')[0] + ' hours'
                else:
                    basis = hours_part + ' hours'
                output['duration_filter'] = duration_filter(basis)

        output['instructors'] = teachers

        detail = get_attr(coursera.courses_detail_data, course['id'])
        if detail:
            output['toc'] = get_attr(detail, 'aboutTheCourse')
            videos = get_attr(detail, 'videos')
            if videos and len(videos) > 0:
                output['product_video_url'] = videos[0]['source']
            else:
                output['product_video_url'] = None

        output['language'] = 'eng'
        yield output