def get_event(cls, response, item):
    """Build a product whose events come from the page's ``classCfgs`` data.

    The text between ``classCfgs:`` and ``,filters`` in the response body is
    decoded with ``demjson`` and each decoded entry becomes one
    ``ProductEvent``.

    Args:
        response: Response whose ``body`` embeds the ``classCfgs`` literal.
        item: Mapping supplying ``duration_display``, ``duration_filter`` and
            ``price_display_float``; the same values are copied onto every
            event.

    Returns:
        A product from ``balloon_spider.create_new_product()`` with
        ``product_events`` set to the list of events built here.

    Raises:
        AttributeError: If the ``classCfgs`` pattern does not match the body.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    # re.S lets ".*" span newlines, so the literal may cover several lines.
    data = re.search("classCfgs:(.*),filters", response.body, re.S).group(1)
    events = demjson.decode(data)
    output = balloon_spider.create_new_product()

    product_events = []
    for event in events:
        prod_event = ProductEvent()
        # Was misspelled 'langauge'; ProductEvent is populated with
        # 'language' everywhere else in this file.
        prod_event['language'] = 'eng'
        prod_event['location_display'] = event['location']
        # NOTE(review): 'start'/'end' are presumably epoch timestamps, since
        # they are handed straight to epoch_time_to_date -- confirm the unit.
        prod_event['start_date_local'] = epoch_time_to_date(event['start'])
        prod_event['end_date_local'] = epoch_time_to_date(event['end'])
        prod_event['duration_display'] = item['duration_display']
        prod_event['duration_filter'] = item['duration_filter']
        prod_event['price_currency'] = 'USD'
        prod_event['price_display_float'] = item['price_display_float']
        product_events.append(prod_event)

    output['product_events'] = product_events
    return output
def parse_tutorial_info(cls, response):
    """Build a product from the cached detail record of one cmiVFX course.

    The course is identified by ``response.meta['course_id']`` and its record
    is read from ``cmivfx.courses_detail``; the response body itself is not
    used. The keys read here all appear in the "model 2" sample documented
    below this function.

    Args:
        response: Response whose ``meta`` carries ``course_id``.

    Returns:
        A product from ``balloon_spider.create_new_product()`` populated from
        the detail record.

    Raises:
        KeyError: If ``course_id`` is absent from ``response.meta`` or from
            ``cmivfx.courses_detail``, or the record lacks a key read here.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    course_id = response.meta['course_id']
    course_detail = cmivfx.courses_detail[course_id]

    product = balloon_spider.create_new_product()
    product['name'] = course_detail['name']
    product['product_url'] = urljoin('https://cmivfx.com/store/', course_detail['id'])
    product['partner_prod_id'] = course_detail['id']
    product['product_image_url'] = course_detail['image_large']
    product['description'] = course_detail['longDescription']
    # Only the heading text is removed; the surrounding markup stays in place.
    product['toc'] = re.sub('Chapter Descriptions', '', course_detail['marketingText'])
    product['published_date'] = course_detail['releaseDate']

    # The site separates duration parts with dots ("03.44.19"); switch to
    # colons once and reuse the result for both duration fields.
    duration = re.sub('\\.', ':', course_detail['videoDuration'])
    product['duration_display'] = duration
    product['duration_filter'] = duration_filter(duration)

    product['price_display_float'] = course_detail['price']
    return product


# Sample payloads (abridged: long text values are cut with "..." and repeated
# entries are elided; every key of the original samples is kept).
#
# Model 1 -- category listing: a JSON array with one object per course.
#   [{"id": "584",
#     "name": "Nuke-Mari Workflows",
#     "price": "49.95",
#     "releaseDate": "2014-03-13 00:00:00",
#     "image_thumb": "https://cmivfx.com/img/tutorial/small/match-moving-and-more-in-nuke,-maya-and-mari-thumb.jpg",
#     "image_medium": "https://cmivfx.com/img/tutorial/medium/match-moving-and-more-in-nuke,-maya-and-mari-large.jpg",
#     "image_large": "https://cmivfx.com/img/tutorial/large/match-moving-and-more-in-nuke,-maya-and-mari-large.jpg",
#     "shortDescription": "Salah Soltane teaches you match moving in Nuke, ...",
#     "category": "3D Matchmoving",
#     "categoryId": "20"},
#    ... same keys for ids "508" (Mocha Pro Advanced), "309" (Motion Control
#    3D Camera), "301" (Camera Based Motion Capture) and "242" (Complete
#    Syntheyes Training) ...]
#
# Model 2 -- course detail: a single JSON object.
#   {"marketingText": "<h3>Chapter Descriptions</h3>\n<h4>Introduction</h4>\n<p>...</p>...",
#    "longDescription": "Syntheyes, available at http://www.ssontech.com, is one of the fastest, ...",
#    "activeVideo": "1",
#    "activeStore": "1",
#    "id": "242",
#    "name": "Complete Syntheyes Training",
#    "price": "49.95",
#    "releaseDate": "2011-02-15 00:00:00",
#    "image_thumb": "https://cmivfx.com/img/tutorial/small/242-Thumb_Large.jpg",
#    "image_medium": "https://cmivfx.com/img/tutorial/medium/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg",
#    "image_large": "https://cmivfx.com/img/tutorial/large/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg",
#    "shortDescription": "This is the most complete Syntheyes training on the market, ...",
#    "category": "3D Matchmoving",
#    "categoryId": "20",
#    "videoId": "127",
#    "videoDuration": "03.44.19",
#    "chapters": [{"id": "474", "videoId": "127", "name": "Chapter 01",
#                  "description": "Introduction", "timeCode": "12.9166666667"},
#                 ... 21 chapters in total, ids "474" to "494" ...],
#    "images": [{"id": "216", "type": "IMAGE",
#                "url": "https://cmivfx.com/img/tutorial/media/16f40474-64d9-46b2-832c-f567192c3ebb.jpg"},
#               ... 5 images in total, ids "216" to "220" ...]}
def get_info_course(cls, response):
    """Map one KnowledgeCity course JSON document onto a new product.

    Args:
        response: Response whose ``body`` is a JSON object providing ``id``,
            ``title``, ``description``, ``chapters`` and ``trt``.

    Returns:
        A product from ``balloon_spider.create_new_product()`` filled from
        the decoded document.
    """
    course = json.loads(response.body)
    from scrapy_balloons.spiders.balloon import balloon_spider
    product = balloon_spider.create_new_product()

    # The page URL is "<site>/<id>-<title>" with every space turned into "-".
    url_pieces = ("http://www.knowledgecity.com/%s" % course["id"], course["title"])
    product["product_url"] = "-".join(url_pieces).replace(" ", "-")

    # Values copied verbatim: (product field, JSON key).
    copied_fields = (
        ("name", "title"),
        ("product_type_id", "id"),
        ("description", "description"),
        ("toc", "chapters"),
        ("duration_display", "trt"),
    )
    for field, key in copied_fields:
        product[field] = course[key]
    product["duration_filter"] = duration_filter(course["trt"])

    # Author extraction is currently disabled. The commented-out version
    # built a single Author from course['author'] with bio, link and image
    # set to None and stored it under product['authors'].
    return product
def mapping_with_course(self, data):
    """Translate one raw course record into a product.

    Args:
        data: Course record read through ``get_attr``; the keys used are
            ``title``, ``description``, ``level``, ``id``, ``pub_date``,
            ``price``, ``url``, ``image_url`` and ``tags``.

    Returns:
        A product from ``balloon_spider.create_new_product()``.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    product = balloon_spider.create_new_product()
    product['name'] = get_attr(data, 'title')
    product['description'] = get_attr(data, 'description')

    # Only the first word of the level text is kept as the difficulty.
    level = get_attr(data, 'level')
    level_words = level.split() if level else []
    product['difficulty'] = level_words[0] if level_words else None

    product['partner_prod_id'] = get_attr(data, 'id')
    product['published_date'] = convert_date(get_attr(data, 'pub_date'))

    # The parsed price used to be overwritten unconditionally by '0' at the
    # end of this function, so it never reached the output. '0' is now only
    # the fallback when no usable price was parsed.
    # NOTE(review): if this source is meant to be reported as always free,
    # restore the unconditional '0' and drop the price lookup instead.
    price = get_price_float(get_attr(data, 'price'))
    product['price_display_float'] = price if price else '0'

    product['product_url'] = get_attr(data, 'url')
    product['product_image_url'] = get_attr(data, 'image_url')

    kw_data = get_attr(data, 'tags')
    product['prod_keywords'] = kw_data.split(',') if kw_data else None
    return product
def convert_products(self, origin_products):
    """Convert raw elance video records into products.

    A record that cannot be converted is reported with a traceback and
    skipped, so one malformed record does not abort the whole batch.

    Args:
        origin_products: Iterable of records providing ``title``,
            ``video_thumbnail``, ``body[0]['value']``, ``url`` and
            ``video_duration['hms_labeled']``.

    Returns:
        List of the products that converted successfully.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    results = []
    for data in origin_products:
        try:
            product = balloon_spider.create_new_product()
            product['name'] = data['title']
            product['product_image_url'] = data['video_thumbnail']
            product['description'] = html_to_text(data['body'][0]['value'])
            product['product_url'] = urljoin("https://www.elance.com/", data['url'])
            duration = data['video_duration']['hms_labeled']
            product['duration_display'] = duration
            product['duration_filter'] = duration_filter(duration)
            results.append(product)
        except Exception:
            # Was a bare "except:", which also trapped KeyboardInterrupt and
            # SystemExit and made the crawl impossible to interrupt here.
            traceback.print_exc()
    return results
def parse_course(cls, response):
    """Build a product, with its upcoming sessions, from a course JSON reply.

    Args:
        response: Response whose ``body`` is a JSON object with
            ``courseGroups``, ``courseTabs`` and ``courses``.

    Returns:
        A product from ``balloon_spider.create_new_product()``.
        ``product_events`` is only set when at least one course is listed:
        one event per course for which ``is_expired`` is false or, when
        there is none, a single price-only event.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    # Parenthesised so the statement parses on both Python 2 and Python 3.
    print(response.url)
    json_data = json.loads(response.body)
    output = balloon_spider.create_new_product()

    # General course information comes from the first course group.
    info = get_attr(json_data, 'courseGroups')[0]
    output['product_image_url'] = info['asset']
    output['name'] = info['title']
    output['product_url'] = urljoin('https://www.writersonlineworkshops.com/courses/', info['slug'])

    # Tabs are recognised by their label; "or []" tolerates a missing list.
    for tab in get_attr(json_data, 'courseTabs') or []:
        if 'Overview' in tab['label']:
            output['description'] = html_to_text(tab['body'])
        elif 'Who Should Take' in tab['label']:
            output['audience'] = html_to_text(tab['body'])
        elif 'Course Outline' in tab['label']:
            output['toc'] = tab['body']

    courses = get_attr(json_data, 'courses') or []
    product_events = []
    for course in courses:
        if is_expired(course['courseStartDate']):
            continue
        event = ProductEvent()
        event['language'] = 'eng'
        event['start_date_local'] = convert_date(course['courseStartDate'])
        event['end_date_local'] = convert_date(course['courseEndDate'])
        # Divide by 100.0: on Python 2 "int / 100" floors and loses the cents.
        event['price_display_float'] = course['priceInCents'] / 100.0
        product_events.append(event)

    if courses and not product_events:
        # Every listed session is expired: still publish a price, taken from
        # the last listed course. This used to rely on the loop variable
        # leaking out of the for statement.
        event = ProductEvent()
        event['price_display_float'] = courses[-1]['priceInCents'] / 100.0
        event['price_currency'] = 'USD'
        product_events.append(event)

    if product_events:
        output['product_events'] = product_events
    return output
def create_course(self, id):
    """Assemble one Pluralsight product from its cached API responses.

    ``pluralsight.courses_caches[id]`` holds the responses fetched for the
    course. Each one is dispatched on its URL:

    * ``data/course/content``       -> table of contents
    * ``data/course/authors``       -> authors
    * ``data/course/relationships`` -> keywords, from ``tags``
    * anything else                 -> main course document (name,
      description, level, dates, duration, rating)

    Args:
        id: Course slug; used both as cache key and in the product URL.

    Returns:
        A product from ``balloon_spider.create_new_product()``.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    product = balloon_spider.create_new_product()
    responses = pluralsight.courses_caches[id]
    product['product_url'] = urljoin(pluralsight.base_url, 'courses/%s' % (id))
    product['price_display_float'] = float_to_string(pluralsight.price)
    product['price_currency'] = 'USD'

    for r in responses:
        json_data = json.loads(r.body)
        if 'data/course/content' in r.url:
            # example see : http://www.pluralsight.com/data/course/content/clickonce-deployment-fundamentals
            product['toc'] = json_data
        elif 'data/course/authors' in r.url:
            # see http://www.pluralsight.com/data/course/authors/clickonce-deployment-fundamentals
            authors = []
            # Named author_data so the cached-responses variable is not
            # shadowed, as the previous inner "data" loop variable did.
            for author_data in json_data:
                author = Author()
                author['name'] = author_data['fullName']
                # urljoin takes (base, url, allow_fragments). The handle was
                # passed as the third argument, so it was treated as the
                # allow_fragments flag and never reached the link.
                author['link'] = urljoin(pluralsight.base_url,
                                         'author/%s' % author_data['authorHandle'])
                author['bio'] = author_data['longBio']
                # NOTE(review): this removes every "//", presumably to strip
                # a protocol-relative prefix; it would also mangle a URL that
                # carries an explicit scheme -- confirm the input format.
                author['image'] = author_data['largeImageUrl'].replace('//', '')
                authors.append(author)
            product['authors'] = authors
        elif 'data/course/relationships' in r.url:
            product['prod_keywords'] = [tag['name'] for tag in json_data['tags']]
        else:
            # see http://www.pluralsight.com/data/course/clickonce-deployment-fundamentals
            product['name'] = json_data['title']
            product['description'] = json_data['description']
            product['short_desc'] = json_data['shortDescription']
            product['difficulty'] = json_data['level']
            product['published_date'] = convert_date(json_data['releaseDate'])
            product['duration_filter'] = duration_filter(json_data['duration'])
            product['duration_display'] = pluralsight.duration_display(json_data['duration'])

            rating_data = json_data['courseRating']
            rating = ProductRating()
            rating['pub_status'] = 'L'
            # The rating is published as a whole number in string form.
            rating['overall_rating'] = str(int(round(rating_data['rating'])))
            rating['rating_only'] = '1'
            product['ProductRating'] = rating
    return product
def parse_courses(cls, response):
    """Turn the Udacity catalog JSON into products and schedule price requests.

    Every course found under ``courses`` in the response body (capped at
    ``balloon_spider.limit`` unless that is -1) is mapped to a product. The
    product is stored in ``udacity.output_cache`` under its URL and a request
    for that URL is yielded with ``udacity.parse_price`` as callback.

    Args:
        response: Response whose ``body`` is the catalog JSON document.

    Yields:
        One ``Request`` per course, carrying the cache key in ``meta['key']``.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    catalog = json.loads(response.body)['courses']
    if balloon_spider.limit != -1:
        catalog = catalog[:balloon_spider.limit]

    for entry in catalog:
        product = balloon_spider.create_new_product()

        # General course information.
        page_url = "https://www.udacity.com/course/" + entry['key']
        product['product_url'] = page_url
        product['name'] = entry['title']
        product['product_image_url'] = entry['image']
        product['product_video_url'] = entry['teaser_video']['youtube_url']
        product['description'] = entry['summary']
        product['difficulty'] = difficulty(entry['level'])

        # The filter receives "<amount><unit>", the display "<amount> <unit>".
        amount = str(entry['expected_duration'])
        unit = entry['expected_duration_unit']
        product['duration_filter'] = duration_filter(amount + unit)
        product['duration_display'] = amount + ' ' + unit

        product['toc'] = entry['syllabus']
        # The same source text feeds both fields.
        product['prerequisites'] = entry['required_knowledge']
        product['requirements'] = entry['required_knowledge']

        teachers = []
        for person in entry['instructors']:
            teacher = Instructor()
            teacher['name'] = person['name']
            teacher['bio'] = person['bio']
            teacher['image'] = person['image']
            teacher['link'] = None
            teachers.append(teacher)
        product['instructors'] = teachers

        # Park the product until the price callback picks it up again.
        udacity.output_cache[page_url] = product
        yield Request(url=page_url, meta={'key': page_url}, callback=udacity.parse_price)
def start(cls, response):
    """Yield one product per Zeitgeist Minds talk.

    The talk listing and every talk detail are fetched through
    ``zeitgeistminds.curl_url``; ``response`` itself is not used. The number
    of talks is capped at ``balloon_spider.limit`` unless that is -1.

    Args:
        response: Unused.

    Yields:
        A product from ``balloon_spider.create_new_product()`` per talk.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    listing_url = "https://www.zeitgeistminds.com/zce/video"

    def fetch_json(url):
        # The JSON document is taken from the second line of the raw reply.
        raw = zeitgeistminds.curl_url(url)
        return json.loads(re.search(".*\n(.*)", raw).group(1))

    talk_ids = [entry["id"] for entry in fetch_json(listing_url)]
    if balloon_spider.limit != -1:
        talk_ids = talk_ids[: balloon_spider.limit]

    for talk_id in talk_ids:
        talk = fetch_json("%s/%s" % (listing_url, talk_id))
        product = balloon_spider.create_new_product()

        url_pieces = [
            "https://www.zeitgeistminds.com/talk/",
            str(get_attr(talk, "id")),
            "/",
            str(get_attr(talk, "slug")),
        ]
        product["product_url"] = "".join(url_pieces)
        product["product_image_url"] = get_attr(talk, "img_url")
        product["name"] = get_attr(talk, "title")
        product["description"] = get_attr(talk, "description")

        video_id = get_attr(talk, "video_id")
        if video_id:
            product["product_video_url"] = "https://www.youtube.com/watch?v=%s" % video_id
        else:
            product["product_video_url"] = None

        product["partner_prod_id"] = get_attr(talk, "id")
        product["duration_filter"] = get_attr(talk, "duration")

        # The date is parsed as "<abbreviated month> <year>" and written back
        # out as "<year> - <month number>".
        talk_date = datetime.strptime(str(get_attr(talk, "date")), "%b %Y")
        product["published_date"] = talk_date.strftime("%Y - %m")
        yield product
def convert_products(self, origin_products):
    """Convert raw evisors webinar records into products.

    A record that cannot be converted is reported with a traceback and
    skipped, so one malformed record does not abort the whole batch.

    Args:
        origin_products: Iterable of records providing ``title``, ``image``,
            ``description``, ``date``, ``url``, ``presenters``, ``price``
            and, optionally, ``freeChapter``.

    Returns:
        List of the products that converted successfully.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    results = []
    for data in origin_products:
        try:
            product = balloon_spider.create_new_product()
            product['name'] = data['title']
            product['product_image_url'] = data['image']['l']

            # The preview video is best effort: a record without a usable
            # free chapter simply leaves product_video_url unset.
            try:
                video_id = data['freeChapter']['videoID']
                if video_id:
                    product['product_video_url'] = urljoin("https://www.youtube.com/embed/", video_id)
                else:
                    product['product_video_url'] = None
            except Exception:
                # Was a bare "except:", which also swallowed
                # KeyboardInterrupt and SystemExit.
                pass

            product['description'] = html_to_text(data['description'])
            product['published_date'] = data['date']['date']
            product['tz'] = data['date']['timezone']
            product['product_url'] = urljoin("https://www.evisors.com/", data['url'])

            instructors = []
            for ins in data['presenters']:
                instructor = Instructor()
                instructor['name'] = ins['name']
                instructor['image'] = ins['avatar']['xl']
                instructor['bio'] = html_to_text(ins['bio'])
                instructor['link'] = urljoin("https://www.evisors.com/", ins['expert']['url'])
                instructors.append(instructor)
            product['instructors'] = instructors

            product['price_currency'] = 'USD'
            product['price_display_float'] = get_price_float(data['price'])
            results.append(product)
        except Exception:
            # Narrowed from a bare "except:" so the crawl can be interrupted.
            traceback.print_exc()
    return results
def parse_course(cls, response):
    """Build a HOW Design University product from a course JSON reply.

    The document's ``courseGroups``, ``courseTabs``, ``profileBlocks``,
    ``users`` and ``courses`` are read. Only the first entry of ``courses``
    is used. A course with a ``courseEndDate`` becomes a dated
    ``ProductEvent`` carrying price and instructor; otherwise price and
    instructor are stored on the product itself.

    Args:
        response: Response whose ``body`` is the course JSON document.

    Returns:
        The populated product, or ``None`` when ``courseGroups`` is empty.
    """
    json_data = json.loads(response.body)
    if not json_data['courseGroups']:
        return None

    from scrapy_balloons.spiders.balloon import balloon_spider

    def to_local(stamp):
        # "<date>T<time>.<fraction>" -> "<date> <time>"
        # NOTE(review): presumably ISO-8601 timestamps -- confirm.
        parts = stamp.split('T')
        return parts[0] + ' ' + parts[1].split('.')[0]

    def to_price(cents):
        # Replaces slicing the decimal string of the amount, which raised
        # IndexError for amounts below 10 cents.
        return float(cents) / 100

    output = balloon_spider.create_new_product()

    # Defined up front so a missing section can no longer leave these
    # unbound (NameError) when the instructor is built further down.
    instructor_name = None
    instructor_bio = None
    instructor_image = None

    # get course_groups
    course_groups = get_attr(json_data, 'courseGroups')
    if course_groups:
        group = course_groups[0]
        # Was "if [asset]:", which is always true, and stored a one-element
        # list; store the plain value, and only when there is one.
        if group['asset']:
            output['product_image_url'] = group['asset']
        # A later non-instructor category used to reset the name to None.
        for category in group['categories'] or []:
            if category['label'] == 'Instructor':
                instructor_name = category['value']

    # get courseTabs
    coursetabs = get_attr(json_data, 'courseTabs')
    if coursetabs:
        if coursetabs[0]['body']:
            output['description'] = html_to_text(coursetabs[0]['body'])
        # NOTE(review): the third tab is only used when more than three
        # tabs exist; kept as found -- confirm the threshold is intended.
        if len(coursetabs) > 3 and coursetabs[2]['body']:
            output['toc'] = coursetabs[2]['body']

    # get profileBlocks
    profileblocks = get_attr(json_data, 'profileBlocks')
    if profileblocks and profileblocks[0]['bio']:
        instructor_bio = html_to_text(profileblocks[0]['bio'])

    # get users: the last user that has an asset provides the image. A later
    # user without one used to reset it to None.
    for user in get_attr(json_data, 'users') or []:
        if user['asset']:
            instructor_image = user['asset']

    # get courses
    courses = get_attr(json_data, 'courses')
    if courses:
        course = courses[0]
        output['product_url'] = urljoin('https://www.howdesignuniversity.com/courses/', course['slug'])
        output['name'] = course['title']

        instructor = Instructor()
        instructor['name'] = instructor_name
        instructor['bio'] = instructor_bio
        instructor['image'] = instructor_image
        instructor['link'] = None
        instructors = [instructor]

        product_events = []
        if course['courseEndDate']:
            event = ProductEvent()
            event['language'] = 'eng'
            # Start date falls back to the enrollment start.
            start = course['courseStartDate'] or course['enrollmentStartDate']
            if start:
                event['start_date_local'] = to_local(start)
            # courseEndDate is known to be set in this branch, so the former
            # enrollmentEndDate fallback could never run and was removed.
            event['end_date_local'] = to_local(course['courseEndDate'])
            if course['priceInCents']:
                event['price_display_float'] = to_price(course['priceInCents'])
            event['instructors'] = instructors
            product_events.append(event)
        else:
            if course['priceInCents']:
                output['price_display_float'] = to_price(course['priceInCents'])
            # An instructor with neither picture nor bio is not published.
            if instructor_image is None and instructor_bio is None:
                output['instructors'] = None
            else:
                output['instructors'] = instructors
        output['product_events'] = product_events
    return output
def parse(cls, response):
    """Build products from the rows of the ``training_table`` schedule.

    Each row selected from the table describes one scheduled session. Rows
    sharing a title are merged into one product that carries one
    ``ProductEvent`` per row.

    Args:
        response: Response for the page containing the table.

    Returns:
        List of products in order of first appearance. A first row that
        cannot be converted is reported with a traceback and skipped.
    """
    import traceback
    from scrapy_balloons.spiders.balloon import balloon_spider

    rows = response.xpath("//table[@class='training_table']//tr[@bgcolor]")

    def events(selector):
        # Build the ProductEvent for one table row.
        event = ProductEvent()
        # Was "'eng'," -- the trailing comma stored the tuple ('eng',).
        event['language'] = 'eng'
        event['price_currency'] = 'EUR'
        price = selector.xpath("./td[7]//text()").re("\\d.*")
        event['price_display_float'] = html_to_text(price[0]) if price else '0'
        location = selector.xpath("./td[5]/text()[1]").extract()[0]
        event['location_display'] = html_to_text(location)
        start_date = selector.xpath(
            "substring-before(concat(./td[1]/text()[2],' ',./td[5]/i/text()),'-')").extract()[0]
        event['start_date_local'] = convert_date(start_date)
        return event

    products_by_name = {}
    results = []
    for row in rows:
        name = row.xpath("./td[@class='trainingtitle']//text()").extract()[0]
        print(name)

        known = products_by_name.get(name)
        if known is not None:
            # Another session of a course already seen. The old code matched
            # by substring and re-wrapped the existing value on every repeat,
            # which nested the list from the third row onwards.
            known['product_events'].append(events(row))
            continue

        try:
            output = balloon_spider.create_new_product()
            output['name'] = name
            description = row.xpath("./td[@class='trainingtitle']/span/@rel-tid").extract()
            output['description'] = ingram_micro.get_desc(description[0])

            # The product mirrors price, date and location of its first
            # session; these were extracted twice with identical expressions.
            event = events(row)
            output['price_display_float'] = event['price_display_float']
            output['start_date_local'] = event['start_date_local']
            output['location_display'] = event['location_display']
            # Always a list, even for a single session.
            output['product_events'] = [event]
            output['product_url'] = "%s#%s" % (response.url, len(results) + 1)
        except Exception:
            # Was a silent bare "except: pass". Still best effort, but the
            # failure is now visible and Ctrl-C is no longer swallowed.
            traceback.print_exc()
        else:
            products_by_name[name] = output
            results.append(output)
    return results
def build_course(cls):
    """Yield one product per English-language Coursera course.

    Courses come from ``coursera.courses_data`` (capped at
    ``balloon_spider.limit`` unless that is -1). Instructors, sessions and
    detail data are looked up through the ``coursera`` helpers. A course
    with sessions gets one ``ProductEvent`` per session; a course without
    any gets a product-level duration derived from its ``workload`` text.

    Yields:
        A product from ``balloon_spider.create_new_product()`` per course
        whose ``primaryLanguages`` contains ``'en'``.
    """
    from scrapy_balloons.spiders.balloon import balloon_spider

    courses_data = coursera.courses_data
    if balloon_spider.limit != -1:
        courses_data = courses_data[:balloon_spider.limit]

    for data in courses_data:
        if 'en' not in data['primaryLanguages']:
            continue

        output = balloon_spider.create_new_product()
        output['name'] = html_to_text(data['name'])
        output['product_url'] = 'https://www.coursera.org/course/' + data['slug']
        output['description'] = html_to_text(data['description'])
        output['product_image_url'] = data['photoUrl']
        duration_week = data['workload']

        # instructors
        ins_course = []
        if 'instructorIds' in data:
            for in_data in coursera.find_instructors(data['instructorIds']):
                instructor = Instructor()
                # Empty name parts are skipped. "%s %s %s" rendered a missing
                # middle name as the literal text "None".
                name_parts = [get_attr(in_data, key)
                              for key in ('firstName', 'middleName', 'lastName')]
                instructor['name'] = ' '.join('%s' % part for part in name_parts if part)
                instructor['bio'] = html_to_text(get_attr(in_data, 'bio'))
                instructor['image'] = get_attr(in_data, 'photo')
                if contains(in_data, 'profileId'):
                    instructor['link'] = 'https://www.coursera.org/instructor/~' + in_data['profileId']
                else:
                    instructor['link'] = None
                ins_course.append(instructor)

        # product events
        course_events = coursera.get_events_by_courseid(data['id'])
        if course_events:
            product_events = []
            for event_data in course_events:
                event = ProductEvent()
                event['language'] = 'eng'
                start_time = "%s %s %s" % (get_attr(event_data, 'startDay'),
                                           get_attr(event_data, 'startMonth'),
                                           get_attr(event_data, 'startYear'))
                event['start_date_local'] = convert_date(start_time)
                if contains(event_data, 'dbEndDate'):
                    # NOTE(review): dbEndDate is presumably epoch
                    # milliseconds, given the division by 1000 -- confirm.
                    event['end_date_local'] = epoch_time_to_date(event_data['dbEndDate'] / 1000)
                event['duration_display'] = event_data['durationString']
                event['duration_filter'] = duration_filter(event_data['durationString'])
                product_events.append(event)
            output['product_events'] = product_events
        else:
            # No sessions: derive the duration from the workload text.
            display = duration_week.encode('ascii', 'ignore')
            output['duration_display'] = display
            if 'hours/week' in display:
                if '-' in display:
                    # For a range "a-b hours/week" the lower bound is used.
                    output['duration_filter'] = duration_filter(display.split('-')[0] + ' hours')
                else:
                    output['duration_filter'] = duration_filter(display.split('/')[0])
            elif ' hours ' in display:
                hours_text = display.split(' hours ')[0]
                if '-' in display:
                    output['duration_filter'] = duration_filter(hours_text.split('-')[0] + ' hours')
                else:
                    output['duration_filter'] = duration_filter(hours_text + ' hours')
            # Any other wording leaves duration_filter unset.

        output['instructors'] = ins_course

        courses_detail = get_attr(coursera.courses_detail_data, data['id'])
        if courses_detail:
            output['toc'] = get_attr(courses_detail, 'aboutTheCourse')
            videos = get_attr(courses_detail, 'videos')
            output['product_video_url'] = videos[0]['source'] if videos else None

        output['language'] = 'eng'
        yield output