Beispiel #1
0
    def get_event(cls, response, item):
        """Build a product holding one ProductEvent per class entry on the page.

        The schedule is taken from the text found between ``classCfgs:`` and
        ``,filters`` in the response body and decoded with ``demjson``.

        :param response: response whose ``body`` embeds the ``classCfgs`` data.
        :param item: mapping that supplies ``duration_display``,
            ``duration_filter`` and ``price_display_float``; these values are
            copied unchanged onto every event.
        :returns: a new product; ``product_events`` is set only when at least
            one event was decoded.
        :raises AttributeError: if the ``classCfgs`` marker is not present in
            the body (``re.search`` returns ``None``).
        """
        data = re.search("classCfgs:(.*),filters", response.body, re.S).group(1)
        events = demjson.decode(data)

        from scrapy_balloons.spiders.balloon import balloon_spider

        output = balloon_spider.create_new_product()

        product_events = []
        for event in events:
            prod_event = ProductEvent()
            # The key was previously misspelled 'langauge'; every other event
            # builder in this code base writes 'language'.
            prod_event['language'] = 'eng'
            prod_event['location_display'] = event['location']

            prod_event['start_date_local'] = epoch_time_to_date(event['start'])
            prod_event['end_date_local'] = epoch_time_to_date(event['end'])

            prod_event['duration_display'] = item['duration_display']
            prod_event['duration_filter'] = item['duration_filter']

            prod_event['price_currency'] = 'USD'
            prod_event['price_display_float'] = item['price_display_float']

            product_events.append(prod_event)

        # Assign once after the loop; the field stays untouched when there
        # are no events, exactly as before.
        if product_events:
            output['product_events'] = product_events

        return output
Beispiel #2
0
    def parse_tutorial_info(cls, response):
        """Build a product from the cached detail record of one cmivfx course.

        The course is looked up in ``cmivfx.courses_detail`` using the
        ``course_id`` stored in ``response.meta``; the response body itself
        is not read here.

        :param response: response whose ``meta['course_id']`` names the course.
        :returns: the populated product.
        :raises KeyError: if ``course_id`` is missing from ``response.meta``
            or from ``cmivfx.courses_detail``.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        course_id = response.meta['course_id']
        course_detail = cmivfx.courses_detail[course_id]

        # The feed separates the duration parts with dots (e.g. "03.44.19");
        # convert once to the colon form and reuse it for both fields.
        duration = re.sub('\\.', ':', course_detail['videoDuration'])

        product = balloon_spider.create_new_product()
        product['name'] = course_detail['name']
        product['product_url'] = urljoin('https://cmivfx.com/store/', course_detail['id'])
        product['partner_prod_id'] = course_detail['id']
        product['product_image_url'] = course_detail['image_large']
        product['description'] = course_detail['longDescription']
        # Drop the fixed "Chapter Descriptions" heading text from the markup.
        product['toc'] = re.sub('Chapter Descriptions', '', course_detail['marketingText'])
        product['published_date'] = course_detail['releaseDate']
        product['duration_display'] = duration
        product['duration_filter'] = duration_filter(duration)
        product['price_display_float'] = course_detail['price']
        return product



# Sample payload 1: category listing (JSON array of course summaries)
#[{"id":"584","name":"Nuke-Mari Workflows","price":"49.95","releaseDate":"2014-03-13 00:00:00","image_thumb":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/small\\/match-moving-and-more-in-nuke,-maya-and-mari-thumb.jpg","image_medium":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/medium\\/match-moving-and-more-in-nuke,-maya-and-mari-large.jpg","image_large":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/large\\/match-moving-and-more-in-nuke,-maya-and-mari-large.jpg","shortDescription":"Salah Soltane teaches you match moving in Nuke, modeling in Maya, painting in Mari, and even final c...","category":"3D Matchmoving","categoryId":"20"},{"id":"508","name":"Mocha Pro Advanced","price":"19.95","releaseDate":"2012-11-25 00:00:00","image_thumb":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/small\\/1353861765_Master_icon.jpg","image_medium":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/medium\\/1353861765_Master.jpg","image_large":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/large\\/1353861765_Master.jpg","shortDescription":"Brilliant video for advanced match moving scenarios otherwise impossible in other apps. 
MochaPro dem...","category":"3D Matchmoving","categoryId":"20"},{"id":"309","name":"Motion Control   3D Camera","price":"59.95","releaseDate":"2011-07-05 00:00:00","image_thumb":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/small\\/1309885456_Capturing_04_Master_Icon.jpg","image_medium":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/medium\\/1309885456_Capturing_04_Master_Image.jpg","image_large":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/large\\/1309885456_Capturing_04_Master_Image.png","shortDescription":"Learn how to use Motion Control camera data, from entry level skills to more advanced data extractio...","category":"3D Matchmoving","categoryId":"20"},{"id":"301","name":"Camera Based Motion Capture","price":"59.95","releaseDate":"2011-05-26 00:00:00","image_thumb":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/small\\/1306416932_Master_Thumb.jpg","image_medium":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/medium\\/1306416932_Master.jpg","image_large":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/large\\/1306416932_Master.jpg","shortDescription":"This video shows Motion Capture Tracking with Matchmover and 3DSMax Character Studio. However, it ca...","category":"3D Matchmoving","categoryId":"20"},{"id":"242","name":"Complete Syntheyes Training","price":"49.95","releaseDate":"2011-02-15 00:00:00","image_thumb":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/small\\/242-Thumb_Large.jpg","image_medium":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/medium\\/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg","image_large":"https:\\/\\/cmivfx.com\\/img\\/tutorial\\/large\\/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg","shortDescription":"This is the most complete Syntheyes training on the market, and you better believe its a cmiVFX vide...","category":"3D Matchmoving","categoryId":"20"}]

# Sample payload 2: course detail (JSON object for a single course)
# {"marketingText":"<h3>Chapter Descriptions<\/h3>\n<h4>Introduction<\/h4>\n<p>\nFor starters we&#39;ll use a simple shot and do the most obvious thing when you first open Syntheyes: A completely automatic track. In a matter of literally seconds you&#39;ll have a pretty usable result, that will quickly improve by using some simple clean-up techniques. Orient the scene in 3d-space and insert 3D-objects into your scene to test if everything works, export your camera-solution and voil\u00e1 - you have a decent matchmove in just a few minutes.\n<\/p>\t\n<h4>Solving a shaky handheld shot<\/h4>\n<p>\nLife not often makes it easy for you, and so you&#39;ll often be confronted with things like motion-blur, camera-shake and lens-distortion. So for the more difficult shots we&#39;ll have to bring the big guns in. Automatic tracker cleanup, manual control over each tracker-curve in the graphs-editor and fine-tune trackers can really improve your camera solution. To solve fast and shaky hand-held footage with lots of motion-blur it&#39;s best to use supervised tracking to give you the most control over each tracker. If you&#39;re footage was shot with a lens that has barrel-distortion then you have another problem that can make things more complicated. Tracks will have lots of errors, and 3D objects will not fit to the footage not matter what. Luckily Syntheyes has a clever way to get rid of lens-distortion. Or even to re-distort the CGI, if the director wants to work with the original footage later in compositing.\n<\/p>\n<h4>Modeling with Syntheyes<\/h4>\n<p>\nWith the tracking-features and resulting 3d-points you can not only define the camera, but also re-model the scene, just use the points to define a mesh! And if there are not enough features for you to re-create the scene, insert zero weighted trackers (ZWT) that don&#39;t affect the camera solution but will position themselves in the correct place in your 3d environment. 
And if you need more organic shapes to be rebuilt, add thousands of 3d-points all at once to build your mesh. You can even texture that mesh by using camera-projection right within Syntheyes, to test if camera-mapping will work later in your 3d- or compositing-application.\n<\/p>\t\n<h4>Solving Tripod shots - with 3d environment?<\/h4>\n<p>\nThe common problem with tripod shots is that it is impossible to get any usable 3d information out of just a camera-pan. The solution of the camera movement might be good, still it can be hard to position it in a reasonable and exact way in 3d-space. That problem can quickly be solved with Syntheye&#39;s lens-tools. Use any straight perspective lines in one frame of your tripod-shot to setup a coordinate system that let&#39;s you position objects in your scene more easily - that makes life easier for the 3d-artist! You can even use that method on still images.\n<\/p>\t\n<h4>A moving camera - and a moving object!<\/h4>\n<p>\nAlthough primarily a camera-tracker, Syntheyes does a really good job at object-tracking. The last chapter of this tutorial shows you how to track camera and objects in one scene altogether, how to setup their coordinate systems and how to orient them in 3d-space. You can really do amazing things with this great software. Make sure you have some time and your favorite compositing and 3d application at hand! You&#39;ll want to try out all these goodies immediately!\n<\/p>\t\n<h4>About the Author<\/h4>\n<p>\nSebastian K\u00f6nig is a German 3D-artist who is working as a freelancer and CG-instructor for several years now. 
During his studies for Education of Art he discovered the joy of modeling and creating 3D-Animations with Blender and hasn&#39;t stopped since.\u00a0 Being a passionate Blender-User he has been teaching Blender at the University of Art and Design Halle\/Germany.\u00a0 He has been working for various studios and companies as a 3D-Artist and freelancer.\u00a0 During the dozens of projects and jobs he completed with Blender he got a profound knowledge of almost every aspect of this great Open-Source 3D-application. Since 2010 he is an official Blender Foundation Certified Trainer.\n<\/p>","longDescription":"Syntheyes, available at http:\/\/www.ssontech.com, is one of the fastest, feature-rich and yet surprisingly inexpensive camera-trackers out there. This tutorial not only gets you started with basic and advanced tracking, but will also dig deeper into the vast feature tool-set of Syntheyes that goes way beyond simple tracking.","activeVideo":"1","activeStore":"1","id":"242","name":"Complete Syntheyes Training","price":"49.95","releaseDate":"2011-02-15 00:00:00","image_thumb":"https:\/\/cmivfx.com\/img\/tutorial\/small\/242-Thumb_Large.jpg","image_medium":"https:\/\/cmivfx.com\/img\/tutorial\/medium\/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg","image_large":"https:\/\/cmivfx.com\/img\/tutorial\/large\/42f216de-0c66-4cc1-b699-1ab601b1d0c4.jpg","shortDescription":"This is the most complete Syntheyes training on the market, and you better believe its a cmiVFX vide...","category":"3D Matchmoving","categoryId":"20","videoId":"127","videoDuration":"03.44.19","chapters":[{"id":"474","videoId":"127","name":"Chapter 01","description":"Introduction","timeCode":"12.9166666667"},{"id":"475","videoId":"127","name":"Chapter 02","description":"First Clip","timeCode":"246.833333333"},{"id":"476","videoId":"127","name":"Chapter 03","description":"Load First Shot","timeCode":"360.333333333"},{"id":"477","videoId":"127","name":"Chapter 04","description":"Your First 
Track","timeCode":"545.75"},{"id":"478","videoId":"127","name":"Chapter 05","description":"Examining The Results","timeCode":"979.333333333"},{"id":"479","videoId":"127","name":"Chapter 06","description":"The RMS Error","timeCode":"1045.58333333"},{"id":"480","videoId":"127","name":"Chapter 07","description":"Search Bad Trackers","timeCode":"1206.58333333"},{"id":"481","videoId":"127","name":"Chapter 08","description":"Clean Up Trackers","timeCode":"1550.83333333"},{"id":"482","videoId":"127","name":"Chapter 09","description":"Test Geometry","timeCode":"2027.83333333"},{"id":"483","videoId":"127","name":"Chapter 10","description":"Export Scene","timeCode":"2512.5"},{"id":"484","videoId":"127","name":"Chapter 11","description":"Open New Shot","timeCode":"2647.66666667"},{"id":"485","videoId":"127","name":"Chapter 12","description":"Graph Editor","timeCode":"3112.58333333"},{"id":"486","videoId":"127","name":"Chapter 13","description":"Solving The Scene","timeCode":"3670.58333333"},{"id":"487","videoId":"127","name":"Chapter 14","description":"Lens Distortion","timeCode":"4075.58333333"},{"id":"488","videoId":"127","name":"Chapter 15","description":"Image Preprocessing","timeCode":"5002.66666667"},{"id":"489","videoId":"127","name":"Chapter 16","description":"Manual Tracking Process","timeCode":"5248"},{"id":"490","videoId":"127","name":"Chapter 17","description":"Constraints","timeCode":"7323.83333333"},{"id":"491","videoId":"127","name":"Chapter 18","description":"Saving Sequences","timeCode":"8880.33333333"},{"id":"492","videoId":"127","name":"Chapter 19","description":"Stabilization","timeCode":"9130"},{"id":"493","videoId":"127","name":"Chapter 20","description":"Nodal Pans","timeCode":"10020"},{"id":"494","videoId":"127","name":"Chapter 21","description":"Object 
Tracking","timeCode":"11112.5833333"}],"images":[{"id":"216","type":"IMAGE","url":"https:\/\/cmivfx.com\/img\/tutorial\/media\/16f40474-64d9-46b2-832c-f567192c3ebb.jpg"},{"id":"217","type":"IMAGE","url":"https:\/\/cmivfx.com\/img\/tutorial\/media\/85ab0db9-3448-493e-867a-1baa0f492304.jpg"},{"id":"218","type":"IMAGE","url":"https:\/\/cmivfx.com\/img\/tutorial\/media\/d489dec8-4fe8-4ea2-9985-04cccdce06ae.jpg"},{"id":"219","type":"IMAGE","url":"https:\/\/cmivfx.com\/img\/tutorial\/media\/3af72062-37e1-429d-b350-4d2d20acd377.jpg"},{"id":"220","type":"IMAGE","url":"https:\/\/cmivfx.com\/img\/tutorial\/media\/da10c679-b0a1-4b3e-9577-8b688d0a6b24.jpg"}]}
Beispiel #3
0
    def get_info_course(cls, response):
        """Turn a knowledgecity course JSON response into a product.

        :param response: response whose ``body`` is a JSON object with the
            keys ``id``, ``title``, ``description``, ``chapters`` and ``trt``.
        :returns: the populated product.
        """
        course = json.loads(response.body)

        from scrapy_balloons.spiders.balloon import balloon_spider

        product = balloon_spider.create_new_product()

        # URL layout: "<site>/<id>-<title>", with every space replaced by a dash.
        raw_url = "http://www.knowledgecity.com/%s" % course["id"] + "-" + course["title"]
        product["product_url"] = raw_url.replace(" ", "-")

        product["name"] = course["title"]
        product["product_type_id"] = course["id"]
        product["description"] = course["description"]
        product["toc"] = course["chapters"]

        running_time = course["trt"]
        product["duration_display"] = running_time
        product["duration_filter"] = duration_filter(running_time)

        # NOTE: the payload's 'author' value is not mapped to
        # product['authors']; the code that did so had been commented out.

        return product
Beispiel #4
0
 def mapping_with_course(self, data):
     """Map one course record onto a new product.

     Values are read from ``data`` through ``get_attr``. ``difficulty`` is
     the first whitespace-separated word of ``level`` and ``prod_keywords``
     is ``tags`` split on commas; both are ``None`` when the source value
     is falsy.

     :param data: course record understood by ``get_attr``.
     :returns: the populated product.
     """
     from scrapy_balloons.spiders.balloon import balloon_spider

     course = balloon_spider.create_new_product()
     course['name'] = get_attr(data, 'title')
     course['description'] = get_attr(data, 'description')

     level = get_attr(data, 'level')
     if level:
         course['difficulty'] = level.split()[0]
     else:
         course['difficulty'] = None

     course['partner_prod_id'] = get_attr(data, 'id')
     course['published_date'] = convert_date(get_attr(data, 'pub_date'))
     course['price_display_float'] = get_price_float(get_attr(data, 'price'))
     course['product_url'] = get_attr(data, 'url')
     course['product_image_url'] = get_attr(data, 'image_url')

     tags = get_attr(data, 'tags')
     if tags:
         course['prod_keywords'] = tags.split(',')
     else:
         course['prod_keywords'] = None

     # NOTE(review): this overwrites the price parsed above, so the product
     # always ends up with '0' - confirm which of the two is intended.
     course['price_display_float'] = '0'
     return course
    def convert_products(self, origin_products):
        """Convert raw elance video records into products.

        Conversion is best effort: a record that raises while being mapped
        has its traceback printed and is left out of the result.

        :param origin_products: iterable of records providing ``title``,
            ``video_thumbnail``, ``body[0]['value']``, ``url`` and
            ``video_duration['hms_labeled']``.
        :returns: list of the products that were converted successfully.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        results = []
        for data in origin_products:
            try:
                product = balloon_spider.create_new_product()
                product['name'] = data['title']
                product['product_image_url'] = data['video_thumbnail']
                product['description'] = html_to_text(data['body'][0]['value'])
                product['product_url'] = urljoin("https://www.elance.com/", data['url'])
                duration = data['video_duration']['hms_labeled']
                product['duration_display'] = duration
                product['duration_filter'] = duration_filter(duration)
                results.append(product)
            except Exception:
                # Was a bare ``except:``, which also swallowed
                # KeyboardInterrupt and SystemExit.
                traceback.print_exc()
        return results
    def parse_course(cls, response):
        """Build a product from a writersonlineworkshops course JSON response.

        The first ``courseGroups`` entry supplies image, name and URL. The
        ``courseTabs`` entries are matched by label to fill ``description``,
        ``audience`` and ``toc``. Every entry of ``courses`` whose start date
        is not expired (per ``is_expired``) becomes a ProductEvent. If none
        qualifies but courses exist, a single price-only event is created
        from the last course.

        :param response: response whose ``body`` is the course JSON.
        :returns: the populated product; ``product_events`` is set only when
            at least one event was created.
        """
        # Debug trace of the URL being parsed (form valid on Python 2 and 3).
        print(response.url)
        json_data = json.loads(response.body)
        from scrapy_balloons.spiders.balloon import balloon_spider

        output = balloon_spider.create_new_product()

        info = get_attr(json_data, 'courseGroups')[0]
        output['product_image_url'] = info['asset']
        output['name'] = info['title']
        output['product_url'] = urljoin('https://www.writersonlineworkshops.com/courses/', info['slug'])

        for tab in get_attr(json_data, 'courseTabs'):
            label = tab['label']
            if 'Overview' in label:
                output['description'] = html_to_text(tab['body'])
            elif 'Who Should Take' in label:
                output['audience'] = html_to_text(tab['body'])
            elif 'Course Outline' in label:
                # The outline is stored as-is, without html_to_text.
                output['toc'] = tab['body']

        courses = get_attr(json_data, 'courses')
        product_events = []
        for item in courses:
            if is_expired(item['courseStartDate']):
                continue
            event = ProductEvent()
            event['language'] = 'eng'
            event['start_date_local'] = convert_date(item['courseStartDate'])
            event['end_date_local'] = convert_date(item['courseEndDate'])
            # Divide by a float: under Python 2, ``cents / 100`` on ints
            # truncated the price to whole units (4995 -> 49).
            event['price_display_float'] = item['priceInCents'] / 100.0
            product_events.append(event)

        if not product_events and len(courses) > 0:
            # No upcoming session: still expose a price, taken from the last
            # course (previously read through the leaked loop variable).
            last_course = courses[-1]
            event = ProductEvent()
            event['price_display_float'] = last_course['priceInCents'] / 100.0
            event['price_currency'] = 'USD'
            product_events.append(event)

        if product_events:
            output['product_events'] = product_events
        return output
Beispiel #7
0
 def create_course(self, id):
     """Assemble a pluralsight product from the cached responses of a course.

     ``pluralsight.courses_caches[id]`` holds the responses fetched for the
     course. Each one is dispatched on its URL: table of contents, authors,
     relationships (tags), and anything else is treated as the main course
     record.

     :param id: course identifier; key into ``pluralsight.courses_caches``
         and last segment of the product URL.
     :returns: the populated product.
     """
     from scrapy_balloons.spiders.balloon import balloon_spider
     product = balloon_spider.create_new_product()
     responses = pluralsight.courses_caches[id]
     product['product_url'] = urljoin(pluralsight.base_url, 'courses/%s' % (id))
     product['price_display_float'] = float_to_string(pluralsight.price)
     product['price_currency'] = 'USD'
     for r in responses:
         json_data = json.loads(r.body)
         if 'data/course/content' in r.url:
             # example see : http://www.pluralsight.com/data/course/content/clickonce-deployment-fundamentals
             product['toc'] = json_data
         elif 'data/course/authors' in r.url:
             # see http://www.pluralsight.com/data/course/authors/clickonce-deployment-fundamentals
             authors = []
             for author_data in json_data:
                 author = Author()
                 author['name'] = author_data['fullName']
                 # urljoin's third positional argument is ``allow_fragments``,
                 # so the handle must be part of the relative path; passing
                 # it separately produced a link without the handle.
                 author['link'] = urljoin(pluralsight.base_url, 'author/%s' % author_data['authorHandle'])
                 author['bio'] = author_data['longBio']
                 # NOTE(review): removes every '//' from the image URL,
                 # presumably to strip a protocol-relative prefix - confirm.
                 author['image'] = author_data['largeImageUrl'].replace('//', '')
                 authors.append(author)
             product['authors'] = authors
         elif 'data/course/relationships' in r.url:
             product['prod_keywords'] = [tag['name'] for tag in json_data['tags']]
         else:
             # see http://www.pluralsight.com/data/course/clickonce-deployment-fundamentals
             product['name'] = json_data['title']
             product['description'] = json_data['description']
             product['short_desc'] = json_data['shortDescription']
             product['difficulty'] = json_data['level']
             product['published_date'] = convert_date(json_data['releaseDate'])
             product['duration_filter'] = duration_filter(json_data['duration'])
             product['duration_display'] = pluralsight.duration_display(json_data['duration'])
             rating_data = json_data['courseRating']
             rating = ProductRating()
             rating['pub_status'] = 'L'
             # Rating is rounded to the nearest integer and stored as text.
             rating['overall_rating'] = str(int(round(rating_data['rating'])))
             rating['rating_only'] = '1'
             product['ProductRating'] = rating
     return product
Beispiel #8
0
    def parse_courses(cls, response):
        """Yield one price-page Request per udacity course in the listing.

        Each course in the ``courses`` array of the JSON body is mapped to a
        product. The product is stored in ``udacity.output_cache`` under its
        product URL, and a Request for that URL is yielded with
        ``udacity.parse_price`` as callback. ``balloon_spider.limit`` caps
        the number of courses unless it is ``-1``.

        :param response: response whose ``body`` is the course listing JSON.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        courses = json.loads(response.body)['courses']
        if balloon_spider.limit != -1:
            courses = courses[:balloon_spider.limit]

        for course in courses:
            product = balloon_spider.create_new_product()

            page_url = "https://www.udacity.com/course/" + course['key']
            product['product_url'] = page_url
            product['name'] = course['title']
            product['product_image_url'] = course['image']
            product['product_video_url'] = course['teaser_video']['youtube_url']
            product['description'] = course['summary']
            product['difficulty'] = difficulty(course['level'])

            amount = str(course['expected_duration'])
            unit = course['expected_duration_unit']
            product['duration_filter'] = duration_filter(amount + unit)
            product['duration_display'] = amount + ' ' + unit

            product['toc'] = course['syllabus']
            # Both fields are filled from the same source value.
            product['prerequisites'] = course['required_knowledge']
            product['requirements'] = course['required_knowledge']

            teachers = []
            for person in course['instructors']:
                teacher = Instructor()
                teacher['name'] = person['name']
                teacher['bio'] = person['bio']
                teacher['image'] = person['image']
                teacher['link'] = None
                teachers.append(teacher)
            product['instructors'] = teachers

            # Cache the product so the price callback can find it by URL.
            udacity.output_cache[page_url] = product
            yield Request(url=page_url, meta={'key': page_url}, callback=udacity.parse_price)
Beispiel #9
0
    def start(cls, response):
        """Yield one product per zeitgeistminds talk.

        The listing endpoint is fetched through ``zeitgeistminds.curl_url``.
        The JSON document is taken from the line following the first newline
        of each reply. The detail endpoint is then fetched for each id,
        capped by ``balloon_spider.limit`` unless it is ``-1``.

        :param response: not read by this method.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        all_course_url = "https://www.zeitgeistminds.com/zce/video"

        def load_payload(url):
            # The JSON sits on the line after the first newline of the reply.
            raw = zeitgeistminds.curl_url(url)
            return json.loads(re.search(".*\n(.*)", raw).group(1))

        course_ids = [entry["id"] for entry in load_payload(all_course_url)]
        if balloon_spider.limit != -1:
            course_ids = course_ids[: balloon_spider.limit]

        for course_id in course_ids:
            detail = load_payload("%s/%s" % (all_course_url, course_id))

            talk = balloon_spider.create_new_product()
            talk["product_url"] = "https://www.zeitgeistminds.com/talk/" + str(
                get_attr(detail, "id")
            ) + "/" + str(get_attr(detail, "slug"))
            talk["product_image_url"] = get_attr(detail, "img_url")
            talk["name"] = get_attr(detail, "title")
            talk["description"] = get_attr(detail, "description")

            video_id = get_attr(detail, "video_id")
            if video_id:
                talk["product_video_url"] = "https://www.youtube.com/watch?v=%s" % video_id
            else:
                talk["product_video_url"] = None

            talk["partner_prod_id"] = get_attr(detail, "id")
            talk["duration_filter"] = get_attr(detail, "duration")

            # The source date is parsed as "<abbreviated month> <year>" and
            # re-emitted as "<year> - <month number>".
            published = datetime.strptime(str(get_attr(detail, "date")), "%b %Y")
            talk["published_date"] = published.strftime("%Y - %m")
            yield talk
Beispiel #10
0
    def convert_products(self, origin_products):
        """Convert raw evisors records into products.

        Conversion is best effort: a record that raises while being mapped
        has its traceback printed and is left out of the result. The video
        URL is optional; any failure while deriving it leaves
        ``product_video_url`` untouched.

        :param origin_products: iterable of records providing ``title``,
            ``image['l']``, ``description``, ``date``, ``url``,
            ``presenters`` and ``price`` (``freeChapter`` is optional).
        :returns: list of the products that were converted successfully.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        results = []
        for data in origin_products:
            try:
                product = balloon_spider.create_new_product()
                product['name'] = data['title']
                product['product_image_url'] = data['image']['l']
                try:
                    video_id = data['freeChapter']['videoID']
                    if video_id:
                        product['product_video_url'] = urljoin("https://www.youtube.com/embed/", video_id)
                    else:
                        product['product_video_url'] = None
                except Exception:
                    # Deliberate best effort: a record without a usable free
                    # chapter keeps whatever default the product already has.
                    # (Was a bare ``except:``, which also hid
                    # KeyboardInterrupt and SystemExit.)
                    pass
                product['description'] = html_to_text(data['description'])
                product['published_date'] = data['date']['date']
                product['tz'] = data['date']['timezone']
                product['product_url'] = urljoin("https://www.evisors.com/", data['url'])
                instructors = []
                for ins in data['presenters']:
                    instructor = Instructor()
                    instructor['name'] = ins['name']
                    instructor['image'] = ins['avatar']['xl']
                    instructor['bio'] = html_to_text(ins['bio'])
                    instructor['link'] = urljoin("https://www.evisors.com/", ins['expert']['url'])
                    instructors.append(instructor)
                product['instructors'] = instructors
                product['price_currency'] = 'USD'
                product['price_display_float'] = get_price_float(data['price'])
                results.append(product)
            except Exception:
                # Skip the broken record but keep the traceback visible.
                traceback.print_exc()
        return results
    def parse_course(cls, response):
        """Build a product from a howdesignuniversity course JSON response.

        Reads ``courseGroups`` (image, instructor name), ``courseTabs``
        (description, toc), ``profileBlocks`` (instructor bio), ``users``
        (instructor image) and the first entry of ``courses`` (URL, name,
        dates, price). When the first course has a ``courseEndDate`` the
        dates, price and instructor go on a single ProductEvent; otherwise
        price and instructor are stored on the product itself.

        :param response: response whose ``body`` is the course JSON.
        :returns: the populated product, or ``None`` when ``courseGroups``
            is empty.
        """

        def _to_local(stamp):
            # "<date>T<time>.<fraction>" -> "<date> <time>"
            parts = stamp.split('T')
            return parts[0] + ' ' + parts[1].split('.')[0]

        def _cents_to_float(cents):
            # Arithmetic instead of string slicing: the slicing version
            # raised IndexError for amounts below 10 cents.
            return float(cents) / 100

        json_data = json.loads(response.body)
        if not json_data['courseGroups']:
            return None

        from scrapy_balloons.spiders.balloon import balloon_spider

        output = balloon_spider.create_new_product()

        # Instructor details default to None so that a missing section can
        # no longer leave these names unbound further down.
        instructor_name = None
        instructor_bio = None
        instructor_image = None

        course_groups = get_attr(json_data, 'courseGroups')
        if course_groups:
            first_group = course_groups[0]

            # The old test wrapped the value in a list, which is always
            # truthy, and stored that list; store the plain value, as the
            # other spiders do for product_image_url.
            asset = first_group['asset']
            if asset:
                output['product_image_url'] = asset

            # Stop at the first 'Instructor' category so that later
            # categories cannot reset the name to None.
            for category in first_group['categories'] or []:
                if category['label'] == 'Instructor':
                    instructor_name = category['value']
                    break

        coursetabs = get_attr(json_data, 'courseTabs')
        if coursetabs:
            if coursetabs[0]['body']:
                output['description'] = html_to_text(coursetabs[0]['body'])
            # NOTE(review): the toc is the third tab but is only read when
            # there are more than three tabs - kept as in the original.
            if len(coursetabs) > 3 and coursetabs[2]['body']:
                output['toc'] = coursetabs[2]['body']

        profileblocks = get_attr(json_data, 'profileBlocks')
        if profileblocks and profileblocks[0]['bio']:
            instructor_bio = html_to_text(profileblocks[0]['bio'])

        # The last user that has an asset provides the image; users without
        # one no longer wipe a value found earlier.
        for user in get_attr(json_data, 'users') or []:
            if user['asset']:
                instructor_image = user['asset']

        courses = get_attr(json_data, 'courses')
        if courses:
            course = courses[0]
            output['product_url'] = urljoin('https://www.howdesignuniversity.com/courses/', course['slug'])
            output['name'] = course['title']

            instructor = Instructor()
            instructor['name'] = instructor_name
            instructor['bio'] = instructor_bio
            instructor['image'] = instructor_image
            instructor['link'] = None
            instructors = [instructor]

            product_events = []
            price_cents = course['priceInCents']

            if course['courseEndDate']:
                event = ProductEvent()
                event['language'] = 'eng'

                # Prefer the course start date, fall back to enrollment start.
                start = course['courseStartDate'] or course['enrollmentStartDate']
                if start:
                    event['start_date_local'] = _to_local(start)

                # courseEndDate is known to be set in this branch, so the
                # former enrollmentEndDate fallback could never run.
                event['end_date_local'] = _to_local(course['courseEndDate'])

                if price_cents:
                    event['price_display_float'] = _cents_to_float(price_cents)

                event['instructors'] = instructors
                product_events.append(event)
            else:
                if price_cents:
                    output['price_display_float'] = _cents_to_float(price_cents)

                if instructor_image is None and instructor_bio is None:
                    output['instructors'] = None
                else:
                    output['instructors'] = instructors

            output['product_events'] = product_events

        return output
Beispiel #12
0
    def parse(cls, response):
        """Build one product per distinct training title in the schedule table.

        Every ``<tr bgcolor=...>`` row of the ``training_table`` table is read
        as one scheduled session. The first row seen for a title creates the
        product (name, description, price, start date, location, URL); every
        later row carrying exactly the same title only contributes another
        event to that product.

        Args:
            response: response object for the schedule page; only ``xpath()``
                and ``url`` are used here.

        Returns:
            list: the products in first-seen order. ``product_events`` on each
            product is always a flat list of ``ProductEvent`` items.

        Raises:
            IndexError: if a row has no ``trainingtitle`` text, or if a row
                that repeats a known title lacks the location/date cells.
        """
        import logging

        from scrapy_balloons.spiders.balloon import balloon_spider

        logger = logging.getLogger(__name__)
        rows = response.xpath("//table[@class='training_table']//tr[@bgcolor]")

        def events(selector):
            """Build the ProductEvent described by a single table row."""
            event = ProductEvent()
            # Plain string on purpose: a trailing comma here would store the
            # one-element tuple ('eng',) instead.
            event['language'] = 'eng'
            event['price_currency'] = 'EUR'

            price = selector.xpath("./td[7]//text()").re("\\d.*")
            event['price_display_float'] = html_to_text(price[0]) if price else '0'

            location = selector.xpath("./td[5]/text()[1]").extract()[0]
            event['location_display'] = html_to_text(location)

            start_date = selector.xpath(
                "substring-before(concat(./td[1]/text()[2],' ',./td[5]/i/text()),'-')").extract()[0]
            event['start_date_local'] = convert_date(start_date)

            return event

        # Exact-title lookup of the products created so far. A name is only
        # registered once its product was built successfully, so a failed
        # first row lets the next row with the same title try again.
        products_by_name = {}
        results = []

        for row in rows:
            name = row.xpath("./td[@class='trainingtitle']//text()").extract()[0]

            known_product = products_by_name.get(name)
            if known_product is not None:
                # Repeated title: extend the existing flat list of events
                # instead of wrapping the previous value in a new list.
                known_product['product_events'].append(events(row))
                continue

            try:
                output = balloon_spider.create_new_product()

                output['name'] = name

                description = row.xpath("./td[@class='trainingtitle']/span/@rel-tid").extract()
                output['description'] = ingram_micro.get_desc(description[0])

                price = row.xpath("./td[7]//text()").re("\\d.*")
                output['price_display_float'] = html_to_text(price[0]) if price else '0'

                start_date = row.xpath(
                    "substring-before(concat(./td[1]/text()[2],' ',./td[5]/i/text()),'-')").extract()[0]
                output['start_date_local'] = convert_date(start_date)

                location = row.xpath("./td[5]/text()[1]").extract()[0]
                output['location_display'] = html_to_text(location)
                output['product_events'] = [events(row)]

                output['product_url'] = "%s#%s" % (response.url, len(results) + 1)
            except Exception:
                # Best effort: a malformed row is skipped, but the failure is
                # logged rather than silently discarded.
                logger.exception("Skipping training row %r: could not build product", name)
                continue

            products_by_name[name] = output
            results.append(output)

        return results
Beispiel #13
0
    def build_course(cls):
        """Yield one product for every course whose languages include 'en'.

        Courses are read from ``coursera.courses_data``, truncated to the
        first ``balloon_spider.limit`` entries unless the limit is -1.

        For each course:
          * instructors are looked up through ``coursera.find_instructors``
            when the course has ``instructorIds``;
          * if ``coursera.get_events_by_courseid`` returns sessions, one
            ``ProductEvent`` per session is built and the instructors are
            attached to every event;
          * otherwise the duration is derived from the course ``workload``
            text and the instructors are attached to the product itself;
          * ``toc`` and ``product_video_url`` are filled in when an entry for
            the course exists in ``coursera.courses_detail_data``.

        Yields:
            The product created by ``balloon_spider.create_new_product()``
            for each matching course, with ``language`` set to 'eng'.
        """
        from scrapy_balloons.spiders.balloon import balloon_spider

        if balloon_spider.limit == -1:
            courses_data = coursera.courses_data
        else:
            courses_data = coursera.courses_data[:balloon_spider.limit]

        for data in courses_data:
            if 'en' not in data['primaryLanguages']:
                continue

            output = balloon_spider.create_new_product()
            output['name'] = html_to_text(data['name'])
            output['product_url'] = 'https://www.coursera.org/course/' + data['slug']
            output['description'] = html_to_text(data['description'])
            output['product_image_url'] = data['photoUrl']

            # instructors
            ins_course = []
            if 'instructorIds' in data:
                for in_data in coursera.find_instructors(data['instructorIds']):
                    instructor = Instructor()
                    name_parts = (get_attr(in_data, 'firstName'),
                                  get_attr(in_data, 'middleName'),
                                  get_attr(in_data, 'lastName'))
                    # Join only the parts that are present so a missing middle
                    # name does not leave a placeholder or a double space.
                    instructor['name'] = " ".join(part for part in name_parts if part)
                    instructor['bio'] = html_to_text(get_attr(in_data, 'bio'))
                    instructor['image'] = get_attr(in_data, 'photo')
                    if contains(in_data, 'profileId'):
                        instructor['link'] = 'https://www.coursera.org/instructor/~' + in_data['profileId']
                    else:
                        instructor['link'] = None
                    ins_course.append(instructor)

            # product events
            course_events = coursera.get_events_by_courseid(data['id'])
            if course_events:
                product_events = []
                for event_data in course_events:
                    event = ProductEvent()
                    event['language'] = 'eng'
                    start_time = "%s %s %s" % (get_attr(event_data, 'startDay'),
                                               get_attr(event_data, 'startMonth'),
                                               get_attr(event_data, 'startYear'))
                    event['start_date_local'] = convert_date(start_time)
                    if contains(event_data, 'dbEndDate'):
                        # presumably dbEndDate is in milliseconds, hence the
                        # division before epoch_time_to_date - TODO confirm
                        event['end_date_local'] = epoch_time_to_date(event_data['dbEndDate'] / 1000)
                    event['duration_display'] = event_data['durationString']
                    event['duration_filter'] = duration_filter(event_data['durationString'])
                    # Courses with sessions carry their instructors on the
                    # events; without this they would be dropped entirely.
                    event['instructors'] = ins_course
                    product_events.append(event)
                output['product_events'] = product_events
            else:
                # The workload text is only needed when there are no sessions.
                display = data['workload'].encode('ascii', 'ignore')
                output['duration_display'] = display
                if 'hours/week' in display:
                    if '-' in display:
                        # Range such as "4-6 hours/week": keep the lower bound.
                        output['duration_filter'] = duration_filter(display.split('-')[0] + ' hours')
                    else:
                        output['duration_filter'] = duration_filter(display.split('/')[0])
                elif ' hours ' in display:
                    hours_part = display.split(' hours ')[0]
                    if '-' in display:
                        output['duration_filter'] = duration_filter(hours_part.split('-')[0] + ' hours')
                    else:
                        output['duration_filter'] = duration_filter(hours_part + ' hours')
                output['instructors'] = ins_course

            courses_detail = get_attr(coursera.courses_detail_data, data['id'])
            if courses_detail:
                output['toc'] = get_attr(courses_detail, 'aboutTheCourse')
                videos = get_attr(courses_detail, 'videos')
                output['product_video_url'] = videos[0]['source'] if videos else None

            output['language'] = 'eng'
            yield output