Example #1
0
    def parse(self, response):
        """Yield one item per search result, then follow the next page.

        The text selected by ``self.script_xpath`` is scanned line by line;
        every line starting with ``model`` is decoded as JSON and each entry
        of its ``results`` list becomes an ``ImmoscoutItem``.  Finally the
        last link matched by ``self.next_xpath`` (if any) is requested with
        this method as callback.

        Args:
            response: Response of a result-list page; must provide ``url``,
                ``xpath()`` and ``urljoin()``.

        Yields:
            ``ImmoscoutItem`` instances, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        print(response.url)

        # extract_first() gives None when the XPath matches nothing; fall
        # back to an empty string so the page is skipped instead of crashing.
        script_text = response.xpath(self.script_xpath).extract_first() or ''

        for raw_line in script_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith('model'):
                continue

            # Cut 7 leading characters and 1 trailing character around the
            # JSON (presumably a "model: " prefix and a trailing comma --
            # TODO confirm against the page source).
            immo_json = json.loads(line[7:-1])

            for result in immo_json["results"]:
                item = ImmoscoutItem()

                item['immo_id'] = result['id']
                item['url'] = response.urljoin(
                    "/expose/" + str(result['id']))
                item['title'] = result['title']
                item['address'] = result['address']
                item['city'] = result['city']
                item['zip_code'] = result['zip']
                item['district'] = result['district']

                for attr in result['attributes']:
                    title = attr['title']
                    if title == "Kaltmiete":
                        item['rent'] = attr['value'][:-2]  # remove units
                    elif title == u"Wohnfläche":
                        item['sqm'] = attr['value'][:-3]  # remove units
                    elif title == "Zimmer":
                        item['rooms'] = attr['value']

                # Optional keys: fall back to a default when absent.
                item['contact_name'] = result.get('contactName')
                item['media_count'] = result.get('mediaCount', 0)

                # Coordinates are only kept as a pair; if either one is
                # missing both are set to None.
                try:
                    lat, lng = result['latitude'], result['longitude']
                except KeyError:
                    lat = lng = None
                item['lat'] = lat
                item['lng'] = lng

                yield item

        # The last page has no "next" link, so the list may be empty.
        next_links = response.xpath(self.next_xpath).extract()
        if next_links:
            next_page = response.urljoin(next_links[-1])
            yield scrapy.Request(next_page, callback=self.parse)
Example #2
0
    def parse(self, response):
        """Yield an expose request per search result, then the next page.

        The text selected by ``self.script_xpath`` is scanned for lines
        starting with ``resultListModel``.  Each such line is decoded as
        JSON, an ``ImmoscoutItem`` is filled for every result entry and a
        ``Request`` for the item's expose URL is yielded with the item
        attached as ``meta['thisItem']`` and ``self.parse_expose`` as
        callback.  Any error while processing a model line is printed and
        that line is abandoned.  Finally the last link matched by
        ``self.next_xpath`` (if any) is requested with this method as
        callback.

        Args:
            response: Response of a result-list page; must provide ``url``,
                ``xpath()`` and ``urljoin()``.

        Yields:
            One ``Request`` per result entry, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        print(response.url)

        # extract_first() gives None when the XPath matches nothing; fall
        # back to an empty string so the page is skipped instead of crashing.
        script_text = response.xpath(self.script_xpath).extract_first() or ''

        # (source key in the real-estate data, item field) pairs that are
        # copied only when the source key is present.
        optional_fields = (
            ("builtInKitchen", "kitchen"),
            ("balcony", "balcony"),
            ("garden", "garden"),
            ("privateOffer", "private"),
            ("plotArea", "plotArea"),
            ("cellar", "cellar"),
        )

        for raw_line in script_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith('resultListModel'):
                continue
            try:
                # Cut 17 leading characters and 1 trailing character around
                # the JSON (presumably a "resultListModel: " prefix and a
                # trailing comma -- TODO confirm against the page source).
                immo_json = json.loads(line[17:-1])

                entries = immo_json["searchResponseModel"][
                    "resultlist.resultlist"]["resultlistEntries"][0]
                number_of_hits = int(entries["@numberOfHits"])
                print("Number of hits: %i" % (number_of_hits, ))

                # On pages with a single hit "resultlistEntry" is a dict
                # rather than a list; wrap it so both shapes are handled.
                results = entries["resultlistEntry"]
                if isinstance(results, dict):
                    results = [results]

                for result in results:
                    item = ImmoscoutItem()

                    data = result["resultlist.realEstate"]

                    item['immo_id'] = data['@id']
                    item['createdAtDate'] = result['@creation']
                    item['modifiedAtDate'] = result['@modification']
                    item['publishedAtDate'] = result['@publishDate']
                    item['hasNewFlag'] = result['hasNewFlag']
                    item['url'] = response.urljoin(
                        "/expose/" + str(data['@id']))
                    item['title'] = data['title']

                    address = data['address']
                    try:
                        item['address'] = (
                            address['street'] + " " + address['houseNumber'])
                    except (KeyError, TypeError):
                        # Street or house number missing / not a string.
                        item['address'] = None
                    item['newHomeBuilder'] = result.get('newHomeBuilder')
                    item['floorplan'] = data.get('floorplan')
                    item['city'] = address['city']
                    item['zip_code'] = address['postcode']
                    item['district'] = address['quarter']

                    item["rent"] = data["price"]["value"]
                    item["livingSpace"] = data["livingSpace"]  # Wohnflaeche
                    item["rooms"] = data["numberOfRooms"]

                    if "calculatedPrice" in data:
                        item["extra_costs"] = (
                            data["calculatedPrice"]["value"] -
                            data["price"]["value"])
                    for source_key, field in optional_fields:
                        if source_key in data:
                            item[field] = data[source_key]

                    try:
                        contact = data['contactDetails']
                        item['contact_name'] = (
                            contact['firstname'] + " " + contact["lastname"])
                    except (KeyError, TypeError):
                        item['contact_name'] = None

                    try:
                        item['media_count'] = len(
                            data['galleryAttachments']['attachment'])
                    except (KeyError, TypeError):
                        item['media_count'] = 0

                    # Coordinates are only kept as a pair; if either one is
                    # missing both are set to None.
                    try:
                        coordinate = address['wgs84Coordinate']
                        lat = coordinate['latitude']
                        lng = coordinate['longitude']
                    except (KeyError, TypeError):
                        lat = lng = None
                    item['lat'] = lat
                    item['lng'] = lng

                    yield Request(item['url'],
                                  callback=self.parse_expose,
                                  meta={'thisItem': item})

            except Exception as e:
                # Deliberate best effort: report the problem and carry on
                # with pagination instead of aborting the whole page.
                print("There was a general error: %s" % (e, ))

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
Example #3
0
    def parse(self, response):
        """Yield one item per search result, then follow the next page.

        The text selected by ``self.script_xpath`` is scanned for lines
        starting with ``resultListModel``.  Each such line is decoded as
        JSON and an ``ImmoscoutItem`` is yielded for every result entry.
        Finally the last link matched by ``self.next_xpath`` (if any) is
        requested with this method as callback.

        Args:
            response: Response of a result-list page; must provide ``url``,
                ``xpath()`` and ``urljoin()``.

        Yields:
            ``ImmoscoutItem`` instances, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        print(response.url)

        # extract_first() gives None when the XPath matches nothing; fall
        # back to an empty string so the page is skipped instead of crashing.
        script_text = response.xpath(self.script_xpath).extract_first() or ''

        # (source key in the real-estate data, item field) pairs that are
        # copied only when the source key is present.
        optional_fields = (
            ("builtInKitchen", "kitchen"),
            ("balcony", "balcony"),
            ("garden", "garden"),
            ("privateOffer", "private"),
            ("plotArea", "area"),
            ("cellar", "cellar"),
            ("guestToilet", "guestToilet"),
        )

        for raw_line in script_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith('resultListModel'):
                continue

            # Cut 17 leading characters and 1 trailing character around the
            # JSON (presumably a "resultListModel: " prefix and a trailing
            # comma -- TODO confirm against the page source).
            immo_json = json.loads(line[17:-1])

            # On pages with a single hit "resultlistEntry" is a dict rather
            # than a list; wrap it so both shapes are handled.
            results = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                    "resultlistEntry"]
            if isinstance(results, dict):
                results = [results]

            for result in results:
                item = ImmoscoutItem()

                data = result["resultlist.realEstate"]

                item['immo_id'] = data['@id']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['title'] = data['title']

                address = data['address']
                try:
                    item['address'] = (
                        address['street'] + " " + address['houseNumber'])
                except (KeyError, TypeError):
                    # Street or house number missing / not a string.
                    item['address'] = None
                item['city'] = address['city']
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']

                # Coordinates are only kept as a pair; if either one is
                # missing both are set to None.
                try:
                    coordinate = address['wgs84Coordinate']
                    lat = coordinate['latitude']
                    lng = coordinate['longitude']
                except (KeyError, TypeError):
                    lat = lng = None
                item['lat'] = lat
                item['lng'] = lng

                item["rent"] = data["price"]["value"]
                item["livingSpace"] = data["livingSpace"]
                item["rooms"] = data["numberOfRooms"]
                item["brokerage"] = data["courtage"]["hasCourtage"]

                if "calculatedPrice" in data:
                    item["extra_costs"] = (
                        data["calculatedPrice"]["value"] -
                        data["price"]["value"])
                for source_key, field in optional_fields:
                    if source_key in data:
                        item[field] = data[source_key]

                if "@publishDate" in result:
                    item["publishDate"] = result["@publishDate"]

                try:
                    contact = data['contactDetails']
                    item['contact_name'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except (KeyError, TypeError):
                    item['contact_name'] = None

                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0

                yield item

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
Example #4
0
    def parse(self, response):
        """Yield one detailed item per search result, then the next page.

        The text selected by ``self.script_xpath`` is scanned for lines
        starting with ``resultListModel``.  Each such line is decoded as
        JSON and an ``ImmoscoutItem`` is yielded for every result entry.
        Optional source keys that are absent are stored as ``""`` (and
        ``media_count`` as ``0``).  Finally the last link matched by
        ``self.next_xpath`` (if any) is requested with this method as
        callback.

        Args:
            response: Response of a result-list page; must provide
                ``xpath()`` and ``urljoin()``.

        Yields:
            ``ImmoscoutItem`` instances, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        # extract_first() gives None when the XPath matches nothing; fall
        # back to an empty string so the page is skipped instead of crashing.
        script_text = response.xpath(self.script_xpath).extract_first() or ''

        for raw_line in script_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith('resultListModel'):
                continue

            # Cut 17 leading characters and 1 trailing character around the
            # JSON (presumably a "resultListModel: " prefix and a trailing
            # comma -- TODO confirm against the page source).
            immo_json = json.loads(line[17:-1])

            # On pages with a single hit "resultlistEntry" is a dict rather
            # than a list; wrap it so both shapes are handled.
            results = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                    "resultlistEntry"]
            if isinstance(results, dict):
                results = [results]

            for result in results:
                item = ImmoscoutItem()  # define new fields on the item class

                data = result["resultlist.realEstate"]

                # General information
                item['immo_id'] = data['@id']
                item['title'] = data['title']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['retype'] = data['@xsi.type']

                # Address
                address = data['address']
                try:
                    item['address'] = (
                        address['city'] + " " + address['street'] + " " +
                        address['houseNumber'])
                except (KeyError, TypeError):
                    # A component is missing / not a string.
                    item['address'] = ""
                item['city'] = address['city']
                item['street'] = address.get('street', "")
                item['housenumber'] = address.get('houseNumber', "")
                # The value is read from the address, so presence has to be
                # checked on the address as well (not on the listing data).
                item['precisehousenumber'] = address.get(
                    'preciseHouseNumber', "")
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']
                # Coordinates are only kept as a pair; if either one is
                # missing both are set to "".
                try:
                    coordinate = address['wgs84Coordinate']
                    lat = coordinate['latitude']
                    lng = coordinate['longitude']
                except (KeyError, TypeError):
                    lat = lng = ""
                item['lat'] = lat
                item['lng'] = lng

                # Additions
                item["balcony"] = data.get("balcony", "")
                item["kitchen"] = data.get("builtInKitchen", "")
                item["cellar"] = data.get("cellar", "")
                # NOTE(review): the key is looked up on the listing data
                # first and on the address as a fallback, because it is not
                # visible here which of the two carries it -- confirm.
                item['companywidecustomerid'] = data.get(
                    'companyWideCustomerId',
                    address.get('companyWideCustomerId', ""))

                # Contact details
                contact = data['contactDetails']
                item['contcompany'] = contact.get('company', "")
                try:
                    item['contname'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except (KeyError, TypeError):
                    item['contname'] = ""
                # Presence is checked on the contact dict under the same
                # keys that are read.
                item['contfirstname'] = contact.get('firstname', "")
                item['contlastname'] = contact.get('lastname', "")
                item['contphonenumber'] = contact.get('phoneNumber', "")
                item['contsalutation'] = contact['salutation']

                # Courtage is not extracted; the field is left empty.
                item['hascourtage'] = ''

                # Additions 2
                item['floorplan'] = data['floorplan']
                item["garden"] = data.get("garden", "")
                item["guesttoilet"] = data.get("guestToilet", "")
                item["isbarrierfree"] = data.get("isBarrierFree", "")
                item["lift"] = data.get("lift", "")
                item["listingtype"] = data["listingType"]
                item["livingspace"] = data["livingSpace"]
                item["numberofrooms"] = data["numberOfRooms"]

                # Price
                price = data["price"]
                item["currency"] = price["currency"]
                item["marketingtype"] = price["marketingType"]
                item["priceintervaltype"] = price["priceIntervalType"]
                item["value"] = price["value"]

                # Additions 3
                item["privateoffer"] = data.get("privateOffer", "")
                item["realtorcompanyname"] = data.get(
                    "realtorCompanyName", "")
                item["realtorlogo"] = data.get("realtorLogo", "")
                item["spotlightlisting"] = data["spotlightListing"]
                item["streamingvideo"] = data["streamingVideo"]

                # Title picture
                try:
                    item["creation"] = data["titlePicture"]["@creation"]
                except (KeyError, TypeError):
                    item["creation"] = ""

                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0

                yield item

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
Example #5
0
    def parse(self, response):
        """Yield one item per search result, then follow the next page.

        The text selected by ``self.script_xpath`` is scanned for lines
        starting with ``resultListModel``.  Each such line is decoded as
        JSON and an ``ImmoscoutItem`` is yielded for every result entry.
        Finally the last link matched by ``self.next_xpath`` (if any) is
        requested with this method as callback.

        Args:
            response: Response of a result-list page; must provide ``url``,
                ``xpath()`` and ``urljoin()``.

        Yields:
            ``ImmoscoutItem`` instances, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        print(response.url)

        # extract_first() gives None when the XPath matches nothing; fall
        # back to an empty string so the page is skipped instead of crashing.
        script_text = response.xpath(self.script_xpath).extract_first() or ''

        for raw_line in script_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith('resultListModel'):
                continue

            # Cut 17 leading characters and 1 trailing character around the
            # JSON (presumably a "resultListModel: " prefix and a trailing
            # comma -- TODO confirm against the page source).
            immo_json = json.loads(line[17:-1])

            # Accept a lone dict as well as a list of entries, so a page
            # carrying a single entry object is still processed.
            results = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                    "resultlistEntry"]
            if isinstance(results, dict):
                results = [results]

            for result in results:
                item = ImmoscoutItem()

                data = result["resultlist.realEstate"]

                item['immo_id'] = data['@id']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['title'] = data['title']

                address = data['address']
                try:
                    item['address'] = (
                        address['street'] + " " + address['houseNumber'])
                except (KeyError, TypeError):
                    # Street or house number missing / not a string.
                    item['address'] = None
                item['city'] = address['city']
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']

                for attr in result['attributes'][0]['attribute']:
                    label = attr['label']
                    if label == "Kaltmiete":
                        item['rent'] = attr['value'][:-2]  # remove units
                    elif label == u"Wohnfläche":
                        item['sqm'] = attr['value'][:-3]  # remove units
                    elif label == "Zimmer":
                        item['rooms'] = attr['value']

                try:
                    contact = data['contactDetails']
                    item['contact_name'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except (KeyError, TypeError):
                    item['contact_name'] = None

                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0

                # Coordinates are only kept as a pair; if either one is
                # missing both are set to None.
                try:
                    coordinate = address['wgs84Coordinate']
                    lat = coordinate['latitude']
                    lng = coordinate['longitude']
                except (KeyError, TypeError):
                    lat = lng = None
                item['lat'] = lat
                item['lng'] = lng

                yield item

        # The last page has no "next" link, so the list may be empty.
        next_links = response.xpath(self.next_xpath).extract()
        if next_links:
            next_page = next_links[-1]
            print("Scraping next page", next_page)
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)