Exemple #1
0
    def setUp(self):
        """Create the item database handle, the section fixture and the section manager."""
        connection_args = (
            settings.SQL_HOST,
            settings.SQL_PORT,
            settings.SQL_USER,
            settings.SQL_PASSWD,
            settings.SQL_DB,
            settings.SQL_COLLECTION_NAME,
        )
        self._item_db = DatabaseFactory(*connection_args).get_database(
            CollectionTypes.item)

        # One fixture object supplies both the input sections and the
        # expected values read by the tests.
        fixture = SectionData()
        self.section_data = fixture
        self.expect = fixture.expect

        self.section_mgr = HarajsSection(fixture.sections, self._item_db)
class HarajsSecionTest(unittest.TestCase):
    """Checks HarajsSection.get_section_item_for_harajsa against the SectionData fixture."""

    def setUp(self):
        """Create the item database handle, the section fixture and the section manager."""
        connection_args = (
            settings.SQL_HOST,
            settings.SQL_PORT,
            settings.SQL_USER,
            settings.SQL_PASSWD,
            settings.SQL_DB,
            settings.SQL_COLLECTION_NAME,
        )
        self._item_db = DatabaseFactory(*connection_args).get_database(
            CollectionTypes.item)

        # One fixture object supplies both the input sections and the
        # expected values compared in the test below.
        fixture = SectionData()
        self.section_data = fixture
        self.expect = fixture.expect

        self.section_mgr = HarajsSection(fixture.sections, self._item_db)

    def test_parse_section(self):
        # Every field is read (and formatted) before the first assertion,
        # so a missing attribute surfaces as an error rather than after a
        # partial comparison.
        section_item = self.section_mgr.get_section_item_for_harajsa()

        actual = [
            ("tags_f", '{}'.format(section_item.ads_tags_F)),
            ("tags_ff", '{}'.format(section_item.ads_tags_FF)),
            ("tags_r", '{}'.format(section_item.ads_tags_R)),
            ("other_final", section_item.type_ads_other_final.encode('utf-8')),
        ]

        for key, value in actual:
            self.assertEqual(value, self.expect[key])
    def parse(self, url, hxs, item_db):
        """Parse one mstaml ad page and save it through ``item_db``.

        :param url: page address; combined with ``self.url_from`` to derive
            the ad's model id.
        :param hxs: selector for the downloaded page, passed to the XPath
            helper methods.
        :param item_db: database wrapper used to save the city, the member
            and the ad.
        :return: the object built by ``Ad.get_default`` after it has been
            saved, or a dict with ``id_ads`` and ``url_from`` when the page
            has neither a title nor a body.
        """
        # Function-scope import kept from the original — presumably to avoid
        # a circular import; TODO confirm.
        from cwharaj.utils.crawl_utils import CrawlUtils
        _ID = CrawlUtils.get_model_id_by_url_from(url, self.url_from)

        # AD
        _ads_title = self.get_value_response(hxs, '//*[@class="titleSection doHighlight"]/text()')
        _time_added = self.get_value_response(hxs, '//*[@class="boxItem"]/table[1]/tr/td[2]/span/text()')
        _last_updated_ad = self.get_value_response(hxs, '//*[@class="boxItem"]/table[2]/tr/td[2]/span/text()')
        _time_added = TimerUtil().get_time_for_mstaml(_time_added)
        _last_updated_ad = TimerUtil().get_time_for_mstaml(_last_updated_ad)
        _image_link = self.get_images_in_selector(hxs, '//noscript')
        _ads_body = self.get_all_value_response(
            hxs,
            '//*[@class="text linkify linkifyWithImages linkifyWithWasel doHighlight"]/text()')

        # Member(boxItem)
        _memberName = self.get_value_response(hxs, '//table[@class="dcs"]/tr[1]/td/text()')
        _ads_city = self.get_city(hxs)
        _member_email = self.get_value_response(hxs, '//table[@class="dcs"]/tr[8]/td[2]/span/@title')
        _member_phone = self.get_value_response(hxs, '//table[@class="dcs"]/tr[10]/td[2]/span/@title')

        # Sections
        _sections = self.get_section(hxs, '//div[@class="pageRight"]/h1[@class="titlePage"]/a/text()')

        # Empty page: bail out before anything is handed to the database.
        # The marker carries url_from as well as the id, the same shape the
        # other site parsers return for their empty pages.
        if _ads_title == '' and _ads_body == '':
            logging.debug("  The empty page on the mstaml")
            return {"id_ads": _ID, "url_from": self.url_from}

        # Resolved only for pages that will actually be saved; HarajsSection
        # receives item_db, so the lookup may hit the database.
        _section_item = HarajsSection(_sections, item_db).get_section_item_for_mstaml()

        # ====
        # Save to relative database
        # ====
        _city_id = item_db.save_city(City.get_default(_ads_city))

        _His_announcement_id = item_db.save_member(
            Member.get_default(user_name=_memberName, email=_member_email, phone=_member_phone))

        item = Ad.get_default(
            section_item=_section_item,
            ads_title=_ads_title,
            city_id=_city_id,
            ads_contact=_member_phone,
            ads_body=_ads_body,
            image_link=_image_link,
            His_announcement_id=_His_announcement_id,
            url_from=self.url_from,
            Time_added=_time_added, Last_updated_Ad=_last_updated_ad,
            type_ads_or=1, _close_ads=0
        )

        # The returned ad id is not needed here: mstaml pages have no
        # comments to attach to it.
        item_db.save_ad(item)

        return item
class OpensooqSecionTest(unittest.TestCase):
    """Checks HarajsSection.get_section_item_for_opensooq against the SectionData fixture."""

    def setUp(self):
        """Create the item database handle, the section fixture and the section manager."""
        connection_args = (
            settings.SQL_HOST,
            settings.SQL_PORT,
            settings.SQL_USER,
            settings.SQL_PASSWD,
            settings.SQL_DB,
            settings.SQL_COLLECTION_NAME,
        )
        self._item_db = DatabaseFactory(*connection_args).get_database(
            CollectionTypes.item)

        # One fixture object supplies both the input sections and the
        # expected values compared in the test below.
        fixture = SectionData()
        self.section_data = fixture
        self.expect = fixture.expect

        self.section_mgr = HarajsSection(fixture.sections, self._item_db)

    def test_parse_section(self):
        # All three tag fields are formatted before the first assertion,
        # so a missing attribute surfaces as an error up front.
        section_item = self.section_mgr.get_section_item_for_opensooq()

        actual = [
            ("tags_f", '{}'.format(section_item.ads_tags_F)),
            ("tags_ff", '{}'.format(section_item.ads_tags_FF)),
            ("tags_r", '{}'.format(section_item.ads_tags_R)),
        ]

        for key, value in actual:
            self.assertEqual(value, self.expect[key])
    def parse(self, url, hxs, item_db):
        """Parse one opensooq ad page and save it, with its comments, through ``item_db``.

        :param url: page address; combined with ``self.url_from`` to derive
            the ad's model id.
        :param hxs: selector for the downloaded page, passed to the XPath
            helper methods.
        :param item_db: database wrapper used to save the phone image, city,
            member, ad and comments.
        :return: the object built by ``Ad.get_default`` after it has been
            saved, or a dict with ``id_ads`` and ``url_from`` when the page
            has no title and no sections.
        """
        # Function-scope import kept from the original — presumably to avoid
        # a circular import; TODO confirm.
        from cwharaj.utils.crawl_utils import CrawlUtils
        _ID = CrawlUtils.get_model_id_by_url_from(url, self.url_from)

        # ADs User
        # Original note: the member-name XPath yields a list of length 2.
        _memberName = self.get_value_response(
            hxs, '//*[@class="userDet tableCell vTop"]/strong/a/text()')
        # Raw value is the Arabic "join date" label followed by a date,
        # e.g. 08/10/2015.
        member_timeregister = self.get_value_response(
            hxs, '//span[@class="joinDate"]/text()')

        _ads_city = self.get_value_response(
            hxs,
            '//*[@class="sellerAddress"]/span[@class="sellerAddressText"]/a/text()'
        )

        # ADs
        _ads_title = self.get_value_response(
            hxs, '//*[@class="postTitleCont"]/div/h1/text()')
        # Strip line breaks and surrounding whitespace *before* the
        # empty-page test, so a whitespace-only title counts as empty.
        _ads_title = _ads_title.replace("\n", "").replace("\r", "").strip()

        _image_link = self.get_pictures(
            hxs, '//*[@class="galleryLeftList fLeft"]/ul/li/a/img/@src')
        time_added = self.get_value_response(
            hxs, '//*[@class="postDate fRight"]/text()')
        _ads_body = self.get_all_value_response(
            hxs, '//*[@class="postDesc"]/p/text()')

        _sections = self.get_section(
            self.get_value_response(hxs, '//*[@class="breadcrumbs"]'))

        # Empty page: no title and no sections. The truthiness test also
        # covers a None result from get_section instead of raising.
        if _ads_title == '' and not _sections:
            logging.debug("  The empty page on the opensooq")
            return {"id_ads": _ID, "url_from": self.url_from}

        section_item = HarajsSection(_sections,
                                     item_db).get_section_item_for_opensooq()

        # ====
        # Save to relative database
        # ====

        # opensooq serves the contact number as a base64 image, so it is
        # fetched separately and stored in its own table; the ad then
        # keeps a reference to that row built by Ad.get_opensooq_phone.
        ads_contact = ''
        phone_number_base64 = self.query_phone_number_base64_image(hxs)
        if phone_number_base64:
            opensooq_phone_id = item_db.save_opensooq_phone(
                OpensooqPhone.get_default(phone_number_base64))
            ads_contact = Ad.get_opensooq_phone(opensooq_phone_id)

        # Convert the raw page strings with the opensooq date helpers.
        time_added = OpensooqCommentDateUtil(
        ).get_time_for_opensooq_time_added(time_added)
        member_timeregister = OpensooqCommentDateUtil(
        ).get_time_for_opensooq_member_timeregister(member_timeregister)

        city_id = item_db.save_city(City.get_default(_ads_city))

        _His_announcement_id = item_db.save_member(
            Member.get_default(user_name=_memberName,
                               timeregister=member_timeregister,
                               phone=ads_contact))

        item = Ad.get_default(section_item=section_item,
                              ads_title=_ads_title,
                              city_id=city_id,
                              ads_contact=ads_contact,
                              ads_body=_ads_body,
                              image_link=_image_link,
                              His_announcement_id=_His_announcement_id,
                              url_from=self.url_from,
                              Time_added=time_added,
                              type_ads_or=1,
                              _close_ads=0)

        id_ads = item_db.save_ad(item)

        # Scrape all comments for the ad.
        HarajsComments(self, item_db, id_ads).save_for_opensooq(hxs)

        return item
Exemple #6
0
    def parse(self, url, hxs, item_db):
        """Parse one haraj.sa ad page and save it, with its comments, through ``item_db``.

        :param url: page address; combined with ``self.url_from`` to derive
            the ad's model id.
        :param hxs: selector for the downloaded page, passed to the XPath
            helper methods.
        :param item_db: database wrapper used to save the city, member, ad
            and comments.
        :return: the object built by ``Ad.get_default`` after it has been
            saved, or a dict with ``id_ads`` and ``url_from`` when the page
            is empty or lists more than three sections.
        """
        # Function-scope import kept from the original — presumably to avoid
        # a circular import; TODO confirm.
        from cwharaj.utils.crawl_utils import CrawlUtils
        _ID = CrawlUtils.get_model_id_by_url_from(url, self.url_from)

        # comment ad_div
        _ads_title = self.get_value_response(
            hxs, '//*[@itemprop="name"]/text()').replace('» ', '')
        _ads_city = self.get_value_response(
            hxs, '//*[@class=" comment_header"]/*[@class="city-head"]/text()')
        _published_data = self.get_published_date(
            self.get_value_response(hxs, '//*[@class=" comment_header"]'))
        # The published date doubles as both the added and last-updated time.
        _time_added = TimerUtil().get_time_for_harajs(_published_data)
        _last_updated_ad = _time_added
        _memberName = self.get_value_response(
            hxs, '//*[@class=" comment_header"]/*[@class="username"]/text()')

        # ad_low
        def filter_for_image(src):
            """Accept only image URLs that contain 'haraj.com.sa'."""
            if 'haraj.com.sa' in src:
                return True

            # Lazy logging arguments: the message is only built when debug
            # logging is enabled, and the text stays the same.
            logging.debug(
                "  invalide picture url from the haraj.sa, %s", src)
            return False

        _image_link = self.get_images_in_selector(
            hxs,
            '//*[@itemprop="description"]',
            filter_method=filter_for_image)
        _ads_body = self.get_all_value_response(
            hxs, '//*[@itemprop="description"]/text()')
        _ads_contact = self.get_value_response(
            hxs, '//*[@class="contact"]/strong/a/text()')

        # sections
        _sections = self.get_section(
            self.get_value_response(hxs, '//*[@class="ad_low"]'))

        # Empty page: nothing to save.
        if _ads_title == '' and _ads_body == '':
            logging.debug("  The empty page on the harajsa")
            return {"id_ads": _ID, 'url_from': self.url_from}

        # TODO: djzhang, how to parse the sections when length is more then 3.
        if len(_sections) > 3:
            logging.debug(
                "  The sections length is more than 3 on the harajsa")
            return {"id_ads": _ID, 'url_from': self.url_from}

        # Resolved only after both guards, so rejected pages (including the
        # unsupported "more than 3 sections" case) never reach the lookup;
        # HarajsSection receives item_db, so it may hit the database.
        _section_item = HarajsSection(_sections,
                                      item_db).get_section_item_for_harajsa()

        # Drop carriage returns and trim the body; "\n" line breaks are kept.
        _ads_body = _ads_body.replace("\r", "").strip()

        # ====
        # Save to relative database
        # ====
        _city_id = item_db.save_city(City.get_default(_ads_city))

        _His_announcement_id = item_db.save_member(
            Member.get_default(_memberName))

        item = Ad.get_default(section_item=_section_item,
                              ads_title=_ads_title,
                              city_id=_city_id,
                              ads_contact=_ads_contact,
                              ads_body=_ads_body,
                              image_link=_image_link,
                              His_announcement_id=_His_announcement_id,
                              url_from=self.url_from,
                              Time_added=_time_added,
                              Last_updated_Ad=_last_updated_ad,
                              type_ads_or=1,
                              _close_ads=0)

        id_ads = item_db.save_ad(item)

        # Scrape all comments for the ad.
        HarajsComments(self, item_db, id_ads).save_for_harajs(hxs)

        return item