def setUp(self):
    """Prepare the item database handle and the section fixtures.

    A ``DatabaseFactory`` is built from the ``settings.SQL_*`` values and
    asked for the ``CollectionTypes.item`` database; a ``HarajsSection``
    manager is then created from the ``SectionData`` fixture's sections
    and that database.
    """
    connection_args = (
        settings.SQL_HOST,
        settings.SQL_PORT,
        settings.SQL_USER,
        settings.SQL_PASSWD,
        settings.SQL_DB,
        settings.SQL_COLLECTION_NAME,
    )
    factory = DatabaseFactory(*connection_args)
    self._item_db = factory.get_database(CollectionTypes.item)

    # Fixture object carrying both the input sections and the expected values.
    fixture = SectionData()
    self.section_data = fixture
    self.expect = fixture.expect
    self.section_mgr = HarajsSection(fixture.sections, self._item_db)
class HarajsSecionTest(unittest.TestCase):
    """Checks the section item returned by ``get_section_item_for_harajsa``."""

    def setUp(self):
        """Create the item database handle and the section manager under test."""
        connection_args = (
            settings.SQL_HOST,
            settings.SQL_PORT,
            settings.SQL_USER,
            settings.SQL_PASSWD,
            settings.SQL_DB,
            settings.SQL_COLLECTION_NAME,
        )
        factory = DatabaseFactory(*connection_args)
        self._item_db = factory.get_database(CollectionTypes.item)

        fixture = SectionData()
        self.section_data = fixture
        self.expect = fixture.expect
        self.section_mgr = HarajsSection(fixture.sections, self._item_db)

    def test_parse_section(self):
        """Compare the rendered tags and the encoded other-final text with the fixture."""
        section_item = self.section_mgr.get_section_item_for_harajsa()

        # Render every value first; assertions only start once all of them
        # have been produced.
        rendered = [
            ("tags_f", '{}'.format(section_item.ads_tags_F)),
            ("tags_ff", '{}'.format(section_item.ads_tags_FF)),
            ("tags_r", '{}'.format(section_item.ads_tags_R)),
            ("other_final", section_item.type_ads_other_final.encode('utf-8')),
        ]

        for expect_key, actual in rendered:
            self.assertEqual(actual, self.expect[expect_key])
def parse(self, url, hxs, item_db):
    """Parse one mstaml ad page, save it, and return the resulting ad.

    Args:
        url: Page URL; passed with ``self.url_from`` to
            ``CrawlUtils.get_model_id_by_url_from`` to obtain the model id.
        hxs: Object handed, together with an XPath expression, to the
            ``get_*`` helper methods of this parser.
        item_db: Database wrapper used to save the city, the member and
            the ad.

    Returns:
        The object built by ``Ad.get_default`` (after ``item_db.save_ad``
        has been called with it), or ``{"id_ads": <model id>}`` when both
        the title and the body are empty strings.
    """
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import -- confirm before moving it.
    from cwharaj.utils.crawl_utils import CrawlUtils

    _ID = CrawlUtils.get_model_id_by_url_from(url, self.url_from)

    # AD
    _ads_title = self.get_value_response(
        hxs, '//*[@class="titleSection doHighlight"]/text()')
    _time_added = self.get_value_response(
        hxs, '//*[@class="boxItem"]/table[1]/tr/td[2]/span/text()')
    _last_updated_ad = self.get_value_response(
        hxs, '//*[@class="boxItem"]/table[2]/tr/td[2]/span/text()')

    # Convert the scraped date strings with the mstaml-specific converter.
    _time_added = TimerUtil().get_time_for_mstaml(_time_added)
    _last_updated_ad = TimerUtil().get_time_for_mstaml(_last_updated_ad)

    _image_link = self.get_images_in_selector(hxs, '//noscript')

    _ads_body = self.get_all_value_response(
        hxs,
        '//*[@class="text linkify linkifyWithImages linkifyWithWasel doHighlight"]/text()')

    # Member(boxItem)
    _memberName = self.get_value_response(
        hxs, '//table[@class="dcs"]/tr[1]/td/text()')
    _ads_city = self.get_city(hxs)
    _member_email = self.get_value_response(
        hxs, '//table[@class="dcs"]/tr[8]/td[2]/span/@title')
    _member_phone = self.get_value_response(
        hxs, '//table[@class="dcs"]/tr[10]/td[2]/span/@title')

    # Sections
    _sections = self.get_section(
        hxs, '//div[@class="pageRight"]/h1[@class="titlePage"]/a/text()')
    _section_item = HarajsSection(_sections, item_db).get_section_item_for_mstaml()

    # Fixing the empty page: nothing is saved, only the model id is returned.
    # NOTE(review): unlike the opensooq/harajsa parsers, this placeholder
    # omits 'url_from' -- confirm that is intended.
    if _ads_title == '' and _ads_body == '':
        logging.debug(" The empty page on the mstaml")
        return {"id_ads": _ID}

    # ====
    # Save to relative database
    # ====
    _city_id = item_db.save_city(City.get_default(_ads_city))

    _His_announcement_id = item_db.save_member(
        Member.get_default(user_name=_memberName,
                           email=_member_email,
                           phone=_member_phone))

    item = Ad.get_default(
        section_item=_section_item,
        ads_title=_ads_title,
        city_id=_city_id,
        ads_contact=_member_phone,
        ads_body=_ads_body,
        image_link=_image_link,
        His_announcement_id=_His_announcement_id,
        url_from=self.url_from,
        Time_added=_time_added,
        Last_updated_Ad=_last_updated_ad,
        type_ads_or=1,
        _close_ads=0
    )

    # The id returned by save_ad is not needed here: mstaml pages have no
    # comments, so there is no comment parsing that would consume it.
    item_db.save_ad(item)

    return item
class OpensooqSecionTest(unittest.TestCase):
    """Checks the section item returned by ``get_section_item_for_opensooq``."""

    def setUp(self):
        """Create the item database handle and the section manager under test."""
        factory = DatabaseFactory(
            settings.SQL_HOST,
            settings.SQL_PORT,
            settings.SQL_USER,
            settings.SQL_PASSWD,
            settings.SQL_DB,
            settings.SQL_COLLECTION_NAME,
        )
        self._item_db = factory.get_database(CollectionTypes.item)

        fixture = SectionData()
        self.section_data = fixture
        self.expect = fixture.expect
        self.section_mgr = HarajsSection(fixture.sections, self._item_db)

    def test_parse_section(self):
        """Compare the three rendered tag values with the fixture's expectations."""
        section_item = self.section_mgr.get_section_item_for_opensooq()

        # All three values are rendered before the first assertion runs.
        actual_f = '{}'.format(section_item.ads_tags_F)
        actual_ff = '{}'.format(section_item.ads_tags_FF)
        actual_r = '{}'.format(section_item.ads_tags_R)

        checks = (
            (actual_f, "tags_f"),
            (actual_ff, "tags_ff"),
            (actual_r, "tags_r"),
        )
        for actual, expect_key in checks:
            self.assertEqual(actual, self.expect[expect_key])
def parse(self, url, hxs, item_db):
    """Scrape one opensooq ad page, store it with its comments, return the ad.

    Args:
        url: Page URL; combined with ``self.url_from`` to obtain the model id.
        hxs: Object handed, together with an XPath expression, to the
            ``get_*`` helper methods of this parser.
        item_db: Database wrapper used to save the phone image, the city,
            the member and the ad.

    Returns:
        The object built by ``Ad.get_default``, or
        ``{"id_ads": <model id>, "url_from": self.url_from}`` when the title
        is empty and no sections were found.
    """
    from cwharaj.utils.crawl_utils import CrawlUtils

    model_id = CrawlUtils.get_model_id_by_url_from(url, self.url_from)

    # --- advertiser -------------------------------------------------------
    # NOTE(review): an earlier note said this XPath matches two nodes -- confirm.
    member_name = self.get_value_response(
        hxs, '//*[@class="userDet tableCell vTop"]/strong/a/text()')
    # Raw text is a label followed by a date (the earlier note's sample
    # ends in "08/10/2015"); it is converted further down.
    joined_raw = self.get_value_response(
        hxs, '//span[@class="joinDate"]/text()')
    city_name = self.get_value_response(
        hxs,
        '//*[@class="sellerAddress"]/span[@class="sellerAddressText"]/a/text()')

    # --- ad ---------------------------------------------------------------
    title = self.get_value_response(
        hxs, '//*[@class="postTitleCont"]/div/h1/text()')
    pictures = self.get_pictures(
        hxs, '//*[@class="galleryLeftList fLeft"]/ul/li/a/img/@src')
    posted_raw = self.get_value_response(
        hxs, '//*[@class="postDate fRight"]/text()')
    body = self.get_all_value_response(
        hxs, '//*[@class="postDesc"]/p/text()')

    breadcrumbs = self.get_value_response(hxs, '//*[@class="breadcrumbs"]')
    sections = self.get_section(breadcrumbs)

    # An empty page has neither a title nor sections: save nothing.
    page_is_empty = (title == '') and (len(sections) == 0)
    if page_is_empty:
        logging.debug(" The empty page on the opensooq")
        return {"id_ads": model_id, "url_from": self.url_from}

    section_item = HarajsSection(sections, item_db).get_section_item_for_opensooq()

    # Drop line breaks and surrounding whitespace from the title.
    title = title.replace("\n", "").replace("\r", "").strip()

    # ====
    # Persist everything
    # ====
    # The contact on opensooq is a base64 image, so it is fetched separately;
    # when it is missing the contact stays an empty string.
    contact = ''
    phone_image = self.query_phone_number_base64_image(hxs)
    if phone_image:
        phone_row = OpensooqPhone.get_default(phone_image)
        phone_id = item_db.save_opensooq_phone(phone_row)
        # The stored phone id is turned into opensooq's special contact value.
        contact = Ad.get_opensooq_phone(phone_id)

    posted_at = OpensooqCommentDateUtil().get_time_for_opensooq_time_added(
        posted_raw)
    joined_at = OpensooqCommentDateUtil().get_time_for_opensooq_member_timeregister(
        joined_raw)

    city_id = item_db.save_city(City.get_default(city_name))

    member = Member.get_default(user_name=member_name,
                                timeregister=joined_at,
                                phone=contact)
    member_id = item_db.save_member(member)

    ad_fields = dict(
        section_item=section_item,
        ads_title=title,
        city_id=city_id,
        ads_contact=contact,
        ads_body=body,
        image_link=pictures,
        His_announcement_id=member_id,
        url_from=self.url_from,
        Time_added=posted_at,
        type_ads_or=1,
        _close_ads=0,
    )
    ad = Ad.get_default(**ad_fields)

    saved_ad_id = item_db.save_ad(ad)

    # Store every comment found on the page under the saved ad id.
    HarajsComments(self, item_db, saved_ad_id).save_for_opensooq(hxs)

    return ad
def parse(self, url, hxs, item_db):
    """Scrape one harajsa ad page, store it with its comments, return the ad.

    Args:
        url: Page URL; combined with ``self.url_from`` to obtain the model id.
        hxs: Object handed, together with an XPath expression, to the
            ``get_*`` helper methods of this parser.
        item_db: Database wrapper used to save the city, the member and
            the ad.

    Returns:
        The object built by ``Ad.get_default``, or
        ``{"id_ads": <model id>, "url_from": self.url_from}`` when the page
        is empty (no title and no body) or has more than three sections.
    """
    from cwharaj.utils.crawl_utils import CrawlUtils

    model_id = CrawlUtils.get_model_id_by_url_from(url, self.url_from)
    placeholder_source = self.url_from

    # --- header of the ad -------------------------------------------------
    raw_title = self.get_value_response(hxs, '//*[@itemprop="name"]/text()')
    title = raw_title.replace('» ', '')
    city_name = self.get_value_response(
        hxs, '//*[@class=" comment_header"]/*[@class="city-head"]/text()')
    header_text = self.get_value_response(hxs, '//*[@class=" comment_header"]')
    published = self.get_published_date(header_text)
    # The published date serves as both the creation and last-update time.
    added_at = TimerUtil().get_time_for_harajs(published)
    updated_at = added_at
    member_name = self.get_value_response(
        hxs, '//*[@class=" comment_header"]/*[@class="username"]/text()')

    # --- body of the ad ---------------------------------------------------
    def hosted_on_haraj(src):
        """Accept only picture URLs containing 'haraj.com.sa'; log the rest."""
        accepted = 'haraj.com.sa' in src
        if not accepted:
            logging.debug(
                " invalide picture url from the haraj.sa, {}".format(src))
        return accepted

    pictures = self.get_images_in_selector(
        hxs, '//*[@itemprop="description"]', filter_method=hosted_on_haraj)
    body = self.get_all_value_response(
        hxs, '//*[@itemprop="description"]/text()')
    contact = self.get_value_response(
        hxs, '//*[@class="contact"]/strong/a/text()')

    # --- sections ---------------------------------------------------------
    low_block = self.get_value_response(hxs, '//*[@class="ad_low"]')
    sections = self.get_section(low_block)
    section_item = HarajsSection(sections, item_db).get_section_item_for_harajsa()

    # Empty page: nothing to save.
    if title == '' and body == '':
        logging.debug(" The empty page on the harajsa")
        return {"id_ads": model_id, 'url_from': placeholder_source}

    # TODO: djzhang, how to parse the sections when length is more than 3.
    # Until that is decided such pages are skipped as well.
    if len(sections) > 3:
        logging.debug(
            " The sections length is more than 3 on the harajsa")
        return {"id_ads": model_id, 'url_from': placeholder_source}

    # Remove carriage returns and trim the body; "\n" is left in place.
    body = body.replace("\r", "").strip()

    # ====
    # Persist everything
    # ====
    city_id = item_db.save_city(City.get_default(city_name))
    member_id = item_db.save_member(Member.get_default(member_name))

    ad = Ad.get_default(
        section_item=section_item,
        ads_title=title,
        city_id=city_id,
        ads_contact=contact,
        ads_body=body,
        image_link=pictures,
        His_announcement_id=member_id,
        url_from=self.url_from,
        Time_added=added_at,
        Last_updated_Ad=updated_at,
        type_ads_or=1,
        _close_ads=0,
    )

    saved_ad_id = item_db.save_ad(ad)

    # Store the page's comments under the saved ad id.
    HarajsComments(self, item_db, saved_ad_id).save_for_harajs(hxs)

    return ad