def __init__(self):
    """Prepare URLs, login form data, request headers and helper objects.

    Everything is hard-coded here: the site root, the login endpoint, the
    relative path of the paged article list, and the form fields / headers
    that are later sent with the login request. The credentials are left
    as empty strings in this constructor.
    """
    # Site root and endpoints.
    self.authority = r'https://www.wenku8.net'
    self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
    # Relative path; a page number is appended to it by the caller.
    self.pageurl = r"/modules/article/articlelist.php?page="

    # Credentials (empty in this constructor).
    self.username = r''
    self.password = r''

    # Fields posted to the login endpoint.
    self.formdata = {
        'username': self.username,
        'password': self.password,
        'usecookie': '0',
        'action': r'login',
        # NOTE(review): presumably the percent-encoded caption of the
        # site's submit button -- confirm against the login page.
        'submit': r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B',
    }

    # Headers sent along with the login request.
    self.headers = {
        'origin': r'https://www.wenku8.net',
        'referer': r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php',
        'upgrade-insecure-requests': '1',
        'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }

    # One HTTP session, handed to the parser as well.
    self.session = requests.Session()
    self.manager = UrlManager()
    self.downloader = UrlDownloader()
    self.parser = UrlParser(self.session)
def create_book(command, counter):
    """Run one e-book job for ``command`` and return the resulting name.

    The command is turned into a task package by ``UrlParser.get_task``.
    If the package has pending work, ``worker_factory`` is run on its work
    list; if it has books, a ``Book`` is built from the book list and
    ``create()`` is called on it.

    :param command: the command to analyse.
    :param counter: sequence number of this e-book, used only for logging.
    :return: the first three entries of the value returned by
        ``Book.create()`` joined with ``'-'``, or a fixed "Oops!" message
        when no book was created (or ``create()`` returned ``None``).
    """
    Path.reset_path()

    Debug.logger.info(u"Ready to make No.{} e-book".format(counter))
    Debug.logger.info(u"Analyzes {} ".format(command))

    # Analyse the command.
    task_package = UrlParser.get_task(command)

    if not task_package.is_work_list_empty():
        # Run the fetching workers.
        worker_factory(task_package.work_list)
        Debug.logger.info(u"Complete fetching from web")

    produced_names = None
    if not task_package.is_book_list_empty():
        Debug.logger.info(u"Start generating e-book from the database")
        produced_names = Book(task_package.book_list).create()

    if produced_names is None:
        return u"Oops! no epub file produced"

    # Only the first three names contribute to the returned file name.
    return '-'.join(list(produced_names)[0:3])
class Wenku:
    """Crawler for www.wenku8.net article lists.

    Logs in with a shared HTTP session, walks the paged article list,
    hands every page to the parser, collects the parsed data in the URL
    manager, and can later download every entry saved in ``dict.txt``.
    """

    def __init__(self):
        """Prepare URLs, login form data, request headers and helpers."""
        # Site root and endpoints.
        self.authority = r'https://www.wenku8.net'
        self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        # Relative path; the page number is appended in parser_some_pages.
        self.pageurl = r"/modules/article/articlelist.php?page="

        # Credentials (empty in this constructor).
        self.username = r''
        self.password = r''

        # Fields posted by login().
        self.formdata = {
            'username': self.username,
            'password': self.password,
            'usecookie': '0',
            'action': r'login',
            # NOTE(review): presumably the percent-encoded caption of the
            # site's submit button -- confirm against the login page.
            'submit': r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B',
        }

        # Headers sent by login().
        self.headers = {
            'origin': r'https://www.wenku8.net',
            'referer': r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php',
            'upgrade-insecure-requests': '1',
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

        # One HTTP session, handed to the parser as well.
        self.session = requests.Session()
        self.manager = UrlManager()
        self.downloader = UrlDownloader()
        self.parser = UrlParser(self.session)

    def login(self):
        """POST the login form; return True only for an HTTP 200 reply."""
        reply = self.session.post(
            self.loginurl,
            data=self.formdata,
            headers=self.headers,
        )
        return reply.status_code == 200

    def parser_some_pages(self, begin, end):
        """Fetch and parse list pages ``begin`` .. ``end - 1``.

        Stops at the first page that does not answer with HTTP 200.
        After each parsed page it sleeps for a random time below three
        seconds.
        """
        for page_no in range(begin, end):
            page_url = self.authority + self.pageurl + str(page_no)
            page = self.session.get(page_url)

            if page.status_code != 200:
                print('get page error page num: ' + str(page_no) + ' ,error code: ' + str(page.status_code))
                return

            # Set the response encoding before its text is read.
            page.encoding = 'gbk'
            self.parser_one_page(page)
            print('parser page ' + str(page_no) + ' done!')

            time.sleep(random.random() * 3)

    def parser_one_page(self, index):
        """Parse one fetched page and pass the result to the URL manager."""
        parsed = self.parser.parser(index, index.text)
        self.manager.add_new_urls(parsed)

    def save_2_files(self, filename):
        """Ask the URL manager to save its data to ``filename``."""
        self.manager.save_2_file(filename)

    def Run(self):
        """Crawl pages 1-100 with ten threads, then save to ``dict.txt``.

        Each thread handles ten consecutive pages (1-10, 11-20, ...).
        """
        workers = [
            threading.Thread(
                target=self.parser_some_pages,
                args=(first_page, first_page + 10),
            )
            for first_page in range(1, 101, 10)
        ]

        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        self.save_2_files('dict.txt')

    def load_and_download(self):
        """Download every entry listed in ``dict.txt``.

        The file is read as JSON. Each key becomes the target file name
        (``.txt`` appended, ``?`` replaced by ``!``) and the first element
        of its value is passed to the downloader. A random pause below
        three seconds follows each download.
        """
        with open('dict.txt', 'r') as source:
            entries = json.load(source)

        for title, links in entries.items():
            target_name = (title + '.txt').replace('?', '!')
            self.downloader.download(links[0], target_name)
            print('download done ' + target_name)
            time.sleep(random.random() * 3)
10006: "Odbiór w punkcie po przedpłacie - PACZKA W RUCHu", 10022: "Odbiór w punkcie po przedpłacie - Paczkomaty 24/7", 10023: "Odbiór w punkcie po przedpłacie - Allegro Paczkomaty InPost", 10060: "Odbiór w punkcie po przedpłacie - Paczka24 Odbiór w Punkcie", 10061: "Odbiór w punkcie po przedpłacie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie", 20006: "Odbiór w punkcie - PACZKA W RUCHu", 20022: "Odbiór w punkcie - Paczkomaty 24/7", 20023: "Odbiór w punkcie - Allegro Paczkomaty InPost", 20060: "Odbiór w punkcie - Paczka24 Odbiór w Punkcie", 20061: "Odbiór w punkcie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie" } client = Client(wsdl) url = "http://allegro.pl/listing/listing.php?generalDelivery_rec=1&vat_invoice=1&standard_allegro=1&startingTime=7&buyNew=1&offerTypeBuyNow=1&order=d&price_to=1000&string=samsung&bmatch=engagement-v6-promo-sm-sqm-fall-ele-1-1-1214&city=Pozna%C5%84" url_parser = UrlParser() params = url_parser.parse(url) alle_options_parser = AlleOptions(client) options = alle_options_parser.get_options(params) api_methods = ApiMethods(client) api_version = api_methods.get_version() session = api_methods.get_session(api_version) items = api_methods.get_items_list(options) item_ids = [x.itemId for x in items] calculated_items = []
31: "Paczka48", 10006: "Odbiór w punkcie po przedpłacie - PACZKA W RUCHu", 10022: "Odbiór w punkcie po przedpłacie - Paczkomaty 24/7", 10023: "Odbiór w punkcie po przedpłacie - Allegro Paczkomaty InPost", 10060: "Odbiór w punkcie po przedpłacie - Paczka24 Odbiór w Punkcie", 10061: "Odbiór w punkcie po przedpłacie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie", 20006: "Odbiór w punkcie - PACZKA W RUCHu", 20022: "Odbiór w punkcie - Paczkomaty 24/7", 20023: "Odbiór w punkcie - Allegro Paczkomaty InPost", 20060: "Odbiór w punkcie - Paczka24 Odbiór w Punkcie", 20061: "Odbiór w punkcie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie" } client = Client(wsdl) url = "http://allegro.pl/listing/listing.php?generalDelivery_rec=1&vat_invoice=1&standard_allegro=1&startingTime=7&buyNew=1&offerTypeBuyNow=1&order=d&price_to=1000&string=samsung&bmatch=engagement-v6-promo-sm-sqm-fall-ele-1-1-1214&city=Pozna%C5%84" url_parser = UrlParser() params = url_parser.parse(url) alle_options_parser = AlleOptions(client) options = alle_options_parser.get_options(params) api_methods = ApiMethods(client) api_version = api_methods.get_version() session = api_methods.get_session(api_version) items = api_methods.get_items_list(options) item_ids = [x.itemId for x in items] calculated_items = []
def test_parse(self):
    """``UrlParser.parse`` turns a full listing URL into the option dict."""
    listing_url = "http://allegro.pl/listing/listing.php?order=d&string=asd&bmatch=engagement-v6-promo-sm-sqm-dyn-v2-aut-1-1-1120&buyNew=1&offerTypeBuyNow=1&price_from=1&price_to=2&city=test&startingTime=6&state=9&standard_allegro=1&freeReturn=1&freeShipping=1&personal_rec=1&vat_invoice=1&generalDelivery_rec=1"

    # Expected parse result for the query string above.
    expected = {
        'search': 'asd',
        'offerType': ['buyNow'],
        'condition': ['new'],
        'price': {'min': '1', 'max': '2'},
        'city': 'test',
        'state': '9',
        'startingTime': '12h',
        'offerOptions': [
            'freeReturn',
            'freeShipping',
            'personalReceipt',
            'vatInvoice',
            'generalDelivery',
            'standardAllegro',
        ],
    }

    actual = UrlParser().parse(listing_url)
    self.assertEqual(actual, expected)