Esempio n. 1
0
    def __init__(self, username='', password=''):
        """Set up endpoints, login form, request headers and collaborators.

        Args:
            username: Account name placed in the login form. Defaults to
                an empty string, which matches the previous hard-coded value.
            password: Account password placed in the login form. Defaults
                to an empty string, which matches the previous hard-coded
                value.

        The login form is built once here from the given credentials, so
        they must be supplied at construction time; changing
        ``self.username`` / ``self.password`` afterwards does not update
        ``self.formdata``.
        """
        self.authority = r'https://www.wenku8.net'
        self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        self.pageurl = r"/modules/article/articlelist.php?page="
        self.username = username
        self.password = password

        # Fields posted to ``loginurl``; the 'submit' value is sent exactly
        # as written (it is already percent-encoded).
        self.formdata = {
            'username': self.username,
            'password': self.password,
            'usecookie': '0',
            'action': r'login',
            'submit': r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B',
        }

        # Headers sent together with the login request.
        self.headers = {
            'origin': r'https://www.wenku8.net',
            'referer': r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php',
            'upgrade-insecure-requests': '1',
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

        # One HTTP session is created here and also handed to the parser.
        self.session = requests.Session()
        self.manager = UrlManager()
        self.downloader = UrlDownloader()
        self.parser = UrlParser(self.session)
Esempio n. 2
0
    def create_book(command, counter):
        """Run the fetch step for ``command`` and build e-book ``counter``.

        Args:
            command: Command handed to ``UrlParser.get_task`` to obtain the
                task package.
            counter: Ordinal of the e-book, used only in the log message.

        Returns:
            The first three entries of the collection returned by
            ``Book.create`` joined with ``'-'``, or a fallback message when
            the book list is empty or ``Book.create`` returns ``None``.
        """
        Path.reset_path()

        log = Debug.logger
        log.info(u"Ready to make No.{} e-book".format(counter))
        log.info(u"Analyzes {} ".format(command))

        # Parse the command into a task package.
        package = UrlParser.get_task(command)

        # Run the fetching workers only when there is something to fetch.
        if not package.is_work_list_empty():
            worker_factory(package.work_list)
            log.info(u"Complete fetching from web")

        nothing_produced = u"Oops! no epub file produced"
        if package.is_book_list_empty():
            return nothing_produced

        log.info(u"Start generating e-book from the database")
        produced_names = Book(package.book_list).create()
        if produced_names is None:
            return nothing_produced

        # At most three names are used for the combined file name.
        return '-'.join(list(produced_names)[0:3])
Esempio n. 3
0
class Wenku():
    """Crawler that lists article pages on wenku8.net and downloads them.

    Page parsing, URL bookkeeping and downloading are delegated to the
    ``UrlParser``, ``UrlManager`` and ``UrlDownloader`` collaborators
    created in ``__init__``.
    """

    def __init__(self, username='', password=''):
        """Set up endpoints, login form, request headers and collaborators.

        Args:
            username: Account name placed in the login form. Defaults to
                an empty string, which matches the previous hard-coded value.
            password: Account password placed in the login form. Defaults
                to an empty string, which matches the previous hard-coded
                value.

        The login form is built once here, so credentials must be supplied
        at construction time; changing ``self.username`` /
        ``self.password`` afterwards does not update ``self.formdata``.
        """
        self.authority = r'https://www.wenku8.net'
        self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        self.pageurl = r"/modules/article/articlelist.php?page="
        self.username = username
        self.password = password

        self.formdata = self._build_login_form(username, password)

        # Headers sent together with the login request.
        self.headers = {
            'origin': r'https://www.wenku8.net',
            'referer': r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php',
            'upgrade-insecure-requests': '1',
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

        # One HTTP session is created here and also handed to the parser.
        self.session = requests.Session()
        self.manager = UrlManager()
        self.downloader = UrlDownloader()
        self.parser = UrlParser(self.session)

    @staticmethod
    def _build_login_form(username, password):
        """Return the form fields posted to the login URL.

        The 'submit' value is sent exactly as written (it is already
        percent-encoded).
        """
        return {
            'username': username,
            'password': password,
            'usecookie': '0',
            'action': r'login',
            'submit': r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B',
        }

    def login(self):
        """Post the login form; return True when the response status is 200."""
        response = self.session.post(self.loginurl,
                                     data=self.formdata,
                                     headers=self.headers)
        return response.status_code == 200

    def parser_some_pages(self, begin, end):
        """Fetch and parse list pages ``begin`` .. ``end - 1``.

        Stops at the first page whose status code is not 200 (after
        printing the page number and status code). Sleeps a random time of
        up to three seconds after each parsed page.
        """
        for turn in range(begin, end):
            index = self.session.get(self.authority + self.pageurl + str(turn))
            if index.status_code != 200:
                print('get page error page num: ' + str(turn) +
                      ' ,error code: ' + str(index.status_code))
                return
            # The response text is decoded as GBK before parsing.
            index.encoding = 'gbk'
            self.parser_one_page(index)
            print('parser page ' + str(turn) + ' done!')
            time.sleep(random.random() * 3)

    def parser_one_page(self, index):
        """Parse one list-page response and register the result with the manager."""
        data = self.parser.parser(index, index.text)
        self.manager.add_new_urls(data)

    def save_2_files(self, filename):
        """Ask the URL manager to save its collected data to ``filename``."""
        self.manager.save_2_file(filename)

    def Run(self, filename='dict.txt'):
        """Parse pages 1..100 with ten threads, then save the result.

        Thread ``i`` (0..9) handles the ten pages starting at
        ``1 + i * 10``. All threads are joined before saving.

        Args:
            filename: File the collected data is saved to. Defaults to
                ``'dict.txt'``, the name that was previously hard-coded.
        """
        thread_pool = [
            threading.Thread(target=self.parser_some_pages,
                             args=(1 + i * 10, 1 + (i + 1) * 10))
            for i in range(0, 10)
        ]
        for t in thread_pool:
            t.start()
        for t in thread_pool:
            t.join()
        self.save_2_files(filename)

    def load_and_download(self, filename='dict.txt'):
        """Load the saved JSON mapping and download every entry.

        For each key/value pair the first element of the value is passed
        to the downloader, together with a file name built from the key
        plus ``'.txt'`` with every ``'?'`` replaced by ``'!'``. Sleeps a
        random time of up to three seconds after each download.

        Args:
            filename: JSON file to read. Defaults to ``'dict.txt'``, the
                name that was previously hard-coded.
        """
        with open(filename, 'r') as f:
            urls = json.load(fp=f)
        for k, v in urls.items():
            name = (k + '.txt').replace('?', '!')
            self.downloader.download(v[0], name)
            print('download done ' + name)
            time.sleep(random.random() * 3)
Esempio n. 4
0
    10006: "Odbiór w punkcie po przedpłacie - PACZKA W RUCHu",
    10022: "Odbiór w punkcie po przedpłacie - Paczkomaty 24/7",
    10023: "Odbiór w punkcie po przedpłacie - Allegro Paczkomaty InPost",
    10060: "Odbiór w punkcie po przedpłacie - Paczka24 Odbiór w Punkcie",
    10061:
    "Odbiór w punkcie po przedpłacie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie",
    20006: "Odbiór w punkcie - PACZKA W RUCHu",
    20022: "Odbiór w punkcie - Paczkomaty 24/7",
    20023: "Odbiór w punkcie - Allegro Paczkomaty InPost",
    20060: "Odbiór w punkcie - Paczka24 Odbiór w Punkcie",
    20061: "Odbiór w punkcie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie"
}

# Build the API client from `wsdl`, which is defined outside this excerpt.
client = Client(wsdl)
# Listing URL whose query string is parsed into search parameters below.
url = "http://allegro.pl/listing/listing.php?generalDelivery_rec=1&vat_invoice=1&standard_allegro=1&startingTime=7&buyNew=1&offerTypeBuyNow=1&order=d&price_to=1000&string=samsung&bmatch=engagement-v6-promo-sm-sqm-fall-ele-1-1-1214&city=Pozna%C5%84"
url_parser = UrlParser()
params = url_parser.parse(url)

# Turn the parsed URL parameters into the options passed to the item query.
alle_options_parser = AlleOptions(client)
options = alle_options_parser.get_options(params)

api_methods = ApiMethods(client)

# NOTE(review): `session` is not referenced again in this excerpt; presumably
# get_session is needed for its side effect before querying items — confirm.
api_version = api_methods.get_version()
session = api_methods.get_session(api_version)

items = api_methods.get_items_list(options)

# Collect the id of every returned item; `calculated_items` starts empty and
# is presumably filled by code after this excerpt.
item_ids = [x.itemId for x in items]
calculated_items = []
Esempio n. 5
0
    31: "Paczka48",
    10006: "Odbiór w punkcie po przedpłacie - PACZKA W RUCHu",
    10022: "Odbiór w punkcie po przedpłacie - Paczkomaty 24/7",
    10023: "Odbiór w punkcie po przedpłacie - Allegro Paczkomaty InPost",
    10060: "Odbiór w punkcie po przedpłacie - Paczka24 Odbiór w Punkcie",
    10061: "Odbiór w punkcie po przedpłacie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie",
    20006: "Odbiór w punkcie - PACZKA W RUCHu",
    20022: "Odbiór w punkcie - Paczkomaty 24/7",
    20023: "Odbiór w punkcie - Allegro Paczkomaty InPost",
    20060: "Odbiór w punkcie - Paczka24 Odbiór w Punkcie",
    20061: "Odbiór w punkcie - E-PRZESYŁKA / Paczka48 Odbiór w Punkcie"
}

# Build the API client from `wsdl`, which is defined outside this excerpt.
client = Client(wsdl)
# Listing URL whose query string is parsed into search parameters below.
url = "http://allegro.pl/listing/listing.php?generalDelivery_rec=1&vat_invoice=1&standard_allegro=1&startingTime=7&buyNew=1&offerTypeBuyNow=1&order=d&price_to=1000&string=samsung&bmatch=engagement-v6-promo-sm-sqm-fall-ele-1-1-1214&city=Pozna%C5%84"
url_parser = UrlParser()
params = url_parser.parse(url)

# Turn the parsed URL parameters into the options passed to the item query.
alle_options_parser = AlleOptions(client)
options = alle_options_parser.get_options(params)

api_methods = ApiMethods(client)

# NOTE(review): `session` is not referenced again in this excerpt; presumably
# get_session is needed for its side effect before querying items — confirm.
api_version = api_methods.get_version()
session = api_methods.get_session(api_version)

items = api_methods.get_items_list(options)

# Collect the id of every returned item; `calculated_items` starts empty and
# is presumably filled by code after this excerpt.
item_ids = [x.itemId for x in items]
calculated_items = []
 def test_parse(self):
     """Check that a full listing URL is parsed into the expected parameters."""
     listing_url = "http://allegro.pl/listing/listing.php?order=d&string=asd&bmatch=engagement-v6-promo-sm-sqm-dyn-v2-aut-1-1-1120&buyNew=1&offerTypeBuyNow=1&price_from=1&price_to=2&city=test&startingTime=6&state=9&standard_allegro=1&freeReturn=1&freeShipping=1&personal_rec=1&vat_invoice=1&generalDelivery_rec=1"
     expected = {
         'search': 'asd',
         'offerType': ['buyNow'],
         'condition': ['new'],
         'price': {'min': '1', 'max': '2'},
         'city': 'test',
         'state': '9',
         'startingTime': '12h',
         'offerOptions': [
             'freeReturn',
             'freeShipping',
             'personalReceipt',
             'vatInvoice',
             'generalDelivery',
             'standardAllegro',
         ],
     }
     parsed = UrlParser().parse(listing_url)
     self.assertEqual(parsed, expected)