Exemple #1
0
class Model_Scraper_Keywords_First(Model_Scraper_Standard):
    """Scrape only the first Amazon search-result page for a keyword query."""

    def __init__(self, region):
        super(Model_Scraper_Keywords_First, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Keywords_First', region)

    def scraper(self, keywords):
        """Fetch page 1 of the search results for *keywords* and parse it.

        Returns:
            The processor's result when it is truthy.
            None when the fetch returned None, or when the processor
            produced a falsy result.
            False when the fetch raised, returned some other falsy value,
            or the processor raised.
        """
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
        print(url)
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            # Bail out explicitly instead of leaving `content` unbound and
            # relying on a later NameError being swallowed.
            print(err)
            return False
        if content is None:
            return None
        if not content:
            return False
        try:
            # Parsing code goes here.
            result = self.processor.process(content)
        except Exception as err:
            # `except Exception` (not a bare except) so KeyboardInterrupt
            # and SystemExit still propagate.
            print(err)
            return False
        return result if result else None
Exemple #2
0
class Model_Scraper_Product_Base(Model_Scraper_Standard):
    """Scrape the Amazon product detail page (/dp/<asin>) for one ASIN."""

    def __init__(self, region):
        super(Model_Scraper_Product_Base, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Product_Base', region)

    def scrape(self, asin):
        """Fetch the product page for *asin* and parse it.

        Returns:
            The processor's result when it is truthy.
            None when the fetch returned None.
            False when the fetch raised or returned some other falsy value,
            when the processor raised, or when it produced a falsy result.
        """
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/dp/" + asin + "?th=1&psc=1"
        print(url)
        try:
            content = self.process.process(url)
        except Exception as err:
            # Bail out explicitly instead of leaving `content` unbound and
            # relying on a later NameError being swallowed.
            print(err)
            return False
        if content is None:
            return None
        if not content:
            return False
        try:
            # Parsing code.
            data = self.processor.process(content.encode('utf-8'))
        except Exception as err:
            # `except Exception` (not a bare except) so KeyboardInterrupt
            # and SystemExit still propagate.
            print(err)
            return False
        return data if data else False
Exemple #3
0
class Model_Scraper_TopReviewer(Model_Scraper_Standard):
    """Scrape a range of Amazon top-reviewer ranking pages."""

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'TopReviewer', region)

    def scrape(self, begin, end):
        """Scrape ranking pages ``begin`` up to (but not including) ``end``.

        Args:
            begin: first page number; a non-negative int or digit string.
            end: page number to stop before; same accepted forms.

        Returns:
            A list with one processor result per page that yielded data,
            ``Model_Static_Scrape_Status.FAILED`` when the arguments are not
            non-negative integers or ``begin > end``, or None when no page
            produced data.
        """
        # Reject the input when ANY check fails. The original chained the
        # checks with `and`, so invalid input was almost never rejected.
        if not (str(begin).isdigit() and str(end).isdigit()):
            return Model_Static_Scrape_Status.FAILED
        # Normalise so digit strings work with range() and the rank maths.
        begin, end = int(begin), int(end)
        if begin > end:
            return Model_Static_Scrape_Status.FAILED

        self.process = Model_Scraper_Standard(self.region)
        data = []
        for i in range(begin, end):
            pageUrl = "https://www.amazon." + self.region + "/review/top-reviewers?page=" + str(
                i)
            pageContent = self.process.processTopReviewer(pageUrl)
            if not pageContent:
                continue
            # Rank window passed to the processor: (i*10 - 9) .. (i*10 + 1).
            rankEnd = i * 10
            rankBegin = rankEnd - 9
            pageResult = self.processor.process(pageContent, rankBegin,
                                                rankEnd + 1)
            if pageResult:
                # Collect the per-page results (one entry per page).
                data.append(pageResult)
        if data:
            return data
        return None
Exemple #4
0
class Model_Scraper_Product_Offer1(Model_Scraper_Standard):
    """Fetch and parse the offer listing pages of an Amazon product."""

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Product_Offer', region)

    def process(self, asin):
        """Fetch the first offer page for *asin*.

        Returns the fetched content when it is truthy, otherwise None.
        """
        self.processOffer = Model_Scraper_Standard(self.region)
        content = self.processOffer.processOffer(self.region, asin)
        if content:
            return content
        return None

    # def scrapeInventory(self, data):
    #     if (data == '' or data == None):
    #         return Model_Static_Scrape_Status.FAILED
    #     url ="http://www.amazon."+self.region+"/gp/aws/cart/add.html"
    #     fields = []
    #     session_id = None

    def scrape(self, asin):
        """Fetch every offer page for *asin*, parse each, and print the results.

        Nothing is returned; fetched content and parsed results are only
        printed.
        """
        content = self.process(asin)
        if not content:
            # Nothing was fetched: skip parsing and pagination rather than
            # handing None to the processor's getPageCount().
            return
        # Parsing code goes here; the parsed data is meant to drive the
        # inventory scrape afterwards.
        print(content)
        data = self.processor.process(content.encode('utf-8'))
        if data:
            print(data)
            # Compute inventory from the parsed data.
            # Inventory = self.scrapeInventory(asin, data)
            # print (Inventory)
        pageCount = self.processor.getPageCount(content)
        # Convert before comparing so a string or None page count cannot
        # break the comparison.
        if not pageCount or int(pageCount) <= 1:
            return
        for i in range(2, int(pageCount) + 1):
            index = str((i - 1) * 10)
            # Use the configured region; the original hard-coded "com" here
            # while the first page was fetched with self.region.
            pageUrl = "http://www.amazon." + self.region + "/gp/offer-listing/" + asin + "/ref=olpOffersSuppressed?ie=UTF8&f_new=true&overridePriceSuppression=1&startIndex=" + index
            # NOTE(review): processPageOffer is not defined in this class;
            # presumably inherited from Model_Scraper_Standard - confirm.
            pageContent = self.processPageOffer(pageUrl)
            if pageContent:
                print(pageContent)
                pageResult = self.processor.process(
                    pageContent.encode('utf-8'))
                if pageResult:
                    print(pageResult)
Exemple #5
0
class Model_Scraper_Keywords(Model_Scraper_Standard):
    """Scrape up to ``_MAX_PAGES`` Amazon search-result pages for a query."""

    # Upper bound on the number of result pages fetched per query.
    _MAX_PAGES = 5

    def __init__(self, region):
        super(Model_Scraper_Keywords, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor('Keywords', region)

    def scraper(self, keywords):
        """Fetch and parse the search-result pages for *keywords*.

        Returns:
            A list holding the first page's result followed by one processor
            result for every later page that was fetched successfully.
            None when the first fetch returned None or the first page parsed
            to a falsy result.
            False when the first fetch raised or returned some other falsy
            value, or when anything raised during parsing or pagination.
        """
        self.process = Model_Scraper_Standard(self.region)
        base_url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page="
        url = base_url + "1"
        # Headless display (kept for reference):
        # with Display(backend="xvfb", size=(1440, 900)):
        print(url)
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            # Bail out explicitly instead of leaving `content` unbound and
            # relying on a later NameError being swallowed.
            print(err)
            return False
        if content is None:
            return None
        if not content:
            return False
        try:
            # Parsing code goes here.
            result = self.processor.process(content.encode('utf-8'), 1)
            if not result:
                return None
            data = [result]
            pagecount = min(int(self.processor.getPageCount(content)),
                            self._MAX_PAGES)
            # range() is empty when pagecount <= 1, so no extra guard needed.
            for i in range(2, pagecount + 1):
                pageurl = base_url + str(i)
                print(pageurl)
                pagecontent = self.process.processkeywords(pageurl)
                if pagecontent:
                    pageresult = self.processor.process(
                        pagecontent.encode('utf-8'), i)
                    data.append(pageresult)
            return data
        except Exception as err:
            # `except Exception` (not a bare except) so KeyboardInterrupt
            # and SystemExit still propagate.
            print(err)
            return False
Exemple #6
0
class Model_Scraper_Sina_Auto():
    """Fetch a page via ``process_auto`` and parse it with the 'Sina_Base' processor."""

    def __init__(self):
        functions = Service_Functions()
        self.processor = functions.getSinaProcessor('Sina_Base')

    def scrape(self, url):
        """Return the processor output for *url*, or None when nothing was fetched."""
        self.process = Model_Scraper_Sina_Standard()
        html = self.process.process_auto(url)
        if not html:
            return None
        return self.processor.process(html)
Exemple #7
0
class Model_Scraper_Mobile_Product_Base(Model_Scraper_Standard):
    """Scrape Amazon search results through the JSON (format=json) endpoint."""

    # Upper bound on the number of result pages fetched per query.
    _MAX_PAGES = 20
    # Fixed query-string tail appended to every request URL.
    _QUERY_TAIL = "&dataVersion=v0.2&cid=08e6b9c8bdfc91895ce634a035f3d00febd36433&format=json"

    def __init__(self , region):
        self.region = region
        self.processor = Service_Functions().getProcessor('MobileProduct_Base', region)

    def _page_url(self, region, page, keywords):
        """Build the search URL for one result page."""
        return "https://www.amazon." + region + "/s?page=" + str(page) + "&keywords=" + keywords + self._QUERY_TAIL

    def scrape(self , region, keywords):
        """Fetch and parse up to ``_MAX_PAGES`` result pages for *keywords*.

        Returns:
            None when the first page could not be fetched.
            An empty list when the first page parsed to nothing.
            Otherwise a list of per-page processor results followed by a
            trailing dict that holds ``'total'`` when the total result count
            could be read (the dict is empty if it could not).
        """
        result = []
        self.process = Model_Scraper_Standard(region)
        content = self.process.mobile_process(self._page_url(region, 1, keywords))
        if not content:
            return None
        # Parsing code.
        data = self.processor.mobile_process(region, content)
        if not data:
            return result
        result.append(data)
        # Always normalise to int: the original converted only for the
        # "> 20" comparison, so a string page count broke range() below.
        page_count = min(int(content['pagination']['numPages']), self._MAX_PAGES)
        # Remember the most recent page that actually came back, so one
        # failed fetch does not lose the result metadata.
        last_content = content
        for k in range(2, page_count + 1):
            try:
                page_content = self.process.mobile_process(
                    self._page_url(region, k, keywords))
                if page_content:
                    last_content = page_content
                result.append(self.processor.mobile_process(region, page_content))
            except Exception as err:
                print (err)
        total = {}
        try:
            total['total'] = last_content['resultsMetadata']['totalResults']
        except Exception as err:
            print (err)
        result.append(total)
        return result
Exemple #8
0
class Model_Scraper_Seller_Product(Model_Scraper_Standard):
    """Scrape the product listing pages of an Amazon seller."""

    def __init__(self, region):
        self.region = region
        functions = Service_Functions()
        self.processor = functions.getProcessor('Seller_Product', region)

    def scrape(self, merchantId):
        """Fetch and parse the listing for *merchantId*.

        Returns False for an empty merchant id, the FAILED status when the
        first page cannot be fetched, the SCRAPED_NO_DATA status when it
        parses to nothing, and otherwise a list of per-page results.
        """
        if not merchantId:
            return False
        first_url = "https://www.amazon." + self.region + "/s?merchant=" + merchantId
        print(first_url)
        content = Model_Scraper_Standard(self.region).processSellerProduct(first_url)
        if not content:
            return Model_Static_DownloadQueue_Status().FAILED
        first_result = self.processor.process(content)
        if not first_result:
            return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA

        data = [first_result]
        pagecount = int(self.processor.getPageCount(content))
        # NOTE(review): test override left in place - it forces a single page,
        # so the pagination loop below never runs. Confirm before removing.
        pagecount = 1
        # Cap at 50 pages; the range is empty whenever pagecount <= 1.
        last_page = min(pagecount, 50)
        for page_no in range(2, last_page + 1):
            page_url = first_url + "&page=" + str(page_no)
            print(page_url)
            fetcher = Model_Scraper_Standard(self.region)
            page_content = fetcher.processSellerProduct(page_url)
            if page_content:
                page_result = self.processor.process(page_content)
                if page_result:
                    data.append(page_result)
        return data
Exemple #9
0
class Model_Scraper_Seller_Base(Model_Scraper_Standard):
    """Scrape the Amazon seller profile page (/gp/aag/main) for one seller."""

    def __init__(self, region):
        self.region = region
        functions = Service_Functions()
        self.processor = functions.getProcessor('Seller_Base', region)

    def scrape(self, merchantId):
        """Fetch and parse the profile page for *merchantId*.

        Returns False for an empty merchant id, the FAILED status when the
        page cannot be fetched, the SCRAPED_NO_DATA status when it parses to
        nothing, and otherwise the processor's result.
        """
        if not merchantId:
            return False
        seller_url = "http://www.amazon." + self.region + "/gp/aag/main?seller=" + merchantId
        fetcher = Model_Scraper_Standard(self.region)
        content = fetcher.processSeller(seller_url)
        if not content:
            return Model_Static_DownloadQueue_Status().FAILED
        parsed = self.processor.process(content)
        if not parsed:
            return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
        return parsed
Exemple #10
0
class Model_Scraper_Sina_Base():
    """Fetch a page and parse it with the 'Sina_Base' processor."""

    def __init__(self):
        functions = Service_Functions()
        self.processor = functions.getSinaProcessor('Sina_Base')

    def scrape(self, url):
        """Return the processor output for *url*, or None when nothing was fetched."""
        self.process = Model_Scraper_Sina_Standard()
        html = self.process.process(url)
        if not html:
            return None
        return self.processor.process(html)

    def process_content(self, html, url_id, url):
        """Parse *html* with Model_Processor_Sina_Content and tag the result
        with its ``url_id`` and ``url``."""
        parsed = Model_Processor_Sina_Content().process(html)
        parsed['url_id'] = url_id
        parsed['url'] = url
        return parsed
Exemple #11
0
class Model_Scraper_Tencent_Sports():
    """Fetch a page via ``process_gb2312`` and parse it with the 'Tencent_Sports' processor."""

    def __init__(self):
        functions = Service_Functions()
        self.processor = functions.getTencentProcessor('Tencent_Sports')

    def scrape(self, url):
        """Return the processor output for *url*, or None when nothing was fetched."""
        self.process = Model_Scraper_Tencent_Standard()
        html = self.process.process_gb2312(url)
        if not html:
            return None
        return self.processor.process(html)

    def process_content(self, html, url_id, url):
        """Parse *html* with Model_Processor_Tencent_Content and tag the
        result with its ``url_id`` and ``url``."""
        parsed = Model_Processor_Tencent_Content().process(html)
        parsed['url_id'] = url_id
        parsed['url'] = url
        return parsed
Exemple #12
0
 def __init__(self):
     """Look up the 'Sina_Base' processor through Service_Functions."""
     functions = Service_Functions()
     self.processor = functions.getSinaProcessor('Sina_Base')
Exemple #13
0
 def __init__(self, region):
     """Initialise the base scraper, store *region*, and look up the
     'Keywords_First' processor for it."""
     super(Model_Scraper_Keywords_First, self).__init__(region)
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('Keywords_First', region)
Exemple #14
0
 def __init__(self , region):
     """Store *region* and look up the 'MobileProduct_Base' processor for it."""
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('MobileProduct_Base', region)
Exemple #15
0
 def __init__(self):
     """Look up the 'Tencent_Sports' processor through Service_Functions."""
     functions = Service_Functions()
     self.processor = functions.getTencentProcessor('Tencent_Sports')
Exemple #16
0
 def __init__(self, region):
     """Store *region* and look up the 'TopReviewer' processor for it."""
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('TopReviewer', region)
Exemple #17
0
 def __init__(self, region):
     """Initialise the base scraper, store *region*, and look up the
     'Product_Base' processor for it."""
     super(Model_Scraper_Product_Base, self).__init__(region)
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('Product_Base', region)
Exemple #18
0
class Model_Scraper_Product_Review(Model_Scraper_Standard):
    """Scrape the review pages of an Amazon product, most recent first."""

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Product_Review', region)

    def scrape(self, asin, scrapedCount):
        """Fetch and parse the review pages of *asin* not yet scraped.

        Args:
            asin: product identifier inserted into the review URL.
            scrapedCount: number of reviews already scraped; every 10
                reviews count as one page that is skipped.

        Returns:
            None when the first fetch returned None; False when it returned
            some other falsy value.
            ``Model_Static_Scrape_Status.SUCCESS_NO_DATA`` when neither a
            summary nor any review items could be parsed.
            Otherwise a dict with ``'list'`` (per-page results),
            ``'new_scraped_count'``, and ``'summary'`` when one was parsed.
        """
        self.process = Model_Scraper_Standard(self.region)
        baseUrl = "https://www.amazon." + self.region + "/gp/product-reviews/" + asin + "?sortBy=recent&pageNumber="
        url = baseUrl + "1"
        print(url)
        content = self.process.processReview(url)
        if not content:
            return None if content is None else False

        encoded = content.encode('utf-8')
        data = {}
        items = []
        summary = self.processor.getSummary(encoded)
        if summary:
            data['summary'] = summary
        # Process the first page's data.
        result = self.processor.process(encoded)
        if result:
            items.append(result)

        newScrapedCount = 10
        # Guard on `summary`: the original indexed data['summary']
        # unconditionally and raised KeyError when no summary was parsed.
        pageCount = summary['page_count'] if summary else 0
        if pageCount and pageCount > 0:
            # Pages already scraped (10 reviews per page). Floor division
            # replaces int(floor(x / 10)) with the same result.
            scrapedPageCount = int(scrapedCount) // 10
            # Pages that still need to be scraped.
            pageCount = pageCount - scrapedPageCount
            # NOTE(review): more than 20 remaining pages collapses to 2,
            # not 20 - kept as in the original; confirm this is intended.
            if pageCount > 20:
                pageCount = 2
            if pageCount >= 2:
                newScrapedCount = pageCount * 10
                for i in range(2, pageCount + 1):
                    pageUrl = baseUrl + str(i)
                    print(pageUrl)
                    pageContent = self.process.processReview(pageUrl)
                    if not pageContent:
                        continue
                    # Process this page's data.
                    pageResult = self.processor.process(
                        pageContent.encode("utf-8"))
                    if pageResult:
                        items.append(pageResult)

        # The original tested len(data) > 0 after adding 'list', which is
        # always true and made the no-data branch unreachable. This path
        # previously crashed with KeyError, so no caller relied on it.
        if not summary and not items:
            return Model_Static_Scrape_Status.SUCCESS_NO_DATA
        data['list'] = items
        data['new_scraped_count'] = newScrapedCount
        return data
Exemple #19
0
 def __init__(self, region):
     """Store *region* and look up the 'Product_Offer' processor for it."""
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('Product_Offer', region)
Exemple #20
0
 def __init__(self, region):
     """Store *region* and look up the 'Seller_Product' processor for it."""
     self.region = region
     functions = Service_Functions()
     self.processor = functions.getProcessor('Seller_Product', region)