コード例 #1
0
class OmvicScrapper:
    """Scraper whose output goes to ``omvic.csv`` and the ``omvic`` table of ``omvic.db``.

    Construction builds the helper objects (logger, spider, regex, utils)
    and then calls :meth:`initScrapper` to prepare the CSV/DB outputs.
    """

    # Class-level defaults, kept at class scope as in the original code.
    isFinished = False
    # NOTE(review): this list lives on the class, so it is shared by every
    # instance unless an instance rebinds it -- confirm that is intended.
    im_data = []

    def __init__(self):
        """Create the helper objects and prepare the output files."""
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.initScrapper()

    def initScrapper(self):
        """Open the CSV/DB outputs and write the CSV header if none exists yet.

        The existing ``omvic.csv`` rows are read *before* the writer is
        opened on the same file; the header row is written only when that
        read returned nothing.  Setup is best-effort: any exception is
        printed and swallowed, so attributes assigned after the failing
        statement will be missing on the instance.
        """
        try:
            dupCsvReader = Csv()
            dupCsvRows = dupCsvReader.readCsvRow('omvic.csv')
            self.dbHelper = DbHelper('omvic.db')
            self.dbHelper.createTable('omvic')
            self.totaldata = self.dbHelper.getTotalProduct('omvic')
            self.csvWriter = Csv('omvic.csv')
            csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status', 'Class of Registration', 'Subclass',
                             'Operating Status',
                             'Business Address', 'Email', 'Phone Number', 'Salesperson(s) Names']
            # Truthiness covers both an empty list and a ``None`` result.
            if not dupCsvRows:
                self.csvWriter.writeCsvRow(csvDataHeader)
            # Drop the previously read rows and force a collection so they
            # do not stay resident for the rest of the scrape.
            del dupCsvReader
            del dupCsvRows
            gc.collect()
            del gc.garbage[:]
            gc.collect()
        except Exception as x:
            # ``except ... as`` and ``print(...)`` with a single argument are
            # valid on both Python 2.6+ and Python 3.
            print(x)
コード例 #2
0
class AmazonScrapper(QThread):
    """Worker thread that scrapes the given Amazon listing URLs for one category.

    Progress is reported as HTML snippets through the ``notifyAmazon``
    signal.  Output goes to ``<category>.csv`` and to a table named after
    the category in ``amazon.db``.
    """

    notifyAmazon = pyqtSignal(object)

    def __init__(self, urlList, category):
        """Set up helpers and the CSV/DB outputs.

        Args:
            urlList: iterable of listing URLs to process (may be ``None``).
            category: name used for the CSV file (``category + '.csv'``)
                and for the database table.
        """
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        # Read the existing rows before opening the writer on the same file.
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
        # Only emit the header once; remember it so it counts as a known row.
        if csvDataHeader not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://www.amazon.com'
        self.scrapUrl = None
        self.dbHelper = DbHelper('amazon.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        """Process every URL in ``self.urlList`` under each sort order.

        Args:
            retry: number of times the whole pass has already been
                restarted after an exception.  On failure the pass starts
                again from the first URL, at most five more times.
        """
        sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority',
                    'date-desc-rank']
        try:
            for url in self.urlList or []:
                if not url:
                    continue
                url = self.regex.replaceData('(?i)\r', '', url)
                url = self.regex.replaceData('(?i)\n', '', url)
                self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
                # Try reformatUrl up to four times.  This counter is kept
                # separate from ``retry``: reusing that name here used to
                # reset the restart counter, so the ``retry < 5`` cap in
                # the handler below could never trigger.
                imUrl = None
                attempts = 0
                while imUrl is None and attempts < 4:
                    imUrl = self.reformatUrl(url)
                    attempts += 1
                if imUrl is None:
                    # Fall back to the URL as given.
                    imUrl = url
                self.total = 0
                print('URL: ' + str(imUrl))
                for sort in sortList:
                    self.scrapReformatData(imUrl, sort)
                self.notifyAmazon.emit(
                    '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url)
            self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
        except Exception as x:
            # ``.message`` is deprecated, empty for multi-argument
            # exceptions and absent on Python 3; fall back to ``str(x)``.
            message = getattr(x, 'message', None) or str(x)
            print(message)
            self.logger.error('Exception at run: ', message)
            if retry < 5:
                self.run(retry + 1)
コード例 #3
0
class SaraivaScrapper(QThread):
    """Worker thread that scrapes the given Saraiva listing URLs for one category.

    Progress is reported as HTML snippets through the ``notifySaraiva``
    signal.  Output goes to ``<category>.csv`` and to a table named after
    the category in ``saraiva.db``.
    """

    notifySaraiva = pyqtSignal(object)

    def __init__(self, urlList, category, htmlTag, replaceTag):
        """Set up helpers, normalise ``htmlTag`` and open the CSV/DB outputs.

        Args:
            urlList: iterable of listing URLs to process (may be ``None``).
            category: name used for the CSV file (``category + '.csv'``)
                and for the database table.
            htmlTag: markup string; stored with carriage returns removed,
                newlines and whitespace runs collapsed to one space, and
                runs of double quotes collapsed to a single quote.
            replaceTag: stored unchanged on the instance.
        """
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
        self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
        # Raw string: same pattern text, without the invalid ``\s`` escape.
        self.htmlTag = self.regex.replaceData(r'\s+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
        self.replaceTag = replaceTag
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture']
        # NOTE(review): the header is written unconditionally here; whether
        # that duplicates it on an existing file depends on how Csv opens
        # the file -- confirm.
        self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://busca.livrariasaraiva.com.br'
        self.scrapUrl = None
        self.dbHelper = DbHelper('saraiva.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        """Process every URL in ``self.urlList`` under each sort order.

        Args:
            retry: number of times the whole pass has already been
                restarted after an exception.  On failure the pass starts
                again from the first URL, at most five more times.
        """
        sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev',
                    '&isort=price+rev',
                    '&isort=price', '&isort=date+rev']
        try:
            for url in self.urlList or []:
                if not url:
                    continue
                url = self.regex.replaceData('(?i)\r', '', url)
                url = self.regex.replaceData('(?i)\n', '', url)
                self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
                paginationUrl, self.maxRecords = self.reformatUrl(url)
                self.notifySaraiva.emit(
                    '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
                # Two spaces after the colon reproduce the output of the
                # original two-item Python 2 print statement.
                print('Max records:  %s' % (self.maxRecords,))
                print('URL: ' + str(paginationUrl))
                for sort in sortList:
                    self.scrapResults(paginationUrl, sort)
            self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
        except Exception as x:
            # ``.message`` is deprecated, empty for multi-argument
            # exceptions and absent on Python 3; fall back to ``str(x)``.
            message = getattr(x, 'message', None) or str(x)
            print(message)
            self.logger.error('Exception at run: ', message)
            if retry < 5:
                self.run(retry + 1)