class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None
        self.mycookie = None

    def login(self, url, loginInfo, retry=0, proxy=None):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        conn = ('Connection', 'keep-alive')
        ac = ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ln = ('Accept-Language', 'en-us,en;q=0.5')
        if proxy is None:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler())
        else:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy)
        urllib2.install_opener(self.opener)
        try:
            return self.opener.open(url, urllib.urlencode(loginInfo)).read()
        except Exception, x:
            print x.message
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                # Return the retry result (and keep the proxy) instead of falling through to None.
                return self.login(url, loginInfo, retry + 1, proxy)
        return None
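# A rough sketch: the Spider classes in this listing call self.createOpener()
# and self.createCookieJarHandler(), which are not shown here. Assuming they are
# thin wrappers around urllib2/cookielib, an equivalent could look like the
# mixin below; the method names, the (name, value) header tuples and the
# optional proxy argument are inferred from the call sites, not taken from the
# original source.
import cookielib
import urllib2


class OpenerHelpers(object):
    def createCookieJarHandler(self):
        # Keep the jar on the instance so cookies survive across requests.
        self.mycookie = cookielib.LWPCookieJar()
        return urllib2.HTTPCookieProcessor(self.mycookie)

    def createOpener(self, headers, cookieHandler, proxy=None):
        handlers = [cookieHandler]
        if proxy is not None:
            # proxy is assumed to be a 'host:port' string.
            handlers.append(urllib2.ProxyHandler({'http': proxy}))
        opener = urllib2.build_opener(*handlers)
        opener.addheaders = headers  # e.g. [config.USER_AGENT, ('Connection', 'keep-alive')]
        return opener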
class Browser:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.browser = None
        self.browserCookieJar = None

    def browserLogin(self, url, loginParams, formId=None, saveCookie=False, retry=0):
        """
        Login page just like web browser
        url = '' Ex. http://www.example.com
        loginParams = {} Ex. {'user': '******', 'pass': '******'}
        """

        try:
            self.browser = self.createBrowser([config.USER_AGENT])
            self.browser.open(url, timeout=config.TIMEOUT)
            if formId is not None:
                self.browser.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == formId)
            else:
                self.browser.select_form(nr=0)
            for key in loginParams:
                self.browser.form[key] = loginParams[key]
            self.browser.submit()
            if saveCookie:
                self.browserCookieJar.save(config.COOKIE_FILE)
            return self.browser.response().read()
        except Exception, x:
            self.logger.error(x)
            if retry < config.RETRY_COUNT:
                return self.browserLogin(url, loginParams, formId, saveCookie, retry + 1)
        return None
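# A rough usage sketch for Browser.browserLogin() above. createBrowser() is not
# shown in this listing; it presumably returns a mechanize.Browser configured
# with the given headers. The URL, form id and credentials below are purely
# illustrative placeholders.
browser = Browser()
html = browser.browserLogin('http://www.example.com/login',
                            {'user': 'someuser', 'pass': 'somepass'},
                            formId='login-form')
if html is not None:
    print 'Logged in, received %d bytes of HTML' % len(html)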
# Example 3
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None
        self.mycookie = None

    def login(self, url, loginInfo, retry=0, proxy=None):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        conn = ('Connection', 'keep-alive')
        ac = (
            'Accept',
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ln = ('Accept-Language', 'en-us,en;q=0.5')
        if proxy is None:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln],
                                            self.createCookieJarHandler())
        else:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln],
                                            self.createCookieJarHandler(),
                                            proxy)
        urllib2.install_opener(self.opener)
        try:
            return self.opener.open(url, urllib.urlencode(loginInfo)).read()
        except Exception, x:
            print x.message
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                return self.login(url, loginInfo, retry + 1, proxy)
        return None
class GoogleFinanceScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'https://www.google.com/finance?'
        self.main_url = 'https://www.google.com'
        self.csvWriter = Csv('google_finance.csv')
        csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                params = urllib.urlencode({'q': line})
                url = self.url + params
                self.scrapBykeyword(url, line)
        except Exception, x:
            print x
            self.logger.error('Error: ' + x.message)
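# A rough usage sketch for GoogleFinanceScrapper above: the constructor takes
# the name of a text file holding one ticker symbol per line ('tickers.txt' is
# an illustrative placeholder, not from the original source).
scrapper = GoogleFinanceScrapper('tickers.txt')
scrapper.run()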
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None
        self.mycookie = None

    def login(self, url, loginInfo, retry=0):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        host = ("Host", "www.vizury.com")
        conn = ("Connection", "keep-alive")
        enc = ("Accept-Encoding", "gzip, deflate")
        ac = ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        ln = ("Accept-Language", "en-us,en;q=0.5")
        self.opener = self.createOpener([config.USER_AGENT, conn, enc, ac, ln, host], self.createCookieJarHandler())
        urllib2.install_opener(self.opener)
        try:
            return self.opener.open(url, urllib.urlencode(loginInfo)).read()
        except Exception, x:
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                return self.login(url, loginInfo, retry + 1)
        return None
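# A rough sketch: the Regex helper used throughout this listing (replaceData,
# reduceNewLine, reduceBlankSpace, getSearchedData, getAllSearchedData,
# isFoundPattern) is not shown. Judging from the call sites it is a thin
# wrapper around the re module; an equivalent might look like this, where the
# exact semantics of each method are assumptions.
import re


class Regex:
    def replaceData(self, pattern, replacement, data):
        return re.sub(pattern, replacement, data)

    def reduceNewLine(self, data):
        # Collapse runs of newlines into a single space.
        return re.sub(r'\n+', ' ', data)

    def reduceBlankSpace(self, data):
        # Collapse runs of whitespace into a single space.
        return re.sub(r'\s+', ' ', data)

    def getSearchedData(self, pattern, data):
        # First captured group of the first match, or None.
        match = re.search(pattern, data)
        return match.group(1) if match else None

    def getAllSearchedData(self, pattern, data):
        # All matches; tuples when the pattern has several groups.
        return re.findall(pattern, data)

    def isFoundPattern(self, pattern, data):
        return re.search(pattern, data) is not None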
class AmazonScrapper(QThread):
    notifyAmazon = pyqtSignal(object)

    def __init__(self, urlList, category):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
        if csvDataHeader not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://www.amazon.com'
        self.scrapUrl = None
        self.dbHelper = DbHelper('amazon.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            # self.scrapProductDetail(
            #     'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544')
            # return
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
                        imUrl = None
                        retry = 0
                        while imUrl is None and retry < 4:
                            imUrl = self.reformatUrl(url)
                            retry += 1
                        if imUrl is None:
                            imUrl = url
                        self.total = 0
                        print 'URL: ' + str(imUrl)
                        sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority',
                                    'date-desc-rank']
                        for sort in sortList:
                            self.scrapReformatData(imUrl, sort)
                        self.notifyAmazon.emit(
                            '<font color=red><b>Finished scraping Amazon Main URL: %s</b></font><br /><br />' % url)
            self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
        except Exception, x:
            print x.message
            self.logger.error('Exception at run: ' + x.message)
            if retry < 5:
                self.run(retry + 1)
class NisbetProduct(QtCore.QThread):
    scrapProductData = QtCore.pyqtSignal(object)
    stopThread = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.isExiting = False
        self.totalProducts = 0
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
        self.csvWriter = Csv('nisbets.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand',
                         'Product Price', 'Product Short Description',
                         'Product Long Description', 'Image File Name', 'User Manual File Name',
                         'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status',
                         'Category1', 'Category2', 'Category3', 'Category4']
        if 'URL' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvHeaderList)
            self.dupCsvRows.append(csvHeaderList[0])

        self.utils = Utils()

    def run(self):
        self.scrapData()

    def stop(self):
        self.isExiting = True

    def scrapData(self):
        if self.isExiting: return
        self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(str(data).strip()) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
            if category1Chunk and len(str(category1Chunk).strip()) > 0:
                i = 0
                for category1Data in category1Chunk:
                    category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
                    category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>',
                                                                   category1Data)
                    if category2Chunk and len(str(category2Chunk).strip()) > 0:
                        for category2Data in category2Chunk:
                            try:
                                self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1])
                            except Exception, x:
                                self.logger.error(x)
        self.scrapProductData.emit('<font color=red><b>Finished scraping product data from %s</b></font>' % self.mainUrl)
# Example 8
class Csv:
    def __init__(self, fileName=None):
        self.logger = LogManager(__name__)
        if fileName is not None:
            self.writer = UnicodeWriter(open(fileName, 'wb'), quoting=csv.QUOTE_ALL)

    def writeCsvRow(self, data):
        try:
            self.writer.writerow(data)
        except Exception, x:
            self.logger.error(x)
# Example 9
class Csv:
    def __init__(self, fileName=None):
        self.logger = LogManager(__name__)
        if fileName is not None:
            self.writer = csv.writer(open(fileName, 'ab'))

    def writeCsvRow(self, data):
        try:
            self.writer.writerow(data)
        except Exception, x:
            self.logger.error(x)
# Example 10
class Csv:
    def __init__(self, fileName=None):
        self.logger = LogManager(__name__)
        if fileName is not None:
            self.writer = csv.writer(open(fileName, 'ab'))

    def writeCsvRow(self, data):
        try:
            self.writer.writerow(data)
        except Exception, x:
            self.logger.error(x)
# Example 11
class Csv:
    def __init__(self, fileName=None):
        self.logger = LogManager(__name__)
        if fileName is not None:
            self.file = open(fileName, 'wb')
            self.writer = UnicodeWriter(self.file, quoting=csv.QUOTE_ALL)

    def writeCsvRow(self, data):
        try:
            self.writer.writerow(data)
        except Exception, x:
            self.logger.error(x)
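# A rough usage sketch for the Csv wrappers above ('output.csv' and the row
# values are illustrative placeholders; closeWriter(), called elsewhere in this
# listing, is assumed to flush and close the underlying file).
writer = Csv('output.csv')
writer.writeCsvRow(['SKU', 'Title', 'Price'])
writer.writeCsvRow(['ABC-123', 'Example product', '9.99'])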
class Utils:
    def __init__(self):
        self.logger = LogManager(__name__)

    def downloadFile(self, url, savePath):
        try:
            directory = os.path.dirname(savePath)
            if directory and not os.path.exists(directory):
                os.makedirs(directory)
            webFile = urllib2.urlopen(url)
            localFile = open(savePath, 'wb')
            localFile.write(webFile.read())
            # Close both handles so the download is flushed to disk.
            localFile.close()
            webFile.close()
        except Exception, x:
            self.logger.error(x)
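# A rough usage sketch for Utils.downloadFile() above (the URL and save path
# are illustrative placeholders):
utils = Utils()
utils.downloadFile('http://www.example.com/images/product.jpg',
                   'images/product.jpg')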
# Example 13
class Utils:
    def __init__(self):
        self.logger = LogManager(__name__)

    def downloadFile(self, url, savePath):
        try:
            directory = os.path.dirname(savePath)
            if directory and not os.path.exists(directory):
                os.makedirs(directory)
            webFile = urllib2.urlopen(url)
            localFile = open(savePath, 'wb')
            localFile.write(webFile.read())
            # Close both handles so the download is flushed to disk.
            localFile.close()
            webFile.close()
        except Exception, x:
            self.logger.error(x)
class SaraivaScrapper(QThread):
    notifySaraiva = pyqtSignal(object)

    def __init__(self, urlList, category, htmlTag, replaceTag):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
        self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData('\s+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
        self.replaceTag = replaceTag
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture']
        self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://busca.livrariasaraiva.com.br'
        self.scrapUrl = None
        self.dbHelper = DbHelper('saraiva.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
                        paginationUrl, self.maxRecords = self.reformatUrl(url)
                        self.notifySaraiva.emit(
                            '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
                        print 'Max records: ', self.maxRecords
                        print 'URL: ' + str(paginationUrl)
                        sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev',
                                    '&isort=price+rev',
                                    '&isort=price', '&isort=date+rev']
                        for sort in sortList:
                            self.scrapResults(paginationUrl, sort)
            self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
        except Exception, x:
            print x.message
            self.logger.error('Exception at run: ' + x.message)
            if retry < 5:
                self.run(retry + 1)
# Example 15
class PaodeacucarScrapper(QThread):
    notifyPaode = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.paodeacucar.com.br/'
        self.url = 'http://www.paodeacucar.com.br/'
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4)
        self.csvWriter = Csv('paodeacucar.csv')
        csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
                         'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
        if 'URL' not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()

    def scrapData(self):
        try:
            print 'Main URL: ', self.url
            self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url))
            data = self.spider.fetchData(self.url)
            if data and len(data) > 0:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                soup = BeautifulSoup(data)
                categories = soup.find('nav', class_='items-wrapper').find_all('li', class_=re.compile('\s*item\s*'))
                print 'Total Categories: ', len(categories)
                self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories))))
                for category in categories:
                    if category.a is not None:
                        submenu_target = self.regex.replaceData('#', '', category.a.get('data-target'))
                        sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item')
                        print 'Total Sub Categories: ', len(sub_categories)
                        self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>' % str(len(sub_categories))))
                        for sub_category in sub_categories:
                            sub_category_label = sub_category.find('span', class_='label').text
                            sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A'
                            self.scrapItems(sub_category_url, category.text, sub_category_label)
        except Exception, x:
            self.logger.error(x.message)
            print x
class CsTest(QThread):
    notifyProduct = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0)
        self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1)
        self.csvWriter = Csv('cs_product.csv')
        #        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/brands'
        self.utils = Utils()
        if 'Product Code' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(
                ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
                 'Product Short Description', 'Product Long Description', 'Product Technical Specifications',
                 'Warranty', 'Delivery', 'Product Image',
                 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image'])
        self.totalProducts = len(self.dupCsvRows)

    def run(self):
        self.scrapBrands()
        self.notifyProduct.emit('<font color=red><b>Finished Scraping All Brands.</b></font>')

    def scrapBrands(self):
        self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % self.mainUrl)
        self.notifyProduct.emit('<b>Trying to scrape all brands.</b>')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyProduct.emit('<b>Total Brands Found: %s</b>' % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                self.scrapBrandInfo(brand[0], 'Shop By Brand', brand[1])
                            except Exception, x:
                                self.logger.error(x)
class CsBrands(QThread):
    notifyBrand = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow("cs_Brands.csv")
        self.csvWriter = Csv("cs_Brands.csv")
        self.mainUrl = "http://www.cs-catering-equipment.co.uk/brands"
        self.isExiting = False
        headerData = [
            "URL",
            "Parent Category",
            "Brand Category",
            "Brand Description",
            "Image File",
            "Product Codes in this category",
        ]
        if headerData not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(headerData)

    def run(self):
        self.scrapBrands()
        self.notifyBrand.emit("<font color=red><b>Finished Scraping All Brands.</b></font>")

    def scrapBrands(self):
        self.notifyBrand.emit("<font color=green><b>Main URL: %s<b></font>" % self.mainUrl)
        self.notifyBrand.emit("<b>Try To scrap All Brands.<b>")
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyBrand.emit("<b>Total Brands Found: %s<b>" % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                self.scrapBrandInfo(brand[0], "Shop By Brand", brand[1])
                            except Exception, x:
                                self.logger.error(x)
# Example 18
class WebTableScrapper(object):
    def __init__(self):
        self.browser = None
        self.url = "http://environmentclearance.nic.in/Search.aspx"
        self.statuses = []
        self.categories = []
        self.years = []
        self.states = []
        self.csvDataHeader = [
            'Status', 'Category', 'Year', 'State', 'Serial No',
            'Proposal details', 'Location', 'Important Date', 'Category',
            'Company Proponent'
        ]
        self.logger = LogManager(__name__)
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv')
        self.csvWriter = Csv('env_clearance.csv')
        if self.csvDataHeader not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(self.csvDataHeader)
            self.dupCsvRows.append(self.csvDataHeader)

    def scrapData(self):
        try:
            self.browser = self.createBrowser([Config.USER_AGENT])
            self.browser.set_handle_robots(False)
            # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat')
            # exit(1)
            data = self.browser.open(self.url, None, 60).read()
            if data is not None:
                soup = BeautifulSoup(data)
                self.statuses = self.populateDropDownValues(
                    soup, 'ddlstatus', '0')
                self.categories = self.populateDropDownValues(
                    soup, 'ddlcategory', '-All Category-')
                self.years = self.populateDropDownValues(
                    soup, 'ddlyear', '-All Years-')
                self.states = self.populateDropDownValues(
                    soup, 'ddlstate', '-All State-')

                for status in self.statuses:
                    self.scrapDataByStatus(status[0], status[1])
        except Exception, x:
            print x
            self.logger.error(x)
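# A rough sketch: populateDropDownValues() is called above but not shown in
# this listing. Judging from the call sites it takes the parsed page (a
# BeautifulSoup object), part of a <select> element's id (e.g. 'ddlstatus') and
# a default entry to skip, and returns (value, label) pairs for the remaining
# <option> tags. The function below is one plausible version; matching the id
# with a regex is an assumption, since ASP.NET pages usually prefix control ids.
import re


def populateDropDownValues(soup, dropDownId, skipEntry):
    options = []
    select = soup.find('select', id=re.compile(dropDownId))
    if select is not None:
        for option in select.find_all('option'):
            value = option.get('value', '').strip()
            label = option.text.strip()
            # Skip the placeholder entry, whether it matches by value or by label.
            if value and value != skipEntry and label != skipEntry:
                options.append((value, label))
    return options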
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None
        self.mycookie = None

    def login(self, url, loginInfo, retry=0, proxy=None):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        conn = ('Connection', 'keep-alive')
        ac = (
            'Accept',
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ln = ('Accept-Language', 'en-us,en;q=0.5')
        if proxy is None:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln],
                                            self.createCookieJarHandler())
        else:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln],
                                            self.createCookieJarHandler(),
                                            proxy)
        urllib2.install_opener(self.opener)
        try:
            response = self.opener.open(url, urllib.urlencode(loginInfo))
            print 'Response from Server:'
            print 'Status: ', response.getcode()
            print response.info()
            self.logger.debug('Response from Server:')
            self.logger.debug('Status: ' + str(response.getcode()))
            self.logger.debug(response.info())
            redirected_url = response.url
            return redirected_url, response.read()
        except Exception, x:
            print x
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                print 'Retrying. Please wait 5 seconds...'
                time.sleep(5)
                return self.login(url, loginInfo, retry + 1, proxy)
            else:
                print 'Failed to retrieve data after %d retries!' % config.RETRY_COUNT
        return None, None
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None

    def login(self, url, loginInfo, retry=0):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        self.opener = self.createOpener([config.USER_AGENT], self.createCookieJarHandler())
        urllib2.install_opener(self.opener)
        try:
            return self.opener.open(url, urllib.urlencode(loginInfo)).read()
        except Exception, x:
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                return self.login(url, loginInfo, retry + 1)
        return None
# Example 21
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None

    def login(self, url, loginInfo, retry=0):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        self.opener = self.createOpener([config.USER_AGENT],
                                        self.createCookieJarHandler())
        urllib2.install_opener(self.opener)
        try:
            return self.opener.open(url, urllib.urlencode(loginInfo)).read()
        except Exception, x:
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                return self.login(url, loginInfo, retry + 1)
        return None
# Example 22
class Spider:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.opener = None
        self.mycookie = None

    def login(self, url, loginInfo, retry=0, proxy=None):
        """
        Login request for user
        url = '' Ex. http://www.example.com/login
        loginInfo = {} Ex. {'user': '******', 'pass': '******'}
        """
        conn = ('Connection', 'keep-alive')
        ac = ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ln = ('Accept-Language', 'en-us,en;q=0.5')
        if proxy is None:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler())
        else:
            self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy)
        urllib2.install_opener(self.opener)
        try:
            response = self.opener.open(url, urllib.urlencode(loginInfo))
            print 'Response from Server:'
            print 'Status: ', response.getcode()
            print response.info()
            self.logger.debug('Response from Server:')
            self.logger.debug('Status: ' + str(response.getcode()))
            self.logger.debug(response.info())
            redirected_url = response.url
            return redirected_url, response.read()
        except Exception, x:
            print x
            self.logger.error(x.message)
            if retry < config.RETRY_COUNT:
                print 'Retrying. Please wait 5 seconds...'
                time.sleep(5)
                return self.login(url, loginInfo, retry + 1, proxy)
            else:
                print 'Failed to retrieve data after %d retries!' % config.RETRY_COUNT
        return None, None
class Browser:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.browser = None
        self.browserCookieJar = None

    def browserLogin(self,
                     url,
                     loginParams,
                     formId=None,
                     saveCookie=False,
                     retry=0):
        """
        Login page just like web browser
        url = '' Ex. http://www.example.com
        loginParams = {} Ex. {'user': '******', 'pass': '******'}
        """

        try:
            self.browser = self.createBrowser([config.USER_AGENT])
            self.browser.open(url, timeout=config.TIMEOUT)
            if formId is not None:
                self.browser.select_form(predicate=lambda f: 'id' in f.attrs
                                         and f.attrs['id'] == formId)
            else:
                self.browser.select_form(nr=0)
            for key in loginParams:
                self.browser.form[key] = loginParams[key]
            self.browser.submit()
            if saveCookie:
                self.browserCookieJar.save(config.COOKIE_FILE)
            return self.browser.response().read()
        except Exception, x:
            self.logger.error(x)
            if retry < config.RETRY_COUNT:
                return self.browserLogin(url, loginParams, formId, saveCookie,
                                         retry + 1)
        return None
class WebTableScrapper(object):
    def __init__(self):
        self.browser = None
        self.url = "http://environmentclearance.nic.in/Search.aspx"
        self.statuses = []
        self.categories = []
        self.years = []
        self.states = []
        self.csvDataHeader = ['Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location',
                              'Important Date', 'Category', 'Company Proponent']
        self.logger = LogManager(__name__)
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv')
        self.csvWriter = Csv('env_clearance.csv')
        if self.csvDataHeader not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(self.csvDataHeader)
            self.dupCsvRows.append(self.csvDataHeader)

    def scrapData(self):
        try:
            self.browser = self.createBrowser([Config.USER_AGENT])
            self.browser.set_handle_robots(False)
            # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat')
            # exit(1)
            data = self.browser.open(self.url, None, 60).read()
            if data is not None:
                soup = BeautifulSoup(data)
                self.statuses = self.populateDropDownValues(soup, 'ddlstatus', '0')
                self.categories = self.populateDropDownValues(soup, 'ddlcategory', '-All Category-')
                self.years = self.populateDropDownValues(soup, 'ddlyear', '-All Years-')
                self.states = self.populateDropDownValues(soup, 'ddlstate', '-All State-')

                for status in self.statuses:
                    self.scrapDataByStatus(status[0], status[1])
        except Exception, x:
            print x
            self.logger.error(x)
# Example 25
class GoogleFinanceScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'https://www.google.com/finance?'
        self.main_url = 'https://www.google.com'
        self.csvWriter = Csv('google_finance.csv')
        csvDataHeader = [
            'Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue',
            'Date of Scrape'
        ]
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                params = urllib.urlencode({'q': line})
                url = self.url + params
                self.scrapBykeyword(url, line)
        except Exception, x:
            print x
            self.logger.error('Error: ' + x.message)
# Example 26
class Scrapper(QThread):
    notifyScrapper = pyqtSignal(object)
    isFinished = False

    def __init__(self, urllist):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        print urllist
        self.urllist = urllist
        self.csv = Csv('scrapper.csv')

    def run(self):
        self.scrapData()
        self.notifyScrapper.emit(
            '<font color=green><b>------------------ Finish! ------------------------- </b></font>')

    def scrapData(self):
        try:
            total = 0
            csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements', 'Category',
                         'Size', 'Color', 'Design']
            self.csv.writeCsvRow(csvHeader)
            if self.isFinished: return
            for url in self.urllist:
                if len(url) > 0:
                    url = self.regex.replaceData('(?i)\r', '', url)
                    url = self.regex.replaceData('(?i)\n', '', url)
                    url = self.regex.getSearchedData('(?i)(http.*?)$', url)
                    print 'URL: ', url
                    self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
                    data = self.spider.fetchData(url)
                    if data and len(data) > 0:
                        data = self.regex.reduceNewLine(data)
                        data = self.regex.reduceBlankSpace(data)
                        soup = BeautifulSoup(data)
                        soup.prettify()
                        title = ''
                        price = ''
                        size = ''
                        brand = ''
                        features = ''
                        material = ''
                        measurements = ''
                        category = ''
                        color = ''
                        design = ''
                        if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
                            title = soup.find('span', id='vi-lkhdr-itmTitl').text
                        if soup.find('span', id='prcIsum'):
                            price = soup.find('span', id='prcIsum').text
                        if soup.find('div', class_='itemAttr'):
                            specchunk = soup.find('div', class_='itemAttr')
                            rows = specchunk.find_all('tr')
                            for row in rows:
                                cols = row.find_all('td')
                                for i in range(0, len(cols), 2):
                                    # if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
                                    #     conditionChunk = cols[i + 1]
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
                                    #     condition = BeautifulSoup(conditionChunk).text
                                    #     print condition
                                    if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
                                        brand = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
                                        features = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
                                        material = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
                                        measurements = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
                                        category = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
                                        color = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
                                        design = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
                                        size = cols[i + 1].text
                        self.notifyScrapper.emit('<font color=black><b>Writing data to csv file.</b></font>')
                        csvData = [url, title, price, brand, features, material, measurements, category, size, color, design]
                        self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
                        self.csv.writeCsvRow(csvData)
                        self.notifyScrapper.emit('<font color=black><b>Successfully wrote data to csv file.</b></font>')
                        total += 1
                        self.notifyScrapper.emit('<font color=green><b>Total data scraped: [%s]</b></font>' % str(total))
        except Exception, x:
            self.notifyScrapper.emit('<font color=red><b>Error scraping category: %s</b></font>' % x.message)
            self.logger.error(x.message)
            print x
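# A rough usage sketch for the Scrapper thread above, assuming PyQt4 (the
# listing does not show its imports) and a running Qt event loop. The URL list
# and the print slot are illustrative placeholders.
import sys
from PyQt4.QtCore import QCoreApplication

app = QCoreApplication(sys.argv)
scrapper = Scrapper(['http://www.example.com/itm/12345'])
scrapper.notifyScrapper.connect(lambda msg: sys.stdout.write(unicode(msg) + '\n'))
scrapper.finished.connect(app.quit)   # quit the event loop when run() returns
scrapper.start()                      # executes Scrapper.run() in a worker thread
app.exec_()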