class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None self.mycookie = None def login(self, url, loginInfo, retry=0, proxy=None): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ conn = ('Connection', 'keep-alive') ac = ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') ln = ('Accept-Language', 'en-us,en;q=0.5') if proxy is None: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler()) else: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy) urllib2.install_opener(self.opener) try: return self.opener.open(url, urllib.urlencode(loginInfo)).read() except Exception, x: print x.message self.logger.error(x.message) if retry < config.RETRY_COUNT: self.login(url, loginInfo, retry + 1) return None
class Browser: def __init__(self): self.logger = LogManager(__name__) self.browser = None self.browserCookieJar = None def browserLogin(self, url, loginParams, formId=None, saveCookie=False, retry=0): """ Login page just like web browser url = '' Ex. http://www.example.com loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ try: self.browser = self.createBrowser([config.USER_AGENT]) self.browser.open(url, timeout=config.TIMEOUT) if formId is not None: self.browser.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == formId) else: self.browser.select_form(nr=0) for key in loginParams: self.browser.form[key] = loginParams[key] self.browser.submit() if saveCookie: self.browserCookieJar.save(config.COOKIE_FILE) return self.browser.response().read() except Exception, x: self.logger.error(x) if retry < config.RETRY_COUNT: self.browserLogin(url, loginParams, formId, saveCookie, retry + 1) return None
class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None self.mycookie = None def login(self, url, loginInfo, retry=0, proxy=None): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ conn = ('Connection', 'keep-alive') ac = ( 'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') ln = ('Accept-Language', 'en-us,en;q=0.5') if proxy is None: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler()) else: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy) urllib2.install_opener(self.opener) try: return self.opener.open(url, urllib.urlencode(loginInfo)).read() except Exception, x: print x.message self.logger.error(x.message) if retry < config.RETRY_COUNT: self.login(url, loginInfo, retry + 1) return None
class GoogleFinanceScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'https://www.google.com/finance?' self.main_url = 'https://www.google.com' self.csvWriter = Csv('google_finance.csv') csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape'] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() params = urllib.urlencode({'q': line}) url = self.url + params self.scrapBykeyword(url, line) except Exception, x: print x self.logger.error('Error: ' + x.message)
class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None self.mycookie = None def login(self, url, loginInfo, retry=0): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ host = ("Host", "www.vizury.com") conn = ("Connection", "keep-alive") enc = ("Accept-Encoding", "gzip, deflate") ac = ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") ln = ("Accept-Language", "en-us,en;q=0.5") self.opener = self.createOpener([config.USER_AGENT, conn, enc, ac, ln, host], self.createCookieJarHandler()) urllib2.install_opener(self.opener) try: return self.opener.open(url, urllib.urlencode(loginInfo)).read() except Exception, x: self.logger.error(x.message) if retry < config.RETRY_COUNT: self.login(url, loginInfo, retry + 1) return None
class AmazonScrapper(QThread): notifyAmazon = pyqtSignal(object) def __init__(self, urlList, category): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.urlList = urlList self.category = category dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv') self.csvWriter = Csv(category + '.csv') csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL'] if csvDataHeader not in self.dupCsvRows: self.dupCsvRows.append(csvDataHeader) self.csvWriter.writeCsvRow(csvDataHeader) self.mainUrl = 'http://www.amazon.com' self.scrapUrl = None self.dbHelper = DbHelper('amazon.db') self.dbHelper.createTable(category) self.total = self.dbHelper.getTotalProduct(category) def run(self, retry=0): try: # self.scrapProductDetail( # 'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544') # return if self.urlList is not None and len(self.urlList): for url in self.urlList: if len(url) > 0: url = self.regex.replaceData('(?i)\r', '', url) url = self.regex.replaceData('(?i)\n', '', url) self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url) imUrl = None retry = 0 while imUrl is None and retry < 4: imUrl = self.reformatUrl(url) retry += 1 if imUrl is None: imUrl = url self.total = 0 print 'URL: ' + str(imUrl) sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority', 'date-desc-rank'] for sort in sortList: self.scrapReformatData(imUrl, sort) self.notifyAmazon.emit( '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url) self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>') except Exception, x: print x.message self.logger.error('Exception at run: ', x.message) if retry < 5: self.run(retry + 1)
class NisbetProduct(QtCore.QThread): scrapProductData = QtCore.pyqtSignal(object) stopThread = QtCore.pyqtSignal(int) def __init__(self): QtCore.QThread.__init__(self) self.isExiting = False self.totalProducts = 0 self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0) self.csvWriter = Csv('nisbets.csv') self.mainUrl = 'http://www.nisbets.co.uk' csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand', 'Product Price', 'Product Short Description', 'Product Long Description', 'Image File Name', 'User Manual File Name', 'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status' 'Category1', 'Category2', 'Category3', 'Category4'] if 'URL' not in self.dupCsvRows: self.csvWriter.writeCsvRow(csvHeaderList) self.dupCsvRows.append(csvHeaderList[0]) self.utils = Utils() def run(self): self.scrapData() def stop(self): self.isExiting = True def scrapData(self): if self.isExiting: return self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl) self.logger.debug('===== URL [' + self.mainUrl + '] =====') data = self.spider.fetchData(self.mainUrl) if data and len(str(data).strip()) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data) if category1Chunk and len(str(category1Chunk).strip()) > 0: i = 0 for category1Data in category1Chunk: category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data) category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>', category1Data) if category2Chunk and len(str(category2Chunk).strip()) > 0: for category2Data in category2Chunk: try: self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1]) except Exception, x: self.logger.error(x) 
self.scrapProductData.emit('<font color=red><b>Finish Scraping Product data from %s</b></font>' % self.mainUrl)
class Csv: def __init__(self, fileName=None): self.logger = LogManager(__name__) if fileName is not None: self.writer = UnicodeWriter(open(fileName, 'wb'), quoting=csv.QUOTE_ALL) def writeCsvRow(self, data): try: self.writer.writerow(data) except Exception, x: self.logger.error(x)
class Csv: def __init__(self, fileName=None): self.logger = LogManager(__name__) if fileName is not None: self.writer = csv.writer(open(fileName, 'ab')) def writeCsvRow(self, data): try: self.writer.writerow(data) except Exception, x: self.logger.error(x)
class Csv: def __init__(self, fileName=None): self.logger = LogManager(__name__) if fileName is not None: self.file = open(fileName, 'wb') self.writer = UnicodeWriter(self.file, quoting=csv.QUOTE_ALL) def writeCsvRow(self, data): try: self.writer.writerow(data) except Exception, x: self.logger.error(x)
class Utils: def __init__(self): self.logger = LogManager(__name__) def downloadFile(self, url, savePath): try: directory = os.path.dirname(savePath) if not os.path.exists(directory): os.makedirs(directory) webFile = urllib2.urlopen(url) localFile = open(savePath, 'wb') localFile.write(webFile.read()) except Exception, x: self.logger.error(x)
class SaraivaScrapper(QThread): notifySaraiva = pyqtSignal(object) def __init__(self, urlList, category, htmlTag, replaceTag): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.urlList = urlList self.category = category self.htmlTag = self.regex.replaceData('\r+', '', htmlTag) self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag) self.htmlTag = self.regex.replaceData('\s+', ' ', self.htmlTag) self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag) self.replaceTag = replaceTag self.csvWriter = Csv(category + '.csv') csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture'] self.csvWriter.writeCsvRow(csvDataHeader) self.mainUrl = 'http://busca.livrariasaraiva.com.br' self.scrapUrl = None self.dbHelper = DbHelper('saraiva.db') self.dbHelper.createTable(category) self.total = self.dbHelper.getTotalProduct(category) def run(self, retry=0): try: if self.urlList is not None and len(self.urlList): for url in self.urlList: if len(url) > 0: url = self.regex.replaceData('(?i)\r', '', url) url = self.regex.replaceData('(?i)\n', '', url) self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url) paginationUrl, self.maxRecords = self.reformatUrl(url) self.notifySaraiva.emit( '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords)) print 'Max records: ', self.maxRecords print 'URL: ' + str(paginationUrl) sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev', '&isort=price+rev', '&isort=price', '&isort=date+rev'] for sort in sortList: self.scrapResults(paginationUrl, sort) self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>') except Exception, x: print x.message self.logger.error('Exception at run: ', x.message) if retry < 5: self.run(retry + 1)
class PaodeacucarScrapper(QThread):
    """Qt worker that walks paodeacucar.com.br category menus and scrapes items."""
    notifyPaode = pyqtSignal(object)  # emits HTML status strings to the UI

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.paodeacucar.com.br/'
        self.url = 'http://www.paodeacucar.com.br/'
        dupCsvReader = Csv()
        # Column 4 (URL) of already-scraped rows, used for de-duplication.
        self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4)
        self.csvWriter = Csv('paodeacucar.csv')
        csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
                         'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
        if 'URL' not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()

    def scrapData(self):
        """Fetch the home page, walk each category's submenu and scrape every
        subcategory via scrapItems(). Errors are logged and swallowed."""
        try:
            print 'Main URL: ', self.url
            self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url))
            data = self.spider.fetchData(self.url)
            if data and len(data) > 0:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                soup = BeautifulSoup(data)
                # Top-level nav entries; class regex tolerates extra whitespace.
                categories = soup.find('nav', class_='items-wrapper').find_all('li',
                                                                               class_=re.compile('\s*item\s*'))
                print 'Total Categories: ', len(categories)
                self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories))))
                for category in categories:
                    if category.a is not None:
                        # data-target holds '#<submenu id>'; strip the '#' to look it up.
                        submenu_target = self.regex.replaceData('#', '', category.a.get('data-target'))
                        sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item')
                        print 'Total Sub Categories: ', len(sub_categories)
                        self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>'
                                               % str(len(sub_categories))))
                        for sub_category in sub_categories:
                            sub_category_label = sub_category.find('span', class_='label').text
                            sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A'
                            self.scrapItems(sub_category_url, category.text, sub_category_label)
        except Exception, x:
            self.logger.error(x.message)
            print x
class CsTest(QThread):
    """Qt worker that scrapes brand listings from cs-catering-equipment.co.uk."""
    notifyProduct = pyqtSignal(object)  # emits HTML status strings to the UI

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        # Columns 0 (URL) and 1 (Product Code) of already-scraped rows,
        # used for de-duplication.
        self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0)
        self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1)
        self.csvWriter = Csv('cs_product.csv')
        # self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/brands'
        self.utils = Utils()
        # Write the header row only on a fresh CSV.
        if 'Product Code' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(
                ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
                 'Product Short Description', 'Product Long Description', 'Product Technical Specifications',
                 'Warranty', 'Delivery', 'Product Image', 'Category 1', 'Category 2', 'Category 3', 'Category 4',
                 'Brand Image'])
        self.totalProducts = len(self.dupCsvRows)

    def run(self):
        self.scrapBrands()
        self.notifyProduct.emit('<font color=red><b>Finished Scraping All Brands.</b></font>')

    def scrapBrands(self):
        """Fetch the brands page and scrape each brand link found in it."""
        self.notifyProduct.emit('<font color=green><b>Main URL: %s<b></font>' % self.mainUrl)
        self.notifyProduct.emit('<b>Try To scrap All Brands.<b>')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            # Each chunk is one alphabetical group of brand links.
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>',
                                                        data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyProduct.emit('<b>Total Brands Found: %s<b>' % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                # brand = (href, link text)
                                self.scrapBrandInfo(brand[0], 'Shop By Brand', brand[1])
                            except Exception, x:
                                self.logger.error(x)
class CsBrands(QThread):
    """Qt worker that scrapes brand categories from cs-catering-equipment.co.uk."""
    notifyBrand = pyqtSignal(object)  # emits HTML status strings to the UI

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        dupCsvReader = Csv()
        # Already-written rows, used for de-duplication.
        self.dupCsvRows = dupCsvReader.readCsvRow("cs_Brands.csv")
        self.csvWriter = Csv("cs_Brands.csv")
        self.mainUrl = "http://www.cs-catering-equipment.co.uk/brands"
        self.isExiting = False
        headerData = [
            "URL",
            "Parent Category",
            "Brand Category",
            "Brand Description",
            "Image File",
            "Product Codes in this category",
        ]
        if headerData not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(headerData)

    def run(self):
        self.scrapBrands()
        self.notifyBrand.emit("<font color=red><b>Finished Scraping All Brands.</b></font>")

    def scrapBrands(self):
        """Fetch the brands page and scrape each brand link found in it."""
        self.notifyBrand.emit("<font color=green><b>Main URL: %s<b></font>" % self.mainUrl)
        self.notifyBrand.emit("<b>Try To scrap All Brands.<b>")
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            # Each chunk is one alphabetical group of brand links.
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>',
                                                        data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyBrand.emit("<b>Total Brands Found: %s<b>" % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                # brand = (href, link text)
                                self.scrapBrandInfo(brand[0], "Shop By Brand", brand[1])
                            except Exception, x:
                                self.logger.error(x)
class WebTableScrapper(object):
    """Scrapes environmental-clearance search results into env_clearance.csv."""

    def __init__(self):
        self.browser = None
        self.url = "http://environmentclearance.nic.in/Search.aspx"
        # Drop-down option lists, populated from the search page in scrapData().
        self.statuses = []
        self.categories = []
        self.years = []
        self.states = []
        self.csvDataHeader = [
            'Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location',
            'Important Date', 'Category', 'Company Proponent'
        ]
        self.logger = LogManager(__name__)
        self.regex = Regex()
        dupCsvReader = Csv()
        # Already-written rows, used for de-duplication.
        self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv')
        self.csvWriter = Csv('env_clearance.csv')
        if self.csvDataHeader not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(self.csvDataHeader)
            self.dupCsvRows.append(self.csvDataHeader)

    def scrapData(self):
        """Open the search page, read the filter drop-downs, then scrape per status."""
        try:
            self.browser = self.createBrowser([Config.USER_AGENT])
            self.browser.set_handle_robots(False)
            # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat')
            # exit(1)
            data = self.browser.open(self.url, None, 60).read()
            if data is not None:
                soup = BeautifulSoup(data)
                # Second string argument is the placeholder option to skip.
                self.statuses = self.populateDropDownValues(
                    soup, 'ddlstatus', '0')
                self.categories = self.populateDropDownValues(
                    soup, 'ddlcategory', '-All Category-')
                self.years = self.populateDropDownValues(
                    soup, 'ddlyear', '-All Years-')
                self.states = self.populateDropDownValues(
                    soup, 'ddlstate', '-All State-')
                for status in self.statuses:
                    # status = (value, label)
                    self.scrapDataByStatus(status[0], status[1])
        except Exception, x:
            print x
            self.logger.error(x)
class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None self.mycookie = None def login(self, url, loginInfo, retry=0, proxy=None): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ conn = ('Connection', 'keep-alive') ac = ( 'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') ln = ('Accept-Language', 'en-us,en;q=0.5') if proxy is None: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler()) else: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy) urllib2.install_opener(self.opener) try: response = self.opener.open(url, urllib.urlencode(loginInfo)) print 'Response from Server:' print 'Status: ', response.getcode() print response.info() self.logger.debug('Response from Server:') self.logger.debug('Status: ' + str(response.getcode())) self.logger.debug(response.info()) redirected_url = response.url return redirected_url, response.read() except Exception, x: print x self.logger.error(x.message) if retry < config.RETRY_COUNT: print 'Retry again. Please wait 5 seconds...' time.sleep(5) self.login(url, loginInfo, retry + 1) else: print 'Failed to retrieve data after maximum %d retry!' % config.RETRY_COUNT return None, None
class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None def login(self, url, loginInfo, retry=0): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ self.opener = self.createOpener([config.USER_AGENT], self.createCookieJarHandler()) urllib2.install_opener(self.opener) try: return self.opener.open(url, urllib.urlencode(loginInfo)).read() except Exception, x: self.logger.error(x.message) if retry < config.RETRY_COUNT: self.login(url, loginInfo, retry + 1) return None
class Spider: def __init__(self): self.logger = LogManager(__name__) self.opener = None self.mycookie = None def login(self, url, loginInfo, retry=0, proxy=None): """ Login request for user url = '' Ex. http://www.example.com/login loginInfo = {} Ex. {'user': '******', 'pass': '******'} """ conn = ('Connection', 'keep-alive') ac = ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') ln = ('Accept-Language', 'en-us,en;q=0.5') if proxy is None: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler()) else: self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy) urllib2.install_opener(self.opener) try: response = self.opener.open(url, urllib.urlencode(loginInfo)) print 'Response from Server:' print 'Status: ', response.getcode() print response.info() self.logger.debug('Response from Server:') self.logger.debug('Status: ' + str(response.getcode())) self.logger.debug(response.info()) redirected_url = response.url return redirected_url, response.read() except Exception, x: print x self.logger.error(x.message) if retry < config.RETRY_COUNT: print 'Retry again. Please wait 5 seconds...' time.sleep(5) self.login(url, loginInfo, retry + 1) else: print 'Failed to retrieve data after maximum %d retry!' % config.RETRY_COUNT return None, None
class WebTableScrapper(object):
    """Scrapes environmental-clearance search results into env_clearance.csv."""

    def __init__(self):
        self.browser = None
        self.url = "http://environmentclearance.nic.in/Search.aspx"
        # Drop-down option lists, populated from the search page in scrapData().
        self.statuses = []
        self.categories = []
        self.years = []
        self.states = []
        self.csvDataHeader = ['Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location',
                              'Important Date', 'Category', 'Company Proponent']
        self.logger = LogManager(__name__)
        self.regex = Regex()
        dupCsvReader = Csv()
        # Already-written rows, used for de-duplication.
        self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv')
        self.csvWriter = Csv('env_clearance.csv')
        if self.csvDataHeader not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(self.csvDataHeader)
            self.dupCsvRows.append(self.csvDataHeader)

    def scrapData(self):
        """Open the search page, read the filter drop-downs, then scrape per status."""
        try:
            self.browser = self.createBrowser([Config.USER_AGENT])
            self.browser.set_handle_robots(False)
            # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat')
            # exit(1)
            data = self.browser.open(self.url, None, 60).read()
            if data is not None:
                soup = BeautifulSoup(data)
                # Second string argument is the placeholder option to skip.
                self.statuses = self.populateDropDownValues(soup, 'ddlstatus', '0')
                self.categories = self.populateDropDownValues(soup, 'ddlcategory', '-All Category-')
                self.years = self.populateDropDownValues(soup, 'ddlyear', '-All Years-')
                self.states = self.populateDropDownValues(soup, 'ddlstate', '-All State-')
                for status in self.statuses:
                    # status = (value, label)
                    self.scrapDataByStatus(status[0], status[1])
        except Exception, x:
            print x
            self.logger.error(x)
class GoogleFinanceScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'https://www.google.com/finance?' self.main_url = 'https://www.google.com' self.csvWriter = Csv('google_finance.csv') csvDataHeader = [ 'Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape' ] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() params = urllib.urlencode({'q': line}) url = self.url + params self.scrapBykeyword(url, line) except Exception, x: print x self.logger.error('Error: ' + x.message)
class Scrapper(QThread):
    """Qt worker that scrapes item pages listed in urllist into scrapper.csv."""
    notifyScrapper = pyqtSignal(object)  # emits HTML status strings to the UI
    isFinished = False  # set externally to abort before the URL loop starts

    def __init__(self, urllist):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        print urllist
        self.urllist = urllist
        self.csv = Csv('scrapper.csv')

    def run(self):
        self.scrapData()
        self.notifyScrapper.emit(
            '<font color=green><b>------------------ Finish! ------------------------- </b></font>')

    def scrapData(self):
        """Fetch each URL, pull item attributes from the page's spec table and
        append one CSV row per item. Any exception aborts the loop and is logged."""
        try:
            total = 0
            csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements', 'Category',
                         'Size', 'Color', 'Design']
            self.csv.writeCsvRow(csvHeader)
            if self.isFinished: return
            for url in self.urllist:
                if len(url) > 0:
                    url = self.regex.replaceData('(?i)\r', '', url)
                    url = self.regex.replaceData('(?i)\n', '', url)
                    url = self.regex.getSearchedData('(?i)(http.*?)$', url)
                    print 'URL: ', url
                    self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
                    data = self.spider.fetchData(url)
                    if data and len(data) > 0:
                        data = self.regex.reduceNewLine(data)
                        data = self.regex.reduceBlankSpace(data)
                        soup = BeautifulSoup(data)
                        soup.prettify()
                        title = ''
                        price = ''
                        size = ''
                        brand = ''
                        features = ''
                        material = ''
                        measurements = ''
                        category = ''
                        color = ''
                        design = ''
                        # NOTE(review): the element ids below look like eBay item
                        # pages — confirm against the actual target site.
                        if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
                            title = soup.find('span', id='vi-lkhdr-itmTitl').text
                        if soup.find('span', id='prcIsum'):
                            price = soup.find('span', id='prcIsum').text
                        if soup.find('div', class_='itemAttr'):
                            # Spec table: attribute name and value alternate in
                            # the cells, hence the step-2 index loop below.
                            specchunk = soup.find('div', class_='itemAttr')
                            rows = specchunk.find_all('tr')
                            for row in rows:
                                cols = row.find_all('td')
                                for i in range(0, len(cols), 2):
                                    # if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
                                    #     conditionChunk = cols[i + 1]
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
                                    #     condition = BeautifulSoup(conditionChunk).text
                                    #     print condition
                                    if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
                                        brand = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
                                        features = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
                                        material = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
                                        measurements = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
                                        category = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
                                        color = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
                                        design = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
                                        size = cols[i + 1].text
                        self.notifyScrapper.emit('<font color=black><b>Writting data to csv file.</b></font>')
                        csvData = [url, title, price, brand, features, material, measurements, category, size,
                                   color, design]
                        self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
                        self.csv.writeCsvRow(csvData)
                        self.notifyScrapper.emit('<font color=black><b>Successfully Written data to csv file.</b></font>')
                        total += 1
                        self.notifyScrapper.emit('<font color=green><b>Total Data scrapped: [%s]</b></font>' % str(total))
        except Exception, x:
            self.notifyScrapper.emit('<font color=red><b>Error scrapping category: %s</b></font>' % x.message)
            self.logger.error(x.message)
            print x