class TopsyScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'http://topsy.com/s?' self.csvWriter = Csv('topsy.csv') csvDataHeader = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape'] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() if len(line) > 0: params = urllib.urlencode({'q': line, 'window': 'm', 'type': 'tweet'}) url = self.url + params self.scrapBrowserData(url, line) except Exception, x: print x
def __init__(self):
    """Set up spider/regex helpers and the nisbets.csv writer with its header.

    Writes the header row only when 'URL' is not already among the
    previously-scraped rows (i.e. the CSV is new).
    """
    QtCore.QThread.__init__(self)
    self.isExiting = False
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    dupCsvReader = Csv()
    self.dupCsvRows = dupCsvReader.readCsvRow("nisbets.csv", 0)
    self.csvWriter = Csv("nisbets.csv")
    self.mainUrl = "http://www.nisbets.co.uk"
    # BUG FIX: a missing comma after "Product Status" implicitly concatenated
    # it with "Category1" into a single header cell ("Product StatusCategory1").
    csvHeaderList = ["URL", "Product Code", "Product Technical Specifications", "Product Name", "Brand",
                     "Product Price", "Product Short Description", "Product Long Description",
                     "Image File Name", "User Manual File Name", "Exploded View File Name", "Spares Code",
                     "Accessories", "Product Status", "Category1", "Category2", "Category3", "Category4"]
    if "URL" not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(csvHeaderList)
        self.dupCsvRows.append(csvHeaderList[0])
    self.utils = Utils()
class GoogleFinanceScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'https://www.google.com/finance?' self.main_url = 'https://www.google.com' self.csvWriter = Csv('google_finance.csv') csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape'] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() params = urllib.urlencode({'q': line}) url = self.url + params self.scrapBykeyword(url, line) except Exception, x: print x self.logger.error('Error: ' + x.message)
class OmvicScrapper: isFinished = False im_data = [] def __init__(self): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.initScrapper() def initScrapper(self): try: dupCsvReader = Csv() dupCsvRows = dupCsvReader.readCsvRow('omvic.csv') self.dbHelper = DbHelper('omvic.db') self.dbHelper.createTable('omvic') self.totaldata = self.dbHelper.getTotalProduct('omvic') self.csvWriter = Csv('omvic.csv') csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status', 'Class of Registration', 'Subclass', 'Operating Status', 'Business Address', 'Email', 'Phone Number', 'Salesperson(s) Names'] if len(dupCsvRows) == 0: self.csvWriter.writeCsvRow(csvDataHeader) del dupCsvReader del dupCsvRows gc.collect() del gc.garbage[:] gc.collect() except Exception, x: print x
def __init__(self, parent=None):
    """Build the GUI and load already-scraped LinkedIn members for dup checks."""
    super(Form, self).__init__(parent)
    self.createGui()
    self.memberDic = {}
    self.excludedMember = None
    reader = Csv()
    # Column 0 of linkedIn.csv identifies members we already have.
    self.dupCsvRows = reader.readCsvRow('linkedIn.csv', 0)
    self.csvWriter = Csv('linkedIn.csv')
    self.allMembers = []
def __init__(self, parent=None):
    """Initialise the form, member bookkeeping and the linkedIn.csv writer."""
    super(Form, self).__init__(parent)
    self.createGui()
    self.memberDic = {}
    self.excludedMember = None
    dupReader = Csv()
    # Previously-scraped members, keyed on column 0, for duplicate filtering.
    self.dupCsvRows = dupReader.readCsvRow("linkedIn.csv", 0)
    self.csvWriter = Csv("linkedIn.csv")
    self.allMembers = []
class AmazonScrapper(QThread): notifyAmazon = pyqtSignal(object) def __init__(self, urlList, category): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.urlList = urlList self.category = category dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv') self.csvWriter = Csv(category + '.csv') csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL'] if csvDataHeader not in self.dupCsvRows: self.dupCsvRows.append(csvDataHeader) self.csvWriter.writeCsvRow(csvDataHeader) self.mainUrl = 'http://www.amazon.com' self.scrapUrl = None self.dbHelper = DbHelper('amazon.db') self.dbHelper.createTable(category) self.total = self.dbHelper.getTotalProduct(category) def run(self, retry=0): try: # self.scrapProductDetail( # 'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544') # return if self.urlList is not None and len(self.urlList): for url in self.urlList: if len(url) > 0: url = self.regex.replaceData('(?i)\r', '', url) url = self.regex.replaceData('(?i)\n', '', url) self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url) imUrl = None retry = 0 while imUrl is None and retry < 4: imUrl = self.reformatUrl(url) retry += 1 if imUrl is None: imUrl = url self.total = 0 print 'URL: ' + str(imUrl) sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority', 'date-desc-rank'] for sort in sortList: self.scrapReformatData(imUrl, sort) self.notifyAmazon.emit( '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url) self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>') except Exception, x: print x.message self.logger.error('Exception at run: ', x.message) if retry < 5: self.run(retry + 1)
class NisbetProduct(QtCore.QThread): scrapProductData = QtCore.pyqtSignal(object) stopThread = QtCore.pyqtSignal(int) def __init__(self): QtCore.QThread.__init__(self) self.isExiting = False self.totalProducts = 0 self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0) self.csvWriter = Csv('nisbets.csv') self.mainUrl = 'http://www.nisbets.co.uk' csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand', 'Product Price', 'Product Short Description', 'Product Long Description', 'Image File Name', 'User Manual File Name', 'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status' 'Category1', 'Category2', 'Category3', 'Category4'] if 'URL' not in self.dupCsvRows: self.csvWriter.writeCsvRow(csvHeaderList) self.dupCsvRows.append(csvHeaderList[0]) self.utils = Utils() def run(self): self.scrapData() def stop(self): self.isExiting = True def scrapData(self): if self.isExiting: return self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl) self.logger.debug('===== URL [' + self.mainUrl + '] =====') data = self.spider.fetchData(self.mainUrl) if data and len(str(data).strip()) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data) if category1Chunk and len(str(category1Chunk).strip()) > 0: i = 0 for category1Data in category1Chunk: category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data) category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>', category1Data) if category2Chunk and len(str(category2Chunk).strip()) > 0: for category2Data in category2Chunk: try: self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1]) except Exception, x: self.logger.error(x) 
self.scrapProductData.emit('<font color=red><b>Finish Scraping Product data from %s</b></font>' % self.mainUrl)
def __init__(self, filename):
    """Wire up helpers and open topsy.csv with its header row."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.filename = filename  # keyword list file, one keyword per line
    self.url = 'http://topsy.com/s?'
    self.csvWriter = Csv('topsy.csv')
    header = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape']
    self.csvWriter.writeCsvRow(header)
def __init__(self):
    """Prepare the category scraper and write the nisbetCat.csv header once."""
    QtCore.QThread.__init__(self)
    self.isExiting = False
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    reader = Csv()
    self.dupCsvRows = reader.readCsvRow('nisbetCat.csv')
    self.csvWriter = Csv('nisbetCat.csv')
    self.mainUrl = 'http://www.nisbets.co.uk'
    header = ['Parent Category', 'Category Name', 'Category Description']
    # Write the header only if it is not already among the existing rows.
    if header not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(header)
        self.dupCsvRows.append(header)
def __init__(self, filename):
    """Keep the ticker-symbol file and write the google_finance.csv header."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.filename = filename  # file of ticker symbols, one per line
    self.url = 'https://www.google.com/finance?'
    self.main_url = 'https://www.google.com'
    self.csvWriter = Csv('google_finance.csv')
    header = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
    self.csvWriter.writeCsvRow(header)
def __init__(self):
    """Point the scraper at walgreens.com and prepare walgreens.csv."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.main_url = 'http://www.walgreens.com'
    self.url = 'http://www.walgreens.com/store/catalog/shopLanding'
    self.sitemap_xml = 'http://www.walgreens.com/sitemap.xml'
    csvReader = Csv()
    self.dupCsvRows = csvReader.readCsvRow('walgreens.csv')
    self.csvWriter = Csv('walgreens.csv')
    columns = ['Product Name', 'Price', 'Description', 'Shipping', 'Ingredients', 'Image']
    # Header row is written only for a CSV that does not already contain it.
    if columns not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(columns)
class SaraivaScrapper(QThread): notifySaraiva = pyqtSignal(object) def __init__(self, urlList, category, htmlTag, replaceTag): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.urlList = urlList self.category = category self.htmlTag = self.regex.replaceData('\r+', '', htmlTag) self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag) self.htmlTag = self.regex.replaceData('\s+', ' ', self.htmlTag) self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag) self.replaceTag = replaceTag self.csvWriter = Csv(category + '.csv') csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture'] self.csvWriter.writeCsvRow(csvDataHeader) self.mainUrl = 'http://busca.livrariasaraiva.com.br' self.scrapUrl = None self.dbHelper = DbHelper('saraiva.db') self.dbHelper.createTable(category) self.total = self.dbHelper.getTotalProduct(category) def run(self, retry=0): try: if self.urlList is not None and len(self.urlList): for url in self.urlList: if len(url) > 0: url = self.regex.replaceData('(?i)\r', '', url) url = self.regex.replaceData('(?i)\n', '', url) self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url) paginationUrl, self.maxRecords = self.reformatUrl(url) self.notifySaraiva.emit( '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords)) print 'Max records: ', self.maxRecords print 'URL: ' + str(paginationUrl) sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev', '&isort=price+rev', '&isort=price', '&isort=date+rev'] for sort in sortList: self.scrapResults(paginationUrl, sort) self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>') except Exception, x: print x.message self.logger.error('Exception at run: ', x.message) if retry < 5: self.run(retry + 1)
class PaodeacucarScrapper(QThread): notifyPaode = pyqtSignal(object) def __init__(self): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.mainUrl = 'http://www.paodeacucar.com.br/' self.url = 'http://www.paodeacucar.com.br/' dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4) self.csvWriter = Csv('paodeacucar.csv') csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details', 'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14'] if 'URL' not in self.dupCsvRows: self.dupCsvRows.append(csvDataHeader) self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() def scrapData(self): try: print 'Main URL: ', self.url self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url)) data = self.spider.fetchData(self.url) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) soup = BeautifulSoup(data) categories = soup.find('nav', class_='items-wrapper').find_all('li', class_=re.compile('\s*item\s*')) print 'Total Categories: ', len(categories) self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories)))) for category in categories: if category.a is not None: submenu_target = self.regex.replaceData('#', '', category.a.get('data-target')) sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item') print 'Total Sub Categories: ', len(sub_categories) self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>' % str(len(sub_categories)))) for sub_category in sub_categories: sub_category_label = sub_category.find('span', class_='label').text sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A' self.scrapItems(sub_category_url, category.text, sub_category_label) except Exception, x: self.logger.error(x.message) print x
def __init__(self):
    """Initialise helpers, duplicate tracking and the paodeacucar.csv writer."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.mainUrl = 'http://www.paodeacucar.com.br/'
    self.url = 'http://www.paodeacucar.com.br/'
    dupReader = Csv()
    # Column 4 (URL) of existing rows drives duplicate detection.
    self.dupCsvRows = dupReader.readCsvRow('paodeacucar.csv', 4)
    self.csvWriter = Csv('paodeacucar.csv')
    columns = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
               'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
    if 'URL' not in self.dupCsvRows:
        self.dupCsvRows.append(columns)
        self.csvWriter.writeCsvRow(columns)
class CsTest(QThread): notifyProduct = pyqtSignal(object) def __init__(self): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() dupCsvReader = Csv() self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0) self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1) self.csvWriter = Csv('cs_product.csv') # self.mainUrl = 'http://www.cs-catering-equipment.co.uk/' self.mainUrl = 'http://www.cs-catering-equipment.co.uk/brands' self.utils = Utils() if 'Product Code' not in self.dupCsvRows: self.csvWriter.writeCsvRow( ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount', 'Product Short Description', 'Product Long Description', 'Product Technical Specifications', 'Warranty' , 'Delivery', 'Product Image', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image']) self.totalProducts = len(self.dupCsvRows) def run(self): self.scrapBrands() self.notifyProduct.emit('<font color=red><b>Finished Scraping All Brands.</b></font>') def scrapBrands(self): self.notifyProduct.emit('<font color=green><b>Main URL: %s<b></font>' % self.mainUrl) self.notifyProduct.emit('<b>Try To scrap All Brands.<b>') data = self.spider.fetchData(self.mainUrl) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data) if brandChunks and len(brandChunks) > 0: for brandChunk in brandChunks: brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk) self.notifyProduct.emit('<b>Total Brands Found: %s<b>' % str(len(brands))) if brands and len(brands) > 0: for brand in brands: try: self.scrapBrandInfo(brand[0], 'Shop By Brand', brand[1]) except Exception, x: self.logger.error(x)
class CsBrands(QThread): notifyBrand = pyqtSignal(object) def __init__(self): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow("cs_Brands.csv") self.csvWriter = Csv("cs_Brands.csv") self.mainUrl = "http://www.cs-catering-equipment.co.uk/brands" self.isExiting = False headerData = [ "URL", "Parent Category", "Brand Category", "Brand Description", "Image File", "Product Codes in this category", ] if headerData not in self.dupCsvRows: self.csvWriter.writeCsvRow(headerData) def run(self): self.scrapBrands() self.notifyBrand.emit("<font color=red><b>Finished Scraping All Brands.</b></font>") def scrapBrands(self): self.notifyBrand.emit("<font color=green><b>Main URL: %s<b></font>" % self.mainUrl) self.notifyBrand.emit("<b>Try To scrap All Brands.<b>") data = self.spider.fetchData(self.mainUrl) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data) if brandChunks and len(brandChunks) > 0: for brandChunk in brandChunks: brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk) self.notifyBrand.emit("<b>Total Brands Found: %s<b>" % str(len(brands))) if brands and len(brands) > 0: for brand in brands: try: self.scrapBrandInfo(brand[0], "Shop By Brand", brand[1]) except Exception, x: self.logger.error(x)
def __init__(self):
    """Set up search-page state and the env_clearance.csv writer."""
    self.browser = None
    self.url = "http://environmentclearance.nic.in/Search.aspx"
    self.statuses = []
    self.categories = []
    self.years = []
    self.states = []
    self.csvDataHeader = ['Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location',
                          'Important Date', 'Category', 'Company Proponent']
    self.logger = LogManager(__name__)
    self.regex = Regex()
    dupReader = Csv()
    self.dupCsvRows = dupReader.readCsvRow('env_clearance.csv')
    self.csvWriter = Csv('env_clearance.csv')
    # The header goes in only the first time the CSV is created.
    if self.csvDataHeader not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(self.csvDataHeader)
        self.dupCsvRows.append(self.csvDataHeader)
def __init__(self):
    """Set up scraping helpers and write the nisbets.csv header unconditionally."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.csvWriter = Csv('nisbets.csv')
    self.mainUrl = 'http://www.nisbets.co.uk'
    header = ['Category', 'Product Image Url', 'Product Code', 'Product Name', 'Price']
    self.csvWriter.writeCsvRow(header)
def __init__(self):
    """Prepare the cs-catering product scraper; the CSV header is always written."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    dupReader = Csv()
    # Column 0 (URL) of existing rows is kept for duplicate checks.
    self.dupCsvRows = dupReader.readCsvRow('cs_product.csv', 0)
    self.csvWriter = Csv('cs_product.csv')
    self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
    self.utils = Utils()
    self.csvWriter.writeCsvRow(
        ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
         'Product Short Description', 'Product Long Description', 'Product Technical Specifications',
         'Warranty', 'Delivery', 'Product Image', 'Category 1', 'Category 2', 'Category 3', 'Category 4',
         'Brand Image'])
    self.totalProducts = 0
class AmazonScrapper(): def __init__(self, url): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.url = url self.base_product_url = 'http://www.amazon.com/dp/' self.base_image_url = 'http://ecx.images-amazon.com/images/I/' self.csvWriter = Csv('amazon.csv') csvDataHeader = ['URL', 'HTML Path', 'Image URLS'] self.csvWriter.writeCsvRow(csvDataHeader) def scrapData(self): try: host = ('Host', 'www.amazon.com') data = self.spider.fetchData(self.url, host=host) if data: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) searchParams = self.regex.getSearchedData('(?i)var searchParams = {([^\}]*)}', data) searchParams = searchParams.split(',') seller = '' marketPlaceId = '' useMYI = '' for searchParam in searchParams: searchParam = self.regex.reduceBlankSpace(searchParam) searchParam = self.regex.replaceData('\'', '', searchParam) if searchParam.startswith('seller'): seller = searchParam.split(':')[1].strip() seller = seller.decode('string-escape') if searchParam.startswith('marketplaceID'): marketPlaceId = searchParam.split(':')[1].strip() marketPlaceId = marketPlaceId.decode('string-escape') if searchParam.startswith('useMYI'): useMYI = searchParam.split(':')[1].strip() useMYI = useMYI.decode('string-escape') params = {'seller': seller, 'marketPlaceId': marketPlaceId, 'useMYI': useMYI} ajax_url = 'http://www.amazon.com/gp/aag/ajax/productWidget.html' self.scrapAjaxPage(ajax_url, params, host) except Exception, x: print x
class WebTableScrapper(object): def __init__(self): self.browser = None self.url = "http://environmentclearance.nic.in/Search.aspx" self.statuses = [] self.categories = [] self.years = [] self.states = [] self.csvDataHeader = [ 'Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location', 'Important Date', 'Category', 'Company Proponent' ] self.logger = LogManager(__name__) self.regex = Regex() dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv') self.csvWriter = Csv('env_clearance.csv') if self.csvDataHeader not in self.dupCsvRows: self.csvWriter.writeCsvRow(self.csvDataHeader) self.dupCsvRows.append(self.csvDataHeader) def scrapData(self): try: self.browser = self.createBrowser([Config.USER_AGENT]) self.browser.set_handle_robots(False) # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat') # exit(1) data = self.browser.open(self.url, None, 60).read() if data is not None: soup = BeautifulSoup(data) self.statuses = self.populateDropDownValues( soup, 'ddlstatus', '0') self.categories = self.populateDropDownValues( soup, 'ddlcategory', '-All Category-') self.years = self.populateDropDownValues( soup, 'ddlyear', '-All Years-') self.states = self.populateDropDownValues( soup, 'ddlstate', '-All State-') for status in self.statuses: self.scrapDataByStatus(status[0], status[1]) except Exception, x: print x self.logger.error(x)
def __init__(self, urllist): QThread.__init__(self) self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() print urllist self.urllist = urllist self.csv = Csv('scrapper.csv')
def scrapBertos(self, retry=0):
    """Discover all language variants of the Bertos site and scrape each one.

    If the main page cannot be fetched, the whole pass is retried up to 5 times.
    """
    self.notifyProduct.emit('<font color=green><b>Try to get all language links.</b></font>')
    self.logger.debug(self.mainUrl)
    page = self.spider.fetchData(self.mainUrl)
    if page and len(page) > 0:
        page = self.regex.reduceNewLine(page)
        page = self.regex.reduceBlankSpace(page)
        langLinks = self.regex.getAllSearchedData(
            '(?i)<div class="[^"]*"><a href="([^"]*)"\s*?class="boxalingua">([^<]*)</a>', page)
        if langLinks and len(langLinks) > 0:
            self.logger.debug('Total languages: %s' % str(len(langLinks)))
            self.notifyProduct.emit('<b>Total languages found[%s]</b>' % str(len(langLinks)))
            for lang in langLinks:
                self.totalProducts = 0
                url = lang[0]
                homeData = self.spider.fetchData(url)
                if homeData and len(homeData) > 0:
                    homeData = self.regex.reduceNewLine(homeData)
                    homeData = self.regex.reduceBlankSpace(homeData)
                    # The per-language catalogue link is behind the category-2 rollover.
                    url = self.regex.getSearchedData(
                        '(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"', homeData)
                    csvFile = str(lang[1].strip()).lower() + '_' + 'bertos.csv'
                    langReader = Csv()
                    dupCsvRows = langReader.readCsvRow(csvFile)
                    csvWriter = Csv(csvFile)
                    if self.csvHeader not in dupCsvRows:
                        dupCsvRows.append(self.csvHeader)
                        csvWriter.writeCsvRow(self.csvHeader)
                    self.notifyProduct.emit(
                        '<font color=green><b>Try to get data for language [%s].</b></font>' % lang[1])
                    self.scrapCategory(url, dupCsvRows, csvWriter)
                    self.notifyProduct.emit(
                        '<font color=red><b>===== Finish scraping data for [%s] =====</b></font><br /><br />' % lang[1])
    else:
        # Main page fetch failed: retry the whole scrape a limited number of times.
        if retry < 5:
            return self.scrapBertos(retry + 1)
def __init__(self, filename):
    """Store the keyword file name and start topsy.csv with its header."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.filename = filename
    self.url = 'http://topsy.com/s?'
    self.csvWriter = Csv('topsy.csv')
    columns = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape']
    self.csvWriter.writeCsvRow(columns)
def initScrapper(self): try: dupCsvReader = Csv() dupCsvRows = dupCsvReader.readCsvRow('omvic.csv') self.dbHelper = DbHelper('omvic.db') self.dbHelper.createTable('omvic') self.totaldata = self.dbHelper.getTotalProduct('omvic') self.csvWriter = Csv('omvic.csv') csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status', 'Class of Registration', 'Subclass', 'Operating Status', 'Business Address', 'Email', 'Phone Number', 'Salesperson(s) Names'] if len(dupCsvRows) == 0: self.csvWriter.writeCsvRow(csvDataHeader) del dupCsvReader del dupCsvRows gc.collect() del gc.garbage[:] gc.collect() except Exception, x: print x
def __init__(self, urlList, category):
    """Prepare dup detection, <category>.csv output and the amazon.db table."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.urlList = urlList
    self.category = category
    reader = Csv()
    self.dupCsvRows = reader.readCsvRow(category + '.csv')
    self.csvWriter = Csv(category + '.csv')
    columns = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
    # First run for this category: remember and write the header row.
    if columns not in self.dupCsvRows:
        self.dupCsvRows.append(columns)
        self.csvWriter.writeCsvRow(columns)
    self.mainUrl = 'http://www.amazon.com'
    self.scrapUrl = None
    self.dbHelper = DbHelper('amazon.db')
    self.dbHelper.createTable(category)
    self.total = self.dbHelper.getTotalProduct(category)
def __init__(self, filename):
    """Record the ticker file and start google_finance.csv with its header."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.filename = filename
    self.url = 'https://www.google.com/finance?'
    self.main_url = 'https://www.google.com'
    self.csvWriter = Csv('google_finance.csv')
    columns = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
    self.csvWriter.writeCsvRow(columns)
def __init__(self):
    """Initialise browser state, drop-down caches and env_clearance.csv output."""
    self.browser = None
    self.url = "http://environmentclearance.nic.in/Search.aspx"
    self.statuses = []
    self.categories = []
    self.years = []
    self.states = []
    self.csvDataHeader = ['Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location',
                          'Important Date', 'Category', 'Company Proponent']
    self.logger = LogManager(__name__)
    self.regex = Regex()
    csvReader = Csv()
    self.dupCsvRows = csvReader.readCsvRow('env_clearance.csv')
    self.csvWriter = Csv('env_clearance.csv')
    # Only a brand-new CSV gets the header row appended.
    if self.csvDataHeader not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(self.csvDataHeader)
        self.dupCsvRows.append(self.csvDataHeader)
def __init__(self, url):
    """Store the target URL and open amazon.csv with its header row."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.url = url
    self.base_product_url = 'http://www.amazon.com/dp/'
    self.base_image_url = 'http://ecx.images-amazon.com/images/I/'
    self.csvWriter = Csv('amazon.csv')
    columns = ['URL', 'HTML Path', 'Image URLS']
    self.csvWriter.writeCsvRow(columns)
def __init__(self):
    """Prepare dup rows and the cs_Brands.csv writer; header is written once."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    reader = Csv()
    self.dupCsvRows = reader.readCsvRow("cs_Brands.csv")
    self.csvWriter = Csv("cs_Brands.csv")
    self.mainUrl = "http://www.cs-catering-equipment.co.uk/brands"
    self.isExiting = False
    columns = ["URL", "Parent Category", "Brand Category", "Brand Description", "Image File",
               "Product Codes in this category"]
    if columns not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(columns)
class TopsyScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'http://topsy.com/s?' self.csvWriter = Csv('topsy.csv') csvDataHeader = [ 'Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape' ] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() if len(line) > 0: params = urllib.urlencode({ 'q': line, 'window': 'm', 'type': 'tweet' }) url = self.url + params self.scrapBrowserData(url, line) except Exception, x: print x
def __init__(self):
    """Initialise the Nisbets product scraper; write the CSV header for a new file."""
    QtCore.QThread.__init__(self)
    self.isExiting = False
    self.totalProducts = 0
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    dupCsvReader = Csv()
    self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
    self.csvWriter = Csv('nisbets.csv')
    self.mainUrl = 'http://www.nisbets.co.uk'
    # BUG FIX: missing comma after 'Product Status' used to merge it with
    # 'Category1' into one header cell via implicit string concatenation.
    csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand',
                     'Product Price', 'Product Short Description', 'Product Long Description',
                     'Image File Name', 'User Manual File Name', 'Exploded View File Name', 'Spares Code',
                     'Accessories', 'Product Status', 'Category1', 'Category2', 'Category3', 'Category4']
    if 'URL' not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(csvHeaderList)
        self.dupCsvRows.append(csvHeaderList[0])
    self.utils = Utils()
class WebTableScrapper(object): def __init__(self): self.browser = None self.url = "http://environmentclearance.nic.in/Search.aspx" self.statuses = [] self.categories = [] self.years = [] self.states = [] self.csvDataHeader = ['Status', 'Category', 'Year', 'State', 'Serial No', 'Proposal details', 'Location', 'Important Date', 'Category', 'Company Proponent'] self.logger = LogManager(__name__) self.regex = Regex() dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('env_clearance.csv') self.csvWriter = Csv('env_clearance.csv') if self.csvDataHeader not in self.dupCsvRows: self.csvWriter.writeCsvRow(self.csvDataHeader) self.dupCsvRows.append(self.csvDataHeader) def scrapData(self): try: self.browser = self.createBrowser([Config.USER_AGENT]) self.browser.set_handle_robots(False) # self.scrapDataByState('UPEC', 'MIN', '2011', 'Gujarat') # exit(1) data = self.browser.open(self.url, None, 60).read() if data is not None: soup = BeautifulSoup(data) self.statuses = self.populateDropDownValues(soup, 'ddlstatus', '0') self.categories = self.populateDropDownValues(soup, 'ddlcategory', '-All Category-') self.years = self.populateDropDownValues(soup, 'ddlyear', '-All Years-') self.states = self.populateDropDownValues(soup, 'ddlstate', '-All State-') for status in self.statuses: self.scrapDataByStatus(status[0], status[1]) except Exception, x: print x self.logger.error(x)
class GoogleFinanceScrapper: isFinished = False def __init__(self, filename): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.filename = filename self.url = 'https://www.google.com/finance?' self.main_url = 'https://www.google.com' self.csvWriter = Csv('google_finance.csv') csvDataHeader = [ 'Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape' ] self.csvWriter.writeCsvRow(csvDataHeader) def run(self): self.scrapData() self.csvWriter.closeWriter() def scrapData(self): try: file = open(self.filename, 'rb') for line in file.readlines(): if self.isFinished: return line = self.regex.replaceData('\r+', '', line) line = self.regex.reduceNewLine(line) line = self.regex.reduceBlankSpace(line) line = line.strip() params = urllib.urlencode({'q': line}) url = self.url + params self.scrapBykeyword(url, line) except Exception, x: print x self.logger.error('Error: ' + x.message)
class WalgreensScrapper(): def __init__(self): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() self.main_url = 'http://www.walgreens.com' self.url = 'http://www.walgreens.com/store/catalog/shopLanding' self.sitemap_xml = 'http://www.walgreens.com/sitemap.xml' dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('walgreens.csv') self.csvWriter = Csv('walgreens.csv') csvDataHeader = ['Product Name', 'Price', 'Description', 'Shipping', 'Ingredients', 'Image'] if csvDataHeader not in self.dupCsvRows: self.csvWriter.writeCsvRow(csvDataHeader) def scrapData(self): try: print 'First scrapping sitemap...' self.scrapSiteMap() print 'Main URL: ' + self.url data = self.spider.fetchData(self.url) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) soup = BeautifulSoup(data) categoryBar = soup.find('div', class_='wid150 padrt5px padlt5px float-left') if categoryBar: categories = categoryBar.find_all('li') for category in categories: category_url = self.main_url + category.a.get('href') self.scrapCategory(category_url) except Exception, x: print x
def __init__(self):
    """Initialise the Nisbets product-scraper thread and its CSV output.

    Loads the first column of any existing nisbets.csv for de-duplication
    and writes the header row only on the first ever run.
    """
    QtCore.QThread.__init__(self)
    self.isExiting = False  # cooperative stop flag polled by the scraping loops
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    dupCsvReader = Csv()
    self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
    self.csvWriter = Csv('nisbets.csv')
    self.mainUrl = 'http://www.nisbets.co.uk'
    # BUG FIX: the original read "'Product Status' 'Category1'" with no comma.
    # Adjacent string literals are implicitly concatenated in Python, so the
    # header silently became a single 'Product StatusCategory1' column — a
    # 17-column header for 18-column data rows.
    csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications',
                     'Product Name', 'Brand', 'Product Price',
                     'Product Short Description', 'Product Long Description',
                     'Image File Name', 'User Manual File Name',
                     'Exploded View File Name', 'Spares Code', 'Accessories',
                     'Product Status', 'Category1', 'Category2', 'Category3',
                     'Category4']
    if 'URL' not in self.dupCsvRows:
        self.csvWriter.writeCsvRow(csvHeaderList)
        self.dupCsvRows.append(csvHeaderList[0])
    self.utils = Utils()
def __init__(self):
    """Set up scraper helpers, site endpoints, credentials and CSV output."""
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    # TRG group / Victorinox endpoints (credentials are redacted in source).
    self.loginUrl = 'http://www.trggroup.net/victorinox/index.php'
    self.username = '******'
    self.password = '******'
    self.collectionUrl = 'http://www.trggroup.net/victorinox/index.php?p=124'
    self.mainUrl = 'http://www.trggroup.net/victorinox/'
    self.url = 'http://www.ebags.com/brands'
    self.csvWriter = Csv('trggroup.csv')
    header = ['Name1', 'Name2', 'Dimension1', 'Dimension2',
              'Spec1', 'Spec2', 'Spec3', 'Product Details', 'Image']
    self.csvWriter.writeCsvRow(header)
    # HTTP traffic is routed through this fixed proxy.
    self.proxy = urllib2.ProxyHandler({'http': '184.168.55.226:80'})
def scrapBertos(self, retry=0):
    """Scrape every language edition of the Bertos site, one CSV per language.

    If the main page cannot be fetched, the whole run is retried (up to 5
    attempts) via recursion on *retry*.
    """
    self.notifyProduct.emit(
        '<font color=green><b>Try to get all language links.</b></font>')
    self.logger.debug(self.mainUrl)
    homepage = self.spider.fetchData(self.mainUrl)
    if not (homepage and len(homepage) > 0):
        # Fetch failed — retry a few times before giving up silently.
        if retry < 5:
            return self.scrapBertos(retry + 1)
        return
    homepage = self.regex.reduceNewLine(homepage)
    homepage = self.regex.reduceBlankSpace(homepage)
    languages = self.regex.getAllSearchedData(
        '(?i)<div class="[^"]*"><a href="([^"]*)"\s*?class="boxalingua">([^<]*)</a>',
        homepage)
    if not (languages and len(languages) > 0):
        return
    self.logger.debug('Total languages: %s' % str(len(languages)))
    self.notifyProduct.emit('<b>Total languages found[%s]</b>' % str(len(languages)))
    for lang in languages:  # lang is (url, label)
        self.totalProducts = 0
        catUrl = lang[0]
        page = self.spider.fetchData(catUrl)
        if page and len(page) > 0:
            page = self.regex.reduceNewLine(page)
            page = self.regex.reduceBlankSpace(page)
            # Pull the category-root link out of the language's landing page.
            catUrl = self.regex.getSearchedData(
                '(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"', page)
        # One output file per language, e.g. en_bertos.csv.
        csvFile = str(lang[1].strip()).lower() + '_' + 'bertos.csv'
        dupReader = Csv()
        dupRows = dupReader.readCsvRow(csvFile)
        writer = Csv(csvFile)
        if self.csvHeader not in dupRows:
            dupRows.append(self.csvHeader)
            writer.writeCsvRow(self.csvHeader)
        self.notifyProduct.emit(
            '<font color=green><b>Try to get data for language [%s].</b></font>'
            % lang[1])
        self.scrapCategory(catUrl, dupRows, writer)
        self.notifyProduct.emit(
            '<font color=red><b>===== Finish scraping data for [%s] =====</b></font><br /><br />'
            % lang[1])
class NisbetProduct(QtCore.QThread):
    """QThread that crawls nisbets.co.uk four category levels deep and writes
    one CSV row per product, downloading product images, user manuals and
    exploded-view diagrams along the way.

    Fixes over the original:
      * the CSV header list was missing a comma ('Product Status' 'Category1'
        implicitly concatenated into one bogus column);
      * a duplicate product aborted the whole category page with ``return`` —
        it now ``continue``s to the next product, as the "Skip it" log message
        always intended.
    """

    # Emits HTML-formatted progress strings for the GUI.
    scrapProductData = QtCore.pyqtSignal(object)
    stopThread = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.isExiting = False  # cooperative stop flag, set by stop()
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
        self.csvWriter = Csv('nisbets.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        # BUG FIX: comma added between 'Product Status' and 'Category1'.
        csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications',
                         'Product Name', 'Brand', 'Product Price',
                         'Product Short Description', 'Product Long Description',
                         'Image File Name', 'User Manual File Name',
                         'Exploded View File Name', 'Spares Code', 'Accessories',
                         'Product Status', 'Category1', 'Category2', 'Category3',
                         'Category4']
        if 'URL' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvHeaderList)
            self.dupCsvRows.append(csvHeaderList[0])
        self.utils = Utils()

    def run(self):
        self.scrapData()

    def stop(self):
        """Request a cooperative stop; loops poll self.isExiting."""
        self.isExiting = True

    def scrapData(self):
        """Scrape the home page and descend into every level-2 category."""
        if self.isExiting:
            return
        self.scrapProductData.emit(
            '<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category1Chunk = self.regex.getAllSearchedData(
                '(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
            if category1Chunk:
                for category1Data in category1Chunk:
                    category1 = self.regex.getSearchedData(
                        '(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
                    category2Chunk = self.regex.getAllSearchedData(
                        '(?i)<li><a href="([^"]*)">([^<]*)</a>', category1Data)
                    if category2Chunk:
                        for category2Data in category2Chunk:
                            self.scrapCategory2Data(
                                self.mainUrl + category2Data[0], category1,
                                category2Data[1])
        self.scrapProductData.emit(
            '<font color=red><b>Finish Scraping Product data from %s</b></font>'
            % self.mainUrl)

    def scrapCategory2Data(self, url, category1, category2):
        """Scrape one level-2 category page and descend into level 3."""
        if self.isExiting:
            return
        self.scrapProductData.emit('<b>Category 2 URL: </b>%s' % url)
        self.logger.debug('== Category 2 URL [' + url + '] ==')
        data = self.spider.fetchData(url)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category3Chunks = self.regex.getSearchedData(
                '(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
            if category3Chunks:
                category3Chunk = self.regex.getAllSearchedData(
                    '(?i)<a href="([^"]*)">([^<]*)<', category3Chunks)
                if category3Chunk:
                    for category3Data in category3Chunk:
                        self.scrapCategory3Data(
                            self.mainUrl + category3Data[0], category1,
                            category2, category3Data[1])

    def scrapCategory3Data(self, url, category1, category2, category3):
        """Scrape one level-3 category page and descend into level 4."""
        if self.isExiting:
            return
        self.scrapProductData.emit('<b>Category 3 URL: </b>%s' % url)
        self.logger.debug('== Category 3 URL [' + url + '] ==')
        data = self.spider.fetchData(url)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category4Chunks = self.regex.getSearchedData(
                '(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
            if category4Chunks:
                category4Chunk = self.regex.getAllSearchedData(
                    '(?i)<a href="([^"]*)">([^<]*)<', category4Chunks)
                if category4Chunk:
                    for category4Data in category4Chunk:
                        category4Url = self.mainUrl + category4Data[0]
                        self.scrapCategory4Data(category4Url, category1,
                                                category2, category3,
                                                category4Data[1])

    def scrapCategory4Data(self, url, category1, category2, category3, category4):
        """Scrape every product listed on a level-4 category page.

        For each new product: fetch its detail page, collect descriptions,
        spares/accessory codes, download its image and any PDF documents,
        then append one row to the CSV.
        """
        if self.isExiting:
            return
        self.scrapProductData.emit('<b>Category 4 URL: </b>%s' % url)
        self.logger.debug('== Category 4 URL [' + url + '] ==')
        data = self.spider.fetchData(url)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            categoryChunk = self.regex.getAllSearchedData(
                '(?i)<div class="product-list-row clear-after">(.*?)</fieldset>',
                data)
            if categoryChunk:
                for categoryData in categoryChunk:
                    if self.isExiting:
                        return
                    productInfo = self.regex.getSearchedDataGroups(
                        '(?i)<h3 class="product-name"> <a href="([^"]*)"[^>]*?>([^<]*)</a>',
                        categoryData)
                    productUrl = self.mainUrl + productInfo.group(1)
                    productName = productInfo.group(2)
                    if productUrl not in self.dupCsvRows:
                        self.dupCsvRows.append(productUrl)
                    else:
                        self.scrapProductData.emit(
                            '<font color=green><b>Already exists this item in csv Skip it</b></font>')
                        self.logger.debug(
                            '========= Already exists this item Skip it ===========')
                        # BUG FIX: was ``return``, which abandoned every
                        # remaining product on the page; skip only this one.
                        continue

                    productImageInfo = self.regex.getSearchedDataGroups(
                        '(?i)<img class="primaryImage" src="([^"]*)" alt="([^"]*)"',
                        categoryData)
                    # Swap the thumbnail for the extra-large rendition.
                    image = self.regex.replaceData(
                        '(?i)medium', 'xlarge', str(productImageInfo.group(1)))
                    productImageUrl = self.mainUrl + image
                    productImage = self.regex.getSearchedData(
                        '(?i)/([a-zA-Z0-9-_.]*)$', image)
                    self.utils.downloadFile(productImageUrl,
                                            'images/' + productImage)
                    productCode = productImageInfo.group(2)
                    productTechSpecs = self.regex.getSearchedData(
                        '(?i)<p class="description">([^<]*)</p>', categoryData)
                    brandName = self.regex.getSearchedData(
                        '(?i)<img class="brand-image" src="[^"]*" alt="([^"]*)"',
                        categoryData)
                    price = self.regex.getSearchedData(
                        '(?i)<div class="reduced-price"> <span class="[^"]*">([^<]*)</span>',
                        categoryData)
                    if price:
                        # Drop the leading currency symbol.
                        price = price.strip()[1:]
                    productStatus = self.regex.getSearchedData(
                        '(?i)<div class="availibility"> <img alt="([^"]*)"',
                        categoryData)

                    productDesc = ''
                    productLongDesc = ''
                    spareCodes = ''
                    accessoryCode = ''
                    userManual = ''
                    explodedView = ''
                    self.scrapProductData.emit(
                        '<br /><font color=green><b>Product Details URL: </b>%s</font>'
                        % productUrl)
                    productChunk = self.spider.fetchData(productUrl)
                    if productChunk:
                        productChunk = self.regex.reduceNewLine(productChunk)
                        productChunk = self.regex.reduceBlankSpace(productChunk)
                        productDesc = self.regex.getSearchedData(
                            '(?i)<div class="productDesc"> <h1 class="[^"]*"[^>]*?>[^<]*?</h1>.*?<p>([^<]*)</p>',
                            productChunk)
                        productLongDesc = self.regex.getSearchedData(
                            '(?i)<div class="info-product[^>]*?>(.*?)</div>',
                            productChunk)
                        # Base URL of the product page; the Ajax endpoints
                        # below live alongside it.
                        otherUrl = self.regex.getSearchedData(
                            '(?i)(^.*?/)[a-zA-Z0-9._-]*?$', productUrl)
                        self.logger.debug('== Common Product URL [' + otherUrl + '] ==')

                        sparesUrl = otherUrl + "AjaxProductSpares.raction"
                        self.logger.debug('== Spares URL [' + sparesUrl + '] ==')
                        spares = self.spider.fetchData(sparesUrl)
                        if spares:
                            spares = self.regex.getAllSearchedData(
                                '(?i)<p class="code"><span class="bold">Code:</span>([^<]*)</p>',
                                spares)
                            if spares:
                                spareCodes = ', '.join(spares)

                        accessoriesUrl = otherUrl + "AjaxProductAccessories.raction"
                        self.logger.debug('== Accessories URL [' + accessoriesUrl + '] ==')
                        accessories = self.spider.fetchData(accessoriesUrl)
                        if accessories:
                            accessories = self.regex.getAllSearchedData(
                                '(?i)<p class="code"><span class="bold">Code:</span>([^<]*)</p>',
                                accessories)
                            if accessories:
                                accessoryCode = ', '.join(accessories)

                        docUrl = otherUrl + "AjaxProductDocuments.raction"
                        self.logger.debug('== Document URL[' + docUrl + '] ==')
                        userManuals = self.spider.fetchData(docUrl)
                        if userManuals:
                            userManual = self.regex.getSearchedData(
                                '(?i)<a class="document-icon" href="([^"]*)"[^>]*?>Download User Manual</a>',
                                userManuals)
                            self.logger.debug('Manual URL: ' + userManual)
                            if userManual:
                                userManualUrl = self.mainUrl + self.regex.replaceData(
                                    ' ', '%20', userManual)
                                self.logger.debug('User Manual URL: ' + userManualUrl)
                                self.scrapProductData.emit(
                                    '<b>User Manual PDF URL: </b>%s' % userManualUrl)
                                userManual = self.regex.getSearchedData(
                                    '(?i)/([a-zA-Z0-9-_. ]*)$', userManual)
                                userManual = self.regex.replaceData(
                                    '\s+', '_', userManual.strip())
                                self.scrapProductData.emit(
                                    '<font color=green><b>Downloading User Manual: </b>%s <b>Please Wait...</b>'
                                    % userManual)
                                self.utils.downloadFile(
                                    userManualUrl, 'user_manual/' + userManual)
                            explodedView = self.regex.getSearchedData(
                                '(?i)<a class="document-icon" href="([^"]*)"[^>]*?>Download Exploded Diagram</a>',
                                userManuals)
                            if explodedView:
                                explodedViewUrl = self.mainUrl + self.regex.replaceData(
                                    ' ', '%20', explodedView)
                                self.scrapProductData.emit(
                                    '<b>Exploded Diagram PDF URL: </b>%s' % explodedViewUrl)
                                explodedView = self.regex.getSearchedData(
                                    '(?i)/([a-zA-Z0-9-_. ]*)$', explodedView)
                                explodedView = self.regex.replaceData(
                                    '\s+', '_', explodedView.strip())
                                self.scrapProductData.emit(
                                    '<font color=green><b>Downloading Exploded Diagram: </b>%s <b>Please Wait...</b>'
                                    % explodedView)
                                self.utils.downloadFile(
                                    explodedViewUrl, 'exploded_view/' + explodedView)

                    csvData = [productUrl, productCode, productTechSpecs,
                               productName, brandName, price.strip(),
                               productDesc, productLongDesc, productImage,
                               userManual, explodedView, spareCodes,
                               accessoryCode, productStatus, category1,
                               category2, category3, category4]
                    self.csvWriter.writeCsvRow(csvData)
                    self.logger.debug('Scraped data ' + str(csvData))
                    self.scrapProductData.emit(
                        '<div><b>Scraped Data: </b>%s<br /></div>' % str(csvData))
class Form(QMainWindow): def __init__(self, parent=None): super(Form, self).__init__(parent) self.createGui() self.memberDic = {} self.excludedMember = None dupCsvReader = Csv() self.dupCsvRows = dupCsvReader.readCsvRow('linkedIn.csv', 0) self.csvWriter = Csv('linkedIn.csv') self.allMembers = [] def createGui(self): self.labelUser = QLabel('<b>Username: </b>') self.inputUser = QLineEdit() self.labelPass = QLabel('<b>Password:</b>') self.inputPass = QLineEdit() self.labelPageRange = QLabel( '<b>Select Your Page Range:<br />Example: 2-5 or 1 </b>') self.inputPageRange = QLineEdit() self.btnGroup = QPushButton('&Scrap Groups') self.btnGroup.clicked.connect(self.btnOkAction) self.labelCombo = QLabel('<b>Select Your Group: </b>') self.combo = QComboBox() self.combo.currentIndexChanged.connect(self.groupChangeEvent) self.labelExcludeMember = QLabel( '<b>Write Your Excluded Member Name: <br />(ex_member1,ex_member2)</b>' ) self.inputExcludeMember = QLineEdit() self.btnMember = QPushButton('&Scrap Members') self.btnMember.clicked.connect(self.btnMembersAction) self.labelMember = QLabel('<b>Scraped Members: </b>') self.browserMember = QTextBrowser() self.browserMember.setReadOnly(False) self.btnExcludeAll = QPushButton('&Exclude All Member') self.btnExcludeAll.clicked.connect(self.excludeAllAction) self.labelSubject = QLabel('<b>Message Subject: </b>') self.inputSubject = QLineEdit() self.labelMessage = QLabel('<b>Write Message: </b>') self.browserMessage = QTextBrowser() self.browserMessage.setReadOnly(False) self.btnSendMessage = QPushButton('&Send Message') self.btnSendMessage.clicked.connect(self.sendMessageAction) self.browser = QTextBrowser() layout = QGridLayout() layout.addWidget(self.labelUser, 0, 0) layout.addWidget(self.inputUser, 0, 1) layout.addWidget(self.labelPass, 1, 0) layout.addWidget(self.inputPass, 1, 1) layout.addWidget(self.labelPageRange, 2, 0) layout.addWidget(self.inputPageRange, 2, 1) layout.addWidget(self.btnGroup, 3, 1, Qt.AlignLeft) 
layout.addWidget(self.labelCombo, 4, 0) layout.addWidget(self.combo, 4, 1) layout.addWidget(self.labelExcludeMember, 5, 0) layout.addWidget(self.inputExcludeMember, 5, 1) layout.addWidget(self.btnExcludeAll, 6, 0, Qt.AlignLeft) layout.addWidget(self.btnMember, 6, 1, Qt.AlignLeft) layout.addWidget(self.labelMember, 7, 0) layout.addWidget(self.browserMember, 7, 1) layout.addWidget(self.labelSubject, 8, 0) layout.addWidget(self.inputSubject, 8, 1) layout.addWidget(self.labelMessage, 9, 0) layout.addWidget(self.browserMessage, 9, 1) layout.addWidget(self.btnSendMessage, 10, 1) layoutMain = QVBoxLayout() layoutMain.addLayout(layout) layoutMain.addWidget(self.browser) widget = QWidget() widget.setLayout(layoutMain) self.setCentralWidget(widget) self.resize(600, 600) self.setWindowTitle('LinkedIn Scrapper.') def groupChangeEvent(self): self.browserMember.clear() def btnOkAction(self): self.linkedIn = MyLinkedIn(self.inputUser.text(), self.inputPass.text()) # self.linkedIn = MyLinkedIn('*****@*****.**', 'ubuntu36') self.linkedIn.notifyLinkedIn.connect(self.notifyInfo) self.linkedIn.cookieL.connect(self.setSpiderObj) self.linkedIn.notifyMember.connect(self.addGroups) self.linkedIn.start() def sendMessageAction(self): messageMembers = [] members = self.browserMember.toPlainText().split('\n') for member in members: messageMembers.append((member, self.memberDic[member])) self.linkedInMessage = MyLinkedInMessage( self.spiderObj, messageMembers, self.inputSubject.text(), self.browserMessage.toPlainText()) self.linkedInMessage.notifyLinkedIn.connect(self.notifyInfo) self.linkedInMessage.start() def btnMembersAction(self): # self.linkedInMember = MyLinkedInMembers(self.spiderObj, # self.combo.itemData(self.combo.currentIndex()).toString(), '2-5') self.browserMember.clear() self.linkedInMember = MyLinkedInMembers( self.spiderObj, self.combo.itemData(self.combo.currentIndex()).toString(), self.inputPageRange.text()) self.linkedInMember.notifyLinkedIn.connect(self.notifyInfo) 
self.linkedInMember.notifyMembers.connect(self.appendMembers) self.linkedInMember.start() def excludeAllAction(self): if self.allMembers is not None and len(self.allMembers) > 0: for member in self.allMembers: if member[0] not in self.dupCsvRows: self.dupCsvRows.append( [member[0], unicode(member[1]), unicode(member[2])]) self.csvWriter.writeCsvRow( [member[0], unicode(member[1]), unicode(member[2])]) self.browserMember.clear() self.allMembers = None def appendMembers(self, members): print self.dupCsvRows try: self.excludedMember = unicode( self.inputExcludeMember.text()).split(',') except Exception, x: print x for member in members: if member[0] is None or len( member[0]) == 0 or member[1] is None or len( member[1]) == 0: continue if member not in self.allMembers: print member self.allMembers.append(member) if self.excludedMember is not None and unicode( member[1] ) is not None and len(unicode(member[1])) > 0 and unicode( member[1] ) in self.excludedMember and member[0] not in self.dupCsvRows: self.dupCsvRows.append( [member[0], unicode(member[1]), unicode(member[2])]) self.csvWriter.writeCsvRow( [member[0], unicode(member[1]), unicode(member[2])]) if self.excludedMember is None or ( unicode(member[1]) not in self.excludedMember and member[0] not in self.dupCsvRows): if unicode( member[1]) is not None and len(unicode(member[1])) > 0: self.browserMember.append(member[1]) self.memberDic[QString(member[1])] = member[0]