class OmvicScrapper:
    """Scrapes OMVIC dealer registration data into omvic.csv / omvic.db."""

    # NOTE(review): class-level attributes are shared by every instance;
    # im_data in particular is a mutable list, so two concurrent scrapper
    # instances would append into the same list -- confirm single-instance
    # use, or move these into __init__.
    isFinished = False
    im_data = []

    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.initScrapper()

    def initScrapper(self):
        """Prepare the CSV writer, DB table and duplicate-row check."""
        try:
            dupCsvReader = Csv()
            dupCsvRows = dupCsvReader.readCsvRow('omvic.csv')
            self.dbHelper = DbHelper('omvic.db')
            self.dbHelper.createTable('omvic')
            self.totaldata = self.dbHelper.getTotalProduct('omvic')
            self.csvWriter = Csv('omvic.csv')
            csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status',
                             'Class of Registration', 'Subclass', 'Operating Status',
                             'Business Address', 'Email', 'Phone Number',
                             'Salesperson(s) Names']
            # Write the header only when the CSV file is empty/new.
            if len(dupCsvRows) == 0:
                self.csvWriter.writeCsvRow(csvDataHeader)
            del dupCsvReader
            del dupCsvRows
            gc.collect()
            del gc.garbage[:]
            gc.collect()
        except Exception as x:  # fix: Py2-only "except Exception, x" syntax
            # fix: record the failure through the logger instead of only printing
            print(x)
            self.logger.error('Exception at initScrapper: %s' % x)
class AmazonScrapper(QThread):
    """Scrapes Amazon search result pages for every URL in urlList.

    Progress/status HTML snippets are emitted through the notifyAmazon signal.
    """
    notifyAmazon = pyqtSignal(object)

    def __init__(self, urlList, category):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
        # Write the header once; dupCsvRows doubles as the duplicate filter.
        if csvDataHeader not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://www.amazon.com'
        self.scrapUrl = None
        self.dbHelper = DbHelper('amazon.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        """Thread entry point: scrape every URL, retrying the whole pass up to 5 times.

        :param retry: internal retry counter, incremented on each recursive re-run.
        """
        try:
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
                        # fix: the original reused "retry" here, clobbering the
                        # method parameter so the retry<5 guard below compared
                        # against this loop counter instead of the real attempt
                        # count. Use a dedicated counter.
                        imUrl = None
                        reformatTry = 0
                        while imUrl is None and reformatTry < 4:
                            imUrl = self.reformatUrl(url)
                            reformatTry += 1
                        if imUrl is None:
                            imUrl = url
                        self.total = 0
                        print('URL: ' + str(imUrl))
                        sortList = ['relevance-fs-browse-rank', 'price', '-price',
                                    'reviewrank_authority', 'date-desc-rank']
                        for sort in sortList:
                            self.scrapReformatData(imUrl, sort)
                        self.notifyAmazon.emit(
                            '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url)
            self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
        except Exception as x:  # fix: Py2-only except syntax; x.message is unreliable
            print(x)
            self.logger.error('Exception at run: %s' % x)
            if retry < 5:
                self.run(retry + 1)
class SaraivaScrapper(QThread):
    """Scrapes product listings from busca.livrariasaraiva.com.br.

    Progress/status HTML snippets are emitted through the notifySaraiva signal.
    """
    notifySaraiva = pyqtSignal(object)

    def __init__(self, urlList, category, htmlTag, replaceTag):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        # Normalise the supplied HTML template: strip CRs, collapse
        # newlines and runs of whitespace, unify escaped quotes.
        self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
        self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
        # fix: raw string for the regex pattern (avoids the invalid "\s"
        # escape warning; the string bytes are unchanged)
        self.htmlTag = self.regex.replaceData(r'\s+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
        self.replaceTag = replaceTag
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price',
                         'Synopsis and Characteristics', 'Picture']
        self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://busca.livrariasaraiva.com.br'
        self.scrapUrl = None
        self.dbHelper = DbHelper('saraiva.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        """Thread entry point: scrape every URL, retrying the whole pass up to 5 times.

        :param retry: internal retry counter, incremented on each recursive re-run.
        """
        try:
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
                        paginationUrl, self.maxRecords = self.reformatUrl(url)
                        self.notifySaraiva.emit(
                            '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
                        print('Max records: %s' % self.maxRecords)
                        print('URL: ' + str(paginationUrl))
                        sortList = ['&isort=globalpop', '&isort=best', '&isort=title',
                                    '&isort=title+rev', '&isort=price+rev',
                                    '&isort=price', '&isort=date+rev']
                        for sort in sortList:
                            self.scrapResults(paginationUrl, sort)
            self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
        except Exception as x:  # fix: Py2-only except syntax; x.message is unreliable
            print(x)
            self.logger.error('Exception at run: %s' % x)
            if retry < 5:
                self.run(retry + 1)
class HostDAO:
    """Read-only data access for the ``host`` table."""

    __db = None

    def __init__(self):
        """Open a fresh DbHelper for this DAO instance."""
        self.__db = DbHelper()

    def getAll(self):
        """Return every row of the host table."""
        cursor = self.__db.query("SELECT * FROM host", None)
        return cursor.fetchall()
class DiscoveryDAO:
    """Data access for the ``container_discovery`` table."""

    __db = None

    def __init__(self):
        """Open a fresh DbHelper for this DAO instance."""
        self.__db = DbHelper()

    def getAll(self):
        """Return every discovery row."""
        cursor = self.__db.query("SELECT * FROM container_discovery", None)
        return cursor.fetchall()

    def getAvailableContainer(self, workload, host):
        """Return the cursor over stopped containers matching workload/host."""
        sql = """
            SELECT *
            FROM container_discovery
            WHERE status = %s AND workload = %s AND host = %s
            """
        return self.__db.query(sql, ("stopped", workload, host))

    def updateContainerStatus(self, status, containerId):
        """Persist a new status for one container row and commit."""
        sql = """
            UPDATE container_discovery
            SET status=%s
            WHERE id=%s
            """
        self.__db.query(sql, (status, containerId))
        self.__db.commit()
def __init__(self, urlList, category):
    """Set up spider helpers, the CSV duplicate cache and the per-category DB table."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.urlList = urlList
    self.category = category
    csvFile = category + '.csv'
    reader = Csv()
    self.dupCsvRows = reader.readCsvRow(csvFile)
    self.csvWriter = Csv(csvFile)
    header = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
    # Header is written only once; dupCsvRows doubles as the duplicate filter.
    if header not in self.dupCsvRows:
        self.dupCsvRows.append(header)
        self.csvWriter.writeCsvRow(header)
    self.mainUrl = 'http://www.amazon.com'
    self.scrapUrl = None
    self.dbHelper = DbHelper('amazon.db')
    self.dbHelper.createTable(category)
    self.total = self.dbHelper.getTotalProduct(category)
def initScrapper(self):
    """Prepare the OMVIC CSV writer, DB table and duplicate-row check."""
    try:
        dupCsvReader = Csv()
        dupCsvRows = dupCsvReader.readCsvRow('omvic.csv')
        self.dbHelper = DbHelper('omvic.db')
        self.dbHelper.createTable('omvic')
        self.totaldata = self.dbHelper.getTotalProduct('omvic')
        self.csvWriter = Csv('omvic.csv')
        csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status',
                         'Class of Registration', 'Subclass', 'Operating Status',
                         'Business Address', 'Email', 'Phone Number',
                         'Salesperson(s) Names']
        # Write the header only when the CSV file is empty/new.
        if len(dupCsvRows) == 0:
            self.csvWriter.writeCsvRow(csvDataHeader)
        del dupCsvReader
        del dupCsvRows
        gc.collect()
        del gc.garbage[:]
        gc.collect()
    except Exception as x:  # fix: Py2-only "except Exception, x" syntax
        print(x)
def __init__(self, urlList, category, htmlTag, replaceTag):
    """Set up spider helpers, normalise the HTML template, and open CSV/DB outputs."""
    QThread.__init__(self)
    self.logger = LogManager(__name__)
    self.spider = Spider()
    self.regex = Regex()
    self.utils = Utils()
    self.urlList = urlList
    self.category = category
    # Normalise the supplied HTML template: strip CRs, collapse newlines
    # and runs of whitespace, unify escaped quotes.
    self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
    self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
    # fix: raw string for the regex pattern (avoids the invalid "\s"
    # escape warning; the string bytes are unchanged)
    self.htmlTag = self.regex.replaceData(r'\s+', ' ', self.htmlTag)
    self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
    self.replaceTag = replaceTag
    self.csvWriter = Csv(category + '.csv')
    csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price',
                     'Synopsis and Characteristics', 'Picture']
    self.csvWriter.writeCsvRow(csvDataHeader)
    self.mainUrl = 'http://busca.livrariasaraiva.com.br'
    self.scrapUrl = None
    self.dbHelper = DbHelper('saraiva.db')
    self.dbHelper.createTable(category)
    self.total = self.dbHelper.getTotalProduct(category)
class CpuStatusDAO:
    """Data access for the ``cpu_status`` table."""

    __db = None

    def __init__(self):
        """Open a fresh DbHelper for this DAO instance."""
        self.__db = DbHelper()

    def getAll(self):
        """Return every CPU status row."""
        cursor = self.__db.query("SELECT * FROM cpu_status", None)
        return cursor.fetchall()

    def getAvailableCpu(self):
        """Return the cursor over CPUs currently marked idle."""
        return self.__db.query("SELECT * FROM cpu_status WHERE status = 'idle'", None)

    def updateCpuStatus(self, status, host, cpuNumber):
        """Persist a new status for one (host, cpu) pair and commit."""
        sql = """
            UPDATE cpu_status
            SET status=%s
            WHERE host=%s AND cpu=%s
            """
        self.__db.query(sql, (status, host, cpuNumber))
        self.__db.commit()
def __init__(self):
    """Open a fresh DbHelper connection wrapper for this DAO instance."""
    helper = DbHelper()
    self.__db = helper
class FundRankTask:
    """Task that pages through the fund-ranking API and stores each page.

    (Original docstring, translated: "query fund holdings task".)
    """

    def __init__(self):
        # Created lazily in start() so that constructing the task does not
        # open a database connection.
        self.mDbHelper = None

    def start(self):
        """Entry point: open the database helper used by the query methods."""
        print("start holding stocks task")
        # fix: the original bound DbHelper() to a *local* variable here and
        # later read the nonexistent class attribute FundRankTask.mDbHelper,
        # which raised AttributeError. Store it on the instance instead.
        self.mDbHelper = DbHelper()

    def queryFundRank(self, index):
        """Fetch one page of the fund ranking.

        :param index: page index as a string, substituted into fCodeUrl.
        :return: True when the page contained data, False when empty
                 (the DB helper is closed in that case).
        """
        print("query one fund")
        # fCodeUrl is a module-level template; "10086" is its page placeholder.
        reqUrl = fCodeUrl.replace("10086", index)
        print(reqUrl)
        req = urllib2.urlopen(reqUrl)
        resp = json.loads(req.read().decode("utf-8"))
        print(resp)
        data = resp["Datas"]
        if len(data):
            for item in data:
                print(item)
            self.mDbHelper.saveFundRank(data)
            return True
        else:
            self.mDbHelper.close()
            print("data error")
            return False

    def queryAllFunRank(self):
        """Walk ranking pages from 0 upward until an empty page is returned."""
        print("query all fund`s rank")
        hasNext = True
        index = 0
        while hasNext:
            print("get next data")
            hasNext = self.queryFundRank(str(index))
            index = index + 1

    def saveToDb(self, data):
        """Persist a single ranking item (not implemented yet)."""
        print("save to db")