Example No. 1
import gc  # needed for the explicit collection in initScrapper() below

# LogManager, Spider, Regex, Utils, Csv and DbHelper are project-local
# helpers that these examples assume are importable.
class OmvicScrapper:
    isFinished = False
    im_data = []  # note: class-level list, shared across all instances

    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.initScrapper()


    def initScrapper(self):
        try:
            dupCsvReader = Csv()
            dupCsvRows = dupCsvReader.readCsvRow('omvic.csv')
            self.dbHelper = DbHelper('omvic.db')
            self.dbHelper.createTable('omvic')
            self.totaldata = self.dbHelper.getTotalProduct('omvic')
            self.csvWriter = Csv('omvic.csv')
            csvDataHeader = ['URL', 'Legal Name', 'Business Name', 'Status', 'Class of Registration', 'Subclass',
                             'Operating Status',
                             'Business Address', 'Email', 'Phone Number', 'Salesperson(s) Names']
            if len(dupCsvRows) == 0:
                self.csvWriter.writeCsvRow(csvDataHeader)
            del dupCsvReader
            del dupCsvRows
            gc.collect()
            del gc.garbage[:]
            gc.collect()
        except Exception as x:
            print(x)
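
Since __init__ calls initScrapper() directly, constructing the class is enough to prepare the CSV file and the database table; a minimal usage sketch:

# Sets up omvic.csv (writing the header row only when the file is empty)
# and creates the 'omvic' table in omvic.db.
scrapper = OmvicScrapper()
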
# QThread and pyqtSignal come from PyQt's QtCore module.
class AmazonScrapper(QThread):
    notifyAmazon = pyqtSignal(object)

    def __init__(self, urlList, category):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
        if csvDataHeader not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://www.amazon.com'
        self.scrapUrl = None
        self.dbHelper = DbHelper('amazon.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            # self.scrapProductDetail(
            #     'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544')
            # return
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
                        imUrl = None
                        retry = 0
                        while imUrl is None and retry < 4:
                            imUrl = self.reformatUrl(url)
                            retry += 1
                        if imUrl is None:
                            imUrl = url
                        self.total = 0
                        print('URL: ' + str(imUrl))
                        sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority',
                                    'date-desc-rank']
                        for sort in sortList:
                            self.scrapReformatData(imUrl, sort)
                        self.notifyAmazon.emit(
                            '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url)
            self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
        except Exception as x:
            print(x)
            self.logger.error('Exception at run: %s' % x)
            if retry < 5:
                self.run(retry + 1)
class SaraivaScrapper(QThread):
    notifySaraiva = pyqtSignal(object)

    def __init__(self, urlList, category, htmlTag, replaceTag):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        self.htmlTag = self.regex.replaceData(r'\r+', '', htmlTag)
        self.htmlTag = self.regex.replaceData(r'\n+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'\s+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'"+', '"', self.htmlTag)
        self.replaceTag = replaceTag
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture']
        self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://busca.livrariasaraiva.com.br'
        self.scrapUrl = None
        self.dbHelper = DbHelper('saraiva.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
                        paginationUrl, self.maxRecords = self.reformatUrl(url)
                        self.notifySaraiva.emit(
                            '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
                        print('Max records: %s' % self.maxRecords)
                        print('URL: ' + str(paginationUrl))
                        sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev',
                                    '&isort=price+rev',
                                    '&isort=price', '&isort=date+rev']
                        for sort in sortList:
                            self.scrapResults(paginationUrl, sort)
            self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
        except Exception as x:
            print(x)
            self.logger.error('Exception at run: %s' % x)
            if retry < 5:
                self.run(retry + 1)
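
The QThread-based scrapers above are meant to be launched from a Qt event loop rather than called directly; a minimal usage sketch, assuming PyQt4 and a hypothetical one-entry URL list:

import sys
from PyQt4.QtGui import QApplication

# Hypothetical input; the real URL list would come from the tool's GUI.
urls = ['http://www.amazon.com/s/?field-keywords=watches']

app = QApplication(sys.argv)
scrapper = AmazonScrapper(urls, 'watches')

# notifyAmazon carries the HTML status strings emitted inside run();
# connect it to any slot that accepts a single object.
scrapper.notifyAmazon.connect(lambda msg: sys.stdout.write(msg + '\n'))

# start() executes run() on a worker thread; calling run() directly
# would block the calling thread instead.
scrapper.start()
sys.exit(app.exec_())
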
Example No. 4
class HostDAO:
    __db = None

    def __init__(self):
        self.__db = DbHelper()

    def getAll(self):
        return self.__db.query("SELECT * FROM host", None).fetchall()
class DiscoveryDAO:
    __db = None

    def __init__(self):
        self.__db = DbHelper()

    def getAll(self):
        return self.__db.query("SELECT * FROM container_discovery",
                               None).fetchall()

    def getAvailableContainer(self, workload, host):
        return self.__db.query(
            """
          SELECT * FROM container_discovery 
          WHERE status = %s AND workload = %s AND host = %s
          """, ("stopped", workload, host))

    def updateContainerStatus(self, status, containerId):
        self.__db.query(
            """
         UPDATE container_discovery 
         SET status=%s
         WHERE id=%s
         """, (status, containerId))

        self.__db.commit()
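
None of the examples define DbHelper itself. From the calls above (a query(sql, params) method that returns a cursor, plus commit()), a minimal sketch of a compatible helper can be inferred; this version assumes a MySQL driver such as pymysql, since the %s placeholders are MySQL paramstyle (sqlite3 would use ? instead):

import pymysql

class DbHelper:
    # Sketch only: the connection settings here are hypothetical, and the
    # real helper's pooling and error handling are not shown in the examples.
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='monitoring')

    def query(self, sql, params):
        # Return the cursor so callers can chain .fetchall(), as in
        # HostDAO.getAll() above.
        cur = self.conn.cursor()
        cur.execute(sql, params)
        return cur

    def commit(self):
        self.conn.commit()
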
Example No. 9
class CpuStatusDAO:
    __db = None

    def __init__(self):
        self.__db = DbHelper()

    def getAll(self):
        return self.__db.query("SELECT * FROM cpu_status", None).fetchall()

    def getAvailableCpu(self):
        return self.__db.query(
            "SELECT * FROM cpu_status WHERE status = 'idle'", None)

    def updateCpuStatus(self, status, host, cpuNumber):
        self.__db.query(
            """
         UPDATE cpu_status 
         SET status=%s
         WHERE host=%s AND cpu=%s
         """, (status, host, cpuNumber))

        self.__db.commit()
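
Typical usage of the DAO, assuming the DbHelper sketched earlier and a (host, cpu, status) column order (an assumption; the real schema is not shown):

dao = CpuStatusDAO()

# Claim the first idle CPU, if any, and mark it busy.
row = dao.getAvailableCpu().fetchone()
if row is not None:
    host, cpuNumber = row[0], row[1]
    dao.updateCpuStatus('busy', host, cpuNumber)
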
Example No. 11
import json
import urllib2  # Python 2 module; use urllib.request on Python 3

class FundRankTask:
    """Task that queries fund rankings and holdings."""

    mDbHelper = DbHelper()  # shared, class-level database helper

    def __init__(self):
        pass

    def start(self):
        print("start holding stocks task")

    # Query one page of the fund ranking.
    def queryFundRank(self, index):
        print("query one fund")
        # fCodeUrl is a module-level URL template defined elsewhere;
        # "10086" is the placeholder token it replaces with the page index.
        reqUrl = fCodeUrl.replace("10086", index)
        print(reqUrl)
        req = urllib2.urlopen(reqUrl)
        resp = json.loads(req.read().decode("utf-8"))
        print(resp)
        data = resp["Datas"]
        if len(data):
            for item in data:
                # Each item carries dozens of response fields (FCODE,
                # SHORTNAME, FUNDTYPE, FSRQ, DWJZ, LJJZ, the SYL_* return
                # rates, ENDNAV, SALEVOLUME, ...) that saveToDb could persist.
                # self.saveToDb(item)
                print(item)
                # FundRankTask.mDbHelper.saveFundRank(data)
            return True
        else:
            # An empty Datas list signals that the last page was reached.
            FundRankTask.mDbHelper.close()
            print("no more data")
            return False

    def queryAllFunRank(self):
        print("query all funds' rank")
        hasNext = True
        index = 0
        while hasNext:
            print("get next data")
            hasNext = self.queryFundRank(str(index))
            index = index + 1

    def saveToDb(self, data):
        print("save to db")
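
A minimal driver for the task, assuming fCodeUrl is defined at module level:

if __name__ == '__main__':
    task = FundRankTask()
    task.start()
    # Walks page indexes 0, 1, 2, ... until a page comes back empty.
    task.queryAllFunRank()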