Example 1
    def iniFromDB(self):
        # Load both in-memory tables (partitions and scan assignments) from Postgres.
        memP, memS = self._memDB.db()
        conn, cur = PgConnectionPool.getConn()
        try:
            cur.execute('SELECT * FROM t_uc_material_partition')
            for r in cur.fetchall():
                print(r)
                memP.insert(pid=r[0],
                            maxSeq=r[1],
                            minSeq=r[2],
                            tableName=r[3],
                            assigned=r[4],
                            splitNum=r[5],
                            phase=r[6])

            cur.execute('SELECT * FROM t_uc_material_partition_scan')
            for r in cur.fetchall():
                print(r)
                memS.insert(startSeq=r[0],
                            endSeq=r[1],
                            intervalSecs=int(r[2]),
                            scannerId=r[3],
                            pid=r[4])
        except Exception as e:
            print(e)
        finally:
            PgConnectionPool.release(conn, cur)
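
Every example here leans on a PgConnectionPool helper whose implementation is never shown. A minimal sketch of what it could look like, assuming psycopg2's SimpleConnectionPool underneath; only the method names and signatures are taken from the call sites above, everything else is an assumption.

# Hypothetical sketch of PgConnectionPool; not the project's actual code.
from psycopg2.pool import SimpleConnectionPool

class PgConnectionPool:
    _pool = None

    @classmethod
    def ini(cls, host, port, user, password, database, minconn=1, maxconn=10):
        # Create the shared pool once at startup (see Example 6's __main__ block).
        cls._pool = SimpleConnectionPool(minconn, maxconn,
                                         host=host, port=port, user=user,
                                         password=password, database=database)

    @classmethod
    def getConn(cls):
        # Hand out a connection plus a cursor, matching `conn, cur = ...getConn()`.
        conn = cls._pool.getconn()
        return conn, conn.cursor()

    @classmethod
    def release(cls, conn, cur):
        # Close the cursor and return the connection to the pool.
        cur.close()
        cls._pool.putconn(conn)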
Example 2
import re

async def sjob(pid, scannerId, startSeq, endSeq):
    conn, cur = PgConnectionPool.getConn()
    try:
        print("job fired...%s,%s,%d,%d" % (pid, scannerId, startSeq, endSeq))

        limit = int(ApplicationProperties.configure("application.storage.access.page.limit"))
        offset = 0
        fetched = limit
        # Page through the partition table; a short page means the last batch.
        while fetched == limit:
            cur.execute('SELECT * FROM t_uc_material__%s where status = 2 and materialId >= %d and materialId < %d limit %d offset %d'
                        % (pid, startSeq, endSeq, limit, offset))
            results = cur.fetchall()
            print(results)
            fetched = len(results)
            for r in results:
                definition = r[2]
                taskType = definition["type"]  # renamed: `type` shadows the builtin
                addr = definition["addr"]
                # Longest suffixes first and dots escaped, so "foo.com.cn" is not truncated to "foo.com".
                secondDomain = re.search(
                    r"[A-Za-z0-9\-]+\.(?:com\.cn|net\.cn|org\.cn|gov\.cn|edu\.cn|com|edu|gov|org|net|biz|tv|vip|cc|cn)",
                    addr.lower())
                # todo: types 1 and 9 should go to massCollect as {materialId, seqno, definition}
                await inq.put({"pid": pid, "materialId": r[0], "uuid": r[1],
                               "type": taskType, "url": addr,
                               "domain": secondDomain.group() if secondDomain else None,
                               "definition": definition})
            offset = offset + limit
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
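
sjob builds its SQL with % string formatting, which breaks as soon as any value comes from outside. A hedged sketch of the same page query using psycopg2's composition API; this assumes the driver is psycopg2, which the pooling style suggests but does not prove.

# Sketch only: the same query with proper placeholders, assuming psycopg2.
from psycopg2 import sql

query = sql.SQL(
    "SELECT * FROM {} WHERE status = 2 AND materialId >= %s AND materialId < %s "
    "LIMIT %s OFFSET %s"
).format(sql.Identifier("t_uc_material__" + pid))  # table names need Identifier, not %s
cur.execute(query, (startSeq, endSeq, limit, offset))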
Example 3
import json
from datetime import date, datetime

def write(task, writable):
    data = {}
    queue = writable["queue"]
    rootcontents = writable["contents"]
    childrennum = writable["childrennum"]
    print("%d articles, table list:" % (childrennum))
    # Acquire the connection before the try so `conn`/`cur` exist when `finally` runs.
    conn, cur = PgConnectionPool.getConn()
    try:
        data["tablelist"] = rootcontents
        data["contents"] = {}
        # Drain the queue of per-page content batches into one document keyed by URL.
        for _ in range(queue.qsize()):
            subcontents = queue.get_nowait()
            for con in subcontents:
                data["contents"][con["url"]] = {
                    "value": con["value"],
                    "no": con["no"],
                    "hashcode": con["hashcode"]
                }

        print("insert material[%d] data: partition:%s" %
              (task["materialId"], task["pid"]))
        print(data)
        ttime = datetime.now()
        pt = gettimepartition()
        sql = ("insert into t_uc_material_data__" + pt + "__" + task["pid"] +
               " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s)")
        sqldata = (task["materialId"], task["seqno"], date.today(),
                   json.dumps(data), ttime)
        cur.execute(sql, sqldata)
        conn.commit()
        print("insert material[%d] data: partition:%s, done!" %
              (task["materialId"], task["pid"]))
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
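
Examples 3 and 5 call gettimepartition() (Example 4 spells it get_time_partition()), but its body never appears. Judging by the t_uc_material_data__<pt>__<pid> table names it presumably returns a time-bucketed partition suffix; a minimal sketch under that assumption, with the format string being a guess.

from datetime import datetime

def gettimepartition():
    # Hypothetical: a month-granularity suffix such as "202401"; the real
    # format used by the project is unknown.
    return datetime.now().strftime("%Y%m")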
Example 4
import json
from datetime import date, datetime

def db_write(material, writable):
    conn, cur = PgConnectionPool.getConn()
    try:
        ttime = datetime.now()
        pt = get_time_partition()
        sql = ("insert into t_uc_material_data__" + pt + "__" + material["pid"] +
               " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s)")
        sqldata = (material["materialId"], material["seqno"], date.today(),
                   json.dumps(material["data"]), ttime)
        cur.execute(sql, sqldata)
        conn.commit()
        # Fixed: the original referenced task[...], but this function's argument is `material`.
        print("insert material[%d] data: partition:%s, done!" %
              (material["materialId"], material["pid"]))
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
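
Examples 3, 4, and 5 all end with the same insert into t_uc_material_data__<pt>__<pid>. A sketch of how that shared tail could be factored into one helper; the helper name is hypothetical, and it assumes the same PgConnectionPool and gettimepartition as above.

import json
from datetime import date, datetime

def insert_material_data(pid, materialId, seqno, data):
    # Hypothetical helper consolidating the insert shared by Examples 3-5.
    pt = gettimepartition()
    sql = ("insert into t_uc_material_data__" + pt + "__" + pid +
           " (materialId,seqno,insertDay,contents,insertTime)"
           " values (%s,%s,%s,%s,%s)")
    conn, cur = PgConnectionPool.getConn()
    try:
        cur.execute(sql, (materialId, seqno, date.today(),
                          json.dumps(data), datetime.now()))
        conn.commit()
    finally:
        PgConnectionPool.release(conn, cur)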
Example 5
    def write(self, p_task, p_data):
        if p_data:
            data = {}
            data["charset"] = p_task["charset"]
            data["url"] = p_task["url"]
            data["value"] = p_data
            ttime = datetime.now()
            pt = gettimepartition()
            sql = ("insert into t_uc_material_data__" + pt + "__" + p_task["pid"] +
                   " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s)")
            sqldata = (p_task["materialId"], p_task["seqno"], date.today(),
                       json.dumps(data), ttime)
            conn, cur = PgConnectionPool.getConn()
            try:
                cur.execute(sql, sqldata)
                conn.commit()
            except Exception as e:
                print(e)
            finally:
                PgConnectionPool.release(conn, cur)
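
With that hypothetical insert_material_data helper, Example 5's method reduces to building the payload and one call:

    def write(self, p_task, p_data):
        if p_data:
            data = {"charset": p_task["charset"], "url": p_task["url"], "value": p_data}
            insert_material_data(p_task["pid"], p_task["materialId"],
                                 p_task["seqno"], data)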
Example 6
                except Exception as e:
                    print("Exception: %s %s" % (e, url))
                    dead.add(url)
                finally:
                    inboundq.task_done()

        # Start workers, then hand control to the IOLoop so they can drain the queue.
        workers = gen.multi([worker() for _ in range(concurrency)])
        ioloop.IOLoop.current().start()


if __name__ == '__main__':
    ApplicationProperties.populate(
        {"config": "/Users/apple/Documents/var/workspace/var-daemon"})
    PgConnectionPool.ini(
        host=ApplicationProperties.configure(
            "application.storage.postgres.connection.host"),
        port=ApplicationProperties.configure(
            "application.storage.postgres.connection.port"),
        user=ApplicationProperties.configure(
            "application.storage.postgres.connection.user"),
        password=ApplicationProperties.configure(
            "application.storage.postgres.connection.password"),
        database=ApplicationProperties.configure(
            "application.storage.postgres.connection.database"))

    LoadCollectors().run()
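
Example 6 starts mid-function, so the queue/worker scaffolding around its except block is missing. A minimal sketch of the standard Tornado worker-pool shape it appears to follow; only inboundq, dead, worker, and concurrency appear in the fragment, everything else (including the per-URL handler) is assumed.

from tornado.queues import Queue

inboundq = Queue()
dead = set()
concurrency = 10

async def worker():
    # Each worker pulls URLs until the process is stopped; task_done() lets
    # inboundq.join() callers observe completion.
    while True:
        url = await inboundq.get()
        try:
            await process(url)  # hypothetical per-URL handler
        except Exception as e:
            print("Exception: %s %s" % (e, url))
            dead.add(url)
        finally:
            inboundq.task_done()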
Example 7
    def start(self):
        # Note: double-underscore names are mangled inside a class body, so this
        # global is actually stored under a class-mangled module name.
        global __g_scheduler
        __g_scheduler = TornadoScheduler()

        scannerId = ApplicationProperties.configure("scannerId")
        print("Ready to initialize scanner(id=%s)" % (scannerId))
        conn, cur = PgConnectionPool.getConn()
        try:
            sql = ('SELECT p.pid,p.maxSeq,p.minSeq,p.splitNum,s.intervalSecs '
                   'FROM t_uc_material_partition_scan s '
                   'inner join t_uc_material_partition p on s.pid=p.pid '
                   'where s.scannerId = %s')
            # Fixed: (scannerId) is just a parenthesized string; execute() needs a tuple.
            data = (scannerId,)
            cur.execute(sql, data)
            results = cur.fetchall()
            print(results)
            record = results[0]

            pid = record[0]
            maxSeq = record[1]
            minSeq = record[2]
            splitNum = record[3]
            intervalSecs = int(record[4])
            # Split [minSeq, maxSeq] into splitNum fragments and schedule one scan job each.
            fragment = math.ceil((maxSeq - minSeq + 1) / splitNum)
            sp = minSeq
            for i in range(splitNum):
                fromI = sp
                sp = sp + fragment
                print("add scanner, from %d to %d, interval %d seconds, partition id %s."
                      % (fromI, sp, intervalSecs, pid))
                __g_scheduler.add_job(sjob, 'interval', seconds=intervalSecs,
                                      args=(pid, scannerId, fromI, sp))
            __g_scheduler.start()
        except Exception as e:
            print(e)
        finally:
            PgConnectionPool.release(conn, cur)

        self._exchangeServerHost = ApplicationProperties.configure("application.exchange.server.host")
        self._exchangeServerPort = ApplicationProperties.configure("application.exchange.server.port")
        # Earlier iterations configured persistent jobstores (mongodb /
        # SQLAlchemyJobStore on sqlite), a 5-worker threadpool executor,
        # job_defaults {'coalesce': False, 'max_instances': 1}, and registered
        # one interval job per p_schedule entry.

        tornado.ioloop.IOLoop.current().run_sync(self.send)
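
One quirk in start(): when the sequence range does not divide evenly, the final fragment's sp overshoots maxSeq. That is harmless if the ranges are half-open, but clamping makes the intent explicit. A sketch of the same registration loop with the end clamped; the function name is hypothetical, and scheduler is assumed to be the TornadoScheduler from above.

import math

def schedule_fragments(scheduler, pid, scannerId, minSeq, maxSeq, splitNum, intervalSecs):
    # Hypothetical refactor of the loop in start(), with the last range
    # clamped to maxSeq + 1 (ranges are half-open: [fromI, sp)).
    fragment = math.ceil((maxSeq - minSeq + 1) / splitNum)
    sp = minSeq
    for _ in range(splitNum):
        fromI = sp
        sp = min(sp + fragment, maxSeq + 1)
        scheduler.add_job(sjob, 'interval', seconds=intervalSecs,
                          args=(pid, scannerId, fromI, sp))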