def iniFromDB(self):
    memP, memS = self._memDB.db()
    conn, cur = PgConnectionPool.getConn()
    try:
        cur.execute('SELECT * FROM t_uc_material_partition')
        results = cur.fetchall()
        for r in results:
            print(r)
            memP.insert(pid=r[0], maxSeq=r[1], minSeq=r[2], tableName=r[3],
                        assigned=r[4], splitNum=r[5], phase=r[6])
        cur.execute('SELECT * FROM t_uc_material_partition_scan')
        results = cur.fetchall()
        for r in results:
            print(r)
            memS.insert(startSeq=r[0], endSeq=r[1], intervalSecs=int(r[2]),
                        scannerId=r[3], pid=r[4])
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
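# _memDB.db() and the memP/memS table objects are not part of this excerpt.
# A minimal sketch of the assumed in-memory table interface that iniFromDB
# fills (keyword-argument insert, rows kept as dicts); the class name and
# storage layout are assumptions, not the original implementation.
class _MemTable:
    def __init__(self):
        self._rows = []

    def insert(self, **row):
        # Store one row keyed by column name, e.g. insert(pid=..., maxSeq=...).
        self._rows.append(dict(row))

    def rows(self):
        return list(self._rows)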
async def sjob(pid, scannerId, startSeq, endSeq):
    conn, cur = PgConnectionPool.getConn()
    try:
        print("job fired...%s,%s,%d,%d" % (pid, scannerId, startSeq, endSeq))
        limit = int(ApplicationProperties.configure("application.storage.access.page.limit"))
        offset = 0
        # Page through the partition table; stop once a page comes back short.
        flag = limit
        while flag == limit:
            cur.execute('SELECT * FROM t_uc_material__%s where status = 2 and materialId >= %d and materialId < %d limit %d offset %d'
                        % (pid, startSeq, endSeq, limit, offset))
            results = cur.fetchall()
            print(results)
            flag = len(results)
            for r in results:
                #print(r[2])
                #definition = json.loads(r[2])
                #print("task definition:%s"%(definition))
                type = r[2]["type"]
                addr = r[2]["addr"]
                secondDomain = re.search(
                    r"[A-Za-z0-9\-]+\.com|[A-Za-z0-9\-]+\.edu.cn|[A-Za-z0-9\-]+\.cn|[A-Za-z0-9\-]+\.com.cn|[A-Za-z0-9\-]+\.org|[A-Za-z0-9\-]+\.net|[A-Za-z0-9\-]+\.tv|[A-Za-z0-9\-]+\.vip|[A-Za-z0-9\-]+\.cc|[A-Za-z0-9\-]+\.gov.cn|[A-Za-z0-9\-]+\.gov|[A-Za-z0-9\-]+\.edu|[A-Za-z0-9\-]+\.biz|[A-Za-z0-9\-]+\.net.cn|[A-Za-z0-9\-]+\.org.cn",
                    addr.lower())
                #if type == 1 or type == 9:
                    #todo: send to massCollect {materialId, seqno, definition}
                    #None
                #else:
                await inq.put({"pid": pid, "materialId": r[0], "uuid": r[1], "type": type,
                               "url": addr, "domain": secondDomain.group(), "definition": r[2]})
            offset = offset + limit
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
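# A minimal sketch (not part of the original module) of the second-level-domain
# extraction that sjob does inline above, pulled into a named helper so the
# pattern can be tested on its own. The helper name, the suffix ordering
# (multi-part suffixes such as .com.cn tried before .com), and the None
# fallback when nothing matches are assumptions, not the original design.
import re

_SECOND_DOMAIN_RE = re.compile(
    r"[A-Za-z0-9\-]+\.(?:com\.cn|net\.cn|org\.cn|gov\.cn|edu\.cn|"
    r"com|edu|cn|org|net|tv|vip|cc|gov|biz)")

def extract_second_domain(addr):
    # e.g. extract_second_domain("http://news.example.com.cn/a.html") -> "example.com.cn"
    m = _SECOND_DOMAIN_RE.search(addr.lower())
    return m.group() if m else None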
def write(task, writable):
    #print("I can write now...")
    #print(writable)
    data = {}
    queue = writable["queue"]
    rootcontents = writable["contents"]
    childrennum = writable["childrennum"]
    print("%d article, table list:" % (childrennum))
    #for list in rootcontents:
    #    print(list)
    #print("articles: ")
    conn, cur = PgConnectionPool.getConn()
    try:
        data["tablelist"] = rootcontents
        data["contents"] = {}
        for idx in range(queue.qsize()):
            subcontents = queue.get_nowait()
            for con in subcontents:
                url = con["url"]
                value = con["value"]
                no = con["no"]
                hashcode = con["hashcode"]
                data["contents"][url] = {
                    "value": value,
                    "no": no,
                    "hashcode": hashcode
                }
        print("insert material[%d] data: partition:%s" % (task["materialId"], task["pid"]))
        print(data)
        ttime = datetime.now()
        pt = gettimepartition()
        sql = ("insert into t_uc_material_data__" + pt + "__" + task["pid"]
               + " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s) ")
        sqldata = (task["materialId"], task["seqno"], date.today(), json.dumps(data), ttime)
        cur.execute(sql, sqldata)
        conn.commit()
        print("insert material[%d] data: partition:%s, done!" % (task["materialId"], task["pid"]))
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
'''
def db_write(material, writable):
    conn, cur = PgConnectionPool.getConn()
    try:
        ttime = datetime.now()
        pt = get_time_partition()
        sql = ("insert into t_uc_material_data__" + pt + "__" + material["pid"]
               + " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s) ")
        sqldata = (material["materialId"], material["seqno"], date.today(),
                   json.dumps(material["data"]), ttime)
        cur.execute(sql, sqldata)
        conn.commit()
        print("insert material[%d] data: partition:%s, done!" % (material["materialId"], material["pid"]))
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)
'''
def write(self, p_task, p_data):
    if p_data:
        data = {}
        data["charset"] = p_task["charset"]
        data["url"] = p_task["url"]
        data["value"] = p_data
        ttime = datetime.now()
        pt = gettimepartition()
        sql = ("insert into t_uc_material_data__" + pt + "__" + p_task["pid"]
               + " (materialId,seqno,insertDay,contents,insertTime) values (%s,%s,%s,%s,%s) ")
        sqldata = (p_task["materialId"], p_task["seqno"], date.today(), json.dumps(data), ttime)
        conn, cur = PgConnectionPool.getConn()
        try:
            cur.execute(sql, sqldata)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            PgConnectionPool.release(conn, cur)
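# The writers above rely on a gettimepartition()/get_time_partition() helper
# that is not shown in this excerpt. A minimal sketch of what it is assumed to
# do: return a date-based suffix so inserts land in a per-period table such as
# t_uc_material_data__202401__<pid>. The monthly granularity is an assumption.
from datetime import datetime

def gettimepartition():
    # Assumed behaviour: one partition per calendar month, e.g. "202401".
    return datetime.now().strftime("%Y%m")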
        except Exception as e:
            print("Exception: %s %s" % (e, url))
            dead.add(url)
        finally:
            inboundq.task_done()
            #await q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    workers = gen.multi([worker() for _ in range(concurrency)])
    ioloop.IOLoop.current().start()
    #io_loop = ioloop.IOLoop.current()
    #io_loop.run_sync(main)


if __name__ == '__main__':
    ApplicationProperties.populate(
        {"config": "/Users/apple/Documents/var/workspace/var-daemon"})
    PgConnectionPool.ini(
        host=ApplicationProperties.configure(
            "application.storage.postgres.connection.host"),
        port=ApplicationProperties.configure(
            "application.storage.postgres.connection.port"),
        user=ApplicationProperties.configure(
            "application.storage.postgres.connection.user"),
        password=ApplicationProperties.configure(
            "application.storage.postgres.connection.password"),
        database=ApplicationProperties.configure(
            "application.storage.postgres.connection.database"))
    LoadCollectors().run()
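# PgConnectionPool is used throughout (ini/getConn/release) but its source is
# not part of this excerpt. A minimal sketch of the assumed facade, built on
# psycopg2's SimpleConnectionPool; pool size and error handling are assumptions.
from psycopg2 import pool as _pgpool

class PgConnectionPool:
    _pool = None

    @classmethod
    def ini(cls, host, port, user, password, database):
        # Create a small shared pool once at process start-up.
        cls._pool = _pgpool.SimpleConnectionPool(
            1, 10, host=host, port=port, user=user,
            password=password, database=database)

    @classmethod
    def getConn(cls):
        # Hand out a connection plus a cursor, matching the
        # "conn, cur = PgConnectionPool.getConn()" call sites above.
        conn = cls._pool.getconn()
        return conn, conn.cursor()

    @classmethod
    def release(cls, conn, cur):
        # Close the cursor and return the connection to the pool.
        cur.close()
        cls._pool.putconn(conn)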
def start(self):
    global __g_scheduler
    __g_scheduler = TornadoScheduler()
    #ApplicationProperties.populate(p_command=self._command)
    scannerId = ApplicationProperties.configure("scannerId")
    print("Ready to initialize scanner(id=%s)" % (scannerId))
    conn, cur = PgConnectionPool.getConn()
    try:
        sql = ('SELECT p.pid,p.maxSeq,p.minSeq,p.splitNum,s.intervalSecs '
               'FROM t_uc_material_partition_scan s '
               'inner join t_uc_material_partition p on s.pid=p.pid '
               'where s.scannerId = %s')
        data = (scannerId,)
        cur.execute(sql, data)
        results = cur.fetchall()
        print(results)
        record = results[0]
        splitNum = record[3]
        maxSeq = record[1]
        minSeq = record[2]
        intervalSecs = int(record[4])
        pid = record[0]
        # Split the [minSeq, maxSeq] range into splitNum fragments and
        # schedule one interval scan job per fragment.
        fragment = math.ceil((maxSeq - minSeq + 1) / splitNum)
        sp = minSeq
        for i in range(splitNum):
            fromI = sp
            sp = sp + fragment
            print("add scanner, from %d to %d, interval %d seconds, partition id %s."
                  % (fromI, sp, intervalSecs, pid))
            #memS.insert(startSeq=fromI, endSeq=sp,
            #            intervalSecs=__g_properties.getProperty("application.scanner.schedule.intervalSeconds"),
            #            scannerId=scannerId, pid=unassiP["pid"])
            __g_scheduler.add_job(sjob, 'interval', seconds=intervalSecs,
                                  args=(pid, scannerId, fromI, sp))
        __g_scheduler.start()
    #except Exception as e:
    #    print(e)
    finally:
        PgConnectionPool.release(conn, cur)
    self._exchangeServerHost = ApplicationProperties.configure("application.exchange.server.host")
    self._exchangeServerPort = ApplicationProperties.configure("application.exchange.server.port")
    '''
    jobstores = {
        'mongo': {'type': 'mongodb'},
        'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
    }
    executors = {
        'default': {'type': 'threadpool', 'max_workers': 5}
    }
    job_defaults = {
        'coalesce': False,
        'max_instances': 1
    }
    '''
    '''
    for idx, sche in enumerate(p_schedule):
        sid = sche["scannerId"] + "#" + str(idx)
        _scheduler.add_job(sjob, 'interval',
                           seconds=_properties.getProperty("application.scanner.schedule.intervalSeconds"),
                           args=(sche["pid"], sche["tableName"], sid, sche["startSeq"], sche["endSeq"]))
    '''
    #if q:
    #    q.put_nowait(self)
    #newLoop = tornado.ioloop.IOLoop.current()
    #print(newLoop)
    tornado.ioloop.IOLoop.current().run_sync(self.send)