def __init__(self, generation, mgr_interval=5):
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    self.gen = generation
    self.mgr_interval = mgr_interval
    self.person_queue = Queue.Queue(maxsize=self.settings.person_cache_size)
    self.person_id_set = set()     # kept in sync with the queue; fast membership test by id
    self.pubmap = {}               # {id -> pub}
    self.person_pub_map = {}       # {person_id -> [pub_id_list]} - person to pub_ids
    self.pub_db_cache = {}
    self.pub_lock = threading.Lock()
    self.pub_dbcache_lock = threading.RLock()
    self.running = True            # synced with the main running flag by mgr_interval_thread
    self.blocked_pub_t = 0         # time sum
    self.ppt_wait = 0
    self.ppt_getlock = 0
    self.ppt_get = 0
    self.person_dao = PersonDao()
    self.pub_dao = PublicationDao()
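# A minimal sketch (not from the original source) of how person_queue and
# person_id_set could be kept consistent: the set mirrors the queue so that
# "already queued?" checks are O(1) by person id. The helper names
# addPerson/getPerson and the person.id attribute are assumptions.
def addPerson(self, person):
    if person.id in self.person_id_set:
        return False                      # already queued, skip
    self.person_id_set.add(person.id)
    self.person_queue.put(person)         # blocks when the cache is full
    return True

def getPerson(self, timeout=None):
    person = self.person_queue.get(timeout=timeout)
    self.person_id_set.discard(person.id)
    return person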
def __init__(self, extractorInstance, idList):
    threading.Thread.__init__(self)
    self.extractor = extractorInstance
    self.settings = Settings.getInstance()
    self.store = self.extractor.store
    self.personUpdater = PersonUpdateTool()
    self.idList = idList
def __init__(self):
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    self.linkcache = Queue.Queue()
    self.running = True    # synced with the main running flag by mgr_interval_thread
    # timestamped output file name: year, month, day, hour, minute, second
    now = datetime.datetime.now()
    filepath = "pdflink_%s_%s_%s_%s_%s_%s.list" % (
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    self.pdflink_file = open(
        os.path.join(self.settings.pdflink_dir, filepath), 'w')
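# A hedged sketch of the flush path assumed by the debug script at the end of
# this file (PDFLinkSaver.getInstance().flush()): drain the in-memory
# linkcache and write one link per line to the open pdflink file. The
# savePdfLink helper is hypothetical; only flush() is referenced elsewhere.
def savePdfLink(self, link):
    self.linkcache.put(link)

def flush(self):
    while True:
        try:
            link = self.linkcache.get_nowait()
        except Queue.Empty:
            break
        self.pdflink_file.write(link + '\n')
    self.pdflink_file.flush()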
def __init__(self):
    self.settings = Settings.getInstance()
    # DBUtils connection pool over MySQLdb: keep 3 idle connections,
    # cache at most 20, and recycle each connection after 20 uses.
    self.pool = PooledDB(MySQLdb, 3, 20,
                         host=self.settings.db_host,
                         user=self.settings.db_user,
                         passwd=self.settings.db_passwd,
                         port=self.settings.db_port,
                         db=self.settings.db_database,
                         maxusage=20)
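# A minimal sketch, assuming DBUtils' standard PooledDB API, of how a query
# might be run against this pool; executeQuery is a hypothetical helper name,
# not necessarily what the dbs class actually exposes.
def executeQuery(self, sql, args=None):
    conn = self.pool.connection()     # borrow a pooled MySQLdb connection
    try:
        cursor = conn.cursor()
        cursor.execute(sql, args)
        rows = cursor.fetchall()
        cursor.close()
        return rows
    finally:
        conn.close()                  # returns the connection to the pool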
def __init__(self):
    print "Task: extract paper citations from scholar.google.com.\n"
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug

    # Configs
    self.mgr_interval = 10        # seconds
    self.max_person_thread = 2    # max threads used to extract persons
    self.max_pub_thread = 2       # both values can be changed on the fly (e.g. day vs. night)

    # Threads and thread pools
    self.t_mgr = None             # management thread (MgrThread), created later
    self.t_provider = None
    self.person_thread_pool = []  # was: Queue.Queue(maxsize=self.max_person_thread)
    self.pub_thread_pool = []     # was: Queue.Queue(maxsize=self.max_pub_thread)
    self.busy_semaphore = 0       # used to track whether all worker threads are idle
    self.busy_semaphore_lock = threading.Lock()

    # utils
    self.store = None

    # switches & flags
    self.running = True               # if False, threads stop after their current task
    self.stopped = False              # whether MgrThread may stop
    self.pause = False                # all work paused
    self.waiting_to_finish = False    # no additional data; everything already queued
    self.num_report = 0
    self.last_report_time = datetime.datetime.now()   # time of the last report interval
    self.restart_all_thread = False
    self.detect_exit_wait = 0   # right after leaving pause mode many tasks fail at once,
                                # which would otherwise trigger another wait immediately
    self.generation = 0

    self.dao = dbs()
    self.personDao = PersonDao()
    self.pubDao = PublicationDao()
    if self.settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # start
    self.determineGereration()
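# The config comments above say max_person_thread / max_pub_thread can be
# changed on the fly, with different limits by day and by night. A hedged
# sketch of what such an adjustment could look like; the method name and the
# 8:00-22:00 daytime window are assumptions, not taken from the source.
def adjustThreadLimits(self):
    hour = datetime.datetime.now().hour
    if 8 <= hour < 22:
        # daytime: crawl gently
        self.max_person_thread = 2
        self.max_pub_thread = 2
    else:
        # night: allow more worker threads
        self.max_person_thread = 4
        self.max_pub_thread = 4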
def __init__(self, manager_instance, load_from_web=False):
    self.settings = Settings.getInstance()
    self.proxyResource = ProxyResource()
    self.manager = manager_instance
    self.html_getter = WebPageDownloader()
    self.filename = os.path.join(self.settings.resourcedir, "proxies.txt")
    self.filename_static = os.path.join(self.settings.resourcedir, "proxies_static.txt")
    self.autosave_interval = (5, 12 * 5)   # (seconds * check times)
    self.autosave_checkcount = 0
    # load proxies from file first; fall back to the web if too few were found
    if os.path.exists(self.filename) and not load_from_web:
        self.loadFromFile()
    if len(self.manager.proxies) < 10:
        print "Loaded too few proxies from file."
        self.loadProxyFromWeb()
        self.saveToFile()
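# loadFromFile / saveToFile are called above but not shown in this fragment.
# A minimal sketch under the assumption that proxies.txt holds one "host:port"
# entry per line and that manager.proxies is a plain list; the real file
# format and proxy structure may differ.
def loadFromFile(self):
    f = open(self.filename, 'r')
    try:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                self.manager.proxies.append(line)
    finally:
        f.close()

def saveToFile(self):
    f = open(self.filename, 'w')
    try:
        for proxy in self.manager.proxies:
            f.write(proxy + '\n')
    finally:
        f.close()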
def __init__(self):
    self.settings = Settings.getInstance()
    self.html_getter = WebPageDownloader()
def __init__(self):
    self.extractor = Extractor.getInstance()
    self.settings = Settings.getInstance()
print '\n- all_models ----------------------'
if all_models is not None:
    for key, models in all_models.items():
        print key
        for model in models:
            print "\t", model
else:
    print 'all_models is None'
print '- all_models end ----------------------\n'

(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
    used_pubs, all_models)
for pub in pubs_found:
    print 'pubs found', pub
print '-' * 100
for pub in pubs_notfound:
    print 'not found', pub
print '- test done -'


if __name__ == '__main__':
    '''
    top pub person, this is in local database.
    '''
    debug = DebugSuit()
    # debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4)
    debug.debug_pubs()

    # end
    if Settings.getInstance().save_pdflink:
        PDFLinkSaver.getInstance().flush()