Exemple #1
0
	def __init__(self, generation, mgr_interval=5):
		"""Initialise caches, locks, timing counters and DAOs.

		generation   -- stored unchanged as self.gen.
		mgr_interval -- stored unchanged as self.mgr_interval (default 5).
		"""
		config = Settings.getInstance()
		self.settings = config
		self.debug = config.debug

		self.gen = generation
		self.mgr_interval = mgr_interval

		# Person side: a bounded queue, plus an id set kept in sync with the
		# queue so that membership can be tested quickly by id.
		self.person_queue = Queue.Queue(maxsize=config.person_cache_size)
		self.person_id_set = set()

		# Publication side.
		self.pubmap = {}            # {id -> pub}
		self.person_pub_map = {}    # {person_id -> [pub_id_list]}
		self.pub_db_cache = {}

		self.pub_lock = threading.Lock()
		self.pub_dbcache_lock = threading.RLock()

		# Synced with the main running flag in mgr_interval_thread.
		self.running = True
		self.blocked_pub_t = 0

		# Accumulated time sums.
		self.ppt_wait = 0
		self.ppt_getlock = 0
		self.ppt_get = 0

		self.person_dao = PersonDao()
		self.pub_dao = PublicationDao()
Exemple #2
0
	def __init__(self, extractorInstance, idList):
		"""Initialise the thread and remember the extractor and id list.

		extractorInstance -- object exposing a ``store`` attribute, which is
		                     also kept on this thread as self.store.
		idList            -- stored unchanged as self.idList.
		"""
		threading.Thread.__init__(self)

		owner = extractorInstance
		self.extractor = owner
		self.settings = Settings.getInstance()
		# Reuse the extractor's store rather than creating a new one.
		self.store = owner.store
		self.personUpdater = PersonUpdateTool()
		self.idList = idList
Exemple #3
0
	def __init__(self):
		"""Create an empty link queue and open a timestamped link-list file.

		The file is created inside ``settings.pdflink_dir`` and opened for
		writing; its name is built from the current local date and time.
		"""
		self.settings = Settings.getInstance()
		self.debug = self.settings.debug
		self.linkcache = Queue.Queue()
		self.running = True  # synced with main running flag in mgr_interval_thread

		# The hour is part of the name: without it, two runs started at the
		# same minute:second of different hours on one day would produce the
		# same name and, because of mode 'w', truncate the earlier list.
		now = datetime.datetime.now()
		filename = "pdflink_%s_%s_%s_%s_%s_%s.list" % (
			now.year, now.month, now.day, now.hour, now.minute, now.second)
		# open() is the documented way to open files; file() is Python-2-only.
		self.pdflink_file = open(
			os.path.join(self.settings.pdflink_dir, filename), 'w')
Exemple #4
0
    def __init__(self):
        """Create an empty link queue and open a timestamped link-list file.

        The file is created inside ``settings.pdflink_dir`` and opened for
        writing; its name is built from the current local date and time.
        """
        self.settings = Settings.getInstance()
        self.debug = self.settings.debug
        self.linkcache = Queue.Queue()
        self.running = True  # synced with main running flag in mgr_interval_thread

        # The hour is part of the name: without it, two runs started at the
        # same minute:second of different hours on one day would produce the
        # same name and, because of mode 'w', truncate the earlier list.
        now = datetime.datetime.now()
        filename = "pdflink_%s_%s_%s_%s_%s_%s.list" % (
            now.year, now.month, now.day, now.hour, now.minute, now.second)
        # open() is the documented way to open files; file() is Python-2-only.
        self.pdflink_file = open(
            os.path.join(self.settings.pdflink_dir, filename), 'w')
Exemple #5
0
 def __init__(self):
     """Build the pooled MySQL connection source from the settings object."""
     settings = Settings.getInstance()
     self.settings = settings

     # Keyword arguments handed to PooledDB alongside the positional ones.
     pool_kwargs = dict(
         host=settings.db_host,
         user=settings.db_user,
         passwd=settings.db_passwd,
         port=settings.db_port,
         db=settings.db_database,
         maxusage=20,
     )
     # 3 and 20 follow the DB-API module positionally
     # (presumably mincached / maxcached -- confirm against DBUtils).
     self.pool = PooledDB(MySQLdb, 3, 20, **pool_kwargs)
Exemple #6
0
	def __init__(self):
		"""Create self.pool, a PooledDB over MySQLdb configured from Settings."""
		cfg = Settings.getInstance()
		self.settings = cfg

		# Connection details come straight from the settings object.
		connection = {
			'host': cfg.db_host,
			'user': cfg.db_user,
			'passwd': cfg.db_passwd,
			'port': cfg.db_port,
			'db': cfg.db_database,
		}
		# The two positional numbers are passed through unchanged
		# (presumably mincached=3, maxcached=20 -- verify against DBUtils).
		self.pool = PooledDB(MySQLdb, 3, 20, maxusage=20, **connection)
Exemple #7
0
    def __init__(self):
        """Initialise settings, thread bookkeeping, control flags and DAOs.

        Prints a task banner first and finishes by calling
        ``determineGereration()``.
        """
        print("Task: extract paper's citation from schooler.google.com.\n")

        settings = Settings.getInstance()
        self.settings = settings
        self.debug = settings.debug

        # --- configuration -------------------------------------------------
        self.mgr_interval = 10  # seconds
        # Thread limits for person / publication extraction; both can be
        # modified on the fly (e.g. different values for day and night).
        self.max_person_thread = 2
        self.max_pub_thread = 2

        # --- threads ---------------------------------------------------------
        self.t_mgr = None       # management thread, created later
        self.t_provider = None
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Counter and its lock, used to monitor whether all threads are idle.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # --- utils -----------------------------------------------------------
        self.store = None

        # --- switches & flags ------------------------------------------------
        self.running = True             # if False, threads stop after the current task
        self.stopped = False            # whether the manager thread can stop
        self.pause = False              # all work paused
        self.waiting_to_finish = False  # no additional data; everything is queued
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of the previous interval

        self.restart_all_thread = False
        # Right after leaving pause mode there are many failed tasks, which
        # would immediately cause waiting again.
        self.detect_exit_wait = 0

        self.generation = 0

        # --- data access -----------------------------------------------------
        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # self.pdfcache only exists when PDF-link saving is enabled.
        if settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # start
        self.determineGereration()
Exemple #8
0
    def __init__(self):
        """Set up the extractor's configuration, thread lists, flags and DAOs.

        A task banner is printed before anything else; the last step is a
        call to ``determineGereration()``.
        """
        print("Task: extract paper's citation from schooler.google.com.\n")

        self.settings = Settings.getInstance()
        self.debug = self.settings.debug

        # Configuration. The two thread limits can be modified on the fly
        # (different values during day or night).
        self.mgr_interval = 10      # seconds
        self.max_person_thread = 2  # max threads used to extract persons
        self.max_pub_thread = 2

        # Thread handles; the manager thread is created later.
        self.t_mgr = None
        self.t_provider = None
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Used to monitor whether every thread is in the idle state.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # Utilities.
        self.store = None

        # Switches and flags.
        self.running = True             # False: threads stop after the current task
        self.stopped = False            # whether the manager thread can stop
        self.pause = False              # all work paused
        self.waiting_to_finish = False  # no additional data, all added to the queue
        self.num_report = 0
        # Time of the last interval.
        self.last_report_time = datetime.datetime.now()

        self.restart_all_thread = False
        # Just after exiting pause mode a large number of tasks fail, which
        # would lead to waiting again right away.
        self.detect_exit_wait = 0

        self.generation = 0

        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # The pdfcache attribute is only set when link saving is switched on.
        if self.settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # Start.
        self.determineGereration()
	def __init__(self, manager_instance, load_from_web=False):
		"""Prepare proxy file paths and load the initial proxy list.

		manager_instance -- stored as self.manager; its ``proxies`` attribute
		                    is inspected to decide whether a web load is needed.
		load_from_web    -- when true, the proxy file on disk is not loaded.
		"""
		self.settings = Settings.getInstance()
		self.proxyResource = ProxyResource()
		self.manager = manager_instance
		self.html_getter = WebPageDownloader()

		resource_dir = self.settings.resourcedir
		self.filename = os.path.join(resource_dir, "proxies.txt")
		self.filename_static = os.path.join(resource_dir, "proxies_static.txt")

		self.autosave_interval = (5, 12 * 5)  # (seconds * check times)
		self.autosave_checkcount = 0

		# Load from the file first, unless the caller asked for a web load.
		if os.path.exists(self.filename):
			if not load_from_web:
				self.loadFromFile()

		# Fewer than 10 proxies on the manager: load from the web, then save.
		known_proxies = len(self.manager.proxies)
		if known_proxies < 10:
			print("Load too less proxies from file.")
			self.loadProxyFromWeb()
			self.saveToFile()
Exemple #10
0
			for key, models in all_models.items():
				print key
				for model in models:
					print "\t", model
		else:
			print 'all_models is None'
		print '- all_models end ----------------------\n'

		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub
		print '- test done -'

if __name__ == '__main__':
	# Top pub person; this is in the local database.
	debug = DebugSuit()
	# Alternative entry point, kept for manual runs:
	#	debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4)
	debug.debug_pubs()

	# End: flush the PDF link saver when link saving is enabled.
	save_links = Settings.getInstance().save_pdflink
	if save_links:
		PDFLinkSaver.getInstance().flush()



	def __init__(self):
		"""Fetch the Settings instance and create a WebPageDownloader."""
		self.settings = Settings.getInstance()
		downloader = WebPageDownloader()
		self.html_getter = downloader
 def __init__(self):
     """Keep references to the extractor and settings instances."""
     # NOTE(review): this constructs an Extractor and then calls
     # getInstance() on it; Extractor.getInstance() may have been
     # intended -- confirm before changing.
     fresh_extractor = Extractor()
     self.extractor = fresh_extractor.getInstance()
     self.settings = Settings.getInstance()
Exemple #13
0
 def __init__(self):
     """Store the extractor and settings instances on this object."""
     # NOTE(review): getInstance() is called on a newly built Extractor;
     # verify whether Extractor.getInstance() was meant instead.
     built = Extractor()
     self.extractor = built.getInstance()
     self.settings = Settings.getInstance()
Exemple #14
0
        print '\n- all_models ----------------------'
        if all_models is not None:
            for key, models in all_models.items():
                print key
                for model in models:
                    print "\t", model
        else:
            print 'all_models is None'
        print '- all_models end ----------------------\n'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'


if __name__ == '__main__':
    # Top pub person; this is in the local database.
    debug = DebugSuit()
    # Alternative entry point, kept for manual runs:
    #   debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4)
    debug.debug_pubs()

    # End: flush the PDF link saver when link saving is enabled.
    save_links = Settings.getInstance().save_pdflink
    if save_links:
        PDFLinkSaver.getInstance().flush()