def test_debug_not_found(self):
		'''Debug Errors'''
		# The banner echoes this method's own docstring. It used to read
		# test_extractFromPage.__doc__, which mislabelled the output and
		# required that sibling method to exist.
		print('-TEST-: %s' % self.test_debug_not_found.__doc__.strip())

		# Titles pushed through the query -> fetch -> match pipeline.
		# A one-element list ('Formalizzazione e Ottimizzazione di Transazioni
		# di modifica in CLP(AD)') used to be built first and then immediately
		# overwritten; that dead code is no longer executed.
		# 'Approximative Public-Key-Kryptosysteme' appears twice, as before.
		titles = (
			'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
			"Chairman's Message",
			'Introduction to Modern Information Retrieval',
			'Publications',
			'Die RISC-CISC Debatte',
			'Kryptologie',
			'Approximative Public-Key-Kryptosysteme',
			'Integritat in IT-Systemen',
			'Vollstandige Reduktionssysteme',
			'Approximative Public-Key-Kryptosysteme',
		)
		pub_candidates = [
			Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
			for title in titles
		]

		matcher = PubMatcher.getInstance()
		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print('%s pub, query: %s' % (len(used_pubs), query))
		all_models = extractor.getNodesByPubs(used_pubs)
		# Reuse the matcher fetched above instead of calling getInstance() again.
		(pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print('pubs found %s' % (pub,))
		print('-' * 100)
		for pub in pubs_notfound:
			print('not found %s' % (pub,))
		print('- test done -')
Example #2
0
    def test_matchPub(self):
        """Match one person's stored publications against saved result pages.

        Reads the pages ``jie tang_page_<n>.html`` for n = 0..2 from
        ``self.settings.source_dir``, extracts models from each page and merges
        them into one map. It then loads the person's publications through
        ``PublicationDao`` and passes both to ``self.matchPub``.

        Returns:
            The second element of the pair returned by ``self.matchPub``
            (the publications that were not matched).
        """
        self.extractor = Extractor().getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'

        # Read sources from files
        all_models = {}
        for page in range(0, 3):
            filename = "".join((person_name, '_page_', str(page), '.html'))
            # The with-block closes the handle even when read() fails; the
            # previous code leaked one open file per page.
            with open(os.path.join(self.settings.source_dir, filename), 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                self.extractor._Extractor__merge_into_extractedmap(
                    all_models, models)
        print('Total found DEBUG  %s items.' % len(all_models))

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id,
                                             self.settings.generation)

        # Flip to True to dump both inputs before matching.
        printout = False
        if printout:
            for key, models in all_models.items():
                print('%s  -->  %s' % (key, models))
            print('===================')
            for pub in pubs:
                print(pub)

        (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
        print('- test done - %s %s' % (len(pubs_matched), len(pubs_not_matched)))
        return pubs_not_matched
Example #3
0
    def integrate_test_pubs(self, pub_candidates):
        """For Debug Errors"""
        # The docstring above is echoed in the banner, so its text is output.
        # pub_candidates: publications handed to Extractor.pinMaxQuery.
        banner = self.integrate_test_pubs.__doc__.strip()
        print("- INTEGRATE TEST -: %s" % banner)

        pub_extractor = Extractor.getInstance()
        pub_matcher = PubMatcher.getInstance()

        # Show the combined query and the download URL built from it.
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print("Test %s pub, query: \n\t%s" % (len(used_pubs), query))
        url_template = self.settings.urltemplate_by_pubs
        url = url_template % URLCleaner.encodeUrlForDownload(query)
        print("\t%s" % (url,))

        # Fetch the models for the selected publications and match them.
        fetched_models = pub_extractor.getNodesByPubs(used_pubs)
        match_result = pub_matcher.matchPub(used_pubs, fetched_models, debug_output=True)
        (pubs_found, pubs_notfound) = match_result

        # Report: found publications with their citation count, then the misses.
        rule = "-" * 100
        print(rule)
        for found in pubs_found:
            print("[%s] %s" % (found.ncitation, found))
        print(rule)
        for missing in pubs_notfound:
            print("[%s] %s" % ("-", missing))
        print(rule)
        print("- test done -")
Example #4
0
    def integrate_test_pubs(self, pub_candidates):
        '''For Debug Errors'''
        # The docstring is printed as part of the banner; keep its text as is.
        # pub_candidates is passed straight to Extractor.pinMaxQuery.
        own_doc = self.integrate_test_pubs.__doc__.strip()
        print('- INTEGRATE TEST -: %s' % own_doc)

        the_extractor = Extractor.getInstance()
        the_matcher = PubMatcher.getInstance()

        # print queries
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print('Test %s pub, query: \n\t%s' % (len(used_pubs), query))
        template = self.settings.urltemplate_by_pubs
        url = template % URLCleaner.encodeUrlForDownload(query)
        print("\t%s" % (url,))

        # do
        models = the_extractor.getNodesByPubs(used_pubs)
        (pubs_found, pubs_notfound) = the_matcher.matchPub(
            used_pubs, models, debug_output=True)

        # print out: each section is closed by a 100-dash line
        divider = '-' * 100
        print(divider)
        for hit in pubs_found:
            print('[%s] %s' % (hit.ncitation, hit))
        print(divider)
        for miss in pubs_notfound:
            print('[%s] %s' % ('-', miss))
        print(divider)
        print('- test done -')
 def test_getNodesByPersonName(self):
     '''Test method getNodesByPersonName.'''
     # Print this method's own docstring; the banner used to echo
     # test_extractFromPage's docstring by mistake.
     print('-TEST-: %s' % self.test_getNodesByPersonName.__doc__.strip())
     e = Extractor()
     models = e.getNodesByPersonName('jie tang')
     # Other callers of getNodesByPersonName check its result for None,
     # so do not iterate it unguarded.
     if models is None:
         print('no models returned')
     else:
         for model in models:
             print(model)
     print('-END TEST-')
	def test_getNodesByPersonName(self):
		'''Test method getNodesByPersonName.'''
		# Print this method's own docstring; the banner used to echo
		# test_extractFromPage's docstring by mistake.
		print('-TEST-: %s' % self.test_getNodesByPersonName.__doc__.strip())
		e = Extractor()
		models = e.getNodesByPersonName('jie tang')
		# Other callers of getNodesByPersonName check its result for None,
		# so do not iterate it unguarded.
		if models is None:
			print('no models returned')
		else:
			for model in models:
				print(model)
		print('-END TEST-')
    def test_debug_not_found(self):
        '''Debug Errors'''
        # The banner echoes this method's own docstring. It used to read
        # test_extractFromPage.__doc__, which mislabelled the output and
        # required that sibling method to exist.
        print('-TEST-: %s' % self.test_debug_not_found.__doc__.strip())

        # Titles pushed through the query -> fetch -> match pipeline.
        # A one-element list ('Formalizzazione e Ottimizzazione di Transazioni
        # di modifica in CLP(AD)') used to be built first and then immediately
        # overwritten; that dead code is no longer executed.
        # 'Approximative Public-Key-Kryptosysteme' appears twice, as before.
        titles = (
            'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
            "Chairman's Message",
            'Introduction to Modern Information Retrieval',
            'Publications',
            'Die RISC-CISC Debatte',
            'Kryptologie',
            'Approximative Public-Key-Kryptosysteme',
            'Integritat in IT-Systemen',
            'Vollstandige Reduktionssysteme',
            'Approximative Public-Key-Kryptosysteme',
        )
        pub_candidates = [
            Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
            for title in titles
        ]

        matcher = PubMatcher.getInstance()
        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print('%s pub, query: %s' % (len(used_pubs), query))
        all_models = extractor.getNodesByPubs(used_pubs)
        # Reuse the matcher fetched above instead of calling getInstance() again.
        (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
        for pub in pubs_found:
            print('pubs found %s' % (pub,))
        print('-' * 100)
        for pub in pubs_notfound:
            print('not found %s' % (pub,))
        print('- test done -')
	def test_fetchByPubs(self, pubs):
		'''Fetch models for publications that a by-person search did not find.

		pubs: publications to look up; each one is paired with the fixed
		author name 'jie tang' before being passed to getNodesByPubs.
		'''
		# Bug fix: interpolate with '%'. The old code used ',' and printed the
		# raw '%s' placeholder followed by the count.
		print('-- test fetchByPubs %s pubs' % len(pubs))
		new_pubs = [(pub, 'jie tang') for pub in pubs]

		extractor = Extractor()
		extractor.getNodesByPubs(new_pubs)
		print('- test done -')
Example #9
0
    def test_fetchByPubs(self, pubs):
        '''Fetch models for publications that a by-person search did not find.

        pubs: publications to look up; each one is paired with the fixed
        author name 'jie tang' before being passed to getNodesByPubs.
        '''
        # Bug fix: interpolate with '%'. The old code used ',' and printed the
        # raw '%s' placeholder followed by the count.
        print('-- test fetchByPubs %s pubs' % len(pubs))
        new_pubs = [(pub, 'jie tang') for pub in pubs]

        extractor = Extractor()
        extractor.getNodesByPubs(new_pubs)
        print('- test done -')
Example #10
0
	def debug_pubs(self):
		'''Debug get by pub'''
		# The docstring above is echoed in the banner, so it stays one short line.
		# Flow: build sample publications, derive the query, obtain the models
		# (web fetch or a saved page), dump them, then print the match result.
		print('-TEST-: %s' % self.debug_pubs.__doc__.strip())
		#----------------------------------------------------
		pub_candidates = []

		# Alternative sample groups; uncomment the ones to debug.
		# group 1
		# pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
		# pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
		# pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
		# pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))

		# group 2
		# pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
		# pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))

		# group 3
		# pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

		# group 4
		pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5))

		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print('%s pub, query: %s' % (len(used_pubs), query))

		#
		# Get WEB PAGE
		#
		use_web = True # ***************
		if use_web:
			all_models = extractor.getNodesByPubs(used_pubs)
		else:
			# Offline path: parse a previously saved page. The with-block
			# closes the file; the old code never closed it.
			with open('debug_pubs.txt', 'r') as f:
				html = f.read()
			models = self.extractor.extract_from_source(html)
			all_models = self.extractor._Extractor__merge_into_extractedmap(None, models)

		print('\n- all_models ----------------------')
		if all_models is not None:
			for key, models in all_models.items():
				print(key)
				for model in models:
					print("\t%s" % (model,))
		else:
			print('all_models is None')
		print('- all_models end ----------------------\n')

		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print('pubs found %s' % (pub,))
		print('-' * 100)
		for pub in pubs_notfound:
			print('not found %s' % (pub,))
		print('- test done -')
Example #11
0
	def debug_person(self, person_id, person_name, generation):
		'''Print match results for one person, then retry the misses by publication query.

		person_id, generation: passed to self.pubdao.getPublicationByPerson.
		person_name: passed to self.extractor.getNodesByPersonName and shown
		in the banner.
		'''
		def show_outcome(found, missing):
			# One report: the hits, a 100-dash rule, then the misses.
			for hit in found:
				print('pubs found %s' % (hit,))
			print('-' * 100)
			for miss in missing:
				print('not found %s' % (miss,))

		print('- DEBUG Person "%s" -:' % person_name)

		# Pass 1: match the stored publications against a by-name fetch.
		stored_pubs = self.pubdao.getPublicationByPerson(person_id, generation)
		all_models = self.extractor.getNodesByPersonName(person_name)
		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(stored_pubs, all_models)
		show_outcome(pubs_found, pubs_notfound)

		print('|||||||||||||||||||||||||||| get by pubs ')
		# Pass 2: query by the publications missed above.
		# todo here should be a while
		query, used_pubs = Extractor.pinMaxQuery(pubs_notfound)
		print('%s pub, query: %s' % (len(used_pubs), query))
		all_models = self.extractor.getNodesByPubs(used_pubs)
		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		show_outcome(pubs_found, pubs_notfound)

		print('- END DEBUG -')
	def test_matchPub(self):
		'''Match one person's stored publications against saved result pages.

		Reads the pages "jie tang_page_<n>.html" for n = 0..2 from
		self.settings.source_dir, extracts models from each page and merges
		them into one map. It then loads the person's publications through
		PublicationDao and passes both to self.matchPub.

		Returns the second element of the pair returned by self.matchPub
		(the publications that were not matched).
		'''
		self.extractor = Extractor().getInstance()
		pubdao = PublicationDao()
		person_id = 13419
		person_name = 'jie tang'

		# Read sources from files
		all_models = {}
		for page in range(0, 3):
			filename = "".join((person_name, '_page_', str(page), '.html'))
			# The with-block closes the handle even when read() fails; the
			# previous code leaked one open file per page.
			with open(os.path.join(self.settings.source_dir, filename), 'r') as f:
				html = f.read()
			models = self.extractor.extract_from_source(html)
			if models is not None:
				self.extractor._Extractor__merge_into_extractedmap(all_models, models)
		print('Total found DEBUG  %s items.' % len(all_models))

		# part 2
		pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation)

		# Flip to True to dump both inputs before matching.
		printout = False
		if printout:
			for key, models in all_models.items():
				print('%s  -->  %s' % (key, models))
			print('===================')
			for pub in pubs:
				print(pub)

		(pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
		print('- test done - %s %s' % (len(pubs_matched), len(pubs_not_matched)))
		return pubs_not_matched
Example #13
0
    def debug_person(self, person_id, person_name, generation):
        '''Print match results for one person, then retry the misses by publication query.

        person_id, generation: passed to self.pubdao.getPublicationByPerson.
        person_name: passed to self.extractor.getNodesByPersonName and shown
        in the banner.
        '''
        def show_outcome(found, missing):
            # One report: the hits, a 100-dash rule, then the misses.
            for hit in found:
                print('pubs found %s' % (hit,))
            print('-' * 100)
            for miss in missing:
                print('not found %s' % (miss,))

        print('- DEBUG Person "%s" -:' % person_name)

        # Pass 1: match the stored publications against a by-name fetch.
        stored_pubs = self.pubdao.getPublicationByPerson(person_id, generation)
        all_models = self.extractor.getNodesByPersonName(person_name)
        matcher = PubMatcher.getInstance()
        (pubs_found, pubs_notfound) = matcher.matchPub(stored_pubs, all_models)
        show_outcome(pubs_found, pubs_notfound)

        print('|||||||||||||||||||||||||||| get by pubs ')
        # Pass 2: query by the publications missed above.
        # todo here should be a while
        query, used_pubs = Extractor.pinMaxQuery(pubs_notfound)
        print('%s pub, query: %s' % (len(used_pubs), query))
        all_models = self.extractor.getNodesByPubs(used_pubs)
        matcher = PubMatcher.getInstance()
        (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
        show_outcome(pubs_found, pubs_notfound)

        print('- END DEBUG -')
Example #14
0
    def update(self):
        """Show one person's publications and which of them a by-name fetch matches.

        Loads the person's publications for ``self.generation``. An empty
        list marks the person as finished and returns. Otherwise the
        publications are shown, the models fetched for ``self.person.names``
        are printed, the method waits for Enter, and finally the matched
        publications are shown.
        """
        pubs = self.pubdao.getPublicationByPerson(self.person.id,
                                                  self.generation)
        if pubs is not None and len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print("[*] Mark Person as Finished '%s'." % self.person)
            return

        self.show(pubs)
        all_models = Extractor.getInstance().getNodesByPersonName(
            self.person.names)

        print('all models:')
        # The fetch result is tested for None below, so it must not be
        # iterated unguarded here (that raised TypeError on None).
        if all_models is not None:
            for model in all_models:
                print(model)
        raw_input()  # interactive pause before matching (Python 2 builtin)

        # Start from "nothing matched" so the final show() always gets a list;
        # the no-models path used to leave pubs_found unbound (NameError).
        pubs_found = []
        if all_models is not None:
            print("=" * 100)
            (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
                pubs, all_models)
            if pubs_found is None or pubs_notfound is None:
                print("[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"
                      % self.person)
                return
            print("{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                len(pubs_found), len(pubs_notfound),
                len(pubs_found) + len(pubs_notfound), self.person))
        print('pubs found :')
        self.show(pubs_found)
        print('done')
Example #15
0
    def update(self):
        """Show one person's publications and which of them a by-name fetch matches.

        Loads the person's publications for ``self.generation``. An empty
        list marks the person as finished and returns. Otherwise the
        publications are shown, the models fetched for ``self.person.names``
        are printed, the method waits for Enter, and finally the matched
        publications are shown.
        """
        pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation)
        if pubs is not None and len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print("[*] Mark Person as Finished '%s'." % self.person)
            return

        self.show(pubs)
        all_models = Extractor.getInstance().getNodesByPersonName(self.person.names)

        print('all models:')
        # The fetch result is tested for None below, so it must not be
        # iterated unguarded here (that raised TypeError on None).
        if all_models is not None:
            for model in all_models:
                print(model)
        raw_input()  # interactive pause before matching (Python 2 builtin)

        # Start from "nothing matched" so the final show() always gets a list;
        # the no-models path used to leave pubs_found unbound (NameError).
        pubs_found = []
        if all_models is not None:
            print("=" * 100)
            (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
            if pubs_found is None or pubs_notfound is None:
                print("[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"
                      % self.person)
                return
            print("{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person
            ))
        print('pubs found :')
        self.show(pubs_found)
        print('done')
    def process_person(self):
        '''Route one person's publications to the matching store caches.

        Loads the person's publications for self.extractor.generation.
        An empty list marks the person as finished. With more than four
        publications a by-name fetch is matched against them; otherwise (or
        when the fetch returns None) every publication counts as not found.
        Found publications go to store.putToPubdbcache, the rest to
        store.putToPubCache.
        '''
        # all pubs need to update citation number.
        # totalPubCount = self.pubdao.getPersonPubCount(self.person.id)

        pubs = self.pubdao.getPublicationByPerson(self.person.id,
                                                  self.extractor.generation)

        # The old code tested for None and then called len(pubs) anyway,
        # which raised TypeError. Report and stop without marking the person.
        if pubs is None:
            print("[ERROR][-/-] person '%s', publication list is None, return"
                  % self.person)
            return

        if len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print("[*] Mark Person as Finished '%s'." % self.person)
            return

        print("$Ex/get:> person '%s' has %d papers to crawl" % (
            self.person.names, len(pubs)))

        # Default: nothing matched, everything still has to be looked up.
        pubs_found = []
        pubs_notfound = pubs
        # by crawlByPerson, a lot of publication maybe found and update.
        if len(pubs) > 4:
            all_models = Extractor.getInstance().getNodesByPersonName(
                self.person.names)
            if all_models is not None:
                print("=" * 100)
                (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
                    pubs, all_models)
                if pubs_found is None or pubs_notfound is None:
                    print("[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"
                          % self.person)
                    return
                print("{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                    len(pubs_found), len(pubs_notfound),
                    len(pubs_found) + len(pubs_notfound), self.person))

        for pub in pubs_found:
            self.store.putToPubdbcache(pub)
            print("{-A}[%4s] %s" % (pub.ncitation, pub))

        for pub in pubs_notfound:
            self.store.putToPubCache(self.person, pub)
Example #17
0
    def runOriginal(self):
        """Worker loop: take queued publication queries, fetch and match them.

        Runs while ``self.extractor.running`` is true and ``self.ask_to_stop``
        is false. Each pass takes a ``(query, used_pubs)`` pair from
        ``self.store.getFromPubQueue()``, fetches models for those
        publications and matches them with ``PubMatcher``.

        NOTE(review): this excerpt ends at ``finally:``; the clean-up code and
        whatever consumes ``pubs_found`` / ``pubs_notfound`` are not visible.
        """
        while self.extractor.running and not self.ask_to_stop:
            self.mark()
            self.extractor.wait_for_pause()  # wait if paused
            #			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs

            query, used_pubs = self.store.getFromPubQueue()  # get url and pubs
            # Nothing queued: report it, sleep 10 seconds, then poll again.
            if used_pubs is None or len(used_pubs) == 0:
                print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (
                    self.name, query, used_pubs)
                time.sleep(10)
                continue
            self.extractor.wait_for_pause()  # wait again

            # Bump both busy counters while holding the extractor's lock.
            with self.extractor.busy_semaphore_lock:
                self.extractor.busy_semaphore += 1
                self.extractor.busy_pub_semaphore += 1

            pubs_found = None
            pubs_notfound = None
            try:
                all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
                if all_models is not None:
                    (pubs_found,
                     pubs_notfound) = PubMatcher.getInstance().matchPub(
                         used_pubs, all_models)
                    # NOTE(review): this return leaves the whole loop, not
                    # just the current pass.
                    if pubs_found is None or pubs_notfound is None:
                        print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
                        return
                    print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (
                        len(pubs_found), len(pubs_notfound), query)
                else:
                    # No models fetched: every publication counts as not found.
                    pubs_notfound = used_pubs
            except Exception, e:
                ExceptionHelper.print_exec(e)
                print '-------------------------------------------------------'
                print 'query:', query
                # NOTE(review): if getNodesByPubs itself raised on the first
                # pass, all_models is not bound yet and this line raises
                # NameError.
                print 'all_models', all_models
                print 'used_pubs', used_pubs
                print '-------------------------------------------------------'
                return
            finally:
                # (the body of this finally clause is missing from the excerpt)
	def process_person(self):
		'''Route one person's publications to the matching store caches.

		Loads the person's publications for self.extractor.generation.
		An empty list marks the person as finished. With more than four
		publications a by-name fetch is matched against them; otherwise (or
		when the fetch returns None) every publication counts as not found.
		Found publications go to store.putToPubdbcache, the rest to
		store.putToPubCache.
		'''
		# all pubs need to update citation number.
		# totalPubCount = self.pubdao.getPersonPubCount(self.person.id)

		pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation)

		# The old code tested for None and then called len(pubs) anyway,
		# which raised TypeError. Report and stop without marking the person.
		if pubs is None:
			print("[ERROR][-/-] person '%s', publication list is None, return"
				% self.person)
			return

		if len(pubs) == 0:
			self.store.markPersonFinished(self.person)
			print("[*] Mark Person as Finished '%s'." % self.person)
			return

		print("$Ex/get:> person '%s' has %d papers to crawl" % (self.person.names, len(pubs)))

		# Default: nothing matched, everything still has to be looked up.
		pubs_found = []
		pubs_notfound = pubs
		# by crawlByPerson, a lot of publication maybe found and update.
		if len(pubs) > 4:
			all_models = Extractor.getInstance().getNodesByPersonName(self.person.names)
			if all_models is not None:
				print("=" * 100)
				(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
				if pubs_found is None or pubs_notfound is None:
					print("[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"
						% self.person)
					return
				print("{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
					len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person
				))

		for pub in pubs_found:
			self.store.putToPubdbcache(pub)
			print("{-A}[%4s] %s" % (pub.ncitation, pub))

		for pub in pubs_notfound:
			self.store.putToPubCache(self.person, pub)
Example #19
0
class TestCase():
	'''Manual smoke test for the extractor; results are printed, not asserted.'''

	def __init__(self):
		# NOTE(review): instantiates Extractor and then calls getInstance()
		# on that instance; kept exactly as before.
		self.extractor = Extractor().getInstance()


	def test_extractFromPage(self):
		'''Test method extract_from_source.'''
		print('-TEST-: %s' % self.test_extractFromPage.__doc__.strip())
		# prepare: the with-block closes the sample page even if read() fails
		with open("../test/example_google_page.txt", "r") as f:
			html = f.read()

		# test
		models = self.extractor.extract_from_source(html)
		# Other callers of extract_from_source check for None, so treat a
		# None result as "no models" instead of crashing in len().
		if models is None:
			models = []
		print("**:>> %s" % len(models))
		for model in models:
			print(model.asDetailText())

		print('-END TEST-')
Example #20
0
	def runOriginal(self):
		'''Worker loop: take queued publication queries, fetch and match them.

		Runs while self.extractor.running is true and self.ask_to_stop is
		false. Each pass takes a (query, used_pubs) pair from
		self.store.getFromPubQueue(), fetches models for those publications
		and matches them with PubMatcher.

		NOTE(review): this excerpt ends at "finally:"; the clean-up code and
		whatever consumes pubs_found / pubs_notfound are not visible.
		'''
		while self.extractor.running and not self.ask_to_stop:
			self.mark()
			self.extractor.wait_for_pause() # wait if paused
#			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs
			
			query, used_pubs = self.store.getFromPubQueue() # get url and pubs
			# Nothing queued: report it, sleep 10 seconds, then poll again.
			if used_pubs is None or len(used_pubs) == 0:
				print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs)
				time.sleep(10)
				continue
			self.extractor.wait_for_pause() # wait again

			# Bump both busy counters while holding the extractor's lock.
			with self.extractor.busy_semaphore_lock: 
				self.extractor.busy_semaphore += 1
				self.extractor.busy_pub_semaphore += 1

			pubs_found = None
			pubs_notfound = None
			try:
				all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
				if all_models is not None:
					(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
					# NOTE(review): this return leaves the whole loop, not
					# just the current pass.
					if pubs_found is None or pubs_notfound is None:
						print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
						return
					print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query)
				else:
					# No models fetched: every publication counts as not found.
					pubs_notfound = used_pubs
			except Exception, e:
				ExceptionHelper.print_exec(e)
				print '-------------------------------------------------------'
				print 'query:', 	query
				# NOTE(review): if getNodesByPubs itself raised on the first
				# pass, all_models is not bound yet and this line raises
				# NameError.
				print 'all_models', all_models
				print 'used_pubs', used_pubs
				print '-------------------------------------------------------'
				return
			finally:
				# (the body of this finally clause is missing from the excerpt)
class TestCase():
	'''Manual smoke tests for the extractor; results are printed, not asserted.'''

	def __init__(self):
		# NOTE(review): instantiates Extractor and then calls getInstance()
		# on that instance; kept exactly as before.
		self.extractor = Extractor().getInstance()


	def test_extractFromPage(self):
		'''Test method extract_from_source.'''
		print('-TEST-: %s' % self.test_extractFromPage.__doc__.strip())
		# prepare: the with-block closes the sample page even if read() fails
		with open("../test/example_google_page.txt", "r") as f:
			html = f.read()
		# test (a None result is treated as "no models")
		models = self.extractor.extract_from_source(html)
		for model in models or []:
			print(model)
		print('-END TEST-')


	def test_getNodesByPersonName(self):
		'''Test method getNodesByPersonName.'''
		# Banner now uses this method's own docstring, not test_extractFromPage's.
		print('-TEST-: %s' % self.test_getNodesByPersonName.__doc__.strip())
		e = Extractor()
		models = e.getNodesByPersonName('jie tang')
		for model in models or []:
			print(model)
		print('-END TEST-')


	def test_clean_title(self):
		'''Extract models from an inline result-page fragment and print them.'''
		html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association  &hellip;</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&amp;rep=rep1&amp;type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br>
strong flexibility at handling unstructured data. However, it still suffers from the huge set of mined <br>
rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&amp;hl=en&amp;num=100&amp;as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&amp;hl=en&amp;num=100&amp;as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&amp;hl=en&amp;num=100&amp;as_sdt=2000">All 28 versions</a></span></font>  </div>  <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return'''
		models = self.extractor.extract_from_source(html)
		for model in models or []:
			print(model)
		print('- test done -')


	def test_debug_not_found(self):
		'''Debug Errors'''
		# Banner now uses this method's own docstring, not test_extractFromPage's.
		print('-TEST-: %s' % self.test_debug_not_found.__doc__.strip())

		# Titles pushed through the query -> fetch -> match pipeline.
		# A one-element list ('Formalizzazione e Ottimizzazione di Transazioni
		# di modifica in CLP(AD)') used to be built first and then immediately
		# overwritten; that dead code is no longer executed.
		# 'Approximative Public-Key-Kryptosysteme' appears twice, as before.
		titles = (
			'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
			"Chairman's Message",
			'Introduction to Modern Information Retrieval',
			'Publications',
			'Die RISC-CISC Debatte',
			'Kryptologie',
			'Approximative Public-Key-Kryptosysteme',
			'Integritat in IT-Systemen',
			'Vollstandige Reduktionssysteme',
			'Approximative Public-Key-Kryptosysteme',
		)
		pub_candidates = [
			Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
			for title in titles
		]

		matcher = PubMatcher.getInstance()
		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print('%s pub, query: %s' % (len(used_pubs), query))
		all_models = extractor.getNodesByPubs(used_pubs)
		# Reuse the matcher fetched above instead of calling getInstance() again.
		(pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print('pubs found %s' % (pub,))
		print('-' * 100)
		for pub in pubs_notfound:
			print('not found %s' % (pub,))
		print('- test done -')


	def test_pin_query(self):
		'''Test pin query'''
		# Banner now uses this method's own docstring, not test_extractFromPage's.
		print('-TEST-: %s' % self.test_pin_query.__doc__.strip())
		#----------------------------------------------------
		# Mixed-length titles used to exercise pinMaxQuery.
		titles = (
			'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1',
			'Methodology and technology for virtual component driven hardware/software co-design on the system-level',
			'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion',
			'XML for the Exchange of Automation Project Information',
			"Editor's Notes",
			'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing',
			'Von Transaktionen zu Problemlosungszyklen: Erweiterte Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme',
			"Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle's Flexfelder",
		)
		pub_candidates = [
			Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
			for title in titles
		]
		query, pubs = self.extractor.pinMaxQuery(pub_candidates)
		print(query)
		for pub in pubs:
			print(pub)
 def __init__(self):
     '''Attach the extractor that the test methods exercise.'''
     # Build an Extractor first, then keep whatever its getInstance() returns.
     bootstrap = Extractor()
     self.extractor = bootstrap.getInstance()
class TestCase():
    '''Print-based checks for the Extractor; results are printed, not asserted.

    NOTE: several docstrings below are printed at runtime as the test header,
    so they are kept short on purpose.
    '''

    def __init__(self):
        self.extractor = Extractor().getInstance()

    @staticmethod
    def _make_pubs(titles):
        '''Build one placeholder Publication per title (all other fields fixed).'''
        return [
            Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
            for title in titles
        ]

    def test_extractFromPage(self):
        '''Test method extract_from_source.'''
        print('-TEST-: %s' % self.test_extractFromPage.__doc__.strip())
        # prepare: the context manager closes the file even if read() raises
        with open("../test/example_google_page.txt", "r") as f:
            html = f.read()
        # test
        models = self.extractor.extract_from_source(html)
        for model in models:
            print(model)
        print('-END TEST-')

    def test_getNodesByPersonName(self):
        '''Test method getNodesByPersonName.'''
        # Print this test's own docstring (previously test_extractFromPage's).
        print('-TEST-: %s' % self.test_getNodesByPersonName.__doc__.strip())
        e = Extractor()
        models = e.getNodesByPersonName('jie tang')
        for model in models:
            print(model)
        print('-END TEST-')

    def test_clean_title(self):
        # Feeds a truncated result-page fragment to extract_from_source and
        # prints whatever models come back.
        html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association  &hellip;</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&amp;rep=rep1&amp;type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br>
strong flexibility at handling unstructured data. However, it still suffers from the huge set of mined <br>
rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&amp;hl=en&amp;num=100&amp;as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&amp;hl=en&amp;num=100&amp;as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&amp;hl=en&amp;num=100&amp;as_sdt=2000">All 28 versions</a></span></font>  </div>  <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return'''
        models = self.extractor.extract_from_source(html)
        for model in models:
            print(model)
        print('- test done -')

    def test_debug_not_found(self):
        '''Debug Errors'''
        # Print this test's own docstring (previously test_extractFromPage's).
        print('-TEST-: %s' % self.test_debug_not_found.__doc__.strip())

        # An earlier single-candidate input was built here and then thrown
        # away by the reassignment below; its title is kept for reference:
        #   'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)'
        #----------------------------------------------------
        pub_candidates = self._make_pubs((
            'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
            'Chairman\'s Message',
            'Introduction to Modern Information Retrieval',
            'Publications',
            'Die RISC-CISC Debatte',
            'Kryptologie',
            'Approximative Public-Key-Kryptosysteme',
            'Integritat in IT-Systemen',
            'Vollstandige Reduktionssysteme',
            # duplicate title, as in the original input list
            'Approximative Public-Key-Kryptosysteme',
        ))

        matcher = PubMatcher.getInstance()
        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print('%s pub, query: %s' % (len(used_pubs), query))
        all_models = extractor.getNodesByPubs(used_pubs)
        # Reuse the matcher fetched above instead of calling getInstance() again.
        pubs_found, pubs_notfound = matcher.matchPub(used_pubs, all_models)
        for pub in pubs_found:
            print('pubs found %s' % (pub,))
        print('-' * 100)
        for pub in pubs_notfound:
            print('not found %s' % (pub,))
        print('- test done -')

    def test_pin_query(self):
        '''Test pin query'''
        # Print this test's own docstring (previously test_extractFromPage's).
        print('-TEST-: %s' % self.test_pin_query.__doc__.strip())
        #----------------------------------------------------
        pub_candidates = self._make_pubs((
            'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1',
            'Methodology and technology for virtual component driven hardware/software co-design on the system-level',
            'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion',
            'XML for the Exchange of Automation Project Information',
            'Editor\'s Notes',
            'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing',
            'Von Transaktionen zu Problemlosungszyklen: Erweiterte Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme',
            'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder',
        ))
        query, pubs = self.extractor.pinMaxQuery(pub_candidates)
        print(query)
        for pub in pubs:
            print(pub)
Example #24
0
	def getFromPubQueueBack(self):
		''' 从Store中的零散Pub中取下一个要抓取的pub组合,(取几个pub拼成一个最长字符串用来抓取)
			如果遇到错误,可能返回None.
			@return: (url, pubs[])
		'''
		print_verbose = True
		try:
			# block if no pub items.
			start = time.time()
			while self.running and len(self.person_pub_map) == 0:
				time.sleep(self.mgr_interval)
			dur = (time.time() - start)
			#print "---------============----------- get 1 wait %.4s" % dur
			if print_verbose: print('TimeUsed:%.4s ms, ' % dur)

			start = time.time()
			self.blocked_pub_t += 1
			with self.pub_lock: # lock
				self.blocked_pub_t -= 1
				# count
				self.ppt_wait += dur
				#print "---------============----------- get 3 getlock %.4s" % (time.time() - start)
				self.ppt_getlock += (time.time() - start)
				start = time.time()

				# select candidates
				pub_candidates = [] 	# {pubId -> pub_with_person_name}, candidates
				person_invalid = []  	# mark person that not valid, delete later
				for personId, ids in self.person_pub_map.iteritems():
					# if person with no ids, del this person.
					if ids is None or len(ids) == 0:
						person_invalid.append(personId)
					else:
						valid_ids = 0
						for pubId in ids:
							if print_verbose: print('\tcandidate pub %s' % pubId)
							if pubId in self.pubmap:
								_pub = self.pubmap[pubId]
								if _pub is not None:
									pub_candidates.append(_pub)
									valid_ids = valid_ids + 1
									if print_verbose: 
										print('\tcandidate pub %s of person %s.' % (_pub.title, personId))

							if len(pub_candidates) > 0:  # enough
								if print_verbose: print('\tcandidates enough, length %s ' % len(pub_candidates))
								break

						if valid_ids == 0:  # means all pub of this person is not valid. just delete this person.
							person_invalid.append(personId)

				for personId in person_invalid:
					del self.person_pub_map[personId]
#					print "[store](line 123):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))

				# return None if not available
				if pub_candidates is None or len(pub_candidates) == 0:
					print('\t[ERR] Cannot be here. empty candidates. return null.')
					return None, None

				# gen query
				query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
				for pub in used_pubs:
					del self.pubmap[pub.id] # delete pub.
#					print "[store](line 134):delete pub(%s) from pubmap, now length %s " % (pub.id, len(self.pubmap))
					
				# Save nouse_pubs to dbcache, waiting to write to db.
				nouse_pubs += pub_candidates[1:]
				if nouse_pubs:
					for pub in nouse_pubs:
						self.putToPubdbcache(pub);

				return query, used_pubs

		except Exception, e:
			ExceptionHelper.print_exec(e)
			print ('Exception occurred: %s. ' % e)
Example #25
0
	def getFromPubQueue(self):
		''' 从Store中的零散Pub中取下一个要抓取的pub组合
		(取几个pub拼成一个最长字符串用来抓取)
		如果遇到错误,可能返回None.
		@return: (url, pubs[])
		'''
		print_verbose = False
		try:
			while self.running and len(self.person_pub_map) == 0:
				time.sleep(self.mgr_interval)
				
			self.blocked_pub_t += 1
			with self.pub_lock: # lock
				self.blocked_pub_t -= 1
				pub_candidates = [] 	# {pubId -> pub_with_person_name}, candidates
				person_invalid = []  	# mark person that not valid, delete later
				for personId, ids in self.person_pub_map.iteritems():
					# if person with no ids, del this person.
					if ids is None or len(ids) == 0:
						person_invalid.append(personId)
					else:
						valid_ids = 0
						for pubId in ids:
							if print_verbose: print('\tcandidate pub %s' % pubId)
							
							if pubId in self.pubmap:
								_pub = self.pubmap[pubId]
								if _pub:
									pub_candidates.append(_pub)
									valid_ids += 1
									
								if print_verbose: 
									print('\tcandidate pub %s of person %s.' % (_pub.title, personId))

							if len(pub_candidates) > 0:  # enough
								if print_verbose:
									print('\tcandidates enough, length %s ' % len(pub_candidates))
								break

						if valid_ids == 0:  # means all pub of this person is not valid. just delete this person.
							person_invalid.append(personId)

				for personId in person_invalid:
					for pubId in self.person_pub_map[personId]:
						if pubId in self.pubmap:
#							print "[store](getFromPubQueue):delete pub(%s,[%s]) from pubmap, cause person(%s) " % (pubId, self.pubmap[pubId].ncitation, personId)
							del self.pubmap[pubId]
							
					del self.person_pub_map[personId]
#					print "[store](getFromPubQueue):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))

				# return None if not available
				if pub_candidates is None or len(pub_candidates) == 0:
					print('\t[store] Cannot be here. empty candidates. return null.')
					return None, None

				# gen query
				query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
				for pub in used_pubs:
					del self.pubmap[pub.id] # delete pub.
#					print "[store](getFromPubQueue):delete pub(%s, [%s]) from pubmap, now length %s " % (pub.id, pub.ncitation, len(self.pubmap))
					
				# Save nouse_pubs to dbcache, waiting to write to db.
				nouse_pubs += pub_candidates[1:]
				if nouse_pubs:
					for pub in nouse_pubs:
						self.putToPubdbcache(pub);

				return query, used_pubs

		except Exception, e:
			ExceptionHelper.print_exec(e)
			print ('Exception occurred: %s. ' % e)
Example #26
0
 def __init__(self):
     '''Bind the Extractor and PubMatcher from getInstance() and a new PublicationDao.'''
     extractor_instance = Extractor.getInstance()
     matcher_instance = PubMatcher.getInstance()
     dao = PublicationDao()
     self.extractor = extractor_instance
     self.matcher = matcher_instance
     self.pubdao = dao
Example #27
0
class TestPubMatcher:
    '''Print-based checks for PubMatcher; results are printed, not asserted.'''

    def __init__(self):
        self.matcher = PubMatcher()

    #
    # Test
    #
    def test_matchPub(self):
        '''Match one person's stored publications against models read from saved pages.

        Reads three saved HTML pages for the hard-coded person, merges the
        extracted models, and runs the matcher against the person's
        publications from PublicationDao.

        @return: the publications that were not matched.
        '''
        self.extractor = Extractor().getInstance()
        # self.settings is not assigned by __init__; use it when something
        # else bound it, otherwise fall back to Settings.getInstance().
        # NOTE(review): assumes Settings is importable in this module - confirm.
        settings = getattr(self, 'settings', None)
        if settings is None:
            settings = Settings.getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'
        # Read sources from files
        all_models = {}
        for page in range(0, 3):
            filename = "".join((person_name, '_page_', str(page), '.html'))
            # The context manager closes each page file (previously leaked).
            with open(os.path.join(settings.source_dir, filename), 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                self.extractor._Extractor__merge_into_extractedmap(
                    all_models, models)
        print('Total found DEBUG  %s items.' % len(all_models))

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id, settings.generation)

        printout = False
        if printout:
            for key, models in all_models.items():
                print('%s  -->  %s' % (key, models))
            print('===================')
            for pub in pubs:
                print(pub)

        # matchPub lives on the matcher; this class has no matchPub of its own.
        (pubs_matched, pubs_not_matched) = self.matcher.matchPub(
            pubs, all_models)
        print('- test done - %s %s' % (len(pubs_matched),
                                       len(pubs_not_matched)))
        return pubs_not_matched

    def test_fetchByPubs(self, pubs):
        '''Test use a list of pubs that not found in person search'''
        # The count is now interpolated into the message (it used to be
        # printed after the raw '%s' format string).
        print('-- test fetchByPubs %s pubs' % len(pubs))
        new_pubs = [(pub, 'jie tang') for pub in pubs]

        extractor = Extractor()
        extractor.getNodesByPubs(new_pubs)
        print('- test done -')

    def test_match_with_authors(self):
        '''Print the matchAuthors result for each (extracted, stored) author-string pair.'''
        fishman_case = (
            'DH Fishman, J Annevelink, E Chow, T  &hellip;',
            'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'
        )
        # Full data set; not iterated below, kept so it can be swapped in.
        data_test = ((
            '… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org',
            'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'
        ), (
            'R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org',
            'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'
        ), (
            'P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org',
            'Peter Lyngbak,William Kent'
        ), (
            'W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com',
            'William Kent'
        ), (
            'DE Neiman, DW Hildum, VR Lessef, T  &hellip;',
            'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm'
        ), (
            'M Esmaili, R Safavi-Naini, J Pieprzyk',
            'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk'
        ), fishman_case)
        # Only this subset is actually run.
        data_debug = (fishman_case, )
        for ga, da in data_debug:
            print("match: %s \n with: %s \n   is: %s" % (
                ga, da, self.matcher.matchAuthors(ga, da, debug_output=True)))
Example #28
0
    def debug_pubs(self):
        '''Debug get by pub'''
        print '-TEST-:', self.debug_pubs.__doc__.strip()
        #----------------------------------------------------
        pub_candidates = []

        # group 1
        #		pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))

        # group 2
        #		pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))

        # group 3
        #		pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

        # group 4
        pub_candidates.append(
            Publication(
                -1, 2000,
                'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices',
                "pubkey", -1, "Dusan Guller", -5))

        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print '%s pub, query: %s' % (len(used_pubs), query)

        #
        # Get WEB PAGE
        #
        use_web = True  # ***************
        if use_web:
            all_models = extractor.getNodesByPubs(used_pubs)
        else:
            f = file('debug_pubs.txt', 'r')
            html = f.read()
            models = self.extractor.extract_from_source(html)
            all_models = self.extractor._Extractor__merge_into_extractedmap(
                None, models)

        print '\n- all_models ----------------------'
        if all_models is not None:
            for key, models in all_models.items():
                print key
                for model in models:
                    print "\t", model
        else:
            print 'all_models is None'
        print '- all_models end ----------------------\n'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'
Example #29
0
	def __init__(self):
		'''Bind the Extractor and PubMatcher from getInstance() and a new PublicationDao.'''
		extractor_instance = Extractor.getInstance()
		matcher_instance = PubMatcher.getInstance()
		dao = PublicationDao()
		self.extractor = extractor_instance
		self.matcher = matcher_instance
		self.pubdao = dao
Example #30
0
 def __init__(self):
     '''Bind the Extractor and Settings objects obtained via getInstance().'''
     extractor_instance = Extractor().getInstance()
     settings_instance = Settings.getInstance()
     self.extractor = extractor_instance
     self.settings = settings_instance
	def __init__(self):
		'''Bind the Extractor obtained via getInstance() to self.extractor.'''
		extractor_instance = Extractor().getInstance()
		self.extractor = extractor_instance
class TestPubMatcher:
	'''Print-based checks for PubMatcher; results are printed, not asserted.'''

	def __init__(self):
		self.matcher = PubMatcher()

	#
	# Test
	#
	def test_matchPub(self):
		'''Match one person's stored publications against models read from saved pages.

		Reads three saved HTML pages for the hard-coded person, merges the
		extracted models, and runs the matcher against the person's
		publications from PublicationDao.

		@return: the publications that were not matched.
		'''
		self.extractor = Extractor().getInstance()
		# self.settings is not assigned by __init__; use it when something
		# else bound it, otherwise fall back to Settings.getInstance().
		# NOTE(review): assumes Settings is importable in this module - confirm.
		settings = getattr(self, 'settings', None)
		if settings is None:
			settings = Settings.getInstance()
		pubdao = PublicationDao()
		person_id = 13419
		person_name = 'jie tang'
		# Read sources from files
		all_models = {}
		for page in range(0, 3):
			filename = "".join((person_name, '_page_', str(page), '.html'))
			# The context manager closes each page file (previously leaked).
			with open(os.path.join(settings.source_dir, filename), 'r') as f:
				html = f.read()
			models = self.extractor.extract_from_source(html)
			if models is not None:
				self.extractor._Extractor__merge_into_extractedmap(all_models, models)
		print('Total found DEBUG  %s items.' % len(all_models))

		# part 2
		pubs = pubdao.getPublicationByPerson(person_id, settings.generation)

		printout = False
		if printout:
			for key, models in all_models.items():
				print('%s  -->  %s' % (key, models))
			print('===================')
			for pub in pubs:
				print(pub)

		# matchPub lives on the matcher; this class has no matchPub of its own.
		(pubs_matched, pubs_not_matched) = self.matcher.matchPub(pubs, all_models)
		print('- test done - %s %s' % (len(pubs_matched), len(pubs_not_matched)))
		return pubs_not_matched

	def test_fetchByPubs(self, pubs):
		'''Test use a list of pubs that not found in person search'''
		# The count is now interpolated into the message (it used to be
		# printed after the raw '%s' format string).
		print('-- test fetchByPubs %s pubs' % len(pubs))
		new_pubs = [(pub, 'jie tang') for pub in pubs]

		extractor = Extractor()
		extractor.getNodesByPubs(new_pubs)
		print('- test done -')

	def test_match_with_authors(self):
		'''Print the matchAuthors result for each (extracted, stored) author-string pair.'''
		fishman_case = (
			'DH Fishman, J Annevelink, E Chow, T  &hellip;',
			'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'
		)
		# Full data set; not iterated below, kept so it can be swapped in.
		data_test = (
			('… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org',
			 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
			('R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org',
			 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
			('P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org',
			 'Peter Lyngbak,William Kent'),
			('W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com',
			 'William Kent'),
			('DE Neiman, DW Hildum, VR Lessef, T  &hellip;',
			 'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm'),
			('M Esmaili, R Safavi-Naini, J Pieprzyk',
			 'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk'),
			fishman_case,
		)
		# Only this subset is actually run.
		data_debug = (fishman_case, )
		for ga, da in data_debug:
			print("match: %s \n with: %s \n   is: %s" % (
				ga, da, self.matcher.matchAuthors(ga, da, debug_output=True)))