def integrate_test_pubs(self, pub_candidates): """ For Debug Errors """ print "- INTEGRATE TEST -:", self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print "Test %s pub, query: \n\t%s" % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print "-" * 100 for pub in pubs_found: print "[%s] %s" % (pub.ncitation, pub) print "-" * 100 for pub in pubs_notfound: print "[%s] %s" % ("-", pub) print "-" * 100 print "- test done -"
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- END DEBUG -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def integrate_test_pubs(self, pub_candidates): ''' For Debug Errors ''' print '- INTEGRATE TEST -:', self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print 'Test %s pub, query: \n\t%s' % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload( query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print '-' * 100 for pub in pubs_found: print '[%s] %s' % (pub.ncitation, pub) print '-' * 100 for pub in pubs_notfound: print '[%s] %s' % ('-', pub) print '-' * 100 print '- test done -'
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- END DEBUG -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap(None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append( Publication( -1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap( None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def getFromPubQueue(self):
    '''
    Take the next pub combination to fetch from the Store's loose pubs
    (a few pubs are pinned together into one longest query string).
    May return None if an error occurs.
    @return: (url, pubs[])
    '''
    print_verbose = False
    try:
        # Block until at least one person has queued pubs, or we are stopped.
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        # blocked_pub_t counts threads currently waiting for pub_lock.
        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1
            pub_candidates = []  # {pubId -> pub_with_person_name}, candidates
            person_invalid = []  # mark person that not valid, delete later
            for personId, ids in self.person_pub_map.iteritems():
                # if person with no ids, del this person.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                else:
                    valid_ids = 0
                    for pubId in ids:
                        if print_verbose:
                            print('\tcandidate pub %s' % pubId)
                        if pubId in self.pubmap:
                            _pub = self.pubmap[pubId]
                            if _pub:
                                pub_candidates.append(_pub)
                                valid_ids += 1
                                if print_verbose:
                                    print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                    # Stop at the first person that yielded any candidate.
                    if len(pub_candidates) > 0:
                        # enough
                        if print_verbose:
                            print('\tcandidates enough, length %s ' % len(pub_candidates))
                        break
                    if valid_ids == 0:
                        # means all pub of this person is not valid. just delete this person.
                        person_invalid.append(personId)
            # Drop invalid persons together with their queued pub entries.
            for personId in person_invalid:
                for pubId in self.person_pub_map[personId]:
                    if pubId in self.pubmap:
                        # print "[store](getFromPubQueue):delete pub(%s,[%s]) from pubmap, cause person(%s) " % (pubId, self.pubmap[pubId].ncitation, personId)
                        del self.pubmap[pubId]
                del self.person_pub_map[personId]
                # print "[store](getFromPubQueue):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))
            # return None if not available
            if pub_candidates is None or len(pub_candidates) == 0:
                print('\t[store] Cannot be here. empty candidates. return null.')
                return None, None
            # gen query
            # NOTE(review): pinMaxQuery returns 3 values here but 2 in the
            # debug helpers elsewhere in this file - confirm which signature
            # is current.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.
            # print "[store](getFromPubQueue):delete pub(%s, [%s]) from pubmap, now length %s " % (pub.id, pub.ncitation, len(self.pubmap))
            # Save nouse_pubs to dbcache, waiting to write to db.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub);
            return query, used_pubs
    except Exception, e:
        ExceptionHelper.print_exec(e)
        print ('Exception occurred: %s. ' % e)
def getFromPubQueueBack(self):
    '''
    Take the next pub combination to fetch from the Store's loose pubs
    (a few pubs are pinned together into one longest query string).
    Older/backup variant of getFromPubQueue with timing instrumentation.
    May return None if an error occurs.
    @return: (url, pubs[])
    '''
    print_verbose = True
    try:
        # block if no pub items.
        start = time.time()
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        dur = (time.time() - start)
        #print "---------============----------- get 1 wait %.4s" % dur
        if print_verbose:
            print('TimeUsed:%.4s ms, ' % dur)
        start = time.time()
        # blocked_pub_t counts threads currently waiting for pub_lock.
        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1
            # count: accumulate wait time and lock-acquisition time.
            self.ppt_wait += dur
            #print "---------============----------- get 3 getlock %.4s" % (time.time() - start)
            self.ppt_getlock += (time.time() - start)
            start = time.time()
            # select candidates
            pub_candidates = []  # {pubId -> pub_with_person_name}, candidates
            person_invalid = []  # mark person that not valid, delete later
            for personId, ids in self.person_pub_map.iteritems():
                # if person with no ids, del this person.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                else:
                    valid_ids = 0
                    for pubId in ids:
                        if print_verbose:
                            print('\tcandidate pub %s' % pubId)
                        if pubId in self.pubmap:
                            _pub = self.pubmap[pubId]
                            if _pub is not None:
                                pub_candidates.append(_pub)
                                valid_ids = valid_ids + 1
                                if print_verbose:
                                    print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                    # Stop at the first person that yielded any candidate.
                    if len(pub_candidates) > 0:
                        # enough
                        if print_verbose:
                            print('\tcandidates enough, length %s ' % len(pub_candidates))
                        break
                    if valid_ids == 0:
                        # means all pub of this person is not valid. just delete this person.
                        person_invalid.append(personId)
            # NOTE(review): unlike getFromPubQueue, this variant does not
            # remove the invalid persons' pubs from pubmap - confirm intended.
            for personId in person_invalid:
                del self.person_pub_map[personId]
                # print "[store](line 123):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))
            # return None if not available
            if pub_candidates is None or len(pub_candidates) == 0:
                print('\t[ERR] Cannot be here. empty candidates. return null.')
                return None, None
            # gen query
            # NOTE(review): pinMaxQuery returns 3 values here but 2 in the
            # debug helpers elsewhere in this file - confirm which signature
            # is current.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.
            # print "[store](line 134):delete pub(%s) from pubmap, now length %s " % (pub.id, len(self.pubmap))
            # Save nouse_pubs to dbcache, waiting to write to db.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub);
            return query, used_pubs
    except Exception, e:
        ExceptionHelper.print_exec(e)
        print ('Exception occurred: %s. ' % e)