Python Extractor.getInstance Examples, com.lish.ajia.googlescholar.extractor.Extractor.getInstance Python Examples

Example #1

0

Show file

File: _________test_extractor.py Project: AlexLyj/aminer-spider

	def test_debug_not_found(self):
		'''Debug Errors'''
		# Debug helper: builds a fixed batch of candidate publications, asks
		# Extractor.pinMaxQuery for a query over them, fetches result models with
		# getNodesByPubs and prints which candidates PubMatcher matched.
		# Takes no arguments and returns nothing; all output goes to stdout.
		#
		# The banner echoes this method's own docstring. It used to read
		# test_extractFromPage.__doc__, which mislabels the run and raises
		# AttributeError whenever that method is missing or undocumented.
		print '-TEST-:', self.test_debug_not_found.__doc__.strip()

		# Only the title differs between candidates; the remaining constructor
		# arguments are the same constants for every entry. One title appears
		# twice, exactly as in the original candidate list.
		titles = [
			'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
			'Chairman\'s Message',
			'Introduction to Modern Information Retrieval',
			'Publications',
			'Die RISC-CISC Debatte',
			'Kryptologie',
			'Approximative Public-Key-Kryptosysteme',
			'Integritat in IT-Systemen',
			'Vollstandige Reduktionssysteme',
			'Approximative Public-Key-Kryptosysteme',
		]
		pub_candidates = [
			Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
			for title in titles
		]

		matcher = PubMatcher.getInstance()
		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print '%s pub, query: %s' % (len(used_pubs), query)
		all_models = extractor.getNodesByPubs(used_pubs)
		# Reuse the matcher obtained above rather than fetching it a second time.
		(pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub
		print '- test done -'

Example #2

0

Show file

File: integrate_test.py Project: Rygbee/aminer-spider

    def integrate_test_pubs(self, pub_candidates):
        """For Debug Errors"""
        # NOTE: the docstring above is echoed in the banner at runtime, so its
        # text is part of the output; longer notes belong in comments like this.
        #
        # Builds a query from pub_candidates via Extractor.pinMaxQuery, prints the
        # query and the download URL derived from it, fetches models with
        # getNodesByPubs, matches them with matchPub(debug_output=True) and prints
        # the found / not-found publications. Returns nothing.
        banner = self.integrate_test_pubs.__doc__.strip()
        print "- INTEGRATE TEST -:", banner

        rule = "-" * 100
        extractor = Extractor.getInstance()
        matcher = PubMatcher.getInstance()

        # Show the query and the URL it expands to.
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print "Test %s pub, query: \n\t%s" % (len(used_pubs), query)
        url_template = self.settings.urltemplate_by_pubs
        url = url_template % URLCleaner.encodeUrlForDownload(query)
        print "\t", url

        # Fetch and match.
        all_models = extractor.getNodesByPubs(used_pubs)
        match_result = matcher.matchPub(used_pubs, all_models, debug_output=True)
        (pubs_found, pubs_notfound) = match_result

        # Report: matched publications first, then the unmatched ones.
        print rule
        for matched in pubs_found:
            print "[%s] %s" % (matched.ncitation, matched)
        print rule
        for missing in pubs_notfound:
            print "[%s] %s" % ("-", missing)
        print rule
        print "- test done -"

Example #3

0

Show file

File: update_author.py Project: yinonbaron/aminer-spider

    def update(self):
        pubs = self.pubdao.getPublicationByPerson(self.person.id,
                                                  self.generation)
        if pubs is not None and len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print "[*] Mark Person as Finished '%s'." % self.person
            return

        self.show(pubs)
        all_models = Extractor.getInstance().getNodesByPersonName(
            self.person.names)

        print 'all models:'
        for model in all_models:
            print model
        raw_input()

        if all_models is not None:
            print "=" * 100
            (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
                pubs, all_models)
            if pubs_found is None or pubs_notfound is None:
                print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\
                    % self.person
                return
            print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                len(pubs_found), len(pubs_notfound),
                len(pubs_found) + len(pubs_notfound), self.person)
        else:
            pubs_notfound = pubs
        print 'pubs found :'
        self.show(pubs_found)
        print 'done'

Example #4

0

Show file

    def integrate_test_pubs(self, pub_candidates):
        '''
			For Debug Errors
		'''
        print '- INTEGRATE TEST -:', self.integrate_test_pubs.__doc__.strip()

        extractor = Extractor.getInstance()
        matcher = PubMatcher.getInstance()

        # print queries
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print 'Test %s pub, query: \n\t%s' % (len(used_pubs), query)
        url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(
            query)
        # url = URLCleaner.encodeUrlForDownload(url)
        print "\t", url

        # do
        all_models = extractor.getNodesByPubs(used_pubs)
        (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs,
                                                       all_models,
                                                       debug_output=True)

        # print out
        print '-' * 100
        for pub in pubs_found:
            print '[%s] %s' % (pub.ncitation, pub)
        print '-' * 100
        for pub in pubs_notfound:
            print '[%s] %s' % ('-', pub)
        print '-' * 100
        print '- test done -'

Example #5

0

Show file

File: update_author.py Project: AlexLyj/aminer-spider

    def update(self):
        """Fetch models for this person by name, match them to the person's publications and show the matches.

        Interactive debug routine: it prints every fetched model and then
        waits on raw_input() for Enter before matching. An empty publication
        list marks the person finished and ends the call. Returns None.
        """
        pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation)
        if pubs is not None and len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print "[*] Mark Person as Finished '%s'." % self.person
            return

        self.show(pubs)
        all_models = Extractor.getInstance().getNodesByPersonName(self.person.names)

        print 'all models:'
        # Guarded because all_models may be None (checked again below); the
        # unguarded loop raised TypeError before that check could run.
        if all_models is not None:
            for model in all_models:
                print model
        raw_input()  # pause until the operator presses Enter

        # Defaults for the no-models path, which used to leave pubs_found
        # unbound and crash with NameError at the final self.show().
        pubs_found = []
        pubs_notfound = pubs
        if all_models is not None:
            print "=" * 100
            (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
            if pubs_found is None or pubs_notfound is None:
                print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\
                    % self.person
                return
            print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person
            )
        print 'pubs found :'
        self.show(pubs_found)
        print 'done'

Example #6

0

Show file

File: _________test_extractor.py Project: yinonbaron/aminer-spider

    def test_debug_not_found(self):
        '''Debug Errors'''
        # Debug helper: creates a fixed set of candidate publications, derives a
        # query with Extractor.pinMaxQuery, downloads models via getNodesByPubs
        # and prints the publications PubMatcher did and did not match.
        # No parameters, no return value; everything is written to stdout.
        #
        # The banner now prints this method's own docstring. Reading
        # test_extractFromPage.__doc__ here labelled the run with another test's
        # text and raised AttributeError if that method was absent or had no doc.
        print '-TEST-:', self.test_debug_not_found.__doc__.strip()

        # Candidates differ only in title; all other constructor arguments are
        # the same constants. The repeated title is kept from the original list.
        titles = [
            'On the Use of Spreading Activation Methods in Automatic Information Retrieval',
            'Chairman\'s Message',
            'Introduction to Modern Information Retrieval',
            'Publications',
            'Die RISC-CISC Debatte',
            'Kryptologie',
            'Approximative Public-Key-Kryptosysteme',
            'Integritat in IT-Systemen',
            'Vollstandige Reduktionssysteme',
            'Approximative Public-Key-Kryptosysteme',
        ]
        pub_candidates = [
            Publication(-1, 2000, title, "pubkey", -1, "authors", -5)
            for title in titles
        ]

        matcher = PubMatcher.getInstance()
        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print '%s pub, query: %s' % (len(used_pubs), query)
        all_models = extractor.getNodesByPubs(used_pubs)
        # Use the matcher fetched above instead of a second getInstance() call.
        (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'

Example #7

0

Show file

File: debug.py Project: AlexLyj/aminer-spider

	def debug_pubs(self):
		'''Debug get by pub'''
		# NOTE: the docstring above is echoed in the banner at runtime, so it is
		# kept as-is; the description of the routine lives in these comments.
		#
		# Builds a query from the active candidate group, obtains models either
		# from the web (getNodesByPubs) or from a saved page in debug_pubs.txt,
		# dumps the models, then prints what PubMatcher found / did not find.
		# Takes no arguments and returns nothing.
		print '-TEST-:', self.debug_pubs.__doc__.strip()
		#----------------------------------------------------
		pub_candidates = []
		
		# group 1
#		pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))
		
		# group 2
#		pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))
		
		# group 3
#		pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

		# group 4
		pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5))
		
		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print '%s pub, query: %s' % (len(used_pubs), query)

		#
		# Get WEB PAGE
		#
		use_web = True # flip to False to replay the saved page in debug_pubs.txt
		if use_web:
			all_models = extractor.getNodesByPubs(used_pubs)
		else:
			# Read the saved page inside a with-block so the handle is closed
			# even if read() fails; the original file() handle was never closed.
			with open('debug_pubs.txt', 'r') as f:
				html = f.read()
			models = self.extractor.extract_from_source(html)
			# Calls Extractor's name-mangled private method __merge_into_extractedmap.
			all_models = self.extractor._Extractor__merge_into_extractedmap(None, models)

		print '\n- all_models ----------------------'
		if all_models is not None:
			for key, models in all_models.items():
				print key
				for model in models:
					print "\t", model
		else:
			print 'all_models is None'
		print '- all_models end ----------------------\n'

		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub
		print '- test done -'

Example #8

0

Show file

File: t_person_processer.py Project: yinonbaron/aminer-spider

    def process_person(self):
        """Match this person's pending publications and hand them to the store.

        Publications come from pubdao.getPublicationByPerson. An empty list
        marks the person finished. With more than 4 publications, models are
        fetched by person name and matched with PubMatcher.matchPub; with 4 or
        fewer, no lookup is made and every publication counts as not found.
        Found publications go to store.putToPubdbcache, the rest to
        store.putToPubCache. Returns None.
        """
        # NOTE(review): a None result passes the emptiness test below and then
        # fails at len(pubs) -- confirm whether the DAO can return None.
        pubs = self.pubdao.getPublicationByPerson(self.person.id,
                                                  self.extractor.generation)

        if pubs is not None and len(pubs) == 0:
            self.store.markPersonFinished(self.person)
            print "[*] Mark Person as Finished '%s'." % self.person
            return

        print "$Ex/get:> person '%s' has %d papers to crawl" % (
            self.person.names, len(pubs))

        if len(pubs) <= 4:
            # 4 or fewer: skip the by-person lookup entirely.
            pubs_found, pubs_notfound = [], pubs
        else:
            all_models = Extractor.getInstance().getNodesByPersonName(
                self.person.names)
            if all_models is None:
                # Nothing fetched: no matches, everything stays unresolved.
                pubs_found, pubs_notfound = None, pubs
            else:
                print "=" * 100
                matcher = PubMatcher.getInstance()
                (pubs_found, pubs_notfound) = matcher.matchPub(pubs, all_models)
                if pubs_found is None or pubs_notfound is None:
                    print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\
                     % self.person
                    return
                n_found = len(pubs_found)
                n_notfound = len(pubs_notfound)
                print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
                    n_found, n_notfound, n_found + n_notfound, self.person)

        if pubs_found is not None:
            for matched in pubs_found:
                self.store.putToPubdbcache(matched)
                print "{-A}[%4s] %s" % (matched.ncitation, matched)

        if pubs_notfound is not None:
            for pending in pubs_notfound:
                self.store.putToPubCache(self.person, pending)

Example #9

0

Show file

File: t_person_processer.py Project: AlexLyj/aminer-spider

	def process_person(self):
		"""Match this person's pending publications and hand them to the store.

		Publications are read with pubdao.getPublicationByPerson. An empty list
		marks the person finished. More than 4 publications trigger a lookup by
		person name followed by PubMatcher.matchPub; 4 or fewer skip the lookup
		and are all treated as not found. Found publications are passed to
		store.putToPubdbcache, the others to store.putToPubCache. Returns None.
		"""
		# NOTE(review): a None result passes the emptiness test below and then
		# fails at len(pubs) -- confirm whether the DAO can return None.
		pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation)

		if pubs is not None and len(pubs) == 0:
			self.store.markPersonFinished(self.person)
			print "[*] Mark Person as Finished '%s'." % self.person
			return

		print "$Ex/get:> person '%s' has %d papers to crawl" % (self.person.names, len(pubs))

		if len(pubs) <= 4:
			# 4 or fewer: no by-person lookup is attempted.
			pubs_found, pubs_notfound = [], pubs
		else:
			all_models = Extractor.getInstance().getNodesByPersonName(self.person.names)
			if all_models is None:
				# Nothing fetched: no matches, everything stays unresolved.
				pubs_found, pubs_notfound = None, pubs
			else:
				print "=" * 100
				matcher = PubMatcher.getInstance()
				(pubs_found, pubs_notfound) = matcher.matchPub(pubs, all_models)
				if pubs_found is None or pubs_notfound is None:
					print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\
						% self.person
					return
				n_found = len(pubs_found)
				n_notfound = len(pubs_notfound)
				print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % (
					n_found, n_notfound, n_found + n_notfound, self.person
				)

		if pubs_found is not None:
			for matched in pubs_found:
				self.store.putToPubdbcache(matched)
				print "{-A}[%4s] %s" % (matched.ncitation, matched)

		if pubs_notfound is not None:
			for pending in pubs_notfound:
				self.store.putToPubCache(self.person, pending)

Example #10

0

Show file

    def runOriginal(self):
        while self.extractor.running and not self.ask_to_stop:
            self.mark()
            self.extractor.wait_for_pause()  # wait if paused
            #			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs

            query, used_pubs = self.store.getFromPubQueue()  # get url and pubs
            if used_pubs is None or len(used_pubs) == 0:
                print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (
                    self.name, query, used_pubs)
                time.sleep(10)
                continue
            self.extractor.wait_for_pause()  # wait again

            with self.extractor.busy_semaphore_lock:
                self.extractor.busy_semaphore += 1
                self.extractor.busy_pub_semaphore += 1

            pubs_found = None
            pubs_notfound = None
            try:
                all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
                if all_models is not None:
                    (pubs_found,
                     pubs_notfound) = PubMatcher.getInstance().matchPub(
                         used_pubs, all_models)
                    if pubs_found is None or pubs_notfound is None:
                        print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
                        return
                    print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (
                        len(pubs_found), len(pubs_notfound), query)
                else:
                    pubs_notfound = used_pubs
            except Exception, e:
                ExceptionHelper.print_exec(e)
                print '-------------------------------------------------------'
                print 'query:', query
                print 'all_models', all_models
                print 'used_pubs', used_pubs
                print '-------------------------------------------------------'
                return
            finally:

Example #11

0

Show file

File: t_pub_processer.py Project: AlexLyj/aminer-spider

	def runOriginal(self):
		while self.extractor.running and not self.ask_to_stop:
			self.mark()
			self.extractor.wait_for_pause() # wait if paused
#			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs
			
			query, used_pubs = self.store.getFromPubQueue() # get url and pubs
			if used_pubs is None or len(used_pubs) == 0:
				print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs)
				time.sleep(10)
				continue
			self.extractor.wait_for_pause() # wait again

			with self.extractor.busy_semaphore_lock: 
				self.extractor.busy_semaphore += 1
				self.extractor.busy_pub_semaphore += 1

			pubs_found = None
			pubs_notfound = None
			try:
				all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
				if all_models is not None:
					(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
					if pubs_found is None or pubs_notfound is None:
						print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
						return
					print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query)
				else:
					pubs_notfound = used_pubs
			except Exception, e:
				ExceptionHelper.print_exec(e)
				print '-------------------------------------------------------'
				print 'query:', 	query
				print 'all_models', all_models
				print 'used_pubs', used_pubs
				print '-------------------------------------------------------'
				return
			finally:

Example #12

0

Show file

File: debug.py Project: AlexLyj/aminer-spider

	def __init__(self):
		"""Cache the collaborators used by the debug routines on this object."""
		# Extractor and PubMatcher are obtained through their getInstance()
		# accessors; PublicationDao is constructed fresh for this object.
		self.extractor = Extractor.getInstance()
		self.matcher = PubMatcher.getInstance()
		self.pubdao = PublicationDao()

Example #13

0

Show file

File: debug.py Project: yinonbaron/aminer-spider

    def debug_pubs(self):
        '''Debug get by pub'''
        # NOTE: the docstring above is echoed in the banner at runtime, so it is
        # left untouched; the description of the routine lives in these comments.
        #
        # Derives a query from the active candidate group, loads models either
        # from the web (getNodesByPubs) or from a saved page in debug_pubs.txt,
        # dumps the models, then prints what PubMatcher found / did not find.
        # Takes no arguments and returns nothing.
        print '-TEST-:', self.debug_pubs.__doc__.strip()
        #----------------------------------------------------
        pub_candidates = []

        # group 1
        #		pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))

        # group 2
        #		pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))

        # group 3
        #		pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

        # group 4
        pub_candidates.append(
            Publication(
                -1, 2000,
                'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices',
                "pubkey", -1, "Dusan Guller", -5))

        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print '%s pub, query: %s' % (len(used_pubs), query)

        #
        # Get WEB PAGE
        #
        use_web = True  # flip to False to replay the saved page in debug_pubs.txt
        if use_web:
            all_models = extractor.getNodesByPubs(used_pubs)
        else:
            # Read the saved page inside a with-block so the handle is closed
            # even if read() fails; the original file() handle was never closed.
            with open('debug_pubs.txt', 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            # Calls Extractor's name-mangled private method __merge_into_extractedmap.
            all_models = self.extractor._Extractor__merge_into_extractedmap(
                None, models)

        print '\n- all_models ----------------------'
        if all_models is not None:
            for key, models in all_models.items():
                print key
                for model in models:
                    print "\t", model
        else:
            print 'all_models is None'
        print '- all_models end ----------------------\n'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'

Example #14

0

Show file

File: debug.py Project: yinonbaron/aminer-spider

 def __init__(self):
     """Cache the collaborators used by the debug routines on this object."""
     # Extractor and PubMatcher are obtained through their getInstance()
     # accessors; PublicationDao is constructed fresh for this object.
     self.extractor = Extractor.getInstance()
     self.matcher = PubMatcher.getInstance()
     self.pubdao = PublicationDao()