def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- END DEBUG -'
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- END DEBUG -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def integrate_test_pubs(self, pub_candidates): """ For Debug Errors """ print "- INTEGRATE TEST -:", self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print "Test %s pub, query: \n\t%s" % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print "-" * 100 for pub in pubs_found: print "[%s] %s" % (pub.ncitation, pub) print "-" * 100 for pub in pubs_notfound: print "[%s] %s" % ("-", pub) print "-" * 100 print "- test done -"
def integrate_test_pubs(self, pub_candidates): ''' For Debug Errors ''' print '- INTEGRATE TEST -:', self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print 'Test %s pub, query: \n\t%s' % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload( query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print '-' * 100 for pub in pubs_found: print '[%s] %s' % (pub.ncitation, pub) print '-' * 100 for pub in pubs_notfound: print '[%s] %s' % ('-', pub) print '-' * 100 print '- test done -'
def update(self): pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return self.show(pubs) all_models = Extractor.getInstance().getNodesByPersonName( self.person.names) print 'all models:' for model in all_models: print model raw_input() if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person) else: pubs_notfound = pubs print 'pubs found :' self.show(pubs_found) print 'done'
def update(self): pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return self.show(pubs) all_models = Extractor.getInstance().getNodesByPersonName(self.person.names) print 'all models:' for model in all_models: print model raw_input() if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person ) else: pubs_notfound = pubs print 'pubs found :' self.show(pubs_found) print 'done'
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap(None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def process_person(self): ''' real logic of process person ''' # all pubs need to update citation number. # totalPubCount = self.pubdao.getPersonPubCount(self.person.id) pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return print "$Ex/get:> person '%s' has %d papers to crawl" % ( self.person.names, len(pubs)) # by crawlByPerson, a lot of publication maybe found and update. pubs_found = None pubs_notfound = None if len(pubs) > 4: all_models = Extractor.getInstance().getNodesByPersonName( self.person.names) if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person) else: pubs_notfound = pubs else: pubs_found = [] pubs_notfound = pubs if pubs_found is not None: for pub in pubs_found: self.store.putToPubdbcache(pub) print "{-A}[%4s] %s" % (pub.ncitation, pub) if pubs_notfound is not None: for pub in pubs_notfound: self.store.putToPubCache(self.person, pub)
def process_person(self): ''' real logic of process person ''' # all pubs need to update citation number. # totalPubCount = self.pubdao.getPersonPubCount(self.person.id) pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return print "$Ex/get:> person '%s' has %d papers to crawl" % (self.person.names, len(pubs)) # by crawlByPerson, a lot of publication maybe found and update. pubs_found = None pubs_notfound = None if len(pubs) > 4: all_models = Extractor.getInstance().getNodesByPersonName(self.person.names) if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person ) else: pubs_notfound = pubs else: pubs_found = [] pubs_notfound = pubs if pubs_found is not None: for pub in pubs_found: self.store.putToPubdbcache(pub) print "{-A}[%4s] %s" % (pub.ncitation, pub) if pubs_notfound is not None: for pub in pubs_notfound: self.store.putToPubCache(self.person, pub)
def runOriginal(self): while self.extractor.running and not self.ask_to_stop: self.mark() self.extractor.wait_for_pause() # wait if paused # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs query, used_pubs = self.store.getFromPubQueue() # get url and pubs if used_pubs is None or len(used_pubs) == 0: print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % ( self.name, query, used_pubs) time.sleep(10) continue self.extractor.wait_for_pause() # wait again with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_pub_semaphore += 1 pubs_found = None pubs_notfound = None try: all_models = Extractor.getInstance().getNodesByPubs(used_pubs) if all_models is not None: (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) if pubs_found is None or pubs_notfound is None: print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return' return print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % ( len(pubs_found), len(pubs_notfound), query) else: pubs_notfound = used_pubs except Exception, e: ExceptionHelper.print_exec(e) print '-------------------------------------------------------' print 'query:', query print 'all_models', all_models print 'used_pubs', used_pubs print '-------------------------------------------------------' return finally:
def runOriginal(self): while self.extractor.running and not self.ask_to_stop: self.mark() self.extractor.wait_for_pause() # wait if paused # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs query, used_pubs = self.store.getFromPubQueue() # get url and pubs if used_pubs is None or len(used_pubs) == 0: print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs) time.sleep(10) continue self.extractor.wait_for_pause() # wait again with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_pub_semaphore += 1 pubs_found = None pubs_notfound = None try: all_models = Extractor.getInstance().getNodesByPubs(used_pubs) if all_models is not None: (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) if pubs_found is None or pubs_notfound is None: print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return' return print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query) else: pubs_notfound = used_pubs except Exception, e: ExceptionHelper.print_exec(e) print '-------------------------------------------------------' print 'query:', query print 'all_models', all_models print 'used_pubs', used_pubs print '-------------------------------------------------------' return finally:
def __init__(self):
    '''Store an Extractor, a PubMatcher and a PublicationDao on the instance.'''
    # Both obtained through the classes' own getInstance() accessors.
    self.extractor = Extractor.getInstance()
    self.matcher = PubMatcher.getInstance()
    # A fresh DAO instance is created for this object.
    self.pubdao = PublicationDao()
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append( Publication( -1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap( None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'