def integrate_test_pubs(self, pub_candidates): """ For Debug Errors """ print "- INTEGRATE TEST -:", self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print "Test %s pub, query: \n\t%s" % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print "-" * 100 for pub in pubs_found: print "[%s] %s" % (pub.ncitation, pub) print "-" * 100 for pub in pubs_notfound: print "[%s] %s" % ("-", pub) print "-" * 100 print "- test done -"
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- END DEBUG -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def integrate_test_pubs(self, pub_candidates): ''' For Debug Errors ''' print '- INTEGRATE TEST -:', self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print 'Test %s pub, query: \n\t%s' % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload( query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print '-' * 100 for pub in pubs_found: print '[%s] %s' % (pub.ncitation, pub) print '-' * 100 for pub in pubs_notfound: print '[%s] %s' % ('-', pub) print '-' * 100 print '- test done -'
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- END DEBUG -'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap(None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append( Publication( -1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap( None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def getFromPubQueue(self):
    '''
    Take the next pub combination to fetch from the Store's loose pubs
    (a few pubs are pinned together into one longest query string).
    May return None if an error occurs.
    @return: (url, pubs[])
    '''
    print_verbose = False
    try:
        # Block until at least one person has queued pubs, or we are stopped.
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        # blocked_pub_t counts threads currently waiting for pub_lock.
        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1
            pub_candidates = []  # {pubId -> pub_with_person_name}, candidates
            person_invalid = []  # mark person that not valid, delete later
            for personId, ids in self.person_pub_map.iteritems():
                # if person with no ids, del this person.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                else:
                    valid_ids = 0
                    for pubId in ids:
                        if print_verbose:
                            print('\tcandidate pub %s' % pubId)
                        if pubId in self.pubmap:
                            _pub = self.pubmap[pubId]
                            if _pub:
                                pub_candidates.append(_pub)
                                valid_ids += 1
                                if print_verbose:
                                    print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                    # Stop at the first person that yielded any candidate.
                    if len(pub_candidates) > 0:
                        # enough
                        if print_verbose:
                            print('\tcandidates enough, length %s ' % len(pub_candidates))
                        break
                    if valid_ids == 0:
                        # means all pub of this person is not valid. just delete this person.
                        person_invalid.append(personId)
            # Drop invalid persons together with their queued pub entries.
            for personId in person_invalid:
                for pubId in self.person_pub_map[personId]:
                    if pubId in self.pubmap:
                        # print "[store](getFromPubQueue):delete pub(%s,[%s]) from pubmap, cause person(%s) " % (pubId, self.pubmap[pubId].ncitation, personId)
                        del self.pubmap[pubId]
                del self.person_pub_map[personId]
                # print "[store](getFromPubQueue):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))
            # return None if not available
            if pub_candidates is None or len(pub_candidates) == 0:
                print('\t[store] Cannot be here. empty candidates. return null.')
                return None, None
            # gen query
            # NOTE(review): pinMaxQuery returns 3 values here but 2 in the
            # debug helpers elsewhere in this file - confirm which signature
            # is current.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.
            # print "[store](getFromPubQueue):delete pub(%s, [%s]) from pubmap, now length %s " % (pub.id, pub.ncitation, len(self.pubmap))
            # Save nouse_pubs to dbcache, waiting to write to db.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub);
            return query, used_pubs
    except Exception, e:
        ExceptionHelper.print_exec(e)
        print ('Exception occurred: %s. ' % e)
def getFromPubQueueBack(self):
    '''
    Take the next pub combination to fetch from the Store's loose pubs
    (a few pubs are pinned together into one longest query string).
    Older/backup variant of getFromPubQueue with timing instrumentation.
    May return None if an error occurs.
    @return: (url, pubs[])
    '''
    print_verbose = True
    try:
        # block if no pub items.
        start = time.time()
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        dur = (time.time() - start)
        #print "---------============----------- get 1 wait %.4s" % dur
        if print_verbose:
            print('TimeUsed:%.4s ms, ' % dur)
        start = time.time()
        # blocked_pub_t counts threads currently waiting for pub_lock.
        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1
            # count: accumulate wait time and lock-acquisition time.
            self.ppt_wait += dur
            #print "---------============----------- get 3 getlock %.4s" % (time.time() - start)
            self.ppt_getlock += (time.time() - start)
            start = time.time()
            # select candidates
            pub_candidates = []  # {pubId -> pub_with_person_name}, candidates
            person_invalid = []  # mark person that not valid, delete later
            for personId, ids in self.person_pub_map.iteritems():
                # if person with no ids, del this person.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                else:
                    valid_ids = 0
                    for pubId in ids:
                        if print_verbose:
                            print('\tcandidate pub %s' % pubId)
                        if pubId in self.pubmap:
                            _pub = self.pubmap[pubId]
                            if _pub is not None:
                                pub_candidates.append(_pub)
                                valid_ids = valid_ids + 1
                                if print_verbose:
                                    print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                    # Stop at the first person that yielded any candidate.
                    if len(pub_candidates) > 0:
                        # enough
                        if print_verbose:
                            print('\tcandidates enough, length %s ' % len(pub_candidates))
                        break
                    if valid_ids == 0:
                        # means all pub of this person is not valid. just delete this person.
                        person_invalid.append(personId)
            # NOTE(review): unlike getFromPubQueue, this variant does not
            # remove the invalid persons' pubs from pubmap - confirm intended.
            for personId in person_invalid:
                del self.person_pub_map[personId]
                # print "[store](line 123):delete person(%s) from person_pub_map, now length %s " % (personId, len(self.person_pub_map))
            # return None if not available
            if pub_candidates is None or len(pub_candidates) == 0:
                print('\t[ERR] Cannot be here. empty candidates. return null.')
                return None, None
            # gen query
            # NOTE(review): pinMaxQuery returns 3 values here but 2 in the
            # debug helpers elsewhere in this file - confirm which signature
            # is current.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.
            # print "[store](line 134):delete pub(%s) from pubmap, now length %s " % (pub.id, len(self.pubmap))
            # Save nouse_pubs to dbcache, waiting to write to db.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub);
            return query, used_pubs
    except Exception, e:
        ExceptionHelper.print_exec(e)
        print ('Exception occurred: %s. ' % e)