def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def test_matchPub(self): self.extractor = Extractor().getInstance() pubdao = PublicationDao() person_id = 13419 person_name = 'jie tang' # Read sources from files all_models = {} for page in range(0, 3): filename = "".join((person_name, '_page_', str(page), '.html')) f = file(os.path.join(self.settings.source_dir, filename), 'r') html = f.read() models = self.extractor.extract_from_source(html) if models is not None: self.extractor._Extractor__merge_into_extractedmap( all_models, models) print 'Total found DEBUG %s items.' % len(all_models) # part 2 pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation) printout = False if printout: for key, models in all_models.items(): print key, " --> ", models print '===================' for pub in pubs: print pub (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models) print '- test done -', len(pubs_matched), len(pubs_not_matched) return pubs_not_matched
def integrate_test_pubs(self, pub_candidates): """ For Debug Errors """ print "- INTEGRATE TEST -:", self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print "Test %s pub, query: \n\t%s" % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print "-" * 100 for pub in pubs_found: print "[%s] %s" % (pub.ncitation, pub) print "-" * 100 for pub in pubs_notfound: print "[%s] %s" % ("-", pub) print "-" * 100 print "- test done -"
def integrate_test_pubs(self, pub_candidates): ''' For Debug Errors ''' print '- INTEGRATE TEST -:', self.integrate_test_pubs.__doc__.strip() extractor = Extractor.getInstance() matcher = PubMatcher.getInstance() # print queries query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print 'Test %s pub, query: \n\t%s' % (len(used_pubs), query) url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload( query) # url = URLCleaner.encodeUrlForDownload(url) print "\t", url # do all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True) # print out print '-' * 100 for pub in pubs_found: print '[%s] %s' % (pub.ncitation, pub) print '-' * 100 for pub in pubs_notfound: print '[%s] %s' % ('-', pub) print '-' * 100 print '- test done -'
def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-'
def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def test_fetchByPubs(self, pubs): '''Test use a list of pubs that not found in person search''' print '-- test fetchByPubs %s pubs', len(pubs) new_pubs = [] for pub in pubs: new_pubs.append((pub, 'jie tang')) extractor = Extractor() extractor.getNodesByPubs(new_pubs) print '- test done -'
def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap(None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in 
all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -'
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- END DEBUG -'
def test_matchPub(self): self.extractor = Extractor().getInstance() pubdao = PublicationDao() person_id = 13419 person_name = 'jie tang' # Read sources from files all_models = {} for page in range(0, 3): filename = "".join((person_name, '_page_', str(page), '.html')) f = file(os.path.join(self.settings.source_dir, filename), 'r') html = f.read() models = self.extractor.extract_from_source(html) if models is not None: self.extractor._Extractor__merge_into_extractedmap(all_models, models) print 'Total found DEBUG %s items.' % len(all_models) # part 2 pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation) printout = False if printout: for key, models in all_models.items(): print key, " --> ", models print '===================' for pub in pubs: print pub (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models) print '- test done -', len(pubs_matched), len(pubs_not_matched) return pubs_not_matched
def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- END DEBUG -'
def update(self): pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return self.show(pubs) all_models = Extractor.getInstance().getNodesByPersonName( self.person.names) print 'all models:' for model in all_models: print model raw_input() if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person) else: pubs_notfound = pubs print 'pubs found :' self.show(pubs_found) print 'done'
def update(self): pubs = self.pubdao.getPublicationByPerson(self.person.id, self.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return self.show(pubs) all_models = Extractor.getInstance().getNodesByPersonName(self.person.names) print 'all models:' for model in all_models: print model raw_input() if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person ) else: pubs_notfound = pubs print 'pubs found :' self.show(pubs_found) print 'done'
def process_person(self): ''' real logic of process person ''' # all pubs need to update citation number. # totalPubCount = self.pubdao.getPersonPubCount(self.person.id) pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return print "$Ex/get:> person '%s' has %d papers to crawl" % ( self.person.names, len(pubs)) # by crawlByPerson, a lot of publication maybe found and update. pubs_found = None pubs_notfound = None if len(pubs) > 4: all_models = Extractor.getInstance().getNodesByPersonName( self.person.names) if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person) else: pubs_notfound = pubs else: pubs_found = [] pubs_notfound = pubs if pubs_found is not None: for pub in pubs_found: self.store.putToPubdbcache(pub) print "{-A}[%4s] %s" % (pub.ncitation, pub) if pubs_notfound is not None: for pub in pubs_notfound: self.store.putToPubCache(self.person, pub)
def runOriginal(self):
    # Worker loop: repeatedly takes a (query, pubs) item from the store's pub
    # queue, fetches models for those pubs and matches them.
    # NOTE(review): this definition is cut off after "finally:" in the visible
    # source, so the cleanup logic cannot be documented here.
    while self.extractor.running and not self.ask_to_stop:
        self.mark()
        self.extractor.wait_for_pause()  # wait if paused
        # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs
        query, used_pubs = self.store.getFromPubQueue()  # get url and pubs
        if used_pubs is None or len(used_pubs) == 0:
            # Nothing to process: report, back off for 10 seconds and retry.
            print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (
                self.name, query, used_pubs)
            time.sleep(10)
            continue
        self.extractor.wait_for_pause()  # wait again
        # Bump the busy counters while holding their lock.
        with self.extractor.busy_semaphore_lock:
            self.extractor.busy_semaphore += 1
            self.extractor.busy_pub_semaphore += 1
        pubs_found = None
        pubs_notfound = None
        try:
            all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
            if all_models is not None:
                (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
                    used_pubs, all_models)
                if pubs_found is None or pubs_notfound is None:
                    print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
                    # This return leaves the whole method, not just the
                    # current iteration.
                    return
                print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (
                    len(pubs_found), len(pubs_notfound), query)
            else:
                # Nothing fetched: every used pub counts as not found.
                pubs_notfound = used_pubs
        except Exception, e:
            # Dump the state that led to the failure, then stop the worker.
            ExceptionHelper.print_exec(e)
            print '-------------------------------------------------------'
            print 'query:', query
            print 'all_models', all_models
            print 'used_pubs', used_pubs
            print '-------------------------------------------------------'
            return
        finally:
            # NOTE(review): the finally body is missing from the visible source.
def process_person(self): ''' real logic of process person ''' # all pubs need to update citation number. # totalPubCount = self.pubdao.getPersonPubCount(self.person.id) pubs = self.pubdao.getPublicationByPerson(self.person.id, self.extractor.generation) if pubs is not None and len(pubs) == 0: self.store.markPersonFinished(self.person) print "[*] Mark Person as Finished '%s'." % self.person return print "$Ex/get:> person '%s' has %d papers to crawl" % (self.person.names, len(pubs)) # by crawlByPerson, a lot of publication maybe found and update. pubs_found = None pubs_notfound = None if len(pubs) > 4: all_models = Extractor.getInstance().getNodesByPersonName(self.person.names) if all_models is not None: print "=" * 100 (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) if pubs_found is None or pubs_notfound is None: print "[ERROR][-/-] person '%s', pubs_found is None or pubs_notfound is None, return"\ % self.person return print "{+A}[%s+%s=%s] Download by page, [found + not_found = total], person '%s'." % ( len(pubs_found), len(pubs_notfound), len(pubs_found) + len(pubs_notfound), self.person ) else: pubs_notfound = pubs else: pubs_found = [] pubs_notfound = pubs if pubs_found is not None: for pub in pubs_found: self.store.putToPubdbcache(pub) print "{-A}[%4s] %s" % (pub.ncitation, pub) if pubs_notfound is not None: for pub in pubs_notfound: self.store.putToPubCache(self.person, pub)
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) print "**:>> %s" % len(models) for model in models: print model.asDetailText(); print '-END TEST-'
def runOriginal(self):
    # Worker loop: repeatedly takes a (query, pubs) item from the store's pub
    # queue, fetches models for those pubs and matches them.
    # NOTE(review): this definition is cut off after "finally:" in the visible
    # source, so the cleanup logic cannot be documented here.
    while self.extractor.running and not self.ask_to_stop:
        self.mark()
        self.extractor.wait_for_pause()  # wait if paused
        # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs
        query, used_pubs = self.store.getFromPubQueue()  # get url and pubs
        if used_pubs is None or len(used_pubs) == 0:
            # Nothing to process: report, back off for 10 seconds and retry.
            print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs)
            time.sleep(10)
            continue
        self.extractor.wait_for_pause()  # wait again
        # Bump the busy counters while holding their lock.
        with self.extractor.busy_semaphore_lock:
            self.extractor.busy_semaphore += 1
            self.extractor.busy_pub_semaphore += 1
        pubs_found = None
        pubs_notfound = None
        try:
            all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
            if all_models is not None:
                (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
                if pubs_found is None or pubs_notfound is None:
                    print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
                    # This return leaves the whole method, not just the
                    # current iteration.
                    return
                print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query)
            else:
                # Nothing fetched: every used pub counts as not found.
                pubs_notfound = used_pubs
        except Exception, e:
            # Dump the state that led to the failure, then stop the worker.
            ExceptionHelper.print_exec(e)
            print '-------------------------------------------------------'
            print 'query:', query
            print 'all_models', all_models
            print 'used_pubs', used_pubs
            print '-------------------------------------------------------'
            return
        finally:
            # NOTE(review): the finally body is missing from the visible source.
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
def __init__(self):
    '''Store the Extractor obtained via getInstance() on self.extractor.'''
    # An Extractor is constructed first and getInstance() is then called on
    # that object.
    constructed = Extractor()
    self.extractor = constructed.getInstance()
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. 
However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige 
Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte 
Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
def getFromPubQueueBack(self):
    '''
    Take the next group of pubs to crawl from the loose pubs held in the
    store (several pubs are meant to be joined into one longest-possible
    query string used for crawling).

    Polls, sleeping self.mgr_interval between checks, while self.running is
    true and self.person_pub_map is empty. Candidate selection and all
    bookkeeping run while holding self.pub_lock.

    If an error occurs, None may be returned: every exception is caught and
    reported, after which the method falls off the end and yields a bare
    None instead of a tuple.

    @return: (query, used_pubs); (None, None) when no candidate was found.
    '''
    print_verbose = True
    try:
        # Block while there are no pub items.
        start = time.time()
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        dur = time.time() - start  # seconds spent waiting for work
        if print_verbose:
            # dur is measured in seconds; scale it so the number agrees with
            # the "ms" label and format it as a float instead of truncating
            # its string form.
            print('TimeUsed:%.4f ms, ' % (dur * 1000.0))
        start = time.time()

        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1  # count
            self.ppt_wait += dur
            self.ppt_getlock += (time.time() - start)

            # Select candidates.
            pub_candidates = []
            person_invalid = []  # persons to drop once iteration is over
            for personId, ids in self.person_pub_map.iteritems():
                # A person without ids has nothing to crawl: drop it.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                    continue

                valid_ids = 0
                for pubId in ids:
                    if print_verbose:
                        print('\tcandidate pub %s' % pubId)
                    if pubId in self.pubmap:
                        _pub = self.pubmap[pubId]
                        if _pub is not None:
                            pub_candidates.append(_pub)
                            valid_ids = valid_ids + 1
                            if print_verbose:
                                print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                            # Checked right after a successful append, so a
                            # person is only left early once one of its own
                            # pubs has been taken.
                            if len(pub_candidates) > 0:  # enough
                                if print_verbose:
                                    print('\tcandidates enough, length %s ' % len(pub_candidates))
                                break
                if valid_ids == 0:
                    # None of this person's pubs is usable: drop the person.
                    person_invalid.append(personId)

            # Deferred so the map is not mutated while it is being iterated.
            for personId in person_invalid:
                del self.person_pub_map[personId]

            # Return None if not available.
            if not pub_candidates:
                print('\t[ERR] Cannot be here. empty candidates. return null.')
                return None, None

            # Gen query. Only the first candidate is handed to pinMaxQuery.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.

            # Save nouse_pubs to dbcache, waiting to write to db.
            # NOTE(review): the outer loop keeps collecting one pub per
            # person, and every candidate after the first lands here without
            # being removed from self.pubmap - confirm this is intended.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub)
            return query, used_pubs
    except Exception as e:
        ExceptionHelper.print_exec(e)
        print('Exception occurred: %s. ' % e)
def getFromPubQueue(self):
    '''
    Take the next group of pubs to crawl from the loose pubs held in the
    store (several pubs are meant to be joined into one longest-possible
    query string used for crawling).

    Polls, sleeping self.mgr_interval between checks, while self.running is
    true and self.person_pub_map is empty. Candidate selection and all
    bookkeeping run while holding self.pub_lock. Persons that turn out to
    have no usable pub are removed from self.person_pub_map, and their pub
    ids are removed from self.pubmap.

    If an error occurs, None may be returned: every exception is caught and
    reported, after which the method falls off the end and yields a bare
    None instead of a tuple.

    @return: (query, used_pubs); (None, None) when no candidate was found.
    '''
    print_verbose = False
    try:
        # Block while there are no pub items.
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)

        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1

            pub_candidates = []
            person_invalid = []  # persons to drop once iteration is over
            for personId, ids in self.person_pub_map.iteritems():
                # A person without ids has nothing to crawl: drop it.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                    continue

                valid_ids = 0
                for pubId in ids:
                    if print_verbose:
                        print('\tcandidate pub %s' % pubId)
                    if pubId in self.pubmap:
                        _pub = self.pubmap[pubId]
                        if _pub:
                            pub_candidates.append(_pub)
                            valid_ids += 1
                            if print_verbose:
                                print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                            # Checked right after a successful append, so a
                            # person is only left early once one of its own
                            # pubs has been taken.
                            if len(pub_candidates) > 0:  # enough
                                if print_verbose:
                                    print('\tcandidates enough, length %s ' % len(pub_candidates))
                                break
                if valid_ids == 0:
                    # None of this person's pubs is usable: drop the person.
                    person_invalid.append(personId)

            # Deferred so the map is not mutated while it is being iterated.
            for personId in person_invalid:
                # ids is None for some invalid persons (see the check above);
                # iterating None would raise, get swallowed by the handler
                # below and leave the person queued forever.
                for pubId in self.person_pub_map[personId] or ():
                    if pubId in self.pubmap:
                        del self.pubmap[pubId]
                del self.person_pub_map[personId]

            # Return None if not available.
            if not pub_candidates:
                print('\t[store] Cannot be here. empty candidates. return null.')
                return None, None

            # Gen query. Only the first candidate is handed to pinMaxQuery.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.

            # Save nouse_pubs to dbcache, waiting to write to db.
            # NOTE(review): the outer loop keeps collecting one pub per
            # person, and every candidate after the first lands here without
            # being removed from self.pubmap - confirm this is intended.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub)
            return query, used_pubs
    except Exception as e:
        ExceptionHelper.print_exec(e)
        print('Exception occurred: %s. ' % e)
def __init__(self):
    '''Attach the extractor, matcher and publication DAO to this instance.'''
    # Extractor and PubMatcher come from their getInstance() accessors;
    # the DAO is constructed directly.
    self.extractor = Extractor.getInstance()
    self.matcher = PubMatcher.getInstance()
    self.pubdao = PublicationDao()
class TestPubMatcher:
    '''Manual, print-based checks around PubMatcher (nothing is asserted).'''

    def __init__(self):
        self.matcher = PubMatcher()

    #
    # Test
    #
    def test_matchPub(self):
        '''
        Match one person's stored publications against models extracted
        from saved result pages.

        Reads "<person_name>_page_<n>.html" for n = 0..2 from
        settings.source_dir, merges what the extractor finds in each page,
        loads the person's publications through PublicationDao and runs the
        matcher over both.

        @return: the publications that could not be matched.
        '''
        self.extractor = Extractor().getInstance()
        # Nothing in this class assigns self.settings; use one attached by
        # the caller if present, otherwise the shared Settings instance.
        settings = getattr(self, 'settings', None)
        if settings is None:
            settings = Settings.getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'

        # Read sources from files
        all_models = {}
        for page in range(0, 3):
            filename = "".join((person_name, '_page_', str(page), '.html'))
            # Close every page file as soon as it has been read.
            with open(os.path.join(settings.source_dir, filename), 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                self.extractor._Extractor__merge_into_extractedmap(all_models, models)
        print('Total found DEBUG %s items.' % len(all_models))

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id, settings.generation)
        printout = False
        if printout:
            for key, models in all_models.items():
                print('%s %s %s' % (key, " --> ", models))
            print('===================')
            for pub in pubs:
                print(pub)

        # matchPub lives on the matcher; this class has no such method.
        (pubs_matched, pubs_not_matched) = self.matcher.matchPub(pubs, all_models)
        print('%s %s %s' % ('- test done -', len(pubs_matched), len(pubs_not_matched)))
        return pubs_not_matched

    def test_fetchByPubs(self, pubs):
        '''
        Fetch nodes for a list of pubs that were not found by person search.

        @param pubs: publications to look up; each one is paired with the
                     fixed person name 'jie tang' before being passed on.
        '''
        # '%' (not ',') so the count is actually substituted into the text.
        print('-- test fetchByPubs %s pubs' % len(pubs))
        new_pubs = [(pub, 'jie tang') for pub in pubs]
        extractor = Extractor()
        extractor.getNodesByPubs(new_pubs)
        print('- test done -')

    def test_match_with_authors(self):
        '''
        Print the result of matchAuthors for sample author strings.

        Each sample pairs an abbreviated author line with a comma-separated
        list of full author names. Only data_debug is run.
        '''
        # Full sample set. It is not iterated below.
        data_test = (
            ('… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org',
             'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
            ('R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org',
             'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
            ('P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org',
             'Peter Lyngbak,William Kent'),
            ('W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com',
             'William Kent'),
            ('DE Neiman, DW Hildum, VR Lessef, T …',
             'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm'),
            ('M Esmaili, R Safavi-Naini, J Pieprzyk',
             'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk'),
            ('DH Fishman, J Annevelink, E Chow, T …',
             'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'),
        )
        # The single case that is actually run.
        data_debug = (
            ('DH Fishman, J Annevelink, E Chow, T …',
             'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'),
        )
        for ga, da in data_debug:
            print("match: %s \n with: %s \n is: %s" % (
                ga, da, self.matcher.matchAuthors(ga, da, debug_output=True)))
def debug_pubs(self):
    '''Debug get by pub'''
    # The docstring above is echoed verbatim by the banner print below, so
    # it is kept short; further notes live in comments.
    #
    # Builds a small list of Publication candidates, turns them into a query
    # with Extractor.pinMaxQuery, obtains the extracted models (from the web,
    # or from debug_pubs.txt when use_web is switched off), dumps them, and
    # prints which pubs the matcher found and which it did not.
    print('%s %s' % ('-TEST-:', self.debug_pubs.__doc__.strip()))

    #----------------------------------------------------
    pub_candidates = []
    # group 1
    # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
    # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
    # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
    # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))
    # group 2
    # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
    # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))
    # group 3
    # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))
    # group 4
    pub_candidates.append(
        Publication(
            -1, 2000,
            'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices',
            "pubkey", -1, "Dusan Guller", -5))

    extractor = Extractor.getInstance()
    # NOTE(review): other call sites in this file unpack three values
    # (query, used_pubs, nouse_pubs) from pinMaxQuery - confirm the arity
    # expected here.
    query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
    print('%s pub, query: %s' % (len(used_pubs), query))

    #
    # Get WEB PAGE
    #
    use_web = True  # ***************
    if use_web:
        all_models = extractor.getNodesByPubs(used_pubs)
    else:
        # Offline path: parse a previously saved page. The handle is closed
        # as soon as the content has been read.
        with open('debug_pubs.txt', 'r') as f:
            html = f.read()
        models = self.extractor.extract_from_source(html)
        all_models = self.extractor._Extractor__merge_into_extractedmap(None, models)

    print('\n- all_models ----------------------')
    if all_models is not None:
        for key, models in all_models.items():
            print(key)
            for model in models:
                # No separating space: the Python 2 print statement does not
                # insert one after an item ending in a tab.
                print('%s%s' % ("\t", model))
    else:
        print('all_models is None')
    print('- all_models end ----------------------\n')

    (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
    for pub in pubs_found:
        print('%s %s' % ('pubs found', pub))
    print('-' * 100)
    for pub in pubs_notfound:
        print('%s %s' % ('not found', pub))
    print('- test done -')
def __init__(self):
    '''Attach the extractor and the settings object to this instance.'''
    # NOTE(review): an Extractor is instantiated and getInstance() is then
    # called on that instance; other call sites in this file call
    # Extractor.getInstance() on the class - confirm which is intended.
    self.extractor = Extractor().getInstance()
    self.settings = Settings.getInstance()
class TestPubMatcher:
    '''Manual, print-based checks around PubMatcher (nothing is asserted).'''

    def __init__(self):
        self.matcher = PubMatcher()

    #
    # Test
    #
    def test_matchPub(self):
        '''
        Match one person's stored publications against models extracted
        from saved result pages.

        Reads "<person_name>_page_<n>.html" for n = 0..2 from
        settings.source_dir, merges what the extractor finds in each page,
        loads the person's publications through PublicationDao and runs the
        matcher over both.

        @return: the publications that could not be matched.
        '''
        self.extractor = Extractor().getInstance()
        # Nothing in this class assigns self.settings; use one attached by
        # the caller if present, otherwise the shared Settings instance.
        settings = getattr(self, 'settings', None)
        if settings is None:
            settings = Settings.getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'

        # Read sources from files
        all_models = {}
        for page in range(0, 3):
            filename = "".join((person_name, '_page_', str(page), '.html'))
            # Close every page file as soon as it has been read.
            with open(os.path.join(settings.source_dir, filename), 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                self.extractor._Extractor__merge_into_extractedmap(all_models, models)
        print('Total found DEBUG %s items.' % len(all_models))

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id, settings.generation)
        printout = False
        if printout:
            for key, models in all_models.items():
                print('%s %s %s' % (key, " --> ", models))
            print('===================')
            for pub in pubs:
                print(pub)

        # matchPub lives on the matcher; this class has no such method.
        (pubs_matched, pubs_not_matched) = self.matcher.matchPub(pubs, all_models)
        print('%s %s %s' % ('- test done -', len(pubs_matched), len(pubs_not_matched)))
        return pubs_not_matched

    def test_fetchByPubs(self, pubs):
        '''
        Fetch nodes for a list of pubs that were not found by person search.

        @param pubs: publications to look up; each one is paired with the
                     fixed person name 'jie tang' before being passed on.
        '''
        # '%' (not ',') so the count is actually substituted into the text.
        print('-- test fetchByPubs %s pubs' % len(pubs))
        new_pubs = [(pub, 'jie tang') for pub in pubs]
        extractor = Extractor()
        extractor.getNodesByPubs(new_pubs)
        print('- test done -')

    def test_match_with_authors(self):
        '''
        Print the result of matchAuthors for sample author strings.

        Each sample pairs an abbreviated author line with a comma-separated
        list of full author names. Only data_debug is run.
        '''
        # Full sample set. It is not iterated below.
        data_test = (
            ('… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org',
             'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
            ('R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org',
             'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'),
            ('P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org',
             'Peter Lyngbak,William Kent'),
            ('W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com',
             'William Kent'),
            ('DE Neiman, DW Hildum, VR Lessef, T …',
             'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm'),
            ('M Esmaili, R Safavi-Naini, J Pieprzyk',
             'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk'),
            ('DH Fishman, J Annevelink, E Chow, T …',
             'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'),
        )
        # The single case that is actually run.
        data_debug = (
            ('DH Fishman, J Annevelink, E Chow, T …',
             'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'),
        )
        for ga, da in data_debug:
            print("match: %s \n with: %s \n is: %s" % (
                ga, da, self.matcher.matchAuthors(ga, da, debug_output=True)))