def make_reference(self, file, target_format):
    """
    Runs the whole reference extraction pipeline for one file.

    The content of the file is extracted as text, query strings are
    obtained from that text, the queries are used to retrieve top results
    from a search engine, and the reference entries are extracted from
    those results and validated against the text.

    Returns the Extraction object. It is returned early, only partially
    filled, when no text, no query strings or no top results are obtained.
    """
    extraction = Extraction()
    extraction.file_path = file
    extraction.target_format = target_format

    start_message = "Making reference for file: %s" % file
    log.info(start_message) #@UndefinedVariable

    # Stage 1: raw content extraction
    content_controller = RCEController(self.factory)
    text = content_controller.extract_content(file, FileFormat.TXT)
    if not text:
        return extraction

    # Stage 2: query generation
    extraction.query_strings = content_controller.get_query_strings(text)
    if not extraction.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return extraction

    queries_message = "Query strings %s" % str(extraction.query_strings)
    log.debug(queries_message) #@UndefinedVariable

    # Stage 3: information retrieval
    search_controller = IRController(self.factory)
    found_results, chosen_query = search_controller.get_top_results(
        extraction.query_strings)
    extraction.top_results = found_results
    extraction.used_query = chosen_query

    if not extraction.top_results:
        failure_message = (
            'No top results to use with the available wrappers '
            'after trying %d queries' % len(extraction.query_strings))
        log.error(failure_message) #@UndefinedVariable
        return extraction

    # The query that produced the results is taken out of the list, so
    # query_strings is left holding only the queries that were not used.
    extraction.query_strings.remove(extraction.used_query)

    log.debug("Used query %s" % str(extraction.used_query)) #@UndefinedVariable
    log.debug("Query returned %d top results" %
              len(extraction.top_results)) #@UndefinedVariable

    # Stage 4: information extraction
    reference_controller = IEController(self.factory, target_format)
    found_entries, chosen_result = reference_controller.extract_reference(
        extraction.top_results, text)
    extraction.entries = found_entries
    extraction.used_result = chosen_result

    # Same bookkeeping as for the queries: drop the result that was used.
    extraction.top_results.remove(extraction.used_result)

    log.info("Used result: %s" % str(extraction.used_result)) #@UndefinedVariable

    # Stage 5: validation of every extracted entry against the text
    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in extraction.entries:
        validator.validate(entry, text)

    return extraction
def setUp(self):
    """
    Builds the fixture: a session created from the in-memory SQLite URL,
    an IRController built with a UtilFactory, and three quoted query
    strings stored in self.queries.
    """
    quoted_phrases = (
        '"We view these methods as tools which can be used"',
        '"believe that the information thus extracted"',
        '"can be used to extract useful information"',
    )

    util_factory = UtilFactory()
    self.session = create_session("sqlite:///:memory:", True)
    self.irc = IRController(util_factory)
    self.queries = list(quoted_phrases)
class TestIRController(unittest.TestCase):
    """
    Test case for IRController: retrieval of top results for a list of
    queries and sorting of search results.
    """

    def setUp(self):
        """
        Creates a session from the in-memory SQLite URL, the IRController
        under test and the query strings used by the retrieval tests.
        """
        factory = UtilFactory()
        self.session = create_session("sqlite:///:memory:", True)
        self.irc = IRController(factory)
        self.queries = [
            '"We view these methods as tools which can be used"',
            '"believe that the information thus extracted"',
            '"can be used to extract useful information"',
        ]

    def _populate_db(self):
        """
        Fills the session with five wrapper collections and eight wrappers,
        then flushes the session.
        """
        wg = WrapperGateway(self.session)

        # (field, url) of each collection, in creation order.
        collection_specs = (
            (u"a", u"url01"),
            (u"b", u"url01"),
            (u"a", u"url02"),
            (u"b", u"url02"),
            (u"a", u"url03"),
        )
        collections = []
        for field, url in collection_specs:
            collection = wg.new_wrapper_collection()
            collection.field = field
            collection.url = url
            collections.append(collection)

        # (index into collections, downvotes, upvotes, score) of each
        # wrapper, in creation order. The third collection (index 2)
        # intentionally gets no wrappers.
        wrapper_specs = (
            (0, 0, 3, 1.0),
            (0, 0, 2, 1.0),
            (1, 1, 1, 0.5),
            (3, 0, 3, 1.0),
            (4, 0, 2, 1.0),
            (4, 1, 1, 0.8),
            (4, 0, 3, 1.0),
            (4, 1, 1, 0.2),
        )
        for index, downvotes, upvotes, score in wrapper_specs:
            wrapper = wg.new_wrapper()
            wrapper.downvotes = downvotes
            wrapper.upvotes = upvotes
            wrapper.score = score
            collections[index].wrappers.append(wrapper)

        self.session.flush()

    def tearDown(self):
        """Nothing to clean up."""
        pass

    # The two methods below do not start with "test", so the default
    # unittest loader does not collect them.

    def xtest_get_top_results_wrong_search_engine(self):
        """Expects a falsy result when -1 is passed as second argument."""
        results = self.irc.get_top_results(self.queries, -1)
        self.assertFalse(results)

    def xtest_get_top_results(self):
        """Expects a truthy result for the fixture queries."""
        results = self.irc.get_top_results(self.queries)
        self.assertTrue(results)

    def test_sort_results(self):
        """
        Checks the output of _sort_results for three search results once
        the database has been populated.
        """
        self._populate_db()
        results = [
            SearchResult("", u"url01/dir"),
            SearchResult("", u"url05/someotherdir"),
            SearchResult("", u"http://portal.acm.org"),
        ]
        results = self.irc._sort_results(results)
        # assertEqual reports both values on failure, unlike a bare
        # truth check on the comparison.
        self.assertEqual(results, [u"http://portal.acm.org", u"url01"])