def persist_file_references(self, file_path):
    """
    Parses the references contained in a file and persists each one of
    them to the database as its own extraction.

    The 'reference_id', 'abstract', 'reference_type' and 'url' fields are
    removed from every reference before it is stored. The value of the
    removed 'url' field is used as the location of the extraction's result;
    when the reference has no such field, 'file_path' is used instead.

    Returns the list of extractions, in the order in which the references
    were parsed.
    """
    gateway = ExtractionGateway()
    references = self._parse_entries_file(file_path)
    persisted = []

    for index, reference in enumerate(references):
        current = Extraction()

        # Drop the fields that should not be stored
        for unwanted in ('reference_id', 'abstract', 'reference_type'):
            reference.remove_field(unwanted)

        # 'url' is removed as well, but its value is kept as the source of
        # the result (falling back to the path of the imported file)
        url_field = reference.remove_field('url')
        location = url_field.value if url_field else file_path
        current.used_result = SearchResult('', location)

        # The extraction is labelled with the position of the reference
        # and the name of the file (last component of the path)
        file_name = file_path.rsplit('/', 1)[-1]
        label = unicode('Reference %d from %s' % (index, file_name))
        current.file_path = label

        current.entries.append(reference)
        persisted.append(current)
        gateway.persist_extraction(current)
        log.info('Imported ' + label.lower()) #@UndefinedVariable

    return persisted
def find_extraction_by_id(self, e_id):
    """
    Looks up the stored extraction whose id is 'e_id' and returns it as an
    'Extraction' object.

    Only the id, the query string and the result of the stored row are
    copied to the returned object.

    NOTE(review): assuming 'self.session' is an SQLAlchemy session, 'one()'
    raises when no row, or more than one row, matches the id -- confirm.
    """
    stored_extractions = self.session.query(mappers.Extraction)
    row = stored_extractions.filter_by(id=e_id).one()

    found = Extraction()
    found.id = row.id
    found.used_query = row.query_string
    # Only the second argument of the result is available from the row
    found.used_result = SearchResult("", row.result)
    return found
def run(self):
    """
    Runs until 'stop_event' is set.

    Takes files from 'in_queue' and supplies them, together with
    'target_format', to a new 'ReferenceMaker' object. Whatever
    'make_reference' returns is put in 'out_queue'. If an exception is
    raised while processing a file, the error is logged and an empty
    'Extraction' is put in 'out_queue' instead, so that there is still one
    output item for that file.

    Falsy items taken from 'in_queue' are skipped and produce no output.
    """
    log.debug("Running thread", extra={'threadname': self.getName()}) #@UndefinedVariable

    # Seconds to wait for a file before checking 'stop_event' again.
    # Blocking with a short timeout (instead of polling 'empty()' and
    # doing a non-blocking 'get') keeps the loop from spinning while the
    # queue is empty.
    poll_timeout = 0.1

    while not self.stop_event.isSet():
        try:
            path = self.in_queue.get(True, poll_timeout)
        except Queue.Empty:
            continue

        if not path:
            continue

        log.debug("Processing file %s", path) #@UndefinedVariable
        try:
            reference = ReferenceMaker().make_reference(
                path, self.target_format)
            self.out_queue.put(reference)
        except Exception as e:
            # The arguments are handed to the logger unformatted, so the
            # message is built by the logging module and not inside this
            # handler.
            log.error( #@UndefinedVariable
                'Unexpected exception while extracting reference'
                ' for file %s: %s', path, e)
            self.out_queue.put(Extraction())
def make_reference(self, file, target_format):
    """
    Builds the reference of a file using the controllers, in this order:
    the content of the file is extracted as text, query strings are
    obtained from that text, top results are retrieved for the queries,
    and the reference entries are extracted from the top results. Every
    extracted entry is finally validated against the text.

    Returns an 'Extraction' object. When the content, the query strings or
    the top results come back empty, the extraction is returned as filled
    up to that point.
    """
    result = Extraction()
    result.file_path = file
    result.target_format = target_format
    log.info("Making reference for file: %s" % file) #@UndefinedVariable

    # Content extraction: nothing else can be done without the text
    content_ctrl = RCEController(self.factory)
    raw_text = content_ctrl.extract_content(file, FileFormat.TXT)
    if not raw_text:
        return result

    # Query strings
    result.query_strings = content_ctrl.get_query_strings(raw_text)
    if not result.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return result
    log.debug("Query strings %s" % str(result.query_strings)) #@UndefinedVariable

    # Information retrieval
    search_ctrl = IRController(self.factory)
    top_results, used_query = search_ctrl.get_top_results(
        result.query_strings)
    result.top_results = top_results
    result.used_query = used_query
    if not result.top_results:
        tried = len(result.query_strings)
        log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                  'after trying %d queries' % tried)
        return result

    # The query that was used is not kept among the remaining ones.
    # NOTE(review): 'remove' raises ValueError if the used query is not in
    # the list -- confirm that 'get_top_results' always returns one of the
    # queries it was given.
    result.query_strings.remove(result.used_query)
    log.debug("Used query %s" % str(result.used_query)) #@UndefinedVariable
    log.debug("Query returned %d top results" % len(result.top_results)) #@UndefinedVariable

    # Information extraction
    extraction_ctrl = IEController(self.factory, target_format)
    entries, used_result = extraction_ctrl.extract_reference(
        result.top_results, raw_text)
    result.entries = entries
    result.used_result = used_result

    # NOTE(review): same as above, 'remove' raises ValueError if the used
    # result is not one of the top results -- confirm what
    # 'extract_reference' returns when nothing could be extracted.
    result.top_results.remove(result.used_result)
    log.info("Used result: %s" % str(result.used_result)) #@UndefinedVariable

    # Validation of the extracted entries against the text of the file
    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in result.entries:
        validator.validate(entry, raw_text)

    return result