def update(self, indexer, iterable):
    """Update an index from a queryset."""
    self._open_index()
    for o in ifilter(indexer.should_index, iterable):
        # Clear a potential old object out of the index
        self.remove(o)

        # Create a new document to index.
        doc = PyLucene.Document()

        # Index the model identifier so we can easily deal with only models of a certain type
        doc.add(PyLucene.Field(MODEL_FIELD, str(o._meta), STORE_YES, UN_TOKENIZED))

        # Index the "identifier" (app_label.module_name.pk) for this object
        doc.add(PyLucene.Field(IDENTIFIER_FIELD, self.get_identifier(o), STORE_YES, INDEX_NO))

        # Index the default content for the object.
        # Don't actually store the complete contents; just index them.
        doc.add(PyLucene.Field(CONTENTS_FIELD, indexer.flatten(o), STORE_NO, TOKENIZED))

        # Index each field that needs to be individually searchable.
        for (name, value) in indexer.get_field_values(o).items():
            doc.add(PyLucene.Field(name, value, STORE_NO, TOKENIZED))

        self._index.addDocument(doc)
    self._close_index()
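The constants referenced above (MODEL_FIELD, IDENTIFIER_FIELD, CONTENTS_FIELD, STORE_YES, STORE_NO, INDEX_NO, TOKENIZED, UN_TOKENIZED) are defined elsewhere in the module this method comes from. A minimal sketch of plausible definitions, assuming they simply alias the PyLucene.Field enums used in the other examples; the field-name strings are illustrative guesses, not the original values:

import PyLucene

# Assumed aliases; the originating module defines its own names and values.
MODEL_FIELD = 'model'              # hypothetical field name
IDENTIFIER_FIELD = 'identifier'    # hypothetical field name
CONTENTS_FIELD = 'contents'        # hypothetical field name

STORE_YES = PyLucene.Field.Store.YES
STORE_NO = PyLucene.Field.Store.NO
INDEX_NO = PyLucene.Field.Index.NO
TOKENIZED = PyLucene.Field.Index.TOKENIZED
UN_TOKENIZED = PyLucene.Field.Index.UN_TOKENIZED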
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            #if not filename.endswith('.txt'):
            #    continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                file.close()
                doc = PyLucene.Document()
                doc.add(PyLucene.Field("name", filename,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                doc.add(PyLucene.Field("path", path,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                if len(contents) > 0:
                    doc.add(PyLucene.Field("contents", contents,
                                           PyLucene.Field.Store.YES,
                                           PyLucene.Field.Index.TOKENIZED))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
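An index built by indexDocs can be queried with the matching reader-side calls. A minimal sketch, assuming the classic pre-3.0 PyLucene API in which QueryParser takes (field, analyzer) and iterating a Hits result yields (rank, Document) pairs; search_docs and its arguments are illustrative, not part of the original sample:

import PyLucene

def search_docs(store_dir, query_text):
    # Open the same on-disk index the writer populated.
    store = PyLucene.FSDirectory.getDirectory(store_dir, False)
    searcher = PyLucene.IndexSearcher(store)
    analyzer = PyLucene.StandardAnalyzer()
    # Parse the user query against the tokenized "contents" field.
    query = PyLucene.QueryParser("contents", analyzer).parse(query_text)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    for rank, doc in hits:
        # "name" and "path" were stored un-tokenized, so they come back verbatim.
        print doc.get("path"), doc.get("name")
    searcher.close()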
def _create_empty_document(self):
    """create an empty document to be filled and added to the index later

    :return: the new document object
    :rtype: PyLucene.Document
    """
    return PyLucene.Document()
def create_index(self, arg):
    """ Post download setup callback for creating a lucene index """

    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)
    self.lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)

    count = 0
    urllist = []

    for urlobj in self._urldict.values():
        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()

        try:
            urllist.index(url)
            continue
        except ValueError:
            urllist.append(url)

        if not filename in self._downloaddict['_savedfiles']:
            continue

        data = ''
        moreinfo('Adding index for URL', url)

        if os.path.isfile(filename):
            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", filename,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(PyLucene.Field("path", url,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        if data and len(data) > 0:
            doc.add(PyLucene.Field("contents", data,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
        else:
            extrainfo("warning: no content in %s" % filename)

        self.lucene_writer.addDocument(doc)
        count += 1
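The commented-out PorterStemmerAnalyzer line above refers to a custom analyzer defined alongside this callback. A sketch along the lines of the stemming analyzer in the old PyLucene samples, assuming the GCJ-era behaviour where a plain Python object exposing tokenStream(fieldName, reader) can be passed where an Analyzer is expected:

class PorterStemmerAnalyzer(object):
    def tokenStream(self, fieldName, reader):
        # Tokenize, normalize case, stem, and drop English stop words.
        result = PyLucene.StandardTokenizer(reader)
        result = PyLucene.StandardFilter(result)
        result = PyLucene.LowerCaseFilter(result)
        result = PyLucene.PorterStemFilter(result)
        result = PyLucene.StopFilter(result, PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)
        return result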
def index_files(board, time_delta):
    store = PyLucene.FSDirectory.getDirectory(
        BOARDSPATH + board + '/' + RECENT_INDEX, True)
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    writer.setMaxFieldLength(1048576)  # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = PyLucene.Document()
            doc.add(PyLucene.Field("name", filename,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("owner", owner,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("title", title,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("contents", contents,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
            debug('adding ' + filename)

    writer.optimize()
    writer.close()
def add(self, message):
    doc = PyLucene.Document()

    for part in message.textParts():
        doc.add(PyLucene.Field('text',
                               part.translate({ord(u'@'): u' ',
                                               ord(u'-'): u' ',
                                               ord(u'.'): u' '}).encode('utf-8'),
                               PyLucene.Field.Store.NO,
                               PyLucene.Field.Index.TOKENIZED))

    for (k, v) in message.keywordParts().iteritems():
        doc.add(PyLucene.Field(k,
                               v.translate({ord(u'@'): u' ',
                                            ord(u'-'): u' ',
                                            ord(u'.'): u' '}).encode('utf-8'),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

    doc.add(PyLucene.Field('documentType', message.documentType(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.TOKENIZED))
    doc.add(PyLucene.Field('storeID', message.uniqueIdentifier(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
    doc.add(PyLucene.Field('sortKey', message.sortKey(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
    # Deprecated. use Field(name, value, Field.Store.YES, Field.Index.UN_TOKENIZED) instead

    self.writer.addDocument(doc)
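The un-tokenized storeID field gives this index its delete key. A sketch of the matching removal, assuming a Lucene 2.x-style IndexReader.deleteDocuments(Term) call; self.directory (the directory the writer was opened on) is an assumed attribute, not taken from the original class:

def remove(self, message):
    # Delete every document whose 'storeID' field equals this message's
    # identifier; works because storeID was indexed UN_TOKENIZED above.
    reader = PyLucene.IndexReader.open(self.directory)  # self.directory is assumed
    try:
        reader.deleteDocuments(
            PyLucene.Term('storeID', message.uniqueIdentifier()))
    finally:
        reader.close()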
def index_feed(self, feed, feed_data=None):
    """ Indexes the given feed """
    # remove any existing entries for this feed
    self.delete_existing_feed_docs(feed)

    writer = self.feed_modifier.get_writer()
    doc = lucene.Document()
    doc.add(lucene.Field('id', str(feed.id),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.UN_TOKENIZED))
    doc.add(lucene.Field('url', feed.xml_url,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.UN_TOKENIZED))
    if feed.channel_link:
        doc.add(lucene.Field('link', feed.channel_link,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.UN_TOKENIZED))
    if feed.title:
        doc.add(lucene.Field('title', feed.title,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.TOKENIZED))
    if feed.subtitle:
        doc.add(lucene.Field('subtitle', feed.subtitle,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.TOKENIZED))
    writer.addDocument(doc)
    writer.close()
    logging.info('Indexed Feed: %s' % feed.xml_url)

    writer = self.entry_modifier.get_writer()
    for entry in feed.get_entries():
        try:
            doc = lucene.Document()
            id = '%s:%s' % (feed.xml_url, entry.get('id', None))
            doc.add(lucene.Field('id', id,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.UN_TOKENIZED))
            doc.add(lucene.Field('feed_url', feed.xml_url,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.UN_TOKENIZED))
            if entry.get('title', None):
                doc.add(lucene.Field('title', entry['title'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.TOKENIZED))
            if entry.get('summary', None):
                doc.add(lucene.Field('summary', entry['summary'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.TOKENIZED))
            if entry.get('link', None):
                doc.add(lucene.Field('link', entry['link'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.UN_TOKENIZED))
            updated = parse_date(entry.get('updated', None))
            if updated:
                doc.add(lucene.Field('updated', updated.isoformat(' '),
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
            doc.add(lucene.Field('pickle', pickle.dumps(entry),
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            writer.addDocument(doc)
            logging.info('Indexed Feed Entry: %s' % (entry.get('title', None) or id))
        except Exception:
            # Skip entries that fail to index rather than aborting the whole feed.
            pass
    writer.close()
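Because index_feed stores the whole feed entry in the stored-but-unindexed 'pickle' field, a search hit can be rehydrated without re-fetching the feed. A minimal sketch; load_entry is an illustrative helper and doc is any lucene.Document returned by a search over this index:

import pickle

def load_entry(doc):
    # Rebuild the original entry dict that index_feed() pickled into the
    # stored-but-unindexed 'pickle' field.
    return pickle.loads(doc.get('pickle'))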
def create_index(self):
    """ Post download setup callback for creating a lucene index """

    info("Creating lucene index")

    count = 0
    urllist = []

    urldb = objects.datamgr.get_urldb()

    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)
    lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    lucene_writer.setMaxFieldLength(1048576)

    for node in urldb.preorder():
        urlobj = node.get()

        # Only index if web-page or document
        if not urlobj.is_webpage() and not urlobj.is_document():
            continue

        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()

        try:
            urllist.index(urlobj.index)
            continue
        except ValueError:
            urllist.append(urlobj.index)

        if not os.path.isfile(filename):
            continue

        data = ''
        extrainfo('Adding index for URL', url)

        try:
            data = unicode(open(filename).read(), 'iso-8859-1')
        except UnicodeDecodeError, e:
            data = ''

        try:
            doc = PyLucene.Document()
            doc.add(PyLucene.Field("name", 'file://' + filename,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("path", url,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            if data and len(data) > 0:
                doc.add(PyLucene.Field("contents", data,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.TOKENIZED))
            else:
                warning("warning: no content in %s" % filename)

            lucene_writer.addDocument(doc)
        except PyLucene.JavaError, e:
            print e
            continue
def _document_node(self, iba_node):
    d = PyLucene.Document()

    # Index the NID
    d.add(Field(COLUMN_NID, iba_node.nid,
                Field.Store.YES, Field.Index.UN_TOKENIZED))

    # Index the Names
    for name in iba_node.names:
        d.add(Field(COLUMN_NAME, name[0],
                    Field.Store.NO, Field.Index.TOKENIZED))

    # Index the Attributes
    for att in iba_node.attributes:
        # allowing search for nodes having a particular attribute type
        d.add(Field(COLUMN_ATTRIBUTE_TYPE_NID, att.type,
                    Field.Store.NO, Field.Index.TOKENIZED))
        # allowing the search of nodes with any attribute having a particular value
        d.add(Field(COLUMN_ATTRIBUTE_VALUE, att.value,
                    Field.Store.NO, Field.Index.TOKENIZED))
        # allowing the search of nodes having a particular attribute with a particular value
        d.add(Field(COLUMN_ATTRIBUTE_TYPE_NID, att.value,
                    Field.Store.NO, Field.Index.TOKENIZED))

    # Index the Statements
    for stat in iba_node.statements:
        for att in stat.attributes:
            # allowing the search of nodes having any predicate with the specified attribute type
            d.add(Field(COLUMN_PREDICATE_NID + COLUMN_ATTRIBUTE_TYPE_NID, att.type,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having the specified predicate with the specified attribute type of any value
            d.add(Field(stat.predicate + COLUMN_ATTRIBUTE_TYPE_NID, att.type,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having the specified predicate with any attribute type and any value
            d.add(Field(COLUMN_PREDICATE_NID, stat.predicate,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having any predicate with any attribute type of the specified value
            d.add(Field(COLUMN_PREDICATE_NID + COLUMN_ATTRIBUTE_TYPE_NID, att.value,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having any predicate with the specified attribute type and value
            d.add(Field(COLUMN_PREDICATE_NID + att.type, att.value,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having the specified predicate with any attribute type and the specified value
            d.add(Field(stat.predicate + COLUMN_ATTRIBUTE_TYPE_NID, att.value,
                        Field.Store.NO, Field.Index.TOKENIZED))
            # allowing the search of nodes having the specified predicate, attribute type, and value
            d.add(Field(stat.predicate + att.type, att.value,
                        Field.Store.NO, Field.Index.TOKENIZED))

    return d
def index_record(self, record, writer=None):
    # field, value, store?, index?, token?
    try:
        if not writer:
            had_writer = False
            writer = self.context.get_search_index_writer(False)
        else:
            had_writer = True

        study = Study(self.context, record.study_id)

        self.logger.debug('starting document')
        doc = PyLucene.Document()

        # First, we need to create a unique key so we can later delete
        # if necessary.  Will try simply uid for now.
        doc.add(PyLucene.Field('uid', str(record.uid), True, True, False))
        doc.add(PyLucene.Field('all', str(record.uid), True, True, False))

        # Second, save internal-use metadata.  These should probably
        # be x'd out at Query-time.
        doc.add(PyLucene.Field('record-status', str(record.status),
                               False, True, False))
        doc.add(PyLucene.Field('article-type', str(study.article_type),
                               False, True, False))

        source_catalog = self.context.get_source_catalog()
        complete_term_map = source_catalog.get_complete_mapping()
        mapped_metadata = record.get_mapped_metadata(complete_term_map)

        # First index all the non-multiple metadata fields
        for field in ('abstract', 'affiliation', 'issn', 'journal',
                      'pubdate', 'issue', 'pages', 'title', 'volume'):
            val = mapped_metadata.get(field, None)
            if val:
                doc.add(PyLucene.Field(field, val, False, True, True))
                doc.add(PyLucene.Field('all', val, False, True, True))

        # Be sure to index all of (abbrev, full title, issn) as "journal"
        issn = mapped_metadata.get('issn')
        if issn:
            j = Journal()
            j.load_from_issn(self.context, issn)
            no_dash = j.no_dash()
            self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' %
                              (j.journal_title, j.abbreviation, issn))
            doc.add(PyLucene.Field('journal', issn, False, True, True))
            doc.add(PyLucene.Field('journal', no_dash, False, True, True))
            doc.add(PyLucene.Field('all', issn, False, True, True))
            doc.add(PyLucene.Field('all', no_dash, False, True, True))
            if j.abbreviation:
                doc.add(PyLucene.Field('journal', j.abbreviation, False, True, True))
                doc.add(PyLucene.Field('all', j.abbreviation, False, True, True))
            if j.journal_title:
                doc.add(PyLucene.Field('journal', j.journal_title, False, True, True))
                doc.add(PyLucene.Field('all', j.journal_title, False, True, True))

        # If a page range is given, index the first page, assuming
        # the delimiter is '-'
        pages = mapped_metadata.get('pages', None)
        if pages and '-' in pages:
            first_page = pages[0:pages.index('-')]
            doc.add(PyLucene.Field('pages', first_page, False, True, True))
            doc.add(PyLucene.Field('all', first_page, False, True, True))

        # 'unique_identifier' must be specially treated because of the '_'
        val = mapped_metadata.get('unique_identifier', None)
        if val:
            doc.add(PyLucene.Field('unique-identifier', val, False, True, True))
            doc.add(PyLucene.Field('all', val, False, True, True))

        # Next, index all the possibly-multiple metadata fields.
        # Give these (especially for author and subject) a little
        # boost, less than for canary UMLS concepts.
        for field in ('author', 'grantnum', 'keyword', 'registrynum', 'subject'):
            vals = mapped_metadata.get(field, None)
            for val in vals:
                doc.add(PyLucene.Field(field, val, False, True, True))
                f = PyLucene.Field('all', val, False, True, True)
                f.setBoost(1.3)
                doc.add(f)

        # If at least one author name is available, index the first
        # author to support first-author searching.  Also, boost it
        # slightly higher than the other authors.
        authors = mapped_metadata.get('author', None)
        if authors:
            doc.add(PyLucene.Field('first-author', authors[0], False, True, True))
            f = PyLucene.Field('all', authors[0], False, True, True)
            f.setBoost(1.5)
            doc.add(f)

        # All the booleans
        for bool in ('has_outcomes', 'has_exposures', 'has_relationships',
                     'has_interspecies', 'has_exposure_linkage',
                     'has_outcome_linkage', 'has_genomic'):
            val = getattr(study, bool)
            # NOTE: I think lucene dislikes '_' in field names ??
            boolstr = bool.replace('_', '-')
            doc.add(PyLucene.Field(boolstr, str(int(val)), False, True, False))
            # NOTE: no need to add this to 'all'.  I think.

        # Now, all the UMLS concepts.  Simpler approach to
        # lucene "synonym injection", but it works!  Give it a
        # slightly bigger boost than keywords/subjects.
        for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'):
            # NOTE: I think lucene dislikes '_' in field names ??
            ctype_search = ctype.replace('_', '-')
            for val in getattr(study, ctype):
                concept = Concept(self.context, val.concept_id)
                for syn in concept.synonyms:
                    doc.add(PyLucene.Field(ctype_search, unicode(syn, 'latin-1'),
                                           False, True, True))
                    f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                                       False, True, True)
                    f.setBoost(2.0)
                    doc.add(f)

        # And, the locations
        gazeteer = self.context.get_gazeteer()
        locs = []
        for location in study.locations:
            feature = Feature(self.context, uid=location.feature_id)
            feature.load(self.context)
            if gazeteer.fips_codes.has_key((feature.country_code, feature.adm1)):
                region_name = gazeteer.fips_codes[(feature.country_code, feature.adm1)]
            else:
                region_name = ''
            full_name = '%s (%s, %s, %s)' % (
                feature.name,
                gazeteer.feature_codes[feature.feature_type],
                render_capitalized(region_name),
                render_capitalized(gazeteer.country_codes[feature.country_code]))
            doc.add(PyLucene.Field('location', unicode(full_name, 'latin-1'),
                                   False, True, True))
            doc.add(PyLucene.Field('all', unicode(full_name, 'latin-1'),
                                   False, True, True))

        # Finally, the methodologies
        for meth in study.methodologies:
            doc.add(PyLucene.Field('methodology', meth.get_study_type(text=True),
                                   False, True, True))
            doc.add(PyLucene.Field('all', meth.get_study_type(text=True),
                                   False, True, True))
            # And each exposure route term
            for route in meth.get_routes(True):
                doc.add(PyLucene.Field('exposure_route', route, False, True, True))
                doc.add(PyLucene.Field('all', route, False, True, True))

        writer.addDocument(doc)
        if not had_writer:
            writer.close()
    except Exception, e:
        self.logger.error('Failed to index record: %s', e)
        self.logger.error(traceback.format_exc())