Example #1
    def update(self, indexer, iterable):
        """Update an index from a queryset."""
        self._open_index()

        for o in ifilter(indexer.should_index, iterable):
            # Clear a potential old object out of the index
            self.remove(o)

            # Create a new document to index.
            doc = PyLucene.Document()

            # Index the model identifier so we can easily deal with only models of a certain type
            doc.add(
                PyLucene.Field(MODEL_FIELD, str(o._meta), STORE_YES,
                               UN_TOKENIZED))

            # Index the "identifier" (app_label.module_name.pk) for this object
            doc.add(
                PyLucene.Field(IDENTIFIER_FIELD, self.get_identifier(o),
                               STORE_YES, INDEX_NO))

            # Index the default content for the object
            # Don't actually store the complete contents; just index them.
            doc.add(
                PyLucene.Field(CONTENTS_FIELD, indexer.flatten(o), STORE_NO,
                               TOKENIZED))

            # Index each field that needs to be individually searchable.
            for (name, value) in indexer.get_field_values(o).items():
                doc.add(PyLucene.Field(name, value, STORE_NO, TOKENIZED))

            self._index.addDocument(doc)

        self._close_index()
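
A minimal driver for this method would pair it with a concrete indexer and a queryset; in the sketch below, SearchEngine, article_indexer, and Article are hypothetical stand-ins, since only update()'s signature comes from the example above.

    # Hedged usage sketch -- SearchEngine, article_indexer and Article are
    # hypothetical; only update(indexer, iterable) is taken from the example.
    engine = SearchEngine()
    engine.update(article_indexer, Article.objects.all())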
Example #2
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             #if not filename.endswith('.txt'):
             #    continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 f = open(path)
                 contents = unicode(f.read(), 'iso-8859-1')
                 f.close()
                 doc = PyLucene.Document()
                 doc.add(
                     PyLucene.Field("name", filename,
                                    PyLucene.Field.Store.YES,
                                    PyLucene.Field.Index.UN_TOKENIZED))
                 doc.add(
                     PyLucene.Field("path", path, PyLucene.Field.Store.YES,
                                    PyLucene.Field.Index.UN_TOKENIZED))
                 if len(contents) > 0:
                     doc.add(
                         PyLucene.Field("contents", contents,
                                        PyLucene.Field.Store.YES,
                                        PyLucene.Field.Index.TOKENIZED))
                 else:
                     print "warning: no content in %s" % filename
                 writer.addDocument(doc)
             except Exception, e:
                 print "Failed in indexDocs:", e
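
indexDocs expects an already-open IndexWriter; a plausible caller, using only the PyLucene calls that appear elsewhere on this page (FSDirectory.getDirectory, IndexWriter, setMaxFieldLength, optimize, close), might look like this, where indexer stands in for an instance of the class above:

    import PyLucene

    store = PyLucene.FSDirectory.getDirectory("index", True)
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    writer.setMaxFieldLength(1048576)
    indexer.indexDocs("/path/to/docs", writer)  # 'indexer' is assumed
    writer.optimize()
    writer.close()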
Example #3
    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        :return: the new document object
        :rtype: PyLucene.Document
        """
        return PyLucene.Document()
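
A factory this small usually exists so subclasses or tests can swap the document type; a caller would fill the result with fields as in the other examples:

    # 'filename' stands in for whatever value is being indexed here.
    doc = self._create_empty_document()
    doc.add(PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))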
Example #4
def create_index(self, arg):
    """ Post download setup callback for creating a lucene index """

    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)

    self.lucene_writer = PyLucene.IndexWriter(store,
                                              PyLucene.StandardAnalyzer(),
                                              True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)

    count = 0

    urllist = []

    for urlobj in self._urldict.values():

        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()

        if url in urllist:
            continue
        urllist.append(url)

        if filename not in self._downloaddict['_savedfiles']: continue

        data = ''

        moreinfo('Adding index for URL', url)

        if os.path.isfile(filename):
            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

        doc = PyLucene.Document()
        doc.add(
            PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(
            PyLucene.Field("path", url, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        if data:
            doc.add(
                PyLucene.Field("contents", data, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        else:
            extrainfo("warning: no content in %s" % filename)

        self.lucene_writer.addDocument(doc)
        count += 1
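
Note that self.lucene_writer is left open when the loop ends; presumably a later callback finishes the index the way Example #5 does:

    # Hedged sketch of the missing teardown, mirroring Example #5:
    self.lucene_writer.optimize()
    self.lucene_writer.close()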
Example #5
def index_files(board, time_delta):
    store = PyLucene.FSDirectory.getDirectory(
        BOARDSPATH + board + '/' + RECENT_INDEX, True)
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    writer.setMaxFieldLength(1048576)  # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = PyLucene.Document()
            doc.add(
                PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("owner", owner, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("title", title, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("contents", contents, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
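
For completeness, a hedged search sketch against the index built above, assuming the classic PyLucene 2.x QueryParser/Hits API:

    store = PyLucene.FSDirectory.getDirectory(
        BOARDSPATH + board + '/' + RECENT_INDEX, False)
    searcher = PyLucene.IndexSearcher(store)
    query = PyLucene.QueryParser("contents",
                                 PyLucene.StandardAnalyzer()).parse(u'keyword')
    hits = searcher.search(query)
    for i in range(hits.length()):
        hit = hits.doc(i)
        print hit.get("title"), hit.get("owner")
    searcher.close()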
Example #6
    def add(self, message):
        doc = PyLucene.Document()
        for part in message.textParts():
            doc.add(
                PyLucene.Field(
                    'text',
                    part.translate({
                        ord(u'@'): u' ',
                        ord(u'-'): u' ',
                        ord(u'.'): u' '
                    }).encode('utf-8'), PyLucene.Field.Store.NO,
                    PyLucene.Field.Index.TOKENIZED))

        for (k, v) in message.keywordParts().iteritems():
            doc.add(
                PyLucene.Field(
                    k,
                    v.translate({
                        ord(u'@'): u' ',
                        ord(u'-'): u' ',
                        ord(u'.'): u' '
                    }).encode('utf-8'), PyLucene.Field.Store.YES,
                    PyLucene.Field.Index.TOKENIZED))
        doc.add(
            PyLucene.Field('documentType', message.documentType(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.TOKENIZED))

        doc.add(
            PyLucene.Field('storeID', message.uniqueIdentifier(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(
            PyLucene.Field('sortKey', message.sortKey(),
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))

        self.writer.addDocument(doc)
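
The translate tables map '@', '-' and '.' to spaces so that tokenization splits e-mail-like strings apart; on a Python 2 unicode string the effect is:

    >>> u'user@example-host.net'.translate(
    ...     {ord(u'@'): u' ', ord(u'-'): u' ', ord(u'.'): u' '})
    u'user example host net'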
Example #7
    def index_feed(self, feed, feed_data=None):
        """ Indexes the given feed """
        #remove any existing entries for this feed
        self.delete_existing_feed_docs(feed)

        writer = self.feed_modifier.get_writer()
        doc = lucene.Document()
        doc.add(
            lucene.Field('id', str(feed.id), lucene.Field.Store.YES,
                         lucene.Field.Index.UN_TOKENIZED))
        doc.add(
            lucene.Field('url', feed.xml_url, lucene.Field.Store.YES,
                         lucene.Field.Index.UN_TOKENIZED))
        if feed.channel_link:
            doc.add(
                lucene.Field('link', feed.channel_link, lucene.Field.Store.YES,
                             lucene.Field.Index.UN_TOKENIZED))
        if feed.title:
            doc.add(
                lucene.Field('title', feed.title, lucene.Field.Store.YES,
                             lucene.Field.Index.TOKENIZED))
        if feed.subtitle:
            doc.add(
                lucene.Field('subtitle', feed.subtitle, lucene.Field.Store.YES,
                             lucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)
        writer.close()
        logging.info('Indexed Feed: %s' % feed.xml_url)

        writer = self.entry_modifier.get_writer()
        for entry in feed.get_entries():
            try:
                doc = lucene.Document()
                entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
                doc.add(
                    lucene.Field('id', entry_id, lucene.Field.Store.YES,
                                 lucene.Field.Index.UN_TOKENIZED))
                doc.add(
                    lucene.Field('feed_url', feed.xml_url,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.UN_TOKENIZED))
                if entry.get('title', None):
                    doc.add(
                        lucene.Field('title', entry['title'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.TOKENIZED))
                if entry.get('summary', None):
                    doc.add(
                        lucene.Field('summary', entry['summary'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.TOKENIZED))
                if entry.get('link', None):
                    doc.add(
                        lucene.Field('link', entry['link'],
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.UN_TOKENIZED))
                updated = parse_date(entry.get('updated', None))
                if updated:
                    doc.add(
                        lucene.Field('updated', updated.isoformat(' '),
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                doc.add(
                    lucene.Field('pickle', pickle.dumps(entry),
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                writer.addDocument(doc)
                logging.info(
                    'Indexed Feed Entry: %s' %
                    (entry.get('title', None) or entry_id))
            except Exception, e:
                logging.error('Failed to index feed entry: %s' % e)
        writer.close()
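
Storing the pickled entry with Index.NO makes the raw feed entry retrievable at search time without being searchable; retrieval is then a one-liner ('doc' is assumed to be a lucene.Document returned by a search):

    # str() guards against Python 2 pickle.loads rejecting unicode input.
    entry = pickle.loads(str(doc.get('pickle')))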
Example #8
    def create_index(self):
        """ Post download setup callback for creating a lucene index """

        info("Creating lucene index")

        count = 0

        urllist = []

        urldb = objects.datamgr.get_urldb()

        storeDir = "index"
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        lucene_writer = PyLucene.IndexWriter(store,
                                             PyLucene.StandardAnalyzer(), True)
        lucene_writer.setMaxFieldLength(1048576)

        for node in urldb.preorder():
            urlobj = node.get()

            # Only index if web-page or document
            if not urlobj.is_webpage() and not urlobj.is_document(): continue

            filename = urlobj.get_full_filename()
            url = urlobj.get_full_url()

            if urlobj.index in urllist:
                continue
            urllist.append(urlobj.index)

            if not os.path.isfile(filename): continue

            data = ''

            extrainfo('Adding index for URL', url)

            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

            try:
                doc = PyLucene.Document()
                doc.add(
                    PyLucene.Field("name", 'file://' + filename,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
                doc.add(
                    PyLucene.Field("path", url, PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
                if data:
                    doc.add(
                        PyLucene.Field("contents", data,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.TOKENIZED))
                else:
                    warning("warning: no content in %s" % filename)

                lucene_writer.addDocument(doc)
            except PyLucene.JavaError, e:
                print e
                continue
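
The urllist membership test above is O(n) per URL; with a large crawl, a set is the idiomatic container for this de-duplication:

    # Hedged sketch: a set makes the duplicate check O(1).
    seen = set()
    for node in urldb.preorder():
        urlobj = node.get()
        if urlobj.index in seen:
            continue
        seen.add(urlobj.index)
        # ... index urlobj as above ...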
Example #9
    def _document_node(self, iba_node):
        d = PyLucene.Document()
        # Index the NID
        d.add(
            Field(COLUMN_NID, iba_node.nid, Field.Store.YES,
                  Field.Index.UN_TOKENIZED))

        # Index the Names
        for name in iba_node.names:
            d.add(
                Field(COLUMN_NAME, name[0], Field.Store.NO,
                      Field.Index.TOKENIZED))

        # Index the Attributes
        for att in iba_node.attributes:
            # allowing search for nodes having a particular attribute type
            d.add(
                Field(COLUMN_ATTRIBUTE_TYPE_NID, att.type, Field.Store.NO,
                      Field.Index.TOKENIZED))
            # allowing the search of nodes with any attribute having a particular value
            d.add(
                Field(COLUMN_ATTRIBUTE_VALUE, att.value, Field.Store.NO,
                      Field.Index.TOKENIZED))
            # allowing the search of nodes having a particular attribute with a particular value
            d.add(
                Field(att.type, att.value, Field.Store.NO,
                      Field.Index.TOKENIZED))

        # Index the Statements
        for stat in iba_node.statements:
            for att in stat.attributes:
                # allowing the search of nodes having any predicate with the specified attribute type
                d.add(
                    Field(COLUMN_PREDICATE_NID + COLUMN_ATTRIBUTE_TYPE_NID,
                          att.type, Field.Store.NO, Field.Index.TOKENIZED))
                # allowing the search of nodes having the specified predicate with the specified attribute type of any value
                d.add(
                    Field(stat.predicate + COLUMN_ATTRIBUTE_TYPE_NID, att.type,
                          Field.Store.NO, Field.Index.TOKENIZED))
                # allowing the search of nodes having the specified predicate with any attribute type and any value
                d.add(
                    Field(COLUMN_PREDICATE_NID, stat.predicate, Field.Store.NO,
                          Field.Index.TOKENIZED))

                # allowing the search of nodes having any predicate with any attribute type of the specified value
                d.add(
                    Field(COLUMN_PREDICATE_NID + COLUMN_ATTRIBUTE_TYPE_NID,
                          att.value, Field.Store.NO, Field.Index.TOKENIZED))
                # allowing the search of nodes having any predicate with the specified attribute type and value
                d.add(
                    Field(COLUMN_PREDICATE_NID + att.type, att.value,
                          Field.Store.NO, Field.Index.TOKENIZED))
                # allowing the search of nodes having the specified predicate with any attribute type and the specified value
                d.add(
                    Field(stat.predicate + COLUMN_ATTRIBUTE_TYPE_NID,
                          att.value, Field.Store.NO, Field.Index.TOKENIZED))
                # allowing the search of nodes having a specified predicate and attribute type and value
                d.add(
                    Field(stat.predicate + att.type, att.value, Field.Store.NO,
                          Field.Index.TOKENIZED))

        return d
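
The composed field names (stat.predicate + att.type and friends) are what make the per-predicate searches possible; a hedged query sketch using the standard TermQuery API, keeping in mind that these fields are TOKENIZED, so the value must match a single analyzed term:

    # 'searcher', 'predicate_nid' and 'att_type' are assumed to be in scope.
    query = PyLucene.TermQuery(
        PyLucene.Term(predicate_nid + att_type, u'value'))
    hits = searcher.search(query)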
Example #10
    def index_record(self, record, writer=None):
        # field, value, store?, index?, token?
        try:
            if not writer:
                had_writer = False
                writer = self.context.get_search_index_writer(False)
            else:
                had_writer = True

            study = Study(self.context, record.study_id)

            self.logger.debug('starting document')
            doc = PyLucene.Document()

            # First, we need to create a unique key so we can later delete
            # if necessary.  Will try simply uid for now.
            doc.add(PyLucene.Field('uid', str(record.uid), True, True, False))
            doc.add(PyLucene.Field('all', str(record.uid), True, True, False))

            # Second, save internal-use metadata.  These should probably
            # be x'd out at Query-time.
            doc.add(
                PyLucene.Field('record-status', str(record.status), False,
                               True, False))
            doc.add(
                PyLucene.Field('article-type', str(study.article_type), False,
                               True, False))

            source_catalog = self.context.get_source_catalog()
            complete_term_map = source_catalog.get_complete_mapping()
            mapped_metadata = record.get_mapped_metadata(complete_term_map)

            # First index all the non-multiple metadata fields
            for field in ('abstract', 'affiliation', 'issn', 'journal',
                          'pubdate', 'issue', 'pages', 'title', 'volume'):
                val = mapped_metadata.get(field, None)
                if val:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    doc.add(PyLucene.Field('all', val, False, True, True))

            # Be sure to index all of (abbrev, full title, issn) as "journal"
            issn = mapped_metadata.get('issn')
            if issn:
                j = Journal()
                j.load_from_issn(self.context, issn)
                no_dash = j.no_dash()
                self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \
                    (j.journal_title, j.abbreviation, issn))
                doc.add(PyLucene.Field('journal', issn, False, True, True))
                doc.add(PyLucene.Field('journal', no_dash, False, True, True))
                doc.add(PyLucene.Field('all', issn, False, True, True))
                doc.add(PyLucene.Field('all', no_dash, False, True, True))
                if j.abbreviation:
                    doc.add(
                        PyLucene.Field('journal', j.abbreviation, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.abbreviation, False, True,
                                       True))
                if j.journal_title:
                    doc.add(
                        PyLucene.Field('journal', j.journal_title, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.journal_title, False, True,
                                       True))

            # If a page range is given, index the first page, assuming
            # the delimiter is '-'
            pages = mapped_metadata.get('pages', None)
            if pages and '-' in pages:
                first_page = pages[0:pages.index('-')]
                doc.add(PyLucene.Field('pages', first_page, False, True, True))
                doc.add(PyLucene.Field('all', first_page, False, True, True))

            # 'unique_identifier' must be specially treated because
            # of the '_'
            val = mapped_metadata.get('unique_identifier', None)
            if val:
                doc.add(
                    PyLucene.Field('unique-identifier', val, False, True,
                                   True))
                doc.add(PyLucene.Field('all', val, False, True, True))

            # Next, index all the possibly-multiple metadata fields
            # Give these (especially for author and subject) a little
            # boost, less than for canary UMLS concepts
            for field in ('author', 'grantnum', 'keyword', 'registrynum',
                          'subject'):
                vals = mapped_metadata.get(field) or []
                for val in vals:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    f = PyLucene.Field('all', val, False, True, True)
                    f.setBoost(1.3)
                    doc.add(f)

            # If at least one author name is available, index the first
            # author to support first-author searching.  Also, boost it
            # slightly higher than the other authors.
            authors = mapped_metadata.get('author', None)
            if authors:
                doc.add(
                    PyLucene.Field('first-author', authors[0], False, True,
                                   True))
                f = PyLucene.Field('all', authors[0], False, True, True)
                f.setBoost(1.5)
                doc.add(f)

            # All the booleans
            for flag in ('has_outcomes', 'has_exposures', 'has_relationships',
                         'has_interspecies', 'has_exposure_linkage',
                         'has_outcome_linkage', 'has_genomic'):
                val = getattr(study, flag)
                # NOTE: I think lucene dislikes '_' in field names ??
                boolstr = flag.replace('_', '-')
                doc.add(
                    PyLucene.Field(boolstr, str(int(val)), False, True, False))
                # NOTE: no need to add this to 'all'.  I think.

            # Now, all the UMLS concepts.  Simpler approach to
            # lucene "synonym injection", but it works!  Give it
            # slightly bigger boost than keywords/subjects
            for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'):
                # NOTE: I think lucene dislikes '_' in field names ??
                ctype_search = ctype.replace('_', '-')
                for val in getattr(study, ctype):
                    concept = Concept(self.context, val.concept_id)
                    for syn in concept.synonyms:
                        doc.add(
                            PyLucene.Field(ctype_search,
                                           unicode(syn, 'latin-1'), False,
                                           True, True))
                        f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                                           False, True, True)
                        f.setBoost(2.0)
                        doc.add(f)

            # And, the locations
            gazeteer = self.context.get_gazeteer()
            locs = []
            for location in study.locations:
                feature = Feature(self.context, uid=location.feature_id)
                feature.load(self.context)
                if (feature.country_code, feature.adm1) in gazeteer.fips_codes:
                    region_name = gazeteer.fips_codes[(feature.country_code,
                                                       feature.adm1)]
                else:
                    region_name = ''
                full_name = '%s (%s, %s, %s)' % (
                    feature.name, gazeteer.feature_codes[feature.feature_type],
                    render_capitalized(region_name),
                    render_capitalized(
                        gazeteer.country_codes[feature.country_code]))
                doc.add(
                    PyLucene.Field('location', unicode(full_name, 'latin-1'),
                                   False, True, True))
                doc.add(
                    PyLucene.Field('all', unicode(full_name, 'latin-1'), False,
                                   True, True))

            # Finally, the methodologies
            for meth in study.methodologies:
                doc.add(
                    PyLucene.Field('methodology',
                                   meth.get_study_type(text=True), False, True,
                                   True))
                doc.add(
                    PyLucene.Field('all', meth.get_study_type(text=True),
                                   False, True, True))
                # And each exposure route term
                for route in meth.get_routes(True):
                    doc.add(
                        PyLucene.Field('exposure_route', route, False, True,
                                       True))
                    doc.add(PyLucene.Field('all', route, False, True, True))

            writer.addDocument(doc)
            if not had_writer:
                writer.close()
        except Exception, e:
            self.logger.error('Failed to index record: %s', e)
            self.logger.error(traceback.format_exc())
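
This example uses the old positional Field constructor flagged in the comment at the top (field, value, store?, index?, token?); the flags appear to map onto the Store/Index constants used elsewhere on this page roughly as follows (assumed mapping):

    # (store, index, token) -> presumed Field.Store / Field.Index equivalent
    # (True,  True,  False) -> Store.YES, Index.UN_TOKENIZED  e.g. the 'uid' field
    # (False, True,  True)  -> Store.NO,  Index.TOKENIZED     e.g. the 'all' fields
    # (False, True,  False) -> Store.NO,  Index.UN_TOKENIZED  e.g. the boolean flags
    # (False, False, -)     -> Store.NO,  Index.NO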