Example #1
 def prepare_chained_filter(self, dt1, dt2):
     """Return a chained filter."""
     return ChainedFilter([
         self.dup_filter,
         TermRangeFilter(
             'date_published', BytesRef(dt1.strftime(self.date_format)),
             BytesRef(dt2.strftime(self.date_format)), True, True)
     ], [ChainedFilter.AND, ChainedFilter.AND])
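A hedged usage sketch for the filter above: with the older (pre-Lucene-5) Filter API, the chained filter is passed alongside a query to the searcher. The `searcher` and `query` below are assumptions, not part of the original example.

    chained = self.prepare_chained_filter(dt1, dt2)
    top_docs = searcher.search(query, chained, 100)  # IndexSearcher.search(Query, Filter, int)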
Example #2
 def query_between_dates(self, dt1, dt2, original_query=None):
     '''Update the given query to only allow records between dt1 and dt2.'''
     return TermRangeQuery(
         'date_published',  # Field
         BytesRef(dt1.strftime(self.date_format)),  # Lower bound
         BytesRef(dt2.strftime(self.date_format)),  # Upper bound
         True,  # Include lower bound
         True  # Include upper bound
     )
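A short usage sketch for the query above, assuming an IndexSearcher named `searcher` and the stored `canonical_url` field from Example #3; the dates and result limit are illustrative.

    from datetime import datetime

    query = self.query_between_dates(datetime(2016, 1, 1), datetime(2016, 12, 31))
    top_docs = searcher.search(query, 50)         # IndexSearcher.search(Query, int)
    for score_doc in top_docs.scoreDocs:
        stored_doc = searcher.doc(score_doc.doc)  # load the stored fields
        print(stored_doc.get('canonical_url'))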
Example #3
 def index_one(self, article):
     """Create index for one url object in the database.
     """
     try:
         date_published_str = article['date_published'].strftime(
             self.date_format)
     except Exception as e:
         logger.warning('Error when formatting date_published %r: %s',
                        article['canonical_url'], e)
         return
     doc = Document()
     doc.add(StoredField('group_id', article['group_id']))
     doc.add(StoredField('article_id', article['article_id']))
     doc.add(
         StringField('date_published', date_published_str, Field.Store.YES))
     doc.add(
         SortedDocValuesField('date_published',
                              BytesRef(date_published_str)))
     doc.add(StoredField('date_published', date_published_str))
     doc.add(StringField('domain', article['domain'], Field.Store.YES))
     doc.add(StringField('site_type', article['site_type'],
                         Field.Store.YES))
     doc.add(
         TextField('canonical_url', article['canonical_url'],
                   Field.Store.YES))
     doc.add(TextField('title', article['title'], Field.Store.YES))
     doc.add(TextField('meta', article['meta'], Field.Store.NO))
     doc.add(TextField('content', article['content'], Field.Store.NO))
     doc.add(StoredField('uq_id_str', article['uq_id_str']))
     self.writer.addDocument(doc)
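The method above relies on a previously configured self.writer. A minimal setup sketch, assuming lucene.initVM() has already been called; the index path and analyzer choice are illustrative, not from the original code.

    from java.nio.file import Paths
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory

    directory = FSDirectory.open(Paths.get('lucene_index'))
    writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))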
Example #4
    def __init__(self, input):
        super(PayloadSetter, self).__init__(input)

        self.input = input
        self.payloadAtt = self.addAttribute(PayloadAttribute.class_)
        self.data = JArray('byte')(1)
        self.p = BytesRef(self.data, 0, 1)
Example #5
    def incrementToken(self):

        if self.input.incrementToken():
            bytes = JArray('byte')("pos: %d" %(self.pos))
            self.payloadAttr.setPayload(BytesRef(bytes))

            if self.pos == 0 or self.i % 2 == 1:
                posIncr = 1
            else:
                posIncr = 0

            self.posIncrAttr.setPositionIncrement(posIncr)
            self.pos += posIncr
            self.i += 1
            return True

        return False
Example #6
def index_document(writer, log):
    doc = Document()

    doc.add(SortedDocValuesField('host', BytesRef(log['host'])))
    doc.add(Field('host', log['host'], TextField.TYPE_STORED))
    doc.add(Field('client_user_name_if_available', log['client_user_name_if_available'], TextField.TYPE_STORED))
    date = handleDate(log['date_time'])
    doc.add(SortedDocValuesField('date_time', BytesRef(date)))
    doc.add(Field('date_time', date, StringField.TYPE_STORED))

    doc.add(SortedDocValuesField('method', BytesRef(log['method'])))
    # doc.add(FacetField('method', log['method']))
    doc.add(Field('method', log['method'], TextField.TYPE_STORED))

    doc.add(SortedDocValuesField('request_path', BytesRef(log['request_path'])))
    doc.add(Field('request_path', log['request_path'], TextField.TYPE_STORED))
    doc.add(SortedDocValuesField('protocol', BytesRef(log['protocol'])))
    doc.add(Field('protocol', log['protocol'], StringField.TYPE_STORED)) 
    doc.add(SortedDocValuesField('response_code', BytesRef(str(log['response_code']))))
    response_code = str(log['response_code']) if log['response_code'] else 'None'
    doc.add(Field('response_code_string', response_code, StringField.TYPE_STORED))
    if log['response_code'] is not None:  # IntPoint cannot index a missing value
        doc.add(IntPoint('response_code', log['response_code']))
    doc.add(SortedDocValuesField('content_size', BytesRef(log['content_size'])))
    doc.add(IntPoint('content_size', log['content_size']))
    doc.add(Field('request_referrer', log['request_referrer'], TextField.TYPE_NOT_STORED))
    doc.add(Field('request_user_agent', log['request_user_agent'], TextField.TYPE_NOT_STORED))
    doc.add(Field('router_name', log['router_name'], TextField.TYPE_NOT_STORED))
    doc.add(Field('server_url', log['server_url'], TextField.TYPE_STORED))
    doc.add(SortedDocValuesField('request_duration', BytesRef(log['request_duration'])))
    doc.add(IntPoint('request_duration', log['request_duration']))

    location = str(log['location']) if log['location'] else 'None'
    location_ascii_free = unicodedata.normalize('NFKD', location).encode('ascii','ignore').decode('ascii')
    doc.add(SortedDocValuesField('location', BytesRef(location_ascii_free)))
    # doc.add(Field('location_raw', location, StringField.TYPE_STORED))
    doc.add(Field('location', location, TextField.TYPE_STORED))

    writer.addDocument(doc)
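The SortedDocValuesField copies added above are what make those fields sortable at search time. A hedged sketch, assuming an IndexSearcher named `searcher` opened over this index:

    from org.apache.lucene.search import MatchAllDocsQuery, Sort, SortField

    sort_by_date = Sort(SortField('date_time', SortField.Type.STRING))
    hits = searcher.search(MatchAllDocsQuery(), 25, sort_by_date)  # search(Query, int, Sort)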
Example #7
 def termsForField(self, field, prefix=None, limit=10, **kwargs):
     convert = lambda term: term.utf8ToString()
     terms = []
     termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
     if termsEnum is None:
         return terms
     iterator = termsEnum.iterator(None)
     if prefix:
         iterator.seekCeil(BytesRef(prefix))
         terms.append((iterator.docFreq(), convert(iterator.term())))
     bytesIterator = BytesRefIterator.cast_(iterator)
     try:
         while len(terms) < limit:
             term = convert(bytesIterator.next())
             if prefix and not term.startswith(prefix):
                 break
             terms.append((iterator.docFreq(), term))
     except StopIteration:
         pass
     return terms
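A usage sketch for the helper above: it returns (docFreq, term) pairs, so a simple prefix suggester could look like this (the field name and prefix are illustrative).

    for doc_freq, term in self.termsForField('title', prefix='luc', limit=5):
        print('%s appears in %d documents' % (term, doc_freq))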
Example #8
    def _updateOaiRecord(self,
                         identifier,
                         setSpecs,
                         metadataPrefixes,
                         delete=False,
                         oldDoc=None,
                         deleteInSets=None,
                         deleteInPrefixes=None,
                         _overrideStamp=None):
        oldDoc = oldDoc or self._getDocument(identifier)
        doc, oldDeletedSets, oldDeletedPrefixes = self._getNewDocument(
            identifier, oldDoc=oldDoc)
        newStamp = _overrideStamp if self._importMode else self._newStamp()
        doc.add(LongPoint(STAMP_FIELD, int(newStamp)))
        doc.add(
            StoredField(STAMP_FIELD,
                        BytesRef(JArray('byte')(int_to_bytes(newStamp)))))
        doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, int(newStamp)))

        allMetadataPrefixes, allDeletedPrefixes = self._setMetadataPrefixes(
            doc=doc,
            metadataPrefixes=asSet(metadataPrefixes),
            delete=delete,
            deleteInPrefixes=asSet(deleteInPrefixes),
            oldDeletedPrefixes=oldDeletedPrefixes)

        allSets, allDeletedSets = self._setSets(doc=doc,
                                                setSpecs=setSpecs or [],
                                                delete=delete,
                                                deleteInSets=deleteInSets,
                                                oldDeletedSets=oldDeletedSets)
        if delete or (allDeletedSets and allSets == allDeletedSets
                      ) or allMetadataPrefixes == allDeletedPrefixes:
            doc.add(
                StringField(TOMBSTONE_FIELD, TOMBSTONE_VALUE, Field.Store.YES))

        self._writer.updateDocument(Term(IDENTIFIER_FIELD, identifier), doc)
        self._latestModifications.add(str(identifier))
        self.do.signalOaiUpdate(metadataPrefixes=allMetadataPrefixes,
                                sets=allSets,
                                stamp=newStamp)
Example #9
    def testSetPosition(self):

        class _tokenizer(PythonTokenizer):
            def __init__(_self):
                super(_tokenizer, _self).__init__()

                _self.TOKENS = ["1", "2", "3", "4", "5"]
                _self.INCREMENTS = [1, 2, 1, 0, 1]
                _self.i = 0
                _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
                _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
                _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

            def incrementToken(_self):
                if _self.i == len(_self.TOKENS):
                    return False

                _self.clearAttributes()
                _self.termAtt.append(_self.TOKENS[_self.i])
                _self.offsetAtt.setOffset(_self.i, _self.i)
                _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
                _self.i += 1

                return True

            def reset(_self):
                super(_tokenizer, _self).reset()
                _self.i = 0

        class _analyzer(PythonAnalyzer):
            def createComponents(_self, fieldName):
                return Analyzer.TokenStreamComponents(_tokenizer())
            def initReader(_self, fieldName, reader):
                return reader

        writer = self.getWriter(analyzer=_analyzer())

        d = Document()
        d.add(Field("field", "bogus", TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        reader = searcher.getIndexReader()
        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
        pos.nextDoc()
        # first token should be at position 0
        self.assertEqual(0, pos.nextPosition())

        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
        pos.nextDoc()
        # second token should be at position 2
        self.assertEqual(2, pos.nextPosition())

        b = PhraseQuery.Builder()
        b.add(Term("field", "1"))
        b.add(Term("field", "2"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # same as previous, just specify positions explicitly.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 1)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # specifying correct positions should find the phrase.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 2)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "3"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # phrase query would find it when correct positions are specified. 
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "4"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # phrase query should fail for a non-existent searched term
        # even if other searched terms exist at the same searched
        # position.
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "9"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # multi-phrase query should succeed for a non-existent searched term
        # because other searched terms exist at the same searched
        # position.

        b = MultiPhraseQuery.Builder()
        b.add([Term("field", "3"), Term("field", "9")], 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "4"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))
Example #10
 def collectLeaf(_self, postings, position, term):
     if postings.getPayload() is not None:
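         # the postings enum reuses its BytesRef, so keep a deep copy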
         _self.payloads.append(BytesRef.deepCopyOf(postings.getPayload()))
Example #11
    def testPayloadsPos0(self):

        writer = self.getWriter(analyzer=TestPayloadAnalyzer())

        doc = Document()
        doc.add(
            Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
        writer.addDocument(doc)
        reader = writer.getReader()
        writer.close()

        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getLiveDocs(reader),
                                              "content", BytesRef("a"))

        count = 0
        self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)
        # "a" occurs 4 times
        self.assertEqual(4, tp.freq())

        expected = 0
        self.assertEqual(expected, tp.nextPosition())
        self.assertEqual(1, tp.nextPosition())
        self.assertEqual(3, tp.nextPosition())
        self.assertEqual(6, tp.nextPosition())

        # only one doc has "a"
        self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

        searcher = self.getSearcher(reader=reader)

        stq1 = SpanTermQuery(Term("content", "a"))
        stq2 = SpanTermQuery(Term("content", "k"))
        sqs = [stq1, stq2]
        snq = SpanNearQuery(sqs, 30, False)

        count = 0
        sawZero = False
        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        while pspans.next():
            payloads = pspans.getPayload()
            sawZero |= pspans.start() == 0

            it = payloads.iterator()
            while it.hasNext():
                count += 1
                it.next()

        self.assertEqual(5, count)
        self.assert_(sawZero)

        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        count = 0
        sawZero = False
        while spans.next():
            count += 1
            sawZero |= spans.start() == 0

        self.assertEqual(4, count)
        self.assert_(sawZero)

        sawZero = False
        psu = PayloadSpanUtil(searcher.getTopReaderContext())
        pls = psu.getPayloadsForQuery(snq)
        count = pls.size()
        it = pls.iterator()
        while it.hasNext():
            bytes = JArray('byte').cast_(it.next())
            s = bytes.string_
            sawZero |= s == "pos: 0"

        self.assertEqual(5, count)
        self.assert_(sawZero)
Example #12
    def index_docs(self, log_interval=100000):
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS)

        t2_tk = FieldType()
        t2_tk.setStored(True)
        t2_tk.setTokenized(True)
        t2_tk.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t3_tkfp = FieldType()
        t3_tkfp.setStored(True)
        t3_tkfp.setTokenized(True)
        t3_tkfp.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        bin_dv_ft = FieldType()
        bin_dv_ft.setDocValuesType(DocValuesType.BINARY)
        bin_dv_ft.setStored(True)
        bin_dv_ft.setIndexOptions(IndexOptions.DOCS)

        num_paragraphs = 0
        num_empty_paragraphs = 0
        no_entity_paragraphs = 0
        num_avg_entities = 0

        # get docs from our sqlite db
        for d_idx, doc_id in enumerate(self.doc_ids):
            doc_p_ents = self.wiki_db.get_doc_p_ents(doc_id)

            assert doc_p_ents

            doc_dict = json.loads(doc_p_ents)

            paragraphs = doc_dict['paragraphs']
            for p_idx, p in enumerate(paragraphs):

                p_text = p['text']

                if len(p_text) == 0:
                    num_empty_paragraphs += 1
                    continue

                lucene_doc = Document()
                lucene_doc.add(Field("wiki_doc_id", doc_id, t3_tkfp))
                lucene_doc.add(Field("p_idx", str(p_idx), t1))
                lucene_doc.add(Field("content", p_text, t3_tkfp))

                # Named-entities
                ents = p['ents']
                ent_set = set()

                if len(ents) > 0:
                    entity_idx_set = set()
                    entity_type_id_set = set()
                    entity_positions = list()

                    for entity in ents:

                        # Filter number types
                        if entity['label_'] in self.spacy_number_types:
                            continue

                        assert 'label' in entity, 'doc_id={}'.format(doc_id)

                        num_avg_entities += 1

                        entity_key = entity['text'] + '\t' + entity['label_']

                        etypeidx = self.entitytype2idx.get(entity['label'])
                        if etypeidx is None:
                            etypeidx = len(self.entitytype2idx)
                            self.entitytype2idx[entity['label']] = etypeidx

                        eidx = self.entity2idx.get(entity_key)
                        if eidx is None:
                            eidx = len(self.entity2idx)
                            self.entity2idx[entity_key] = eidx
                            self.idx2entity[eidx] = entity_key
                            self.entity_dict[eidx] = \
                                (entity['text'], entity['label_'], etypeidx)

                        entity_idx_set.add(eidx)
                        entity_type_id_set.add(etypeidx)
                        entity_positions.append((eidx, etypeidx,
                                                 entity['start_char'],
                                                 entity['end_char']))

                        ent_set.add((eidx, etypeidx))

                    if len(entity_idx_set) > 0:
                        lucene_doc.add(
                            Field("entity_id",
                                  '\t'.join([str(eidx)
                                             for eidx in entity_idx_set]),
                                  t2_tk))
                        lucene_doc.add(
                            Field("entity_type_id",
                                  '\t'.join([str(etid)
                                             for etid in entity_type_id_set]),
                                  t2_tk))

                        positions = \
                            '\t'.join(['{},{},{},{}'
                                      .format(eidx, etidx, start_char, end_char)
                                       for eidx, etidx, start_char, end_char
                                       in entity_positions])
                        lucene_doc.add(Field("entity_position", positions, t1))
                else:
                    no_entity_paragraphs += 1

                if self.num_entities_max < len(ent_set):
                    self.num_entities_max = len(ent_set)

                binary = get_binary4dvs(ent_set, write_type=self.write_type)

                # https://lucene.apache.org/pylucene/jcc/features.html
                br = BytesRef(lucene.JArray('byte')(binary))
                lucene_doc.add(BinaryDocValuesField("eqa_bin", br))

                # # debug
                # lucene_doc.add(StoredField("eqa_bin_store", br))
                # lucene_doc.add(StoredField("bin_raw", binary.hex()))

                self.writer.addDocument(lucene_doc)
                num_paragraphs += 1

                if num_paragraphs % log_interval == 0:
                    print(datetime.now(), 'Added #paragraphs', num_paragraphs,
                          '#wikidocs', d_idx + 1,
                          '#entities', len(self.entity_dict))

        print('#paragraphs', num_paragraphs)
        print('#no_entity_paragraphs', no_entity_paragraphs,
              '{:.2f}%'.format(100*no_entity_paragraphs/num_paragraphs))
        print('avg num of entities {:.2f}'
              .format(num_avg_entities /
                      (num_paragraphs - no_entity_paragraphs)))

        if num_empty_paragraphs > 0:
            print('#skipped_empty_paragraphs', num_empty_paragraphs)

        print('\nAdding entity docs..')
        for e_dict_idx, entity_idx in enumerate(self.entity_dict):
            # skip UNK
            if entity_idx == self.entity2idx['UNK']:
                continue
            ename, etype, etype_idx = self.entity_dict[entity_idx]
            entity_doc = Document()
            entity_doc.add(Field("name", ename, t2_tk))
            entity_doc.add(Field("type", etype, t1))
            entity_doc.add(Field("eid", str(entity_idx), t1))
            entity_doc.add(Field("etid", str(etype_idx), t1))
            self.writer.addDocument(entity_doc)
            if (e_dict_idx + 1) % (10 * log_interval) == 0:
                print(datetime.now(), '#entities', e_dict_idx + 1)

        print('#entities', len(self.entity2idx) - 1)
        print('#entities_max', self.num_entities_max)

        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print('done')
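At search time, the "eqa_bin" doc values written above can be read back per index segment. A hedged sketch, assuming Lucene 7+ iterator-style doc values and a LeafReaderContext named `ctx`; decoding the bytes mirrors get_binary4dvs and is not shown.

    from org.apache.lucene.index import DocValues

    dv = DocValues.getBinary(ctx.reader(), 'eqa_bin')
    if dv.advanceExact(segment_doc_id):    # doc id relative to this segment
        payload = dv.binaryValue()         # a BytesRef
        # payload.bytes / payload.offset / payload.length expose the raw data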
Example #13
def stats_tooltip(word, doc_id, reader):
    # content statistics
    term = Term('content', tokenize(word))
    term_text = unicode(term).replace('content:', '')
    doc_count = reader.docFreq(term)  # in how many docs the term appears

    total_term_count = reader.totalTermFreq(
        term)  # how many times the term appears across all docs
    n_docs = reader.getDocCount('content')  # total number of docs

    postings = MultiFields.getTermDocsEnum(reader, 'content',
                                           BytesRef(term_text))
    while postings.docID() != doc_id:  # this is bad
        postings.nextDoc()
    term_count = postings.freq()  # how many times the term appears in this doc

    similarity = ClassicSimilarity()
    tf = similarity.tf(float(term_count))  # sqrt(term_freq)
    # whether the term is common or rare among all the docs
    idf = similarity.idf(long(doc_count),
                         long(n_docs))  # log((n_docs+1)/(doc_count+1)) + 1

    # abstract statistics
    abstract_term = Term('abstract', tokenize(word))
    abstract_doc_count = reader.docFreq(abstract_term)
    abstract_total_term_count = reader.totalTermFreq(abstract_term)
    a_idf = similarity.idf(long(abstract_doc_count), long(n_docs))

    abstract_postings = MultiFields.getTermDocsEnum(reader, 'abstract',
                                                    BytesRef(term_text))
    if not abstract_postings:  # the term appears in no document's abstract
        abstract_term_count = 0
        a_tf = 1
    else:
        while abstract_postings.docID() != doc_id:  # this is bad
            if abstract_postings.nextDoc() == abstract_postings.NO_MORE_DOCS:
                abstract_term_count = 0  # it does not appear in this document's abstract
                a_tf = 1
                break
        else:  # no break, it does appear in this document's abstract
            abstract_term_count = abstract_postings.freq()
            a_tf = similarity.tf(float(abstract_term_count))

    content_score = tf * idf**2 * CONTENT_BOOST
    abstract_score = a_tf * a_idf**2 * ABSTRACT_BOOST

    # mixing concerns like nobody's business
    return '''
            <div class="popup">
                <div class="term">{}</div>     
                
                <table>
                <tr>
                    <th> </th>
                    <th>abstr</th>
                    <th>body</th>
                    <th>total</th>
                </tr>
                
                <tr><td>this doc</td>   <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                <tr><td>TF</td>         <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
                
                <tr><td>nr docs</td>    <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                <tr><td>IDF</td>        <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
                
                <tr><td>score</td>      <td>{:.2g}</td> <td>{:.2g}</td> <td><b>{:.2g}</b></td> </tr>
                <tr><td>all docs</td>   <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                </table>
                
                <div class="total-docs">{}</div>
            </div>
            '''.format(
        term_text, abstract_term_count, term_count - abstract_term_count,
        term_count, a_tf, tf, a_tf * tf, abstract_doc_count, doc_count,
        doc_count, a_idf, idf, a_idf * idf, abstract_score, content_score,
        abstract_score * content_score, abstract_total_term_count,
        total_term_count - abstract_total_term_count, total_term_count, n_docs)