def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed",
                                          CompressionTools.compress(bytes))
        stringFldCompressed = StoredField(
            "stringCompressed",
            CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(
            docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(
            CompressionTools.decompressString(
                docFromReader.getBinaryValue("stringCompressed")),
            self.binaryValCompressed)

        reader.close()
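
# A minimal round-trip sketch of CompressionTools outside an index, assuming an
# initialised PyLucene VM and a Lucene build that still ships
# org.apache.lucene.document.CompressionTools.
from lucene import JArray
from org.apache.lucene.document import CompressionTools

original = "some value worth storing compactly"
compressed = CompressionTools.compressString(original)       # returns a byte[]
assert CompressionTools.decompressString(compressed) == original

raw = JArray('byte')(b"binary payload")
restored = CompressionTools.decompress(CompressionTools.compress(raw))
assert [b for b in restored] == [b for b in raw]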
Example #2
 def index_one(self, article):
     """Create index for one url object in the database.
     """
     try:
         date_published_str = article['date_published'].strftime(
             self.date_format)
     except Exception as e:
         logger.warning('Error when formatting date_published %r: %s',
                        article['canonical_url'], e)
         return
     doc = Document()
     doc.add(StoredField('group_id', article['group_id']))
     doc.add(StoredField('article_id', article['article_id']))
     doc.add(
         StringField('date_published', date_published_str, Field.Store.YES))
     doc.add(
         SortedDocValuesField('date_published',
                              BytesRef(date_published_str)))
     doc.add(StoredField('date_published', date_published_str))
     doc.add(StringField('domain', article['domain'], Field.Store.YES))
     doc.add(StringField('site_type', article['site_type'],
                         Field.Store.YES))
     doc.add(
         TextField('canonical_url', article['canonical_url'],
                   Field.Store.YES))
     doc.add(TextField('title', article['title'], Field.Store.YES))
     doc.add(TextField('meta', article['meta'], Field.Store.NO))
     doc.add(TextField('content', article['content'], Field.Store.NO))
     doc.add(StoredField('uq_id_str', article['uq_id_str']))
     self.writer.addDocument(doc)
Example #3
def createDocument(item_id, label, viewSimilar, viewProspective):
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for item in viewSimilar:
        doc.add(StoredField("viewSimilar", item))
    for item in viewProspective:
        doc.add(StoredField("viewProspective", item))
    return doc
Example #4
def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    doc = Document()
    now_time = int(time.time())
    _id = hashlib.md5(f"{label}_{item_id}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(StringField("itemID", item_id, Field.Store.YES))
    doc.add(StringField("label", label, Field.Store.YES))
    doc.add(StoredField("viewSimilar", viewSimilar))
    doc.add(StoredField("viewProspective", viewProspective))
    doc.add(StringField("model", model, Field.Store.YES))
    doc.add(StringField("ttl", str(now_time), Field.Store.NO))
    return _id, doc
def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        print ('field:%s  type:%s'%(field,type))
        print (value+'\n')
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type!='INTEGER_STORED':
           #print ('field=%s  len=%d'%(field,len(value)))
           print ('field=%s  value=%s'%(field,value))
        else:
           print ('field=%s  value=%d'%(field,value))
        '''

        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except:
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
    def add_doc(self, item_data):
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))

        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)
Example #8
def addDoc(w, title, name, value, category, skos_category, all_text,
           raw_name, raw_value, abstract):
    global batch, cnt_batch

    # print('title=' + title + '  category=' + category + '  skos=' + skos_category)
    doc = Document()
    doc.add(StringField('title', title, Field.Store.YES))
    doc.add(TextField('name', name, Field.Store.YES))
    doc.add(TextField('value', value, Field.Store.YES))
    doc.add(StoredField('category', category))
    doc.add(StoredField('skos_category', skos_category))
    doc.add(TextField('all_text', all_text, Field.Store.YES))
    doc.add(TextField('raw_name', raw_name, Field.Store.YES))
    doc.add(TextField('raw_value', raw_value, Field.Store.YES))
    doc.add(TextField('abstract', abstract, Field.Store.YES))

    # batch.append(doc)
    # cnt_batch += 1
    # if cnt_batch == 1000:
    #     w.addDocuments(batch)
    #     cnt_batch = 0
    #     del batch[:]
    w.addDocument(doc)
Example #9
    def _updateOaiRecord(self,
                         identifier,
                         setSpecs,
                         metadataPrefixes,
                         delete=False,
                         oldDoc=None,
                         deleteInSets=None,
                         deleteInPrefixes=None,
                         _overrideStamp=None):
        oldDoc = oldDoc or self._getDocument(identifier)
        doc, oldDeletedSets, oldDeletedPrefixes = self._getNewDocument(
            identifier, oldDoc=oldDoc)
        newStamp = _overrideStamp if self._importMode else self._newStamp()
        doc.add(LongPoint(STAMP_FIELD, int(newStamp)))
        doc.add(
            StoredField(STAMP_FIELD,
                        BytesRef(JArray('byte')(int_to_bytes(newStamp)))))
        doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, int(newStamp)))

        allMetadataPrefixes, allDeletedPrefixes = self._setMetadataPrefixes(
            doc=doc,
            metadataPrefixes=asSet(metadataPrefixes),
            delete=delete,
            deleteInPrefixes=asSet(deleteInPrefixes),
            oldDeletedPrefixes=oldDeletedPrefixes)

        allSets, allDeletedSets = self._setSets(doc=doc,
                                                setSpecs=setSpecs or [],
                                                delete=delete,
                                                deleteInSets=deleteInSets,
                                                oldDeletedSets=oldDeletedSets)
        if delete or (allDeletedSets and allSets == allDeletedSets
                      ) or allMetadataPrefixes == allDeletedPrefixes:
            doc.add(
                StringField(TOMBSTONE_FIELD, TOMBSTONE_VALUE, Field.Store.YES))

        self._writer.updateDocument(Term(IDENTIFIER_FIELD, identifier), doc)
        self._latestModifications.add(str(identifier))
        self.do.signalOaiUpdate(metadataPrefixes=allMetadataPrefixes,
                                sets=allSets,
                                stamp=newStamp)
    def testBinaryFieldInIndex(self):

        ft = FieldType()
        ft.setStored(True)

        bytes = JArray('byte')(self.binaryValStored)
        binaryFldStored = StoredField("binaryStored", bytes)
        stringFldStored = Field("stringStored", self.binaryValStored, ft)

        doc = Document()
        doc.add(binaryFldStored)
        doc.add(stringFldStored)

        # test for field count
        self.assertEqual(2, doc.fields.size())

        # add the doc to a ram index
        writer = self.getWriter(
            analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary stored field and compare its content with the
        # original one
        bytes = docFromReader.getBinaryValue("binaryStored")
        binaryFldStoredTest = bytes.bytes.string_
        self.assertEqual(binaryFldStoredTest, self.binaryValStored)

        # fetch the string field and compare its content with the original
        # one
        stringFldStoredTest = docFromReader.get("stringStored")
        self.assertEqual(stringFldStoredTest, self.binaryValStored)

        reader.close()
Example #11
def make_document(full_path, unix_timestamp, contents):
    """
    Create Lucene document with specific content.
    """
    doc = Document()
    # two separate date fields per recommendation
    # at https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/DateTools.html
    doc.add(LongPoint('date_for_pointrangequery', int(unix_timestamp)))
    doc.add(StoredField('last_modified_time', int(unix_timestamp)))
    # https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/TextField.html
    # indexed and tokenized
    doc.add(TextField('fullpath', full_path,
                      Field.Store.YES))  # this is file key but tokenized
    doc.add(TextField('body', contents, Field.Store.YES))
    # It is also possible to add fields that are indexed but not tokenized.
    # See https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/StringField.html
    # However there is a limitation: https://stackoverflow.com/a/32654329/130164
    # MultiFieldQueryParser will have bizarre results because the query parser
    # runs the analyzer, while StringField does not run the analyzer.
    # We deliberately store the key as untokenized so we can search by it directly with a TermQuery.
    doc.add(StringField('key', full_path, Field.Store.YES))  # this is file key
    return doc
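
# A hypothetical lookup sketch for the point above: because 'key' is a StringField
# (indexed but not tokenized), an exact TermQuery on the full path finds the document
# without any analyzer involved. The `searcher` argument is an assumed open IndexSearcher.
from org.apache.lucene.index import Term
from org.apache.lucene.search import TermQuery


def find_by_key(searcher, full_path):
    top = searcher.search(TermQuery(Term('key', full_path)), 1)
    if len(top.scoreDocs) == 0:
        return None
    return searcher.doc(top.scoreDocs[0].doc)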
Example #12
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return
    with open(file_dir + "/train/train.ast.src") as fc:

        codes = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]

    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))

        writer.addDocument(doc)

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
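
# A hypothetical sketch for querying the index built above with the classic QueryParser.
# WhitespaceAnalyzer, SimpleFSDirectory and Paths are assumed imported as in build_index,
# and `file_dir` is the same path that was passed to build_index.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher


def search_code(file_dir, text, n=5):
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    # escape the query so stray parser operators in `text` cannot break parsing
    query = QueryParser("code", WhitespaceAnalyzer()).parse(QueryParser.escape(text))
    hits = [(searcher.doc(sd.doc).get("id"), sd.score)
            for sd in searcher.search(query, n).scoreDocs]
    reader.close()
    return hits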
Example #13
    def binary(self, b):

        c = JArray('byte')(b)
        field = StoredField("bin", c)
        v = field.binaryValue().bytes
        assert c == v and b == [a for a in v]
Example #14
def addStoredFields(doc, metaDict):
    for key, value in metaDict.items():
        value = '' if value is None else value
        doc.add(StoredField(key, value))
Example #15
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
        ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(
                        StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
Example #16
import lucene
import lxml.html as LH
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StoredField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
# get_all_rawtext_ids / get_rawtext_by_id are project-specific helpers (not shown here)

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            #print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
        if n % 1000 == 0:
            print "Indexed %d files (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #17
                pmid_query = TermQuery(Term('pmid', json_doc['pmid']))
                pmcid_query = TermQuery(Term('pmcid', json_doc['pmcid']))
                id_query = IntPoint.newRangeQuery("id", json_doc['id'],
                                                  json_doc['id'])
                bq = BooleanQuery.Builder()
                bq.add(pmid_query, BooleanClause.Occur.MUST)
                bq.add(pmcid_query, BooleanClause.Occur.MUST)
                bq.add(id_query, BooleanClause.Occur.MUST)
                q = bq.build()
                writer.deleteDocuments(q)

                # Add whole abstract.
                doc = Document()
                # Store field.
                doc.add(IntPoint('id', json_doc['id']))  # index
                doc.add(StoredField('id', json_doc['id']))  # store
                doc.add(StringField('pmid', json_doc['pmid'], Field.Store.YES))
                doc.add(
                    StringField('pmcid', json_doc['pmcid'], Field.Store.YES))
                # Index only.
                doc.add(
                    StringField('article_type', json_doc['article_type'],
                                Field.Store.NO))
                doc.add(StringField('type', json_doc['type'], Field.Store.NO))
                doc.add(
                    StringField('sec_type', json_doc['sec_type'],
                                Field.Store.NO))
                doc.add(Field('text', json_doc['text'], t1))

                writer.addDocument(doc)
            except: