Esempio n. 1
0
 def index_one(self, article):
     """Add one article record from the database to the Lucene index.

     The publication date is formatted first; if that fails, a warning is
     logged and the article is skipped entirely.
     """
     try:
         published = article['date_published'].strftime(self.date_format)
     except Exception as e:
         logger.warning('Error when formating date_published %r: %s ',
                        article['canonical_url'], e)
         return
     doc = Document()
     # Stored-only identifiers: retrievable but not searchable.
     doc.add(StoredField('group_id', article['group_id']))
     doc.add(StoredField('article_id', article['article_id']))
     # The same date is indexed three ways: exact match, sort key, retrieval.
     doc.add(StringField('date_published', published, Field.Store.YES))
     doc.add(SortedDocValuesField('date_published', BytesRef(published)))
     doc.add(StoredField('date_published', published))
     doc.add(StringField('domain', article['domain'], Field.Store.YES))
     doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
     doc.add(TextField('canonical_url', article['canonical_url'],
                       Field.Store.YES))
     doc.add(TextField('title', article['title'], Field.Store.YES))
     # meta/content are searchable only, not stored.
     doc.add(TextField('meta', article['meta'], Field.Store.NO))
     doc.add(TextField('content', article['content'], Field.Store.NO))
     doc.add(StoredField('uq_id_str', article['uq_id_str']))
     self.writer.addDocument(doc)
Esempio n. 2
0
    def get_doc(self, filename, path, title, url, contents):
        '''
        Build and return a `Document` for one crawled webpage.

        Input: `filename`: filename of the webpage
               `path`: path of the webpage
               `title`: title of the webpage
               `url`: original url of the webpage
               `contents`: contents of the webpage
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        # Exact-match fields first, then the analyzed (tokenized) ones.
        for field in (StringField("name", filename, Field.Store.YES),
                      StringField("path", path, Field.Store.YES),
                      TextField("title", title, Field.Store.YES),
                      TextField("url", url, Field.Store.YES)):
            doc.add(field)
        if contents:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(filename))
        return doc
Esempio n. 3
0
    def indexDocs(self, root, writer):
        """Walk ``root`` and add one Lucene document per image record.

        Each file is expected to hold a URL line, a title line, and then a
        body from which image URLs and descriptions are extracted via
        ``self.getTxtAttribute``.  Images without descriptive text are
        skipped.
        """
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(root, filename)
                # Context manager guarantees the handle is closed even if
                # reading raises (original left the file open on error).
                with open(path, encoding='utf8') as file:
                    url = file.readline()
                    title = file.readline()
                    contents = file.read()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i in range(len(img_url)):
                    # Skip images that carry no descriptive text.
                    if len(img_info[i]) == 0:
                        continue
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(StringField('img_url', img_url[i],
                                        Field.Store.YES))
                    # Segment the description with jieba so the TextField
                    # tokenizer can handle Chinese text.
                    seg_contents = jieba.lcut_for_search(img_info[i])
                    contents = ' '.join(seg_contents)
                    doc.add(TextField('contents', contents, Field.Store.YES))
                    writer.addDocument(doc)
Esempio n. 4
0
    def indexDocs(self, root, writer):
        """Index every file found one directory level below ``root``.

        Each file holds a URL line, a title line, then the page body.  The
        body and URL are segmented with jieba; common URL punctuation and
        scheme tokens are dropped from the searchable ``site`` field.  A
        document is always added, even when the body is empty.
        """
        for root, dirnames, filenames in os.walk(root):
            for dirname in dirnames:  # iterate over sub-directories
                path1 = os.path.join(root, dirname)
                for trivial1, trivial2, filenames in os.walk(path1):  # files in this sub-directory
                    for filename in filenames:
                        print("adding", filename)
                        path = os.path.join(path1, filename)
                        # 'with' closes the handle even if reading fails
                        # (original never closed it on error).
                        with open(path, encoding='utf8') as file:
                            page = file.readline()
                            title = file.readline()
                            contents = file.read()

                        # jieba segmentation for tokenization.
                        seg_contents = jieba.lcut_for_search(contents)
                        contents = ' '.join(seg_contents)
                        url = page
                        # Segment the URL too, dropping scheme/punctuation noise.
                        seg_url = jieba.lcut_for_search(page)
                        page = ' '.join(list(set(seg_url)-set(['.','http','https','/',':','?','=','html','shtml','www'])))

                        doc = Document()
                        doc.add(StringField("name", filename, Field.Store.YES))
                        doc.add(StringField("path", path, Field.Store.YES))
                        if len(contents) > 0:
                            doc.add(TextField('title', title, Field.Store.YES))
                            doc.add(TextField('site', page, Field.Store.YES))
                            doc.add(TextField('url', url, Field.Store.YES))
                            doc.add(TextField('contents', contents, Field.Store.YES))
                        else:
                            print("warning: no content in %s" % filename)
                        writer.addDocument(doc)
Esempio n. 5
0
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Add one person record (name plus birth/death data) to writer ``w``."""
    doc = Document()
    # Name is analyzed for full-text search; the rest are exact-match fields.
    doc.add(TextField("name", name, Field.Store.YES))
    for key, val in (("birth_date", birth_date),
                     ("death_date", death_date),
                     ("birth_note", birth_note),
                     ("death_note", death_note)):
        doc.add(StringField(key, val, Field.Store.YES))
    w.addDocument(doc)
Esempio n. 6
0
def createDocument(item_id, label, viewSimilar, viewProspective):
    """Build a Document for an item with its recommendation lists.

    Each element of the two list arguments becomes its own stored
    (non-searchable) field value.
    """
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for name, items in (("viewSimilar", viewSimilar),
                        ("viewProspective", viewProspective)):
        for item in items:
            doc.add(StoredField(name, item))
    return doc
Esempio n. 7
0
 def __init__(self, path):
     """Open the Lucene store at ``path`` and prepare a reusable document.

     The single Document and its two fields are mutated in place on every
     write, so one instance must not be shared across threads.
     """
     lazyImport()
     writer, reader, searcher = self._getLucene(path)
     self._writer = writer
     self._reader = reader
     self._searcher = searcher
     self._latestModifications = {}
     self._keyField = StringField("key", "", Field.Store.NO)
     self._valueField = Field("value", "", UNINDEXED_TYPE)
     self._doc = Document()
     self._doc.add(self._keyField)
     self._doc.add(self._valueField)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id,
                                         sentence):
    """Build a Document holding one sentence of an article, keyed by title
    and document id."""
    fields = [
        StringField("org_title", org_title, Field.Store.YES),
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES),
        StringField("doc_id", str(doc_id), Field.Store.YES),
        TextField("sentence", sentence, Field.Store.YES),
    ]
    doc = Document()
    for field in fields:
        doc.add(field)
    return doc
def create_document_by_document_content(org_title, preprocessed_title,
                                        preprocessed_title_lower, content):
    """Build a Document holding a full article body plus its title variants."""
    fields = [
        StringField("org_title", org_title, Field.Store.YES),
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES),
        StringField("preprocessed_title_lower", preprocessed_title_lower,
                    Field.Store.YES),
        TextField("content", content, Field.Store.YES),
    ]
    doc = Document()
    for field in fields:
        doc.add(field)
    return doc
Esempio n. 10
0
def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    """Build ``(doc_id, Document)`` for an item's recommendation payload.

    The id is an MD5 of ``label_itemID`` so repeated writes for the same
    item/label pair can be deduplicated; ``ttl`` records the creation time.
    """
    _id = hashlib.md5(f"{label}_{item_id}".encode('utf-8')).hexdigest()
    ttl = str(int(time.time()))
    doc = Document()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(StringField("itemID", item_id, Field.Store.YES))
    doc.add(StringField("label", label, Field.Store.YES))
    doc.add(StoredField("viewSimilar", viewSimilar))
    doc.add(StoredField("viewProspective", viewProspective))
    doc.add(StringField("model", model, Field.Store.YES))
    doc.add(StringField("ttl", ttl, Field.Store.NO))
    return _id, doc
Esempio n. 11
0
    def get_doc(self, img):
        '''
        Build a `Document` from one image-info dict.

        Input: `img`: dict containing a single image info
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
        # Only the description is analyzed for full-text search.
        doc.add(TextField("description", img['description'], Field.Store.YES))
        for key in ("url", "url_title"):
            doc.add(StringField(key, img[key], Field.Store.YES))
        return doc
Esempio n. 12
0
def create_document(line):
    """Build a Document from one whitespace-separated index line.

    Layout: ``<keyterm> <serial-no> <text...>``.  The keyterm (with
    underscores replaced by spaces) is prepended to the text if it does
    not already occur in it, so the term is always searchable.
    """
    tokens = line.split()
    keyterm, serial = tokens[0], tokens[1]
    doc = Document()
    doc.add(StringField("keyterm", keyterm, Field.Store.YES))
    doc.add(StringField("Sno", serial, Field.Store.YES))
    text = ' '.join(tokens[2:])
    qterm = keyterm.replace("_", " ")
    if qterm not in text:
        text = qterm + ' ' + text
    doc.add(TextField("text", text, Field.Store.YES))
    return doc
def obj_to_document(obj):
    """Convert an application object into a Lucene ``Document``.

    Python 2 code (uses ``unicode``/``long``).  Each attribute of
    ``obj.data`` is mapped to one or two fields, plus a parallel
    ``<name><LTPF_TYPE>`` field carrying a type tag (LT_*) — presumably so
    the reading side can reconstruct the original type; confirm against
    the corresponding decode function.
    """
    def conv_to_str(x):
        # Encode unicode to UTF-8 bytes; everything else via str().
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            # Stored-but-unindexed placeholder for missing values.
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            # Lists are de-duplicated via set() and space-joined;
            # element order is deliberately not preserved.
            if len(v) > 0 and isinstance(v[0], int):
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            # Raw value stored unindexed; a jieba-segmented twin field
            # (suffix LTPF_FOR_QUERY) carries the searchable tokens.
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            # Store the raw markup; index only the plain-text rendering.
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            # Must be tested before int: bool is a subclass of int.
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        # Parallel type-tag field for every attribute, stored unindexed.
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
Esempio n. 14
0
    def IndexDocs(self, documents):
        """
        Index documents under the directory.

        :Parameters:
        - `documents`: Documents to be indexed (List)

        Each document dict must provide 'name', 'content', 'date', 'url'
        and 'tags'.  The writer is closed in a finally block so the index
        lock is released even if indexing one of the documents raises
        (the original leaked the writer on error).
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)
        try:
            for document in documents:
                # Create a document that would be added to the index
                doc = Document()
                doc.add(TextField(Indexer.NAME, document['name'],
                                  Field.Store.YES))
                doc.add(Field(Indexer.CONTENT, document['content'],
                              self.__contentType))
                doc.add(StringField(Indexer.DATE, document['date'],
                                    Field.Store.YES))
                doc.add(StringField(Indexer.URL, document['url'],
                                    Field.Store.YES))
                doc.add(TextField(Indexer.TAGS,
                                  self.__qualifyTags(document['tags']),
                                  Field.Store.YES))
                # Numeric timestamp enables range queries over dates.
                doc.add(LongPoint(Indexer.TIMESTAMP,
                                  self.__getTimestamp(document['date'])))
                if not self.__boAppend:
                    # New index: plain add, no old copy can exist.
                    if self.__verbose:
                        print("Adding " + document['name'])
                    writer.addDocument(doc)
                else:
                    # Existing index: replace any previous document with
                    # the exact same name.
                    if self.__verbose:
                        print("Updating " + document['name'])
                    writer.updateDocument(Term(Indexer.NAME, document['name']),
                                          doc)

            # Print index information
            print("Indexed %d documents (%d docs in index)" %
                  (len(documents), writer.numDocs()))
        finally:
            # Always release the writer (and the index lock).
            writer.close()
def addDoc(w, data):
    """Build a Document from ``data`` and add it to index writer ``w``.

    ``data`` maps field name -> ``(value, field_type)``, where
    ``field_type`` names the Lucene field class to use.  On failure the
    record is dumped for diagnosis instead of aborting the run.
    """
    doc = Document()
    for field in data:
        value, field_type = data[field][0], data[field][1]
        if field_type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif field_type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif field_type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate.  Best-effort diagnostics: dump the offending
        # record instead of aborting the whole indexing run.
        print('-----------------------------------')
        for field in data:
            value, field_type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
Esempio n. 16
0
    def createDoc(self, url, html, duplicate):
        """Parse ``html`` and build a Document for the page at ``url``.

        ``duplicate`` is stored as the string 'true'/'false'.  Pages with
        no extracted content get no 'contents' field and a warning.
        """
        title, contents = self.parseHtml(url, html)

        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(
            StringField("duplicate",
                        str(duplicate).lower(), Field.Store.YES))

        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            # Parenthesized so this line is valid under both Python 2
            # (single-arg print statement) and Python 3 (print function);
            # the original Python-2-only statement broke on Python 3.
            print("Warning: No content in %s" % url)

        return doc
    def add_doc(self, item_data):
        """Upsert one item's recommendation payload into the index.

        The document id is an MD5 of ``item_id_version``; updateDocument
        replaces any previous document carrying the same id.
        """
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        # Recommendation payloads are serialized to JSON for storage.
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))

        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc = Document()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)
Esempio n. 18
0
    def indexDocs(self, root, writer):
        """Index every <TEXT> element of the XML corpus under ``root``.

        Files live at ``root/data/*/*.xml``.  Each <TEXT> element becomes
        one Lucene document identified by its sibling <DOCNO>.  Returns
        the number of documents added.
        """
        xml_files = glob.glob(root + "/data/*/*.xml")
        numDocs = 0
        for xml in xml_files:
            strict = etree.XMLParser(recover=False, strip_cdata=False)
            try:
                tree = etree.parse(xml, parser=strict)
            except etree.XMLSyntaxError as e:
                # Malformed file: reparse leniently, salvaging what we can.
                lenient = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml, parser=lenient)

            for text in tree.getroot().iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    # Empty bodies still get an id-only document.
                    if contents:
                        doc.add(TextField("contents", contents,
                                          Field.Store.YES))
                    writer.addDocument(doc)
                    numDocs += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return numDocs
Esempio n. 19
0
 def index_single_file(self, doc_file):
     """Index every wiki article in ``doc_file`` (one JSON object per line).

     Articles with fewer than three paragraphs are skipped.  Returns the
     number of Lucene documents added; on error the traceback is printed
     and the count so far is returned.
     """
     logger.info("adding {}".format(doc_file))
     lucene_doc_num = 0
     try:
         with open(doc_file) as df:
             for line in df:
                 wiki_doc = json.loads(line)
                 doc_title = wiki_doc['title']
                 doc_text = wiki_doc['plaintext']
                 doc_id = wiki_doc['_id']
                 # Skip stubs: fewer than three paragraphs.
                 if len(doc_text.split('\n\n')) < 3:
                     continue
                 doc_text = rm_special_chars(doc_text)
                 doc = Document()
                 doc.add(StringField("id", str(doc_id), Field.Store.YES))
                 doc.add(TextField("title", doc_title, Field.Store.YES))
                 doc.add(TextField("text", doc_text, Field.Store.YES))
                 self.writer.addDocument(doc)
                 lucene_doc_num += 1
                 # Progress heartbeat every 10k documents.
                 if lucene_doc_num % 10000 == 0:
                     logger.info('added {} lucene docs'.format(lucene_doc_num))
     except Exception as e:
         import traceback
         traceback.print_tb(e.__traceback__)
         logger.error("Failed in: {}".format(doc_file))
     return lucene_doc_num
def addDoc(w, data):
    """Add one document built from ``data`` to index writer ``w``.

    ``data`` maps a field name to a ``(value, field_type)`` pair, where
    ``field_type`` names the Lucene field class used for indexing.
    """
    doc = Document()
    for name in data:
        value, kind = data[name][0], data[name][1]
        if kind == 'StringField':
            doc.add(StringField(name, value, Field.Store.YES))
        elif kind == 'TextField':
            doc.add(TextField(name, value, Field.Store.YES))
        elif kind == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(name, value, CUSTOM_FIELD_TEXT))
        elif kind == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(name, value, CUSTOM_FIELD_TEXT_DF))
        elif kind == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(name, value, CUSTOM_FIELD_TEXT_BF))
        elif kind == 'INTEGER_STORED':
            doc.add(StoredField(name, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)
def add_code_keyword_into_document(document, file_content, node, counter):
    """Add one "word" field per code characteristic found in ``node``.

    ``node`` maps characteristic names to lists of extracted tokens;
    ``counter`` accumulates per-characteristic counts.  Returns True when
    at least one flag-setting characteristic (typed method call, class
    instantiation, method call, or literal) was stored.

    The original spelled out eight near-identical loops; they are folded
    into one table-driven loop with identical field order and counts.
    """
    flag = False

    # (node key, store mode, counter attribute, sets the flag?)
    specs = (
        ("typed_method_call", Field.Store.YES,
         "typed_method_call_count", True),
        ("extends", Field.Store.NO, "extends_count", False),
        ("used_classes", Field.Store.YES, "used_classes_count", False),
        ("class_instance_creation", Field.Store.YES,
         "class_instance_creation_count", True),
        ("methods", Field.Store.YES, "methods_count", False),
        ("methods_called", Field.Store.YES, "methods_called_count", True),
        ("unresolved_method_calls", Field.Store.YES,
         "unresolved_method_calls_count", False),
    )
    for key, store, count_attr, sets_flag in specs:
        for value in node[key]:
            if not value:
                continue
            # used_classes entries may be non-string; the original
            # stringified them explicitly.
            if key == "used_classes":
                value = str(value)
            document.add(Field("word", value, store, Field.Index.ANALYZED))
            setattr(counter, count_attr, getattr(counter, count_attr) + 1)
            if sets_flag:
                flag = True

    # Literals are indexed exactly (StringField, not analyzed).
    for literal in node["literals"]:
        if literal:
            document.add(StringField("word", literal, Field.Store.YES))
            counter.literals_count += 1
            flag = True

    return flag
Esempio n. 22
0
def create_document(file_name):
    """Build a Document for the file ``./alldocs/<file_name>``.

    The title field stores the file name and the text field the whole
    file body.
    """
    path = './alldocs/' + file_name
    doc = Document()
    # 'with' guarantees the handle is closed even if reading raises.
    with open(path) as file:
        # BUG FIX: the original referenced the undefined name
        # ``input_file`` here; the intended value is ``file_name``.
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
Esempio n. 23
0
 def index_document(self, wiki_doc):
     """Index every section of ``wiki_doc`` as its own Lucene document.

     :param wiki_doc: the document to be indexed.
     :return: None
     """
     for i, section in enumerate(wiki_doc.sections):
         doc = Document()
         doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
         doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
         # Section id has the form "<article_id>_<section_index>".
         doc.add(StringField("id_section",
                             "{}_{}".format(wiki_doc.id, i), Field.Store.YES))
         doc.add(TextField("title_section", section.title, Field.Store.YES))
         doc.add(TextField("content_section", section.text, Field.Store.YES))
         self.writer.addDocument(doc)
 def addDocument(self, id):
     """Index the training answer with the given ``id`` and commit."""
     global answers_train
     answer_text = answers_train[id]
     doc = Document()
     doc.add(TextField("pa", answer_text, Field.Store.YES))
     doc.add(StringField("id", str(id), Field.Store.YES))
     self.w.addDocument(doc)
     # Commit per document: durable but slow for bulk loads.
     self.w.commit()
Esempio n. 25
0
 def __init__(self, path):
     """Set up writer/reader/searcher for the index at ``path`` and a
     single reusable key/value Document (fields mutated per write)."""
     lazyImport()
     self._writer, self._reader, self._searcher = self._getLucene(path)
     self._latestModifications = {}
     key_field = StringField("key", "", Field.Store.NO)
     value_field = Field("value", "", UNINDEXED_TYPE)
     doc = Document()
     doc.add(key_field)
     doc.add(value_field)
     self._doc = doc
     self._keyField = key_field
     self._valueField = value_field
Esempio n. 26
0
def create_document(file_name):
    """Build a Document for the book file at ``INPUT_DIR + file_name``.

    The title field stores the file name and the text field the whole
    book body.
    """
    path = INPUT_DIR + file_name  # assemble the file descriptor
    doc = Document()  # create a new document
    # 'with' closes the handle even if reading raises.
    with open(path) as file:
        # BUG FIX: the original referenced the undefined name
        # ``input_file`` here; the intended value is ``file_name``.
        doc.add(StringField("title", file_name, Field.Store.YES))
        # add the whole book
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
Esempio n. 27
0
    def _setMetadataPrefixes(self, doc, metadataPrefixes, delete,
                             oldDeletedPrefixes, deleteInPrefixes):
        """Update the prefix and deleted-prefix fields on ``doc``.

        Returns ``(allMetadataPrefixes, allDeletedPrefixes)`` after the
        update.  With ``delete=True`` every prefix the document carries is
        marked deleted.
        """
        allMetadataPrefixes = set(doc.getValues(PREFIX_FIELD))
        allDeletedPrefixes = set(oldDeletedPrefixes)
        # A prefix that is (re)supplied is no longer considered deleted.
        for prefix in metadataPrefixes:
            allDeletedPrefixes.discard(prefix)
        # Register any prefix not yet present on the document, both on the
        # document itself and in the instance-wide prefix registry.
        for prefix in metadataPrefixes.union(deleteInPrefixes):
            if prefix not in allMetadataPrefixes:
                doc.add(StringField(PREFIX_FIELD, prefix, Field.Store.YES))
                self._prefixes.setdefault(prefix, ('', ''))
                allMetadataPrefixes.add(prefix)
        allDeletedPrefixes.update(deleteInPrefixes)
        if delete:
            # Full delete: every known prefix counts as deleted.
            allDeletedPrefixes = allMetadataPrefixes

        for prefix in allDeletedPrefixes:
            doc.add(StringField(PREFIX_DELETED_FIELD, prefix, Field.Store.YES))

        return allMetadataPrefixes, allDeletedPrefixes
Esempio n. 28
0
    def get_doc(self, doc_info, contents):
        '''
        Build a `Document` from the given page info.

        Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
               `contents`: contents of the webpage
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        # Exact-match metadata fields.
        for key in ("name", "path", "title", "url"):
            doc.add(StringField(key, doc_info[key], Field.Store.YES))
        # Site and contents are analyzed for full-text search.
        doc.add(TextField("site", doc_info['site'], Field.Store.YES))
        if contents:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(doc_info['name']))
        return doc
Esempio n. 29
0
 def _getNewDocument(self, identifier, oldDoc, purgeSets=None):
     """Create a fresh Document for ``identifier``, carrying over prefix
     and set fields from ``oldDoc`` (minus any sets in ``purgeSets``).

     Returns ``(doc, oldDeletedSets, oldDeletedPrefixes)``.
     """
     doc = Document()
     doc.add(StringField(IDENTIFIER_FIELD, identifier, Field.Store.YES))
     doc.add(IntPoint(HASH_FIELD, Partition.hashId(identifier)))
     oldDeletedSets = set()
     oldDeletedPrefixes = set()
     if oldDoc is not None:
         if purgeSets:
             def filterPurgedSets(sets):
                 return [s for s in sets if s not in purgeSets]
         else:
             # No purge requested: pass values through unchanged.
             def filterPurgedSets(sets):
                 return sets
         for oldPrefix in oldDoc.getValues(PREFIX_FIELD):
             doc.add(StringField(PREFIX_FIELD, oldPrefix, Field.Store.YES))
         for oldSet in filterPurgedSets(oldDoc.getValues(SETS_FIELD)):
             doc.add(StringField(SETS_FIELD, oldSet, Field.Store.YES))
         oldDeletedSets.update(oldDoc.getValues(SETS_DELETED_FIELD))
         oldDeletedPrefixes.update(
             filterPurgedSets(oldDoc.getValues(PREFIX_DELETED_FIELD)))
     return doc, oldDeletedSets, oldDeletedPrefixes
Esempio n. 30
0
 def index_docs(self, input_documents):
     """Index parsed documents (dicts keyed by ".I", ".U", ".T" and
     optionally ".M"/".W").

     The searchable "text" field concatenates the available sections
     (MeSH terms .M, title .T, abstract .W), lower-cased and re-tokenized.
     NOTE(review): the sections are concatenated WITHOUT separating
     spaces before tokenization, so the last word of one section fuses
     with the first of the next — confirm this is intended.
     """
     for document in tqdm(input_documents, total=len(input_documents)):
         doc = Document()
         doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
         doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
         # Custom field type: stored, tokenized, with positions/offsets
         # and term vectors (enables highlighting/snippets later).
         type = FieldType()
         type.setIndexOptions(
             IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
         type.setStored(True)
         type.setStoreTermVectors(True)
         type.setTokenized(True)
         # Four cases depending on which optional sections are present.
         if ".W" in document and ".M" in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".M"].lower() + " " +
                                            document[".T"].lower() +
                                            document[".W"].lower())), type))
         elif ".M" in document and ".W" not in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".M"].lower() + " " +
                                            document[".T"].lower())), type))
         elif ".M" not in document and ".W" in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".T"].lower() +
                                            document[".W"].lower())), type))
         elif ".M" not in document and ".W" not in document:
             doc.add(
                 Field("text",
                       " ".join(tokenizer.tokenize(document[".T"].lower())),
                       type))
         # CREATE mode: plain add; otherwise upsert keyed on ".U".
         if self.writer.getConfig().getOpenMode(
         ) == IndexWriterConfig.OpenMode.CREATE:
             self.writer.addDocument(doc)
         else:
             self.writer.updateDocument(Term(".U", document[".U"]), doc)
     self.writer.close()
Esempio n. 31
0
 def _setSets(self, doc, setSpecs, delete, deleteInSets, oldDeletedSets):
     """Update set-membership fields on ``doc``.

     Returns ``(allSets, allDeletedSets)``.  With ``delete=True`` every
     set (including newly supplied ones) is marked deleted.
     """
     currentSets = set(doc.getValues(SETS_FIELD))
     allSets = set(currentSets)
     # Each valid setSpec implies membership of itself and its parent sets.
     for setSpec in _validSetSpecs(setSpecs):
         allSets.update(_setSpecAndSubsets(setSpec))
     if delete:
         allDeletedSets = set(allSets)
     else:
         allDeletedSets = set(oldDeletedSets)
         # Re-supplied sets are no longer considered deleted.
         for setSpec in _validSetSpecs(setSpecs):
             allDeletedSets.difference_update(_setSpecAndSubsets(setSpec))
         if self._deleteInSetsSupport and deleteInSets:
             allSets.update(deleteInSets)
             allDeletedSets.update(deleteInSets)
     # Persist any set the document does not carry yet.
     for aSet in allSets:
         if not aSet in currentSets:
             self._sets.setdefault(aSet, '')
             doc.add(StringField(SETS_FIELD, aSet, Field.Store.YES))
             allSets.add(aSet)  # no-op: aSet is already a member of allSets
     for aSet in allDeletedSets:
         doc.add(StringField(SETS_DELETED_FIELD, aSet, Field.Store.YES))
     return allSets, allDeletedSets
Esempio n. 32
0
class LuceneKeyValueStore(object):
    """A persistent key/value store backed by a Lucene index.

    Keys and values are coerced to ``str``.  Writes go through a single
    reused ``Document`` (``self._doc``) whose two fields are mutated in
    place, so instances are NOT thread-safe.  Reads consult
    ``_latestModifications`` (an overlay of writes/deletes the index
    reader may not see yet) before falling back to the searcher.
    """

    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        # Overlay of writes/deletes not yet visible to the reader.
        self._latestModifications = {}
        # One Document reused for every __setitem__ (fields mutated in place).
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        """Return the value for ``key``, or ``default`` if absent."""
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Coerce to str: the index stores text only.
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        # updateDocument makes the write an upsert keyed on "key".
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        key = str(key)
        # Overlay first: it reflects writes the (lazily reopened) reader
        # may not see yet; DELETED_RECORD marks a pending delete.
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if not value is None:
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        if topDocs.totalHits == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        # Sentinel so __getitem__ observes the delete before the reader does.
        self._latestModifications[key] = DELETED_RECORD

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        # Open (or create) the index directory plus a near-real-time
        # reader obtained directly from the writer.
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)
        config.setRAMBufferSizeMB(256.0) # faster
        config.setUseCompoundFile(False) # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        reader = writer.getReader()
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        # Reopen the reader only after many buffered modifications; until
        # then lookups rely on the _latestModifications overlay.
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
            if not newReader is None:
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        """Flush pending changes durably to the index."""
        self._writer.commit()

    def close(self):
        """Close the writer, releasing the index lock."""
        self._writer.close()