def setUp(self):
        super(PyLuceneThreadTestCase, self).setUp()

        self.classLoader = Thread.currentThread().getContextClassLoader()

        writer = self.getWriter(analyzer=StandardAnalyzer())

        doc1 = Document()
        doc2 = Document()
        doc3 = Document()
        doc4 = Document()
        doc1.add(Field("field", "one", TextField.TYPE_STORED))
        doc2.add(Field("field", "two", TextField.TYPE_STORED))
        doc3.add(Field("field", "three", TextField.TYPE_STORED))
        doc4.add(Field("field", "one", TextField.TYPE_STORED))

        writer.addDocument(doc1)
        writer.addDocument(doc2)
        writer.addDocument(doc3)
        writer.addDocument(doc4)
        writer.commit()
        writer.close()

        self.testData = [('one', 2), ('two', 1), ('three', 1),
                         ('five', 0)] * 500
        self.lock = threading.Lock()
        self.totalQueries = 0
Example #2
def index(personDB, familyDB, relationDB):
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
#?#indexWriter.setRAMBufferSizeMB(50);  CHECK 256

    mt = matchtext()

    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(f['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', 'FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
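A minimal sketch of the module-level setup this function assumes (analyzer and indexDir are used above but not defined in the snippet); the index path is hypothetical:

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()  # start the JVM once per process
analyzer = StandardAnalyzer()
indexDir = SimpleFSDirectory(Paths.get("match_index"))  # hypothetical path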
Example #3
    def setUp(self):
        super(Test_Bug1763, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.d1 = RAMDirectory()
        self.d2 = RAMDirectory()

        w1, w2 = [
            self.getWriter(directory=d, analyzer=self.analyzer)
            for d in [self.d1, self.d2]
        ]
        doc1 = Document()
        doc2 = Document()
        doc1.add(
            Field("all", "blah blah double blah Gesundheit",
                  TextField.TYPE_NOT_STORED))
        doc1.add(Field('id', '1', StoredField.TYPE))
        doc2.add(
            Field("all", "a quick brown test ran over the lazy data",
                  TextField.TYPE_NOT_STORED))
        doc2.add(Field('id', '2', StoredField.TYPE))
        w1.addDocument(doc1)
        w2.addDocument(doc2)
        for w in [w1, w2]:
            w.close()
Example #4
def get_document(fname, split_by=None):
    docs = []
    _name = os.path.split(fname)[-1]
    with open(fname) as f:
        contents = f.read()
        if split_by:
            paragraphs = contents.split(split_by)
            for ix, par in enumerate(paragraphs):
                if not par:
                    continue
                doc = Document()
                name = "{}_{}".format(_name, ix)
                doc.add(
                    Field('filename', name, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field('content', par, Field.Store.YES,
                          Field.Index.ANALYZED))
                docs.append(doc)
        else:
            doc = Document()
            doc.add(
                Field('filename', _name, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field('content', contents, Field.Store.YES,
                      Field.Index.ANALYZED))
            docs.append(doc)
    return docs
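A minimal usage sketch, assuming writer is an already-open IndexWriter; the file path is hypothetical. It splits a file on blank lines and indexes each paragraph as its own document:

docs = get_document("corpus/article.txt", split_by="\n\n")
for doc in docs:
    writer.addDocument(doc)
writer.commit()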
Example #5
    def build(self, index):

        writer = self.getWriter(directory=index.index,
                                analyzer=SimpleAnalyzer(
                                    Version.LUCENE_CURRENT))

        seed(101)
        for d in xrange(self.minId, self.maxId + 1):
            doc = Document()
            doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
            if index.allowNegativeRandomInts:
                r = randint(~self.MAX_INT, self.MAX_INT)
            else:
                r = randint(0, self.MAX_INT)

            if index.maxR < r:
                index.maxR = r

            if r < index.minR:
                index.minR = r

            doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
            doc.add(Field("body", "body", StringField.TYPE_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #6
def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, field_type = data[field][0], data[field][1]
        '''
        print('field:%s  type:%s' % (field, field_type))
        print(value + '\n')
        '''
        if field_type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif field_type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif field_type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif field_type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)
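A minimal usage sketch, assuming lucene.initVM() has already been called and the CUSTOM_FIELD_* FieldTypes are defined elsewhere in the module; the index path and field names are hypothetical:

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

writer = IndexWriter(SimpleFSDirectory(Paths.get("demo_index")),
                     IndexWriterConfig(StandardAnalyzer()))
addDoc(writer, {'title': ('Hello Lucene', 'TextField'),
                'id': ('doc-1', 'StringField')})
writer.close()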
Example #7
def generate_indices_from_benchmark(writer, counter):
	javafiles = java_files_from_dir(source_path)
	i = 0
	j = 0
	for javafile in javafiles:
		# print javafile
		i += 1
		if i % 1000 == 0:  # print progress every 1000 files
			print("Counter: %s" % i)
			print("typed_method_call" + str(counter.typed_method_call_count))
		document = Document()
		document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
		try:
			with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
				file_content = f.read().encode("utf-8", errors='ignore')

			ast = parse(file_content, resolve=False)
			if add_code_keyword_into_document(document, file_content, ast, counter):
				writer.addDocument(document)
				j += 1
				if j % 1000 == 0:
					print "Wrote:: %s files" % j

		except Exception as e:
			print("Error: %s" % e)
			continue
	print "Number of files: %s" % i
	print "Number of duplicates: %s" % len(hashes)
	print "%s files has been indexed" % j
Example #8
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        index_file = open("index.txt", 'r')
        for line in index_file.readlines():

            try:
                fields = line.strip().split('\t')
                src, filename, tag = fields[0], fields[1], fields[2]
                path = os.path.join(root, filename)

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))

                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print "warning: no tag in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #9
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
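A minimal usage sketch (Python 2, matching the snippet's iteritems), assuming writer is an already-open IndexWriter; the dictionary contents are hypothetical:

count = indexDictionary({'a.txt': u'alpha text', 'b.txt': u'beta text'}, writer)
print "indexed %d docs" % count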
Example #10
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats

            file_path = join(directory, file_name)
            print ' ', file_path

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
Example #11
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
Example #12
    def index_single_file(self, doc_file):
        logger.info("adding {}".format(doc_file))
        lucene_doc_num = 0
        try:
            with open(doc_file) as df:
                for line in df:
                    wiki_doc = json.loads(line)
                    doc_title = wiki_doc['title']
                    doc_text = wiki_doc['plaintext']
                    doc_id = wiki_doc['_id']
                    paragraphs = doc_text.split('\n\n')
                    if len(paragraphs) < 3:
                        continue
                    doc_text = rm_special_chars(doc_text)
                    doc = Document()
                    doc.add(StringField("id", str(doc_id), Field.Store.YES))
                    doc.add(TextField("title", doc_title, Field.Store.YES))
                    doc.add(TextField("text", doc_text, Field.Store.YES))
                    self.writer.addDocument(doc)
                    lucene_doc_num += 1
                    if lucene_doc_num % 10000 == 0:
                        logger.info('added {} lucene docs'.format(lucene_doc_num))
        except Exception as e:
            import traceback
            traceback.print_tb(e.__traceback__)
            logger.error("Failed in: {}".format(doc_file))
        return lucene_doc_num
Example #13
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        inFile = open(str(args["inputFile"]))
        indexName = inFile.readline()
        while (indexName != ''):
            print "adding", indexName
            doc = Document()
            doc.add(Field("name", indexName, t1))
            #doc.add(Field("path", root, t1))
            text = inFile.readline()
            if (len(text) > 0):
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % indexName
            indexName = inFile.readline()
            writer.addDocument(doc)
        inFile.close()
Example #14
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
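A minimal usage sketch; both paths are hypothetical. Each input line is expected to look like "<music_path> <tag1,tag2,...>", matching the parsing above:

build_index("music_tags.txt", "music_index")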
Example #15
    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(root, filename)
                file = open(path, encoding='utf8')
                url = file.readline()
                title = file.readline()
                contents = file.read()
                file.close()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i in range(len(img_url)):
                    if len(img_info[i]) > 0:
                        doc = Document()

                        doc.add(StringField('title', title, Field.Store.YES))
                        doc.add(StringField('url', url, Field.Store.YES))
                        doc.add(
                            StringField('img_url', img_url[i],
                                        Field.Store.YES))
                        seg_contents = jieba.lcut_for_search(img_info[i])
                        contents = ' '.join(seg_contents)
                        doc.add(
                            TextField('contents', contents, Field.Store.YES))
                        writer.addDocument(doc)
                    else:
                        continue
Example #16
    def indexDocs(self, sourceDir, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for sourceDir, dirnames, filenames in os.walk(sourceDir):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print(filename)
                try:
                    path = os.path.join(sourceDir, filename)
                    file = open(path, 'r', encoding="utf-8")
                    contents = file.read()
                    #contents = str(filecontent, 'utf-8')
                    #contents = filecontent.encode('utf-8')
                    #print('path', path, len(contents))
                    doc = Document()
                    doc.add(Field("name", filename, t1))  # filename (title)
                    #doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field(queryField, contents, t2))  # content
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                    file.close()
                except Exception as e:
                    print("Failed in indexDocs:", e)
Example #17
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #18
    def indexer(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def replacer(text):
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                if c in text:
                    text = text.replace(c, ' ')
            return text

        for root, dirnames, filenames in os.walk(root):
            i = 0
            for filename in filenames:
                i += 1
                with open(os.path.join(root, filename)) as f:
                    for line in f.readlines():
                        line = line.split(' ', 2)
                        docname = line[0] + ' ' + line[1]
                        name = replacer(line[0])
                        contents = line[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)
Example #19
    def indexDocs(self, root, writer):
        path = root + "/data/*/*.xml"
        # print(path)
        xml_files = glob.glob(path)
        # xml_files = ["HAM2-031201.xml"]
        numDocs = 0
        for xml in xml_files:
            try:
                parser = etree.XMLParser(recover=False, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            except etree.XMLSyntaxError as e:
                parser = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            root = tree.getroot()
            for text in root.iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                # print("adding", doc_no)
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(
                            TextField("contents", contents, Field.Store.YES))
                    else:
                        pass
                        # print("warning: no content in %s" % doc_no)
                    writer.addDocument(doc)
                    numDocs += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return numDocs
Example #20
    def test_indexDocumentWithUnicodeText(self):

        store = self.openStore()
        writer = None
        try:
            analyzer = self.getAnalyzer()
            writer = self.getWriter(store, analyzer, True)

            doc = Document()
            doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
            doc.add(Field("docid", str(1), StringField.TYPE_NOT_STORED))
            doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
            doc.add(Field("search_name", "wisdom", StoredField.TYPE))
            doc.add(
                Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))

            # using a unicode body causes problems, which seems very odd
            # since the python type is the same regardless after doing
            # the encode
            body_text = u"hello world" * 20
            body_reader = StringReader(body_text)
            doc.add(Field("content", body_reader, TextField.TYPE_NOT_STORED))

            writer.addDocument(doc)
        finally:
            self.closeStore(store, writer)
Example #21
	def getDoc(self, file):
		try:
			f = open(os.getcwd()+FILE_DIR+'/'+file, "r")

			try:
				c = []
				s = BeautifulSoup(f, 'html.parser')
				text = s.findAll(text=True)
				c = filter(tag_vis, text)
				try:
					c = ' '.join(c)
				except Exception as e:
					c = b' '.join(c)
			except Exception as e:
				print(str(e))
				return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			fileName = fileName[:fileName.find(".")]
			filename = TextField("filename",
							 fileName,
							 Field.Store.YES)
			path = TextField("filepath",
						 str(os.getcwd()+FILE_DIR+'/'+file),
						 Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			print(type(e).__name__)
			print(str(e))
			return
Example #22
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            #print "adding",i, sent
            try:
                root = os.getcwd()
                #contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", str(i), t1))
                doc.add(Field("path", root, t1))
                if len(sent) > 0:
                    doc.add(Field("contents", sent.lower(), t2))
                else:
                    print "warning: no content in %s" % str(i)
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexsents:", e
Example #23
    def Indexing(self, writer):
        print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
        with open(SEGMENTATION_FILE, 'r') as f:
            line_count = 0
            for line in f:
                # Build the FieldType for 'context': it must be indexed, stored, and tokenized
                fieldtype_context = FieldType()
                fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                fieldtype_context.setStored(True)
                fieldtype_context.setTokenized(True)

                # Build the FieldType for 'phrase': it only needs to be stored
                fieldtype_phrase = FieldType()
                fieldtype_phrase.setStored(True)

                # Split the segmented content so words and their part-of-speech tags are stored separately
                processed_context, processed_phrase = self.process_line(line)

                doc = Document()
                # The 'context' field records the text of the article
                doc.add(Field('context', processed_context, fieldtype_context))
                # The 'phrase' field records the part-of-speech tag of each word in the article
                doc.add(Field('phrase', processed_phrase, fieldtype_phrase))

                # Write the document into the index
                writer.addDocument(doc)

                # Progress tracking
                print("\r", str(line_count), " lines", end="", flush=True)
                line_count = line_count + 1
                if line_count > self.index_limit and not self.training:
                    break

        writer.close()
        print()
Example #24
    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
Example #25
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(False)
        t3.setTokenized(True)  # tokenize with the pre-configured analyzer (here, splitting on whitespace)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        total=0
        file = open(root,"r")
        for line in file.readlines():
            try:
                imgurl, itemurl, content = line.split('\t')
                total += 1
                print total
                print "adding", content
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents",contents,t3))
                writer.addDocument(doc)
            except Exception, e:
                    print "Failed in indexDocs:", e
Example #26
    def to_document(self):
        def bool_to_int(bv):
            if bv:
                return 1
            return 0

        doc = Document()
        doc.add(StringField('func_name', self.func_name, Field.Store.YES))
        doc.add(
            StringField('id_isint',
                        str(bool_to_int(isinstance(self.p_id, (int, long)))),
                        Field.Store.YES))
        if isinstance(self.p_id, unicode):
            doc.add(
                StringField('id', self.p_id.encode('utf8'), Field.Store.YES))
        else:
            doc.add(StringField('id', str(self.p_id), Field.Store.YES))
        doc.add(
            StringField(
                'start_isint',
                str(bool_to_int(isinstance(self.p_start, (int, long)))),
                Field.Store.YES))
        doc.add(StringField('start', str(self.p_start), Field.Store.YES))
        doc.add(StringField('pagesize', str(self.p_pagesize), Field.Store.YES))
        doc.add(StringField('pextra', str(self.p_extra), Field.Store.YES))
        doc.add(StringField('fails', str(self.fails), Field.Store.YES))
        doc.add(
            StringField('finish_time', str(self.finish_time), Field.Store.YES))
        doc.add(StringField('docid', str(self.docid), Field.Store.YES))
        return doc
Example #27
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'gbk')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
Example #28
    def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed",
                                          CompressionTools.compress(bytes))
        stringFldCompressed = StoredField(
            "stringCompressed",
            CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(
            docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(
            CompressionTools.decompressString(
                docFromReader.getBinaryValue("stringCompressed")),
            self.binaryValCompressed)

        reader.close()
Example #29
    def indexDocs(self, root, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            # traverse through the doc directory
            for filename in filenames:
                #	if not filename.endswith('.cdc'):
                #		continue
                try:
                    # only add the filename and path for indexing
                    path = os.path.join(root, filename)
                    print "adding file : ", path
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in ", filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "failed in indexDocs:", e
Example #30
    def index_one(self, article):
        """Create index for one url object in the database.
        """
        try:
            date_published_str = article['date_published'].strftime(
                self.date_format)
        except Exception as e:
            logger.warning('Error when formatting date_published %r: %s ',
                           article['canonical_url'], e)
            return
        doc = Document()
        doc.add(StoredField('group_id', article['group_id']))
        doc.add(StoredField('article_id', article['article_id']))
        doc.add(
            StringField('date_published', date_published_str, Field.Store.YES))
        doc.add(
            SortedDocValuesField('date_published',
                                 BytesRef(date_published_str)))
        doc.add(StoredField('date_published', date_published_str))
        doc.add(StringField('domain', article['domain'], Field.Store.YES))
        doc.add(StringField('site_type', article['site_type'],
                            Field.Store.YES))
        doc.add(
            TextField('canonical_url', article['canonical_url'],
                      Field.Store.YES))
        doc.add(TextField('title', article['title'], Field.Store.YES))
        doc.add(TextField('meta', article['meta'], Field.Store.NO))
        doc.add(TextField('content', article['content'], Field.Store.NO))
        doc.add(StoredField('uq_id_str', article['uq_id_str']))
        self.writer.addDocument(doc)
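The date_published value above is added three ways on purpose: the StringField makes it filterable, the SortedDocValuesField makes it sortable, and the StoredField makes it retrievable. A minimal, hypothetical query that sorts on it (index_dir stands for the Directory behind self.writer):

from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.search import IndexSearcher, Sort, SortField, TermQuery

searcher = IndexSearcher(DirectoryReader.open(index_dir))
hits = searcher.search(TermQuery(Term('domain', 'example.com')), 10,
                       Sort(SortField('date_published', SortField.Type.STRING)))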