def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
        stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))
    
        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)
    
        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        writer.addDocument(doc)
        writer.close()
    
        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)
    
        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed)

        reader.close()
Example #2
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
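A hedged usage sketch for indexDictionary() above, assuming the Lucene 4.x-style API that its Field.Store/Field.Index flags imply; the index path is illustrative.

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
indexDir = SimpleFSDirectory(File("/tmp/dict_index"))  # hypothetical location
config = IndexWriterConfig(Version.LUCENE_CURRENT, StandardAnalyzer(Version.LUCENE_CURRENT))
writer = IndexWriter(indexDir, config)
numDocs = indexDictionary({'a.txt': u'hello world'}, writer)
writer.close()
print "%d docs indexed" % numDocs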
Example #3
def index(personDB, familyDB, relationDB):
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    # indexWriter.setRAMBufferSizeMB(50)  # CHECK whether 256 is better

    mt = matchtext()

    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex','FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)
Example #5
    def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: dict of metadata fields with values
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
        """
        doc = Document()
        total_numTerms=bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object=Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            boost=1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata=json.dumps(metadata)
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
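A minimal, self-contained sketch (plain Python, no Lucene needed) of the per-field boost used above: every field receives 1/sqrt(total number of BOW terms), so fields of longer documents contribute proportionally less.

import math

def bow_field_boost(total_numterms):
    """Boost applied to each field, as in addDocument() above."""
    return 1.0 / math.sqrt(total_numterms) if total_numterms > 0 else 0.0

assert abs(bow_field_boost(100) - 0.1) < 1e-9
assert bow_field_boost(0) == 0.0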
Example #6
 def publish_services(self, service_list):
     transformer = WSDLTransformer()
     current_document = 1
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         if self._document_expansion:
             #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
             bag_of_words = ' '.join(
                 self._semantic_transformer.transform(
                     transformer.transform(wsdl)))
         else:
             #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
             bag_of_words = ' '.join(transformer.transform(wsdl))
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
         current_document += 1
     index_writer.close()
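A hedged companion sketch for the snippet above: BM25 scoring only takes effect at query time when the searcher uses the same Similarity that was set on the writer config (class names are from the PyLucene 4.x packages; adjust to the installed release).

from java.io import File
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory

indexDir = SimpleFSDirectory(File("index/"))   # same directory as above
searcher = IndexSearcher(DirectoryReader.open(indexDir))
searcher.setSimilarity(BM25Similarity())       # mirror writerConfig.setSimilarity()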
Example #7
def generate_indices_from_benchmark(writer, counter):
	javafiles = java_files_from_dir(source_path)
	i = 0
	j = 0
	for javafile in javafiles:
		# print javafile
		i += 1
		if i % 1000 == 0:	# print progress every 1000 files
			print("Counter: %s" % i)
			print "typed_method_call" + str(counter.typed_method_call_count)
		document = Document()
		document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
		try:
			with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
				file_content = f.read().encode("utf-8", errors='ignore')

			ast = parse(file_content, resolve=False)
			if add_code_keyword_into_document(document, file_content, ast, counter):
				writer.addDocument(document)
				j += 1
				if j % 1000 == 0:
					print "Wrote:: %s files" % j

		except Exception as e:
			print("Error: %s" % e)
			continue
	print "Number of files: %s" % i
	print "Number of duplicates: %s" % len(hashes)
	print "%s files has been indexed" % j
Example #8
    def indexDocs(self, root, writer):
        path = root + "/data/*/*.xml"
        # print(path)
        xml_files = glob.glob(path)
        # xml_files = ["HAM2-031201.xml"]
        numDocs = 0
        for xml in xml_files:
            try:
                parser = etree.XMLParser(recover=False, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            except etree.XMLSyntaxError as e:
                parser = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            root = tree.getroot()
            for text in root.iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                # print("adding", doc_no)
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(
                            TextField("contents", contents, Field.Store.YES))
                    else:
                        pass
                        # print("warning: no content in %s" % doc_no)
                    writer.addDocument(doc)
                    numDocs += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return numDocs
Example #9
    def Indexing(self, writer):
        print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
        with open(SEGMENTATION_FILE, 'r') as f:
            line_count = 0
            for line in f:
                # FieldType for the context field: it needs to be indexed, stored and tokenized
                fieldtype_context = FieldType()
                fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                fieldtype_context.setStored(True)
                fieldtype_context.setTokenized(True)

                # FieldType for the phrase field: it only needs to be stored
                fieldtype_phrase = FieldType()
                fieldtype_phrase.setStored(True)

                # split the segmented line so the words and their POS tags are stored separately
                processed_context, processed_phrase = self.process_line(line)

                doc = Document()
                # the context field records the text content of the line
                doc.add(Field('context', processed_context, fieldtype_context))
                # the phrase field records the POS tag of each word in the line
                doc.add(Field('phrase', processed_phrase, fieldtype_phrase))

                # write the document to the index
                writer.addDocument(doc)

                # progress tracking
                print("\r", str(line_count), " lines", end="", flush=True)
                line_count = line_count + 1
                if line_count > self.index_limit and not self.training:
                    break

        writer.close()
        print()
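A hedged read-back sketch for the index built above: since both field types are stored, the segmented text and its POS string can be fetched straight from a stored document. The Directory object is whatever the writer passed into Indexing() was opened on; it is a placeholder here.

from org.apache.lucene.index import DirectoryReader

reader = DirectoryReader.open(directory)  # the writer's Directory (placeholder)
stored = reader.document(0)               # first indexed line
print(stored.get('context'))
print(stored.get('phrase'))
reader.close()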
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #11
    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
Example #12
    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)
Example #13
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #14
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
    def indexDocs(self, sourceDir, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for sourceDir, dirnames, filenames in os.walk(sourceDir):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print(filename)
                try:
                    path = os.path.join(sourceDir, filename)
                    file = open(path, 'r', encoding="utf-8")
                    contents = file.read()
                    #contents = str(filecontent, 'utf-8')
                    #contents = filecontent.encode('utf-8')
                    #print('path', path, len(contents))
                    doc = Document()
                    doc.add(Field("name", filename, t1))  # filename (title)
                    #doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field(queryField, contents, t2))  # content
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                    file.close()
                except NameError:
                    print("Failed in indexDocs:")
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        inFile = open(str(args["inputFile"]))
        indexName = inFile.readline()
        while (indexName != ''):
            print "adding", indexName
            doc = Document()
            doc.add(Field("name", indexName, t1))
            #doc.add(Field("path", root, t1))
            text = inFile.readline()
            if (len(text) > 0):
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % indexName
            indexName = inFile.readline()
            writer.addDocument(doc)
        inFile.close()
    def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed",
                                          CompressionTools.compress(bytes))
        stringFldCompressed = StoredField(
            "stringCompressed",
            CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(
            docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(
            CompressionTools.decompressString(
                docFromReader.getBinaryValue("stringCompressed")),
            self.binaryValCompressed)

        reader.close()
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type!='INTEGER_STORED':
           #print ('field=%s  len=%d'%(field,len(value)))
           print ('field=%s  value=%s'%(field,value))
        else:
           print ('field=%s  value=%d'%(field,value))
        '''

        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except:
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
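A hedged usage sketch for addDoc() above, showing the expected shape of data ({field: (value, type_name)}); w must be an open IndexWriter, and the CUSTOM_FIELD_TEXT* names refer to module-level FieldType constants defined elsewhere.

data = {
    'title': ('A sample title', 'TextField'),
    'category': ('news', 'StringField'),
    'doc_id': (42, 'INTEGER_STORED'),
}
addDoc(w, data)  # w: an open IndexWriter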
    def index (cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
            #       Python list in order to to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded +=1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			# if the key is missing from data, either add it or ignore it, depending on add_field_if_not_exists (True by default)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		# the deletion is intentionally deferred until after this check:
		# the update only proceeds if the modified primary keys do not already exist in the index
		primary_key_update=False
		for key in toupdate.keys():
			if key in primary_keys_map:
				primary_key_update=True
				break
		if primary_key_update == True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			temp=json.dumps(data)
			data_string=base64.b64encode(snappy.compress(temp))
		else:
			temp=json.dumps(data)
			data_string=base64.b64encode(temp)

		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
def create_document(file_name):
    path = './alldocs/' + file_name
    file = open(path)
    doc = Document()
    doc.add(StringField("title", input_file, Field.Store.YES))
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()
    return doc
Example #28
 def _addDocument(self, identifier, isformatof, sort=None):
     doc = Document()
     if isformatof:
         doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
     if sort:
         doc.add(NumericDocValuesField("__sort__", long(sort)))
     consume(self.lucene.addDocument(identifier, doc))
     self.lucene.commit()  # Explicitly, not required: since commitCount=1.
 def addDocument(self, id):
     global answers_train
     preA = answers_train[id]
     doc = Document()
     doc.add(TextField("pa", preA, Field.Store.YES))
     doc.add(StringField("id", str(id), Field.Store.YES))
     self.w.addDocument(doc)
     self.w.commit()
Example #31
def luceneindex(text):
    for n, l in enumerate(text):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        #print( "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs()))
    #print( "Closing index of %d docs..." % writer.numDocs())
    writer.close()  # close once, after all lines have been added
def index_code_snippet(writer):
    HOME = "/Users/Raphael/Downloads/GitArchive"  #29.06.2015, 03.07.2015, 15.07.2015
    jfiles = java_files_from_dir(HOME)

    N_cores = 4

    # print("Number of Java files to process: %s" % (len(jfiles)))
    source_queue = []

    i = 0
    j = 0

    for jfile in jfiles:
        i += 1
        if i % 1000 == 0:
            print("Counter: %s" % i)
            break

        document = Document()
        document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO))

        try:
            with codecs.open(jfile, "r", encoding='utf-8') as f:
                file_content = f.read().encode("utf-8")

            document.add(
                Field("file_content", compress(file_content), Field.Store.YES,
                      Field.Index.NO))
            # Check for duplicates files and accumulate source code
            # hash_v =  str(md5(file_content))
            # if hash_v not in hashes:
            # 	source_queue.append((document, file_content))
            # 	hashes.add(hash_v)

            # Wait until source files
            # if len(source_queue) >= N_cores:
            # 	ast_docs = parallize(source_queue)
            # 	source_queue = []

            # 	for ast, file_content, doc in ast_docs:
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print "Wrote:: %s files" % j

        except Exception as e:
            #traceback.print_exc()
            #print jfile
            print("Error: %s" % e)
            continue

    print "Number of files: %s" % i

    print "Number of duplicates: %s" % len(hashes)

    print("%s files has been indexed" % j)
Example #33
 def create_document(self, contents):
     """Create a Lucene document from the specified contents.
     Contents is a list of fields to be indexed, represented as a dictionary
     with keys 'field_name', 'field_type', and 'field_value'."""
     doc = Document()
     for f in contents:
         doc.add(Field(f['field_name'], f['field_value'],
                       self.ldf.get_field(f['field_type'])))
     return doc
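A hedged usage sketch for create_document() above, illustrating the documented shape of contents; the field names and the 'text'/'keyword' type keys are hypothetical and must exist in the instance's field-definition helper (self.ldf).

contents = [
    {'field_name': 'title', 'field_type': 'text', 'field_value': u'A sample title'},
    {'field_name': 'id', 'field_type': 'keyword', 'field_value': u'doc-001'},
]
doc = indexer.create_document(contents)  # indexer: instance of the class above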
 def index_text(self, sha1, full_text):
     """Index the full text and map it to the source sha1."""
     document = Document()
     document.add(Field("sha1", sha1, ImageIndexer.hash_field))
     if full_text:
         document.add(Field("full_text", full_text, ImageIndexer.text_field))
         self.writer.updateDocument(Term("sha1", sha1), document)
     else:
         logging.info("No text for sha1 %s", sha1)
Example #36
def getDocument(fname):
    doc = Document()
    doc.add(
        Field('filename',
              os.path.split(fname)[-1], Field.Store.YES,
              Field.Index.NOT_ANALYZED))
    doc.add(
        Field('content',
              open(fname).read(), Field.Store.YES, Field.Index.ANALYZED))
    return doc
Example #37
    def _insertDoc(self, writer, content):

        doc = Document()

        doc.add(Field("id", "id" + str(self.docCount),
                      StringField.TYPE_STORED))
        doc.add(Field("content", content, TextField.TYPE_NOT_STORED))

        writer.addDocument(doc)
        self.docCount += 1
def create_document(file_name):
    path = INPUT_DIR + file_name  # assemble the file descriptor
    file = open(path)  # open in read mode
    doc = Document()  # create a new document
    # add the title field
    doc.add(StringField("title", input_file, Field.Store.YES))
    # add the whole book
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()  # close the file pointer
    return doc
Example #39
def index_files():
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"

    NoT = 100000  # Number of Tokens
    print "------------------------------------------------------"
    print "PyLucene Demo started (lucene_demo.py)"
    print "Python version: %d.%d.%d" % (
        sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
    print 'Lucene version:', lucene.VERSION
    print "------------------------------------------------------\n"
    # lucene.initVM()

    # directory = RAMDirectory()
    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    print "Number of indexed documents: %d\n" % writer.numDocs()
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print "Current file:", input_file
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    # doc = create_document(line, input_file) # call the create_document function
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    # print file
                    doc.add(
                        TextField("username", o['user']['screen_name'],
                                  Field.Store.YES))
                    # print "username: "******"text", o['text'], Field.Store.YES))
                    # print "text: " + o['text']
                    if o['user']['location']:
                        doc.add(
                            TextField("location", o['user']['location'],
                                      Field.Store.YES))
                        # print "location: " + o['user']['location']
                    doc.add(TextField("time", o['created_at'],
                                      Field.Store.YES))
                    writer.addDocument(
                        doc)  # add the document to the IndexWriter
    print "\nNumber of indexed documents: %d" % writer.numDocs()
    writer.close()
    print "Finished\n"
    print "-----------------------------------------------------"
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
    def setUp(self):
        super(PhraseQueryTestCase, self).setUp()

        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer = self.getWriter()
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.builder = PhraseQuery.Builder()
Example #43
def create_minidoc(termstring, field='text'):
    # To store term vectors (used for query expansion) we have to use a custom fieldtype
    customfield = FieldType()
    customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    customfield.setStored(True)
    customfield.setTokenized(True)
    customfield.setStoreTermVectors(True)

    doc = Document()
    doc.add(Field(field, termstring, customfield))
    return doc
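A hedged sketch of what the stored term vectors above enable: once the mini-document has been written to an index, its per-term statistics can be read back for query expansion. The Directory and the document id 0 are placeholders.

from org.apache.lucene.index import DirectoryReader

reader = DirectoryReader.open(directory)       # index holding the mini-document
term_vector = reader.getTermVector(0, 'text')  # Terms object, or None if absent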
Example #44
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
def addDoc(w, text):
    """
    add a single doc to the index
    :param w: the index writer
    :param text: the text to index
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    w.addDocument(doc)
 def add_document(self, fields, header, id_):
     doc = Document()
     if len(fields) > len(header):
         sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
         for field in fields:
             sys.stderr.write('%s\n' % field)
         return
     for idx, field in enumerate(fields):
         fname, fieldtype = header[idx]
         if fieldtype is IntField:
             field = int(field)
         doc.add(fieldtype(fname, field, Field.Store.YES))
     self.writer.addDocument(doc)
     self.num_docs += 1
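A hedged usage sketch for add_document() above; header pairs each column name with the PyLucene field class used to build it (IntField and TextField are illustrative choices from org.apache.lucene.document, and indexer is an instance of the class above).

header = [('year', IntField), ('body', TextField)]
indexer.add_document([1999, u'some body text'], header, id_='row-1999')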
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #48
    def add_document(self, url, field, text):
        """
        add a new document to index writer

        input:
            url     the url of the target to be indexed
            field   fieldname of the value that will be indexed
            text    text to be indexed

        """
        doc = Document()
        doc.add(Field('url', url, TextField.TYPE_STORED))
        doc.add(Field(field, text, TextField.TYPE_STORED))
        self.idx_writer.addDocument(doc)
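A hedged usage sketch for add_document() above; the URL, field name, and the crawler instance are illustrative.

crawler.add_document('http://example.com/page', 'content', u'extracted page text')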
    def setUp(self):
        super(PyLuceneThreadTestCase, self).setUp()

        self.classLoader = Thread.currentThread().getContextClassLoader()

        writer = self.getWriter(analyzer=StandardAnalyzer())

        doc1 = Document()
        doc2 = Document()
        doc3 = Document()
        doc4 = Document()
        doc1.add(Field("field", "one", TextField.TYPE_STORED))
        doc2.add(Field("field", "two", TextField.TYPE_STORED))
        doc3.add(Field("field", "three", TextField.TYPE_STORED))
        doc4.add(Field("field", "one", TextField.TYPE_STORED))

        writer.addDocument(doc1)
        writer.addDocument(doc2)
        writer.addDocument(doc3)
        writer.addDocument(doc4)
        writer.commit()
        writer.close()

        self.testData = [('one',2), ('two',1), ('three', 1), ('five', 0)] * 500
        self.lock = threading.Lock()
        self.totalQueries = 0
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt','r',encoding='utf-8')
        picDict = {}
        for line in f.xreadlines():
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
	def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = [], docID = 1, parentID = 0, id = 0):

		realPath = os.path.abspath(root)
		for i in os.listdir(realPath):

			path = os.path.join(realPath, i)
			if os.path.isfile(path):
				#index this file
				doc = Document()

				doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
				doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
				doc.add(Field('id', str(docID), self.mFieldTypes['id']))
				doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

				fd = open(path, 'r')
				content = fd.read()
				fd.close()

				if len(content) > 0:
					doc.add(Field('content', content, self.mFieldTypes['content']))

				self.mIndexWriter.addDocument(doc)
				##################### Logging ##############################
				if IS_DEBUG:
					nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
					parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
					contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
					self.mLog = self.mLog + ( "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )



				docID = docID + 1
				################### index sub commands	
				if os.path.isdir(path + ".sub"):
					parent.append(i)
					docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
					parent.pop()
					
						
		
		if id == 0:
			self.mIndexWriter.commit()
			self.mIndexWriter.close()
			
			if IS_DEBUG:
				loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
				loggingBot.start()
				self.mLog = ""
		return docID
    def testScore(self):
        reactor = CallTrace('reactor')
        settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
        lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
        document = Document()
        document.add(TextField('field', 'x '*100, Field.Store.NO))
        returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

        q = TermQuery(Term("field", 'x'))
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(0.1, result.hits[0].score)

        q.setBoost(10.0)
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(1, result.hits[0].score)
def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    def setUp(self):
        super(BooleanOrTestCase, self).setUp()

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        d = Document()
        d.add(Field(self.FIELD_T, "Optimize not deleting all files",
                    TextField.TYPE_STORED))
        d.add(Field(self.FIELD_C,
                    "Deleted When I run an optimize in our production environment.",
                    TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.close()

        self.searcher = self.getSearcher()
Example #55
    def testNot(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer())

        d1 = Document()
        d1.add(Field("field", "a b", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        query = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")

        topDocs = searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)
Example #56
def generate_document(path, indexer):
    """ Given a file, convert it into a lucene document that
    is ready to be indexed.

    :param path: The file to add to the search index
    :param indexer: The indexer to operate with
    :returns: The index document for the specified file
    """
    name = FILE_TYPES.get(path.rsplit(".", 1)[-1], "")
    data = open(path, "r").read()

    document = Document()
    document.add(Field("path", path, indexer.field_clean))
    document.add(Field("type", name, indexer.field_clean))
    document.add(Field("data", data, indexer.field_dirty))
    return document
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer())
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        builder = PhraseQuery.Builder()
        builder.add(Term("field", "stop"))
        builder.add(Term("field", "words"))
        scoreDocs = searcher.search(builder.build(), 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #58
    def setUp(self):
        super(Test_Bug1564, self).setUp()

        self.analyzer = StandardAnalyzer()
        writer = self.getWriter(analyzer=self.analyzer)

        doc = Document()
        doc.add(Field('all', u'windowpane beplaster rapacious \
        catatonia gauntlet wynn depressible swede pick dressmake supreme \
        jeremy plumb theoretic bureaucracy causation chartres equipoise \
        dispersible careen heard', TextField.TYPE_NOT_STORED))
        doc.add(Field('id', '1', StoredField.TYPE))

        writer.addDocument(doc)
        writer.commit()
        writer.close()
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        query.add(Term("field", "stop"))
        query.add(Term("field", "words"))
        scoreDocs = searcher.search(query, None, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))