Example #1
def index():
    # Initialize lucene and the JVM
    #    lucene.initVM()
    GLOBALDIRECTORY = getDirectory()

    #Indexwriter config
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, tokenCount)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(GLOBALDIRECTORY, config)

    fileNames = getTxtFile(textfileDirectory)  # creates a document for each tweet
    for file in fileNames:
        data = getData(file)

        for tweets in data:
            if 'text' in tweets:
                doc = createDocument_tweet(tweets)
                writer.addDocument(doc)  # add the document to  IndexWriter

        print file
    print "\nNumber of indexed documents: %d" % writer.numDocs(
    )  #number of documents indexed for testing
    writer.close()
    print "Indexing done!\n"
    print "------------------------------------------------------"
    return GLOBALDIRECTORY
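
The createDocument_tweet helper called above is not shown in this example. A minimal sketch of what it might look like follows; the field names and tweet keys are assumptions (they mirror the tweet fields used in Example #20 below), not the original helper.

def createDocument_tweet(tweet):
    # Hypothetical sketch, not the original helper: build a Lucene Document
    # from one decoded tweet dict. Field names are assumptions.
    doc = Document()
    doc.add(TextField("text", tweet['text'], Field.Store.YES))
    if 'user' in tweet and 'screen_name' in tweet['user']:
        doc.add(TextField("username", tweet['user']['screen_name'], Field.Store.YES))
    if 'created_at' in tweet:
        doc.add(TextField("time", tweet['created_at'], Field.Store.YES))
    return doc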
Example #2
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
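
A possible way to drive LuceneIndexer is sketched below; the tab-separated input format and the header layout (a list of (field_name, field_class) pairs consumed by add_document) are assumptions inferred from the class above, not part of the original example.

# Hypothetical usage sketch for LuceneIndexer; input format and field layout are assumptions.
import sys

indexer = LuceneIndexer('/tmp/stdin_index')
header = [('id', StringField), ('title', TextField), ('body', TextField)]
for line_no, line in enumerate(sys.stdin):
    fields = line.rstrip('\n').split('\t')
    indexer.add_document(fields, header, line_no)
indexer.close()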
Example #3
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #4
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #5
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #6
def index(personDB, familyDB, relationDB):
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
#?# indexWriter.setRAMBufferSizeMB(50)  # CHECK 256

    mt = matchtext()

    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex','FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
Example #7
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)

    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #8
 def testAdd(self, filepath):
     config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer())
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     writer = IndexWriter(self.dir, config)
     # True: create a new index; False: build an incremental index
     file = open(filepath)
     contents = unicode(file.read(), 'gbk')
     file.close()
     doc = Document()
     doc.add(Field("name", os.path.basename(filepath),
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
     doc.add(Field("path", filepath,
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
     if len(contents) > 0:
         title = self.getTxtAttribute(contents, 'Title')
         author = self.getTxtAttribute(contents, 'Author')
         language = self.getTxtAttribute(contents, 'Language')
         doc.add(Field("Title", title,
                              Field.Store.YES,
                              Field.Index.ANALYZED))
         doc.add(Field("Author", author,
                              Field.Store.YES,
                              Field.Index.ANALYZED))
         doc.add(Field("Language", language,
                              Field.Store.YES,
                              Field.Index.ANALYZED))
         doc.add(Field("contents", contents,
                              Field.Store.NO,
                              Field.Index.ANALYZED))
     else:
         print "warning: no content in %s" % filename
     writer.addDocument(doc)
     writer.close()
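
getTxtAttribute is not defined in this example. One plausible implementation is sketched below; the "Key: value" line format it assumes is a guess, not the original code.

    # Hypothetical sketch of getTxtAttribute (not the original method); assumes the
    # text files carry "Title: ...", "Author: ...", "Language: ..." header lines.
    def getTxtAttribute(self, contents, attribute):
        import re
        m = re.search(r'^%s:\s*(.+)$' % attribute, contents, re.MULTILINE)
        return m.group(1).strip() if m else u''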
Example #9
class _ChineseRamIndexer:
    def __init__(self):
        indexDir = RAMDirectory()
        analyzer = SmartChineseAnalyzer()
        writerConfig = IndexWriterConfig(analyzer)

        # create new directory, remove previously indexed documents
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writerConfig.setSimilarity(mySimilarity())
        logger.debug('search similarity:{}'.format(
            writerConfig.getSimilarity()))
        self.indexDir = indexDir
        self.writer = IndexWriter(indexDir, writerConfig)

    def add(self, pid, content):
        doc = Document()
        doc.add(StringField("pid", pid, Field.Store.YES))
        doc.add(TextField("content", content, Field.Store.YES))
        self.writer.addDocument(doc)

    def close(self):
        self.writer.close()

    def index_lesson(self, parags):
        for index, content in enumerate(parags):
            pid = 'p' + str(index)
            self.add(pid, content)
        self.close()
Example #10
 def publish_services(self, service_list):
     transformer = WSDLTransformer()
     current_document = 1
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         if self._document_expansion:
             #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
             bag_of_words = ' '.join(
                 self._semantic_transformer.transform(
                     transformer.transform(wsdl)))
         else:
             #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
             bag_of_words = ' '.join(transformer.transform(wsdl))
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
         current_document += 1
     index_writer.close()
Example #11
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #12
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
Example #13
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
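
Once the tag index above is written, it could be queried roughly as in the search examples later in this listing. The sketch below is an assumption built from those examples; it reuses the "content" and "url" field names and assumes the same imports (SimpleFSDirectory, Paths, StandardAnalyzer) as build_index.

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher


def search_tags(dir_path, query_text, top_n=10):
    # Hypothetical search sketch over the index produced by build_index above.
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(dir_path)))
    searcher = IndexSearcher(reader)
    query = QueryParser("content", StandardAnalyzer()).parse(query_text)
    for hit in searcher.search(query, top_n).scoreDocs:
        doc = searcher.doc(hit.doc)
        print("%s %.3f" % (doc.get("url"), hit.score))
    reader.close()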
Example #14
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #15
    def index (cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so we need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded +=1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
Example #16
class TAindexer(object):

    INDEX_FILED_TITLE = "Title"
    INDEX_FILED_POSTID = "PostTypeId"
    INDEX_FILED_AC = "AnswerCount"

    def __init__(self, dir, data_file):
        self.dir = dir
        self.data_file = data_file
        index_dir = FSDirectory.open(Paths.get(self.dir))
        analyzer = StandardAnalyzer()
        writer_config = IndexWriterConfig(analyzer)
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(index_dir, writer_config)

    def get_document(self, title, post_id, ac):
        doc = Document()
        doc.add(TextField(self.INDEX_FILED_TITLE, title, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_POSTID, post_id, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_AC, ac, Field.Store.YES))
        return doc

    def index_TAs(self):
        start_time = datetime.now()
        print "Start to index " + self.data_file + " @ " + str(start_time)
        count = 0
        for event, elem in ET.iterparse(self.data_file):
            print datetime.now()
            if event == 'end':
                if elem.get('Title'):
                    title = elem.get('Title')
                else:
                    title = ''
                if elem.get('PostTypeId'):
                    post_id = elem.get('PostTypeId')
                else:
                    post_id = ''

                if elem.get('AnswerCount'):
                    answer_count = elem.get('AnswerCount')
                else:
                    answer_count = ''
                print post_id
                print answer_count
                if str(post_id) == '1':
                    print self.dir
                    doc = self.get_document(title.encode("utf-8"),
                                            post_id.encode("utf-8"),
                                            answer_count.encode("utf-8"))
                    self.writer.addDocument(doc)
                    print count
                    count = count + 1

            elem.clear()
        self.writer.close()
        print "Indexing " + self.data_file + " complete @ " + str(
            datetime.now())
        print "Indexing time is " + str(datetime.now() - start_time)
Example #17
    def index(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        doc = self.buildDocument(data['fields'], data['record'])
        writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #18
class Indexer(object):
    def __init__(self, docDir, indexDir, analyzer):
        #set index dir
        if not os.path.exists(indexDir):
            os.makedirs(indexDir)
        self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
        self.docDir = docDir

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        writerConfig = IndexWriterConfig(self.analyzer)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.indexing()

    def indexing(self):
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.NONE)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        for filename in os.listdir(self.docDir):
            if filename.endswith('.html') or filename.endswith('.htm'):
                with open(os.path.join(self.docDir, filename)) as f:
                    url = f.readline().strip()
                    htmlString = f.read()
                    #remove HTML markup
                    soup = BeautifulSoup(htmlString, 'html.parser')
                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()  # rip it out
                    # get text
                    text = soup.get_text()
                    # break into lines and remove leading and trailing space on each
                    lines = (line.strip() for line in text.splitlines())
                    # break multi-headlines into a line each
                    chunks = (phrase.strip() for line in lines
                              for phrase in line.split("  "))
                    # drop blank lines
                    text = '\n'.join(chunk for chunk in chunks if chunk)
                    #text = soup.get_text().strip()
                    title = soup.title.string
                    #print text
                doc = Document()
                doc.add(Field("link", url, t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("text", text, t2))
                self.writer.addDocument(doc)
                print "index document", filename

        self.writer.commit()
        self.writer.close()
Example #19
    def rebuildIndex(self, data):
        writer = IndexWriter(
            self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))

        for record in data['records']:
            doc = self.buildDocument(data['fields'], record)
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #20
def index_files():
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"

    NoT = 100000  # Number of Tokens
    print "------------------------------------------------------"
    print "PyLucene Demo started (lucene_demo.py)"
    print "Python version: %d.%d.%d" % (
        sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
    print 'Lucene version:', lucene.VERSION
    print "------------------------------------------------------\n"
    # lucene.initVM()

    # directory = RAMDirectory()
    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    print "Number of indexed documents: %d\n" % writer.numDocs()
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print "Current file:", input_file
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    # doc = create_document(line, input_file) # call the create_document function
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    # print file
                    doc.add(
                        TextField("username", o['user']['screen_name'],
                                  Field.Store.YES))
                    # print "username: "******"text", o['text'], Field.Store.YES))
                    # print "text: " + o['text']
                    if o['user']['location']:
                        doc.add(
                            TextField("location", o['user']['location'],
                                      Field.Store.YES))
                        # print "location: " + o['user']['location']
                    doc.add(TextField("time", o['created_at'],
                                      Field.Store.YES))
                    writer.addDocument(
                        doc)  # add the document to the IndexWriter
    print "\nNumber of indexed documents: %d" % writer.numDocs()
    writer.close()
    print "Finished\n"
    print "-----------------------------------------------------"
Example #21
class Indexer(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param output: The output directory of the underlying index
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
Example #22
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #23
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()


    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
Example #24
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()

    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
Example #25
class PyLuceneRetrievalSystem(object):
    def __init__(self):
        self.num_doc = 0
        self.directory = RAMDirectory()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.writer = IndexWriter(self.directory, self.config)

    def parse_articles(self, file_name):
        raw_articles = open(file_name).read()
        parsed_articles = BeautifulSoup(raw_articles, 'lxml')

        doc_nos = parsed_articles.find_all('docno')
        doc_texts = parsed_articles.find_all('text')
        self.num_doc = len(doc_nos)

        bar = ProgressBar()

        for i in bar(range(self.num_doc)):
            doc = Document()
            doc.add(
                Field('docno', doc_nos[i].string.strip(),
                      StringField.TYPE_STORED))
            doc.add(
                Field(
                    'content',
                    re.sub('[^a-zA-Z0-9\n]', ' ',
                           doc_texts[i].get_text()).lower(),
                    TextField.TYPE_STORED))
            self.writer.addDocument(doc)

        self.writer.close()

    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query, ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #26
    def indexing(self):
        docs = self.text.load_seg_without_stopword_data()

        if (os.path.exists(self.index_dir)):
            r = input("Indexing Dir has existed! Continue indexing?")
            if (r.lower() != 'y'):
                return -1

        if (not os.path.exists(self.index_dir)):
            os.makedirs(self.index_dir)
        store = SimpleFSDirectory(Paths.get(self.index_dir))

        # todo
        # version.LUCENE_6_5_0
        # analyzer = CJKAnalyzer(CharArraySet.EMPTY_SET)
        # analyzer =SmartChineseAnalyzer()

        # analyzer = StandardAnalyzer(Version.LUCENE_6_5_0)
        # index_writer = IndexWriter(store,analyzer,True,IndexWriter.MaxFieldLength(512))
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        id_conf = FieldType()
        id_conf.setStored(True)
        id_conf.setTokenized(False)
        id_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        date_conf = FieldType()
        date_conf.setStored(True)
        date_conf.setTokenized(True)
        date_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        content_conf = FieldType()
        content_conf.setStored(True)
        content_conf.setTokenized(True)
        content_conf.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for n, i in enumerate(docs):
            document = Document()

            for key, content in i.items():
                if (key == 'PubDate'):
                    document.add(Field(key, content, date_conf))
                else:
                    document.add(Field(key, content, content_conf))
            document.add(Field('id', str(n), id_conf))
            writer.addDocument(document)
            if (n % 1000 == 0):
                print(n)

        writer.commit()
        writer.close()
Example #27
class IndexBuilder(object):
    def __init__(self, index_path, update=False):
        dir = FSDirectory.open(Paths.get(index_path))
        analyzer = StandardAnalyzer()
        iwc = IndexWriterConfig(analyzer)
        if update:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(dir, iwc)

    def index_docs(self, input_documents):
        for document in tqdm(input_documents, total=len(input_documents)):
            doc = Document()
            doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
            doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
            type = FieldType()
            type.setIndexOptions(
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
            type.setStored(True)
            type.setStoreTermVectors(True)
            type.setTokenized(True)
            if ".W" in document and ".M" in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".M"].lower() + " " +
                                               document[".T"].lower() +
                                               document[".W"].lower())), type))
            elif ".M" in document and ".W" not in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".M"].lower() + " " +
                                               document[".T"].lower())), type))
            elif ".M" not in document and ".W" in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".T"].lower() +
                                               document[".W"].lower())), type))
            elif ".M" not in document and ".W" not in document:
                doc.add(
                    Field("text",
                          " ".join(tokenizer.tokenize(document[".T"].lower())),
                          type))
            if self.writer.getConfig().getOpenMode(
            ) == IndexWriterConfig.OpenMode.CREATE:
                self.writer.addDocument(doc)
            else:
                self.writer.updateDocument(Term(".U", document[".U"]), doc)
        self.writer.close()
Example #28
    def index(self):
        """
        This function is used to index the preprocessed data.
        The inverted index will be saved to ./index/ folder
        business_id, name, address, categories, review and tip data are indexed.
        """
        print "INDEXING..."
        lucene.initVM()
        indexDir = SimpleFSDirectory(File("index/"))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1,
                                         StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)

        # each business indexed as a document
        for key, business in self.data.items():
            doc = Document()
            text = ""
            doc.add(
                Field("id", business["business_id"], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field("name", business["name"], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field("address", business["full_address"], Field.Store.YES,
                      Field.Index.ANALYZED))
            cat_text = "\n".join(business["categories"])
            doc.add(
                Field("category", cat_text, Field.Store.YES,
                      Field.Index.ANALYZED))

            # combine all reviews of a business together
            review_text = ""
            for review in business["review"]:
                review_text += review["text"]
            # combine all tip of a business together
            tip_text = ""
            for tip in business["tip"]:
                tip_text += tip["text"]

            # concatenate the data to be indexed and add it as one field
            text += business["name"]
            text += business["full_address"]
            text += cat_text
            text += review_text
            text += tip_text
            doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))

            # add the business doc to writer
            writer.addDocument(doc)

        writer.close()
Example #29
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #30
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #31
def create_miniindex(docs):
    index_store = RAMDirectory()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    for doc in docs:
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return index_store
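
The docs passed to create_miniindex are plain Lucene Document objects. A minimal way to build them is sketched below; the field names and sample texts are assumptions.

# Hypothetical usage of create_miniindex; field names are assumptions.
docs = []
for i, text in enumerate(["first sample text", "second sample text"]):
    doc = Document()
    doc.add(StringField("id", str(i), Field.Store.YES))
    doc.add(TextField("text", text, Field.Store.YES))
    docs.append(doc)
index_store = create_miniindex(docs)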
Example #32
class DocRepo:
    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)

    def addDocument(self, id):
        global answers_train
        preA = answers_train[id]
        doc = Document()
        doc.add(TextField("pa", preA, Field.Store.YES))
        doc.add(StringField("id", str(id), Field.Store.YES))
        self.w.addDocument(doc)
        self.w.commit()

    def __del__(self):
        self.w.close()

    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
Example #33
    def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(self.dir, config)
        # True: create a new index; False: build an incremental index

        noIndexedString = FieldType()
        noIndexedString.setTokenized(False)
        noIndexedString.setIndexed(False)
        noIndexedString.setStored(True)

        try:
            print "adding", goodname

            goodname_s = unicode(goodname, 'utf8')
            seg_list_good = jieba.cut(goodname_s, cut_all=False)
            goodname_s = " ".join(seg_list_good)  # 默认模式

            shopname_s = unicode(shopname, 'utf8')
            seg_list_shop = jieba.cut(shopname_s, cut_all=False)
            shopname_s = " ".join(seg_list_shop)  # 默认模式

            shopnameField = Field("shopName", shopname, noIndexedString)
            shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
            goodnameField = Field("goodName", goodname, noIndexedString)
            goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
            salenumField = IntField("saleNum", salenum, Field.Store.YES)
            priceField = DoubleField("price", price, Field.Store.YES)
            urlField = Field("url", url, noIndexedString)
            pictureField = StringField("pictureName", picturename, Field.Store.YES)
            commentField = Field("comments", comment, noIndexedString)
            historyPriceField = Field("historyPrice", historyprice, noIndexedString)

            doc = Document()
            doc.add(shopnameField)
            doc.add(shopnameField_s)
            doc.add(goodnameField)
            doc.add(goodnameField_s)
            doc.add(salenumField)
            doc.add(priceField)
            doc.add(urlField)
            doc.add(pictureField)
            doc.add(commentField)
            doc.add(historyPriceField)

            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
Example #34
 def buildIndex(self, inputFile):
     analyzer = self.getAnalyzer()
     iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     
     iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf)
     
     # read through input file and write out to lucene
     counter = 0
     linesReadCounter = 0
     
     with open(inputFile, 'r') as lines:
         linesRead = 0
         
         for line in lines:
             try:
                 linesRead+=1
                 
                 if linesRead % 1000 == 0:
                     print "%d lines read" % linesRead
                     
                 cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                 concept = concept.strip()
                 cui = cui.strip()
                 
                 strNorm = self.normalizeCasePunct(concept)
                 strSorted = self.sortWords(strNorm)
                 strStemmed = self.stemWords(strNorm)
                 strStemmedSorted = self.stemWords(strSorted)
       
                 fdoc = Document()
                 
                 counter +=1
                 fid = counter
                 
                 fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                 writer.addDocument(fdoc)
                 if fid % 1000 == 0:
                     writer.commit()
             except:
                 "Skipping line: %s" % line
                 
     writer.commit()
     writer.close()
Example #35
class QAindexer(object):

    INDEX_FILED_TITLE = "Title"
    INDEX_FILED_BODY = "Body"

    def __init__(self, dir, data_file):
        self.dir = dir
        self.data_file = data_file
        index_dir = FSDirectory.open(Paths.get(self.dir))
        analyzer = StandardAnalyzer()
        writer_config = IndexWriterConfig(analyzer)
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(index_dir, writer_config)

    def get_document(self, title, body):
        doc = Document()
        doc.add(TextField(self.INDEX_FILED_TITLE, title, Field.Store.YES))
        doc.add(TextField(self.INDEX_FILED_BODY, body, Field.Store.YES))
        return doc

    def index_QAs(self):
        print "Start to index " + self.data_file + " @ " + str(datetime.now())
        count = 0
        for event, elem in ET.iterparse(self.data_file):
            if event == 'end':
                if elem.get('Title'):
                    title = elem.get('Title')
                    print title
                else:
                    title = ''
                if elem.get('Body'):
                    body = elem.get('Body')

                    body = re.sub(r'</?\w+[^>]*>', '', body)
                    print body
                else:
                    body = ''

                doc = self.get_document(title.encode("utf-8"),
                                        body.encode("utf-8"))

                self.writer.addDocument(doc)
                print count
                count = count + 1

            elem.clear()
        self.writer.close()
        print "Indexing " + self.data_file + " complete @ " + str(
            datetime.now())
Example #36
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)), writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
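
A minimal call to index() might look like the sketch below; the documents list, destination path, and analyzer choice are assumptions, using the Lucene 3.x-era Field API seen in the other examples of that vintage.

# Hypothetical usage sketch for index(); paths and field names are assumptions.
docs = []
doc = Document()
doc.add(Field("text", "sample body", Field.Store.YES, Field.Index.ANALYZED))
docs.append(doc)
index(StandardAnalyzer(Version.LUCENE_30), "/tmp/my_index", docs)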
Example #37
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()
Example #38
    def xmlrpc_indexDocument(self, instance, id, text):
        """Index a new document."""
        self.xmlrpc_unindexDocument(instance, id)

        # Create a document and add two fields to it. 
        doc = Document()
        doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
        doc.add(Field('instance', instance, Field.Store.YES, Field.Index.UN_TOKENIZED))

        # Write the document into the index.
        writer = IndexWriter(self.indexPath, self.analyzer, 0)
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
        log('Insert: Instance: %s Document: %s' %(instance, id))
        return 1
Example #39
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #40
def indexing():
    print("建立索引,文本文件夹 [%s] ..." % TEXT_DIR)
    create_dir(INDEX_DIR)
    directory = SimpleFSDirectory(Paths.get(INDEX_DIR))
    config = IndexWriterConfig(ANALYZER)
    writer = IndexWriter(directory, config)

    for x in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
        title, post, terms = get_terms(x)
        doc = Document()
        if terms:
            doc.add(Field("title", title, TextField.TYPE_STORED))
            doc.add(Field("post", post, TextField.TYPE_STORED))
            doc.add(Field("terms", terms, TextField.TYPE_STORED))
            writer.addDocument(doc)

    writer.commit()
    writer.close()
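
# A minimal driver sketch for indexing() above; it assumes the module-level
# TEXT_DIR, INDEX_DIR, ANALYZER and get_terms() that the function references
# are defined elsewhere, and that the JVM is started exactly once before use.
if __name__ == "__main__":
    lucene.initVM()
    indexing()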
Ejemplo n.º 41
0
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError(
            "documents should be iterable of type Document! Given: %s" %
            type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)),
                         writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
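
# A hedged usage sketch for index() above; the index path and sample texts are
# placeholders, the 3.x-style Field constructor mirrors other snippets in this
# listing, and lucene.initVM() is assumed to have been called already.
sample_docs = []
for text in ["first sample document", "second sample document"]:
    d = Document()
    d.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    sample_docs.append(d)
index(StandardAnalyzer(Version.LUCENE_30), "/tmp/sample_index", sample_docs)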
Ejemplo n.º 42
0
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()
Ejemplo n.º 43
0
class PyLuceneRetrievalSystem(object):
    def __init__(self):
        self.num_doc = 0
        self.directory = RAMDirectory()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.writer = IndexWriter(self.directory, self.config)

    def parse_articles(self, file_name):
        raw_articles = open(file_name).read()
        parsed_articles = BeautifulSoup(raw_articles, 'lxml')

        doc_nos = parsed_articles.find_all('docno')
        doc_texts = parsed_articles.find_all('text')
        self.num_doc = len(doc_nos)

        bar = ProgressBar()

        for i in bar(range(self.num_doc)):
            doc = Document()
            doc.add(Field('docno', doc_nos[i].string.strip(), StringField.TYPE_STORED))
            doc.add(Field('content', re.sub('[^a-zA-Z0-9\n]', ' ', doc_texts[i].get_text()).lower(), TextField.TYPE_STORED))
            self.writer.addDocument(doc)

        self.writer.close()

    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
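
# A hedged usage sketch for the retrieval class above; the TREC-style corpus
# file name and the query string are invented, and lucene.initVM() is assumed
# to have been called before the class is instantiated.
system = PyLuceneRetrievalSystem()
system.parse_articles("ap_articles.sgml")            # hypothetical input file
system.get_sorted_results("airline safety report")   # prints the top 10 hits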
Ejemplo n.º 44
0
    def index(self):
        """
        This function is used to index the preprocessed data.
        The inverted index will be saved to ./index/ folder
        business_id, name, address, categories, review and tip data are indexed.
        """
        print "INDEXING..."
        lucene.initVM()
        indexDir = SimpleFSDirectory(File("index/"))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)

        # each business indexed as a document
        for key, business in self.data.items():
            doc = Document()
            text = ""
            doc.add(Field("id", business["business_id"], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("name", business["name"], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("address", business["full_address"], Field.Store.YES, Field.Index.ANALYZED))
            cat_text = "\n".join(business["categories"])
            doc.add(Field("category", cat_text, Field.Store.YES, Field.Index.ANALYZED))

            # combine all reviews of a business together
            review_text = ""
            for review in business["review"]:
                review_text += review["text"]
            # combine all tip of a business together
            tip_text = ""
            for tip in business["tip"]:
                tip_text += tip["text"]

            # concatenate the data to be indexed and add it as one field
            text += business["name"]
            text += business["full_address"]
            text += cat_text
            text += review_text
            text += tip_text
            doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
            
            # add the business doc to writer
            writer.addDocument(doc)

        writer.close()
Ejemplo n.º 45
0
 def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     mt = matchtext()
     writer.deleteDocuments(Term('uid', pid1))
     writer.deleteDocuments(Term('uid', pid2))
     p = personDB.find_one({'_id': pid1})
     matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
     doc = Document()
     doc.add(Field('uid',str(pid1), StringField.TYPE_STORED))
     doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
     doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
     doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Ejemplo n.º 46
0
    def update(self):
        delete(self._dir, self._counter)

        index_dir = SimpleFSDirectory(Paths.get(self._dir))
        config = IndexWriterConfig(StandardAnalyzer())
        index_writer = IndexWriter(index_dir, config)

        for key, val in self._data.items():
            document = Document()
            document.add(Field('id', key, StringField.TYPE_STORED))
            for k, v in val.items():
                if v:
                    if k == 'text':
                        document.add(Field('text', v, TextField.TYPE_STORED))
                    else:
                        document.add(Field(k, v, StringField.TYPE_STORED))
            index_writer.addDocument(document)
        index_writer.commit()
        index_writer.close()
Ejemplo n.º 47
0
class Indexer:
	
	def __init__(self, writerConfig, indexDir):
		
		lucene.initVM()

		self.mIndexDir = SimpleFSDirectory(File(indexDir))
		self.mConfig = writerConfig
		self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
	

	def index(self, root):

		t = FieldType()
		t.setIndexed(True)
		t.setStored(True)
		t.setTokenized(True)
		t.setStoreTermVectors(True)
		
		for path, dirs, files in os.walk(root):
			
			for file in files:
				
				filePath = os.path.join(path, file)
				fd = open(filePath)
				content = unicode(fd.read(), 'iso-8859-1')
				fd.close()
				
				doc = Document()
				doc.add(Field('name', file, StringField.TYPE_STORED))

				parent = os.path.split(path)[1]
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if len(content) > 0:
					doc.add(Field('content', content, t))

				print 'Indexing %s' % file
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()
Ejemplo n.º 48
0
def lucene_indexing():
    lucene.initVM()
    index_dir = os.getcwd()
    dir = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_48)
    index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer);
    index_writer = IndexWriter(dir, index_writer_config)

    for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        with open(tfile, 'r') as f:
            content = f.read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        document.add(Field("title", tfile, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
    print index_writer.numDocs()
    index_writer.close()
Ejemplo n.º 49
0
    def index(self):
        # if exists sent_index, delete and create a new one
        doc_tool.cleardir(index_root)
        doc_tool.mkdir(index_root)

        index_dir = FSDirectory.open(Paths.get(index_root))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_dir, writer_config)

        ft1 = FieldType()
        ft1.setStored(True)
        ft1.setIndexOptions(IndexOptions.NONE)

        ft2 = FieldType()
        ft2.setStored(False)
        ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        doc_list = self.doc()
        file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
        file_list = os.listdir(file_path)

        num = 0
        for file in file_list:
            docs = doc_tool.load_json_file(file_path, file)
            for page_identifier in docs:
                if page_identifier in doc_list:
                    num += 1
                    for sent_number in docs[page_identifier]:
                        sentence_text = self.process_sent(
                            docs[page_identifier][sent_number])
                        doc = Document()
                        doc.add(Field("page_identifier", page_identifier, ft1))
                        doc.add(Field("sentence_number", sent_number, ft1))
                        doc.add(Field("sentence_text", sentence_text, ft2))
                        writer.addDocument(doc)
                    print(num)

        writer.commit()
        writer.close()
        index_dir.close()
Ejemplo n.º 50
0
 def __index(self, emailInfo):
     from org.apache.lucene.index import IndexWriterConfig
     from org.apache.lucene.util import Version
     from org.apache.lucene.analysis.standard import StandardAnalyzer
     analyser = StandardAnalyzer(Version.LUCENE_33)
     conf = IndexWriterConfig(Version.LUCENE_33, analyser)
     from org.apache.lucene.store import FSDirectory
     from java.io import File
     storage = File.createTempFile(u'Tubelight-', '.index')
     storage.delete()
     storage.mkdir()
     storage.deleteOnExit()
     self.storage = storage.getAbsolutePath()
     from java.io import File
     self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
     directory = FSDirectory.open(storage)
     from org.apache.lucene.index import IndexWriter
     iw = IndexWriter(directory, conf)
     from us.d8u.tubelight import Configuration
     addr = emailInfo[Configuration.EmailAddressKey]
     (username, server) = addr.split('@')
     from java.lang import System
     System.setProperty("mail.imap.partialfetch", "false")
     urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
     from javax.mail import Session
     session = Session.getDefaultInstance(System.getProperties(), None).getStore(emailInfo[Configuration.EmailProtocolKey])
     session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
     folder = session.getDefaultFolder()
     for m in folder.getMessages():
         from org.apache.lucene.document import Document
         d = Document()
         subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
         # build a comma-separated list of all recipient addresses
         toSrc = u', '.join([str(r) for r in m.getAllRecipients()])
         to = Field("to", toSrc, Field.Store.YES, Field.Index.ANALYZED)
         d.add(to)
         d.add(subject)
         iw.addDocument(d)
     iw.commit()
     self.searcher = IndexSearcher(directory)
Ejemplo n.º 51
0
    def index (cls, indexDir, taxoDir, facets_config):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        config = IndexWriterConfig(Version.LUCENE_48,
                                   WhitespaceAnalyzer(Version.LUCENE_48))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # obtain the sample facets for current document
            facets = categories[docNum]
            author = authors[docNum]
            # ... and use the FacetField class for adding facet fields to
            # the Lucene document (and via FacetsConfig to the taxonomy index)
            doc.add(FacetField("Author", [author]));
            for f in facets:
                doc.add(FacetField("Categories", f))
            # finally add the document to the index
            iw.addDocument(facets_config.build(taxo, doc));
            nDocsAdded += 1

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        iw.close()
        taxo.close()
        print "Indexed %d documents with facets." % nDocsAdded
Ejemplo n.º 52
0
    def index(self, personDB, familyDB, relationDB):
        """
        indexes a database
        Field match includes information about parents and is used to find matches
        Field text has Ids, names, places, and dates and is used to find a person/family
        """
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(self.indexDir, config)
        #indexWriter.setRAMBufferSizeMB(256)  #?

        mt = matchtext()

        for p in personDB.find({}, no_cursor_timeout=True):
            matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
            doc = Document()
            doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
            doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
            doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
            doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)

        #Family matchtext
        for f in familyDB.find():
            #matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
            doc = Document()
            doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
            #doc.add(Field('sex','FAM', StringField.TYPE_STORED))
            #doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
            txt = f['_id']
            if 'refId' in f: txt += ' ' + f['refId']
            doc.add(Field("text", txt, TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
        return
Ejemplo n.º 53
0
def createind(product,url):
	"This function creates index for lucene"
	global counter
	counter += 1
	adId = counter
	adLine = product
	field_string = chunker(product.lower())
	field_related_words = getDbpediaMatches(product, field_string)
	url = url    

	lucene.initVM()
	# 1. create an index
	index_path = File("Home/WishMatcherIndex")
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
	index = SimpleFSDirectory(index_path)

	# 2. fill the index
	config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(index, config)
	#for title in TITLES:
	import time
	millis = int(round(time.time() * 1000))
	
	userid = str(millis)
	
	doc = Document()
	doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldString", field_string, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(doc)
	print(adId)
	# 3. close resources
	writer.close()
	index.close()	
	return ""
Ejemplo n.º 54
0
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
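
# A hedged driver sketch for create_index(); the stop-word list below is only a
# small placeholder for the module-level `stopwords` the function expects, and
# the index path (like the hard-coded input file path above) is a placeholder.
stopwords = [u"a", u"an", u"and", u"of", u"the"]
lucene.initVM()
create_index("f:/nlp/data/questions/index")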
Ejemplo n.º 55
0
		# docList = list()  #create a list to store all strings in this document
		docName = document.strip('\n')
		docPath = '/home/650/resources/Homework1/cranfieldDocs'
		tree = ET.parse(docPath + '/' + docName)
		root = tree.getroot()
		
		docId = root.find('DOCNO').text
		title = root.find('TITLE').text
		author = root.find('AUTHOR').text
		biblio = root.find('BIBLIO').text
		text = root.find('TEXT').text
		# print docId, title
		# create one Lucene document per Cranfield file, attach all of its
		# fields, and add it to the index once
		files = Document()
		files.add(Field("DOCNO", docId, Field.Store.YES, Field.Index.ANALYZED))
		files.add(Field("TITLE", title, Field.Store.YES, Field.Index.ANALYZED))
		files.add(Field("AUTHOR", author, Field.Store.YES, Field.Index.ANALYZED))
		files.add(Field("BIBLIO", biblio, Field.Store.YES, Field.Index.ANALYZED))
		files.add(Field("TEXT", text, Field.Store.YES, Field.Index.ANALYZED))
		writer.addDocument(files)

	# print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs())
	# print "Closing index of %d docs..." % writer.numDocs()
	writer.close()

		
Ejemplo n.º 56
0
  indexDir = SimpleFSDirectory(File("lucene/"))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)
 
  print "%d docs in index" % writer.numDocs()
  print "Reading lines from sys.stdin..."
  header=[]
  for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    for (idx,field) in enumerate(fields):
        if n == 0:
            typechar = field[-1]
            if typechar not in set(['t','s','i']):
                sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
                exit(-1) 
            header.append([field,LUCENE_TYPES[typechar]])
        else:
            (fname,fieldtype) = header[idx]
            if fieldtype is IntField:
                #sys.stdout.write("int field %s:%s\n" % (fname,field))
                field = int(field) 
            doc.add(fieldtype(fname, field, Field.Store.YES))  #Field.Store.YES, Field.Index.ANALYZED))
            #doc.add(Field(fieldtype, field, Field.Store.YES, Field.Index.ANALYZED))
            #doc.add(fieldtype(header[idx][1],field,Field.Store.YES)
    writer.addDocument(doc)
  print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs())
  print "Closing index of %d docs..." % writer.numDocs()
  writer.close()

Ejemplo n.º 57
0
class IndexingEngine():

	def __init__(self):

		self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
		self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
		self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()


		############################# Writer Configurattion #####################################
		map = HashMap()
		map.put('name', self.mAnalyzers['name'])
		map.put('parent', self.mAnalyzers['parent'])
		map.put('content', self.mAnalyzers['default'])
		map.put('id', self.mAnalyzers['id'])		

		analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

		self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
		self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)

		if settings.ADMINS_ENGINE.mSimilarity != None:
			self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
		########################################################################################


		directory = SimpleFSDirectory(File(self.mIndexDirectory))
		self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)


		############################# FieldType Prepration #####################
		nameField = FieldType()
		nameField.setIndexed(True)
		nameField.setStored(True)
		nameField.setTokenized(True)
		nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		parentField = FieldType()
		parentField.setIndexed(True)
		parentField.setStored(True)
		parentField.setTokenized(True)
		parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		contentField = FieldType()
		contentField.setIndexed(True)
		contentField.setStored(True)
		contentField.setTokenized(True)
		contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

		idField = FieldType()
		idField.setIndexed(True)
		idField.setStored(True)
		idField.setTokenized(False)
		idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)


		self.mFieldTypes = {
			'name' 		: nameField,
			'parent'	: parentField,
			'content'	: contentField,
			'id'		: idField
		}
		#######################################################################

		self.mLog = ""

	

	def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = [], docID = 1, parentID = 0, id = 0):

		realPath = os.path.abspath(root)
		for i in os.listdir(realPath):

			path = os.path.join(realPath, i)
			if os.path.isfile(path):
				#index this file
				doc = Document()

				doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
				doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
				doc.add(Field('id', str(docID), self.mFieldTypes['id']))
				doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

				fd = open(path, 'r')
				content = fd.read()
				fd.close()

				if len(content) > 0:
					doc.add(Field('content', content, self.mFieldTypes['content']))

				self.mIndexWriter.addDocument(doc)
				##################### Logging ##############################
				if IS_DEBUG:
					nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
					parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
					contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
					self.mLog = self.mLog + ( "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )



				docID = docID + 1
				################### index sub commands	
				if os.path.isdir(path + ".sub"):
					parent.append(i)
					docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
					parent.pop()
					
						
		
		if id == 0:
			self.mIndexWriter.commit()
			self.mIndexWriter.close()
			
			if IS_DEBUG:
				loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
				loggingBot.start()
				self.mLog = ""
		return docID
Ejemplo n.º 58
0
def main(indexDir, inputDir):
	"""Creates a Lucene Index, and indexes every .json file it finds.
	It utilizes a stopwords.txt to filter out stop words"""
	lucene.initVM()

	logger.info("Loading stop words from stopwords.txt")
	f = open('stopwords.txt', 'r')
	stopwords = set([])
	for line in f:
		stopwords.add(line.strip())
	f.close()
	logger.debug('Stop words: %s' % str(stopwords))
	temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)

	for stopword in stopwords:
		temp.add(stopword)

	stopwords = temp

	# Create index
	logger.info("Creating Lucene index [%s]..." % indexDir)

	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
	writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
	writer = IndexWriter(dir, writerConfig)

	logger.info("Currently there are %d documents in the index..." % writer.numDocs())

	# Index documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	for f in onlyfiles:
		try:
			journal_code = f.split('.')[0]
			f = join(inputDir, f)
			json_data = open(f)
			data = json.load(json_data)
			for entry in data:
				doc = Document()
				doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
				doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			json_data.close()
		except (IOError) as v:
			try:
				(code, message) = v
			except:
				code = 0
				message = v
			logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
	logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

	# Wrap it up
	#logger.info("About to optimize index of %d documents..." % writer.numDocs())
	#writer.optimize()
	#logger.info("...done optimizing index of %d documents" % writer.numDocs())

	logger.info("Closing index of %d documents..." % writer.numDocs())
	writer.close()

	reader = IndexReader.open(dir)
	with open('all.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for i in xrange(0, reader.numDocs()):
			doc = reader.document(i)
			csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \
				doc.get('title').strip().replace(',', '\,').encode('utf8')])
def store(primary_keys_map,to_be_compressed_input,collection_name,data,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT	
	print "started indexing input data......"
	
	#extracting values
	try:
		contents=json.loads(data)
	except:
		return 100


	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	

	#checking for existance of record with same primary_key set
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(contents[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106
	except:
		pass 	 
	
	


	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)
	#fix this later.....FieldType not defined
	#field_type=FieldType()
	#field_type.setIndexed(True)
	#field_type.setStored(False)
	#field_type.setTokenized(False)
	
	try:
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,contents[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data=snappy.compress(data)
		field=Field("$DATA$",data,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
		if commit==True:
			writer.commit()
		writer.close()
		return 0
	except:
		return 102