def __init__(self, startJVM=False):
     if startJVM:
         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     
     self.STORE_DIR = "index_dir"
     self.store = SimpleFSDirectory(File(self.STORE_DIR)) 
     
     tmp_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) 
     self.analyzer = LimitTokenCountAnalyzer(tmp_analyzer, 10000) 
     
     config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(self.store, config)
def createind(product,url):
	"This function creates index for lucene"
	global counter
	counter += 1
	adId = counter
	adLine = product
	field_string = chunker(product.lower())
	field_related_words = getDbpediaMatches(product, field_string)

	lucene.initVM()
	# 1. create an index
	index_path = File("Home/WishMatcherIndex")
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
	index = SimpleFSDirectory(index_path)

	# 2. fill the index
	config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(index, config)
	#for title in TITLES:
	import time
	millis = int(round(time.time() * 1000))
	
	userid = str(millis)
	
	doc = Document()
	doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldString", field_string, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(doc)
	print(adId)
	# 3. close resources
	writer.close()
	index.close()	
	return ""
Example #3
            print 'url:', doc.get('url')
            print 'rate:', doc.get('rate')

    while True:
        print
        print "Hit enter with no input to quit."
        choice = raw_input('1-perfumer, 2-scents:')
        if choice == '':
            return
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if choice == '1':
            perfumer_search(command)
        if choice == '2':
            scent_search(command)
        if command == '':
            return
        print "Searching for:", command


if __name__ == '__main__':
    STORE_DIR = "index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher
Example #4
def main(index_dir, input_dir):
    """Creates a Lucene Index, and indexes every .json file it finds.
    It utilizes a stopwords.txt to filter out stop words"""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(1, True)

    for stopword in stopwords:
        temp.add(stopword)

    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % index_dir)

    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)

    logger.info("Currently there are %d documents in the index..." %
                writer.numDocs())

    # Index documents
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" %
                writer.numDocs())

    # Wrap it up
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())

    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url'),
                doc.get('title').strip().replace(',', '\,')
            ])
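The command-line wiring for main() is not shown above; a plausible entry point might look like the following sketch (argument names and defaults are assumptions).

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wrapper around main(index_dir, input_dir).
    parser = argparse.ArgumentParser(
        description='Index every .json file in a directory into a Lucene index')
    parser.add_argument('--index-dir', default='index',
                        help='directory where the Lucene index is written')
    parser.add_argument('--input-dir', default='data',
                        help='directory containing the .json files to index')
    args = parser.parse_args()

    main(args.index_dir, args.input_dir)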
Example #5
 def __init__(self, storeDir):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     print 'lucene', lucene.VERSION
     self.dir = SimpleFSDirectory(File(storeDir))
Example #6
def index_images_until_stop(session, handler, lbound):
    global _stop, _stopped, _vm

    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if lbound is not None:
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)
            except Exception as e:
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
Example #7
 def __init__(self, storeDir, analyzer):
     store = SimpleFSDirectory(Paths.get(storeDir))
     config = IndexWriterConfig(analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(store, config)
Example #8
def index(docDirPath="data", indexDirPath="index"):
    lucene.initVM()
    indexDir = SimpleFSDirectory(Paths.get(indexDirPath))
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(indexDir, writerConfig)
    files = os.listdir(docDirPath)
    for file in files:
        if not os.path.isdir(os.path.join(docDirPath, file)):
            f = open(docDirPath + "/" + file)
            iter_f = iter(f)
            for line in iter_f:
                document = Document()
                data = json.loads(line)
                document.add(
                    Field("user_id", data["user_id"], TextField.TYPE_STORED))
                document.add(Field("text", data["text"],
                                   TextField.TYPE_STORED))
                if data["urls"] != "None" and len(data["urls"]) != 0:
                    document.add(
                        Field("title", data["urls"]["title"],
                              TextField.TYPE_STORED))
                else:
                    document.add(Field("title", "None", TextField.TYPE_STORED))
                if data["hashtags"] != "None":
                    hashtags = ""
                    for tag in data['hashtags'].values():
                        hashtags += tag["text"] + ", "
                    document.add(
                        Field("hashtags", hashtags[0:-2],
                              TextField.TYPE_STORED))
                else:
                    document.add(
                        Field("hashtags", "None", TextField.TYPE_STORED))
                if data["user_mentions"] != "None":
                    user_mentions = ""
                    for user in data['user_mentions'].values():
                        user_mentions += user + ", "
                    document.add(
                        Field("user_mentions", user_mentions[0:-2],
                              TextField.TYPE_STORED))
                else:
                    document.add(
                        Field("user_mentions", "None", TextField.TYPE_STORED))
                if data["place"] != "None":
                    document.add(
                        Field("place", data["place"]["place_name"],
                              TextField.TYPE_STORED))
                    document.add(
                        Field("cords_x", str(data["place"]["1"]["x"]),
                              StoredField.TYPE))
                    document.add(
                        Field("cords_y", str(data["place"]["1"]["y"]),
                              StoredField.TYPE))
                else:
                    document.add(Field("place", "None", TextField.TYPE_STORED))
                    document.add(Field("cords_x", "None", StoredField.TYPE))
                    document.add(Field("cords_y", "None", StoredField.TYPE))
                document.add(Field("time", data["time"], StoredField.TYPE))
                text = data['tweet'].split(',')
                for i in range(len(text)):
                    if ("'lang':" in text[i]):
                        lang = text[i].split(': ')[1]
                        document.add(
                            Field("lang", lang[1:-1], StoredField.TYPE))
                        break
                index_writer.addDocument(document)
            f.close()
    index_writer.close()
class PyLucene:
    """
        PyLucene module api
    """
    
    def __init__(self, startJVM=False):
        if startJVM:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        
        self.STORE_DIR = "index_dir"
        self.store = SimpleFSDirectory(File(self.STORE_DIR)) 
        
        tmp_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) 
        self.analyzer = LimitTokenCountAnalyzer(tmp_analyzer, 10000) 
        
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.store, config)
    
    def close_store(self):
        self.store.close()

    def index_doc(self, doc_dict):
        """
          Index a doc to pylucene
          
          obs.: docid is a string not an integer
        """
        
        doc = Document()
        
        doc.add(Field("doc_id", doc_dict["doc_id"], TextField.TYPE_STORED))
        doc.add(Field("general_info", doc_dict["general_info"], TextField.TYPE_NOT_STORED))
        doc.add(Field("subject", doc_dict["subject"], TextField.TYPE_NOT_STORED))
        doc.add(Field("source", doc_dict["source"], TextField.TYPE_NOT_STORED))
        doc.add(Field("initial_date", doc_dict["initial_date"], TextField.TYPE_NOT_STORED))
        doc.add(Field("final_date", doc_dict["final_date"], TextField.TYPE_NOT_STORED))
        
        body_text = doc_dict["content"]
        body_reader = StringReader(body_text)
        doc.add(Field("content", body_reader))
        
        self.writer.addDocument(doc)
        
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        
        self.writer.commit()
        
        ticker.tick = False
        print 'done'
        
    def search_docs(self, value, field="general_info"):
        MAX_RESULTS = 1000
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        query = QueryParser(Version.LUCENE_CURRENT, field,
                            self.analyzer).parse(value)
        topDocs = searcher.search(query, MAX_RESULTS)
        
        return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
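A short usage sketch for the PyLucene wrapper above; the field values are placeholders and this calling code is not part of the original module.

if __name__ == '__main__':
    engine = PyLucene(startJVM=True)

    # index_doc() expects every one of these keys; the values here are invented.
    engine.index_doc({
        "doc_id": "doc-001",
        "general_info": "quarterly sales report",
        "subject": "sales",
        "source": "internal",
        "initial_date": "2020-01-01",
        "final_date": "2020-03-31",
        "content": "Full body text of the report goes here.",
    })

    for hit in engine.search_docs("sales", field="general_info"):
        print hit.get("doc_id")

    engine.writer.close()
    engine.close_store()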
Example #10
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True
        # Initialize lucene and JVM
        lucene.initVM()
        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    #Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date  (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        #Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        #Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')

        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())

        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product

        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)

        return vectorMagnitude

    ##################################################
    #Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document into the Lucene's Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()

        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()
        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        #Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList,
                      key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Generates a list of (entity, relation, entity) tuples as its output.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        #ProgressBar
        print("Analazing Document {0}".format(docIdx))

        pB = ProgressBar(len(sentences), prefix='Progress:')
        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()
        gpeList = geolocator.GetFeatureCollection(gpeList)

        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()

        return content
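A minimal usage sketch for the Indexer class above; the document dictionary mirrors the keys IndexDocs() reads, and every value is made up.

if __name__ == '__main__':
    # debug=True keeps the index itself in RAM (RAMDirectory).
    indexer = Indexer(debug=True, verbose=True)

    documents = [{
        'name': 'doc-0001',
        'content': 'Lucene is a full-text search library written in Java.',
        'date': '2020-01-15T10:30:00Z',   # must match '%Y-%m-%dT%H:%M:%SZ'
        'url': 'http://example.com/doc-0001',
        'tags': ['search', 'lucene'],
    }]

    indexer.IndexDocs(documents)
    indexer.Search('lucene', field=Indexer.CONTENT)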
Example #11
import sys
import lucene

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.index import DirectoryReader

if __name__ == "__main__":
    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    searcher = IndexSearcher(DirectoryReader.open(indexDir))

    query = QueryParser("text", analyzer).parse("certificate")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("id")
Example #12
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    # Log : Answer - Question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    item_docs = getDoc.search(
        question_ids, 20)[0:7]  # Trim to 7 here to keep at least the top 7 ranked questions.

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log : Similar Question count for each of Question ItemDoc
    i = 1
    if item_docs:
        for item_doc in item_docs:
            similar_question = question.more_like_this2(
                item_doc, 7)  # Find 7 similar questions for each question.
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += ('0' + '\t')
            similar_questions += similar_question
            i += 1
    else:
        hit_logs_for_each += ('0' + '\t' + '0' + '\t' + '0' + '\t' + '0' +
                              '\t' + '0' + '\t' + '0' + '\t' + '0' + '\t'
                              )  # 7 placeholder columns

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Similar Question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log : Question - Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer Docs count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_1'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    # Exceptional
    ### 7_Appending for the user query results
    bench_result, score_logs_for_each = benchsearcher.more_like_this2(
        100, answer_docs[0], score_logs_for_each, user_code_query, 1)
    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    bench_results += bench_result

    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query,
            0)  # , user_query=user_code_query)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += ('0' + '\t')
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if len(answer_docs) < 49:
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += ('0' + '\t')

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_results,
                                  key=attrgetter('score'),
                                  reverse=True)

    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
Example #13

def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name"), (
                'score: %f' % (scoreDoc.score))


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = EnglishAnalyzer()
    run(searcher, analyzer)
    del searcher
Example #14
File: _1.py Project: sjgyb/HW
#!/usr/bin/env python
import web
from web import form
import urllib2
import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause

STORE_DIR1 = "index1"
STORE_DIR2 = "index2"

vm_env =lucene.initVM(vmargs=['-Djava.awt.headless=true'])
#base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
directory1 = SimpleFSDirectory(File(STORE_DIR1))
searcher1 = IndexSearcher(DirectoryReader.open(directory1))
directory2= SimpleFSDirectory(File(STORE_DIR2))
searcher2 = IndexSearcher(DirectoryReader.open(directory2))
analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
total_list=[]
Example #15
    def GET(self, query):
        data_input = web.input()
        page = 0
        if "page" in data_input:
            page = int(data_input["page"])
        render = web.template.render('templates/')
        anses = []
        num_pages = 0
        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            # s = Search(es).index('book-index').doc_type('book').query(Q('match', title=query.strip()) | Q('match', description=query.strip()) | Q("match", userreviews_userReview=query.strip()))
            s = Search(using=es, index='book-index').doc_type('book').query(
                Q('match', title=query.strip())
                | Q('match', description=query.strip())
                | Q("match", userreviews_userReview=query.strip()))
            ## Note: the pagination slice must be applied before s.execute(); this is easy to miss in the documentation.
            s = s[page * 10:page * 10 + 10]
            response = s.execute()
            # print 'total number of hits: ', response.hits.total
            num_pages = (response.hits.total / 10) + 1
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                anses.append({
                    'title': res.title,
                    'description': res.description.encode('utf-8'),
                    'url': res.url,
                    'cover': res.cover,
                    'authors': authors
                })
        else:
            # importing libraries for Lucene
            import lucene
            from java.io import File
            from org.apache.lucene.index import DirectoryReader, Term
            from org.apache.lucene.queryparser.classic import QueryParser
            from org.apache.lucene.store import SimpleFSDirectory
            from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
            from org.apache.lucene.util import Version
            from org.apache.lucene.analysis.standard import StandardAnalyzer
            import os

            # fields
            title_field = 'title'
            description_field = 'description'
            cover_field = 'cover'
            authors_name_field = 'authors_name'
            authors_url_field = 'authors_url'
            url_field = 'url'

            index_folder = '.'
            index_name = 'lucene.index'
            index_path = os.path.join(index_folder, index_name)

            lucene.initVM()
            version = Version.LUCENE_CURRENT
            directory = SimpleFSDirectory(File(index_path))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = StandardAnalyzer(version)

            title_tq = TermQuery(Term(title_field, query))
            desc_tq = TermQuery(Term(description_field, query))
            query = BooleanQuery()
            query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
            query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
            scoreDocs = searcher.search(query, 1000).scoreDocs
            num_pages = (len(scoreDocs) / 10) + 1

            for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
                doc = searcher.doc(scoreDoc.doc)
                authors = zip([doc.get(authors_name_field)],
                              [doc.get(authors_url_field)])
                anses.append({
                    'title':
                    doc.get(title_field),
                    'description':
                    doc.get(description_field).encode('utf-8'),
                    'url':
                    doc.get(url_field),
                    'cover':
                    doc.get(cover_field),
                    'authors':
                    authors
                })

        return render.index(anses, query, num_pages)
Example #16
def func(user_access):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #

    tag = {}
    access = user_access.split()
    res = ''
    for i in access:
        b = i
        b = ''.join(b.split('/'))
        query = QueryParser(Version.LUCENE_CURRENT, "Tags", analyzer).parse(b)
        scoreDocs = searcher.search(query, 200).scoreDocs
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tags = doc.get("Tags")
            tag_list = tags.split()
            for j in tag_list:
                if j not in tag:
                    tag[j] = 1
                else:
                    tag[j] += 1
        tags_list = sorted(tag.items(), key=lambda item: item[1], reverse=True)
        for i in tags_list[:3]:
            command = i[0]
        if command == '':
            return
        command = ''.join(command.split('/'))
        query = QueryParser(Version.LUCENE_CURRENT, "Tags",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 200).scoreDocs
        tmp = {}
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            collect = doc.get("Likes")
            views = doc.get("Views")
            rate = float(collect) / float(views)
            tmp[doc.get("Page_num")] = rate
        res_list = sorted(tmp.items(), key=lambda item: item[1], reverse=True)
        count = 0
        for i in res_list:
            if i[0] not in res:
                res += i[0]
                res += ' '
                count += 1
            if count > 9:
                break
    tmp_list = res.split()
    res = ''
    for i in tmp_list:
        query = QueryParser(Version.LUCENE_CURRENT, "Page_num",
                            analyzer).parse(i)
        scoreDocs = searcher.search(query, 1).scoreDocs
        doc = searcher.doc(scoreDocs[0].doc)
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        res += ch
        res += ' '
    del searcher
    del analyzer
    return res
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(\
        SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return

            print "Searching for:", command
            query = QueryParser(Version.LUCENE_43, "contents",
                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print "%s total matching documents." % len(scoreDocs)

            # Highlight the matching text in red
            highlighter = Highlighter(SimpleHTMLFormatter('<b><font color\
            ="red">', '</font></b>'), QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.hits_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.frags_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)
Example #18
def get_writer(index_dir):
    indexDir = SimpleFSDirectory(File(index_dir).toPath())
    writerConfig = IndexWriterConfig()
    print(f"Codec : {writerConfig.getCodec()}")
    writer = IndexWriter(indexDir, writerConfig)
    return writer
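A brief sketch of how get_writer() above might be used to add one document; the imports and the "text" field name are illustrative, not part of the original snippet.

import lucene
from org.apache.lucene.document import Document, Field, TextField

lucene.initVM()
writer = get_writer("index")

doc = Document()
doc.add(TextField("text", "hello lucene", Field.Store.YES))
writer.addDocument(doc)

writer.commit()
writer.close()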
Example #19
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher
Example #20
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer

from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search.spell import SpellChecker
from org.apache.lucene.search.spell import LuceneDictionary
from org.apache.lucene.index import IndexReader
from org.apache.lucene.index import IndexWriterConfig

vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
directory = SimpleFSDirectory(File("store"))
searcher = IndexSearcher(DirectoryReader.open(directory))
# Create the spell-check index
spell_dic = SimpleFSDirectory(File("spellchecker"))
spellchecker = SpellChecker(spell_dic)
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
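The snippet above only opens the main index and the spell-check directory; below is a sketch of how the spell checker might be populated from an existing index field and then queried (the "contents" field name is an assumption).

# Fill the spell-check index from the terms of an (assumed) 'contents' field,
# then ask for suggestions for a possibly misspelled word.
reader = DirectoryReader.open(directory)
spellchecker.indexDictionary(LuceneDictionary(reader, "contents"),
                             IndexWriterConfig(Version.LUCENE_CURRENT, analyzer),
                             False)
for suggestion in spellchecker.suggestSimilar("lucen", 5):
    print suggestion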
Example #21
# -*- coding: utf-8 -*-
"""
Build the index.
"""
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    for n, l in enumerate(sys.stdin):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #22
for o, a in options:
    if o == "--format":
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'


template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print("Found %d document(s) (in %s) that matched query '%s':" %
          (len(scoreDocs), duration, query),
          file=sys.stderr)

for scoreDoc in scoreDocs:
Example #23
#!/usr/bin/python
import sys, os

sys.path.append("../lib/lucene-core-3.6.2.jar")
sys.path.append("../lib/lucene-core-3.6.2-javadoc.jar")

from java.io import File
from java.util import Scanner
from org.apache.lucene.index import IndexReader, Term
from org.apache.lucene.store import SimpleFSDirectory
import pdb

if __name__ == "__main__":

    r = IndexReader.open(SimpleFSDirectory(File('../index')))
    print "... total number of documents in the index is " + str(r.maxDoc())
    t = r.terms()
    i = 0
    count_add = 0
    while t.next():
        i = i + 1
        if i > 100010:
            break
        if i > 100000:
            print "[" + str(i) + "]" + t.term().text()

    te = Term("contents", "brute")
    print "... number of documents with the word brute is : " + str(
        r.docFreq(te))
    td = r.termDocs(te)
Example #24
 def __init__(self, indexDir: str):
     index_dir = SimpleFSDirectory(Paths.get(indexDir))
     self._searcher = IndexSearcher(DirectoryReader.open(index_dir))
Beispiel #25
0
    def openStore(self):

        return SimpleFSDirectory(Paths.get(self.STORE_DIR))
Beispiel #26
0
 def __init__(self, id: str, indexDir: str):
     index_dir = SimpleFSDirectory(Paths.get(indexDir))
     self._searcher = IndexSearcher(DirectoryReader.open(index_dir))
     self._id = id
     self._resDict = {}
     self._strDict = ''
Beispiel #27
0
import sys
import lucene
 
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

LUCENE_TYPES = {'i': IntField, 's': StringField, 't': TextField}

 
if __name__ == "__main__":
  lucene.initVM()
  indexDir = SimpleFSDirectory(File("data/lucene_full_v1/"))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)
 
  print "%d docs in index" % writer.numDocs()
  print "Reading lines from sys.stdin..."
  header=[]
  for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    #add one more field to header field set, which will index the concatenated set of all fields for general searches
    all_ = []
    if len(fields) < 1 or len(fields[0]) == 0:
        continue
    for (idx,field) in enumerate(fields):
        if n == 0:
Beispiel #28
0
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(
            SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print()
            print("Hit enter with no input to quit.")
            command = input("Query:")
            if command == '':
                return

            print("Searching for:", command)
            query = QueryParser(Version.LUCENE_43, "contents",
                                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print("%s total matching documents." % len(scoreDocs))

            # Highlight the matching text in red
            highlighter = Highlighter(
                SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
                QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.hits_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.frags_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)
Beispiel #29
0
                        default="tfidf",
                        help="Similarity (in [tfidf, lm, bm25])")
    parser.add_argument('--reorder',
                        type=str,
                        nargs='?',
                        default="no",
                        help="Reordering (in [ups, normups])")
    parser.add_argument('--short',
                        action='store_true',
                        help="Don't show the body of comments")
    args = parser.parse_args()

    if args.sim in ['bm25']:
        similarity = BM25Similarity()
    elif args.sim in ['lm']:
        similarity = LMDirichletSimilarity()
    else:
        similarity = ClassicSimilarity()
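    # ClassicSimilarity is Lucene's classic TF-IDF scoring (formerly DefaultSimilarity),
    # which is why it backs the default "tfidf" choice above.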

    # Sample query
    storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
    searcher = IndexSearcher(DirectoryReader.open(storeDir))
    if similarity is not None:
        searcher.setSimilarity(similarity)
    analyzer = StandardAnalyzer()
    run(searcher,
        analyzer,
        ndocs=args.ndocs,
        reordering=args.reorder,
        show_bodies=not args.short)
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms relative to the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number, not
                a specific item for which compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
def createSearcher(index_dir):
    reader = DirectoryReader.open(SimpleFSDirectory(File(index_dir).toPath()))
    searcher = IndexSearcher(reader)
    return searcher
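# Hedged usage sketch (the "text" field name and index path are assumptions):
#   searcher = createSearcher("index_dir")
#   query = QueryParser("text", StandardAnalyzer()).parse("hello world")
#   top_docs = searcher.search(query, 10)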
Beispiel #32
0
def update(collection_name,
           tofind,
           update,
           commit=False,
           add_field_if_not_exists=True):
    #As of now the update is implemented as: search, modify the data in the JSON document, delete, and re-write
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        #setting writer configurations
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105
    no_of_documents_modified = 0

    #finding the document to update
    #Scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        #primary_key_modified=False

        #delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map[collection_name]:
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

        #print query
        #modify the values
        for key, value in toupdate.items():
            #if such a key is not present, we either add/update that key in data, or just ignore it (by default add_field_if_not_exists is True, so it is added)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        #the deletion below is intentionally deferred to this point:
        #the update only continues if the modified data's primary keys do not already exist in the index
        primary_key_update = False
        for key in toupdate.keys():
            if key in primary_keys_map[INDEX_DIR]:
                primary_key_update = True
                break
        if primary_key_update == True:
            query_search = BooleanQuery()
            for key in primary_keys_map[INDEX_DIR]:
                temp = QueryParser(Version.LUCENE_CURRENT, key,
                                   analyzer).parse(data[key])
                query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
            hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
            if len(hits) > 0:
                return 106
        writer.deleteDocuments(query)

        #add the newly modified document
        doc = Document()
        #index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO,
                              Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        #compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            temp = json.dumps(data)
            data_string = base64.b64encode(snappy.compress(temp))
        else:
            temp = json.dumps(data)
            data_string = base64.b64encode(temp)

        field = Field("$DATA$", data_string, Field.Store.YES,
                      Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs

        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106

    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106

    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
Beispiel #33
0
def store(collection_name, data, commit=False):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    print "started indexing input data......"

    #extracting values
    try:
        contents = json.loads(data)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map[INDEX_DIR]:
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass

    #setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    #fix this later.....FieldType not defined
    #field_type=FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)
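    #(note: FieldType lives in org.apache.lucene.document, i.e.
    # "from org.apache.lucene.document import FieldType"; with that import the
    # commented-out sketch above should work as written)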

    try:
        doc = Document()
        #index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, contents[primary_key],
                              Field.Store.NO, Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        #compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            # print "here"
            #data=data.encode('utf-8')
            data = base64.b64encode(snappy.compress(data))
            # print data
        else:
            data = base64.b64encode(data)

        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 000
    except:
        return 102
Beispiel #34
0
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    #initializing return list
    return_list = []
    #check_list=[]
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()

    if len(return_list) == 0:
        return None
    else:
        return return_list
        html_clean = html_clean + ' ' + token.encode('ascii', 'ignore')
    return html_clean

def clean_countries_dict(dictionary_object):
    cleaned = []
    for country in dictionary_object:
        country_cleaned = [country[0],clean_html(country[1]),country[2],clean_html(country[3])]
        cleaned.append(country_cleaned)
    return cleaned

cleaned_dictionary = clean_countries_dict(merge_country_city_text(countries_dict))
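# Note (inferred from the field names used in create_index() below): each entry of
# cleaned_dictionary is assumed to be [country, country_html, capital, capital_html].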

# creating the index
index_path = File(sys.argv[1])
analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
index = SimpleFSDirectory(index_path)

# populating the index
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(index, config)

def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

create_index()
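# The IndexWriter is presumably committed/closed later in the original script; if
# indexing ends here, a closing call such as the following would be needed:
# writer.close()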
Beispiel #36
0
    using the gutenberg corpus that comes with NLTK package
"""
# In[3]:

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
import sys 
from java.io import File
import re

lucene.initVM()
index_dir = SimpleFSDirectory(File("index/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(index_dir, writerConfig)


# In[56]:

pattern = r"\[([A-Za-z0-9_].*)\]"


# In[26]:

f = open('gutenberg/austen-emma.txt')


# In[76]: