def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        #self.analyzer = StandardAnalyzer()
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searcher = IndexSearcher(self.reader)
        self.dict_term_freq = {}
        if similarity == 'BM25':
            self.searcher.setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator_char = '/' if '/' in self.index_dir else '\\'
            index_name = self.index_dir.split(separator_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
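
# A minimal standalone sketch (not part of the original class) showing the same
# PyLucene calls the constructor above wires together; the index path, field name
# and query string below are placeholder assumptions.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.core import SimpleAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import MMapDirectory

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
reader = DirectoryReader.open(MMapDirectory(Paths.get('mmapDirectory/example_index')))
searcher = IndexSearcher(reader)
searcher.setSimilarity(BM25Similarity())  # BM25, as selected by similarity='BM25'
query = QueryParser('catchall', SimpleAnalyzer()).parse('information retrieval')
for score_doc in searcher.search(query, 10).scoreDocs:
    print(score_doc.doc, score_doc.score)
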
# Example #2
    def build(self, index):

        writer = self.getWriter(directory=index.index,
                                analyzer=SimpleAnalyzer(
                                    Version.LUCENE_CURRENT))

        seed(101)
        for d in range(self.minId, self.maxId + 1):
            doc = Document()
            doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
            if index.allowNegativeRandomInts:
                r = randint(~self.MAX_INT, self.MAX_INT)
            else:
                r = randint(0, self.MAX_INT)

            if index.maxR < r:
                index.maxR = r

            if r < index.minR:
                index.minR = r

            doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
            doc.add(Field("body", "body", StringField.TYPE_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
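
# Sketch (not from the original test): because the values are stored zero-padded,
# numeric ranges over the "rand" field can be expressed as string ranges. The pad()
# helper here is a hypothetical stand-in, valid for non-negative values only.
from org.apache.lucene.search import TermRangeQuery

def pad(n, width=10):
    return str(n).zfill(width)

def docs_in_range(searcher, low, high):
    # Lexicographic order of padded strings matches numeric order.
    query = TermRangeQuery.newStringRange("rand", pad(low), pad(high), True, True)
    return searcher.search(query, 1000).scoreDocs
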
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            self.searchers[0].setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator_char = '/' if '/' in self.index_dir else '\\'
            index_name = self.index_dir.split(separator_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index

    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = 'robocopy %s %s\\code_files *.py' % (
            r'%cd%', LUCENE_INDEX_DIR
        ) if system_flag == 'Windows' else 'cp -f *.py %s/code_files' % (
            LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
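
# makeIndex() is not shown above; a hypothetical sketch of what such a helper
# might add per document (field names and sample records are assumptions).
from org.apache.lucene.document import Document, Field, StringField, TextField

def makeIndex(writer):
    records = [{'id': 'doc-1', 'body': 'hello lucene'},
               {'id': 'doc-2', 'body': 'bm25 ranking example'}]
    for rec in records:
        doc = Document()
        doc.add(StringField('id', rec['id'], Field.Store.YES))    # exact-match key
        doc.add(TextField('body', rec['body'], Field.Store.YES))  # analyzed full text
        writer.addDocument(doc)
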
# Example #5
    def testNot(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer())

        d1 = Document()
        d1.add(Field("field", "a b", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        query = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")

        topDocs = searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)
# Example #6
def running(command):
    command = str(command)
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    return run(searcher, analyzer, command)
 def __init__(self, store_dir):
     initVM()
     directory = SimpleFSDirectory(File(store_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     print('loaded index: %s' % store_dir)
     self.analyzer = {}
     self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)
def vagueSearch(command, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run vague search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    results, swxc_res = run(searcher, analyzer, command, urlclick)
    del searcher
    return results, swxc_res
# Example #9
def get_search_func():
    jieba.initialize()
    vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File(LUCENE_INDEX_DIR))))

    search = search_func_factory(analyzer=analyzer,
                                 searcher=searcher,
                                 vm_env=vm_env)

    return search
# Example #10
    def setUp(self):
        super(TestRegexQuery, self).setUp()

        writer = self.getWriter(analyzer=SimpleAnalyzer(self.TEST_VERSION))
        doc = Document()
        doc.add(
            Field(self.FN, "the quick brown fox jumps over the lazy dog",
                  TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
        writer.commit()
        writer.close()
        self.searcher = self.getSearcher()
 def run(self):
     print('lucene', lucene.VERSION)
     start = datetime.now()
     try:
         IndexFiles(
             xmlpath=self.xmlpath,
             storeDir=self.indexpath,
             analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
             ItemClass=self.ItemClass)
         end = datetime.now()
         print(end - start)
     except Exception as e:
         print("Failed: ", e)
# Example #12
    def testSimple(self):

        a = SimpleAnalyzer()
        self._assertAnalyzesTo(a, "foo bar FOO BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "U.S.A.", ["u", "s", "a"])
        self._assertAnalyzesTo(a, "C++", ["c"])
        self._assertAnalyzesTo(a, "B2B", ["b", "b"])
        self._assertAnalyzesTo(a, "2B", ["b"])
        self._assertAnalyzesTo(a, "\"QUOTED\" word", ["quoted", "word"])
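
# The assertions above can be reproduced by hand with the TokenStream API; this
# standalone sketch shows why SimpleAnalyzer (letters only, lowercased) turns
# "C++" into ["c"] and "B2B" into ["b", "b"].
from java.io import StringReader
from org.apache.lucene.analysis.core import SimpleAnalyzer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokens(analyzer, text):
    stream = analyzer.tokenStream("field", StringReader(text))
    term = stream.getAttribute(CharTermAttribute.class_)
    stream.reset()
    out = []
    while stream.incrementToken():
        out.append(term.toString())
    stream.end()
    stream.close()
    return out

print(tokens(SimpleAnalyzer(), "C++"))   # ['c']
print(tokens(SimpleAnalyzer(), "B2B"))   # ['b', 'b']
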
# Example #13
def _index_files(storeDir, indexFile):
    jieba.initialize()

    store = SimpleFSDirectory(File(storeDir))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    writer = IndexWriter(store, config)

    _index_docs(indexFile, writer)

    print('commit index')
    writer.commit()
    writer.close()
    print('done')
# Example #14
    def testDocBoost(self):

        writer = self.getWriter(
            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

        f1 = Field("field", "word", TextField.TYPE_STORED)
        f2 = Field("field", "word", TextField.TYPE_STORED)
        f2.setBoost(2.0)

        d1 = Document()
        d2 = Document()

        d1.add(f1)  # boost = 1
        d2.add(f2)  # boost = 2

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.close()

        scores = [0.0] * 2

        class collector(PythonCollector):
            def __init__(_self, scores):
                super(collector, _self).__init__()
                _self.scores = scores
                _self.base = 0

            def collect(_self, doc, score):
                _self.scores[doc + _self.base] = score

            def setNextReader(_self, context):
                _self.base = context.docBase

            def acceptsDocsOutOfOrder(_self):
                return True

        self.getSearcher().search(TermQuery(Term("field", "word")),
                                  collector(scores))

        lastScore = 0.0
        for score in scores:
            self.assert_(score > lastScore)
            lastScore = score
    def __init__(self, root, storeDir):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
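
# indexDocs() is not shown in this snippet; a hypothetical sketch of the kind of
# directory walk it typically performs (field names here are assumptions).
import os
from org.apache.lucene.document import Document, Field, StringField, TextField

def indexDocs(root, writer):
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            try:
                with open(path, encoding='utf-8', errors='ignore') as f:
                    contents = f.read()
            except OSError:
                continue
            doc = Document()
            doc.add(StringField('name', filename, Field.Store.YES))
            doc.add(StringField('path', path, Field.Store.YES))
            if contents:
                doc.add(TextField('contents', contents, Field.Store.NO))
            writer.addDocument(doc)
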
# Example #16
    def testPerField(self):

        perField = HashMap()
        perField.put("special", SimpleAnalyzer())
        analyzer = PerFieldAnalyzerWrapper(WhitespaceAnalyzer(), perField)

        text = "Qwerty"
        tokenStream = analyzer.tokenStream("field", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("Qwerty", termAtt.toString(),
                         "WhitespaceAnalyzer does not lowercase")

        tokenStream = analyzer.tokenStream("special", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)
        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("qwerty", termAtt.toString(),
                         "SimpleAnalyzer lowercases")
# Example #17
    def __init__(self, root, storeDir, f):
        self.filedir = f

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        # analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print('commit index', end=' ')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
# Example #18
def render_result(request, template, result, Search, index):
    '''
    Render the 'result' page for website and image search.

    Input: `request`: the incoming request object
           `template`: template HTML file
           `result`: relative path of the 'result' page
           `Search`: search class used to search the index
           `index`: directory storing the Lucene index
    '''
    if request.method == "POST":
        keyword = request.form['keyword']
        return redirect(url_for(result, keyword=keyword))

    vm_env.attachCurrentThread()
    engine = Search(index, SimpleAnalyzer(), lambda x: ' '.join(jieba.cut(x)))
    keyword = request.args.get('keyword')

    command = {"type": result, "keyword": keyword}
    if command not in search_history:
        search_history.append(command)

    results = engine.search_command(keyword)
    return render_template(template, keyword=keyword, results=results)
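
# Hypothetical Flask wiring for render_result (the app object, route, template
# name and index path below are assumptions, not taken from the original project).
from flask import Flask, request

app = Flask(__name__)

@app.route('/result', methods=['GET', 'POST'])
def result():
    # The endpoint name 'result' matches the `result` argument, so the
    # POST -> redirect(url_for(...)) round trip above resolves back to this view.
    return render_result(request, 'result.html', 'result', SearchFiles, 'index')
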
    def search(self, command, num, use_clf):
        print("log1", command, num, use_clf)
        self.vm.attachCurrentThread()
        searcher = self.searcher

        print("command", command)

        if (not self.reT.search(command)):
            if (use_clf):
                print("sentence feed to classify", command)
                probs = self.classifier.classify(command)
                command = self.text.seg(command)
                command = self.text.remove_stop_word(command)
                # command = self.text.replace_white_space_with_dash(command)
                key = sorted(range(len(self.keys)),
                             key=lambda i: probs[i],
                             reverse=True)
                key_use = []
                key_use.append(key[0])
                for i in key[1:]:
                    if probs[i] > 0.3 or probs[i] - probs[key[0]] > -0.1:
                        key_use.append(i)

                command_final = self.keys[key_use[0]] + ":(" + command + ")"
                for i in key_use[1:]:
                    command_final = "%s OR %s:(%s)" % (command_final,
                                                       self.keys[i], command)
                command = command_final

                # command = "Title:\"2016 吉 07 民终 491号 包颜峰诉\""
                # command = "PubDate:\"2016 11 24\""
                # command = "WBSB:浙江省 WBSB:苍南县 WBSB:人民法院"
                print(command)
                # command = "Title:陕西省-高级-人民法院 Pubdate:陕西省-高级-人民法院"
                query = QueryParser("PubDate",
                                    WhitespaceAnalyzer()).parse(command)
                # parser =  MultiFieldQueryParser(['WBSB'], self.analyzer)
                # parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
                # query =parser.parse(QueryParserBase,command)

                # P = QueryParser('Pubdate', CJKAnalyzer())
                # query = MultiFieldQueryParser(['WBSB','Pubdate'],CJKAnalyzer()).parse(P,command)
                #
                #
                # # query = MultiFieldQueryParser(['WBSB',"title"], CJKAnalyzer()).getMultiFieldQuery(q)
                # # p = QueryParser('Title', CJKAnalyzer()).parse("你好 中国 你好 北京")
                # print(query)

                # fields = []
                # # fields = ["filename", "contents", "description"]
                #
                # for i in key_use:
                #     fields.append(self.keys[i])
                # flags = [BooleanClause.Occur.SHOULD]*len(fields)
                #
                # query=MultiFieldQueryParser.parse(command, fields, flags, WhitespaceAnalyzer())
                #
                print(query)

                scoreDocs = searcher.search(query, num).scoreDocs

                results = []

                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    result = dict()
                    for i in self.keys:
                        result[i] = doc.get(i)
                    result['id'] = doc.get('id')
                    results.append(result)
                probs_tmp = ""
                for key, prob in zip(self.keys, probs):
                    probs_tmp += "%s:%2f " % (key, prob)
                probs = probs_tmp
                key_use_tmp = ""
                for i in key_use:
                    key_use_tmp += "%s " % (self.keys[i])
                key_use = key_use_tmp
                return results, probs, key_use

            else:
                command = self.text.seg(command)
                command = self.text.remove_stop_word(command)
                fields = self.keys
                flags = [BooleanClause.Occur.SHOULD] * len(fields)

                query = MultiFieldQueryParser.parse(command, fields, flags,
                                                    WhitespaceAnalyzer())

                # command_final = "Title:"+command
                # for i in self.keys[1:]:
                #     command_final = "%s OR %s:%s"% (command_final,i,command)
                # command=command_final
                # print("矣")
                # print(command)
                # query = QueryParser("Title", self.analyzer).parse(command)

                fields = self.keys
                flags = [BooleanClause.Occur.SHOULD] * len(fields)

                query = MultiFieldQueryParser.parse(command, fields, flags,
                                                    WhitespaceAnalyzer())
                print(query)
                scoreDocs = searcher.search(query, num).scoreDocs

                results = []

                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    result = dict()
                    for i in self.keys:
                        result[i] = doc.get(i)
                    result['id'] = doc.get('id')
                    results.append(result)
                return results, [None] * len(self.keys), self.keys
        else:
            print('command', command)
            ps = self.reT.findall(command)
            print(ps)
            print(type(command))
            rem = self.reT.sub(' ', command)  # drop the matched patterns, keeping the free-text remainder
            print(ps)
            print(rem)
            q_t = []
            key_use = []
            for i in ps:

                f = i[1]
                data = i[4]
                rela = i[5]

                key_use.append(f)

                q_t.append(f)
                q_t.append(':')
                seg_t = self.text.seg(data)
                seg_t = self.text.remove_stop_word(seg_t)
                dash_t = self.text.replace_white_space_with_dash(seg_t)
                q_t.append(dash_t)
                if (rela):
                    q_t.append(" %s " % rela)
                print('tract pattern', q_t)
            q_f = "".join(q_t)
            print("final q", q_f)
            query = QueryParser("PubDate", SimpleAnalyzer()).parse(q_f)
            print("query", query)
            scoreDocs = searcher.search(query, num).scoreDocs

            results = []

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)
            return results, [None] * len(key_use), key_use
# Example #20
        '''
        Generate a `Document` according to the parameters.

        Input: `img`: dict containing a single image info
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
        doc.add(TextField("description", img['description'], Field.Store.YES))
        doc.add(StringField("url", img['url'], Field.Store.YES))
        doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
        return doc


if __name__ == '__main__':

    #html_dir = sys.argv[1]
    store_dir = 'index'  #sys.argv[2]

    lucene.initVM()
    print('lucene {}'.format(lucene.VERSION))

    start = datetime.now()
    try:
        # ExtractImgs(html_dir)
        IndexImgs(store_dir, SimpleAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: {}".format(e))
        raise e
# Example #21
        doc.add(TextField("title", title, Field.Store.YES))
        doc.add(TextField("url", url, Field.Store.YES))
        if len(contents) > 0:
            # doc.add(Field("contents", contents, self.content_type))
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(filename))
        return doc


if __name__ == '__main__':

    html_dir = sys.argv[1]
    doc_dir = sys.argv[2]
    store_dir = sys.argv[3]

    lucene.initVM()
    print('lucene {}'.format(lucene.VERSION))

    start = datetime.now()
    try:
        # IndexFiles('test_folder', 'index', StandardAnalyzer())

        ConvertFiles(html_dir, doc_dir)
        # Use `SimpleAnalyzer` as `Analyzer`
        IndexFiles(html_dir, doc_dir, store_dir, SimpleAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: {}".format(e))
        raise e
# Example #22
        doc.add(StringField("name", doc_info['name'], Field.Store.YES))
        doc.add(StringField("path", doc_info['path'], Field.Store.YES))
        doc.add(StringField("title", doc_info['title'], Field.Store.YES))
        doc.add(StringField("url", doc_info['url'], Field.Store.YES))
        doc.add(TextField("site", doc_info['site'], Field.Store.YES))
        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(doc_info['name']))
        return doc


if __name__ == '__main__':

    doc_dir = sys.argv[1]
    store_dir = sys.argv[2]

    lucene.initVM()
    print('lucene {}'.format(lucene.VERSION))

    start = datetime.now()
    try:
        # fn = 'pg17565.txt'
        # IndexUpdate('testfolder', 'index', StandardAnalyzer())

        IndexUpdate(doc_dir, store_dir, SimpleAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: {}".format(e))
        raise e
# Example #23
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query from the contents that the user liked. The terms of those contents are
        boosted by the rating the user gave. A filter clause is added to the query so that only
        candidate items are considered.
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number,
            not a specific item for which to compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
    def testSimilarity(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer(
            Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())

        d1 = Document()
        d1.add(Field("field", "a c", TextField.TYPE_STORED))

        d2 = Document()
        d2.add(Field("field", "a c b", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        class collector1(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(TermQuery(b), collector1())

        builder = BooleanQuery.Builder()
        builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)
        bq = builder.build()

        class collector2(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

            def doSetNextReader(_self, context):
                _self.base = context.docBase

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(bq, collector2())

        pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()])

        class collector3(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(pq, collector3())

        pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()])

        class collector4(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(0.5, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(pq, collector4())
            querys.add(query, BooleanClause.Occur.MUST)
        return self.searcher.search(querys.build(), 50).scoreDocs

    def output(self, score_docs):
        '''
        Output the search results in terminal.

        Input: `score_docs`: search results
        Output: None
        '''
        print("{} total matching documents.".format(len(score_docs)))
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            print('path: {}, title: {}, url: {}, name: {}'.format(
                doc.get('path'), doc.get('title'), doc.get('url'),
                doc.get('name')))
        print()


if __name__ == '__main__':

    index_dir = sys.argv[1]

    lucene.initVM()
    print('lucene', lucene.VERSION)

    # SearchFiles('index', StandardAnalyzer())

    # Pass the Jieba function as a parameter for generalized preprocessing
    SearchFiles(index_dir, SimpleAnalyzer(), lambda x: ' '.join(jieba.cut(x)))
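
# Sketch of the Jieba preprocessing step passed above: segment a Chinese query into
# space-separated terms so SimpleAnalyzer can tokenize it (the sample query is illustrative).
import jieba

def preprocess(text):
    return ' '.join(jieba.cut(text))

print(preprocess('全文检索引擎'))
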
# Example #26
 def __init__(self, folder='gushiwen_index'):
     self.searcher = IndexSearcher(
         DirectoryReader.open(SimpleFSDirectory(File(folder))))
     self.analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
# Example #27
        split_result = line.split()
        if len(split_result) <= 1:
            dic[split_result[0]] = "no name"
        dic[split_result[1]] = split_result[0]
    myfile.close()
    return dic"""


if __name__ == '__main__':
    """
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start = datetime.now()
    try:
        """
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(sys.argv[1], os.path.join(base_dir, INDEX_DIR),
                   StandardAnalyzer(Version.LUCENE_CURRENT))
                   """
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        IndexFiles("Music", INDEX_DIR, analyzer)
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: ", e)
        raise e
    def testSimple(self):
        writer = self.getWriter(analyzer=SimpleAnalyzer())

        doc = Document()
        field = Field("foo", "", TextField.TYPE_NOT_STORED)
        doc.add(field)

        dvField = FloatDocValuesField("foo_boost", 0.0)
        doc.add(dvField)

        field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
        doc.add(field2)

        field.setStringValue("quick brown fox")
        field2.setStringValue("quick brown fox")
        dvField.setFloatValue(2.0)  # boost x2
        writer.addDocument(doc)

        field.setStringValue("jumps over lazy brown dog")
        field2.setStringValue("jumps over lazy brown dog")
        dvField.setFloatValue(4.0)  # boost x4
        writer.addDocument(doc)

        reader = writer.getReader()
        writer.close()

        # no boosting
        searcher1 = self.getSearcher(reader=reader)
        base = searcher1.getSimilarity(True)

        # boosting
        searcher2 = self.getSearcher(reader=reader)

        class _similarity(PythonPerFieldSimilarityWrapper):
            def __init__(_self, base):
                super(_similarity, _self).__init__()
                _self.base = base
                _self.fooSim = BoostingSimilarity(base, "foo_boost")

            def get(_self, field):
                return _self.fooSim if "foo" == field else _self.base

        searcher2.setSimilarity(_similarity(base))

        # in this case, we searched on field "foo". first document should have
        # 2x the score.
        tq = TermQuery(Term("foo", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)

        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(boost.scoreDocs[0].score,
                               noboost.scoreDocs[0].score * 2.0,
                               delta=SCORE_EPSILON)

        # this query matches only the second document, which should have 4x
        # the score.
        tq = TermQuery(Term("foo", "jumps"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(boost.scoreDocs[0].score,
                               noboost.scoreDocs[0].score * 4.0,
                               delta=SCORE_EPSILON)

        # search on field bar just for kicks, nothing should happen, since
        # we setup our sim provider to only use foo_boost for field foo.
        tq = TermQuery(Term("bar", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(boost.scoreDocs[0].score, noboost.scoreDocs[0].score,
                               delta=SCORE_EPSILON)

        reader.close()
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)

    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}

    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (
        NUMBER_TOP_CATEGORY)
    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    cnt = 0
    if os.path.exists(pkl_filename):
        #if False==True:
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:

        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            if doc_entity is None:
                continue

            for f in [
                    'names', 'attributes', 'categories', 'similar_entities',
                    'related_entities', 'catchall'
            ]:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' +
                                                                 f]

            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat +
                        '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

            #cnt+=1
            #if cnt>20:
            #break

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)
    client.close()

    # begin write the data into index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str(
            NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (
            cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3'

    # backup code files
    cmd = 'robocopy %s %s\\code_files *.py' % (
        r'%cd%', LUCENE_INDEX_DIR
    ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % (
        LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)

    cnt = 0
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer,
                                              True), 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')

        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]

        for f in [
                'names', 'attributes', 'categories', 'similar_entities',
                'related_entities', 'catchall'
        ]:
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        #print ('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)

        #cnt+=1
        #if cnt>20:
        #break

    w.close()
    print('max article num=%d' % (max_article_num))
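
# addDoc() is not defined in this snippet; a hypothetical sketch of how the
# (value, type-label) pairs built above might map onto concrete Lucene field types.
from org.apache.lucene.document import Document, Field, StoredField, StringField, TextField

def addDoc(writer, data):
    doc = Document()
    for name, (value, field_type) in data.items():
        if field_type == 'StringField':
            doc.add(StringField(name, value, Field.Store.YES))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(name, int(value)))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(TextField(name, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(TextField(name, value, Field.Store.NO))
    writer.addDocument(doc)
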
# Example #30
    def testSimilarity(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer(
            Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())

        d1 = Document()
        d1.add(Field("field", "a c", TextField.TYPE_STORED))

        d2 = Document()
        d2.add(Field("field", "a b c", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        class collector1(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(TermQuery(b), collector1())

        bq = BooleanQuery()
        bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

        class collector2(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

            def setNextReader(_self, context):
                _self.base = context.docBase

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(bq, collector2())

        pq = PhraseQuery()
        pq.add(a)
        pq.add(c)

        class collector3(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector3())

        pq.setSlop(2)

        class collector4(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(2.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector4())