def retriever(file_dir):
    """Retrieve the nearest training example for every test AST query.

    For each line of test/test.ast.src, searches the Lucene index built over
    the training code and writes the best-matching training source line to
    test/test.ref.src.0 and its summary to output/ast.out. Exits with -1 if
    any query produces no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)
    with open(file_dir + "/train/train.spl.src", 'r') as fso, \
            open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir + "/test/test.ast.src") as ft, \
            open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        # Strip non-word characters and Lucene boolean keywords so the
        # query parser does not treat them as operators.
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                # BUGFIX: the stored "id" is a plain integer string —
                # parse it with int() instead of eval(), which executes
                # arbitrary index content.
                _id = int(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
def func1(command):
    """Search the "contents" field of the text index for *command*.

    The query is word-segmented with jieba before parsing. Returns up to
    50 hits as dicts with keys 'title', 'url' and "sentence"; an empty
    command yields an empty list.
    """
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    command = " ".join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    result = []
    for scoreDoc in searcher.search(query, 50).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        result.append({
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence"),
        })
    del searcher
    return result
def func2(command):
    """Search the recipe index ("index1") by main ingredient.

    Parses *command* against the "zhuliao" field, collects up to 9 hits and
    returns them as tuples (name, collect_num, zhuliao-list, zuofa-steps,
    img_url, url) sorted by collect_num descending. Returns None for an
    empty command.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        # BUGFIX: was a bare `except: pass`, which hid every error.
        # doc.get() returns None for a missing field, so only the
        # resulting .split() AttributeError should skip the record.
        except AttributeError:
            pass
    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    # NOTE: dropped the redundant Python-2-only `cmp=None` argument;
    # key/reverse sorting behaviour is unchanged.
    return sorted(res1, key=lambda x: x[1], reverse=True)
def search_img(output):
    """Look up the WebPageIndex record for every image number in *output*.

    Returns a list whose first element is an empty list (placeholder kept
    for callers), followed by one [album, subalbum, singer, url, reviews,
    imgurl, imgnum] row per matched image number.
    """
    STORE_DIR = "WebPageIndex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    results = [[]]
    fields = ("album", "subalbum", "singer", "url",
              "reviews", "imgurl", "imgnum")
    for num in output:
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                            analyzer).parse(str(num))
        for scoreDoc in searcher.search(query, 1).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.append([doc.get(name) for name in fields])
    del searcher
    return results
def run_img(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index2" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) querys = BooleanQuery() query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent", analyzer).parse(command) query_title = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command) querys.add(query_content, BooleanClause.Occur.SHOULD) querys.add(query_title, BooleanClause.Occur.SHOULD) scoreDocs = searcher.search(querys, 50).scoreDocs if len(scoreDocs) == 0: print "WARNING: No result" result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print doc.get("title") data = {} data['title'] = doc.get('title') data['url'] = doc.get('url') data['imgurl'] = doc.get('imgurl') result.append(data) return result
def run(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index1" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(analysis(command)) HighlightFormatter = SimpleHTMLFormatter() highlighter = Highlighter(HighlightFormatter, QueryScorer(query)) scoreDocs = searcher.search(query, 500).scoreDocs print "%s total matching documents." % len(scoreDocs) result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get( "name"), 'url:', doc.get("url"), 'title:', doc.get("title") text = doc.get('contents') highLightText = highlighter.getBestFragment(analyzer, "contents", text) if highLightText != None: highLightText = ''.join(highLightText.split(' ')) data = {} data['url'] = doc.get("url") data['title'] = doc.get('title') data['highlight'] = highLightText result.append(data) return result
def getWriter(self, directory=None, analyzer=None, open_mode=None,
              similarity=None, maxBufferedDocs=None, mergePolicy=None):
    """Build an IndexWriter for tests, defaulting every unset option.

    Defaults: a 10k-token-limited WhitespaceAnalyzer, OpenMode.CREATE,
    and the fixture's own directory. Similarity, maxBufferedDocs and
    mergePolicy are applied only when explicitly given.
    """
    if analyzer is None:
        analyzer = LimitTokenCountAnalyzer(
            WhitespaceAnalyzer(self.TEST_VERSION), 10000)
    config = self.getConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE
                       if open_mode is None else open_mode)
    if similarity is not None:
        config.setSimilarity(similarity)
    if maxBufferedDocs is not None:
        config.setMaxBufferedDocs(maxBufferedDocs)
    if mergePolicy is not None:
        config.setMergePolicy(mergePolicy)
    target = self.directory if directory is None else directory
    return IndexWriter(target, config)
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def testDelete(self, fieldName, searchString):
    """Delete every document whose *fieldName* term equals *searchString*."""
    base = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(base, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)
    writer.deleteDocuments(Term(fieldName, searchString))
    writer.close()
def createIndexWriter(indexDir):
    """Open *indexDir* (creating it if necessary) and return a CREATE-mode
    IndexWriter over it using a WhitespaceAnalyzer."""
    # BUGFIX: the original `if not exists: os.mkdir` was racy and failed
    # for nested paths; makedirs with exist_ok handles both.
    os.makedirs(indexDir, exist_ok=True)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
def createAnalyzer(self):
    """Instantiate the analyzer described by self._analyzer['type'].

    Raises Exception for any unrecognised type string.
    """
    analyzer_type = self._analyzer['type']
    if analyzer_type == "MerescoStandardAnalyzer":
        return MerescoStandardAnalyzer()
    if analyzer_type == "MerescoDutchStemmingAnalyzer":
        return MerescoDutchStemmingAnalyzer(self._analyzer['stemmingFields'])
    if analyzer_type == "WhitespaceAnalyzer":
        return WhitespaceAnalyzer()
    raise Exception("No support for type " + str(self._analyzer))
def search_dianping(province, kind, query):
    """Find the highest-ranked food/foodshop entry for a province.

    Builds a per-field MUST query from `kind:query province:province`,
    searches the "index" store and returns a dict describing the
    top-ranked shop. Returns None when *kind* is unsupported or *query*
    is empty.
    """
    STORE_DIR = "index"
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    if kind not in ['food', 'foodshop']:
        return None
    if query == '':
        return None

    command = '%s:%s province:%s' % (kind, query, province)
    command = unicode(command, 'utf8', 'ignore')
    command_dict = parseCommand(command)

    querys = BooleanQuery()
    for field_name, value in command_dict.iteritems():
        sub_query = QueryParser(Version.LUCENE_CURRENT, field_name,
                                analyzer).parse(value)
        querys.add(sub_query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs

    # First pass: find the maximum rank among the hits.
    max_rank = 0
    best_shop = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank > max_rank:
            max_rank = cur_rank
            best_shop = cur_shop

    # Second pass: copy the stored fields of the top-ranked hit(s); when
    # several hits tie, the last one wins, exactly as before.
    result = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank == max_rank:
            result['name'] = cur_shop.encode('utf8', 'ignore')
            result['rank'] = doc.get('rank').encode('utf8', 'ignore')
            result['food'] = doc.get('food').encode('utf8', 'ignore')
            result['location'] = doc.get('location').encode('utf8', 'ignore')
            result['tel'] = doc.get('tel').encode('utf8', 'ignore')
            for score_key in ('environment_score', 'flavour_score',
                              'service_score', 'price_level'):
                result[score_key] = doc.get(score_key).encode(
                    'utf8', 'ignore')
    del searcher
    return result
def main(): STORE_DIR = "index" lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) run(searcher, analyzer) del searcher
def getWriter(store, analyzer=None, create=False):
    """Return an IndexWriter over *store*.

    analyzer defaults to a WhitespaceAnalyzer and is always wrapped in a
    LimitTokenCountAnalyzer capped at 10,000,000 tokens. When *create*
    is true the index is opened in CREATE mode, truncating any existing
    index.
    """
    if analyzer is None:
        analyzer = WhitespaceAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000000)
    config = IndexWriterConfig(analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # BUGFIX: removed the leftover debug `print(store, config)` that
    # polluted stdout on every call.
    writer = IndexWriter(store, config)
    return writer
def func_img(command):
    """Attach to the JVM, search the "index" store for *command* via
    work() and return its (match_count, result) pair."""
    lucene.getVMEnv().attachCurrentThread()
    STORE_DIR = "index"
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    match_count, result = work(searcher, analyzer, command)
    del searcher
    return match_count, result
def getLucene(path):
    """Open *path* as an FSDirectory index sorted by the numeric stamp
    field and return a (writer, reader, searcher) triple over it."""
    directory = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    stamp_sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
    config.setIndexSort(stamp_sort)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over *store*.

    analyzer defaults to a WhitespaceAnalyzer and is wrapped in a
    10,000-token LimitTokenCountAnalyzer. *create* switches the writer
    to CREATE mode (truncates an existing index).
    """
    base = analyzer if analyzer is not None \
        else WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    limited = LimitTokenCountAnalyzer(base, 10000)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, limited)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def main(title, judge):
    """Attach to the JVM, open STORE_DIR and delegate the search for
    (*title*, *judge*) to run(); returns run()'s result."""
    lucene.getVMEnv().attachCurrentThread()
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = run(searcher, analyzer, title, judge)
    del searcher
    return res
def search_img(image):
    """Read *image* from disk and search the "index" store for it via
    run(); returns run()'s result."""
    img = cv2.imread(image)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, img)
    # BUGFIX: the original `del searcher` came after `return` and was
    # unreachable dead code; release the searcher before returning.
    del searcher
    return result
def __init__(self, analyser=None, file="prolog/wn_s.pl"):
    """Build a WordNet synonym map from a prolog synonym file.

    analyser: analyzer passed to WordnetSynonymParser. Defaults to a new
        WhitespaceAnalyzer created per call — the original default
        `analyser=WhitespaceAnalyzer()` was evaluated once at import time
        and shared across all instances (mutable-default pitfall).
    file: path to the WordNet prolog file (wn_s.pl).
    """
    if analyser is None:
        analyser = WhitespaceAnalyzer()
    self.parser = WordnetSynonymParser(True, True, analyser)
    # BUGFIX: close the prolog file after reading; the original
    # `open(file).read()` leaked the file handle.
    with open(file, 'r') as fh:
        pl_file = StringReader(fh.read())
    # Parse the prolog file with the WordnetSynonymParser, then build
    # the synonym map.
    self.parser.parse(pl_file)
    self.map = self.parser.build()
def init_search(search_content, vm_env): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() lucene.initVM() STORE_DIR = "zhihuindex" print 'lucene', lucene.VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) result_list = run(searcher, analyzer, search_content) del searcher return result_list
def __init__(self, path):
    """Create (or append to) a whitespace-analyzed index at *path* and
    index the documents via self.IndexDocs()."""
    if not Path(path).is_dir():
        os.mkdir(path)
    store_dir = SimpleFSDirectory(Paths.get(path))
    analyzer = WhitespaceAnalyzer(Version.LATEST)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store_dir, config)
    self.IndexDocs(writer)
def Search_text(command): STORE_DIR = "index" # lucene.initVM(vmargs=['-Djava.awt.headless=true']) vm_env.attachCurrentThread() print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) res = run(searcher, analyzer, command) del searcher return res
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over *store*; the analyzer (default
    WhitespaceAnalyzer) is capped at 10,000 tokens. *create* switches
    to CREATE mode."""
    if analyzer is None:
        analyzer = WhitespaceAnalyzer()
    limited = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(limited)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def text_search(command, cpage, meth): global vm_env, searcher, analyzer vm_env.attachCurrentThread() print 'lucene', lucene.VERSION directory = SimpleFSDirectory(File(STORE_TEXT_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) text, maxnum = runstext(command, cpage, meth) del searcher return text, maxnum
def testNull(self):
    """WhitespaceAnalyzer must split on whitespace only, leaving
    punctuation and letter case untouched."""
    a = WhitespaceAnalyzer()
    cases = [
        ("foo bar FOO BAR", ["foo", "bar", "FOO", "BAR"]),
        ("foo bar . FOO <> BAR",
         ["foo", "bar", ".", "FOO", "<>", "BAR"]),
        ("foo.bar.FOO.BAR", ["foo.bar.FOO.BAR"]),
        ("U.S.A.", ["U.S.A."]),
        ("C++", ["C++"]),
        ("B2B", ["B2B"]),
        ("2B", ["2B"]),
        ("\"QUOTED\" word", ["\"QUOTED\"", "word"]),
    ]
    for text, expected in cases:
        self._assertAnalyzesTo(a, text, expected)
def init_search(search_content, vm_env): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() lucene.initVM() STORE_DIR = "index" print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) result_s = run(searcher, analyzer, search_content) del searcher print(result_s) return result_s
def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice): analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(self.dir, config) # True,建立新索引,False,建立增量索引 noIndexedString = FieldType() noIndexedString.setTokenized(False) noIndexedString.setIndexed(False) noIndexedString.setStored(True) try: print "adding", goodname goodname_s = unicode(goodname, 'utf8') seg_list_good = jieba.cut(goodname_s, cut_all=False) goodname_s = " ".join(seg_list_good) # 默认模式 shopname_s = unicode(shopname, 'utf8') seg_list_shop = jieba.cut(shopname_s, cut_all=False) shopname_s = " ".join(seg_list_shop) # 默认模式 shopnameField = Field("shopName", shopname, noIndexedString) shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO) goodnameField = Field("goodName", goodname, noIndexedString) goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO) salenumField = IntField("saleNum", salenum, Field.Store.YES) priceField = DoubleField("price", price, Field.Store.YES) urlField = Field("url", url, noIndexedString) pictureField = StringField("pictureName", picturename, Field.Store.YES) commentField = Field("comments", comment, noIndexedString) historyPriceField = Field("historyPrice", historyprice, noIndexedString) doc = Document() doc.add(shopnameField) doc.add(shopnameField_s) doc.add(goodnameField) doc.add(goodnameField_s) doc.add(salenumField) doc.add(priceField) doc.add(urlField) doc.add(pictureField) doc.add(commentField) doc.add(historyPriceField) writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def search(self, query, field, maxReturnLimit):
    """Parse *query* against *field* with a WhitespaceAnalyzer and return
    the stored fields (id, pos, hallmarks, text) of up to
    *maxReturnLimit* hits."""
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, field,
                         analyzer).parse(query)
    hits = self.searcher.search(parsed, maxReturnLimit)
    result = []
    for hit in hits.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        result.append({
            "id": doc.get("id"),
            "pos": doc.get("pos"),
            # hallmarks is stored whitespace-joined; split back to a list
            "hallmarks": doc.get("hallmarks").split(),
            "text": doc.get("text"),
        })
    return result
def user_profile(searcher, author):
    """Builds an user profile {comments, links, subreddits}"""
    analyzer = WhitespaceAnalyzer()
    query = QueryParser("author", analyzer).parse('"%s"' % author)
    profile = {'comments': set(), 'links': set(), 'subreddits': set()}
    for sd in searcher.search(query, N_DOCS).scoreDocs:
        doc = searcher.doc(sd.doc)
        profile['comments'].add(doc.get('name'))
        profile['links'].add(doc.get('link_id'))
        profile['subreddits'].add(doc.get('subreddit'))
    return profile