コード例 #1
0
ファイル: syntax.py プロジェクト: zoudajia/rencos
def retriever(file_dir):
    """For each test AST query, retrieve the most similar training sample.

    Searches a Lucene index under ``file_dir/lucene_index`` (field "code",
    whitespace-tokenized), writing the best-matching training source line
    to test/test.ref.src.0 and its summary to output/ast.out.
    Exits the process if any query produces no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # Long whitespace-joined queries can exceed the default clause limit.
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso]
        summaries = [line.strip() for line in fsu]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        # Strip non-word chars and query keywords; raw string avoids the
        # invalid-escape-sequence warning the old "[\W\s]+" produced.
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                # int() instead of eval(): the stored id is a plain integer
                # index, and eval() on index content is unsafe.
                _id = int(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
コード例 #2
0
ファイル: code.py プロジェクト: yuemonangong/Movie_Search
def func1(command):
    """Search the "contents" field of the movie index for *command*.

    The query string is segmented with jieba and re-joined with spaces so
    the whitespace analyzer can tokenize it. Returns up to 50 hits as
    dicts with 'title', 'url' and 'sentence'; [] for an empty command.
    """
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    # Whitespace-join the jieba segments to match the index tokenization.
    command = " ".join(jieba.cut(command))
    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         analyzer).parse(command)
    hits = searcher.search(parsed, 50).scoreDocs
    result = []
    for hit in hits:
        document = searcher.doc(hit.doc)
        result.append({
            'title': document.get("title"),
            'url': document.get("url"),
            "sentence": document.get("sentence"),
        })
    del searcher
    return result
コード例 #3
0
ファイル: code2.py プロジェクト: spiderdet/homework
def func2(command):
    """Search recipes by main-ingredient ("zhuliao") field.

    Returns up to 9 hits as tuples
    (name, collect_num:int, zhuliao list, zuofa steps, img_url, url),
    sorted by collect_num descending, or None for an empty command.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except AttributeError:
            # A missing stored field comes back as None and has no
            # split(); skip incomplete documents instead of swallowing
            # every exception with a bare except.
            pass
    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    # cmp=None dropped: it was a no-op in Python 2 and is invalid in Python 3.
    res2 = sorted(res1, key=lambda x: x[1], reverse=True)
    return res2
コード例 #4
0
def search_img(output):
    """Look up web-page image records by image number.

    *output* is an iterable of image numbers; each is searched in the
    "imgnum" field (best hit only). Returns a list whose first entry is
    an empty list, followed by one
    [album, subalbum, singer, url, reviews, imgurl, imgnum] row per hit.
    """
    STORE_DIR = "WebPageIndex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    fields = ("album", "subalbum", "singer", "url",
              "reviews", "imgurl", "imgnum")
    results = [[]]
    for num in output:
        parsed = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                             analyzer).parse(str(num))
        for hit in searcher.search(parsed, 1).scoreDocs:
            record = searcher.doc(hit.doc)
            results.append([record.get(name) for name in fields])
    del searcher
    return results
コード例 #5
0
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
コード例 #6
0
def run(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get(
            "name"), 'url:', doc.get("url"), 'title:', doc.get("title")
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents", text)
        if highLightText != None:
            highLightText = ''.join(highLightText.split(' '))
        data = {}
        data['url'] = doc.get("url")
        data['title'] = doc.get('title')
        data['highlight'] = highLightText
        result.append(data)
    return result
コード例 #7
0
ファイル: PyLuceneTestCase.py プロジェクト: zky001/pylucene
    def getWriter(self,
                  directory=None,
                  analyzer=None,
                  open_mode=None,
                  similarity=None,
                  maxBufferedDocs=None,
                  mergePolicy=None):
        """Build an IndexWriter, supplying defaults for every unset option.

        Defaults: a whitespace analyzer capped at 10000 tokens per field,
        CREATE open mode, and the test case's own directory. similarity,
        maxBufferedDocs and mergePolicy are only applied when given.
        """
        writer_config = self.getConfig(
            analyzer if analyzer is not None else LimitTokenCountAnalyzer(
                WhitespaceAnalyzer(self.TEST_VERSION), 10000))
        writer_config.setOpenMode(
            IndexWriterConfig.OpenMode.CREATE if open_mode is None
            else open_mode)
        if similarity is not None:
            writer_config.setSimilarity(similarity)
        if maxBufferedDocs is not None:
            writer_config.setMaxBufferedDocs(maxBufferedDocs)
        if mergePolicy is not None:
            writer_config.setMergePolicy(mergePolicy)
        return IndexWriter(
            directory if directory is not None else self.directory,
            writer_config)
コード例 #8
0
    def index (cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.

        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.

        Reads the module-level docTexts/docTitles/categories samples and
        prints a summary of how many documents and facets were added.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
            #       Python list in order to to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded +=1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
コード例 #9
0
ファイル: UpdateIndex.py プロジェクト: paoxiaode/EE208-2019
 def testDelete(self, fieldName, searchString):
     """Delete every document whose *fieldName* term equals *searchString*."""
     # Cap the whitespace analyzer at 1048576 tokens per field.
     wrapped = LimitTokenCountAnalyzer(
         WhitespaceAnalyzer(Version.LUCENE_CURRENT), 1048576)
     cfg = IndexWriterConfig(Version.LUCENE_CURRENT, wrapped)
     cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     writer = IndexWriter(self.dir, cfg)
     writer.deleteDocuments(Term(fieldName, searchString))
     writer.close()
コード例 #10
0
def createIndexWriter(indexDir):
    """Open (creating the directory if needed) and return a fresh IndexWriter.

    The index is opened in CREATE mode, so any existing index content in
    *indexDir* is discarded.
    """
    # makedirs + exist_ok avoids the exists()/mkdir() race of the old
    # LBYL check and also handles nested paths a bare mkdir() rejects.
    os.makedirs(indexDir, exist_ok=True)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    #config = config.setRAMBufferSizeMB(ramBufferSize)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
コード例 #11
0
 def createAnalyzer(self):
     """Instantiate the analyzer named by self._analyzer['type'].

     Raises Exception for any unrecognised analyzer configuration.
     """
     kind = self._analyzer['type']
     if kind == "MerescoStandardAnalyzer":
         return MerescoStandardAnalyzer()
     if kind == "MerescoDutchStemmingAnalyzer":
         return MerescoDutchStemmingAnalyzer(
             self._analyzer['stemmingFields'])
     if kind == "WhitespaceAnalyzer":
         return WhitespaceAnalyzer()
     raise Exception("No support for type " + str(self._analyzer))
コード例 #12
0
def search_dianping(province, kind, query):
    """Search the restaurant index for *query* in field *kind* within *province*.

    kind must be 'food' or 'foodshop'; returns None for other kinds or an
    empty query. All terms are combined as MUST clauses, the top-50 hits
    are scanned for the highest 'rank', and the details of a best-ranked
    shop are returned as a dict of utf-8 encoded strings.

    NOTE(review): the second loop assigns into the same dict for every
    doc whose rank equals the maximum, so when several shops tie, later
    hits overwrite earlier ones — confirm this is intended.
    """
    STORE_DIR = "index"
    vm_env.attachCurrentThread()
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    allowed_opt = ['food', 'foodshop']

    if kind not in allowed_opt:
        return None
    if query == '':
        return None

    # Build a field:value command string and split it into per-field terms.
    command = '%s:%s province:%s' % (kind, query, province)
    command = unicode(command, 'utf8', 'ignore')
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    # First pass: find the highest rank among the hits.
    max_rank = 0
    best_shop = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank > max_rank:
            max_rank = cur_rank
            best_shop = cur_shop

    # Second pass: collect the details of a shop with the maximum rank.
    result = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))

        if cur_rank == max_rank:
            result['name'] = cur_shop.encode('utf8', 'ignore')
            result['rank'] = doc.get('rank').encode('utf8', 'ignore')
            result['food'] = doc.get('food').encode('utf8', 'ignore')
            result['location'] = doc.get('location').encode('utf8', 'ignore')
            result['tel'] = doc.get('tel').encode('utf8', 'ignore')
            result['environment_score'] = doc.get('environment_score').encode(
                'utf8', 'ignore')
            result['flavour_score'] = doc.get('flavour_score').encode(
                'utf8', 'ignore')
            result['service_score'] = doc.get('service_score').encode(
                'utf8', 'ignore')
            result['price_level'] = doc.get('price_level').encode(
                'utf8', 'ignore')

    del searcher
    return result
コード例 #13
0
def main():
    STORE_DIR = "index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher
コード例 #14
0
ファイル: index_docs.py プロジェクト: jwymbs23/pamphlets
def getWriter(store, analyzer=None, create=False):
    """Create an IndexWriter on *store*.

    The analyzer (whitespace by default) is capped at 10,000,000 tokens
    per field; create=True discards any existing index.
    """
    base = WhitespaceAnalyzer() if analyzer is None else analyzer
    capped = LimitTokenCountAnalyzer(base, 10000000)
    config = IndexWriterConfig(capped)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    print(store, config)
    return IndexWriter(store, config)
コード例 #15
0
def func_img(command):
    """Attach to the JVM, open the "index" store, and delegate to work().

    Returns work()'s (match_count, result) pair.
    """
    lucene.getVMEnv().attachCurrentThread()
    store = SimpleFSDirectory(File("index"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    match_count, result = work(searcher, analyzer, command)
    del searcher
    return match_count, result
コード例 #16
0
def getLucene(path):
    """Open a whitespace-analyzed index at *path*, sorted by the numeric stamp field.

    Returns (writer, reader, searcher); the reader comes from the writer,
    so it can see the writer's uncommitted changes.
    """
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setIndexSort(
        Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(FSDirectory.open(Paths.get(path)), config)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
コード例 #17
0
    def getWriter(self, store, analyzer=None, create=False):
        """Return an IndexWriter over *store*.

        analyzer defaults to a whitespace analyzer and is capped at 10000
        tokens per field; create=True opens the index in CREATE mode,
        discarding existing contents.
        """
        if analyzer is None:
            analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        limited = LimitTokenCountAnalyzer(analyzer, 10000)
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, limited)
        if create:
            cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        return IndexWriter(store, cfg)
コード例 #18
0
def main(title, judge):
    """Attach to the running JVM and search via run(searcher, analyzer, title, judge)."""
    lucene.getVMEnv().attachCurrentThread()
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    res = run(searcher, WhitespaceAnalyzer(Version.LUCENE_CURRENT),
              title, judge)
    del searcher
    return res
コード例 #19
0
def search_img(image):
    """Load *image* with OpenCV and run an image search against the "index" store.

    Returns whatever run(searcher, analyzer, img) produces.
    """
    img = cv2.imread(image)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, img)
    # BUG FIX: 'del searcher' previously sat after the return statement
    # and was unreachable; release the searcher before returning.
    del searcher
    return result
コード例 #20
0
    def __init__(self, analyser=None, file="prolog/wn_s.pl"):
        """Build a WordNet synonym map from a prolog s-records file.

        analyser: analyzer handed to WordnetSynonymParser; a fresh
            WhitespaceAnalyzer is created per call when omitted
            (the old default was a single instance shared by every
            caller, evaluated once at definition time).
        file: path to the WordNet prolog file to parse.
        """
        if analyser is None:
            analyser = WhitespaceAnalyzer()
        self.parser = WordnetSynonymParser(True, True, analyser)

        # Read the prolog-file for wordnet in a stringreader;
        # 'with' closes the handle the old code leaked.
        with open(file, 'r') as fp:
            PlFile = StringReader(fp.read())

        # Parse the prologfile with the WordnetSynonymParser
        self.parser.parse(PlFile)

        # Build the synonymmap
        self.map = self.parser.build()
コード例 #21
0
def init_search(search_content, vm_env):
    """Search the "zhihuindex" store for *search_content* via run().

    NOTE(review): the vm_env parameter is immediately shadowed by
    lucene.getVMEnv(), and lucene.initVM() is called *after*
    attachCurrentThread() — this only works if the VM was already
    initialised elsewhere; confirm against the caller.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    lucene.initVM()
    STORE_DIR = "zhihuindex"
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result_list = run(searcher, analyzer, search_content)
    del searcher
    return result_list
コード例 #22
0
ファイル: indexer.py プロジェクト: kjs1715/IR_system
 def __init__(self, path):
     """Ensure *path* exists, open an index there, and populate it via IndexDocs."""
     if not Path(path).is_dir():
         os.mkdir(path)
     store = SimpleFSDirectory(Paths.get(path))
     # Whitespace analyzer chosen over the Standard/SmartChinese
     # alternatives that were tried previously.
     cfg = IndexWriterConfig(WhitespaceAnalyzer(Version.LATEST))
     cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     self.IndexDocs(IndexWriter(store, cfg))
コード例 #23
0
def Search_text(command):
    STORE_DIR = "index"
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = run(searcher, analyzer, command)
    del searcher
    return res
コード例 #24
0
    def getWriter(self, store, analyzer=None, create=False):
        """IndexWriter factory.

        Uses a whitespace analyzer when none is given, caps it at 10000
        tokens per field, and truncates the index when create=True.
        """
        chosen = analyzer if analyzer is not None else WhitespaceAnalyzer()
        capped = LimitTokenCountAnalyzer(chosen, 10000)
        config = IndexWriterConfig(capped)
        #config.setInfoStream(PrintStreamInfoStream(System.out))
        if create:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        return IndexWriter(store, config)
コード例 #25
0
def text_search(command, cpage, meth):
    global vm_env, searcher, analyzer

    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(File(STORE_TEXT_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    text, maxnum = runstext(command, cpage, meth)

    del searcher

    return text, maxnum
コード例 #26
0
    def testNull(self):
        """Whitespace analysis splits on whitespace only: case,
        punctuation, and symbols inside tokens are preserved verbatim."""

        a = WhitespaceAnalyzer()
        self._assertAnalyzesTo(a, "foo bar FOO BAR",
                               ["foo", "bar", "FOO", "BAR"])
        self._assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                               ["foo", "bar", ".", "FOO", "<>", "BAR"])
        self._assertAnalyzesTo(a, "foo.bar.FOO.BAR", ["foo.bar.FOO.BAR"])
        self._assertAnalyzesTo(a, "U.S.A.", ["U.S.A."])
        self._assertAnalyzesTo(a, "C++", ["C++"])
        self._assertAnalyzesTo(a, "B2B", ["B2B"])
        self._assertAnalyzesTo(a, "2B", ["2B"])
        self._assertAnalyzesTo(a, "\"QUOTED\" word", ["\"QUOTED\"", "word"])
コード例 #27
0
def init_search(search_content, vm_env):
    """Search the "index" store for *search_content* via run(), printing the result.

    NOTE(review): the vm_env parameter is immediately shadowed by
    lucene.getVMEnv(), and lucene.initVM() is called *after*
    attachCurrentThread() — this only works if the VM was already
    initialised elsewhere; confirm against the caller.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    lucene.initVM()
    STORE_DIR = "index"
    print 'lucene', lucene.VERSION
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result_s = run(searcher, analyzer, search_content)
    del searcher
    print(result_s)
    return result_s
コード例 #28
0
ファイル: UpdateIndex.py プロジェクト: paoxiaode/EE208-2019
    def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
        """Add one product document to the index at self.dir.

        goodname and shopname are additionally jieba-segmented into the
        searchable goodName_s / shopName_s text fields; the raw values,
        url, comments and historyPrice are stored without indexing, while
        saleNum, price and pictureName are stored as typed fields.
        Indexing failures are reported on stdout rather than raised.
        """
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(self.dir, config)
        # CREATE_OR_APPEND: build a new index if absent, otherwise append.

        # Field type for stored-only values: not tokenized, not indexed.
        noIndexedString = FieldType()
        noIndexedString.setTokenized(False)
        noIndexedString.setIndexed(False)
        noIndexedString.setStored(True)

        try:
            print "adding", goodname

            goodname_s = unicode(goodname, 'utf8')
            seg_list_good = jieba.cut(goodname_s, cut_all=False)
            goodname_s = " ".join(seg_list_good)  # default (accurate) cut mode

            shopname_s = unicode(shopname, 'utf8')
            seg_list_shop = jieba.cut(shopname_s, cut_all=False)
            shopname_s = " ".join(seg_list_shop)  # default (accurate) cut mode

            shopnameField = Field("shopName", shopname, noIndexedString)
            shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
            goodnameField = Field("goodName", goodname, noIndexedString)
            goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
            salenumField = IntField("saleNum", salenum, Field.Store.YES)
            priceField = DoubleField("price", price, Field.Store.YES)
            urlField = Field("url", url, noIndexedString)
            pictureField = StringField("pictureName", picturename, Field.Store.YES)
            commentField = Field("comments", comment, noIndexedString)
            historyPriceField = Field("historyPrice", historyprice, noIndexedString)

            doc = Document()
            doc.add(shopnameField)
            doc.add(shopnameField_s)
            doc.add(goodnameField)
            doc.add(goodnameField_s)
            doc.add(salenumField)
            doc.add(priceField)
            doc.add(urlField)
            doc.add(pictureField)
            doc.add(commentField)
            doc.add(historyPriceField)

            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
コード例 #29
0
 def search(self, query, field, maxReturnLimit):
     """Parse *query* against *field* and return up to *maxReturnLimit* records.

     Each record is a dict with 'id', 'pos', 'hallmarks' (stored value
     split on whitespace) and 'text', read from the hit's stored fields.
     """
     parsed = QueryParser(Version.LUCENE_CURRENT, field,
                          WhitespaceAnalyzer(Version.LUCENE_CURRENT)).parse(query)
     hits = self.searcher.search(parsed, maxReturnLimit)
     result = []
     for hit in hits.scoreDocs:
         doc = self.searcher.doc(hit.doc)
         result.append({
             "id": doc.get("id"),
             "pos": doc.get("pos"),
             "hallmarks": doc.get("hallmarks").split(),
             #"hallmarks-exp": doc.get("hallmarks-exp").split(),
             "text": doc.get("text"),
         })
     return result
コード例 #30
0
def user_profile(searcher, author):
    """Builds an user profile {comments, links, subreddits}.

    Searches the "author" field for an exact (quoted) match and collects
    the distinct comment names, link ids and subreddits of the hits.
    """
    parsed = QueryParser("author", WhitespaceAnalyzer()).parse('"%s"' % author)
    hits = searcher.search(parsed, N_DOCS).scoreDocs

    profile = {key: set() for key in ('comments', 'links', 'subreddits')}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        for key, field in (('comments', 'name'),
                           ('links', 'link_id'),
                           ('subreddits', 'subreddit')):
            profile[key].add(doc.get(field))

    return profile