Ejemplo n.º 1
0
 def GET(self, name):
     """Handle GET: query the "good" and "bad" review indexes for *name*
     and render every sufficiently long (>= 50 chars) review comment."""
     STORE_DIR_GOOD = "index_good"
     STORE_DIR_BAD = "index_bad"
     # Attach this request thread to the JVM before any Lucene call.
     vm_env.attachCurrentThread()
     directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
     searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
     directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
     searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
     analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     user_data = web.input(name=None)
     # NOTE(review): `command` is never used below — confirm yourInput()
     # has no required side effect before removing this call.
     command = yourInput(user_data)
     if user_data.brand == '':
         user_data.brand = '1'
     res = Run_Score(searcher_good, searcher_bad, analyzer, name,
                     user_data.brand)
     comments = []
     for i in range(len(res)):
         # presumably index 8 (when the row has 9 fields) holds the
         # review texts — TODO confirm against Run_Score
         if len(res[i]) == 9:
             t = res[i][8]
         else:
             t = ''
         for j in range(len(t)):
             s = t[j]
             # NOTE(review): encode()'s return value is discarded — this
             # is a no-op unless it is meant to raise on bad characters.
             s.encode("utf8")
             if len(s) >= 50:
                 comments.append(s)
     return render.comments(comments)
Ejemplo n.º 2
0
 def GET(self):
     """Handle GET: image similarity search when `keyword` looks like a
     png/jpg URL, otherwise a plain keyword search.

     Bug fixed: `lis` was unbound (NameError at render time) whenever the
     keyword was longer than 10 characters but did not end in png/jpg.
     The original suffix test also mixed `and`/`or` without parentheses;
     under the len > 10 guard it reduces to a simple suffix check.
     """
     user_data = web.input()
     message = user_data.keyword
     lis = []  # always bound so render.movies() below cannot NameError
     if len(message) > 10:
         if message[-3:] in ('png', 'jpg'):
             # Download the image and search by locality-sensitive hash.
             urlretrieve(message, 'target.jpg')
             lis1 = shit.LSH('target.jpg')
             vm_env.attachCurrentThread()
             STORE_DIR = 'index'
             directory = SimpleFSDirectory(File(STORE_DIR))
             searcher = IndexSearcher(DirectoryReader.open(directory))
             analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
             for item in lis1:
                 lis.append(run(searcher, analyzer, item)[0])
     else:
         # Short keyword: translate it and run a text search.
         a = func(user_data.keyword)
         STORE_DIR = 'index'
         vm_env.attachCurrentThread()
         directory = SimpleFSDirectory(File(STORE_DIR))
         searcher = IndexSearcher(DirectoryReader.open(directory))
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         lis = run(searcher, analyzer, a)
     f = login
     return render.movies(f, lis)
Ejemplo n.º 3
0
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     """Open a searcher over *index_path* with the named scoring model
     ("boolean", "tf", "tfidf" or "BM25"; anything else falls back to
     BM25(1.2, 0.2)), optionally open a feedback index, and build a
     query parser for *field*."""
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     # Resolve the scoring model, then install it with a single call.
     if similarity == "boolean":
         scorer = BooleanSimilarity()
     elif similarity == "tf":
         scorer = TFSimilarity()
     elif similarity == "tfidf":
         scorer = ClassicSimilarity()
     elif similarity == "BM25":
         scorer = BM25Similarity(1.2, 0.2)
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         scorer = BM25Similarity(1.2, 0.2)
     self.searcher.setSimilarity(scorer)
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
Ejemplo n.º 4
0
    def __init__(self):
        """Bring up the JVM, build or copy the Lucene index(es) described
        by the global `prm` config, and open searchers over them."""

        # Large fixed-size heap; headless so no X display is needed.
        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        # Build the main index on first run.  Terms are added only when
        # the main corpus and the "term" corpus are the same file.
        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        # Optionally copy the index to fast local storage before opening.
        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        # A separate term-level index is only needed when the two corpora
        # differ; it always stores terms.
        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()
Ejemplo n.º 5
0
 def GET(self, name):
     """Search both the "good" and "bad" review indexes for the shop named
     in the request and render the rating-result page."""
     vm_env.attachCurrentThread()
     good_store = SimpleFSDirectory(File("index_good"))
     bad_store = SimpleFSDirectory(File("index_bad"))
     good_searcher = IndexSearcher(DirectoryReader.open(good_store))
     bad_searcher = IndexSearcher(DirectoryReader.open(bad_store))
     analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     user_data = web.input(name=None)
     command = yourInput(user_data.shop)
     res = Run_GoodRate(good_searcher, bad_searcher, analyzer, command,
                        user_data.brand)
     # The query itself is appended as the last result entry.
     res.append(command)
     return render.SearchResult(res)
Ejemplo n.º 6
0
def running(command):
    """Normalise *command* to unicode and run it against the "index" store
    using a SimpleAnalyzer."""
    text = unicode(command)
    store = SimpleFSDirectory(File("index"))
    engine = IndexSearcher(DirectoryReader.open(store))
    tokenizer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    return run(engine, tokenizer, text)
Ejemplo n.º 7
0
def run(command):
    """Query the "contents" field of the "index" store and return the
    command together with one result row per matching shop (top 10)."""
    global vm_env
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    store = SimpleFSDirectory(File("index"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    top_hits = searcher.search(query, 10).scoreDocs

    res = []
    for hit in top_hits:
        doc = searcher.doc(hit.doc)
        row = [[doc.get('name1'), doc.get('name2')],
               doc.get("homepage"),
               doc.get("intro"),
               doc.get('logo')]
        # one extra entry per line of the goods field
        row.extend(doc.get('goods').split('\n'))
        res.append(row)

    return command, res
Ejemplo n.º 8
0
def main():
    """Render the search page: run `consulta` (if present) against the
    local index and list matching PDF titles/URLs."""
    resultados = []
    indice_vacio = len(os.listdir("./lucene/index")) == 0
    consulta = None if indice_vacio else request.args.get("consulta", None)
    if consulta is not None:
        store = SimpleFSDirectory(Paths.get("./lucene/index"))
        buscador = IndexSearcher(DirectoryReader.open(store))
        query = QueryParser("texto", SpanishAnalyzer()).parse(consulta)
        for sd in buscador.search(query, 10).scoreDocs:
            doc = buscador.doc(sd.doc)
            resultados.append({
                "url": direccion_base + doc.get("pdf"),
                "titulo": doc.get("titulo")
            })

    return render_template("main.html",
                           lucene=lucene.VERSION,
                           indice_vacio=indice_vacio,
                           resultados=resultados)
Ejemplo n.º 9
0
def retrieve(command):
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."

    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # to convert to AND query
    command = re.sub(r' ', r' +', command)
    command = "+" + command

    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))

    del searcher
    return retrieved_docs
Ejemplo n.º 10
0
Archivo: idx.py Proyecto: mkind/crawler
    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of (score, url, title) tuples

        """
        results = []
        # NOTE(review): this reader is opened per call and never closed —
        # confirm whether the caller manages its lifetime.
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                      self.analyser)
        query = parser.parse(text)

        # search, top 1000 hits
        # NOTE(review): scoreDocs is a Java array; .tolist() may not exist
        # under every PyLucene version — verify against the one in use.
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
Ejemplo n.º 11
0
def func(command):
    """Vector-search the image index for *command* and return one long
    space-separated result string, ranked by 0.6*likes + 0.4*views."""
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    store = SimpleFSDirectory(File("index"))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(store))
    # ------------ #
    vec = get_vp(get_d_dimensional_vector(command))
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vec)
    hits = searcher.search(query, 200).scoreDocs

    ranking = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        score = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        page = doc.get('Page_num')
        fields = [
            page,
            'data/' + page + '.jpg',
            doc.get('Page_link'),
            doc.get('Views'),
            doc.get('Likes'),
            '_'.join(doc.get('Img_alt').split()),
        ]
        ranking[' '.join(fields)] = score
    ordered = sorted(ranking.items(), key=lambda kv: kv[1], reverse=True)
    # each entry is followed by a single space (trailing space included)
    result = ''.join(entry + ' ' for entry, _ in ordered)
    del searcher
    del analyzer
    return result
Ejemplo n.º 12
0
def retriever(file_dir):
    """For every test AST query, retrieve the single most similar training
    example from the lucene index and write the matching source/summary
    pair to the reference/output files.

    Exits the process with status -1 if any query returns no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # Allow arbitrarily large boolean queries (long code lines).
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        # Strip non-word characters and Lucene operators so queries parse.
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                # SECURITY(review): eval() on data stored in the index —
                # prefer int() if ids are plain integers. Flagged only.
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
Ejemplo n.º 13
0
    def GET(self):
        """Suggest-style JSONP endpoint: return distinct titles matching
        the `command` query parameter (top 20 hits)."""
        command = web.input().command.encode('utf-8')
        initvm.vm_env.attachCurrentThread()

        directory = SimpleFSDirectory(File("jdindex"))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)

        finalDocs = []
        for scoreDoc in searcher.search(query, 20).scoreDocs:
            title = searcher.doc(scoreDoc.doc).get("title").strip('\n')
            # keep titles unique, preserving first-seen order
            if title not in finalDocs:
                finalDocs.append(title)

        web.header('content-type', 'text/json')
        data = {}
        data['q'] = command
        data['p'] = 'false'
        data['s'] = finalDocs
        return 'fn(' + json.dumps(data) + ');'
Ejemplo n.º 14
0
def SearchImgCommand(command):
    """Segment *command* with jieba, search the jd index (top 50) and
    return unique product hits; the first occurrence of an itemurl wins."""
    initvm.vm_env.attachCurrentThread()

    directory = SimpleFSDirectory(File("jdindex"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    tokens = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(tokens)

    seen = []
    finalDocs = []
    for scoreDoc in searcher.search(query, 50).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        itemurl = doc.get("itemurl")
        if itemurl in seen:
            continue
        oneDoc = {
            'imgurl': doc.get("imgurl"),
            'title': doc.get("title").strip('\n'),
            'itemurl': itemurl,
            'score': scoreDoc.score,
        }
        finalDocs.append(oneDoc)
        seen.append(itemurl)

    return finalDocs
Ejemplo n.º 15
0
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=None):
    """Search a Lucene index and print the top hits; optionally run one
    round of Rocchio relevance feedback (query expansion).

    indexfile -- path to the index directory.
    querytext -- user query for `default_field`.
    top -- number of hits to retrieve.
    qe -- when True, ask which results were relevant and re-search with
        an expanded query.
    display_fields -- fields printed for each hit; defaults to
        ["subreddit", "author", "text"].

    Bug fixed: `display_fields` was a mutable default argument, shared
    across calls (any caller mutation would leak into later calls).
    """
    if display_fields is None:
        display_fields = ["subreddit", "author", "text"]

    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]

        print("\n\n")

        # Rocchio expansion over the chosen relevant / non-relevant docs.
        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
Ejemplo n.º 16
0
def run_music(ID):
    """Look up one song document by its `id` field and return its fields
    as a list, or None when no document matches.

    Bug fixed: a bare `except:` hid every error (including typos and
    JVM errors); only an empty result set should yield None.
    """
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs

    try:
        scoreDoc = scoreDocs[0]
    except IndexError:
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    # NOTE(review): assumes the "similar" field always holds at least
    # three '+'-separated entries — confirm against the indexer.
    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher

    return item
Ejemplo n.º 17
0
def shourcut_retriever(keyword):
    '''Retriever: search the "shortcut" (summary) field for *keyword* and
    return [shortcut, url, name] triples for up to 20 hits.'''
    global flag
    # Initialise the JVM only on the first call in this process.
    if flag:
        lucene.initVM()
    flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    # NOTE(review): the analyzer uses LUCENE_CURRENT but the parser pins
    # LUCENE_4_10_1 — confirm the two versions are meant to differ.
    query = QueryParser(Version.LUCENE_4_10_1, "shortcut",
                        analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
Ejemplo n.º 18
0
    def __init__(self, root, storedir, isindexing=False, isBM25=True):
        """Optionally (re)build the index under *storedir* from *root*,
        then open a searcher over it.

        isindexing -- when True, rebuild the index from scratch (CREATE).
        isBM25 -- when True, use BM25 similarity for indexing and search.
        """
        if not os.path.exists(storedir):
            os.mkdir(storedir)

        # Limit analysis to the first 1,048,576 tokens per field.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            # Ticker prints progress on a side thread while the (slow)
            # commit runs; tick=False stops it afterwards.
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        # Query-time similarity must match the one used at index time.
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
Ejemplo n.º 19
0
 def __init__(self, path=INDEX_DIR):
     # Initialise Lucene: open the index at *path* and set up the
     # analyzer (Chinese tokeniser), reader and searcher.
     lucene.initVM()
     self.indir = SimpleFSDirectory(Paths.get(path))
     self.analyzer = SmartChineseAnalyzer()
     self.reader = DirectoryReader.open(self.indir)
     self.searcher = IndexSearcher(self.reader)
Ejemplo n.º 20
0
def search_img(output):
    """Look up one web-page record per image number in *output* and return
    the rows; the first list element is a deliberately empty placeholder."""
    directory = SimpleFSDirectory(File("WebPageIndex"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    fields = ("album", "subalbum", "singer", "url",
              "reviews", "imgurl", "imgnum")
    results = [[]]
    for num in output:
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                            analyzer).parse(str(num))
        for scoreDoc in searcher.search(query, 1).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.append([doc.get(name) for name in fields])
    del searcher
    return results
Ejemplo n.º 21
0
def searcher(directory, analyzer, queries_file):
    """Run TREC-style topics from *queries_file* against *directory*,
    appending the top-50 results per query to log.txt in the
    "<qid> Q<n> <docid> <rank> <score> Alex's" format.

    Fixes: the index reader/searcher/parser were rebuilt for every query
    and the readers were never closed; the log file is now closed via a
    context manager even on error.
    """
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    query_id = None  # NOTE(review): assumes a <num> line precedes each <desc>
    # Open the index once for all queries instead of once per query.
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    parser = QueryParser("DocParagraph", analyzer)
    with open("log.txt", "a") as log:
        for line_number in range(length):
            if lines[line_number].startswith("<num>"):
                query_id = lines[line_number][14:].strip()
            elif lines[line_number].startswith("<desc>"):
                # The query text is on the line after the <desc> tag.
                a_query = lines[line_number + 1].strip()
                a_query = stop_words(a_query)
            if a_query != '':
                query = parser.parse(a_query)
                # return 50 results as required by the assignment
                hits = searcher.search(query, 50).scoreDocs
                # rank counter 1 through 50
                rank_counter = 1
                for hit in hits:
                    result = searcher.doc(hit.doc)
                    # write search result to log text file
                    to_log = str(query_id) + " " + "Q" + str(
                        query_counter) + " " + str(result.get(
                            "DocID")) + " " + str(rank_counter) + " " + str(
                                hit.score) + " " + "Alex's" + "\n"
                    log.write(to_log)
                    rank_counter += 1
                query_counter += 1
                a_query = ''
    reader.close()
Ejemplo n.º 22
0
 def load_index(self):
     """Open the index at ``self.index_path`` and attach a reader and a
     searcher, reporting the document count."""
     store = SimpleFSDirectory(File(self.index_path))
     self.reader = IndexReader.open(store)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % n_docs)
Ejemplo n.º 23
0
 def __init__(self, index_path):
     # Load the index via Lucene and initialise the reader and searcher.
     indexDir = File(index_path)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(
         index)  # NOTE: this IndexReader was historically never closed
     self.searcher = IndexSearcher(self.reader)
Ejemplo n.º 24
0
 def __init__(self, indexDir):
     """Open the index at *indexDir* and build AND-mode query parsers for
     the 'name' and 'id' fields."""
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     reader = DirectoryReader.open(self.directory)
     self.searcher = IndexSearcher(reader)
     name_parser = QueryParser('name', StandardAnalyzer())
     name_parser.setDefaultOperator(QueryParser.Operator.AND)
     self.nameQueryParser = name_parser
     id_parser = QueryParser('id', StandardAnalyzer())
     id_parser.setDefaultOperator(QueryParser.Operator.AND)
     self.idQueryParser = id_parser
Ejemplo n.º 25
0
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        """Open a memory-mapped Lucene index and, optionally, the
        MongoDB-backed bigram caches for it.

        lucene_vm_flag -- True when the JVM is already initialised.
        is_bigram_cache_used -- when True, bind the tf/cf (and mapping
            probability) cache collections from *mongoObj*.
        """
        if lucene_vm_flag == False:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            (self.searchers[0]).setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used == True:
            # Cache collections are named after the index directory's
            # last path component.
            seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(seperate_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            # NOTE(review): LIST_F is a module-level global — confirm it is
            # defined before this constructor runs.
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
Ejemplo n.º 26
0
 def __init__(self, db_path):
     """Open the Lucene index at *db_path* and keep a searcher and an
     analyzer for later queries."""
     store = SimpleFSDirectory(File(db_path))
     index_reader = DirectoryReader.open(store)
     self.searcher = IndexSearcher(index_reader)
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     logger.info("Loaded DB from %s with %d documents: ", db_path,
                 index_reader.numDocs())
Ejemplo n.º 27
0
    def __init__(self, searchDir):
        """Open the index at *searchDir* with the project's English
        analyzer and attach a reader and a searcher."""
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
Ejemplo n.º 28
0
    def __init__(self, path, analyzer, topn=DEF_TOPN):
        """Keep the index path and analyzer and open a searcher over
        *path*.

        topn -- default number of hits retrieved per query.
        """
        self.path = path
        self._analyzer = analyzer
        self.topn = topn
        self._store = SimpleFSDirectory(Paths.get(os.path.abspath(self.path)))
        self._searcher = IndexSearcher(DirectoryReader.open(self._store))
    def retrieve_sents(self):
        """Search ``self.indexDir`` for ``self.query`` (OR semantics over
        the "contents" field) and return the Lucene doc ids of the top 50
        matching sentences."""

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        # NOTE: `query` is rebound from raw text to a parsed Query here.
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Ejemplo n.º 30
0
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        """Open the index at *index_path* and configure the query parser
        and similarity.

        use_default_similarity -- True: stock Lucene parser/similarity;
            False: the project's field-agnostic variants (explain-based
            scoring enabled).
        """
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger