Example #1
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=["subreddit", "author", "text"]):
    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]

        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
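A minimal driver for search_index above (the index path and query are placeholders; print_results and the queryexpansion module come from the surrounding project):

search_index("reddit.index", "climate change", top=10, qe=True,
             default_field="text",
             display_fields=["subreddit", "author", "text"])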
Example #2
    def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
                                  indexingParams, facetRequest):
        """
        Search an index with facets for given query and facet requests.
        returns a List<FacetResult>
        """
        # prepare searcher to search against
        searcher = IndexSearcher(indexReader)
        # collect matching documents into a collector
        topDocsCollector = TopScoreDocCollector.create(10, True)
        if not indexingParams:
            indexingParams = FacetIndexingParams.DEFAULT

        # Faceted search parameters indicate which facets we are interested in
        facetRequests = [facetRequest,]
        facetRequests = Arrays.asList(facetRequests)
        # Add the facet request of interest to the search params:
        facetSearchParams = FacetSearchParams(indexingParams, facetRequests)
        # and create a FacetsCollector to use in our faceted search:
        facetsCollector = FacetsCollector.create(facetSearchParams, indexReader, taxoReader)
        # perform documents search and facets accumulation
        searcher.search(query, MultiCollector.wrap([topDocsCollector, facetsCollector]))
        print "\nFound %d Documents for query=%s" % (topDocsCollector.totalHits,
                                                     query.toString().encode('utf-8'))
        # Obtain facets results and print them
        res = facetsCollector.getFacetResults()
        i = 0
        for facetResult in res:
            print "Result #%d has %d descendants" % (i, facetResult.getNumValidDescendants())
            print "Result #%d : %s" % (i, facetResult)
            i += 1

        return res
Example #3
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables
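A short, hypothetical driver for the class above (the index directory is a placeholder; assumes the usual PyLucene imports):

lucene.initVM()
searcher = Searcher("tables.index")
for table in searcher.find_by_name("users"):
    print(table.get("id"), table.get("name"))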
Example #4
class Lindex(object):
    def __init__(self, index_dir_ngram='./ngram', index_dir_vocab='./vocab'):
        self._index_dir_ngram = index_dir_ngram
        self._index_dir_vocab = index_dir_vocab
        fs = MMapDirectory.open(File(index_dir_ngram))
        self._searcher_ngram = IndexSearcher(DirectoryReader.open(fs))

    def __getitem__(self, ngram):
        pass

    def search_interactive(self):
        doc = None
        while True:
            print()
            print('Hit enter with no input to quit.')
            command = raw_input('Query:')
            if command == '':
                return doc
            print()
            print('Searching for: {}'.format(command))
            query = TermQuery(Term('ngram', command))
            hits = self._searcher_ngram.search(query, 1000).scoreDocs
            print('{} total matching documents.'.format(len(hits)))
            for i, hit in enumerate(hits):
                doc = self._searcher_ngram.doc(hit.doc)
                print('doc {}:'.format(i))
                for f in doc:
                    print(type(f))
                    print('\t{}'.format(f.toString().encode('utf-8')))
#                print 'text:', doc.get("text")
                if i >= 30:
                    break

    def search_interactive_history(self):
        doc = None
        while True:
            print()
            print('Hit enter with no input to quit.')
            command = raw_input('Query:')
            if command == '':
                return doc
            print()
            print('Searching for: {}'.format(command))
            query = TermQuery(Term('history', command))
            hits = self._searcher_ngram.search(query, 1000).scoreDocs
            print('{} total matching documents.'.format(len(hits)))
            for i, hit in enumerate(hits):
                doc = self._searcher_ngram.doc(hit.doc)
                print('doc {}:'.format(i))
                for f in doc:
                    print(type(f))
                    print('\t{}'.format(f.toString().encode('utf-8')))


#                print 'text:', doc.get("text")
                if i >= 30:
                    break
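A hypothetical interactive session with Lindex (the paths are the constructor defaults; assumes a running JVM):

idx = Lindex('./ngram', './vocab')
last_doc = idx.search_interactive()   # loops until an empty query is entered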
Example #5
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur:
    # MUST means the clause must match;
    # SHOULD means the clause should (but need not) match
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
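Because feature_type is a list of functions applied to the list of hit scores, a hypothetical call could aggregate the scores several ways at once (the query strings are placeholders; set_lucene_index, version, analyzer, and hitsPerPage are globals of the surrounding project; note map() is lazy on Python 3, hence the list()):

feature_type = [max, min, lambda s: sum(s) / len(s)]
scores = list(lucene_retrieval_multifield("what causes rain", "weather_corpus",
                                          feature_type, use_BM25=True))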
Example #6
class QAsearcher(object):
    def __init__(self, dir):
        self.dir = dir
        self.lReader = DirectoryReader.open(
            FSDirectory.open(Paths.get(self.dir)))
        self.lSearcher = IndexSearcher(self.lReader)

    # def get_collection_size(self):
    #     return self.lReader.numDocs()

    def doc_search(self, keywords):

        analyzer = StandardAnalyzer()
        parser = QueryParser('Title', analyzer)
        query = parser.parse(keywords)

        hits = []
        try:
            collector = TopScoreDocCollector.create(3000)
            self.lSearcher.search(query, collector)
            hits = collector.topDocs().scoreDocs
        except RuntimeError:
            print "Scoring documents failed"
        self.hits = hits
        return hits

    def print_result(self):
        temp = []
        for i in self.hits:
            # print "\nResult " + str(j) + "\tDocID: " + str(i.doc) + "\t Score: " + str(i.score)
            try:
                temp.append([
                    self.lReader.document(i.doc).get("Title"),
                    int(self.lReader.document(i.doc).get("AnswerCount"))
                ])
            except RuntimeError:
                print "Search fail"
        sort = np.array(temp, dtype=object)

        sort1000 = sort[np.argsort(-sort[:, 1])]

        show = 'The top 10 questions are\n'
        for i in range(0, 10):
            show = show + '\n' + 'Question ' + str(i + 1) + ' is: ' + str(
                sort1000[i, 0]) + '\nNumber of answers: ' + str(
                    sort1000[i, 1]) + '\n'

        easygui.textbox(title='6223 Top Questions', text=show)

    def close(self):
        try:
            if self.lReader is not None:
                self.lReader.close()
        except RuntimeError:
            print "Failed to close reader"
Example #7
class SearchIndex(object):
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream(
                "contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream,
                                                     doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
Example #8
class TextSearcher:
    def __init__(self, fs_directory):
        directory = SimpleFSDirectory(Paths.get(fs_directory))
        self.index_reader = DirectoryReader.open(directory)
        # reuse the reader for the searcher instead of opening a second one
        self.searcher = IndexSearcher(self.index_reader)
        self.analyzer = StandardAnalyzer()
        self.query = None
        self.lucene_dictionary = LuceneDictionary(self.index_reader,
                                                  'contents')
        self.formatter = SimpleHTMLFormatter()
        self.hits = None

    def search(self, searchtext):
        if searchtext is None:
            return 0

        self.query = QueryParser("contents", self.analyzer).parse(searchtext)
        score_docs = self.searcher.search(self.query, 50).scoreDocs
        print("%s total matching documents." % len(score_docs))
        return len(score_docs)

    def find_documents(self, search_text):
        self.query = QueryParser("contents", self.analyzer).parse(search_text)
        self.hits = self.searcher.search(self.query, 50)

        return self.hits

    def get_document(self, document_id):
        return self.searcher.doc(document_id)

    def get_current_query(self):
        return self.query

    def get_highlighted_hits(self):
        extracted_fragments = []

        scorer = QueryScorer(self.query)
        fragmenter = SimpleSpanFragmenter(scorer, 10)
        highlighter = Highlighter(self.formatter, scorer)
        highlighter.setTextFragmenter(fragmenter)

        for hit in self.hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            stream = TokenSources.getAnyTokenStream(self.index_reader, hit.doc,
                                                    'contents', self.analyzer)
            best_fragments = highlighter.getBestFragments(
                stream, document.get('contents'), 10)

            for fragment in best_fragments:
                print('fragment: ', fragment)

            extracted_fragments.append((hit.doc, best_fragments))

        return extracted_fragments
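A hypothetical run of the highlighting workflow above (the index path and query are placeholders; assumes a running JVM):

ts = TextSearcher("text.index")
if ts.search("apache lucene") > 0:        # prints and returns the hit count
    ts.find_documents("apache lucene")    # populates self.hits
    for doc_id, fragments in ts.get_highlighted_hits():
        print(doc_id, fragments)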
Example #10
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of functions

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #12
	def search(self, input_query=None, max_answers=10):
		''' Searches the given query in the index '''
		if input_query is None:
			return None

		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		
		
		# query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
		parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
		query = MultiFieldQueryParser.parse(parser, input_query)

		scoreDocs = searcher.search(query, max_answers).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		docs = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
			docs.append(doc_dict)
			# print doc
		return docs
Example #13
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(q)
        qwords_k = [st.stem(w) for w in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
Example #14
File: idx.py Project: mkind/crawler
    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of hits

        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
Example #15
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    for obj in objects:
        tokens = obj.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[obj] = set.intersection(*doc_sets)

    return docs_lookup, reader
Example #16
	def search(self, index_dir):
		# Get handle to index directory
		directory = SimpleFSDirectory(File(index_dir))

		# Creates a searcher searching the provided index.
		ireader  = DirectoryReader.open(directory)

		# Implements search over a single IndexReader.
		# Use a single instance and use it across queries
		# to improve performance.
		searcher = IndexSearcher(ireader)

		# Get the analyzer
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

		# Constructs a query parser. We specify what field to search into.
		queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

		# Create the query
		query = queryParser.parse(self.query)

		# Run the query and get top 50 results
		topDocs = searcher.search(query, self.retrieve_count)

		# Get top hits
		scoreDocs = topDocs.scoreDocs

		doc_ids = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_ids.append(doc.get(FIELD_PATH))
		return [int(item) for item in doc_ids]
Example #17
def func2(command):
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except:
            pass
    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    res2 = sorted(res1, key=lambda x: x[1], reverse=True)
    return res2
Example #18
    def query(self, data):
        if self.fil.exists():
            searcher = IndexSearcher(DirectoryReader.open(self.d))
            query = QueryParser(
                Version.LUCENE_30,
                "id",
                self.analyzer).parse(
                data['query'])
            hits = searcher.search(query, 100000)

            results = {}

            results['totalHits'] = hits.totalHits
            results['hits'] = {}

            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            searcher.getIndexReader().close()
            return results
Example #19
def func1(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    command_list = jieba.cut(command)
    command = " ".join(command_list)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doct = {
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence")
        }
        result.append(doct)
    del searcher
    return result
Example #20
class Searcher:
    #comment out to run searcher by itself
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    def __init__(self, indexDir):
        f = Paths.get(indexDir)
        self._dir = SimpleFSDirectory(f)
        self._indexSearcher = IndexSearcher(DirectoryReader.open(self._dir))
        self._weights = HashMap()
        self._weights.put(FIELDS[0], 1)
        self._weights.put(FIELDS[1], 0.2)

    def search(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        q = MultiFieldQueryParser.parse(query, FIELDS, [SHOULD, SHOULD],
                                        StandardAnalyzer())
        #		print(q.toString())
        topHits = 100
        scores = self._indexSearcher.search(q, topHits).scoreDocs
        results = []
        for i in range(min(10, len(scores))):
            doc = self._indexSearcher.doc(scores[i].doc)
            results.append((i + 1, scores[i].doc, doc.get("filename"),
                            doc.get("contents")))


#			print(i+1)
#			print("Score: ", scores[i].doc)
#			print("Title: ", doc.get("filename"))
#			print("Contents: ", doc.get("contents"))
        return results
Example #21
    def retrieve_sents(self):

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Example #22
    def GET(self):
        command = web.input().command.encode('utf-8')
        initvm.vm_env.attachCurrentThread()

        STORE_DIR = "jdindex"
        directory = SimpleFSDirectory(File(STORE_DIR))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 20).scoreDocs

        finalDocs = []
        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            title = doc.get("title").strip('\n')
            if title not in finalDocs:
                finalDocs.append(title)

        web.header('content-type', 'text/json')
        data = {}
        data['q'] = command
        data['p'] = 'false'
        data['s'] = finalDocs
        return 'fn(' + json.dumps(data) + ');'
Example #23
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs

    try:
        scoreDoc = scoreDocs[0]
    except IndexError:
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher

    return item
Example #24
class LuceneSearch():
    def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(File(lucene_index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(directory))
        self.num_docs_to_return = 5
        self.ireader = IndexReader.open(directory)

    def search(self, value):
        query = TermQuery(Term("wiki_name", value.lower()))
        # query = BooleanQuery()
        # query.add(new TermQuery(Term("wikidata_name",v)),BooleanClause.Occur.SHOULD)
        # query.add(new TermQuery(Term("wikidata_name",v)),BooleanClause.Occur.SHOULD)
        scoreDocs = self.searcher.search(query,
                                         self.num_docs_to_return).scoreDocs
        pids = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            for f in doc.getFields():
                print f.name(), ':', f.stringValue(), ',  '
                if (f.name() == "wiki_id"):
                    pids.append(f.stringValue())
            print ''
        print '-------------------------------------\n'
        return pids
Example #25
	def search(self):
		''' Searches the given query in the index '''

		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION
		# base_dir = os.path.dirname(os.path.abspath('.'))
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				# print 'path:', doc.get("path"), 'name:', doc.get("name")
				print doc
Example #26
def search_img(output):
    STORE_DIR = "WebPageIndex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    results = []
    results.append([])
    for num in output:
        imgnum = str(num)
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                            analyzer).parse(imgnum)
        scoreDocs = searcher.search(query, 1).scoreDocs
        for scoreDoc in scoreDocs:
            result = []
            doc = searcher.doc(scoreDoc.doc)
            result.append(doc.get("album"))
            result.append(doc.get("subalbum"))
            result.append(doc.get("singer"))
            result.append(doc.get("url"))
            result.append(doc.get("reviews"))
            result.append(doc.get("imgurl"))
            result.append(doc.get("imgnum"))
            results.append(result)
    del searcher
    return results
Example #27
def func(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #
    p = get_d_dimensional_vector(command)
    vp = get_vp(p)
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vp)
    scoreDocs = searcher.search(query, 200).scoreDocs

    dict1 = {}
    result = ""
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        dict1[ch] = rank
    res_list = sorted(dict1.items(), key=lambda item: item[1], reverse=True)
    for i in res_list:
        result += i[0]
        result += ' '
    del searcher
    del analyzer
    return result
Example #28
class LuceneSearcher(object):
    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        directory = SimpleFSDirectory(File(db_path))
        reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ",
                    db_path, reader.numDocs())
        
    def search(self, query, max_matches=1000):
        query = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", query)
        query = QueryParser(Version.LUCENE_CURRENT, "text",
                            self.analyzer).parse(query)
        score_docs = self.searcher.search(query, max_matches).scoreDocs
        logger.debug("%s total matching documents.",
                     len(score_docs))
        
        docs = [self.searcher.doc(d.doc) for d in score_docs]
        return [self.convert_to_dict(doc) for doc in docs]
        
    def convert_to_dict(self, doc):
        return {field: doc.get(field) for field in self.fields}
Example #29
class SSQA_S_Searcher:
    def __init__(self, indexDir, analyzer):
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug("Search similarity func: {}".format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text.strip()))
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        count = 0
        out_list = []
        for scoreDoc in tqdm(scoreDocs):
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append(doc['content'])
            count += 1
        logger.info("Added {} sentences".format(count))
        return out_list

    def close(self):
        self.reader.close()
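mySimilarity() is project-specific and not shown here; a minimal stand-in, assuming PyLucene's stock classes, is Lucene's built-in BM25, which plugs into setSimilarity() the same way:

from org.apache.lucene.search.similarities import BM25Similarity

def mySimilarity():
    # stand-in only: plain BM25 with k1=1.2, b=0.75; the project's real
    # similarity may differ
    return BM25Similarity(1.2, 0.75)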
Example #30
def searcher(directory, analyzer, queries_file):
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    log = open("log.txt", "a")
    for line_number in range(length):
        if lines[line_number].startswith("<num>"):
            query_id = lines[line_number][14:].strip()
        elif lines[line_number].startswith("<desc>"):
            a_query = lines[line_number + 1].strip()
            a_query = stop_words(a_query)
        if a_query != '':
            # searching the index
            reader = DirectoryReader.open(directory)
            searcher = IndexSearcher(reader)
            # parse the query
            parser = QueryParser("DocParagraph", analyzer)
            query = parser.parse(a_query)
            # retrieve 50 results, as required by the assignment
            hits = searcher.search(query, 50).scoreDocs
            # rank counter 1 through 50
            rank_counter = 1
            for hit in hits:
                result = searcher.doc(hit.doc)
                # write search result to log text file
                to_log = str(query_id) + " " + "Q" + str(
                    query_counter) + " " + str(result.get(
                        "DocID")) + " " + str(rank_counter) + " " + str(
                            hit.score) + " " + "Alex's" + "\n"
                log.write(to_log)
                rank_counter += 1
            query_counter += 1
            a_query = ''
    log.close()
Example #31
def SearchImgCommand(command):
    initvm.vm_env.attachCurrentThread()

    STORE_DIR = "jdindex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    contentCommand = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(contentCommand)
    scoreDocs = searcher.search(query, 50).scoreDocs

    Already = []
    finalDocs = []
    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        itemurl = doc.get("itemurl")
        if itemurl not in Already:
            oneDoc = {}
            oneDoc['imgurl'] = doc.get("imgurl")
            oneDoc['title'] = doc.get("title").strip('\n')
            oneDoc['itemurl'] = itemurl
            oneDoc['score'] = scoreDoc.score
            finalDocs.append(oneDoc)
            Already.append(itemurl)

    return finalDocs
Example #32
def retrieve(command):
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."

    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # to convert to AND query
    command = re.sub(r' ', r' +', command)
    command = "+" + command

    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))

    del searcher
    return retrieved_docs
Example #33
class Searcher(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Searcher

        :param count: The number of counts to return from a query
        :param output: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))

    def search(self, query):
        """ Given a query, apply it against the existing index.

        :param query: The query to apply to the index
        :returns: A generator of the matching documents
        """
        query = QueryParser(Version.LUCENE_30, "data", self.analyzer).parse(query)
        results = self.searcher.search(query, self.count)
        for result in results.scoreDocs or []:
            # logger.debug("%s %s %s", hit.score, hit.doc, hit.toString())
            document = self.searcher.doc(result.doc)
            yield document.get("path"), result.score
Example #35
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir

    # Create a searcher based on the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    # Create a QueryParser over the "keywords" field,
    # a variable holding the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    # Join the supplied parameters into a single query string
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    # Return a JArray with the results of the query
    return searcher.search(query, 200).scoreDocs
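A hypothetical invocation of buscar (the index directory and terms are placeholders; the JVM must already be running):

hits = buscar("lucene.index", ["python", "lucene"])
print len(hits)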
Example #36
def shourcut_retriever(keyword):
    '''Retriever: searches within the brief-description ("shortcut") field'''
    global flag
    if flag:
        lucene.initVM()
    flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_4_10_1, "shortcut",
                        analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" %
          (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
Example #37
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    #print "%s total matching documents." % len(scoreDocs)
    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        a = doc.get('goods')
        a = a.split('\n')
        for i in a:
            tmp.append(i)
        res.append(tmp)

    return command, res
Example #38
class Search():
    def __init__(self, file_name):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        directory = SimpleFSDirectory(
            Paths.get(os.path.join(base_dir, file_name)))
        self.reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.error_count = 0
        #import pdb; pdb.set_trace()

    def search_one(self, line, search_type, return_count=1):
        line = line.replace("?", "").replace("OR", "").replace(
            "AND", "").replace("NOT", "")
        line = QueryParser.escape(line)
        search_types = ["all", "context", "last", "response"]
        query = QueryParser(search_types[search_type],
                            self.analyzer).parse(line)
        scoreDocs = self.searcher.search(query, return_count).scoreDocs
        if len(scoreDocs) == 0:
            self.error_count += 1
            print("Error in search: ", line)
            return None
        res = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            score = scoreDoc.score
            res.append((doc.get("all"), doc.get("context"), doc.get("last"),
                        doc.get("response"), score))
        return res
Example #39
def get_query_results(reader, query, n, field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))
Example #40
def get_image_pmcid(pmcid, classes=""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
Example #41
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Example #42
def main():
    resultados = []
    indice_vacio = False
    if len(os.listdir("./lucene/index")) == 0:
        indice_vacio = True
    else:
        consulta = request.args.get("consulta", None)
        if consulta is not None:
            directory = SimpleFSDirectory(Paths.get("./lucene/index"))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = SpanishAnalyzer()
            query = QueryParser("texto", analyzer).parse(consulta)
            scoreDocs = searcher.search(query, 10).scoreDocs

            for sd in scoreDocs:
                doc = searcher.doc(sd.doc)
                resultados.append({
                    "url": direccion_base + doc.get("pdf"),
                    "titulo": doc.get("titulo")
                })

    return render_template("main.html",
                           lucene=lucene.VERSION,
                           indice_vacio=indice_vacio,
                           resultados=resultados)
Example #43
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
Example #44
    def search_docs(self, value, field="general_info"):
        MAX_RESULTS = 1000
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        query = QueryParser(Version.LUCENE_CURRENT, field,
                            self.analyzer).parse(value)
        topDocs = searcher.search(query, MAX_RESULTS)

        return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
Example #45
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: text of the retrieved document for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return the text of the retrieved document
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # during answer pre-processing, `none of the above` becomes '' and would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #46
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [img.get("filepath") + "/" + img.get("figureid")]
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if pmcid in imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0]
            documentDict[pmcid] = docDict
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
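
A minimal usage sketch for SearchQuery above; the query string, field list, and classification value are all hypothetical:

docs = SearchQuery("Shigella sonnei", ["title", "abstract"], "gel")
for pmcid, info in docs.items():
    print pmcid, info["title"], info["imgURL"]
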
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:
            # skip the offset of earlier pages
            if count < results_per_page * page:
                count += 1
                continue
            count += 1

            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
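
The offset-skipping loop above re-collects every earlier page on each call; a hedged sketch of Lucene's searchAfter API (available in Lucene 4+), which only collects the requested page by resuming from the last ScoreDoc of the previous one:

last_doc = None
for page_no in range(3):
    if last_doc is None:
        top = searcher.search(query, results_per_page)
    else:
        top = searcher.searchAfter(last_doc, query, results_per_page)
    if not top.scoreDocs:
        break
    last_doc = top.scoreDocs[-1]
    # ... render this page of top.scoreDocs ...
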
Example #48
0
def is_article_indexed(art_id, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))

    docs = searcher.search(query, 1).scoreDocs

    return len(docs) > 0
Example #49
0
class Searcher:
    def __init__(self, analyzer, indexDir):

        self.mAnalyzer = analyzer

        dir = SimpleFSDirectory(File(indexDir))
        self.mIndexSearcher = IndexSearcher(DirectoryReader.open(dir))

    def search(self, strQuery, fieldName, fieldName_2):

        query = QueryParser(Version.LUCENE_CURRENT, fieldName, self.mAnalyzer).parse(strQuery)
        docs = self.mIndexSearcher.search(query, 50).scoreDocs

        if len(docs) < 1:
            query = QueryParser(Version.LUCENE_CURRENT, fieldName_2, StandardAnalyzer(Version.LUCENE_CURRENT)).parse(
                strQuery
            )
            docs = self.mIndexSearcher.search(query, 50).scoreDocs

        return docs, query
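
A minimal usage sketch (index path and field names hypothetical): the class first queries fieldName and falls back to fieldName_2 when nothing matches.

searcher = Searcher(StandardAnalyzer(Version.LUCENE_CURRENT), "/path/to/index")
docs, used_query = searcher.search("hamlet", "title", "content")
print "%d hits for %s" % (len(docs), used_query)
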
Example #50
0
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        # ensure the directory exists before opening a writer on it
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

        self.writer = IndexWriter(self.directory, self.config)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()
        # refresh the searcher so the newly committed documents are visible
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))


    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub(r'(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub(r'[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub(r'([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
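
A minimal end-to-end sketch for the class above (path hypothetical). Note the constructor opens a DirectoryReader, so on a brand-new directory it would fail; the sketch assumes an index already exists there, and relies on closeIndex() refreshing the searcher:

wiki = WikiPageIndex("/tmp/wiki-index")
wiki.createIndex()
wiki.addDocumentToIndex("Lucene", "Lucene is a full-text search library.")
wiki.closeIndex()   # commits and refreshes the searcher
for doc in wiki.searchIndex("full-text search", max_results=5):
    print(doc.get("Title"))
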
Example #51
0
def search(term, n_docs=10, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)

    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"

    score_docs = searcher.search(query, n_docs).scoreDocs

    return [(score_doc.score, unicode(searcher.doc(score_doc.doc).get('art_body'))) for score_doc in score_docs]
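
A minimal usage sketch pairing each score with a truncated article body (query term hypothetical):

for score, body in search("municipal elections", n_docs=3):
    print score, body[:60]
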
Example #52
0
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')    
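
A minimal usage sketch; since lucene.initVM() can only run once per process, calling retrieve_wiki repeatedly may require guarding it with lucene.getVMEnv() first:

for passage in retrieve_wiki("lunar eclipse", "/path/to/wiki-index"):
    print(passage[:80])   # peek at the top hit
    break
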
Example #53
0
def main(indexDir, inputDir):
	"""Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
	lucene.initVM()

	# Open index
	logger.info("Opening Lucene index [%s]..." % indexDir)
	dir = SimpleFSDirectory(File(indexDir))
	analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
	reader = DirectoryReader.open(dir)
	searcher = IndexSearcher(reader)

	# Search documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	rels = list()
	for f in onlyfiles:
		journal_code = f.split('.')[0]
		f = join(inputDir, f)
		json_data = open(f)
		data = json.load(json_data)
		# The results collected after comparison

		for entry in data:
			url = entry['url']
			date = entry['date']
			title = entry['title']

			logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))

			tt = nltk.word_tokenize(title)
			tokens = []
			for t in tt:
				tokens.append(t.lower())

			for token in tokens:
				q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (token, date, journal_code, url)
				query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(q)
				hits = searcher.search(query, MAX_HITS)

				logger.debug("Found %d document(s) that matched query '%s':" % (hits.totalHits, q))

				for hit in hits.scoreDocs:
					doc = searcher.doc(hit.doc)
					logger.debug(doc)

					rels.append({'left': url, 'token': token, 'right': doc.get('url')})
		json_data.close()

	with open('relationships.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for rel in rels:
			csvwriter.writerow([rel['left'].encode('utf8'), rel['token'].encode('utf8'), rel['right'].encode('utf8')])
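
The title tokens and URLs above are interpolated into the query string verbatim, so values containing Lucene metacharacters (quotes, colons, and so on) would break parsing; a hedged hardening step using the parser's escape helper:

q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
    QueryParser.escape(token), date, journal_code, QueryParser.escape(url))
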
Example #54
0
class PyLucene(object):
	def __init__(self):
		if luceneImport:
			self.lucene = True
		else:
			self.lucene = False

		#Lucene connection
		lucene.initVM()
		indexDir = "texts/index"
		directory = MMapDirectory(File(indexDir))
		reader = DirectoryReader.open(directory)
		self.analyzer = StandardAnalyzer(Version.LUCENE_30)
		self.searcher = IndexSearcher(reader)

	def query(self, terms = []):
		query = QueryParser(Version.LUCENE_30, "text", self.analyzer).parse(" OR ".join(terms))
		MAX = 1000
		hits = self.searcher.search(query, MAX)

		results = []
		for hit in hits.scoreDocs:
			doc = self.searcher.doc(hit.doc)
			results.append([doc.get("doc_id").encode("utf-8"), doc.get("head").encode("utf-8")])

		return results

	def occurencies(self, term, morphs):
		query = []
		already = []

		for morph in morphs:
			query.append(morph)
			# Sometimes, because of XML hashing in Lucene, a doubtful lemma can show up doubled (e.g. "wordword"), so query for that form too
			query.append(morph+morph)

		results = self.query(query)

		resultsReturned = []
		for result in results:
			if result[0] not in already:
				resultsReturned.append(result)
				already.append(result[0])

		return resultsReturned, len(resultsReturned)

	def chunk(self, occurency):
		# Could be refined using the section information, but that may only be a milestone

		return occurency  #, len(occurency)
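
A minimal usage sketch for the class above; the term and morph list are hypothetical, and an index is assumed to exist under texts/index:

engine = PyLucene()
rows, n = engine.occurencies("amor", ["amor", "amoris"])
print "%d distinct matching documents" % n
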
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}
    
    f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
    def perform_search(self, searchterm):
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)

        query = parser.parse(searchterm)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start

        print scoreDocs
        print duration
Example #58
0
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """ Returns list of Score objects, which exposes `doc` (document ID in lucene) and `score` attrs
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query)
    )
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
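
A minimal usage sketch for make_request (index path hypothetical); the regexp strips punctuation that would otherwise trip up the query parser:

analyzer = StandardAnalyzer(Version.LUCENE_30)
score_docs = make_request("what is a black hole?", analyzer, "index",
                          qparser_regexp=r"[^\w\s]", max_results=10)
for sd in score_docs:
    print sd.doc, sd.score
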
Example #59
0
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        for j, sd in enumerate(topDocs.scoreDocs):
            d = searcher.doc(sd.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(sd.score)
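
BM25Similarity() above runs with Lucene's default parameters (k1=1.2, b=0.75); a one-line variant that sets them explicitly, mirroring the BM25 branch in lucene_retrieval earlier:

searcher.setSimilarity(BM25Similarity(1.2, 0.75))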