Example #1
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
def get_query_results(reader,query,n,field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))
	def search(self):
		''' Searches the given query in the index '''

		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION
		# base_dir = os.path.dirname(os.path.abspath('.'))
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				# print 'path:', doc.get("path"), 'name:', doc.get("name")
				print doc
def search():

	lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
	args = []
	if request.method == 'POST':
		if request.form['ies']:
			args.append('+ies:'+request.form['ies'])
		if request.form['area']:
			args.append('+area:'+request.form['area'])
		if request.form['professor']:
			args.append('+professor:'+request.form['professor'])
		if request.form['conceito']:
			#args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
			args.append('f:'+request.form['conceito'])

	table = []
	if(len(args) > 0): 
		scoreDocs = mansearch.buscar('indexer/',args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html', table=table)
Example #5
class Searcher(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Searcher

        :param count: The number of counts to return from a query
        :param output: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))

    def search(self, query):
        """ Given a query, apply it against the existing index.

        :param query: The query to apply to the index
        :returns: A generator of the matching documents
        """
        query = QueryParser(Version.LUCENE_30, "data", self.analyzer).parse(query)
        results = self.searcher.search(query, self.count)
        for result in results.scoreDocs or []:
            # logger.debug("%s %s %s", hit.score, hit.doc, hit.toString())
            document = self.searcher.doc(result.doc)
            yield document.get("path"), result.score
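A minimal usage sketch for the Searcher class above, assuming lucene.initVM() has already been called and that an index with a stored "path" field exists under "index" (both names come from the defaults above):

searcher = Searcher(count=10, root="index")
for path, score in searcher.search("hello world"):
    print("%s %.3f" % (path, score))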
Example #6
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
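A small illustration of the escaping step used in get_candidates() above (the sample string is made up): QueryParser.escape handles special characters such as parentheses and colons, but not the boolean keywords AND/OR/NOT, which is why those are backslash-escaped manually first.

q = "to be OR not to be (act 3)"
q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
print(QueryParser.escape(q))  # remaining special characters are escaped here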
Example #7
	def getIndexSearcher(self):

		indexSearcher = IndexSearcher(self.mIndexReader)
		if self.mSimilarity is not None:
			indexSearcher.setSimilarity(self.mSimilarity)
		
		return indexSearcher	
Example #8
    def query(self, data):
        if self.fil.exists():
            searcher = IndexSearcher(DirectoryReader.open(self.d))
            query = QueryParser(
                Version.LUCENE_30,
                "id",
                self.analyzer).parse(
                data['query'])
            hits = searcher.search(query, 100000)

            results = {}

            results['totalHits'] = hits.totalHits
            results['hits'] = {}

            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            searcher.getIndexReader().close()
            return results
Example #9
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
def buscar(indexDir, args,options = None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir
    
    #Create a searcher based on the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    
    #Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    #Create a QueryParser using "keywords" as the default field
    #Variable holding the search restrictions
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    #Join the supplied parameters with their values
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    #Return a JArray with the query results
    return searcher.search(query, 200).scoreDocs
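A minimal sketch of calling buscar() from the Flask view shown earlier; the index path and field values are placeholders. Because the default operator is AND and the arguments are joined with ' +', every field restriction must match.

scoreDocs = buscar('indexer/', ['ies:USP', 'area:computacao'])
print(len(scoreDocs))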
Example #12
class LuceneSearcher(object):
    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        directory = SimpleFSDirectory(File(db_path))
        reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ",
                    db_path, reader.numDocs())
        
    def search(self, query, max_matches=1000):
        query = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", query)
        query = QueryParser(Version.LUCENE_CURRENT, "text",
                            self.analyzer).parse(query)
        score_docs = self.searcher.search(query, max_matches).scoreDocs
        logger.debug("%s total matching documents.",
                     len(score_docs))
        
        docs = [self.searcher.doc(d.doc) for d in score_docs]
        return [self.convert_to_dict(doc) for doc in docs]
        
    def convert_to_dict(self, doc):
        return {field: doc.get(field) for field in self.fields}
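A minimal usage sketch for LuceneSearcher, assuming the surrounding module defines logger and VALID_CHARS_PATTERN and that the index stores the id/text/types fields listed above (the path and query are placeholders):

searcher = LuceneSearcher('/path/to/index')
for record in searcher.search('barack obama', max_matches=5):
    print("%s %s" % (record['id'], record['types']))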
Example #13
File: idx.py Project: mkind/crawler
    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of hits

        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
	def search(self, input_query=None, max_answers=10):
		''' Searches the given query in the index '''
		if input_query is None:
			return None

		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		
		
		# query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
		parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
		query = MultiFieldQueryParser.parse(parser, input_query)

		scoreDocs = searcher.search(query, max_answers).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		docs = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
			docs.append(doc_dict)
			# print doc
		return docs
 def search_docs(self, value, field="general_info"):
     MAX_RESULTS = 1000
     searcher = IndexSearcher(DirectoryReader.open(self.store))
     query = QueryParser(Version.LUCENE_CURRENT, field,
                         self.analyzer).parse(value)
     topDocs = searcher.search(query, MAX_RESULTS)
     
     return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (the library defaults are k1 = 1.2, b = 0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher,analyzer
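A minimal sketch of using the searcher/analyzer pair returned by config(); the field name "contents" and the query string are assumptions.

searcher, analyzer = config()
query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(u"query terms")
print(searcher.search(query, 10).totalHits)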
Example #17
def SearchQuery(queryString, fields, classification): 
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
 
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict 
    
    #Where we get the images for all the pmcids    
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    #create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid") 
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
            
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]
            
    #for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if pmcid in imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0] 
            documentDict[pmcid] = docDict 
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    
    #END - Where we get the images for all the pmcids
    
    
    return documentDict
Example #18
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #20
def is_article_indexed(art_id, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))

    docs = searcher.search(query, 1).scoreDocs

    return len(docs) > 0
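A minimal usage sketch (the article id and index path are placeholders):

if not is_article_indexed(12345, index='index'):
    print('article 12345 has not been indexed yet')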
Example #21
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)


    def search(self, q, page = 1, duplicates = False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)
        
        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )

        del self.searcher
        
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
Example #22
class WikiPageIndex():
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()


    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
Example #23
def search(term, n_docs=10, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)

    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"

    score_docs = searcher.search(query, n_docs).scoreDocs

    return [(score_doc.score, unicode(searcher.doc(score_doc.doc).get('art_body'))) for score_doc in score_docs]
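A minimal usage sketch for the search() helper above (the term and index path are placeholders):

for score, body in search('shigella', n_docs=5, index='index'):
    print("%.3f %s" % (score, body[:80]))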
Example #24
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    txt =text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')    
Example #25
def main(indexDir, inputDir):
	"""Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index"""
	lucene.initVM()

	# Open index
	logger.info("Opening Lucene index [%s]..." % indexDir)
	dir = SimpleFSDirectory(File(indexDir))
	analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
	reader = DirectoryReader.open(dir)
	searcher = IndexSearcher(reader)

	# Search documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	rels = list()
	for f in onlyfiles:
		journal_code = f.split('.')[0]
		f = join(inputDir, f)
		json_data = open(f)
		data = json.load(json_data)
		# The results collected after comparison

		for entry in data:
			url = entry['url']
			date = entry['date']
			title = entry['title']

			logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))

			tt = nltk.word_tokenize(title)
			tokens = []
			for t in tt:
				tokens.append(t.lower())

			for token in tokens:
				q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (token, date, journal_code, url)
				query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(q)
				hits = searcher.search(query, MAX_HITS)

				logger.debug("Found %d document(s) that matched query '%s':" % (hits.totalHits, q))

				for hit in hits.scoreDocs:
					doc = searcher.doc(hit.doc)
					logger.debug(doc)

					rels.append({'left': url, 'token': token, 'right': doc.get('url')})
		json_data.close()

	with open('relationships.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for rel in rels:
			csvwriter.writerow([rel['left'].encode('utf8'), rel['token'].encode('utf8'), rel['right'].encode('utf8')])
Example #26
class PyLucene(object):
	def __init__(self):
		if luceneImport:
			self.lucene = True
		else:
			self.lucene = False

		#Lucene connection
		lucene.initVM()
		indexDir = "texts/index"
		directory = MMapDirectory(File(indexDir))
		directory = DirectoryReader.open(directory)
		self.analyzer = StandardAnalyzer(Version.LUCENE_30)
		self.searcher = IndexSearcher(directory)

	def query(self, terms = []):
		query = QueryParser(Version.LUCENE_30, "text", self.analyzer).parse(" OR ".join(terms))
		MAX = 1000
		hits = self.searcher.search(query, MAX)

		results = []
		for hit in hits.scoreDocs:
			doc = self.searcher.doc(hit.doc)
			results.append([doc.get("doc_id").encode("utf-8"), doc.get("head").encode("utf-8")])

		return results

	def occurencies(self, term, morphs):
		query = []
		already = []

		for morph in morphs:
			query.append(morph)
			#Sometimes, when a term is ambiguous, the XML hashing in Lucene yields a doubled lemma such as "wordword"
			query.append(morph+morph)

		results = self.query(query)

		resultsReturned = []
		for result in results:
			if result[0] not in already:
				resultsReturned.append(result)
				already.append(result[0])

		return resultsReturned, len(resultsReturned)

	def chunk(self, occurency):
		#Could be updated using the section information, but that would only be a milestone

		return occurency#, len(occurency)
Example #27
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
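For illustration, feature_type above is expected to be a list of aggregation functions applied to the list of hit scores; a plausible definition (an assumption, not taken from the source) would be:

feature_type = [max, min, lambda scores: sum(scores) / float(len(scores))]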
Example #29
    def __init__(self, index_dir, index_file, rawQuery):
        self.indexFile = os.path.join(index_dir, index_file)

#         lucene.initVM(vmargs=['-Djava.awt.headless=true']) # uncomment when run Retrieve separately
        directory = SimpleFSDirectory(File(self.indexFile))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k=1.2, b=0.75
        if 'Standard' in self.indexFile:
            print "Use the StandardAnalyzer"
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # build a standard analyzer with default stop words
        if 'Porter' in self.indexFile:
            print "Use the PorterStemmer analyzer"
            analyzer = PorterStemmerAnalyzer()
        self.run(searcher, analyzer, rawQuery)
        del searcher
Example #30
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}
    
    f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
Example #31
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
# The following imports are for displaying context and highlighting results - Highlighter
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.index import Term
from org.apache.lucene import search
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search.highlight import Highlighter
from org.apache.lucene.search.highlight import QueryScorer
from org.apache.lucene.search.highlight import SimpleHTMLFormatter
from org.apache.lucene.search.highlight import SimpleSpanFragmenter 

# Libraries imported to implement the web interface
import web
from web import form

print 'lucene', lucene.VERSION
vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true']) 

STORE_DIR_PLAYLIST = "Playlist"
dir_playlist = SimpleFSDirectory(File(STORE_DIR_PLAYLIST))              
searcher_playlist = IndexSearcher(DirectoryReader.open(dir_playlist))

STORE_DIR_SONGS = "Songs"
dir_songs = SimpleFSDirectory(File(STORE_DIR_SONGS))              
searcher_songs = IndexSearcher(DirectoryReader.open(dir_songs))

analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

print "Initialization complete."
Example #32
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
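A minimal usage sketch, assuming K1 and B are module-level constants and that lucene.initVM() has already been called (the index path is a placeholder):

K1, B = 1.2, 0.75
searcher = createIndexSearcher('/path/to/index')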
Example #33
            doc = searcher.doc(scoreDoc.doc)
            #titleHighLight = Highlighter.getBestFragment(analyzer, "title", doc.get("标题"))
            #print titleHighLight
            lis.append([
                doc.get('标题'),
                doc.get('图片'),
                doc.get('评分'),
                doc.get("上映日期:"),
                doc.get("类型:")
            ])
            lis.sort(key=lambda x: x[2], reverse=True)
            #print 'path:', doc.get("path"),  \
            #"title:",doc.get("title"),"url:",doc.get("url"),'score:', scoreDoc.score
            # print 'explain:', searcher.explain(query, scoreDoc.doc)
        return lis


if __name__ == '__main__':
    STORE_DIR = "index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    lis = run(searcher, analyzer)
    for i in lis:
        for j in i:
            print j
    del searcher
Example #34
class QAsearcher(object):
    def __init__(self, dir):
        self.dir = dir
        self.lReader = DirectoryReader.open(
            FSDirectory.open(Paths.get(self.dir)))
        self.lSearcher = IndexSearcher(self.lReader)

    # def get_collection_size(self):
    #     return self.lReader.numDocs()

    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Scoring documents failed"
            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Scoring documents failed"
            self.hits = hits
            self.field = field
            return hits

    def print_result(self):
        j = 1

        for i in self.hits:
            print "\nResult " + str(j) + "\tDocID: " + str(
                i.doc) + "\t Score: " + str(i.score)
            try:
                if self.field == 'All':
                    print "Title: " + self.lReader.document(i.doc).get("Title")
                    print "Body: " + self.lReader.document(i.doc).get("Body")
                if self.field == 'Title':
                    print "Title: " + self.lReader.document(i.doc).get("Title")
                if self.field == 'Body':
                    print "Body: " + self.lReader.document(i.doc).get("Body")

            except RuntimeError:
                print "Search failed"
            j = j + 1
            print j

    def close(self):
        try:
            if self.lReader is not None:
                self.lReader.close()
        except RuntimeError:
            print "Failed to close the reader"
Example #35
    import argparse
    parser = argparse.ArgumentParser(
        description='Execute queries on comment body')
    parser.add_argument('user_name', type=str,
                        help="User name (profile to use)")
    parser.add_argument('index_dir', metavar='dir', type=str,
                        help="Index directory")
    parser.add_argument('--sim', type=str, nargs='?',
                        default="tfidf", help="Similarity (in [tfidf, lm, bm25])")
    parser.add_argument('--reorder', type=str, nargs='?',
                        default="no", help="Reordering (in [ups, normups])")
    parser.add_argument('--short', action='store_false',
                        help="Don't show the body of comments")
    args = parser.parse_args()


    if args.sim in ['bm25']:
        similarity = BM25Similarity()
    elif args.sim in ['lm']:
        similarity = LMDirichletSimilarity()
    else:
        similarity = ClassicSimilarity()

    # Sample query
    storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
    searcher = IndexSearcher(DirectoryReader.open(storeDir))
    if similarity is not None:
        searcher.setSimilarity(similarity)
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)
Example #36
                array = text.split()
                print array[0], " ", array[1], " ", array[2] + "</a><br>"


if __name__ == '__main__':
    #initialize VM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    #get user's input
    form = cgi.FieldStorage()
    searchTerm = form.getvalue('search')
    K = form.getvalue('kValue')
    #open index
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()
    print "results for query: ", searchTerm, "<br>"
    #if checkbox was checked, try 2,3,4,5,6 for k, see which one's clusters have higher silhouette coefficient
    #then use that number as k to search one more time to get final results
    if form.getvalue('autoK'):
        bestK = 0
        highestSilhouette = 0
        for counter in range(2, 7):
            if counter == 2:
                highestSilhouette = getSilhouette(reader, searcher, analyzer,
                                                  searchTerm, counter)
                bestK = counter
            else:
                silhouette = getSilhouette(reader, searcher, analyzer,
                                           searchTerm, counter)
Example #37
 def getSearcher(self, store):
     return IndexSearcher(DirectoryReader.open(store))
Example #38
class GitHubSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = index_path
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {"code": self.porter_analyzer}
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(
            index)  # note: the IndexReader used to be opened here without ever being closed
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def get_DF(self, field, term):
        return self.reader.docFreq(Term(field, term))

    def get_IDF(self, field, term):
        from math import log10, sqrt
        docF = self.reader.docFreq(Term(field, term))

        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def get_minimum_IDF(self, docF=2):
        from math import log10, sqrt
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  #"used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())

                    # Filter out noisy terms
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        # idf = self.get_IDF(field, term)

                        # print self.get_DF(field, term), term, field
                        #query += "%s:%s^%s " % (field, term, idf)
                        query += "%s:%s " % (field, term)

                    #print "term: %s idf: %s" % (term, self.get_minimum_IDF())

                #query += "%s:%s " % (field, term)
                #print "%s:%s^%s" % (field, term, self.getIDF(field, term))
        # for hint in doc.getFields("code_hints"):
        # 	tokens = utils.tokenize(hint.stringValue())
        # 	for token in tokens:
        # 		#print token
        # 		token = QueryParser.escape(token)
        # 		if token.strip():
        # 			print "HINTS", token
        # 			query += "code:%s^5.0 " % (token)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    print "TERM", term
                    # if term[0].isupper():
                    # 	query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                    # elif "(" in term or "." in term or "#" in term: # Heuristic to boost only code identifiers
                    # 	query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)

                    #query += "code:%s^5.0 " % (term)

        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []
        # def _get_matched_keywords(q, matched_terms):
        # 	print type(q), matched_terms
        # 	if isinstance(q, TermQuery):
        # 		if self.searcher.explain(q, docid).isMatch():
        # 			matched_terms.append( q.getTerm().text() )
        # 	elif isinstance(q, BooleanQuery):
        # 		for query_term in query.getClauses():
        # 			_get_matched_keywords(query_term, matched_terms)
        # 			# if self.searcher.explain(query_term.getQuery(), docid).isMatch():
        # 			# 	matched_terms.append( query_term.getQuery().getTerm().text() )

        # _get_matched_keywords(query, matched_terms)

        if isinstance(query, TermQuery):
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        elif isinstance(query, BooleanQuery):
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(),
                                         docid).isMatch():
                    matched_terms.append(
                        query_term.getQuery().getTerm().text())

        #print "Matched Terms: %s" % matched_terms
        return matched_terms

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                #field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self):
        """ Extends a query by matching query keywords in source code as text"""

        query = " "
        for term in tokenize_string(self.porter_analyzer, self.query):
            if term:
                term = QueryParser.escape(term)
                query += "code:%s " % (term)

        return query

    def lexical_search(self):
        """ In case no term is matching with stackoverflow we perform a simple lexical search on GitHub """
        github_result = []
        query = self.code_as_text().strip()
        query = QueryParser(Version.LUCENE_CURRENT, "code",
                            self.analyzer).parse(query)
        hits = self.searcher.search(query, 10).scoreDocs
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            matched_terms = self.get_matched_keywords(query, hit.doc)

            # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]

            item = GithubResultItem(doc.get("file"),
                                    decompress(doc.get("file_content")),
                                    matched_terms, hit.score, so_item,
                                    doc.get("line_numbers"), hit.doc)  # code

            github_result.append(item)

        return github_result

    def more_like_this(self, so_items):

        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)
            query = ""
            if so_item.doc:
                query = self.document_to_query(so_item.doc)

            query += self.code_as_text()
            if query:
                print "-" * 30
                print "Query: %s" % query
                print "-" * 30
                try:
                    like_query = queryparser.parse(query)

                    hits = self.searcher.search(like_query, 10).scoreDocs

                    for i, hit in enumerate(hits):
                        doc = self.searcher.doc(hit.doc)
                        matched_terms = self.get_matched_keywords2(
                            like_query, hit.doc)
                        # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                        item = GithubResultItem(doc.get("file"),
                                                decompress(
                                                    doc.get("file_content")),
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code
                        github_result.append(item)
                        #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
                except Exception as e:
                    print "Error: %s" % e
        # print Counter(files).most_common(5)
        return github_result

    def more_like_this2(self, so_items):
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))
        query = ""

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)  ####
        ### The loop below builds the augmented query
        for so_item in so_items:
            if so_item.doc:
                query += self.document_to_query(so_item.doc)
            query += self.code_as_text()

        github_result = []
        if query:
            print "-" * 50
            print "UNified Query: %s" % query
            print "-" * 50
            try:
                ### Final conversion into a Lucene query
                like_query = queryparser.parse(query)
                ### The line below actually searches the GitHub indices against like_query and keeps the top 5 results
                hits = self.searcher.search(like_query, 5).scoreDocs  # top 5 results
                # hits now holds the 5 results

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    print "Matched Terms : ", matched_terms

                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    print("file", doc.get("file"), "file_content",
                          doc.get("file_content"), "line_numbers",
                          doc.get("line_numbers"))
                    file_path = doc.get("file")
                    #file_path = "" + doc.get("file")[24:]
                    #file_path = "/root/GitSearch" + doc.get("file")[24:]
                    #print(doc.get("file")[32:])
                    #print(doc.get("file")[0:])
                    #print(file_path)

                    content = None
                    try:
                        with open(file_path) as f:  # open the file at the project path and read it into content
                            content = f.read()
                    except:
                        pass

                    # If the file was found and has content, build the result item in the expected format
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code
                        github_result.append(item)

            except Exception as e:
                print "GitSearcher: Error: %s" % e
                print(traceback.format_exc())

        return github_result
Example #39
 def __init__(self, index_path):
     indexDir = File(index_path)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
def main():
    """Function to index negative situations and retrive based on input sentence"""

    all_sent_df = pd.read_csv("../data/sentiment_data.csv")
    neg = all_sent_df[all_sent_df["label"] == 1]
    all_neg_phrases = list(neg["phrase"])
    with open("../data/negSituations.txt", "r") as fpointer:
        all_neg_situations = fpointer.readlines()

    all_neg_situations = map(lambda s: s.strip(), all_neg_situations)
    all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases)

    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('negSituationIndex')
    directory = SimpleFSDirectory(path)
    writer_config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writer_config)

    print(writer.numDocs())
    # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
    for each in all_neg_situations:
        document = Document()
        document.add(Field("negativeSituations", each, TextField.TYPE_STORED))
        writer.addDocument(document)

    print(writer.numDocs())
    writer.close()

    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    # QUERYING FOR A QUESTION
    with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer:
        all_test_sent = fpointer.readlines()
    all_test_sent = map(lambda s: s.strip(), all_test_sent)

    query_parser = QueryParser("negativeSituations", analyzer)

    total_num = 0
    tic = time.time()
    all_ans = []
    for each in all_test_sent:
        total_num = total_num + 1
        if total_num % 1000 == 0:
            print(total_num, time.time() - tic)

        query = query_parser.parse(query_parser.escape(each))
        hits = searcher.search(query, 3)
        docs_scores = [hit.score for hit in hits.scoreDocs]
        current_ans = []
        if docs_scores != []:
            for hit in hits.scoreDocs:
                doc_t = searcher.doc(hit.doc)
                doc_text = doc_t.get("negativeSituations")
                current_ans.append(doc_text)
        else:
            continue

        current_ans = list(set(current_ans))
        all_ans.append(current_ans)

    print(all_ans)
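
The retrieval loop above can be wrapped in a small helper to make it reusable. The following is only a hedged sketch built on the same index directory and field name ('negSituationIndex', 'negativeSituations') created above, and it assumes lucene.initVM() has already been called.

# Hedged sketch: reusable top-k retrieval over the index built above.
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

def retrieve_top_k(sentence, k=3, index_path='negSituationIndex'):
    """Return up to k stored negativeSituations entries matching the sentence."""
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_path)))
    searcher = IndexSearcher(reader)
    parser = QueryParser("negativeSituations", StandardAnalyzer())
    query = parser.parse(parser.escape(sentence))
    hits = searcher.search(query, k).scoreDocs
    results = [searcher.doc(hit.doc).get("negativeSituations") for hit in hits]
    reader.close()
    return results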
Ejemplo n.º 41
0
def main():
    global lucene_vm_init
    if not lucene_vm_init:
       lucene.initVM(vmargs=['-Djava.awt.headless=true'])
       lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj=Mongo_Object('localhost',27017)
      
    # search
    docDup=set()
    finalDup={}
    
    for i in xrange(len(queries)):
        print 'process query %d' %(i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            if d['title'] in docDup:
               finalDup[d['title']]=d
               continue
            docDup.add(d['title'])
            
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            title=d['title']
            if d['title'] in docDup:
               continue
            docDup.add(title)
            
            item=(mongoObj.conn_me).find_one({'title':title})
            if item is None:
               continue
            entitylist=item['entitylist'].split('|')
            for en_title in entitylist:
                if title==en_title:
                   continue
                t=Term('title',en_title)
                q=TermQuery(t)
                docs=searcher2.search(q,2)
                if docs.totalHits<=1:
                   continue
                
                docID2=(docs.scoreDocs)[0].doc
                doc=searcher2.doc(docID2)
                finalDup[doc['title']]=doc

    print 'begin to clean index, there are %d dup records' %(len(finalDup))
    for title in finalDup:
        doc=finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        
        name=doc['name']
        value=doc['value']
        category=doc['category']
        skos_category=doc['skos_category']
        all_text=doc['all_text']
        raw_name=doc['raw_name']
        raw_value=doc['raw_value']
        abstract=doc['abstract']
        
        print 'process '+title
        t=Term('title',title)
        q=TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
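
The cleanup loop above calls an addDoc helper that is not part of this snippet. The sketch below is a hypothetical reconstruction, assuming every column listed in the comment is re-indexed as a stored TextField; the real helper in the original project may use different field types.

# Hypothetical addDoc helper assumed by the snippet above; field types are a guess.
from org.apache.lucene.document import Document, Field, TextField

def addDoc(writer, title, name, value, category, skos_category,
           all_text, raw_name, raw_value, abstract):
    doc = Document()
    fields = (('title', title), ('name', name), ('value', value),
              ('category', category), ('skos_category', skos_category),
              ('all_text', all_text), ('raw_name', raw_name),
              ('raw_value', raw_value), ('abstract', abstract))
    for field_name, field_value in fields:
        doc.add(Field(field_name, field_value, TextField.TYPE_STORED))
    writer.addDocument(doc)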
Ejemplo n.º 42
0
        index_reader = DirectoryReader.open(index.store)
        # get vocab size

        terms = MultiFields.getTerms(index_reader, 'contents')
        termEnum = terms.iterator()
        vocabCounter = 0
        for term in BytesRefIterator.cast_(termEnum):
            vocabCounter += 1
        print("Number of docs:", index_reader.numDocs())
        print("Vocab size:", vocabCounter)

        # print min, max, mean
        querystr = 'بازار بزرگ تهران'
        print("Query: ", querystr)
        q = QueryParser("contents", index.analyzer).parse(querystr)
        hitsPerPage = 20
        searcher = IndexSearcher(index_reader)
        docs = searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs
        for i, hit in enumerate(hits):
            docId = hit.doc
            score = hit.score
            d = searcher.doc(docId)
            print("Query result(%d): %s , Similarity: %g" %
                  ((i + 1), d.get("id"), score))
        print("Overall time elapsed: ", (timeit.default_timer() - start_time))
    except Exception as e:
        print(e)
        raise e
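
The '# print min, max, mean' comment above is never acted on. Collection-level length statistics for the same field can be read off the Terms object already obtained; this hedged sketch reuses index_reader and MultiFields from the snippet and assumes the 'contents' field was indexed with term frequencies.

# Hedged sketch: average document length of the 'contents' field from index statistics.
terms = MultiFields.getTerms(index_reader, 'contents')
if terms is not None:
    total_tokens = terms.getSumTotalTermFreq()  # sum of term frequencies over all documents
    doc_count = terms.getDocCount()             # documents with at least one 'contents' term
    if doc_count > 0:
        print("Average document length: %.2f tokens" % (float(total_tokens) / doc_count))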
Ejemplo n.º 43
0
class Searcher():
    """A simple interface to search articles.

    In this class `MultiFieldQueryParser` and `DuplicateFilter` are used so that
    the query applies to multiple fields and duplicate results are avoided.
    """
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name, on which the duplication should avoid.
        boost : dict
            This dict controls the field weights used when computing scores.
        date_format : string
            Format used to convert strings into datetime objects. It should be
            consistent with the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(Paths.get(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.boost_map = HashMap()
        for k, v in boost.items():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format

    def query_between_dates(self, dt1, dt2, original_query=None):
        '''Return a range query restricting date_published to [dt1, dt2] (original_query is currently unused).'''
        return TermRangeQuery(
            'date_published',  # Field
            BytesRef(dt1.strftime(self.date_format)),  # Lower bound
            BytesRef(dt2.strftime(self.date_format)),  # Upper bound
            True,  # Include lower bound
            True  # Include upper bound
        )

    def refresh(self):
        """Refresh the searsher, if index is changed."""
        nireader = DirectoryReader.openIfChanged(self.reader)
        if nireader:
            self.reader.close()
            self.reader = nireader
            self.isearcher = IndexSearcher(self.reader)
            logger.debug('Index file changed, refreshed')
        else:
            logger.debug('Index file did not change.')

    def fetch_one_doc(self, score_doc):
        """Fetch one document from the scored doc results."""
        doc = self.isearcher.doc(score_doc.doc)
        return (
            doc.getField("group_id").numericValue().intValue(),
            doc.get("canonical_url"),
            doc.get("title"),
            doc.get("date_published"),
            doc.get("domain"),
            doc.get("site_type"),
            score_doc.score,
        )

    def search(self,
               query,
               n1=100,
               n2=100000,
               sort_by='relevant',
               use_lucene_syntax=False,
               min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            How many results are finally returned.
        n2 : int
            How many search results are fetched when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sorting order used for the lucene search.
        min_score_of_recent_sorting : float
            The minimum score kept when sorting by 'recent'.
        min_date_published : datetime or string
            The minimum date_published used to filter lucene search results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits represents the total number
            of hits and df is a pandas.DataFrame object. df.columns = ['id',
            'canonical_url', 'title', 'date_published', 'domain', 'site_type',
            'score']
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, str):
                dt1 = utc_from_str(min_date_published)
            q_dates = self.query_between_dates(dt1, dt2)
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.warning(q)
            if 'date_published:' in query:
                end = query.find('AND date_published')
                q_without_date_published = query[:end]
                logger.warning(q_without_date_published)
                q = self.mul_parser.parse(self.mul_parser,
                                          q_without_date_published)
                date_published_splits = query.split('date_published:[')
                date_range = date_published_splits[len(date_published_splits) -
                                                   1]
                date_range = date_range[:-1]
                logger.warning(date_range)
                if 'TO' in date_range:
                    date_range_splits = date_range.split('TO')
                    dt1_string = date_range_splits[0]
                    # handle a wildcard '*' in the date string, if present
                    if '*' in dt1_string:
                        date1_end = dt1_string.find('*') - 1
                        dt1_string = dt1_string[:date1_end]
                        logger.warning(dt1_string)
                    dt1 = utc_from_str(dt1_string)
                    dt2_string = date_range_splits[1]
                    if '*' in dt2_string:
                        date2_end = dt2_string.find('*') - 1
                        dt2_string = dt2_string[:date2_end]
                        logger.warning(dt2_string)
                    dt2 = utc_from_str(dt2_string)
                    query_dates = self.query_between_dates(dt1, dt2)
                    q = combine_queries(q, query_dates)
            if min_date_published is not None:
                q = combine_queries(q, q_dates)
            logger.warning('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parse the query string! \
You are quering with lucene syntax, be careful of your query string!""")
            else:
                raise APIParseError('Error when parse the query string!')

        cnames = [
            'id', 'canonical_url', 'title', 'date_published', 'domain',
            'site_type', 'score'
        ]
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]

                # Positions of the canonical URL and title within each record
                canonical_url, title = 1, 2
                # Use (canonical_url, title) 2-tuples as dict keys to deduplicate,
                # then turn the values back into a list
                unique_docs = dict()
                for record in records:
                    key = (record[canonical_url], record[title])
                    if key not in unique_docs:
                        unique_docs[key] = record
                # Include only unique records
                records = list(unique_docs.values())
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, n2, self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
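
A minimal usage sketch for the Searcher class above; the index path and query string are placeholders, and helpers such as clean_query and utc_from_str are assumed to live in the same module as in the original project.

# Hedged usage sketch; the index path and query are illustrative placeholders.
import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

searcher = Searcher(index_dir='/path/to/lucene_index')
total_hits, df = searcher.search('vaccine misinformation', n1=50, sort_by='relevant')
print('total hits: %d' % total_hits)
if not df.empty:
    print(df[['title', 'date_published', 'score']].head())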
Ejemplo n.º 44
0
 def getSearcher(self, directory=None, reader=None):
     if reader is not None:
         return IndexSearcher(reader)
     return IndexSearcher(self.getReader(directory=directory))
Ejemplo n.º 45
0
 def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     directory = SimpleFSDirectory(File(lucene_index_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     self.num_docs_to_return = 5
     self.ireader = IndexReader.open(directory)
Ejemplo n.º 46
0
class IndexAndTaxonomy(object):
    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(
                self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor,
                                                self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"],
                                              similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
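
A hedged sketch of how IndexAndTaxonomy above is typically driven: build it once, access .searcher lazily, and call reopen() after the index has been updated. The settings object and the two directories are assumed to come from the surrounding application.

# Hedged usage sketch; 'settings', 'index_dir' and 'taxo_dir' are assumptions
# about objects provided by the surrounding application.
index_and_taxo = IndexAndTaxonomy(settings,
                                  indexDirectory=index_dir,
                                  taxoDirectory=taxo_dir)
searcher = index_and_taxo.searcher   # created on first access, similarity applied
# ... after an IndexWriter commits new documents elsewhere ...
index_and_taxo.reopen()              # picks up the new segments if anything changed
searcher = index_and_taxo.searcher   # a fresh searcher is built when needed
index_and_taxo.close()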
Ejemplo n.º 47
0
elif args.mode == 'no_e':
    args.input_test_file = pre + 'no_e.json'

import lucene, time, json
import nltk
from java.io import File
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = FSDirectory.open(File(index_dir))
searcher = IndexSearcher(DirectoryReader.open(directory))
parser = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)

english_punctuations = [
    ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$',
    '%', "''", '``', "'s", "-", "--", '–'
]
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(english_punctuations)
stopwords.remove('by')

with open(raw2Q_file, 'r') as f:
    raw2Q = json.load(f)
with open(Qlabel_dict_file, 'r') as f:
    Qlabel_dict = json.load(f)
Ejemplo n.º 48
0
def init_lucene(dir_path):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    store = SimpleFSDirectory(Paths.get(dir_path))
    searcher = IndexSearcher(DirectoryReader.open(store))
    # store.close()
    return searcher
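
A short usage sketch for init_lucene; the index path, the 'contents' field name, and the query text are illustrative assumptions.

# Hedged usage sketch; field name and paths are assumptions.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser

searcher = init_lucene('/path/to/index')
query = QueryParser('contents', StandardAnalyzer()).parse('information retrieval')
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc))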
Ejemplo n.º 49
0
 def __init__(self, dir):
     self.dir = dir
     self.lReader = DirectoryReader.open(
         FSDirectory.open(Paths.get(self.dir)))
     self.lSearcher = IndexSearcher(self.lReader)
Ejemplo n.º 50
0
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

indexDir = './index'
query = 'null'
lucene.initVM()
print 'lucene', lucene.VERSION
keyWords = ["学术论坛", "媒体", "校园新闻", "创新", "机构设置", "教学在线", "科研处", "招生"]
# Location where the index is stored
indir = SimpleFSDirectory(Paths.get(indexDir))
# Analyzer (tokenizer)
analyzer = StandardAnalyzer()
# Searcher
searcher = IndexSearcher(DirectoryReader.open(indir))

for i in range(0, 8):
    keyword = keyWords[i]
    query = QueryParser('contents', analyzer).parse(keyword)
    # Run the search
    hits = searcher.search(query, 100)
    print 'number of hits:', hits.totalHits
    f = open("result.txt", "a")
    f.write("\nTD" + str(i) + " " + keyword)
    f.write('\n')
    f.close()
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        temp = doc.get('url') + " " + str(hit.score)
        f = open("result.txt", "a")
Ejemplo n.º 51
0
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query from the contents that the user liked. The terms of those
        contents are boosted by the rating the user gave. A filter clause is added
        to the query so that only candidate items are considered.

        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. Only the number can be
                specified, not a specific item for which to compute the prediction
            items_directory: Directory where the items are stored
            candidate_list: List of ids of the candidate items to choose from

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
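
Stripped of the recommender bookkeeping, the query built above is a set of boosted SHOULD clauses plus a MUST filter on candidate ids. The sketch below isolates that pattern; the field names, ids, and boost value are made up for illustration and do not come from the original project.

# Hedged sketch of the boosted-SHOULD + MUST-filter pattern used above.
# 'plot', the 'content_id' values and the boost are illustrative only.
from org.apache.lucene.analysis.core import KeywordAnalyzer, SimpleAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery, BoostQuery

builder = BooleanQuery.Builder()

plot_parser = QueryParser('plot', SimpleAnalyzer())
liked_text = plot_parser.escape('a heist gone wrong on a space station')
builder.add(BoostQuery(plot_parser.parse(liked_text), 4.0), BooleanClause.Occur.SHOULD)

id_query = QueryParser('content_id', KeywordAnalyzer()).parse(
    'content_id:"tt001" OR content_id:"tt002"')
builder.add(id_query, BooleanClause.Occur.MUST)

query = builder.build()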
Ejemplo n.º 52
0
def lucene_retrieval_multifield(q_string,
                                q_class,
                                feature_type,
                                use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST means the clause must match
    # SHOULD means the clause may match (it contributes to the score but is not required)
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
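
Since retrieval_scores maps every entry of feature_type over the raw list of hit scores, callers pass plain functions over a list of floats. A hedged call sketch follows; the question/answer text and aggregation functions are illustrative, and set_lucene_index, version, analyzer, and hitsPerPage are assumed to be set up by the surrounding module as in the original project.

# Hedged usage sketch; the strings and feature functions are illustrative.
features = [
    lambda scores: max(scores) if scores else 0.0,  # best single hit
    lambda scores: sum(scores),                     # total score mass of the hits
    lambda scores: len(scores),                     # number of hits considered
]
feature_values = lucene_retrieval_multifield(
    'Which gas do plants absorb during photosynthesis? carbon dioxide',
    'biology', features, use_BM25=True)
print(list(feature_values))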
Ejemplo n.º 53
0
directory = RAMDirectory()
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, 1000000)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

i = 0
for file in files:
    print i, len(files)
    i = i + 1
    doc = create_document(file)
    writer.addDocument(doc)

writer.close()

searcher = IndexSearcher(DirectoryReader.open(directory))

file_expanded_queries = open('expanded_query', 'wb')
file_expanded_queries_vector = open('query_vector', 'wb')
file_expanded_queries_performance = open('Performance_after_query_expansion',
                                         'wb')
file_queries = open('query.txt', 'r')
lines = file_queries.readlines()
avg_prec = 0
avg_recall = 0
avg_fScore = 0
avg_prec_new = 0
avg_recall_new = 0
avg_fScore_new = 0
for query in lines:
    current_query = query[5:].strip()
Ejemplo n.º 54
0
 def __init__(self, folder='modern_index'):
     self.chSearcher = IndexSearcher(
         DirectoryReader.open(SimpleFSDirectory(File(folder + '/chinese'))))
     self.enSearcher = IndexSearcher(
         DirectoryReader.open(SimpleFSDirectory(File(folder + '/english'))))
     self.Analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
Ejemplo n.º 55
0
 def getSecondarySearcher(self):
     if len(self.searchers) < 2:
         self.searchers.append(IndexSearcher(self.reader))
     return self.searchers[1]
Ejemplo n.º 56
0
    subfolder_index_pairs = [
        ('cluster_w2v', 'index_w2v'),
        ('cluster_w2v_n', 'index_w2v_n'),
        ('cluster_tfidf', 'index_tfidf'),
        ('cluster_tfidf_n', 'index_tfidf_n')
    ]

    search_term = sys.argv[1]

    for cluster_type, index in subfolder_index_pairs:
        print(f'searching in {index}')
        path = Paths.get(f'../data/indices/{index}')
        
        reader = DirectoryReader.open(SimpleFSDirectory(path))
        searcher = IndexSearcher(reader)
 
        query = QueryParser("content", analyzer).parse(search_term)
        MAX = 1000000
        hits = searcher.search(query, MAX)
 
        month_counter = [
            [0] * 32,
            [0] * 32,
            [0] * 32
        ]

        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)

            month = int(doc.get('day').split('_')[0])
Ejemplo n.º 57
0
class LuceneInterface:

    def __init__(self, indexPath, initialHeap, maxHeap):
        lucene.initVM(initialheap=initialHeap, maxheap=maxHeap)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.indexPath = indexPath
        self.indexDir = SimpleFSDirectory(File(self.indexPath))

    def open(self):
        debug('Opening index "%s"' % self.indexPath)
        self.reader = IndexReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.totalDocs = self.getTotalSentenceCount()

    def close(self):
        self.reader.close()

    def getTFForField(self, field):
        tfs = {}
        fields = MultiFields.getFields(self.reader)
        terms = fields.terms(field)
        enum = BytesRefIterator.cast_(terms.iterator(None))
        try:
            while enum.next():
                termval = TermsEnum.cast_(enum)
                termString = termval.term().utf8ToString()
                freq = self.reader.totalTermFreq(Term(field, termString))
                tfs[termString] = freq
        except:
            pass
        return tfs

    def getTotalSentenceCount(self):
        num =  self.reader.numDocs()
        return num

    def sentenceCountForQuery(self, query, field='text'):
        qp = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
        collector = TotalHitCountCollector()
        self.searcher.search(qp, collector)
        return collector.getTotalHits()

    #def getQueryCount(self, query, field='text'):
        #qp = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
        #collector = TotalHitCountCollector()
        #self.searcher.search(qp, collector)
        #return collector.getTotalHits()

    def getIntersectionCount(self, query, countTermString, sfield, cfield):
        qp = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,[query,countTermString],[sfield,cfield],[BooleanClause.Occur.MUST,BooleanClause.Occur.MUST],self.analyzer)
        collector = TotalHitCountCollector()
        self.searcher.search(qp, collector)
        return collector.getTotalHits()

    # Return a list of records, where each record is a dictionary; the keys are the field names in lucene.
    def search(self, query, field, maxReturnLimit):
        qp = QueryParser(Version.LUCENE_CURRENT, field, WhitespaceAnalyzer(Version.LUCENE_CURRENT)).parse(query)
        hits = self.searcher.search(qp, maxReturnLimit)
        result = []
        for hit in hits.scoreDocs:
            record = dict()
            doc = self.searcher.doc(hit.doc)
            record["id"] = doc.get("id")
            record["pos"]  = doc.get("pos")
            record["hallmarks"] = doc.get("hallmarks").split()
            #record["hallmarks-exp"] = doc.get("hallmarks-exp").split()
            record["text"] = doc.get("text")
            result.append(record)
        return result

    def searchGivenHallmarks(self, query, hallmarksList, hallmarksField, maxReturnLimit):
        qList = [query]
        qList.extend(hallmarksList)
        #print(qList)
        fList = ["text"]
        fList.extend([hallmarksField]*len(hallmarksList))
        #print(fList)
        flagList = [BooleanClause.Occur.MUST]
        flagList.extend([BooleanClause.Occur.MUST]*len(hallmarksList))
        #print(flagList)
        qp = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, qList, fList, flagList, self.analyzer)
        #print (qp)
        hits = self.searcher.search(qp, maxReturnLimit)
        result = []
        for hit in hits.scoreDocs:
            record = dict()
            doc = self.searcher.doc(hit.doc)
            record["id"] = doc.get("id")
            record["pos"]  = doc.get("pos")
            record["hallmarks"] = doc.get("hallmarks").split()
            #record["hallmarks-exp"] = doc.get("hallmarks-exp").split()
            record["text"] = doc.get("text")
            result.append(record)
        return result
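
A hedged usage sketch for the LuceneInterface class above; the index path, heap sizes, and query strings are placeholders, and the module-level imports from the original project (StandardAnalyzer, QueryParser, and so on) are assumed to be in place.

# Hedged usage sketch; the path, heap sizes and queries are placeholders.
iface = LuceneInterface('/path/to/sentence_index', '512m', '2g')
iface.open()
print('indexed sentences: %d' % iface.getTotalSentenceCount())
print('sentences matching "angiogenesis": %d' % iface.sentenceCountForQuery('angiogenesis'))
for record in iface.search('tumor suppressor', 'text', 5):
    print('%s %s' % (record['id'], ' '.join(record['hallmarks'])))
iface.close()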
Ejemplo n.º 58
0
 def open(self):
     debug('Opening index "%s"' % self.indexPath)
     self.reader = IndexReader.open(self.indexDir)
     self.searcher = IndexSearcher(self.reader)
     self.totalDocs = self.getTotalSentenceCount()
Ejemplo n.º 59
0
def main():
    #constants
    FIELD_CONTENTS = "vectext"
    DOC_NAME = "identifier"
    STORE_DIR = "../full_index1"

    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(STORE_DIR))

    ireader = DirectoryReader.open(store)  #, True)
    #print(ireader.readerIndex(0))

    searcher = IndexSearcher(ireader)  #self.getSearcher()

    pickle_file = glob.glob('full_word_list.pkl')
    print(pickle_file)
    date_range = (1785, 1805)

    bigrams = False
    remake_word_list = True
    if remake_word_list:  #not pickle_file:

        full_df = get_full_df()
        full_term_data = []
        for year in range(date_range[0], date_range[1]):
            docs_in_year = get_docs_in_year(full_df, year)
            #print(docs_in_year)
            year_dict = Counter({})
            terms = []
            freqs = []
            print(year)
            for cd, doc_id in enumerate(docs_in_year):
                #if not cd%100:
                #    print(cd , '--', len(docs_in_year))
                # get document (query by id)
                q = TermQuery(Term("identifier", doc_id + '_djvu.txt'))
                topDocs = searcher.search(q, 50000)

                #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                one_doc = topDocs.scoreDocs[0].doc
                doc_name = searcher.doc(one_doc)
                #print(doc_name, doc_id)

                if bigrams == False:
                    termvec = ireader.getTermVector(topDocs.scoreDocs[0].doc,
                                                    FIELD_CONTENTS)

                    if termvec != None:
                        #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")

                        termsEnum = termvec.iterator()
                        for term in BytesRefIterator.cast_(termsEnum):
                            terms.append(term.utf8ToString())
                            freqs.append(termsEnum.totalTermFreq())
                else:
                    #print(doc_name, doc_id)
                    text = doc_name.get("text")
                    text = text.split()
                    text = strip_stopwords_punc(text)
                    for word1, word2 in zip(text[:-1], text[1:]):
                        if len(word1) + len(word2) > 6:
                            try:
                                year_dict[word1 + ' ' + word2] += 1
                            except:
                                year_dict[word1 + ' ' + word2] = 1
            if bigrams == False:
                for term, freq in zip(terms, freqs):
                    try:
                        year_dict[term] += freq
                    except:
                        year_dict[term] = freq
            print(len(year_dict))
            #print(year_dict)
            for term in list(year_dict):
                if year_dict[term] < 2:  #5 and term not in stopwords:
                    year_dict.pop(term)
            full_term_data.append(year_dict)
            print(len(year_dict))
            #year_dict = year_dict + doc_dict
            #print(year_dict.most_common(1000))
            print('\n\n')
        if bigrams:
            pickle.dump(full_term_data, open('full_bigram_list.pkl', 'wb'))
        else:
            pickle.dump(full_term_data, open('full_word_list.pkl', 'wb'))
    else:
        if bigrams:
            full_term_data = pickle.load(open('full_bigram_list.pkl', 'rb'))
        else:
            full_term_data = pickle.load(open('full_word_list.pkl', 'rb'))
        # get complete list of unique words
        # top_words_year = zscore_method(full_term_data, date_range)

        top_words_year = tfidf_method(full_term_data, date_range)
        print(top_words_year)
        pickle.dump(top_words_year, open('trending_ratio.pkl', 'wb'))
Ejemplo n.º 60
0
def predict_test(indexed_data, index_destination, source='directory', already_indexed=False):
    """
    :param indexed_data_dir:
    :param index_destination:
    :return:
    """
    def choose_best():
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]
 
    def calculate_score(res):
        """
        :param res:
        :return:
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct)/total
 
    if not already_indexed:
        make_index(indexed_data, index_destination, source)
 
    res = {}
    MAX = 100
    docs_per_q = range(1,20)

    records = []
 
    #analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)
 
    for index, row in data_test.iterrows():
 
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
 
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", q))
            #query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[/^]", "\^", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))

        to_records = [index+102501]
        to_records.append(['A','B','C','D'][np.argmax(scores[4])])
        records.append(to_records)

        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A','B','C','D'][np.argmax(scores[n])])

    df = pandas.DataFrame.from_records(records, columns=["id","correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")

    # print res[4]
    best = choose_best()
    print best
    score = calculate_score(res[best])
    # score = calculate_score(res)
    print score
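
For completeness, a hedged invocation of predict_test; the paths are hypothetical placeholders, and data_test, make_index, and the remaining module-level imports are assumed to be provided elsewhere in the original script.

# Hypothetical invocation; paths are placeholders, and data_test / make_index
# must already be defined by the surrounding script.
if __name__ == '__main__':
    lucene.initVM()
    predict_test('path/to/corpus_dir', 'path/to/index_dir',
                 source='directory', already_indexed=False)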