def retrieve_sents(self):

    indexDir = self.indexDir
    query = self.query

    sent_ind_list = []
    # template = CustomTemplate(format)
    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    # print indexDir
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)
    # print query
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    # print query
    if self.stats:
        print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
            len(scoreDocs), duration, query)

    for scoreDoc in scoreDocs:
        # print scoreDoc.doc
        # doc = searcher.doc(scoreDoc.doc)
        sent_ind_list.append(scoreDoc.doc)

    return sent_ind_list
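The method above reads self.indexDir, self.query and self.stats, so in its original project it is presumably defined inside a small retriever class. A minimal sketch of such a wrapper, with the class name, constructor, and example arguments invented here:

class SentenceRetriever:
    # Hypothetical container exposing only the attributes retrieve_sents() expects.
    def __init__(self, indexDir, query, stats=False):
        self.indexDir = indexDir   # path to the Lucene sentence index
        self.query = query         # raw query string (parsed with OR as the default operator)
        self.stats = stats         # when True, timing info is printed to stderr

    retrieve_sents = retrieve_sents  # bind the module-level function above as a method

# e.g. SentenceRetriever('sent_index', 'machine translation', stats=True).retrieve_sents()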
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir
    
    # Create a searcher over the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    
    # Analyzer used to tokenize and normalize the query terms
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    # Create a QueryParser that searches the 'keywords' field by default
    # The parsed query holds the search constraints
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    # Join the supplied terms into one query string, prefixing each with '+'
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    #Criando um JArray com resultado da consulta
    return searcher.search(query, 200).scoreDocs
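A hedged usage sketch (the index directory and search terms below are made up). Note that buscar returns only the scoreDocs; its IndexSearcher is local to the function, so retrieving stored fields afterwards requires opening the index again:

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
hits = buscar('indice', ['energia', 'solar'])           # hypothetical index dir and terms
for hit in hits:
    print("doc=%d score=%.3f" % (hit.doc, hit.score))   # internal doc id and Lucene score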
Example #3
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables
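A minimal usage sketch, assuming the JVM has already been started with lucene.initVM() and that the (hypothetical) index directory stores 'name' and 'id' fields:

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
searcher = Searcher('table_index')            # hypothetical index directory
for table in searcher.find_by_name('user'):
    print(table)                              # dict of stored field name -> value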
Example #4
class LuceneRanker(object):
    def __init__(self, tfidf_path, strict=True):
        lucene.initVM()
        analyzer = StandardAnalyzer()
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
        self.searcher = IndexSearcher(reader)

        self.parser = QueryParser("text", analyzer)
        self.parser.setDefaultOperator(QueryParser.Operator.OR)

    def closest_docs(self, query, k=1):
        """Closest docs by dot product between query and documents
        in tfidf weighted word vector space.
        """
        query = self.parser.parse(
            query.replace('/', '//').replace('?', '').replace('"', ''))
        hits = self.searcher.search(query, k)
        docids = []
        docs = []
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.searcher.doc(hit.doc)
            docs.append(unicode(doc['text']))
            docids.append(unicode(doc['title']))
        return docids, docs

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests multithreaded."""
        # get highest scoring document for multiple queries
        batch = []
        for i, q in enumerate(queries):
            if i % 100 == 0:
                print(i)

            t0 = time.time()
            docids, docs = self.closest_docs(q, k)
            batch.append((docids, docs))
        return batch

    def parse(self, query):
        return None

    def text2spvec(self, query):
        return None

    def get_doc_index(self, doc_id):
        return 0

    def get_doc_id(self, doc_index):
        return 0

    def __exit__(self, *args):
        pass
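A rough driving sketch; __init__ already calls lucene.initVM(), so the ranker can be constructed directly. The index path and query below are illustrative only, and the index is assumed to store the 'text' and 'title' fields used in closest_docs:

ranker = LuceneRanker('enwiki_index')                        # hypothetical index path
docids, docs = ranker.closest_docs('capital of france', k=3)
for title in docids:
    print(title)                                             # titles of the top-scoring documents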
Example #5
    def perform_search(self, searchterm):
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)

        query = parser.parse(searchterm)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start

        print scoreDocs
        print duration
Example #6
	def getQueryParser(self):
		
		analyzers = self.getSearchingAnalyzers()

		map = HashMap()
		map.put('name', analyzers['name'])
		map.put('parent', analyzers['parent'])
		map.put('content', analyzers['default'])
		map.put('id', analyzers['id'])
		analyzerWrapper = PerFieldAnalyzerWrapper(analyzers['default'], map)

		queryParser = QueryParser(Version.LUCENE_CURRENT, 'content', analyzerWrapper)

		queryParser.setAutoGeneratePhraseQueries(PHRASE_QUERY_BY_DEFAULT)
		queryParser.setPhraseSlop(PHRASE_SLOP)
		queryParser.setFuzzyMinSim(FUZZY_MIN_SIM)
		queryParser.setDefaultOperator(DEFAULT_OPERATOR)

		return queryParser
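getQueryParser relies on several module-level tuning constants that are not shown in this excerpt. Plausible placeholder values, stated purely as assumptions, might look like:

PHRASE_QUERY_BY_DEFAULT = False                # do not auto-generate phrase queries from multi-term input
PHRASE_SLOP = 0                                # phrase terms must be adjacent
FUZZY_MIN_SIM = 0.5                            # minimum similarity for fuzzy (~) terms
DEFAULT_OPERATOR = QueryParser.Operator.AND    # require all terms by default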
Example #7
def search(termo, **args):
	
	indexDir = os.environ.get('MANDEX') or '3iteracao'
	fsDir = SimpleFSDirectory(File(indexDir))
	searcher = IndexSearcher(DirectoryReader.open(fsDir))
	
	analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
	parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
	parser.setDefaultOperator(QueryParser.Operator.OR)
	query = parser.parse(' '.join([termo] + list(args.values())))
	start = datetime.now()
	scoreDocs = searcher.search(query, 50).scoreDocs
	duration = datetime.now() - start

	politicos = []
	for scoreDoc in scoreDocs:	    
	    doc = searcher.doc(scoreDoc.doc)
	    table = dict((field.name(), field.stringValue()) for field in doc.getFields())	   
	    politicos.append(table)

	return politicos
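An invocation sketch. The QueryParser above reads a module-level name field that is not defined inside the function, so the surrounding module must provide it; the value below and the extra keyword argument are assumptions:

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
field = 'contents'                             # assumed module-level default field for the parser
resultados = search('educacao', partido='PT')  # keyword values are appended to the query string
for politico in resultados:
    print(politico)                            # dict of stored field name -> value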
Example #8
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
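Because CustomTemplate uses '#' as its delimiter, a --format string such as '#name (#path)' is filled in from the stored fields of each hit. A tiny standalone illustration, with invented field names:

demo = CustomTemplate('#name -> #path')        # reuses the Template subclass defined above
print(demo.substitute({'name': 'readme', 'path': '/docs/readme.txt'}))   # readme -> /docs/readme.txt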
Example #9
File: views.py (Project: asxzy/weiso)
def search(request):
    query = request.GET.get('q', None)
    page = int(request.GET.get('page', 1))
    perPage = 5
    nodes = []
    usage = {}
    usage["time"] = time.time()

    if not query:
        count = 0
        nodes = []
        keywords = []
    else:
        #conn = ReplicaSetConnection('localhost', replicaSet='jlu')
        conn = MongoClient('localhost')
        db = conn.sina
        #db.read_preference = ReadPreference.SECONDARY
        CACHE = db.cache
        keywords = query.split(' ')
        cache = CACHE.find_one({"query":keywords,"page":page})
        if cache is None:
            print "query cache not found"
            VM_ENV.attachCurrentThread()
            fsDir = SimpleFSDirectory(File(settings.ROOT_DIR+'/index'))
            searcher = IndexSearcher(DirectoryReader.open(fsDir))

            analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, 'text', analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            lucene_query = parser.parse(query)

            scoreDocs = searcher.search(lucene_query, 3000000).scoreDocs


            ids = []

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                for field in doc.getFields():
                    ids.append(field.stringValue())
            print "got ids from lucene",len(ids)

            ids = [int(x) for x in ids]
            NODES = conn.sina.nodes
            count = 0
            for n in NODES.find({"node_id":{"$in":ids}}).sort("in_degree",-1).skip((page-1)*perPage):
                count += 1
                print "doing",n["node_id"],count,"/",perPage
                n["js"] = similarity(n["node_id"],topk=10)
                nodes.append(n)
                if len(nodes) == perPage:
                    break
            count = len(ids)
            CACHE.insert({"query":keywords,"page":page,"cache":nodes,"count":len(ids)})
            usage["isCache"] = False
        else:
            print "found query cache"
            usage["isCache"] = True
            nodes = cache["cache"]
            count = cache["count"]
        pagenav = {}
        if page == 1:
            pagenav["has_pre"] = None
        else:
            pagenav["has_pre"] = page - 1
        if page > count/perPage:
            pagenav["has_next"] = None
        else:
            pagenav["has_next"] = page + 1
        pagenav["page"] = page
        usage["time"] = time.time() - usage["time"]

    return {
        'q' : request.GET.get('q', ''),
        'keywords' : keywords,
        'nodes' : nodes,
        'count' : count,
        'page' : pagenav,
        'usage' : usage,
    }
class TASearcher():
    def __init__(self, queries=[], criteria=[], conjunctions=[], orderby=["ta"], ascending=True, limit=10000):
        vm.attachCurrentThread()

        self.queries = [query for query in queries if len(query.strip()) > 0]
        self.criteria = criteria
        self.conjunctions = conjunctions
        self.orderby = orderby
        self.ascending = ascending
        self.queryString = ""
        self.limit = limit

        self.fields = fields
        self.analyzer = PorterStemmerAnalyzer()
        self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer)
        self.queryParser.setAllowLeadingWildcard(True)
        self.queryParser.setDefaultOperator(QueryParser.Operator.AND)
        indexDir = settings.LUCENE_INDEX_DIRECTORY
        self.index = MMapDirectory(File(indexDir))

    def createQueryString(self):
        # Simple
        if len(self.criteria) == 0:
            self.queryString = "(%s) OR freetext-normalized:(%s)" % (self.queries[0], self.queries[0])
        # Advanced
        else:
            queryPairs = []
            criteriaQueries = zip(self.criteria, self.queries)
            self.criteria = dict(criteriaQueries).keys()
            for criterion, query in criteriaQueries:
                if criterion in ("volume", "number", "category-label", "pubtype", "author-sort"):
                    queryPairs.append("%s:%s" % (criterion, query))
                elif criterion == "year":
                    queryPairs.append("year-start:%s OR year-end:%s" % (query, query))
                else:
                    queryPairs.append('%s:%s OR %s-normalized:%s' % (criterion, query, criterion, query))
            # queryPairs = ["%s:%s"%(criterion,query.replace(" ", "+")) for criterion, query in zip(criteria, queries)]
            try:
                queryString = "%s %s" % (queryPairs[0], " ".join(
                    ["%s (%s)" % (conj, pair) for conj, pair in zip(self.conjunctions, queryPairs[1:])]))
                self.queryString = queryString
                return queryString
            except:
                self.queryString = "freetext"
                return self.queryString

    def getQueryString(self):
        return self.queryString

    def _getHits(self):
        reader = IndexReader.open(self.index)
        searcher = IndexSearcher(reader)

        # Sort by volume and entry number (treat the sort value as an integer)
        sortDict = {
            "ta": (("volume", SortField.Type.INT), ("number", SortField.Type.INT)),
            "year": (("year-start", SortField.Type.INT), ("year-end", SortField.Type.INT)),
            "author-title": (("author-sort", SortField.Type.STRING), ("title-sort", SortField.Type.STRING)),
            "title": (("title-sort", Locale.GERMAN),),
            "author": (("author-sort", Locale.GERMAN),),
        }

        sortFields = []

        reverse = not self.ascending

        for name in self.orderby:
            for fieldName, typeNum in sortDict.get(name, []):
                sortFields.append(SortField(fieldName, typeNum, reverse))

        if len(sortFields) == 0:
            sortFields = [SortField("volume", SortField.Type.INT), SortField("number", SortField.Type.INT)]

        sort = Sort(sortFields)

        topDocs = searcher.search(self.query, None, 80000, sort)
        hits = topDocs.scoreDocs
        self.hits = hits
        self.searcher = searcher

        lang = translation.get_language()
        if lang != "de":
            lang = "en"

        facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}}

        # Highlighting
        highlighter = Highlighter(SimpleHTMLFormatter('<span class="highlight">', '</span>'), QueryScorer(self.query))

        hitObjects = []
        fields = {}
        for hit in hits:
            doc = searcher.doc(hit.doc)
            # print unicode(doc)
            fields["score"] = hit.score
            fields["volume"] = doc["volume"]
            fields["number"] = doc["number"]
            fields["id"] = doc["id"]
            fields["title"] = doc["title"]
            fields["author"] = doc["author"]
            fields["authors"] = [field.stringValue() for field in doc.getFields("author")]
            for author in fields["authors"]:  # XXX
                facets["author"][author] = facets["author"].get(author, 0) + 1  # XXX

            fields["categories"] = [field.stringValue() for field in doc.getFields("category-%s" % lang)]
            for cat in fields["categories"]:
                facets["category-%s" % lang][cat] = facets["category-%s" % lang].get(cat, 0) + 1
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            pubtype = doc["pubtype"]
            fields["pubtype"] = pubtype
            facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1
            fields["city"] = doc["city"]
            fields["year"] = doc["year-start"]
            if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]:
                fields["year"] += " - " + doc["year-end"]
            highlightFields = ("title", "author", "city", "year", "category")

            if "freetext" in self.criteria:
                for fieldName in highlightFields:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        continue

            for fieldName in highlightFields:
                if fieldName in self.criteria or fieldName + "-de" in self.criteria or fieldName + "-en" in self.criteria:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        continue
            """if "author" in self.criteria:
                try:
                    tokenStream = self.analyzer.tokenStream("author", lucene.StringReader(fields["author"]))
                    fields["author"] = highlighter.getBestFragments(tokenStream, fields["author"], maxNumFragmentsRequired, fragmentSeparator)
                except:
                        pass"""

            hitObjects.append(
                Hit(fields["id"], fields["volume"], fields["number"], fields["title"], fields["author"], fields["city"],
                    fields["year"], fields["categories"], fields["pubtype"], fields["score"]))

        facetsToDelete = []
        for facet in facets:
            if len(facets[facet]) < 2:
                facetsToDelete.append(facet)
                continue
            values = sorted(facets[facet].items(), key=itemgetter(0))
            values = sorted(values, key=itemgetter(1), reverse=True)
            facets[facet] = values[:25]
        for facet in facetsToDelete:
            del facets[facet]
        self.facets = facets
        reader.close()
        self.hitObjects = hitObjects
        return hitObjects

    def search(self):
        self.createQueryString()
        querystr = self.getQueryString()
        self.query = self.queryParser.parse(querystr)
        return self._getHits()

    def getAll(self):
        self.query = MatchAllDocsQuery()
        return self._getHits()
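A rough usage sketch. TASearcher depends on names from its surrounding project (vm, fields, PorterStemmerAnalyzer, settings.LUCENE_INDEX_DIRECTORY, the Hit class), so this only illustrates the calling convention; the query values are invented:

ta = TASearcher(queries=['goethe'], orderby=['author-title'])
hits = ta.search()                          # builds the query string, parses it, returns Hit objects
print(ta.getQueryString())                  # "(goethe) OR freetext-normalized:(goethe)"
print("%d hits, facets: %s" % (len(hits), list(ta.facets)))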
Example #11
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'


template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >> sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" % (
        len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict(
        (field.name(), field.stringValue()) for field in doc.getFields())
    print template.substitute(table)
Example #12
    raise Exception(
        'At least one between TAGS_AND_GENRES and DESCR should be True')

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())

analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)

tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)

BooleanQuery.setMaxClauseCount(
    2000000)  # prevents 1024 limit error for very long queries

############################## Build user queries ##########################
ratings = ML1M('../datasets/ml-1m').ratings

movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
movies_genres = pd.read_csv('../datasets/movies-genres.csv')

users = set(ratings[['user']].values.flatten())
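The excerpt stops after loading the ratings and content data. Purely as a sketch of where it is headed (the column names 'user', 'item', 'rating' and 'tag' are assumptions, as is the rating threshold), one plausible way to build and run a per-user tag query:

for user in list(users)[:1]:                                       # first user only, for illustration
    liked = ratings[(ratings['user'] == user) & (ratings['rating'] >= 4)]['item']
    tags = movies_tags[movies_tags['item'].isin(liked)]['tag']
    user_query = QueryParser.escape(' '.join(tags.astype(str)))    # long OR query; escaping avoids parse errors
    if user_query.strip():
        hits = searcher.search(tags_parser.parse(user_query), 10).scoreDocs
        print('%d tag matches for user %s' % (len(hits), user))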