コード例 #1
0
def retrieve(indexdir, queries):
    """Run every query against a Lucene index and write the ranked hits.

    Each query text is split into word tokens, every token is Porter-stemmed,
    and the stemmed text is parsed as an OR query over the ``title``,
    ``abstract`` and ``authors`` fields. For each query at most 1000 hits are
    written to ``results_lucene.txt`` (in the current working directory), one
    line per hit, in the form
    ``<query id> Q0 <doc number + 1> <rank> <score> G17R3``.

    Args:
        indexdir: Path of the Lucene index directory to open.
        queries: Mapping of query id to raw query text; iterated with
            ``iteritems`` (Python 2 dict API).
    """
    lucene.initVM()
    max_hits = 1000
    fields = ["title", "abstract", "authors"]

    # The context manager guarantees the result file is flushed and closed
    # even when parsing or searching raises part-way through.
    with open("results_lucene.txt", "w") as out:
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
        try:
            searcher = IndexSearcher(reader)

            # Loop-invariant helpers are built once, not once per query.
            stemmer = PorterStemmer()
            tokenizer = RegexpTokenizer(r'\w+')
            parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                           analyzer)
            parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)

            for query_id, text in queries.iteritems():
                stemmed = " ".join(
                    stemmer.stem(word) for word in tokenizer.tokenize(text))
                # The instance-level parse() is called through the class, as
                # elsewhere in this code base.
                query = MultiFieldQueryParser.parse(parser, stemmed)
                hits = searcher.search(query, max_hits)
                # Ranks and document numbers are written 1-based.
                for rank, hit in enumerate(hits.scoreDocs, 1):
                    out.write("%s Q0 %s %s %s G17R3\n" %
                              (query_id, hit.doc + 1, rank, hit.score))
        finally:
            # Release the index reader regardless of how the loop ended.
            reader.close()
コード例 #2
0
ファイル: searcher.py プロジェクト: taimaruuu/Tweet-Bot
def search(searcher, analyzer, directory, query2):
    print
    print "Empty to quit."
    # command = raw_input("Query: ") #raw_input for query
    command = query2
    if command == '':
        loopVar = False
        return

    print
    print "Searching for ", command
    parserVar = MultiFieldQueryParser(fields, analyzer)
    parserVar.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parserVar, command)

    scoreDocs = searcher.search(
        query, 10).scoreDocs  #number is max number of matching documents
    print "total matching documents in: " + str((len(scoreDocs)))
    counter = 0
    for scoreDoc in scoreDocs:  #dont really know what this is either
        doc = searcher.doc(scoreDoc.doc)
        print "@" + doc.get("u_name") + ": " + doc.get(
            "tweet") + " Score:" + str(scoreDocs[counter].score)
        docData = {}
        docData['u_name'] = doc.get("u_name")
        docData['tweet'] = doc.get("tweet")
        docData['score'] = str(scoreDocs[counter].score)
        results.append(docData)
        counter = counter + 1
    print
    print "\n------------------------------------------------------"
    return results
コード例 #3
0
 def scents_search(former, mid, last):
     """Build a Lucene query over the three scent fields.

     Each argument is concatenated with ``''.join`` and the three resulting
     strings are joined with single spaces. The text is parsed with AND as
     the default operator over ``former_scents``, ``mid_scents`` and
     ``last_scents``, using the ``analyzer`` defined outside this function.

     Args:
         former: Iterable of strings describing the former scents.
         mid: Iterable of strings describing the mid scents.
         last: Iterable of strings describing the last scents.

     Returns:
         The parsed Lucene query object.
     """
     # NOTE(review): ''.join merges the items without a separator -- confirm
     # the callers pass single strings or pre-delimited tokens.
     text = ''.join(former) + ' ' + ''.join(mid) + ' ' + ''.join(last)
     fields = ["former_scents", "mid_scents", "last_scents"]
     parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                    analyzer)
     parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
     return MultiFieldQueryParser.parse(parser, text)
コード例 #4
0
 def brand_scent_search(brand, scent):
     """Build a Lucene query matching a brand name together with scents.

     The query text is ``brand``, a space, and the items of ``scent``
     concatenated with ``''.join``. It is parsed with AND as the default
     operator over the ``name`` and ``scents`` fields, using the
     ``analyzer`` defined outside this function.

     Args:
         brand: Brand name text.
         scent: String or iterable of strings describing the scents.

     Returns:
         The parsed Lucene query object.
     """
     # The original body read an undefined name `scents`; the parameter is
     # `scent`.
     text = brand + ' ' + ''.join(scent)
     fields = ["name", "scents"]
     parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                    analyzer)
     parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
     return MultiFieldQueryParser.parse(parser, text)
コード例 #5
0
ファイル: search.py プロジェクト: rbouadjenek/YouTaQA
    def multiFieldsSearch(self, query, sim):
        """Return the top six hits for ``query`` across three fields.

        The query text is escaped, then parsed with OR as the default
        operator against ``content_section``, ``title_section`` and
        ``title_article``. ``sim`` is set as the searcher's similarity
        before the search runs.

        Returns the ``scoreDocs`` of the search result.
        """
        # Attach the calling thread to the JVM before touching Lucene.
        lucene.getVMEnv().attachCurrentThread()

        search_fields = ["content_section", "title_section", 'title_article']
        multi_parser = MultiFieldQueryParser(search_fields, self.analyzer)
        multi_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        escaped_text = QueryParser.escape(query)
        parsed = MultiFieldQueryParser.parse(multi_parser, escaped_text)

        self.searcher.setSimilarity(sim)
        top_docs = self.searcher.search(parsed, 6)
        return top_docs.scoreDocs
コード例 #6
0
    def explain(self, query, fields, doc):
        """Return the searcher's explanation of how ``doc`` scores.

        The searcher is opened first when it is not yet available. The query
        text is escaped and parsed against ``fields`` with this object's
        analyzer, and the parsed query is passed with ``doc`` to
        ``self.searcher.explain``.
        """
        if not self.searcher:
            self.open_searcher()

        escaped_text = QueryParser.escape(query)
        field_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                             self.analyzer)
        parsed = MultiFieldQueryParser.parse(field_parser, escaped_text)
        return self.searcher.explain(parsed, doc)
コード例 #7
0
 def parse_query(self, query_string, order_matters=True):
     """Build the scoring query for ``query_string`` and store it.

     Args:
         query_string: Raw query text.
         order_matters: When true the base query comes from
             ``getSpanNearQuery``; otherwise the text is parsed as a bag of
             words over the ``title`` and ``qbody`` fields.

     The base query is wrapped in a ``CustomScoreQuery`` together with a
     function query over the ``view_count`` field, and the result is stored
     in ``self.query``. Nothing is returned.
     """
     if order_matters:
         # Take the order of the query terms into account.
         base_query = getSpanNearQuery(self.analyzer, query_string)
     else:
         # Treat the query keywords as a bag of words. The parser is only
         # needed on this path, so it is built here.
         query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                              ["title", "qbody"],
                                              self.analyzer)
         # Call parse() through the class: in PyLucene the static parse()
         # overloads of MultiFieldQueryParser can hide the instance method.
         base_query = MultiFieldQueryParser.parse(query_parser, query_string)
     # http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
     boost_query = FunctionQuery(LongFieldSource("view_count"))
     self.query = CustomScoreQuery(base_query, boost_query)
コード例 #8
0
ファイル: rank.py プロジェクト: davepie101/Twitter-Crawler
    def search(self, query):
        """Search the on-disk index for ``query`` and return the top ten hits.

        The index is read from the ``index`` directory under the current
        working directory. The query is combined with a fixed
        ``date:"May 25"`` filter and parsed over the ``text``,
        ``page title`` and ``date`` fields.

        Args:
            query: Query text in Lucene query syntax.

        Returns:
            A list of dicts with the keys ``score``, ``username``,
            ``tweet_body`` and ``date``, one per hit, in result order.
        """
        lucene.initVM()
        luceneDirectory = "/index/"

        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        try:
            reader = DirectoryReader.open(directory)
            try:
                searcher = IndexSearcher(reader)
                analyzer = StandardAnalyzer()

                print("Searching for '" + query + "'")

                fields_to_search = ["text", "page title", "date"]
                filter_date = 'date:"May 25"'

                # A space before AND keeps the operator clearly separated
                # from the closing quote of the date phrase.
                # NOTE(review): AND binds only to the first term of `query`;
                # wrap `query` in parentheses if the filter should apply to
                # the whole expression.
                filtered_query = filter_date + " AND " + query

                parser = MultiFieldQueryParser(fields_to_search, analyzer)
                updated_query = MultiFieldQueryParser.parse(
                    parser, filtered_query)
                scored_documents = searcher.search(updated_query,
                                                   10).scoreDocs

                print("Found " + str((len(scored_documents))) +
                      " matches in the collection.")

                results = []
                for scored in scored_documents:
                    stored = searcher.doc(scored.doc)
                    scoredTweet = dict()
                    scoredTweet['score'] = scored.score
                    scoredTweet['username'] = stored.get("username")
                    scoredTweet['tweet_body'] = stored.get("text")
                    scoredTweet['date'] = stored.get("date")
                    results.append(scoredTweet)
                    print(scoredTweet)

                return results
            finally:
                # Every call opens a fresh reader; release it so repeated
                # searches do not leak file handles.
                reader.close()
        finally:
            directory.close()
コード例 #9
0
ファイル: search.py プロジェクト: rbouadjenek/YouTaQA
    def multiFieldsSearch(self, query, sim):
        """
        Search the content_section and title_article fields for ``query``.

        query : raw query text; it is escaped before parsing.
        sim : similarity set on the searcher before the search runs.

        Returns the scoreDocs of the top six hits. Terms are combined with
        OR by default.
        """
        # Attach the calling thread to the JVM before touching Lucene.
        lucene.getVMEnv().attachCurrentThread()

        search_fields = ["content_section", "title_article"]
        multi_parser = MultiFieldQueryParser(search_fields, self.analyzer)
        multi_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        escaped_text = QueryParser.escape(query)
        parsed = MultiFieldQueryParser.parse(multi_parser, escaped_text)

        self.searcher.setSimilarity(sim)
        top_docs = self.searcher.search(parsed, 6)
        return top_docs.scoreDocs
コード例 #10
0
    def preprocess_query(self, query, fields, mode="ANY"):
        '''
		Fix query according to provided mode. If the value is not supported, 
		the query remains unchanged
		'''

        terms = query.lower().strip().split()
        if mode == "ANY":
            query = " OR ".join(terms)
        elif mode == "ALL":
            query = " AND ".join(terms)
        else:
            print "Invalid mode parameter '%s'." % mode

        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)
        return query
コード例 #11
0
    def search(self, terms, n_hits=5):
        """
        Parse ``terms`` over the fullpath and body fields and return the
        processed top ``n_hits`` results.

        Every hit is passed, together with a highlighter built from the
        parsed query, to ``self._process_search_result``; the list of those
        return values is returned.
        """
        # TODO: support date range queries

        field_parser = MultiFieldQueryParser(['fullpath', 'body'],
                                             self.analyzer)
        # The default operator is left untouched (OR unless terms carry a
        # modifier); an AND default was considered and left disabled.
        # parse() is called through the class, see
        # https://stackoverflow.com/a/26853987/130164
        parsed = MultiFieldQueryParser.parse(field_parser, terms)

        # Highlighter that wraps matches in '*' markers.
        marker = SimpleHTMLFormatter('*', '*')
        highlighter = Highlighter(marker, QueryScorer(parsed))

        # Run the search and post-process each of the top hits in order.
        processed = []
        for score_doc in self.searcher.search(parsed, n_hits).scoreDocs:
            processed.append(
                self._process_search_result(score_doc, highlighter))
        return processed
コード例 #12
0
def func_cross(former, mid, last):
    """Search the ``index_tb_new`` index for the three given strings.

    The arguments are concatenated (separated by spaces) into one query text
    that is parsed with AND as the default operator over the ``former``,
    ``mid`` and ``last`` fields. Up to 200 hits are passed, together with
    the searcher, to ``process``.

    Args:
        former: Text for the former part.
        mid: Text for the mid part.
        last: Text for the last part.

    Returns:
        Whatever ``process(scoreDocs, searcher)`` returns.
    """
    # Attach the calling thread to the JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    # NOTE(review): the reader opened here is never closed; closing it is
    # only safe once `process` no longer needs the searcher -- confirm.
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = former + ' ' + ' ' + mid + ' ' + last
    fields = ["former", "mid", "last"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)

    scoreDocs = searcher.search(query, 200).scoreDocs
    return process(scoreDocs, searcher)
コード例 #13
0
ファイル: search.py プロジェクト: wdqatualr/hoaxy-backend
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Opens the Lucene index at `index_dir` and prepares the searcher,
        analyzer, duplicate filter, sort order and multi-field query parser.

        Parameters
        ----------
        index_dir : string
            The location of the Lucene index.
        search_fields : list
            A list of field names indicating the fields to search on.
        unique_field : string
            The field name on which duplicates should be avoided.
        boost : dict
            Maps a field name to its weight when computing the score.
        date_format : string
            Format used to convert date strings into datetime. Should be
            consistent with the indexing side.
        """
        self.index_dir = index_dir
        # Copy the list so instances never share (or accidentally mutate)
        # the single list object that serves as the signature default.
        self.search_fields = list(search_fields)
        # Sort on the string field `date_published`, with the reverse flag
        # set.
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        # The per-field boosts are copied into a Java map for the parser;
        # the caller's (or the default) dict is only read.
        self.boost_map = HashMap()
        for k, v in boost.items():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(self.search_fields,
                                                self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
コード例 #14
0
ファイル: search.py プロジェクト: rbouadjenek/YouTaQA
    def multiFieldsPairSearch(self, pair, sim):
        """
        Search with a (title, content) pair and return the top six hits.

        Both parts are escaped and parsed against the content_section and
        title_article fields, with OR as the default operator. The title
        query (underscores replaced by spaces) is added as a FILTER clause
        and the content query as a SHOULD clause.

        pair : sequence whose first item is the title and whose second item
               is the content text.
        sim : similarity set on the searcher before the search runs.

        Returns the scoreDocs of the top six hits.
        """
        # Attach the calling thread to the JVM, as multiFieldsSearch does, so
        # this method can also be called from a thread not yet attached.
        lucene.getVMEnv().attachCurrentThread()

        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        title_query = MultiFieldQueryParser.parse(parser,
                                                  QueryParser.escape(title))
        content_query = MultiFieldQueryParser.parse(
            parser, QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(title_query, BooleanClause.Occur.FILTER)
        bq.add(content_query, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        return self.searcher.search(bq.build(), 6).scoreDocs
コード例 #15
0
    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Score docoment run fail"
            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Score docoment run fail"
            self.hits = hits
            self.field = field
            return hits

            self.hits = hits
            self.field = field
            return hits
コード例 #16
0
ファイル: JaccardSearcher.py プロジェクト: pombredanne/CoCaBu
indexDir = File("/tmp/github")

# 1. Open the index.
analyzer = KeywordAnalyzer()
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. Parse the query. The query text is hard-coded below; it is not read from
#    the command line.
# a = {"typed_method_call": WhitespaceAnalyzer()}
# wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "HttpURLConnection.disconnect Exception.printStackTrace BufferedReader.close HttpURLConnection.setRequestProperty HttpURLConnection.setRequestMethod DataOutputStream.writeBytes HttpURLConnection.getInputStream DataOutputStream.close HttpURLConnection.setUseCaches StringBuffer.append URL.openConnection HttpURLConnection.getOutputStream Integer.toString String.getBytes StringBuffer.toString HttpURLConnection.setDoOutput BufferedReader.readLine DataOutputStream.flush HttpURLConnection.setDoInput"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                     ["typed_method_call"], analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

# Call parse() through the class: in PyLucene the static parse() overloads of
# MultiFieldQueryParser can hide the instance method.
base_query = MultiFieldQueryParser.parse(query_parser, query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
# boost_query = FunctionQuery( LongFieldSource("view_count"))
#query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)

# 3. search the index for the query
# We retrieve and sort all documents that match the query.
# In a real application, use a TopScoreDocCollector to sort the hits.
コード例 #17
0
ファイル: SOSearcher.py プロジェクト: pombredanne/CoCaBu
indexDir = File("/tmp/stackoverflow")

# 1. Open the index.
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. Parse the query. The query text is hard-coded below; it is not read from
#    the command line.
# NOTE(review): a plain Python dict is passed as the per-field analyzer map;
# confirm PyLucene accepts it where a Java Map is expected.
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

# Call parse() through the class: in PyLucene the static parse() overloads of
# MultiFieldQueryParser can hide the instance method.
base_query = MultiFieldQueryParser.parse(query_parser, query_string)

# Boost documents by their view_count field, see
# http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)

# 3. search the index for the query
# We retrieve and sort all documents that match the query.
# In a real application, use a TopScoreDocCollector to sort the hits.