Code example #1
class SSQA_S_Searcher:
    def __init__(self, indexDir, analyzer):
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug("Search similarity func: {}".format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        count = 0
        out_list = []
        for scoreDoc in tqdm(scoreDocs):
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append(doc['content'])
            count += 1
        logger.info("Added {} sentences".format(count))
        return out_list

    def close(self):
        self.reader.close()
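A minimal usage sketch for the class above. The paths are hypothetical, the project's mySimilarity/logger/log_debug helpers are assumed importable, and the JVM is started up front (on recent PyLucene the constructor's own lucene.initVM() call then just attaches to the already-running VM):

import lucene
lucene.initVM()

from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer

index_dir = SimpleFSDirectory(Paths.get("/path/to/ssqa_index"))  # hypothetical path
searcher = SSQA_S_Searcher(index_dir, SmartChineseAnalyzer())    # any analyzer matching the index works
sentences = searcher.search("query text", top_n=10)
searcher.close()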
Code example #2
File: ir.py Project: ericmelz/XCS224U-Project
def search_loop(index_dir, field="contents", explain=False):
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir))))
    analyzer = StandardAnalyzer()
    print("Hit enter with no input to quit.")
    while True:
        command = input("Query:")
        if command == '':
            return
        print("Searching for: %s" % command)
        query = QueryParser(field, analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            if field == 'web':
                print(
                    f'{doc.get("web")} | {doc.get("raw")} | {scoreDoc.score}')
            else:
                print('path:', doc.get("path"), 'name:', doc.get("name"))
            if explain:
                explanation = searcher.explain(query, scoreDoc.doc)
                print(explanation)
                print('------------')
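A hypothetical invocation of the interactive loop above, assuming an index already exists under ./index (press enter on an empty prompt to quit):

import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # the JVM must be running before any Lucene call

search_loop("./index", field="contents", explain=True)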
Code example #3
class ParagSearcher:
    def __init__(self, Lid, db_path=config.DB_SSQA):
        lucene.initVM()
        self.db = SSQA_DB(db_path)

        lesson_str = self.db.get_lesson_str(Lid)
        parags = str_lesson2parags(lesson_str)

        # Index a Lesson
        myIndexer = _ChineseRamIndexer()
        myIndexer.index_lesson(parags)
        myIndexer.close()

        self.reader = DirectoryReader.open(myIndexer.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        logger.debug('search similarity:{}'.format(
            self.searcher.getSimilarity()))

    def __exit__(self, *args):
        self.close()

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        # query = QueryParser("content", self.analyzer).parse(QueryParser.escape(query_text.strip()))
        query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append((doc['pid'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.db.close()
        self.reader.close()
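A usage sketch, assuming a lesson with id 1 exists in the SSQA database configured at config.DB_SSQA and the project's indexer/helpers are importable:

searcher = ParagSearcher(Lid=1)
for pid, content, score in searcher.search("查询文本", top_n=3):  # Chinese query, tokenized by SmartChineseAnalyzer
    print(pid, score)
searcher.close()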
Code example #4
class CosQASearcher:
    def __init__(self, lang):
        lucene.initVM()

        if lang == 'zh':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))
        #         query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append(
                (doc['did'], doc['title_en'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.reader.close()
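A usage sketch, assuming the index paths referenced by config.IDX_COS_EN / config.IDX_COS_ZH exist and the project's mySimilarity and logging helpers are importable:

searcher = CosQASearcher('en')
for did, title_en, content, score in searcher.search("how to traverse a binary tree", top_n=3):
    print(did, title_en, score)
searcher.close()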
Code example #5
File: ir.py Project: ericmelz/XCS224U-Project
class SimpleSearcher(Searcher):
    def __init__(self, index_dir):
        self.searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir))))
        self.analyzer = StandardAnalyzer()

    def search(self, qstring):
        query = QueryParser("web", self.analyzer).parse(qstring)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        return [self.searcher.doc(score_doc.doc) for score_doc in scoreDocs]

    def explain(self, qstring):
        query = QueryParser("web", self.analyzer).parse(qstring)
        score_docs = self.searcher.search(query, 50).scoreDocs
        print(qstring)
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            print(f'{doc.get("web")} | {doc.get("raw")} | {score_doc.score}')
            explanation = self.searcher.explain(query, score_doc.doc)
            print(explanation)
            print('------------')
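A sketch of driving SimpleSearcher (hypothetical index directory; the Searcher base class and the "web"/"raw" fields come from the same project; the JVM must be started first):

import lucene
lucene.initVM()

searcher = SimpleSearcher("/path/to/index")      # hypothetical path
for doc in searcher.search("weather in boston"):
    print(doc.get("web"), '|', doc.get("raw"))
searcher.explain("weather in boston")            # prints per-document score breakdowns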
Code example #6
File: lucene_retrieval.py Project: danieldmm/minerva
class LuceneRetrieval(BaseRetrieval):
    """
        Encapsulates the Lucene retrieval engine
    """
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger

    def runQueryViaExplain(self,query, max_results):
        """
            Really crappy solution to make sure that explanations and searches are the same
            while I fix Lucene
        """
        results=[]

        index=0
        for index in range(self.reader.numDocs()):
            explanation=self.searcher.explain(query,index)
            score=explanation.getValue()
##            match=re.search(r"(.*?)\s=",explanation.toString(),re.IGNORECASE|re.DOTALL)
##            if match:
##                score=float(match.group(1))
            hit=namedtuple("Hit",["doc","score"])
            hit.doc=index
            hit.score=score
##            heapq.heappush(results,hit)
            results.append(hit)

        results.sort(key=lambda x:x.score,reverse=True)

        if max_results < self.reader.numDocs():
            results=results[:max_results]

        return results

    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
            LOTS OF SWEET LUCENE
        """
        original_query=structured_query

        if not structured_query or len(structured_query) == 0 :
            return []

        self.last_query=structured_query
        query_text=self.rewriteQuery(structured_query["structured_query"], ["text"])

        try:
            query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except:
            print("Lucene exception:",sys.exc_info()[:2])
            return None

        structured_query["lucene_query"]=query_text

        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits=self.runQueryViaExplain(query,max_results)
        else:
            collector=TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

##        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        res=[]


##        if len(hits.scoreDocs) ==0:
##            print "Original query:",original_query
##            print "Query:", query

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata= json.loads(doc.get("metadata"))
            res.append((hit.score,metadata))
        return res


    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            :param query: Elastic DSL Query
            :param doc_id: id of document to run .explain() for
            :returns:
        """
        explanation=self.searcher.explain(query,doc_id)

        formula=StoredFormula()
        formula.fromLuceneExplanation(explanation)
        return formula
Code example #7
class GitHubSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = index_path
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {"code": self.porter_analyzer}
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(
            index)  # note: this IndexReader used to be opened without ever being closed
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def get_DF(self, field, term):
        return self.reader.docFreq(Term(field, term))

    def get_IDF(self, field, term):
        from math import log10, sqrt
        docF = self.reader.docFreq(Term(field, term))

        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def get_minimum_IDF(self, docF=2):
        from math import log10, sqrt
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  #"used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())

                    # Filter out noisy terms
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        # idf = self.get_IDF(field, term)

                        # print self.get_DF(field, term), term, field
                        #query += "%s:%s^%s " % (field, term, idf)
                        query += "%s:%s " % (field, term)

                    #print "term: %s idf: %s" % (term, self.get_minimum_IDF())

                #query += "%s:%s " % (field, term)
                #print "%s:%s^%s" % (field, term, self.getIDF(field, term))
        # for hint in doc.getFields("code_hints"):
        # 	tokens = utils.tokenize(hint.stringValue())
        # 	for token in tokens:
        # 		#print token
        # 		token = QueryParser.escape(token)
        # 		if token.strip():
        # 			print "HINTS", token
        # 			query += "code:%s^5.0 " % (token)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    print "TERM", term
                    # if term[0].isupper():
                    # 	query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                    # elif "(" in term or "." in term or "#" in term: # Heuristic to boost only code identifiers
                    # 	query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)

                    #query += "code:%s^5.0 " % (term)

        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []
        # def _get_matched_keywords(q, matched_terms):
        # 	print type(q), matched_terms
        # 	if isinstance(q, TermQuery):
        # 		if self.searcher.explain(q, docid).isMatch():
        # 			matched_terms.append( q.getTerm().text() )
        # 	elif isinstance(q, BooleanQuery):
        # 		for query_term in query.getClauses():
        # 			_get_matched_keywords(query_term, matched_terms)
        # 			# if self.searcher.explain(query_term.getQuery(), docid).isMatch():
        # 			# 	matched_terms.append( query_term.getQuery().getTerm().text() )

        # _get_matched_keywords(query, matched_terms)

        if isinstance(query, TermQuery):
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        elif isinstance(query, BooleanQuery):
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(),
                                         docid).isMatch():
                    matched_terms.append(
                        query_term.getQuery().getTerm().text())

        #print "Matched Terms: %s" % matched_terms
        return matched_terms

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                #field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self):
        """ Extends a query by matching query keywords in source code as text"""

        query = " "
        for term in tokenize_string(self.porter_analyzer, self.query):
            if term:
                term = QueryParser.escape(term)
                query += "code:%s " % (term)

        return query

    def lexical_search(self):
        """ In case no term is matching with stackoverflow we perform a simple lexical search on GitHub """
        github_result = []
        query = self.code_as_text().strip()
        query = QueryParser(Version.LUCENE_CURRENT, "code",
                            self.analyzer).parse(query)
        hits = self.searcher.search(query, 10).scoreDocs
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            matched_terms = self.get_matched_keywords(query, hit.doc)

            # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]

            item = GithubResultItem(doc.get("file"),
                                    decompress(doc.get("file_content")),
                                    matched_terms, hit.score, so_item,
                                    doc.get("line_numbers"), hit.doc)  # code

            github_result.append(item)

        return github_result

    def more_like_this(self, so_items):

        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)
            query = ""
            if so_item.doc:
                query = self.document_to_query(so_item.doc)

            query += self.code_as_text()
            if query:
                print "-" * 30
                print "Query: %s" % query
                print "-" * 30
                try:
                    like_query = queryparser.parse(query)

                    hits = self.searcher.search(like_query, 10).scoreDocs

                    for i, hit in enumerate(hits):
                        doc = self.searcher.doc(hit.doc)
                        matched_terms = self.get_matched_keywords2(
                            like_query, hit.doc)
                        # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                        item = GithubResultItem(doc.get("file"),
                                                decompress(
                                                    doc.get("file_content")),
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code
                        github_result.append(item)
                        #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
                except Exception as e:
                    print "Error: %s" % e
        # print Counter(files).most_common(5)
        return github_result

    def more_like_this2(self, so_items):
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))
        query = ""

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)  ####
        ### The loop below builds the augmented query
        for so_item in so_items:
            if so_item.doc:
                query += self.document_to_query(so_item.doc)
            query += self.code_as_text()

        github_result = []
        if query:
            print "-" * 50
            print "UNified Query: %s" % query
            print "-" * 50
            try:
                ### Final conversion into a Lucene query
                like_query = queryparser.parse(query)
                ### The line below walks the GitHub indices and scores them against like_query
                hits = self.searcher.search(like_query, 5).scoreDocs  # top-5 results
                # hits now holds the 5 best-scoring documents

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    print "Matched Terms : ", matched_terms

                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    print("file", doc.get("file"), "file_content",
                          doc.get("file_content"), "line_numbers",
                          doc.get("line_numbers"))
                    file_path = doc.get("file")
                    #file_path = "" + doc.get("file")[24:]
                    #file_path = "/root/GitSearch" + doc.get("file")[24:]
                    #print(doc.get("file")[32:])
                    #print(doc.get("file")[0:])
                    #print(file_path)

                    content = None
                    try:
                        # walk the project path on disk, open the file and read it into content
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    # If the file was found and has content, wrap it into a result item.
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code
                        github_result.append(item)

            except Exception as e:
                print "GitSearcher: Error: %s" % e
                print(traceback.format_exc())

        return github_result
Code example #8
class Searcher:
    def __init__(self, source, index_path):
        self.index_path = index_path
        self.source = source
        ast, source = parse(self.source, resolve=True, source=True)
        self.source = source
        self.ast = ast
        self.queryparser = QueryParser(Version.LUCENE_CURRENT,
                                       "typed_method_call", KeywordAnalyzer())
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def document_to_query(self):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "typed_method_call", "methods", "extends", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  #"used_classes", , "literals"
            for val in self.ast[field]:
                term = QueryParser.escape(val)

                query += "%s:%s " % (field, term)

        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []

        if isinstance(query, TermQuery):
            #print self.searcher.explain(query, docid)
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        else:
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(),
                                         docid).isMatch():
                    #print self.searcher.explain(query_term.getQuery(), docid)
                    matched_terms.append(
                        query_term.getQuery().getTerm().text())

        #print "Matched Terms: %s" % matched_terms
        return matched_terms

    def get_AST_from_Doc(self, doc):
        tree = {}
        tree["typed_method_call"] = [
            f.stringValue() for f in doc.getFields("typed_method_call")
        ]
        tree["methods_called"] = [
            f.stringValue() for f in doc.getFields("methods_called")
        ]
        tree["imports"] = [f.stringValue() for f in doc.getFields("imports")]
        tree["used_classes"] = [
            f.stringValue() for f in doc.getFields("used_classes")
        ]
        # tree["var_type_map"] = eval(doc.getField("var_type_map").stringValue())
        # tree["unresolved_method_calls"] = [f.stringValue() for f in doc.getFields("unresolved_method_calls")]

        return tree

    def more_like_this(self):

        trees = []

        file_hash_process = set()
        query = self.document_to_query()

        if query:
            print "-" * 30
            print "Query: %s" % query
            print "-" * 30
            try:
                like_query = self.queryparser.parse(query)

                hits = self.searcher.search(like_query, 10).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords(
                        like_query, hit.doc)
                    file_path = doc.getField("file").stringValue()
                    #print "Matched Terms", matched_terms
                    print "Path: ", file_path
                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    with open(file_path, "r") as f:
                        file_content = f.read()

                    file_hash = doc.getField("hash").stringValue()

                    #print "FILE", file_content
                    #print "PARSE", parse(file_content, resolve=False)
                    if file_hash not in file_hash_process:
                        trees.append(parse(file_content, resolve=False))
                        file_hash_process.add(file_hash)
                    else:
                        print "Duplicate: ", file_path

                    #trees.append( self.get_AST_from_Doc(doc) )

            except Exception as e:
                print "Error: %s" % e

        return trees
Code example #9
class QuestionLuceneSearch():

    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text

        return text_id, id_text


    # def add_doc(self, doc_id, title, txt, add_terms):
    def add_doc(self, doc_id, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)


    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")


        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()


    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
 
        return out


    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            # print(c)
            return c

    
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    # c[int(doc['id'])] = [word_idx, word]
                    c[int(doc['id'])] = [word_idx, word, hit.score]
                out.append(c)

        return out


    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):                
                for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())):
                    outt[cand_id] = term
  
        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out



    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        # if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #     terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp

        out.append(c)

        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):

        self.curr_searcher = searcher
        # out = self.pool.map(self.search_pair_score_multithread_part, product(qs,doc_int))
        out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc)

        return out

    def search_pair_score_multithread_part(self, q_doc_int):

        # print(q_doc_int)
        spl=q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
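A minimal sketch of batch candidate retrieval with the class above, assuming the prm module points at an existing index, vocabulary and document collection:

engine = QuestionLuceneSearch()
queries = ["who wrote the iliad", "what is the boiling point of water"]
# One OrderedDict per query, mapping doc_id -> [word_idx, words, score]
candidates = engine.get_candidates(queries, max_cand=20, max_full_cand=5, save_cache=True)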
Code example #10
class BenchSearcher:
	def __init__(self, index_path, query=None):
		self.index_path = File(index_path)
		self.directory = None
		self.reader = None
		self.query = query
		self.porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		self.load_index()

	def load_index(self):
		a = {"code": self.porter_analyzer, "description": self.porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": self.porter_analyzer}
		self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
		self.directory = SimpleFSDirectory(self.index_path)
		self.reader = DirectoryReader.open(self.directory)
		self.searcher = IndexSearcher(self.reader)

	def tokenize_string(self, analyzer, string):
		result = []
		stream = analyzer.tokenStream(None, StringReader(string))
		cattr = stream.addAttribute(CharTermAttribute)
		stream.reset()
		while stream.incrementToken():
			result.append(cattr.toString())
		stream.close()
		return result

	def camel_case_split(self, s):
		import re
		s = s.replace("_", " ")
		s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
		s = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1).lower().replace("  ", " ").split()
		return s

	def document_to_query(self, doc):
		""" Given a document it transforms the source code related fields to a lucene query string """
		query = ""
		for field in ["description"]:
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					# tokenize
					term = self.tokenize_string(StandardAnalyzer(), term)
					# CamelCase
					temp = []
					for t in term:
						temp += self.camel_case_split(t)
					# stopwords
					temp_2 = []

					for t in temp:
						if t not in english_stop_words:
							temp_2.append(t)
					# stemming
					temp_3 = []
					for t in temp_2:
						temp_3.append(stem(t))
					# stopwords
					temp_4 = []

					for t in temp_3:
						if t not in english_stop_words:
							temp_4.append(t)
					# query generation
					for term in temp_4:
						query += "%s:%s " % (field, term)

		for field in ["typed_method_call", "methods", "used_classes", "class_instance_creation", "methods_called",
					  "annotations", "literals"]:  # "used_classes", , "literals" , "extends"
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					java_stoplist = ["java.lang.Object", 'void', 'Global', 'boolean', 'String', 'int', 'char', 'float',
									 'double', 'write', 'close', 'from', 'println', 'StringBuilder', 'write',
									 'toString',
									 'close', 'mkdir', 'exists']

					if term not in java_stoplist:
						query += "%s:%s " % (field, term)

		if len(doc.getFields("code_hints")) > 0:
			hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
			hints_str = " ".join(hints)
			for term in hints:
				if term:
					term = QueryParser.escape(term)
					if term not in english_stop_words:
						# print "Including 'code_hints' from Doc_To_Query TERMs... //", term
						query += "code_hints:%s " % term
		return query

	def get_matched_keywords2(self, query, doc):
		matched_terms = []
		weight_expl = self.searcher.explain(query, doc).toString().split("weight(")
		for expl in weight_expl:
			if " in " in expl:
				field_val = expl.split(" in ")[0]
				# field, val = field_val.split(":")
				val = field_val.split(":")[-1]
				matched_terms.append(val)
		return matched_terms

	def more_like_this2(self, limit, score_logs_for_each, user_query, flag):
		bench_result = []
		query = ""
		if flag == 1:
			query += user_query

		query = remove_unified_stop_lists(query)
		queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer)
		if query:
			try:
				parsed_query = queryparser.parse(query)
				hits = self.searcher.search(parsed_query, limit).scoreDocs
				temp = 1
				for i, hit in enumerate(hits):
					doc = self.searcher.doc(hit.doc)
					matched = doc.get('file').split('/')[9].split('.')[0]
					score_logs_for_each += str(matched) + '\t' + str(round(hit.score, 2)) + '\n'
					matched_terms = self.get_matched_keywords2(parsed_query, hit.doc)
					temp += 1

					file_path = doc.get("file")
					content = None
					try:
						with open(file_path) as f:
							content = f.read()
					except:
						pass

					if content:
						item = BenchResultItem_UQ(doc.get("file"), content, matched_terms, hit.score, doc.get("line_numbers"), hit.doc)
						bench_result.append(item)

			except Exception as e:
				print "BenchSearcher Error: %s" % e
				print(traceback.format_exc())

		return bench_result, score_logs_for_each
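A hedged usage sketch, assuming an index laid out as the project expects and its helpers (remove_unified_stop_lists, BenchResultItem_UQ) importable; the JVM must be running before the searcher is built:

import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

searcher = BenchSearcher("/path/to/bench_index")   # hypothetical path
results, score_logs = searcher.more_like_this2(10, "", "read a file line by line", 1)
for item in results:
    print(item)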
Code example #11
class LuceneAnnotator(object):
    #@Value("${org.freya.lucene.index.dir.search}") Resource luceneIndexDir;
    def __init__(self):
        logging.basicConfig(filename='../../freya/index/annotator.log', filemode='w', level=logging.DEBUG)

    def close(self):
        if self._reader != None:
            try:
                print "closing index Reader"
            except Exception as e:#IOException(e):
                print e.message
                logging.error("Error")
            finally:
                pass
    #private static final Log logger = LogFactory.getLog(LuceneAnnotator.class);
    def getIndex(self):
        return self._index

    def setIndex(self, index):
        self._index = index
    def testSearcher(self):
        query=QueryParser(Version.LUCENE_CURRENT, "class", StandardAnalyzer(Version.LUCENE_CURRENT)).parse(QueryParser.escape('http\://www.mooney.net/geo#River'))
        print query
        hits = self._searcher.search(query, 50)
        for hit in hits.scoreDocs:
            print hit.score, hit.doc, hit.toString()
            doc = self._searcher.doc(hit.doc)
            print doc.get("class").encode("utf-8")
    #public SynonymMap synonymMap;
    def init(self):
        try:
            print 'lucene', lucene.VERSION
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            if not hasattr(self,'_index'):
                indexDir = "../../freya/index/actual"
                self._index = File(indexDir)
            if not hasattr(self,'_reader'):
                self._reader = "Not needed"
            if not hasattr(self,'_searcher'):
                try:
                    # lazily instantiate searcher
                    print "Setting searcher to " + str(self._index)
                    self._searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(self._index)))
                except Exception as e:#Exception(e):
                    print e.message
                    print "Searcher Initialisation Error"
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Lucene Error")

    def getSpecificityScores(self):
        # map = Hashtable[str, Nullable]()
        logging.info("Need to implement....")
        return map

    # *
    # * find lucene annotations for this poc specialTreatment is for common nouns so that they are searched with stem not
    # * exact match
    # *
    # * @param annotation
    # * @return
    #
    def searchIndex(self, annotation, specialTreatment):
        if specialTreatment:
            return self.searchStemFirst(annotation)
        annotations = list() #ArrayList[Annotation]()
        try:
            maxSynonyms = 0
            stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
            # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
            # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
            # synonymMap, maxSynonyms);
            analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
            pocString = QueryParser.escape(annotation.getText())
            preparePocString = "\"" + pocString + "\""
            preparePocStringLowercase = "\"" + pocString.lower() + "\""
            query = parser.parse(preparePocString)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            if freq <= 0:
                # search lowercased exact
                lowerCasedParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
                query = lowerCasedParser.parse(preparePocStringLowercase)
                # logging.info("Searching for: " + query.toString());
                result = self._searcher.search(query, 1)
                freq = result.totalHits
                if freq > 0:
                    result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.debug("For " + str(query) + " : " + str(result.totalHits))
            # str.find returns -1 when no space is present (str.index would raise ValueError)
            if len(hits) == 0 and preparePocStringLowercase.find(" ") < 0:
                # search stemmed
                stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser)
                query = stemParser.parse(preparePocStringLowercase)
                # logging.info("Searching for: " + query.toString());
                result = self._searcher.search(query, 1)
                freq = result.totalHits
                if freq > 0:
                    result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.info("For " + str(query) + " : " + str(result.totalHits))
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                ann = Annotation()
                features = dict()
                features[FreyaConstants.CLASS_FEATURE_LKB]=doc.get(FreyaConstants.CLASS_FEATURE_LKB)
                features[FreyaConstants.INST_FEATURE_LKB]=doc.get(FreyaConstants.INST_FEATURE_LKB)
                features[FreyaConstants.PROPERTY_FEATURE_LKB]=doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
                features["string"]=doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
                features[FreyaConstants.SCORE]=hit.score
                ann.setFeatures(features)
                ann.setEndOffset(annotation.getEndOffset())
                ann.setStartOffset(annotation.getStartOffset())
                ann.setSyntaxTree(annotation.getSyntaxTree())
                ann.setText(annotation.getText())
                annotations.append(ann)
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return annotations

    # *
    # * this method now search both stem and lowercase
    # *
    # * @param annotation
    # * @return
    #
    def searchStemFirst(self, annotation):
        annotations = list()
        pocString = QueryParser.escape(annotation.getText())
        preparePocStringOriginal = "\"" + pocString + "\""
        preparePocStringLowercase = "\"" + pocString.lower() + "\""
        try:
            maxSynonyms = 0
            # Analyzer stemmedAnalyser =
            # AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
            # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
            # synonymMap, maxSynonyms);
            stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
            analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
            stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser)
            query = stemParser.parse(preparePocStringLowercase)
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            stemHits = result.scoreDocs
            allHits = stemHits
            # if(stemHits.length == 0) {
            # search lowercased exact
            parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
            query = parser.parse(preparePocStringLowercase)
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            lowHits = result.scoreDocs
            allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(lowHits) # ArrayUtils.addAll(allHits, lowHits)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            # }
            # if(allHits.length == 0) {
            # search exact
            exactParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
            query = exactParser.parse(preparePocStringLowercase)
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(result.scoreDocs) #ArrayUtils.addAll(allHits, result.scoreDocs)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            # }
            # for (ScoreDoc hit : allHits) {
            indexus = 0
            while indexus < len(allHits):
                hit = allHits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                ann = Annotation()
                features = dict()
                features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
                features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
                features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
                features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
                features["score"] = hit.score
                ann.setFeatures(features)
                ann.setEndOffset(annotation.getEndOffset())
                ann.setStartOffset(annotation.getStartOffset())
                ann.setSyntaxTree(annotation.getSyntaxTree())
                ann.setText(annotation.getText())
                annotations.append(ann)
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return annotations

    # *
    # * @return
    #
    def findPropertyURIs(self):
        uris = list()
        uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, None)
        uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, None)
        uris = uris + self.findRDFPropertyURIs(None)
        return uris
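    # NOTE: Python has no method overloading, so this zero-argument definition
    # and the one-argument variant below are shadowed by the final
    # findPropertyURIs(self, propertyType, max) definition further down; only
    # that last definition can actually be called on an instance.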

    # *
    # * @param max
    # * @return
    #
    def findPropertyURIs(self, max):
        uris = list()
        uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, max)
        uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, max)
        uris = uris + self.findRDFPropertyURIs(max)
        return uris

    # *
    # * @return
    #
    def findDatatypePropertyURIs(self):
        uris = list()
        uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, None)
        return uris

    # *
    # * @return
    #
    def findObjectPropertyURIs(self):
        uris = list()
        uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, None)
        return uris

    # *
    # * @param max
    # * @return
    #
    def findRDFPropertyURIs(self, max):
        uris = list()
        owl = "http://www.w3.org/2002/07/owl"
        rdfProps = self.findPropertyURIs(RDF.PROPERTY, max)
        # for (String prop : rdfProps) {
        indexus = 0
        while indexus < len(rdfProps):
            prop = rdfProps[indexus]
            if prop != None and not prop.startswith(owl):
                uris.append(prop)
            indexus += 1
        return uris

    # *
    # * @return
    #
    def findClassURIs(self):
        uris = list()
        uris = uris + self.findPropertyURIs(OWL.CLASS, None)
        uris = uris + self.findPropertyURIs(RDFS.CLASS, None)
        return uris

    # *
    # * find URIs of the given property type (up to max results)
    # *
    # * @param propertyType
    # * @param max
    # * @return
    #
    def findPropertyURIs(self, propertyType, max):
        uris = list() # list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
            query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if max is not None:
                freq = int(max)  # max is a plain Python int here, not a java.lang.Integer
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
            print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return uris

    # *
    # * @param propertyUri
    # * @return
    #
    def findPropertyRange(self, propertyUri):
        rangeUri = "http://www.w3.org/2000/01/rdf-schema#range"
        return self.searchForClass(propertyUri, rangeUri)

    # *
    # * @param propertyUri
    # * @return
    #
    def findPropertyDomain(self, propertyUri):
        rangeUri = "http://www.w3.org/2000/01/rdf-schema#domain"
        return self.searchForClass(propertyUri, rangeUri)

    # *
    # * given a classUri, search the class field where pred = subClassOf
    # *
    # * @param classUri
    # * @return
    #
    def findSubClasses(self, classUri): #RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!!
        propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        subClasses = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
            queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,queries, fields,flags,analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return subClasses

    # *
    # * check whether this is a datatype property or not
    # *
    # * @param propertyUri
    # * @return
    #
    def isItDatatypeProperty(self, propertyUri):
        result = self.checkIfItIsDatatypeProperty(propertyUri)
        exists = False
        if result != None and len(result) > 0:
            exists = True
        # logging.info("isItDatatypeProperty for " + propertyUri + " is " + exists);
        return exists

    # *
    # * @param classUri
    # * @return
    #

    def getDefinedPropertiesWhereClassIsADomain(self, classUri):
        properties = self.searchForInstance(classUri, RDFS.DOMAIN)
        return properties

    # NOTE: Python has no method overloading, so the single-argument definition
    # above is shadowed by the two-argument definition below; the call flow must
    # be changed accordingly.
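    # A minimal sketch (hypothetical) of merging the two variants into a single
    # method with a default argument instead of relying on overloading:
    #
    #   def getDefinedPropertiesWhereClassIsADomain(self, classUri, forceSuperClasses=False):
    #       if not forceSuperClasses:
    #           return self.searchForInstance(classUri, RDFS.DOMAIN)
    #       properties = []
    #       for uri in self.findSuperClasses(classUri) + [classUri]:
    #           properties.extend(self.getDefinedPropertiesWhereClassIsADomain(uri, False))
    #       return properties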
    def getDefinedPropertiesWhereClassIsADomain(self, classUri, forceSuperClasses):
        properties = list()
        if forceSuperClasses:
            superClasses = self.findSuperClasses(classUri)
            superClasses.append(classUri)
            # for (String uri : superClasses) {
            indexus = 0
            while indexus < len(superClasses):
                uri = superClasses[indexus]
                for each in self.getDefinedPropertiesWhereClassIsADomain(uri,False):
                    properties.append(each)
                indexus += 1
        else:
            properties = self.searchForInstance(classUri, RDFS.DOMAIN)
        return properties

    def getDefinedPropertiesWhereClassIsARange(self, classUri, forceSuperClasses):
        properties = list()
        if forceSuperClasses:
            superClasses = self.findSuperClasses(classUri)
            superClasses.append(classUri)
            # for (String uri : superClasses) {
            indexus = 0
            while indexus < len(superClasses):
                uri = superClasses[indexus]
                for each in self.getDefinedPropertiesWhereClassIsARange(uri,False):
                    properties.append(each)
                indexus += 1
        else:
            properties = self.searchForInstance(classUri, RDFS.RANGE)
        return properties


    # *
    # * @param classUri
    # * @return
    #
    def getNeighbouringClassesWhereGivenClassIsADomain(self, classUri, forceSuperClasses):
        classes = list()
        if forceSuperClasses:
            # here recursively go and first find all super classes
            feedClasses = self.findSuperClasses(classUri)
            feedClasses.append(classUri)
            # then for each superclass do the same as above
            # for (String uri : feedClasses) {
            indexus = 0
            while indexus < len(feedClasses):
                uri = feedClasses[indexus]
                for each in self.getNeighbouringClassesWhereGivenClassIsADomain(uri, False):
                    classes.append(each)
                indexus += 1
        else:
            properties = self.searchForInstance(classUri, RDFS.DOMAIN)
            # for (String property : properties) {
            indexus = 0
            while indexus < len(properties):
                property = properties[indexus]
                for each in self.searchForClass(property, RDFS.RANGE):
                    classes.append(each)
                indexus += 1
        return classes




    def getNeighbouringClassesWhereGivenClassIsARange(self, classUri, forceSuperClasses):
        classes = list()
        if forceSuperClasses:
            # here recursively go and first find all super classes
            feedClasses = self.findSuperClasses(classUri)
            feedClasses.append(classUri)
            logging.info("found " + str(len(feedClasses)) + " super classes for " + classUri)
            # then for each superclass do the same as above
            # for (String uri : feedClasses) {
            indexus = 0
            while indexus < len(feedClasses):
                uri = feedClasses[indexus]
                for each in self.getNeighbouringClassesWhereGivenClassIsARange(uri,False):
                    classes.append(each)
                logging.info("found " + str(len(classes)) + " elements for " + uri)
                indexus += 1
        else:
            properties = self.searchForInstance(classUri, RDFS.RANGE)
            # for (String property : properties) {
            indexus = 0
            while indexus < len(properties):
                property = properties[indexus]
                for each in self.searchForClass(property, RDFS.DOMAIN):
                    classes.append(each)
                indexus += 1
        return classes


    # *
    # * @param classUri
    # * @return
    #
    def findSuperClasses(self, classUri):
        searchFinished = False
        directSuperClasses = list()
        superClassesToSave = list()
        while not searchFinished:
            directSuperClasses = self.searchForClass(classUri, RDFS.SUBCLASSOF)
            # print str(directSuperClasses) + " list"
            if len(directSuperClasses) == 0 or (len(directSuperClasses) != 0 and pyJava.contains(directSuperClasses,superClassesToSave)):
                searchFinished = True
            else:
                # logging.info("searchFinished for SuperClasses");
                # System.out.println("size:"+directSuperClasses.size());
                for each in directSuperClasses:
                    superClassesToSave.append(each)
                # for (String cUri : directSuperClasses) {
                indexus = 0
                while indexus < len(directSuperClasses):
                    cUri = directSuperClasses[indexus]
                    for each in self.findSuperClasses(cUri):
                        superClassesToSave.append(each)
                    indexus += 1
                searchFinished = True
        logging.info("For " + str(classUri) + " found " + str(len(superClassesToSave)) + " super-classes.")
        return superClassesToSave





    def searchForInstance(self, classUri, pred):
        uris = list()
        fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + QueryParser.escape(classUri) + "\"", "\"" + QueryParser.escape(pred) + "\""]
        try:
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, StandardAnalyzer(Version.LUCENE_CURRENT))
            result = self._searcher.search(query, 1)
            logging.debug("For " + query.toString() + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
        except Exception as e:#ParseException(e):
            print e.message
            logging.error("Error")
        return uris

    # *
    # *
    # * @param inst
    # * @return
    #
    def checkIfItIsDatatypeProperty(self, inst):
        classUris = list()
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.CLASS_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + inst + "\"", "\"" + OWL.DATATYPEPROPERTY + "\""]
        try:
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, StandardAnalyzer(Version.LUCENE_CURRENT))
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
        except Exception as e:#ParseException(e):
            print e.message
            logging.error("Error")
        return classUris

    # *
    # * @param inst
    # * @param pred
    # * @return
    #
    def searchForClass(self, inst, pred):
        classUris = list()
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + QueryParser.escape(inst) + "\"", "\"" + QueryParser.escape(pred) + "\""]
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
        except Exception as e:#ParseException(e):
            print e.message
            logging.error("Error")
        return classUris

    # *
    # * @return
    #
    def findTopClasses(self):
        propertyURI = RDFS.SUBCLASSOF
        allClasses = list()
        topClasses = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
            query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
            # for (String classUri : allClasses) {
            indexus = 0
            while indexus < len(allClasses):
                classUri = allClasses[indexus]
                logging.info("Checking whether " + classUri + " is a top class.")
                # search inst and pred retrieve class
                # if class exists that means it is not top class otherwise add to
                # topClasses
                classes = self.searchForClass(classUri, propertyURI)
                logging.info("top classes:" + str(len(classes)))
                if classes is not None and len(classes) > 0:
                    logging.info("This is not a top class...")
                else:
                    topClasses.append(classUri)
                    logging.info("Adding " + classUri + " to top classes.")
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return topClasses

    # *
    # * gets one direct type (the first hit returned by the index)
    # *
    # * @param instanceUri
    # * @return
    #
    def findOneDirectType(self, instanceUri):
        return self.findDirectTypes(instanceUri, 1)[0]

    def findDirectTypes(self, instanceUri):
        return self.findDirectTypes(instanceUri, None)
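    # NOTE: as with findPropertyURIs above, this one-argument definition is
    # shadowed by findDirectTypes(self, instanceUri, max) below, so external
    # callers must always pass max explicitly (None meaning "no limit").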

    # *
    # * find direct types
    # *
    # * @param instanceUri
    # * @param max
    # * @return
    #
    def findDirectTypes(self, instanceUri, max):
        dTypes = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
            query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = 0
            if max != None:
                freq = max
            else:
                freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        logging.debug("there are " + str(len(dTypes)) + " unique direct types")
        return dTypes

    # *
    # * find rdfs:label values for the given instance URI
    # *
    # * @param instanceUri
    # * @return
    #
    def findLabels(self, instanceUri):
        labels = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            labelOrTitleUris = "\"http://www.w3.org/2000/01/rdf-schema#label\"" # +
            # " OR http://purl.org/dc/elements/1.1/title";
            queries = ["\"" + QueryParser.escape(instanceUri) + "\"", QueryParser.escape(labelOrTitleUris)]
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return labels

    def findLiteral(self, instanceUri, propertyURI):
        labels = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            labelOrTitleUris = "\"" + propertyURI + "\""
            queries = ["\"" + QueryParser.escape(instanceUri) + "\"", QueryParser.escape(labelOrTitleUris)]
            query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
                indexus += 1
        except Exception as e:#CorruptIndexException(e):
            print e.message
            logging.error("Error")
        return labels
Code example #13
0
File: Project_Searcher.py Project: pombredanne/facoy
class GitSearcher:
    def __init__(self, index_path):
        self.index_path = index_path
        self.reader = None
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s
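    # For example, camel_case_split("getFileName") should return
    # ['get', 'file', 'name'].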

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string """
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    #tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    #CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    #stopwords
                    temp_2 = []

                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    #stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    #stopwords
                    temp_4 = []

                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    #query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  # "used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder', 'write',
                        'toString', 'close', 'mkdir', 'exists'
                    ]

                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query
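    # For example (hypothetical field values, ignoring the exact stemming): a
    # document whose "description" field holds "read file data" and whose
    # "typed_method_call" field holds "java.io.BufferedReader.readLine" would
    # roughly yield the query string
    # "description:read description:file description:data
    #  typed_method_call:java.io.BufferedReader.readLine ".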

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                #field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms
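    # Lucene score explanations typically contain fragments such as
    # "weight(typed_method_call:readline in 123) [...]"; splitting on
    # "weight(" and " in " extracts "readline" as a matched term.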

    def more_like_this2(self, limit, item_doc, user_query):
        github_result = []
        if not item_doc:
            item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))

        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)
        query += user_query
        query = remove_unified_stop_lists(query)
        print '................................................................................................'
        print "Project Searcher Unified Query :", query
        print '................................................................................................'
        write_search_log(
            "................................................................................................\n"
            + "Project Searcher Unified Query : " +
            str(query.encode('utf-8')) + "\n" +
            "................................................................................................\n"
        )
        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query,
                                            limit).scoreDocs  # 10 results per answer
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    #print "Matched Terms : ", matched_terms

                    print("File %s" % temp, doc.get("file"), "//",
                          doc.get("file_content")
                          )  #, "line_numbers", doc.get("line_numbers"))
                    write_search_log("File " + str(temp) +
                                     str(doc.get("file")) + "//" +
                                     str(doc.get("file_content")) + "\n")
                    temp += 1

                    file_path = doc.get("file")
                    print 'file_path = ', file_path
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        print "CAN'T OPEN THE FILE"
                        pass

                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms, hit.score,
                                                item_doc,
                                                doc.get("line_numbers"),
                                                hit.doc)
                        # print item.score
                        github_result.append(item)

            except Exception as e:
                print "GitSearcher Error: %s" % e
                print(traceback.format_exc())

        #sorted(github_result, key=attrgetter())

            print 'github_result : ', github_result
        return github_result
Code example #14
0
class Index:
    def __init__(self, folder=None, fields=[], similarity="tfidf"):

        self.jcc = lucene.initVM()

        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()

        self.fields = {}

        for field in fields:
            ft = FieldType()
            for pname, pvalue in field.props.items():
                setter = getattr(ft, "set" + pname.capitalize())
                setter(pvalue)

            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
            # 			ft.setOmitNorms(True)

            self.fields[field.name] = ft

        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.writer = None
        self.searcher = None

    def attach_thread(self):
        self.jcc.attachCurrentThread()

    def open_writer(self):

        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.writer = IndexWriter(self.directory, config)

    def add(self, **doc):

        if not self.writer:
            self.open_writer()

        d = Document()
        for field, value in doc.items():
            d.add(Field(field, value, self.fields[field]))

        self.writer.addDocument(d)

    def commit(self):
        self.writer.commit()

    def close(self):
        if self.writer:
            self.writer.close()

    def open_searcher(self):
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        if (self.similarity == "bm25"):
            self.searcher.setSimilarity(BM25Similarity())

    def preprocess_query(self, query, fields, mode="ANY"):
        '''
		Fix query according to provided mode. If the value is not supported, 
		the query remains unchanged
		'''

        terms = query.lower().strip().split()
        if mode == "ANY":
            query = " OR ".join(terms)
        elif mode == "ALL":
            query = " AND ".join(terms)
        else:
            print "Invalid mode parameter '%s'." % mode

        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)
        return query
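    # For example (hypothetical input), mode="ANY" turns the raw query
    # "index searcher" into "index OR searcher", while mode="ALL" produces
    # "index AND searcher", before escaping and multi-field parsing.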

    def search(self,
               query,
               search_fields,
               return_fields,
               filter=None,
               ignore=set(),
               mode="ANY",
               return_scores=False,
               limit=1000000):
        '''
		Search documents in the index using a standard analyzer (tokenizes and 
		removes stop words). Supports two search modes: ANY and ALL
		  ANY: include documents that contain at least one term of the query.
		  ALL: include only documents that contain all terms of the query. 
		'''

        if not self.searcher:
            self.open_searcher()

        # Return empty results if query is empty (Lucene can't handle it nicely)
        if query.strip() == '':
            if return_scores:
                return [], []
            else:
                return []

        query = self.preprocess_query(query, search_fields, mode)

        # If limit is not provided, return all matched documents. A little hack is required
        # to do that. We query for one document and get the count total matched documents.
        #		if not limit :
        #			hits = self.searcher.search(query, 1)
        #			limit = hits.totalHits

        # Fetch more than asked in case we have to remove entries from the ignore set
        if limit != None:
            limit += len(ignore)

        hits = self.searcher.search(query, filter, limit)
        hits = hits.scoreDocs

        docs = []
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            if doc['id'] not in ignore:
                docs.append([doc[f] for f in return_fields])

        if return_scores:
            scores = [hit.score for hit in hits]
            return docs[:limit], scores[:limit]

        return docs[:limit]

    def explain(self, query, fields, doc):

        if not self.searcher:
            self.open_searcher()

        query = QueryParser.escape(query)

        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)

        return self.searcher.explain(query, doc)

    def get_documents(self, doc_ids, fields):

        docs = []
        for doc_id in doc_ids:
            doc = self.reader.document(doc_id)
            if isinstance(fields, basestring):
                docs.append(doc.get(fields))
            else:
                docs.append({f: doc.get(f) for f in fields})

        return docs

    def get_query_scores(self, query, fields, doc_ids, mode="ANY"):

        # Creates pre-filter to ignore all other documents
        filter = TermsFilter([Term("id", id) for id in doc_ids])

        query = self.preprocess_query(query, fields, mode)
        hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs

        # Creates scores' mapping using entity id instead of internal index id
        scores = {
            str(self.reader.document(hit.doc).get("id")): hit.score
            for hit in hits
        }

        # Normalize to 0..1 interval
        #		n = 1.0/sum(scores.values())
        #		scores

        # Adds to the mapping entries for the non-returned docs (no term found)
        for doc_id in doc_ids:
            if doc_id not in scores:
                scores[doc_id] = 0.0

        return scores
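    # A minimal usage sketch (hypothetical values; assumes a simple field
    # descriptor exposing .name and .props, as expected by __init__ above):
    #
    #   class F(object):
    #       def __init__(self, name, **props):
    #           self.name, self.props = name, props
    #
    #   idx = Index(fields=[F("id", stored=True, indexed=True),
    #                       F("text", stored=True, indexed=True)],
    #               similarity="bm25")
    #   idx.add(id="1", text="hello lucene")
    #   idx.commit()
    #   idx.close()
    #   print idx.search("hello lucene", ["text"], ["id"], mode="ANY")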