Example #1
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=["subreddit", "author", "text"]):
    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]

        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
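
A minimal driver for this function might look like the sketch below; the index path and query are placeholders, and the print_results and queryexpansion helpers are assumed to be defined elsewhere in the module. Note that search_index() calls lucene.initVM() itself, so the caller does not have to.

if __name__ == '__main__':
    # hypothetical index directory and query; adjust to the actual project layout
    search_index("lucene.index", "subreddit:askscience black holes",
                 top=10, qe=True,
                 display_fields=["subreddit", "author", "text"])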
Example #2
class GettingQuestionDocs:  # Takes an index path and, via search(), fetches the docs for the question list produced by the answer searcher (Answer Searcher_1).
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, q_ids, limit):
        docs = []
        c = 0
        for i, q_id in enumerate(q_ids):  # Questions that were never indexed cannot be found.
            query = TermQuery(Term("question_id", str(q_id)))
            topdocs = self.searcher.search(
                query, 1).scoreDocs  # Limited to one hit because only the accepted answer is currently indexed.
            # The TermQuery goes to the index searcher, which returns the top n answers attached to the question id; scoreDocs carries the (floating-point) scores.
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc,
                               len(q_ids) - i, doc.get("title"),
                               doc.get("question_id")))

            if len(topdocs) > 0:
                c += 1
                if c >= limit:
                    break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
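
Because search() releases the searcher and closes the directory on its way out, one instance serves exactly one call. A hedged usage sketch (INDICES_PATH and question_ids are placeholders):

getDoc = GettingQuestionDocs(INDICES_PATH + 'questionIndex')
item_docs = getDoc.search(question_ids, 7)  # ResultItems for up to 7 matched questions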
Example #3
    def index(self):
        if not (os.path.exists(self._dataDir)
                and os.path.isdir(self._dataDir)):
            raise IOError, "%s isn't existed or is not a directory" % (
                self._dataDir)

        dir = SimpleFSDirectory(Paths.get(self._indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)
        self.indexDirectory(writer, self._dataDir)
        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
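
The method above delegates to self.indexDirectory(), which is not shown. A minimal sketch of such a helper, assuming the data directory holds plain-text files and using the same old-style Field API as this example (field names are illustrative):

    def indexDirectory(self, writer, dataDir):
        # walk the data directory and add one document per file
        for root, _, names in os.walk(dataDir):
            for name in names:
                path = os.path.join(root, name)
                doc = Document()
                doc.add(Field("path", path, Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                with open(path) as f:
                    doc.add(Field("contents", f.read(), Field.Store.NO,
                                  Field.Index.ANALYZED))
                writer.addDocument(doc)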
Example #4
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_4_text'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    benchsearcher = BenchSearcher(searcher)  # BigCloneBench
    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    bench_result, score_logs_for_each = benchsearcher.more_like_this3(
        5000, score_logs_for_each, user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_result,
                                  key=attrgetter('score'),
                                  reverse=True)

    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_result:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
Example #5
def createind(product,url):
	"This function creates index for lucene"
	global counter
	counter += 1
	adId = counter
	adLine = product
	field_string = chunker(product.lower())
	field_related_words = getDbpediaMatches(product, field_string)

	lucene.initVM()
	# 1. create an index
	index_path = File("Home/WishMatcherIndex")
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
	index = SimpleFSDirectory(index_path)

	# 2. fill the index
	config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(index, config)
	#for title in TITLES:
	import time
	millis = int(round(time.time() * 1000))
	
	userid = str(millis)
	
	doc = Document()
	doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldString", field_string, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(doc)
	print(adId)
	# 3. close resources
	writer.close()
	index.close()	
	return ""
Example #6
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def close(self):
        self.reader.close()
        self.directory.close()
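
Assuming an index whose documents carry name and id fields, usage might look like this sketch (the index path is a placeholder):

searcher = Searcher("table.index")
for table in searcher.find_by_name("customer orders"):
    print(table.get("name"), table.get("id"))
searcher.close()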
Example #7
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    # Log : Answer - Question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    item_docs = getDoc.search(
        question_ids, 20)[0:7]  # Cut to 7 here to get at least 7 questions in rank order.

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log : Similar Question count for each of Question ItemDoc
    i = 1
    if item_docs:
        for item_doc in item_docs:
            similar_question = question.more_like_this2(
                item_doc, 7)  # Find 7 similar questions for each question.
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += ('0' + '\t')
            similar_questions += similar_question
            i += 1
    else:
        hit_logs_for_each += ('0' + '\t' + '0' + '\t' + '0' + '\t' + '0' +
                              '\t' + '0' + '\t' + '0' + '\t' + '0' + '\t'
                              )  # seven placeholder entries

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Similar Question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log : Question - Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer Docs count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_2'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    # Exceptional
    ### 7_Appending for the user query results

    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query,
            0)  # , user_query=user_code_query)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += ('0' + '\t')
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if len(answer_docs) < 49:
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += ('0' + '\t')

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_results,
                                  key=attrgetter('score'),
                                  reverse=True)

    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
Example #8
writer = IndexWriter(index, config)

def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

create_index()

#writer.deleteAll()
writer.close()
index.close()

### retrieval

index = SimpleFSDirectory(File(sys.argv[1]))
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

def get_query_results(reader,query,n,field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))
Example #9
def query_index(query):
    ### 1_Query Alternation
    user_code_query = Generator(query)
    print 'query: ', query
    print 'user_code_query: ', user_code_query

    #open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(10, query=user_code_query)
    print 'answer_ids: ', answer_ids

    #close
    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    print 'question ids: ', question_ids

    #open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Cutting items
    getDoc = GettingQuestionDocs(searcher)
    item_docs = getDoc.search(
        question_ids, 10)[0:3]  # Cut to the top 3 questions in rank order.
    # print 'item docs: ', item_docs

    ### 5_Finding 3 Similar Questions per Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    if item_docs:
        for item_doc in item_docs:
            similar_question = question.more_like_this2(
                item_doc, 3)  # Find 3 similar questions for each question.
            similar_questions += similar_question

    print 'similar_questions: ', similar_questions

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 6_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)
    print 'answer ids: ', answer_ids

    if not answer_ids:
        recommended = ''
        return recommended
        # dest_path = u'/Users/Falcon/Desktop/***Ongoing***/***[4]_FaCoY_Defect4J_Data_Share_Kui/Defect4J_Results/'
        # project_name = u'Chart/'###################################################
        # write_file()

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 7_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    # print 'answer docs: ', answer_docs

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    directory = SimpleFSDirectory(File(INDICES_PATH + 'github'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    git_results = []
    gitsearcher = GitSearcher(searcher)

    ### 7_Appending for the user query results
    git_result = gitsearcher.more_like_this2(10, answer_docs[0],
                                             user_code_query, 1)
    git_results += git_result

    # print 'answer docs: ', answer_docs

    ### 8_Querying for the Final Results
    for answer_doc in answer_docs:
        git_result = gitsearcher.more_like_this2(10, answer_doc,
                                                 user_code_query, 0)
        git_results += git_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    git_results = sorted(git_results,
                         key=attrgetter('so_item.answer_id'),
                         reverse=True)
    id = 0
    i = 0
    temp_result = []
    for item in git_results:
        if id != item.so_item.answer_id:
            id = item.so_item.answer_id
            i = 1
            temp_result.append(item)

        elif id == item.so_item.answer_id and i < 3:
            i += 1
            temp_result.append(item)
        elif id == item.so_item.answer_id and i > 3:
            continue

    sorted_git_results = sorted(temp_result,
                                key=attrgetter('score'),
                                reverse=True)

    print 'Search Count : ', len(sorted_git_results)
    recommended = recommend(sorted_git_results)
    print 'Final Count : ', len(recommended)

    # Save the Defect4J query results
    # cot = 0
    # for c, item in enumerate(recommended):
    #     cot += 1
    #     if cot > 10:
    #         break
    #     result_file = "/Users/Falcon/Desktop/Pycharm_Project/FaCoY_Project/GitSearch/Defect4J_FaCoY/" + str(c+1) + "_" + str('_'.join(str(item[0]).split("/")[6:]))
    #     write_file_over(result_file, str(item.file_content))

    # result_file = '/Users/Falcon/Desktop/test.txt'
    # if os.path.exists(result_file):
    #     os.remove(result_file)
    #
    # write_file(result_file, 'User Code Query \n' + str(query) + '\n' + '---------------------------' + '\n')
    # for c, i in enumerate(recommended):
    #     contents = ''
    #     contents = 'Rank: %d' % (int(c)+int(1))
    #     contents += '\nFile path: %s' % str(i.file[6:]) + '\n' + '---------------------------' + '\n'
    #     contents += str(i.file_content) +'\n' + '=================================================================' + '\n\n\n'
    #     write_file(result_file, contents)

    return recommended
Example #10
    v_orig2 = v
    v = unicodedata.normalize('NFKD', v).encode('ascii', 'ignore')
    v_orig = v.strip()
    v = v.lower().strip()
    doc.add(Field("wiki_id", str(k), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
    doc.add(
        Field("wiki_name_orig", str(v_orig), Field.Store.YES,
              Field.Index.NOT_ANALYZED))
    doc.add(
        Field("wiki_name", str(v), Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(
        Field("wiki_name_analyzed", str(v), Field.Store.YES,
              Field.Index.ANALYZED))
    v_punct_removed = re.sub(' +', ' ', regex.sub(' ', v)).strip()
    doc.add(
        Field("wiki_name_analyzed_nopunct", str(v_punct_removed),
              Field.Store.YES, Field.Index.ANALYZED))
    v_stop_removed = " ".join(
        [x for x in nltk.word_tokenize(v_punct_removed) if x not in stop])
    doc.add(
        Field("wiki_name_analyzed_nopunct_nostop", str(v_stop_removed),
              Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    i = i + 1
    if i % 10000 == 0:
        print 'finished ', i
print 'num errors while indexing ', num_errors
writer.close()
index.close()
Example #11
class GettingAnswerDocs:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, a_ids):
        docs = []

        # #For exceptional UQ
        # if a_ids[0] == '0':
        # 	query = TermQuery(Term("answer_id", str(a_ids)))
        # 	print query
        # 	topdoc = self.searcher.search(query, 1).scoreDocs
        #
        # 	doc = self.searcher.doc(topdoc[0][0].doc)
        # 	docs.append(doc, 0, 'No Title', 'No Question id', 'No Answer id', 'No Description')
        # 	return docs
        # else:

        for i, a_id in enumerate(a_ids):
            query = TermQuery(Term("answer_id", str(a_id)))
            topdocs = self.searcher.search(query, 1).scoreDocs

            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc,
                               len(a_ids) - i, doc.get("title"),
                               doc.get("question_id"), doc.get("answer_id"),
                               doc.get("description")))

            if len(topdocs) > 0:
                i += 1
                if i > a_ids.__len__():
                    break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
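
Like GettingQuestionDocs, this class tears down its searcher and directory inside search(), so each instance handles a single lookup. A hedged sketch (the path and id list are placeholders):

getDoc = GettingAnswerDocs(INDICES_PATH + 'stackoverflow')
answer_docs = getDoc.search(answer_ids)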
Example #12
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True
        # Initialize lucene and JVM
        lucene.initVM()
        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    #Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date  (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        #Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        #Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')

        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())

        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Float)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product

        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Float)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)

        return vectorMagnitude

    ##################################################
    #Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document to be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document into the Lucene's Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()

        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `scattered`: Return the matrix in scatter (list-of-rows) form; only applies when byTerms is True. (Boolean)
        - `byTerms`: Key the matrix by term (True) or by document (False). (Boolean)
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()
        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        #Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList,
                      key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Runs named-entity recognition over a document and returns the Geo-Political Entities found as a feature collection.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        #ProgressBar
        print("Analazing Document {0}".format(docIdx))

        pB = ProgressBar(len(sentences), prefix='Progress:')
        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()
        gpeList = geolocator.GetFeatureCollection(gpeList)

        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()

        return content
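
A short usage sketch for this class; the document values below are invented, and debug=True keeps the index in RAM so the run leaves nothing on disk.

idx = Indexer(debug=True, verbose=True)
idx.IndexDocs([{
    'name': 'hello-world',
    'content': 'A first document about Lucene indexing.',
    'date': '2020-01-01T12:00:00Z',  # must match the '%Y-%m-%dT%H:%M:%SZ' format
    'url': 'http://example.com/hello',
    'tags': ['lucene', 'demo'],
}])
idx.Search('lucene', field=Indexer.CONTENT)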
Example #13
class SnippetSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

        # index = SimpleFSDirectory(indexDir)
        # self.reader = IndexReader.open(index)
        # self.searcher = SearcherFactory.newSearcher(self.reader)

    def get_matched_keywords(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                # print '4. Right after the regex handling : ', query
                like_query = queryparser.parse(query)
                # print '5. Right after the Lucene parser : ', like_query

                hits = self.searcher.search(like_query, result_num).scoreDocs
                # filterScoreDosArray = hits.topDocs().scoreDocs;

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                    result.append(doc.get("answer_id"))

            except Exception as e:
                print "AnswerSearcher: Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result

    def find_question_ids(self, answer_ids):
        result_list = []
        for id in answer_ids:
            # print "Answer id : ", id, " /// ", ;
            query = "SELECT parentID from posts where id = %s" % id
            question_id = DBManager.requestOneColumnQuery(query)
            result_list.append(question_id[0])
            # print "Question id : ", question_id[0]
        return result_list


# if __name__ == '__main__':
# 	query = """
# 	typed_method_call:FTPClient.setControlEncoding typed_method_call:FTPClient.login typed_method_call:FTPClient.disconnect typed_method_call:FTPClient.enterLocalPassiveMode typed_method_call:FTPClient.isConnected typed_method_call:FTPClient.setFileType typed_method_call:FTPClient.connect typed_method_call:FTPClient.storeFile typed_method_call:FTPClient.logout typed_method_call:FTPClient.changeWorkingDirectory typed_method_call:Log.e typed_method_call:File.getName typed_method_call:FTPClient.makeDirectory typed_method_call:FileInputStream.close used_classes:FTP used_classes:Log used_classes:FTPClient used_classes:FileInputStream used_classes:boolean class_instance_creation:FTPClient class_instance_creation:FileInputStream methods:uploadFile methods:login methods:FTPConnector methods_called:disconnect methods_called:makeDirectory methods_called:setFileType methods_called:getName methods_called:e methods_called:isConnected methods_called:login methods_called:storeFile methods_called:enterLocalPassiveMode methods_called:logout methods_called:changeWorkingDirectory methods_called:close methods_called:setControlEncoding methods_called:connect literals:LOGIN ERROR literals:UTF-8 literals:Artbit3 literals:FTP_UPLOAD literals:artbit123 literals:FTP_CONNECT literals:music_upload
# 	"""
#
# 	answer = SnippetSearcher("%sstackoverflow" % (INDICES_PATH), query)
#
# 	#Retrieve answer posts whose snippets resemble the user's code query
# 	answer_ids = answer.more_like_this(10, query=query)
# 	print answer_ids
#
# 	#Find the question post ids corresponding to each retrieved answer post
# 	question_ids = answer.find_question_ids(answer_ids)
# 	print question_ids
Example #14
class PyLucene:
    """
        PyLucene module api
    """
    
    def __init__(self, startJVM=False):
        if startJVM:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        
        self.STORE_DIR = "index_dir"
        self.store = SimpleFSDirectory(File(self.STORE_DIR)) 
        
        tmp_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) 
        self.analyzer = LimitTokenCountAnalyzer(tmp_analyzer, 10000) 
        
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.store, config)
    
    def close_store(self):
        self.store.close()

    def index_doc(self, doc_dict):
        """
          Index a doc to pylucene
          
          obs.: docid is a string not an integer
        """
        
        doc = Document()
        
        doc.add(Field("doc_id", doc_dict["doc_id"], TextField.TYPE_STORED))
        doc.add(Field("general_info", doc_dict["general_info"], TextField.TYPE_NOT_STORED))
        doc.add(Field("subject", doc_dict["subject"], TextField.TYPE_NOT_STORED))
        doc.add(Field("source", doc_dict["source"], TextField.TYPE_NOT_STORED))
        doc.add(Field("initial_date", doc_dict["initial_date"], TextField.TYPE_NOT_STORED))
        doc.add(Field("final_date", doc_dict["final_date"], TextField.TYPE_NOT_STORED))
        
        body_text = doc_dict["content"]
        body_reader = StringReader(body_text)
        doc.add(Field("content", body_reader))
        
        self.writer.addDocument(doc)
        
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        
        self.writer.commit()
        
        ticker.tick = False
        print 'done'
        
    def search_docs(self, value, field="general_info"):
        MAX_RESULTS = 1000
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        query = QueryParser(Version.LUCENE_CURRENT, field,
                            self.analyzer).parse(value)
        topDocs = searcher.search(query, MAX_RESULTS)
        
        return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
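
A hedged usage sketch (field values are invented); note that index_doc() commits after every document, which is safe but slow for bulk loads.

engine = PyLucene(startJVM=True)
engine.index_doc({
    "doc_id": "1",
    "general_info": "sample document",
    "subject": "lucene",
    "source": "unit test",
    "initial_date": "2020-01-01",
    "final_date": "2020-12-31",
    "content": "Full text of the sample document.",
})
for hit in engine.search_docs("sample"):
    print hit.get("doc_id")
engine.close_store()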
Example #15

# In[81]:

doc.add(Field('text', texts, Field.Store.NO, Field.Index.ANALYZED))


# In[82]:

writer.addDocument(doc)


# In[83]:

f.close()


# In[75]:

f = open('gutenberg/austen-sense.txt')


# In[84]:

writer.close()


# In[86]:

index_dir.close()
Example #16
        document = isearcher.doc(hits[i].doc)
        fieldoutput = " | ".join([str(document.get(field)) for field in display_fields])
        print("#{})\t".format(i+1) + fieldoutput + "\n")


if __name__ == '__main__':
    lucene.initVM()
    indexdir = "/home/keerthana/Downloads/project-information-retrieval-master/src/lucene.index"

    lindex = SimpleFSDirectory(Paths.get(indexdir))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()


    parser = QueryParser(input("Enter your field :"), analyser)
    query = parser.parse(input("Enter Your SearchQuery : "))

    hits = isearcher.search(query, 10).scoreDocs
    print(hits)
    for i in range(len(hits)):
        print(i, hits[i])
        hitDoc = isearcher.doc(hits[i].doc)
        print("{} || {} || {}".format(hitDoc.get("subreddit"), hitDoc.get("id"), hitDoc.get("text")))
    if len(hits) == 0:
        print("No hits!")

    ireader.close()
    lindex.close()
Example #17
class BenchSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute.class_)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string """
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []

                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords
                    temp_4 = []

                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  # "used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder', 'write',
                        'toString', 'close', 'mkdir', 'exists'
                    ]

                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, item_doc, score_logs_for_each, user_query,
                        flag):  # flag: 1 = use the user query (UQ), 0 = build the query from item_doc
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
            # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None)

        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    if content:
                        item = BenchResultItem(doc.get("file"), content,
                                               matched_terms,
                                               hit.score, item_doc,
                                               doc.get("line_numbers"),
                                               hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.release()
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return bench_result, score_logs_for_each

    def more_like_this3(self, limit, score_logs_for_each, user_query):
        query = ""
        bench_result = []
        # if not item_doc:
        # 	item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))
        # if item_doc.doc:
        # 	query += self.document_to_query(item_doc.doc)

        query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    score_logs_for_each += str(round(hit.score, 2)) + '\n'
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return bench_result, score_logs_for_each
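
Following the call pattern of Example #4, a BenchSearcher built from an index path can be queried directly with a user query string; a hedged sketch (the index path and query are placeholders, and more_like_this3() closes the index when it returns):

benchsearcher = BenchSearcher(INDICES_PATH + 'bigclonebench_4_text')
results, score_logs = benchsearcher.more_like_this3(
    100, '', 'typed_method_call:FTPClient.connect used_classes:FTPClient')
for item in results:
    print item.score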