Example #1
0
    def retrieve_sents(self):
        """Return the Lucene doc ids of the sentences matching ``self.query``.

        The query string is parsed against the ``contents`` field with a
        ``StandardAnalyzer`` and OR as the default operator, then run against
        the index stored under ``self.indexDir``.  At most 50 hits are
        requested.  When ``self.stats`` is truthy, the hit count, the search
        duration and the parsed query are reported on stderr.

        Returns:
            list: ``scoreDoc.doc`` of every hit, in the order Lucene returned
            them.
        """
        # The reader keeps index files open; release it once the hits have
        # been collected instead of leaking it on every call.
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(self.indexDir)))
        try:
            searcher = IndexSearcher(reader)

            parser = QueryParser("contents", StandardAnalyzer())
            parser.setDefaultOperator(QueryParser.Operator.OR)
            query = parser.parse(self.query)

            start = datetime.now()
            scoreDocs = searcher.search(query, 50).scoreDocs
            duration = datetime.now() - start
            if self.stats:
                print("Found %d sentences (in %s) that matched query '%s':" % (len(scoreDocs), duration, query),
                      file=sys.stderr)

            return [scoreDoc.doc for scoreDoc in scoreDocs]
        finally:
            reader.close()
Example #2
0
def _first_node_text(xmldoc, tag):
  """Return the value of the first child of the first <tag> element, or ""."""
  elements = xmldoc.getElementsByTagName(tag)
  # An element that is absent or empty (e.g. <TITLE/>) has no text to index.
  if not elements or not elements[0].childNodes:
    return ""
  return elements[0].childNodes[0].nodeValue


def index(indexdir):
  """Index the documents listed in 'data/docid.documento-xml.txt'.

  Each input line is "<id>\\t<xml>".  The TITLE, AUTHORS and ABSTRACT
  elements of the XML are stored in the analyzed fields "title", "authors"
  and "abstract"; the id is stored verbatim in the non-analyzed field "id".
  A progress line is printed after every document.

  Args:
    indexdir: directory in which the Lucene index is written.
  """
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  # Stream the file line by line and make sure the handle is released.
  with open('data/docid.documento-xml.txt') as f:
    for i, line in enumerate(f):
      # Split on the first tab only: the XML payload may itself contain tabs.
      doc_id, xmltext = line.split('\t', 1)
      xmldoc = minidom.parseString(xmltext.rstrip('\n'))
      doc = Document()
      for field_name, tag in (("title", "TITLE"),
                              ("authors", "AUTHORS"),
                              ("abstract", "ABSTRACT")):
        doc.add(Field(field_name, _first_node_text(xmldoc, tag),
                      Field.Store.YES, Field.Index.ANALYZED))
      doc.add(Field("id", doc_id, Field.Store.YES, Field.Index.NOT_ANALYZED))
      writer.addDocument(doc)
      print("indexed %s docs" % (i + 1))

  writer.close()
Example #3
0
def createWriter(index_dir):
    """Return an IndexWriter on ``index_dir`` using a default configuration.

    Before the writer is created, the available codecs and the codec reported
    by the default ``IndexWriterConfig`` are printed.
    """
    target = SimpleFSDirectory(File(index_dir).toPath())
    config = IndexWriterConfig()
    print(Codec.availableCodecs())
    print(f"Codec : {config.getCodec()}")
    return IndexWriter(target, config)
Example #4
0
 def load_index(self):
     """Open the index at ``self.index_path`` and report its size.

     Sets ``self.reader`` and ``self.searcher`` and prints the number of
     documents the reader reports.
     """
     self.reader = IndexReader.open(SimpleFSDirectory(File(self.index_path)))
     doc_count = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % doc_count)
Example #5
0
    def __init__(self, store_dir, analyzer, db_path):
        """Open a fresh index writer and load the wiki document ids.

        Args:
            store_dir: directory of the Lucene index; created when missing.
                The writer is opened in CREATE mode.
            analyzer: analyzer handed to the ``IndexWriterConfig``.
            db_path: path passed to ``DocDB``.
        """
        self.write_type = True
        self.spacy_number_types = ['DATE', 'CARDINAL', 'QUANTITY', 'MONEY',
                                   'TIME', 'PERCENT', 'ORDINAL']

        # makedirs(exist_ok=True) also creates missing parent directories and
        # has no window between the existence check and the creation.
        os.makedirs(store_dir, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(store_dir))
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(store, config)

        # TODO checksum
        self.wiki_db = DocDB(db_path=db_path)

        print('Getting docs..', db_path)
        self.doc_ids = self.wiki_db.get_ner_doc_ids(limit=None)
        print('# wiki docs', len(self.doc_ids))
        # NOTE(review): hard-coded expected corpus size; the check is skipped
        # when Python runs with -O.
        assert len(self.doc_ids) == 5075182

        # Entity vocabularies; index 0 is reserved for the unknown token.
        self.UNK = 'UNK'
        self.entity2idx = {self.UNK: 0}
        self.idx2entity = {0: self.UNK}
        self.entitytype2idx = {self.UNK: 0}
        self.entity_dict = dict()
        self.num_entities_max = -1
        print('Init. Done')
Example #6
0
 def __init__(self, indexDir):
     """Open a searcher on ``indexDir`` and set up the per-field weights.

     ``FIELDS[0]`` gets weight 1 and ``FIELDS[1]`` gets weight 0.2; both are
     stored in the ``HashMap`` kept in ``self._weights``.
     """
     self._dir = SimpleFSDirectory(Paths.get(indexDir))
     self._indexSearcher = IndexSearcher(DirectoryReader.open(self._dir))
     self._weights = HashMap()
     for position, weight in ((0, 1), (1, 0.2)):
         self._weights.put(FIELDS[position], weight)
Example #7
0
    def __init__(self, store_dir, hits_dir, frags_dir=None):
        """Create the working directories and open an index writer.

        Args:
            store_dir: location of the generated Lucene index.
            hits_dir: location of the highlighted document hits.
            frags_dir: optional location of the document hit fragments.
        """
        self.store_dir = store_dir
        self.hits_dir = hits_dir
        self.frags_dir = frags_dir

        # The two mandatory directories are created on demand.
        for required in (self.store_dir, self.hits_dir):
            if not os.path.exists(required):
                os.mkdir(required)

        # The fragments directory is only handled when one was supplied.
        if self.frags_dir is not None:
            if not os.path.exists(self.frags_dir):
                os.mkdir(self.frags_dir)

        self.directory = SimpleFSDirectory(File(self.store_dir))

        # Only the StandardAnalyzer is used for now.
        self.analyzer = StandardAnalyzer(Version.LUCENE_43)
        self.writer = IndexWriter(
            self.directory,
            IndexWriterConfig(Version.LUCENE_43, self.analyzer))
Example #8
0
def run(command):
    """Search index "index1" and return highlighted hits for ``command``.

    ``command`` is passed through ``analysis()`` and parsed against the
    "contents" field with a whitespace analyzer.  Up to 500 hits are
    fetched; each hit is also printed.

    Returns:
        list of dicts with the keys 'url', 'title' and 'highlight'.
        'highlight' is the best fragment with all spaces removed, or None
        when there is no fragment (or no stored "contents" value).
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    # Close the reader when done; it was previously leaked on every call.
    reader = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(analysis(command))
        highlighter = Highlighter(SimpleHTMLFormatter(), QueryScorer(query))
        scoreDocs = searcher.search(query, 500).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        result = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print("path: %s name: %s url: %s title: %s" % (
                doc.get("path"), doc.get("name"), doc.get("url"),
                doc.get("title")))
            text = doc.get('contents')
            highLightText = None
            # A document without stored contents has nothing to highlight.
            if text is not None:
                highLightText = highlighter.getBestFragment(
                    analyzer, "contents", text)
            if highLightText is not None:
                # Drop the spaces between the whitespace-separated tokens.
                highLightText = highLightText.replace(' ', '')
            result.append({
                'url': doc.get("url"),
                'title': doc.get('title'),
                'highlight': highLightText,
            })
        return result
    finally:
        reader.close()
Example #9
0
def run_img(command):
    """Search index "index2" for images matching ``command``.

    ``command`` is parsed against both the "urlcontent" and the "title"
    field; a document matches when either query matches.  Up to 50 hits are
    fetched and each hit's title is printed.

    Returns:
        list of dicts with the keys 'title', 'url' and 'imgurl'.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    # Close the reader when done; it was previously leaked on every call.
    reader = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        querys = BooleanQuery()
        for field in ("urlcontent", "title"):
            field_query = QueryParser(Version.LUCENE_CURRENT, field,
                                      analyzer).parse(command)
            querys.add(field_query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        if len(scoreDocs) == 0:
            print("WARNING: No result")
        result = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print(doc.get("title"))
            result.append({
                'title': doc.get('title'),
                'url': doc.get('url'),
                'imgurl': doc.get('imgurl'),
            })
        return result
    finally:
        reader.close()
Example #10
0
def search_loop(index_dir, field="contents", explain=False):
    """Interactively query the index at ``index_dir`` until an empty line.

    Args:
        index_dir: directory of the Lucene index.
        field: default field the queries are parsed against.  For 'web' the
            "web" and "raw" fields and the score are printed, otherwise the
            "path" and "name" fields.
        explain: when true, the score explanation of every hit is printed.
    """
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir)))
    try:
        searcher = IndexSearcher(reader)
        # The parser does not depend on the query text, so build it once.
        parser = QueryParser(field, StandardAnalyzer())
        print("Hit enter with no input to quit.")
        while True:
            command = input("Query:")
            if command == '':
                return
            print("Searching for: %s" % command)
            query = parser.parse(command)
            scoreDocs = searcher.search(query, 50).scoreDocs
            print("%s total matching documents." % len(scoreDocs))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                if field == 'web':
                    print(
                        f'{doc.get("web")} | {doc.get("raw")} | {scoreDoc.score}')
                else:
                    print('path:', doc.get("path"), 'name:', doc.get("name"))
                if explain:
                    print(searcher.explain(query, scoreDoc.doc))
                    print('------------')
    finally:
        # Release the index files when the loop ends, however it ends.
        reader.close()
Example #11
0
 def GET(self, name):
     """Handle a search request against the "good" and "bad" indexes.

     Reads the ``shop`` and ``brand`` request inputs, scores the query
     through ``Run_Score`` using one searcher per index, appends the
     processed query to the result list and renders ``SearchResult``.
     """
     STORE_DIR_GOOD = "index_good"
     STORE_DIR_BAD = "index_bad"
     vm_env.attachCurrentThread()
     # Both readers are closed once the page has been rendered; they used to
     # be leaked on every request.
     reader_good = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR_GOOD)))
     try:
         reader_bad = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR_BAD)))
         try:
             searcher_good = IndexSearcher(reader_good)
             searcher_bad = IndexSearcher(reader_bad)
             analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
             user_data = web.input(name=None)
             command = yourInput(user_data.shop)
             res = Run_Score(searcher_good, searcher_bad, analyzer, command,
                             user_data.brand)
             res.append(command)
             return render.SearchResult(res)
         finally:
             reader_bad.close()
     finally:
         reader_good.close()
Example #12
0
    def __init__(self, root, storeDir, analyzer, type="html"):
        """Index ``root`` into a new Lucene index stored at ``storeDir``.

        Args:
            root: passed to the selected indexing routine.
            storeDir: index directory, created when missing.  The writer is
                opened in CREATE mode.
            analyzer: base analyzer; it is wrapped in a
                ``LimitTokenCountAnalyzer`` with a limit of 1048576.
            type: "html" selects ``index_html``, "image" selects
                ``index_image``.

        Raises:
            KeyError: if ``type`` is neither "html" nor "image".
        """
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.load_stop_words([
            "CNstopwords.txt",
            "ENstopwords.txt",
        ])
        self.html2text = HTML2Text()
        self.html2text.ignore_links = True
        self.html2text.ignore_images = True
        type_to_index = {
            "html": self.index_html,
            "image": self.index_image,
        }
        type_to_index[type](root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        try:
            writer.commit()
            writer.close()
        finally:
            # Always clear the flag, even if commit/close raises, so the
            # ticker thread is not left with tick still set.
            ticker.tick = False
        print('done')
Example #13
0
def get_candidates(qatp):
    """Return, for every sample in ``qatp``, the ids of its candidate docs.

    Args:
        qatp: iterable of 4-tuples whose first element is the question text;
            the other three elements are not used here.

    Returns:
        list of lists: for each sample, the stored "id" field of up to
        ``prm.max_candidates`` documents matching the question on the "text"
        field.
    """
    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    try:
        searcher = IndexSearcher(reader)
        # The parser does not depend on the question, so build it once.
        parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)
        candidates = []
        for n, (q, _a, _t, _p) in enumerate(qatp):
            if n % 100 == 0:
                print('finding candidates sample %d' % n)

            # Backslash-prefix the boolean keywords, then escape the query
            # syntax characters, before handing the text to the parser.
            q = q.replace('AND', '\\AND').replace('OR',
                                                  '\\OR').replace('NOT', '\\NOT')
            query = parser.parse(QueryParser.escape(q))
            hits = searcher.search(query, prm.max_candidates)
            candidates.append(
                [searcher.doc(hit.doc).get("id") for hit in hits.scoreDocs])
        return candidates
    finally:
        # Release the index files; the reader used to be leaked.
        reader.close()
Example #14
0
def create_index():
    """Rebuild the index in ``prm.index_folder`` from the wiki pages.

    Any existing index folder is removed first.  Every text yielded by
    ``wiki.Wiki(prm.pages_path).get_text_iter()`` becomes one document with
    the fields "text" and "id" (a running number starting at 0).
    """
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    writer = IndexWriter(
        SimpleFSDirectory(File(prm.index_folder)),
        IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()))
    wk = wiki.Wiki(prm.pages_path)

    print("%d docs in index" % writer.numDocs())
    print("Reading files from wikipedia...")
    count = 0
    for count, text in enumerate(wk.get_text_iter(), 1):
        # ``count`` is the number of articles seen so far; ids are 0-based.
        doc = Document()
        doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(count - 1), Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)
        if count % 100000 == 0:
            print('indexing article' + ' ' + str(count))
    print("Indexed %d docs from wikipedia (%d docs in index)" % (
        count, writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def create_index_for_wiki_sentence(filename, path, firstTime=False):
    """Open, and optionally populate, the on-disk wiki-sentence index.

    Args:
        filename: overwritten with '_wiki_sentence' before it is used, so
            the passed value has no effect.
        path: forwarded to ``get_wiki_data``.
        firstTime: when true, one document per wiki-data entry is added to
            the index.

    Returns:
        The ``SimpleFSDirectory`` of the index folder under ``HOMEPATH``.
    """
    logging.info('Start create wiki_sentence!')
    wiki_dict = get_wiki_data(path)

    logging.info('Start creating index!')
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()

    # The index lives in a folder below HOMEPATH, created on demand.
    storeDir = os.path.join(HOMEPATH, "IndexFiles" + filename + ".index")
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))

    if not firstTime:
        logging.info('Finish creating index wiki_sentence!')
        return directory

    iwriter = index.IndexWriter(directory, index.IndexWriterConfig(analyzer))
    for cnt, (key, sentence) in enumerate(wiki_dict.items()):
        if cnt % 1000 == 0:
            logging.info(
                'I have preprocessed {} index in creating index by document!'
                .format(str(cnt)))
        # Each key carries (original title, preprocessed title, doc id).
        org_title, preprocessed_title, doc_id = key[0], key[1], key[2]
        iwriter.addDocument(
            create_document_by_document_sentence(org_title,
                                                 preprocessed_title,
                                                 doc_id, sentence))
    iwriter.close()
    logging.info('Finish creating index wiki_sentence!')
    return directory
Example #16
0
def irsolver(data_file, index):
    """Pick, for every question, the answer option with the highest score.

    Args:
        data_file: passed to ``questions.get_input_data``.
        index: directory of the Lucene index to search.

    Returns:
        tuple ``(idx, pred)``: the question ids as returned by
        ``get_input_data`` and one letter 'A'..'D' per question.  'A' is
        predicted when no option scores above -1000000.
    """
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    try:
        searcher = IndexSearcher(reader)
        pred = []
        mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

        idx, ques, ans = get_input_data(data_file)
        for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
            max_score = -1000000
            best_ans = 'A'
            for i, ai in enumerate(a):
                sc = query(q, ai, analyzer, searcher)
                print(acm, i, sc)
                # Strict comparison: the first of several equal scores wins.
                if sc > max_score:
                    max_score = sc
                    best_ans = mapp[i + 1]
            pred.append(best_ans)

        return idx, pred
    finally:
        # Release the index files; the reader used to be leaked.
        reader.close()
Example #17
0
def func2(command):
    """Search index "index1" by ingredient and rank hits by collect count.

    ``command`` is parsed against the "zhuliao" field with a whitespace
    analyzer and up to 9 hits are fetched.

    Returns:
        None when ``command`` is empty.  Otherwise a list of tuples
        ``(name, collect_num, zhuliao, zuofa, img_url, url)`` sorted by
        ``collect_num`` (as int) in descending order, where ``zhuliao`` is
        split on spaces and ``zuofa`` on newlines.  Documents that lack one
        of the split fields or have a non-numeric ``collect_num`` are
        skipped.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Nothing to search for: bail out before the index is opened.
    if command == '':
        return None

    reader = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                            analyzer).parse(command)
        rows = []
        for scoreDoc in searcher.search(query, 9).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            try:
                rows.append((
                    doc.get("name"),
                    int(doc.get("collect_num")),
                    doc.get("zhuliao").split(' '),
                    doc.get("zuofa").split('\n'),
                    doc.get("img_url"),
                    doc.get("url"),
                ))
            except (AttributeError, TypeError, ValueError):
                # A stored field is missing (None) or collect_num is not a
                # number: skip this document, keep the remaining hits.
                continue
    finally:
        # Release the index files; the reader used to be leaked.
        reader.close()
    # No ``cmp`` argument: it is redundant and not accepted by Python 3.
    return sorted(rows, key=lambda row: row[1], reverse=True)
Example #18
0
 def __init__(self, indexDir):
     """Open a searcher on ``indexDir`` and build the two query parsers.

     ``self.nameQueryParser`` targets the 'name' field and
     ``self.idQueryParser`` the 'id' field; both use their own
     ``StandardAnalyzer`` and AND as the default operator.
     """
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
     for attribute, field in (('nameQueryParser', 'name'),
                              ('idQueryParser', 'id')):
         field_parser = QueryParser(field, StandardAnalyzer())
         setattr(self, attribute, field_parser)
         field_parser.setDefaultOperator(QueryParser.Operator.AND)
Example #19
0
def func1(command):
    """Search index "index" for ``command`` and return the matching pages.

    The command is segmented with ``jieba.cut`` and the space-joined
    segments are parsed against the "contents" field with a whitespace
    analyzer.  Up to 50 hits are fetched.

    Returns:
        list of dicts with the keys 'title', 'url' and 'sentence'; an empty
        list when ``command`` is empty.
    """
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    # Nothing to search for: bail out before the index is opened.
    if command == '':
        return []

    # ``del searcher`` does not close the underlying reader, so the reader
    # is closed explicitly once the results have been collected.
    reader = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        command = " ".join(jieba.cut(command))
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        result = []
        for scoreDoc in searcher.search(query, 50).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            result.append({
                'title': doc.get("title"),
                'url': doc.get("url"),
                "sentence": doc.get("sentence")
            })
        return result
    finally:
        reader.close()
Example #20
0
def process_q_test(q, out_q):
    """Worker loop: turn queued queries into document snippets.

    Reads ``(qid, query)`` pairs from ``q`` until ``exitFlag`` is set or the
    query "DONE" arrives, and puts ``(qid, snippets)`` on ``out_q`` for each
    query.  A query that fails is reported on stdout and skipped.

    Args:
        q: input queue of ``(qid, query)`` pairs.
        out_q: output queue receiving ``(qid, dids_text)`` pairs.
    """
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    try:
        searcher = IndexSearcher(index)
        searcher.setSimilarity(BM25Similarity())
        analyzer = EnglishAnalyzer()
        qparser = QueryParser("contents", analyzer)
        preprocessor = Preprocess()
        # Looked up once; used only in the failure message below.
        tname = multiprocessing.current_process().name

        while not exitFlag:
            qid, query = q.get()
            if query == "DONE":
                break

            try:
                dids_text = get_lm_doc_snippets(query, searcher, qparser,
                                                analyzer, preprocessor)
                out_q.put((qid, dids_text))
            except Exception:
                # Best effort per query, but let KeyboardInterrupt and
                # SystemExit propagate so the worker can be stopped.
                print('%s exception %s, %s' % (tname, qid, query))
    finally:
        # Release the index files when the worker finishes.
        index.close()
Example #21
0
def wikipedia_indexer(storage, wikipedia_file):
    """Index a tab-separated wikipedia dump into ``storage``.

    Every line of ``wikipedia_file`` is "<title>\\t<text>".  Lines without a
    text column and lines whose title contains 'disambigu' are skipped.  The
    line number is stored in "num", the title is stored and analyzed, and
    the text is analyzed but not stored.

    Args:
        storage: directory of the Lucene index.
        wikipedia_file: path of the UTF-8 encoded input file.
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading Documents")

    # The context manager releases the input file, which used to stay open.
    with open(wikipedia_file) as f:
        for i, line in enumerate(f):
            columns = line.strip().decode('utf-8').split('\t')
            title = columns[0]
            if 'disambigu' in title or len(columns) < 2:
                continue
            text = columns[1]
            doc = Document()
            doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
            doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
            # Ask the writer for its size once per document, not twice.
            indexed = writer.numDocs()
            if indexed % 1000 == 0:
                print("Indexed (%d docs in index) Last %d" % (indexed, i))

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def main():
    """Index the code snippets into the hard-coded github2 directory.

    Fields "code" and "comments" get their own analyzers; every other field
    falls back to a ``KeywordAnalyzer``.  Index corruption, lock and I/O
    exceptions are reported through ``printStackTrace``.
    """
    INDEX_DIR = "indexes"
    try:
        print("Indexing...")
        target = File("/Users/Raphael/Downloads/github2")

        fallback_analyzer = KeywordAnalyzer()
        per_field = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        config = IndexWriterConfig(
            Version.LUCENE_CURRENT,
            PerFieldAnalyzerWrapper(fallback_analyzer, per_field))
        writer = IndexWriter(SimpleFSDirectory(target), config)

        index_code_snippet(writer)

        writer.close()
    except (CorruptIndexException,       # index is corrupt
            LockObtainFailedException,   # another writer holds the index
            IOException) as e:           # directory can't be read/written
        e.printStackTrace()
	def search(self, index_dir):
		"""Run ``self.query`` on the index at ``index_dir``.

		The query is parsed against ``FIELD_CONTENTS`` with a
		``StandardAnalyzer`` and at most ``self.retrieve_count`` hits are
		fetched.

		Returns:
			list of int: the ``FIELD_PATH`` value of every hit converted
			with ``int``, in hit order.
		"""
		# Get a handle to the index directory and open a reader on it.
		directory = SimpleFSDirectory(File(index_dir))
		ireader = DirectoryReader.open(directory)
		try:
			searcher = IndexSearcher(ireader)
			analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

			# The parser is told which field to search by default.
			queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
			query = queryParser.parse(self.query)

			# Run the query and keep the top self.retrieve_count results.
			scoreDocs = searcher.search(query, self.retrieve_count).scoreDocs

			return [int(searcher.doc(scoreDoc.doc).get(FIELD_PATH))
					for scoreDoc in scoreDocs]
		finally:
			# Release the index files; the reader used to be leaked.
			ireader.close()
Example #24
0
 def publish_services(self, service_list):
     """Index every WSDL in ``service_list`` into the "index/" directory.

     Each service is transformed with ``WSDLTransformer``; when
     ``self._document_expansion`` is set, the result is additionally passed
     through ``self._semantic_transformer``.  The space-joined terms are
     stored in the analyzed "content" field and the WSDL itself in the
     non-indexed "path" field.  The writer uses BM25 similarity.

     Args:
         service_list: iterable of WSDL values accepted by the transformer.
     """
     transformer = WSDLTransformer()
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         # The base transformation is shared by both branches.
         terms = transformer.transform(wsdl)
         if self._document_expansion:
             terms = self._semantic_transformer.transform(terms)
         bag_of_words = ' '.join(terms)
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
     index_writer.close()
Example #25
0
def func(command):
    """Search index "index" by tag and return the hits as one string.

    ``command`` is parsed against the "Tags" field and up to 200 hits are
    fetched.  Every hit becomes the row
    "<Page_num> data/<Page_num>.jpg <Page_link> <Views> <Likes> <Img_alt>",
    where whitespace runs in Img_alt are replaced by '_'.  Rows are ranked
    by ``0.6 * Likes + 0.4 * Views`` in descending order; identical rows are
    kept once.

    Returns:
        str: the ranked rows, each followed by a single space.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    # ``del searcher`` does not close the underlying reader, so the reader
    # is closed explicitly once the rows have been built.
    reader = DirectoryReader.open(SimpleFSDirectory(File(STORE_DIR)))
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        searcher = IndexSearcher(reader)
        query = QueryParser(Version.LUCENE_CURRENT, "Tags",
                            analyzer).parse(command)

        ranked = {}
        for scoreDoc in searcher.search(query, 200).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
            page_num = doc.get('Page_num')
            row = ' '.join([
                page_num,
                'data/' + page_num + '.jpg',
                doc.get('Page_link'),
                doc.get('Views'),
                doc.get('Likes'),
                '_'.join(doc.get('Img_alt').split()),
            ])
            ranked[row] = rank
    finally:
        reader.close()

    ordered = sorted(ranked.items(), key=lambda item: item[1], reverse=True)
    # Build the result in one pass instead of repeated concatenation.
    return ''.join(row + ' ' for row, _rank in ordered)
    def __init__(self, folder=None, fields=(), similarity="tfidf"):
        """Set up the index directory, the field types and the analyzer.

        Args:
            folder: directory of an on-disk index; a ``RAMDirectory`` is
                used when it is not given.
            fields: iterable of field descriptions, each with a ``name`` and
                a ``props`` mapping.  Every ``props`` entry is applied to
                the ``FieldType`` through the matching ``set<Name>`` method.
            similarity: similarity name; stored lower-cased.
        """
        self.jcc = lucene.initVM()

        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()

        self.fields = {}

        for field in fields:
            ft = FieldType()
            for pname, pvalue in field.props.items():
                # str.capitalize() lower-cases everything after the first
                # character, so a camelCase property such as
                # "storeTermVectors" would resolve to the non-existent
                # "setStoretermvectors".  Try that historical spelling
                # first, then upper-case only the first character.
                setter = getattr(ft, "set" + pname.capitalize(), None)
                if setter is None:
                    setter = getattr(ft, "set" + pname[:1].upper() + pname[1:])
                setter(pvalue)

            # Applied after the properties, so it overrides any index
            # options set through ``props``.
            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

            self.fields[field.name] = ft

        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.writer = None
        self.searcher = None
Example #27
0
def retrieve(indexdir, queries):
    """Run every query and write the ranking to "results_lucene.txt".

    Each query is tokenized on word characters, stemmed with the Porter
    stemmer and searched on the "title", "abstract" and "authors" fields
    with OR as the default operator.  Up to 1000 hits per query are written
    as "<query id> Q0 <doc number + 1> <rank> <score> G17R3" lines.

    Args:
        indexdir: directory of the Lucene index.
        queries: mapping of query id to query text.
    """
    lucene.initVM()
    fields = ["title", "abstract", "authors"]
    MAX = 1000
    # Both the output file and the reader are released even when a query
    # fails part-way through.
    with open("results_lucene.txt", "w") as f:
        analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
        try:
            searcher = IndexSearcher(reader)

            # None of these depend on the query, so build them once.
            st = PorterStemmer()
            tokenizer = RegexpTokenizer(r'\w+')
            parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                           analyzer)
            parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)

            for query_id, text in queries.items():
                stemmed = " ".join(
                    st.stem(word) for word in tokenizer.tokenize(text))
                # Called through the class to select the instance overload.
                query = MultiFieldQueryParser.parse(parser, stemmed)
                hits = searcher.search(query, MAX)
                for i, hit in enumerate(hits.scoreDocs):
                    f.write("%s Q0 %s %s %s G17R3\n" % (
                        query_id, hit.doc + 1, i + 1, hit.score))
        finally:
            reader.close()
def main():
    """Build the benchmark index in ``dest_path``.

    The fields "code", "description" and "literals" share one Porter
    analyzer; the remaining listed fields each get their own
    ``KeywordAnalyzer``, which is also the fallback for unlisted fields.
    Index corruption, lock and I/O exceptions are reported through
    ``printStackTrace``.
    """
    try:
        indicesDestination = File(dest_path)
        fallback_analyzer = KeywordAnalyzer()
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))

        per_field = {}
        for stemmed_field in ("code", "description", "literals"):
            per_field[stemmed_field] = porter_analyzer
        for keyword_field in ("typed_method_call", "extends", "used_classes",
                              "methods", "class_instance_creation", "id",
                              "word"):
            per_field[keyword_field] = KeywordAnalyzer()

        config = IndexWriterConfig(
            Version.LUCENE_CURRENT,
            PerFieldAnalyzerWrapper(fallback_analyzer, per_field))

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_benchmark(writer, counter)
        writer.close()

        print("All jobs are done..")
        print(str(counter))

    except (CorruptIndexException,       # index is corrupt
            LockObtainFailedException,   # another writer holds the index
            IOException) as e:           # directory can't be read/written
        e.printStackTrace()
Example #29
0
 def buscar():
     """Handle a search request and render the result page.

     The submitted ``id_entrada`` form value is stripped of quote and
     punctuation characters, searched through ``SearchFiles`` on the index
     at ``INDEX_PATH``, and the resulting name list is rendered with the
     'search.html' template.  Every step is logged.
     """
     global folder_path, folder_index
     logging.info("Ingresando en la peticion para busqueda")
     logging.info("palabra buscada: "+request.form['id_entrada'])

     # Remove the characters one after the other, in the original order.
     palabra = str(request.form['id_entrada'])
     for unwanted in ('"', ":", ".", ",", ";", "'"):
         palabra = palabra.replace(unwanted, "")

     logging.info("Obteniendo ambiente de lucene en busqueda")
     vm_env = lucene.getVMEnv()
     logging.info("Creando hilo en el ambiente en busqueda")
     vm_env.attachCurrentThread()

     logging.info("Llamando a SimpleFSDirectory")
     directory = SimpleFSDirectory(Paths.get(INDEX_PATH))
     logging.info("Llamando a IndexSearcher")
     searcher = IndexSearcher(DirectoryReader.open(directory))
     logging.info("Llamando a StandardAnalyzer")
     analyzer = StandardAnalyzer()

     logging.info("Buscando palabra: "+palabra)
     SearchFiles().buscar(searcher, analyzer, palabra)
     listanombres = SearchFiles().getlistanombres()
     logging.info("Obteniendo la lista de nombres: "+str(listanombres))

     logging.info("Renderizando template de busqueda con resultado")
     resumen = str("Se encontraron "+str(len(listanombres))+" documentos!.")
     return render_template('search.html', texto=palabra,
                            nombres=listanombres, resultado=resumen)
Example #30
0
 def addLang(self, lang, dataset, analyzer, index_path=None):
     """Register ``lang`` with its own searcher and query parser.

     The index location comes from ``self.get_index``.  The parser targets
     the "context" field with the analyzer class registered under
     ``analyzer`` in ``analyzers``.  The searcher uses ``self.similarity``
     and ``lang`` becomes the current language.
     """
     self.languages.append(lang)
     index_location = Paths.get(self.get_index(lang, dataset, index_path))
     lang_searcher = IndexSearcher(
         DirectoryReader.open(SimpleFSDirectory(index_location)))
     self.searcher[lang] = lang_searcher
     self.parser[lang] = QueryParser("context", analyzers[analyzer]())
     lang_searcher.setSimilarity(self.similarity)
     self.lang = lang