Example #1
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and for the taxonomy index
     # in RAM or on Disc
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(
         Paths.get(os.path.join(self.directory, INDEX_DIR)))
     self.taxoDir = FSDirectory.open(
         Paths.get(os.path.join(self.directory, TAXONOMY_DIR)))
     # FacetConfig
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Example #2
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and for the taxonomy index
     # in RAM or on Disc
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                             INDEX_DIR)))
     self.taxoDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                            TAXONOMY_DIR)))
     # FacetConfig
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Example #3
def search(querystr):
    print('lucene', lucene.VERSION)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = FSDirectory.open(Paths.get("index"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    q = QueryParser("name", analyzer).parse(querystr)

    hitsPerPage = 20
    docs = searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs

    people = []
    number = 1
    for hit in hits:
        # print(hit.doc, hit.score)
        d = searcher.doc(hit.doc)
        person = {}
        print(number, d.get("name"))
        person['Name'] = d.get("name")
        person['Birth date'] = d.get("birth_date")
        person['Death date'] = d.get("death_date")
        person['Birth note'] = d.get("birth_note")
        person['Death note'] = d.get("death_note")
        people.append(person)
        number += 1

    return people
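
A minimal driver for search() above, as a hedged sketch: it assumes lucene.initVM() has already run and that an index with the fields shown exists under "index"; the query string is illustrative only.

# Hypothetical usage of search(); query text is an example only.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
for person in search('Ada Lovelace'):
    print(person['Name'], person['Birth date'])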
Example #4
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode when opening lucene index. Available values are:
                'create', open new index and overwriting over index,
                'append', open existed index and append.
                'create_or_append', if `index_dir` exists, 'append',
                else 'create'
        date_format : string
            We save datetime field as string, `date_format` specify how to
            format datetime into string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
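
A hedged usage sketch of the mode parameter documented above (the docstring names the class Indexer; the index path is illustrative):

# Hypothetical usage; assumes lucene.initVM() has been called.
indexer = Indexer('/tmp/my_index', mode='create_or_append')
# ... add documents via indexer.writer.addDocument(...) ...
indexer.writer.commit()
indexer.writer.close()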
Example #5
def l_searcher(query_string, directory, number_documents):
	lucene.initVM()

	# analyzer = StandardAnalyzer()
	reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory)))
	searcher = IndexSearcher(reader)

	# Top 'n' documents as result
	topN = number_documents

	try:
		# query = QueryParser("question", analyzer).parse(query_string)
		query = FuzzyQuery(Term("question", query_string), 2)
		print("The query was: {}".format(query))

		hits = searcher.search(query, topN)

		print("The hits were: ")

		options = []
		options_answers = []

		# print(hits.totalHits)

		for hit in hits.scoreDocs:
			print(hit.doc)
			# print(hit.score, hit.doc, hit.toString())
			doc = searcher.doc(hit.doc)
			options_answers.append(doc.get("answer"))
			options.append(doc.get("question"))
			# print(doc.get("answer"))

		return options, options_answers
	except IndexError:
		return None
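
For context, the second argument to FuzzyQuery is the maximum Levenshtein edit distance (2 is Lucene's supported maximum). An illustration, not from the source:

# With an indexed term 'carro', FuzzyQuery(Term('question', 'caro'), 2)
# matches: 'caro' -> 'carro' is one insertion, within the allowed 2 edits.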
Example #6
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    for obj in objects:
        tokens = obj.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[obj] = set.intersection(*doc_sets)

    return docs_lookup, reader
Example #7
    def __init__(self, searchDir):

        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
Example #8
 def __init__(self, dir, data_file):
     self.dir = dir
     self.data_file = data_file
     index_dir = FSDirectory.open(Paths.get(self.dir))
     analyzer = StandardAnalyzer()
     writer_config = IndexWriterConfig(analyzer)
     writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(index_dir, writer_config)
Example #9
    def __init__(self, baseDir, indexDirectory="IR.Index"):
        """
        :param baseDir: The directory where this querrier is run
        :param indexDirectory: Directory of indices, default value = 'IR.Index'
        """
        indexDir = FSDirectory.open(Paths.get(os.path.join(baseDir, indexDirectory)))

        self.reader = DirectoryReader.open(indexDir)
Example #10
def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries, uses one index per domain.
    directory = FSDirectory.open(File(wiki_index+'-'+sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
Example #11
 def create_index_dir(self):
     """
     Create the directory where index is stored
     :return: index directory
     """
     path = Paths.get('index')
     indexDir = FSDirectory.open(path)
     return indexDir
Example #12
def createIndexWriter(indexDir):
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    #config = config.setRAMBufferSizeMB(ramBufferSize)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
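
A hedged usage sketch for createIndexWriter() above (Document, Field, and TextField are stock Lucene classes; the path and field values are illustrative):

from org.apache.lucene.document import Document, Field, TextField

# Hypothetical usage; assumes lucene.initVM() has been called.
writer = createIndexWriter('/tmp/ws_index')
doc = Document()
doc.add(Field('contents', 'hello world', TextField.TYPE_STORED))
writer.addDocument(doc)
writer.close()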
Example #14
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #16
 def __init__(self, index_path, update=False):
     dir = FSDirectory.open(Paths.get(index_path))
     analyzer = StandardAnalyzer()
     iwc = IndexWriterConfig(analyzer)
     if update:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     else:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(dir, iwc)
Example #17
 def _getLucene(self, path):
     directory = FSDirectory.open(Paths.get(path))
     config = IndexWriterConfig(None)
     config.setRAMBufferSizeMB(256.0)  # faster
     config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
     writer = IndexWriter(directory, config)
     reader = writer.getReader()
     searcher = IndexSearcher(reader)
     return writer, reader, searcher
Example #18
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(
        Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
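
Because the writer above declares an index sort on NUMERIC_STAMP_FIELD, a search that sorts by the same field can serve hits in index order. A hedged sketch reusing the example's names (MatchAllDocsQuery is stock Lucene; the path is illustrative):

from org.apache.lucene.search import MatchAllDocsQuery

# Hypothetical search against the stamp-sorted index built by getLucene().
writer, reader, searcher = getLucene('/tmp/sorted_index')
stamp_sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
top_docs = searcher.search(MatchAllDocsQuery(), 10, stamp_sort)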
Example #19
def index_scan():
    print("Scanning the index")
    #pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader,field)
        print(field, "->" , term)
Example #20
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #21
    def __init__(self):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])

        paths_dict = util.getPaths()
        # textSearcher = TextSearcher(paths_dict['fs_directory'])
        fs_directory = FSDirectory.open(Paths.get(paths_dict['fs_directory']))
        index_reader = DirectoryReader.open(fs_directory)
        self.lucene_dictionary = LuceneDictionary(index_reader, 'contents')
        self.analyzer = StandardAnalyzer()
        self.searcher = IndexSearcher(index_reader)
        self.formatter = SimpleHTMLFormatter()
Example #22
 def init_index(self):
     """
         Initializes the lucene index, creating a StandardAnalyzer and an IndexSearcher.
         Attributes set:
             vm: The initialized Java VM.
             analyzer: StandardAnalyzer word analyzer.
             searcher: Searcher over the lucene index.
     """
     self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
     self.analyzer = StandardAnalyzer()
     self.searcher = IndexSearcher(DirectoryReader.open(ldir))
Example #23
def open_writer(path):
    from java.io import File
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    directory = FSDirectory.open(File(path))
    analyzer = StandardAnalyzer(Version.LUCENE_43)
    config = IndexWriterConfig(Version.LUCENE_43, analyzer)
    writer = IndexWriter(directory, config)
    return writer
Example #24
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')    
Example #25
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)

    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
Example #26
    def __init__(self, indexDir, root="testdocs"):
        # create and open an index writer
        indexDir = FSDirectory.open(Paths.get(indexDir))

        # TODO make appropriate analyzer add to config
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        self.authorcount = 0
        self.titlecount = 0
        self.errorcount = 0

        self.indexDocs(root, iw)
Example #27
    def __init__(self, index_dir):
        """

        :param index_dir: the dir where to store the index.
        """
        self.indexDir = index_dir
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=self.ENGLISH_STOP_WORDS_SET)
        conf = IndexWriterConfig(self.analyzer)
        conf.setUseCompoundFile(False)
        directory = FSDirectory.open(Paths.get(index_dir))
        self.writer = IndexWriter(directory, conf)
Example #28
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}

    f = open('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
Example #29
    def search(self, query):
        lucene.initVM()
        luceneDirectory = "/index/"

        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer()

        #args = len(sys.argv) - 1

        #if args < 1:
        #   print ("\n No query was submitted! \n")
        #else:
        #query_string = ""
        #position = 1
        #while(args >= position):
        #query_string = query_string + str(sys.argv[position]) + " "
        #position = position + 1

        print("Searching for '" + query + "'")

        fields_to_search = ["text", "page title", "date"]
        filter_date = 'date:"May 25"'

        filtered_query = filter_date + " AND " + query

        parser = MultiFieldQueryParser(fields_to_search, analyzer)
        updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
        scored_documents = searcher.search(updated_query,
                                           10).scoreDocs  # array of docs

        print("Found " + str((len(scored_documents))) +
              " matches in the collection.")

        results = []
        for doc in scored_documents:
            scoredTweet = dict()
            scoredTweet['score'] = doc.score
            result = searcher.doc(doc.doc)
            scoredTweet['username'] = result.get("username")
            scoredTweet['tweet_body'] = result.get("text")
            scoredTweet['date'] = result.get("date")
            results.append(scoredTweet)
            print(scoredTweet)

        return results
Example #30
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #31
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()
Example #32
def find_frequencies_wikipedia(terms: List[str], index_location: str):
    """Find frequencies using a Lucene index of wikipedia."""
    # TODO doesn't find any n>1 grams due to missing location index on contents!

    logger.warning('Not working! Does not find any n>1 grams')
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(index_location)
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)

    freqs = {}
    for term_str in tqdm.tqdm(terms):
        term = Term("contents", term_str)
        freq = reader.totalTermFreq(term)
        freqs[term_str] = freq

    reader.close()
    return freqs
Example #33
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    print rootdir
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        year = entry['publ_year']

        fname = short_title + CONTENT_EXT
        results[fname] = year

    return results
Example #34
    def index(self):
        # if exists sent_index, delete and create a new one
        doc_tool.cleardir(index_root)
        doc_tool.mkdir(index_root)

        index_dir = FSDirectory.open(Paths.get(index_root))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_dir, writer_config)

        ft1 = FieldType()
        ft1.setStored(True)
        ft1.setIndexOptions(IndexOptions.NONE)

        ft2 = FieldType()
        ft2.setStored(False)
        ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        doc_list = self.doc()
        file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
        file_list = os.listdir(file_path)

        num = 0
        for file in file_list:
            docs = doc_tool.load_json_file(file_path, file)
            for page_identifier in docs:
                if page_identifier in doc_list:
                    num += 1
                    for sent_number in docs[page_identifier]:
                        sentence_text = self.process_sent(
                            docs[page_identifier][sent_number])
                        doc = Document()
                        doc.add(Field("page_identifier", page_identifier, ft1))
                        doc.add(Field("sentence_number", sent_number, ft1))
                        doc.add(Field("sentence_text", sentence_text, ft2))
                        writer.addDocument(doc)
                    print(num)

        writer.commit()
        writer.close()
        index_dir.close()
Example #35
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    print os.path.abspath(os.path.pardir)
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
        
    return results
Example #36
def l_indexer(directory, load_path):
	lucene.initVM()

	# index_dir = SimpleFSDirectory(File(directory))
	index_dir = FSDirectory.open(Paths.get(directory))
	writer_config = IndexWriterConfig(PortugueseAnalyzer())
	# writer_config = IndexWriterConfig(customPortugueseAnalyser())
	writer = IndexWriter(index_dir, writer_config)

	with open(load_path) as subtles_file:
		subtles_corpus = subtles_file.read().splitlines()

	for i in range(0, len(subtles_corpus), 2):
		doc = Document()
		doc.add(Field("question", subtles_corpus[i], StringField.TYPE_STORED))
		doc.add(Field("answer", subtles_corpus[i+1], StringField.TYPE_STORED))

		writer.addDocument(doc)

	writer.close()
	print("Index successfully created!")
Example #37
 def __index(self, emailInfo):
     from org.apache.lucene.index import IndexWriterConfig
     from org.apache.lucene.util import Version
     from org.apache.lucene.analysis.standard import StandardAnalyzer
     analyser = StandardAnalyzer(Version.LUCENE_33)
     conf = IndexWriterConfig(Version.LUCENE_33, analyser)
     from org.apache.lucene.store import FSDirectory
     from java.io import File
     storage = File.createTempFile(u'Tubelight-', '.index')
     storage.delete()
     storage.mkdir()
     storage.deleteOnExit()
     self.storage = storage.getAbsolutePath()
     self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
     directory = FSDirectory.open(storage)
     from org.apache.lucene.index import IndexWriter
     iw = IndexWriter(directory, conf)
     from us.d8u.tubelight import Configuration
     addr = emailInfo[Configuration.EmailAddressKey]
     (username, server) = addr.split('@')
     from java.lang import System
     System.setProperty("mail.imap.partialfetch", "false")
     urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
     from javax.mail import Session
     session = Session.getDefaultInstance(System.getProperties(), None).getStore(emailInfo.get(Configuration.EmailProtocolKey))
     session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
     folder = session.getDefaultFolder()
     for m in folder.getMessages():
         from org.apache.lucene.document import Document, Field
         d = Document()
         subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
         toSrc = u', '.join(str(r) for r in m.getAllRecipients())
         to = Field("to", toSrc, Field.Store.YES, Field.Index.ANALYZED)
         d.add(to)
         d.add(subject)
         iw.addDocument(d)
     iw.commit()
     self.searcher = IndexSearcher(directory)
Example #38
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name, on which the duplication should avoid.
        boost : dict
            This dict control the weight when computing score.
        date_format : string
            Convert the string into datetime. Should consistent with the
            index part.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
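
A hedged sketch of querying the Searcher configured above from inside one of its methods (parse(String) and the three-argument IndexSearcher.search are stock Lucene; the query text is illustrative):

# Hypothetical method body using the fields set up in __init__:
q = self.mul_parser.parse('climate change')
top = self.isearcher.search(q, 10, self.sort_by_recent)
for sd in top.scoreDocs:
    doc = self.isearcher.doc(sd.doc)
    print(doc.get('title'))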
Example #39
def retrieving(searchword):
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    idxDocs = reader.maxDoc()
    print("We have ", idxDocs, " indexed documents")
    searcher = IndexSearcher(reader)
    idx_analyzer = EnglishAnalyzer()
    #Search for the input term in field stored as text
    # To look into multiple fields, try  MultiFieldQueryParser, but it is not recommended.
    # Its best to club everything we want to search into a single search field and try WildCard matching on it
    query = QueryParser("text", idx_analyzer).parse(searchword)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print ("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    try:
        for hit in hits.scoreDocs:
            print (hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            print (doc.get("text").encode("utf-8"))
    except:
        print("Could not find the word")
Example #40
    def __init__(self, dest=None):
        """
        create a apache lucene indexer

        input:
            dest    destination to store index information. If not set, use
                    RAM.

        """
        # where to store information file or ram
        if dest:
            _dir = FSDirectory.open(java.io.File(dest))
        else:
            _dir = RAMDirectory()
        self.directory = _dir

        # analyser
        self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # index writer
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
        cfg.setDefaultWriteLockTimeout(6000)
        self.idx_writer = IndexWriter(self.directory, cfg)
Example #42
def index():
    indexFile = File(luceneDirectory).toPath()
    directory = FSDirectory.open(indexFile)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 128479)
    writeConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writeConfig)

    file_number = 2
    while file_number <= 200:
        data = []
        file_name = './parsed/parsed_data' + str(file_number) + '.txt'
        with open(file_name) as f:
            for line in f:
                data.append(json.loads(line))

        for j in data:
            doc = create_doc(j)
            writer.addDocument(doc)

        file_number += 1
    writer.close()
Example #43
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(Paths.get(path)))
Example #44
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(File(path)))
Example #46
        # Use MoreLikeThis query by document technology
        mlt = MoreLikeThis(reader)
        mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
        mlt.setMinTermFreq(0)
        mlt.setMinDocFreq(0)
        mlt.setAnalyzer(self.analyzer)
        mlt_query = mlt.like(results.scoreDocs[0].doc)

        # Filter the original film
        filtered_query = BooleanQuery()
        filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
        filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
        score_docs = self.searcher.search(filtered_query, count).scoreDocs

        return self._retrieve_in_order(score_docs)


# Initialize Lucene
lucene.initVM()
logger = logging.getLogger(__name__)
logger.info('Initialising Lucene VM')
base_dir = os.path.abspath(os.path.curdir)
index_file = os.path.join(base_dir, settings.LUCENE['PATH'])
index = FSDirectory.open(File(index_file))
try:
    reader = DirectoryReader.open(index)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
except lucene.JavaError as e:
    logger.error('Lucene not loaded')
Example #47
"""

# lucene modules needed for this script
import lucene
from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, TextField

# start Java VM 
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# indexing directory
indexDir = FSDirectory.open(File("lucene_index.Index"))

# input which will be indexed with Lucene
title1 = "text of title1"
title2 = "title2"
abstract1 = "abstract1 has many words, e.g. hellow world can be the text"
abstract2 = "text of abstract2"

# configure indexing
config = IndexWriterConfig(Version.LUCENE_CURRENT, WhitespaceAnalyzer(Version.LUCENE_CURRENT))
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iw = IndexWriter(indexDir, config)

# count number of documents processed
nDocsAdded = 0