Example #1
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and for the taxonomy index
     # in RAM or on Disc
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(
         Paths.get(os.path.join(self.directory, INDEX_DIR)))
     self.taxoDir = FSDirectory.open(
         Paths.get(os.path.join(self.directory, TAXONOMY_DIR)))
     # FacetConfig
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Example #2
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and for the taxonomy index
     # in RAM or on Disc
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                             INDEX_DIR)))
     self.taxoDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                            TAXONOMY_DIR)))
     # FacetConfig
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Example #3
def search(querystr):
    print('lucene', lucene.VERSION)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = FSDirectory.open(Paths.get("index"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    q = QueryParser("name", analyzer).parse(querystr)

    hitsPerPage = 20
    docs = searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs

    people = []
    number = 1
    for hit in hits:
        # print(hit.doc, hit.score)
        d = searcher.doc(hit.doc)
        person = {}
        print(number, d.get("name"))
        person['Name'] = d.get("name")
        person['Birth date'] = d.get("birth_date")
        person['Death date'] = d.get("death_date")
        person['Birth note'] = d.get("birth_note")
        person['Death note'] = d.get("death_note")
        people.append(person)
        number += 1

    return people
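
A minimal driver for search() above, as a hedged sketch: it assumes lucene.initVM() has already run and that an index with the fields shown exists under "index"; the query string is illustrative only.

# Hypothetical usage of search(); query text is an example only.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
for person in search('Ada Lovelace'):
    print(person['Name'], person['Birth date'])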
Example #4
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode when opening lucene index. Available values are:
                'create', open new index and overwriting over index,
                'append', open existed index and append.
                'create_or_append', if `index_dir` exists, 'append',
                else 'create'
        date_format : string
            We save datetime field as string, `date_format` specify how to
            format datetime into string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
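
A hedged usage sketch of the mode parameter documented above (the docstring names the class Indexer; the index path is illustrative):

# Hypothetical usage; assumes lucene.initVM() has been called.
indexer = Indexer('/tmp/my_index', mode='create_or_append')
# ... add documents via indexer.writer.addDocument(...) ...
indexer.writer.commit()
indexer.writer.close()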
Example #5
def l_searcher(query_string, directory, number_documents):
	lucene.initVM()

	# analyzer = StandardAnalyzer()
	reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory)))
	searcher = IndexSearcher(reader)

	# Top 'n' documents as result
	topN = number_documents

	try:
		# query = QueryParser("question", analyzer).parse(query_string)
		query = FuzzyQuery(Term("question", query_string), 2)
		print("The query was: {}".format(query))

		hits = searcher.search(query, topN)

		print("The hits were: ")

		options = []
		options_answers = []

		# print(hits.totalHits)

		for hit in hits.scoreDocs:
			print(hit.doc)
			# print(hit.score, hit.doc, hit.toString())
			doc = searcher.doc(hit.doc)
			options_answers.append(doc.get("answer"))
			options.append(doc.get("question"))
			# print(doc.get("answer"))

		return options, options_answers
	except IndexError:
		return None
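
For context, the second argument to FuzzyQuery is the maximum Levenshtein edit distance (2 is Lucene's supported maximum). An illustration, not from the source:

# With an indexed term 'carro', FuzzyQuery(Term('question', 'caro'), 2)
# matches: 'caro' -> 'carro' is one insertion, within the allowed 2 edits.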
Example #6
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    for obj in objects:
        tokens = obj.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[obj] = set.intersection(*doc_sets)

    return docs_lookup, reader
Example #7
    def __init__(self, searchDir):

        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
Example #8
 def __init__(self, dir, data_file):
     self.dir = dir
     self.data_file = data_file
     index_dir = FSDirectory.open(Paths.get(self.dir))
     analyzer = StandardAnalyzer()
     writer_config = IndexWriterConfig(analyzer)
     writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(index_dir, writer_config)
Example #9
    def __init__(self, baseDir, indexDirectory="IR.Index"):
        """
        :param baseDir: The directory where this querrier is run
        :param indexDirectory: Directory of indices, default value = 'IR.Index'
        """
        indexDir = FSDirectory.open(Paths.get(os.path.join(baseDir, indexDirectory)))

        self.reader = DirectoryReader.open(indexDir)
Example #10
def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries, uses one index per domain.
    directory = FSDirectory.open(File(wiki_index+'-'+sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
Example #11
 def create_index_dir(self):
     """
     Create the directory where index is stored
     :return: index directory
     """
     path = Paths.get('index')
     indexDir = FSDirectory.open(path)
     return indexDir
Example #12
def createIndexWriter(indexDir):
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    #config = config.setRAMBufferSizeMB(ramBufferSize)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
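
A hedged usage sketch for createIndexWriter() above (Document, Field, and TextField are stock Lucene classes; the path and field values are illustrative):

from org.apache.lucene.document import Document, Field, TextField

# Hypothetical usage; assumes lucene.initVM() has been called.
writer = createIndexWriter('/tmp/ws_index')
doc = Document()
doc.add(Field('contents', 'hello world', TextField.TYPE_STORED))
writer.addDocument(doc)
writer.close()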
Example #14
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #16
 def __init__(self, index_path, update=False):
     dir = FSDirectory.open(Paths.get(index_path))
     analyzer = StandardAnalyzer()
     iwc = IndexWriterConfig(analyzer)
     if update:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     else:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(dir, iwc)
Example #17
 def _getLucene(self, path):
     directory = FSDirectory.open(Paths.get(path))
     config = IndexWriterConfig(None)
     config.setRAMBufferSizeMB(256.0)  # faster
     config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
     writer = IndexWriter(directory, config)
     reader = writer.getReader()
     searcher = IndexSearcher(reader)
     return writer, reader, searcher
Example #18
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(
        Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
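
Because the writer above declares an index sort on NUMERIC_STAMP_FIELD, a search that sorts by the same field can serve hits in index order. A hedged sketch reusing the example's names (MatchAllDocsQuery is stock Lucene; the path is illustrative):

from org.apache.lucene.search import MatchAllDocsQuery

# Hypothetical search against the stamp-sorted index built by getLucene().
writer, reader, searcher = getLucene('/tmp/sorted_index')
stamp_sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
top_docs = searcher.search(MatchAllDocsQuery(), 10, stamp_sort)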
Example #19
def index_scan():
    print("Scanning the index")
    #pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader,field)
        print(field, "->" , term)
Example #20
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #21
    def __init__(self):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])

        paths_dict = util.getPaths()
        # textSearcher = TextSearcher(paths_dict['fs_directory'])
        fs_directory = FSDirectory.open(Paths.get(paths_dict['fs_directory']))
        index_reader = DirectoryReader.open(fs_directory)
        self.lucene_dictionary = LuceneDictionary(index_reader, 'contents')
        self.analyzer = StandardAnalyzer()
        self.searcher = IndexSearcher(index_reader)
        self.formatter = SimpleHTMLFormatter()
Example #22
 def init_index(self):
     """
         Initializes the lucene index, creating a StandardAnalyzer and an IndexSearcher.
         Attributes set:
             vm: The initialized Java VM.
             analyzer: StandardAnalyzer word analyzer.
             searcher: Searcher over the lucene index.
     """
     self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
     self.analyzer = StandardAnalyzer()
     self.searcher = IndexSearcher(DirectoryReader.open(ldir))
Example #23
def open_writer(path):
    from java.io import File
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    directory = FSDirectory.open(File(path))
    analyzer = StandardAnalyzer(Version.LUCENE_43)
    config = IndexWriterConfig(Version.LUCENE_43, analyzer)
    writer = IndexWriter(directory, config)
    return writer
Example #24
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')    
Example #25
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)

    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
Example #26
    def __init__(self, indexDir, root="testdocs"):
        # create and open an index writer
        indexDir = FSDirectory.open(Paths.get(indexDir))

        # TODO make appropriate analyzer add to config
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        self.authorcount = 0
        self.titlecount = 0
        self.errorcount = 0

        self.indexDocs(root, iw)
Example #27
    def __init__(self, index_dir):
        """

        :param index_dir: the dir where to store the index.
        """
        self.indexDir = index_dir
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=self.ENGLISH_STOP_WORDS_SET)
        conf = IndexWriterConfig(self.analyzer)
        conf.setUseCompoundFile(False)
        directory = FSDirectory.open(Paths.get(index_dir))
        self.writer = IndexWriter(directory, conf)
Example #28
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}

    f = open('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
Example #29
    def search(self, query):
        lucene.initVM()
        luceneDirectory = "/index/"

        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer()

        #args = len(sys.argv) - 1

        #if args < 1:
        #   print ("\n No query was submitted! \n")
        #else:
        #query_string = ""
        #position = 1
        #while(args >= position):
        #query_string = query_string + str(sys.argv[position]) + " "
        #position = position + 1

        print("Searching for '" + query + "'")

        fields_to_search = ["text", "page title", "date"]
        filter_date = 'date:"May 25"'

        filtered_query = filter_date + " AND " + query

        parser = MultiFieldQueryParser(fields_to_search, analyzer)
        updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
        scored_documents = searcher.search(updated_query,
                                           10).scoreDocs  # array of docs

        print("Found " + str((len(scored_documents))) +
              " matches in the collection.")

        results = []
        for doc in scored_documents:
            scoredTweet = dict()
            scoredTweet['score'] = doc.score
            result = searcher.doc(doc.doc)
            scoredTweet['username'] = result.get("username")
            scoredTweet['tweet_body'] = result.get("text")
            scoredTweet['date'] = result.get("date")
            results.append(scoredTweet)
            print(scoredTweet)

        return results
Example #30
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #31
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()
Example #32
def find_frequencies_wikipedia(terms: List[str], index_location: str):
    """Find frequencies using a Lucene index of wikipedia."""
    # TODO doesn't find any n>1 grams due to missing location index on contents!

    logger.warning('Not working! Does not find any n>1 grams')
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(index_location)
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)

    freqs = {}
    for term_str in tqdm.tqdm(terms):
        term = Term("contents", term_str)
        freq = reader.totalTermFreq(term)
        freqs[term_str] = freq

    reader.close()
    return freqs
Example #33
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    print rootdir
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        year = entry['publ_year']

        fname = short_title + CONTENT_EXT
        results[fname] = year

    return results
Example #34
    def index(self):
        # if exists sent_index, delete and create a new one
        doc_tool.cleardir(index_root)
        doc_tool.mkdir(index_root)

        index_dir = FSDirectory.open(Paths.get(index_root))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_dir, writer_config)

        ft1 = FieldType()
        ft1.setStored(True)
        ft1.setIndexOptions(IndexOptions.NONE)

        ft2 = FieldType()
        ft2.setStored(False)
        ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        doc_list = self.doc()
        file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
        file_list = os.listdir(file_path)

        num = 0
        for file in file_list:
            docs = doc_tool.load_json_file(file_path, file)
            for page_identifier in docs:
                if page_identifier in doc_list:
                    num += 1
                    for sent_number in docs[page_identifier]:
                        sentence_text = self.process_sent(
                            docs[page_identifier][sent_number])
                        doc = Document()
                        doc.add(Field("page_identifier", page_identifier, ft1))
                        doc.add(Field("sentence_number", sent_number, ft1))
                        doc.add(Field("sentence_text", sentence_text, ft2))
                        writer.addDocument(doc)
                    print(num)

        writer.commit()
        writer.close()
        index_dir.close()
Example #35
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    print os.path.abspath(os.path.pardir)
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
        
    return results
Example #36
def l_indexer(directory, load_path):
	lucene.initVM()

	# index_dir = SimpleFSDirectory(File(directory))
	index_dir = FSDirectory.open(Paths.get(directory))
	writer_config = IndexWriterConfig(PortugueseAnalyzer())
	# writer_config = IndexWriterConfig(customPortugueseAnalyser())
	writer = IndexWriter(index_dir, writer_config)

	with open(load_path) as subtles_file:
		subtles_corpus = subtles_file.read().splitlines()

	for i in range(0, len(subtles_corpus), 2):
		doc = Document()
		doc.add(Field("question", subtles_corpus[i], StringField.TYPE_STORED))
		doc.add(Field("answer", subtles_corpus[i+1], StringField.TYPE_STORED))

		writer.addDocument(doc)

	writer.close()
	print("Index successfully created!")
Example #37
 def __index(self, emailInfo):
     from org.apache.lucene.index import IndexWriterConfig
     from org.apache.lucene.util import Version
     from org.apache.lucene.analysis.standard import StandardAnalyzer
     analyser = StandardAnalyzer(Version.LUCENE_33)
     conf = IndexWriterConfig(Version.LUCENE_33, analyser)
     from org.apache.lucene.store import FSDirectory
     from java.io import File
     storage = File.createTempFile(u'Tubelight-', '.index')
     storage.delete()
     storage.mkdir()
     storage.deleteOnExit()
     self.storage = storage.getAbsolutePath()
     self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
     directory = FSDirectory.open(storage)
     from org.apache.lucene.index import IndexWriter
     iw = IndexWriter(directory, conf)
     from us.d8u.tubelight import Configuration
     addr = emailInfo[Configuration.EmailAddressKey]
     (username, server) = addr.split('@')
     from java.lang import System
     System.setProperty("mail.imap.partialfetch", "false")
     urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
     from javax.mail import Session
     session = Session.getDefaultInstance(System.getProperties(), None).getStore(emailInfo.get(Configuration.EmailProtocolKey))
     session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
     folder = session.getDefaultFolder()
     for m in folder.getMessages():
         from org.apache.lucene.document import Document, Field
         d = Document()
         subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
         toSrc = u', '.join(str(r) for r in m.getAllRecipients())
         to = Field("to", toSrc, Field.Store.YES, Field.Index.ANALYZED)
         d.add(to)
         d.add(subject)
         iw.addDocument(d)
     iw.commit()
     self.searcher = IndexSearcher(directory)
Example #38
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name, on which the duplication should avoid.
        boost : dict
            This dict control the weight when computing score.
        date_format : string
            Convert the string into datetime. Should consistent with the
            index part.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
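
A hedged sketch of querying the Searcher configured above from inside one of its methods (parse(String) and the three-argument IndexSearcher.search are stock Lucene; the query text is illustrative):

# Hypothetical method body using the fields set up in __init__:
q = self.mul_parser.parse('climate change')
top = self.isearcher.search(q, 10, self.sort_by_recent)
for sd in top.scoreDocs:
    doc = self.isearcher.doc(sd.doc)
    print(doc.get('title'))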
Example #39
def retrieving(searchword):
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    idxDocs = reader.maxDoc()
    print("We have ", idxDocs, " indexed documents")
    searcher = IndexSearcher(reader)
    idx_analyzer = EnglishAnalyzer()
    #Search for the input term in field stored as text
    # To look into multiple fields, try  MultiFieldQueryParser, but it is not recommended.
    # Its best to club everything we want to search into a single search field and try WildCard matching on it
    query = QueryParser("text", idx_analyzer).parse(searchword)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print ("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    try:
        for hit in hits.scoreDocs:
            print (hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            print (doc.get("text").encode("utf-8"))
    except:
        print("Could not find the word")
Example #40
    def __init__(self, dest=None):
        """
        create a apache lucene indexer

        input:
            dest    destination to store index information. If not set, use
                    RAM.

        """
        # where to store information file or ram
        if dest:
            _dir = FSDirectory.open(java.io.File(dest))
        else:
            _dir = RAMDirectory()
        self.directory = _dir

        # analyser
        self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # index writer
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
        cfg.setDefaultWriteLockTimeout(6000)
        self.idx_writer = IndexWriter(self.directory, cfg)
Example #42
def index():
    indexFile = File(luceneDirectory).toPath()
    directory = FSDirectory.open(indexFile)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 128479)
    writeConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writeConfig)

    file_number = 2
    while file_number <= 200:
        data = []
        file_name = './parsed/parsed_data' + str(file_number) + '.txt'
        with open(file_name) as f:
            for line in f:
                data.append(json.loads(line))

        for j in data:
            doc = create_doc(j)
            writer.addDocument(doc)

        file_number += 1
    writer.close()
Example #43
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(Paths.get(path)))
Example #44
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(File(path)))
Example #46
        # Use MoreLikeThis query by document technology
        mlt = MoreLikeThis(reader)
        mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
        mlt.setMinTermFreq(0)
        mlt.setMinDocFreq(0)
        mlt.setAnalyzer(self.analyzer)
        mlt_query = mlt.like(results.scoreDocs[0].doc)

        # Filter the original film
        filtered_query = BooleanQuery()
        filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
        filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
        score_docs = self.searcher.search(filtered_query, count).scoreDocs

        return self._retrieve_in_order(score_docs)


# Initialize Lucene
lucene.initVM()
logger = logging.getLogger(__name__)
logger.info('Initialising Lucene VM')
base_dir = os.path.abspath(os.path.curdir)
index_file = os.path.join(base_dir, settings.LUCENE['PATH'])
index = FSDirectory.open(File(index_file))
try:
    reader = DirectoryReader.open(index)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
except lucene.JavaError as e:
    logger.error('Lucene not loaded')
Example #47
"""

# lucene modules needed for this script
import lucene
from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, TextField

# start Java VM 
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# indexing directory
indexDir = FSDirectory.open(File("lucene_index.Index"))

# input which will be indexed with Lucene
title1 = "text of title1"
title2 = "title2"
abstract1 = "abstract1 has many words, e.g. hellow world can be the text"
abstract2 = "text of abstract2"

# configure indexing
config = IndexWriterConfig(Version.LUCENE_CURRENT, WhitespaceAnalyzer(Version.LUCENE_CURRENT))
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iw = IndexWriter(indexDir, config)

# count number of documents processed
nDocsAdded = 0