Example #1
    def testSlurp(self):

        fsDirReader = IndexReader.open(self.dir, True)
        self.assertEqual(len(self.keywords), fsDirReader.maxDoc())
        self.assertEqual(len(self.keywords), fsDirReader.numDocs())

        ramDir = RAMDirectory(self.dir)
        ramDirReader = IndexReader.open(ramDir, True)
        self.assertEqual(fsDirReader.maxDoc(), ramDirReader.maxDoc())
        self.assertEqual(fsDirReader.numDocs(), ramDirReader.numDocs())

        fsDirReader.close()
        ramDir.close()
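This test assumes a setUp fixture that indexes one document per entry in self.keywords into self.dir. A minimal sketch of such a fixture, in the same pre-4.0 PyLucene style as the rest of these examples (the keyword values, the "id" field name and the temporary directory are assumptions, not part of the original test class):

    def setUp(self):
        # assumes: import tempfile; the flat `lucene` imports; lucene.initVM() already called
        # hypothetical data: one keyword per document
        self.keywords = ["1492", "1776", "1812", "1941"]
        self.dir = SimpleFSDirectory(File(tempfile.mkdtemp()))
        writer = IndexWriter(self.dir, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        for keyword in self.keywords:
            doc = Document()
            doc.add(Field("id", keyword,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
        writer.close()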
Example #3
def ExportIndex(b_print = False,b_write_file = False,b_filter = True):
    _dict = ReadConfig()
    initVM()
    try:
        if(b_write_file == True):
            output_file = _dict['resultDir'] + '/' + sys.argv[1] + '.xls'
            _fw = open(output_file,'w')
        directory = SimpleFSDirectory(File(_dict['indexDir']))
        ireader = IndexReader.open(directory)
        # Enum all the terms
        all_terms = ireader.terms()
        word_dict = {}
        _stopword_set = ImportStopword()
#        SetPrint(_stopword_set)
        while all_terms.next():
            term_elem = all_terms.term()
            if term_elem.field() == sys.argv[1]:
                _temp = term_elem.text().rstrip()
                word_dict[_temp] = all_terms.docFreq()
        if(b_filter == True):
            StopwordFilter(word_dict,_stopword_set)
        if(b_print != False):
            DictPrint(word_dict)
        if(b_write_file != False):
            DictPrint(word_dict,out_file=_fw)
            _fw.close()
        all_terms.close()
        return word_dict
    except Exception,e:
        print "Failed: ",e
        traceback.print_exc(file=sys.stdout)
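ExportIndex takes the field name from sys.argv[1] and, when b_write_file is set, writes <resultDir>/<field>.xls. A hypothetical command-line use (the script name here is made up):

# e.g.  $ python export_index.py title
# prints the docFreq of every term in the "title" field, stop words filtered out
if __name__ == '__main__':
    field_freqs = ExportIndex(b_print=True, b_write_file=False)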
Example #4
 def __init__(self, location):
     lucene.initVM()
     directory = SimpleFSDirectory(File(location))
     self.reader = IndexReader.open(directory, True)
     self.searcher = IndexSearcher(self.reader)
     self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                     WhitespaceAnalyzer())
Example #5
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
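retrieveDocs relies on two module-level lookup tables that are not shown here: new_urls (Lucene doc id, as a string, mapped to the crawled URL) and inv_map (URL mapped back to the html_files number). A hedged sketch of how they might be built; the file name and its layout are assumptions:

# hypothetical construction of the lookup tables used above
new_urls = {}   # str(lucene doc id) -> URL of the crawled page
inv_map = {}    # URL -> html_files number used for the diversity step
for line in open("doc_url_map.txt"):      # assumed "doc_id<TAB>file_no<TAB>url" rows
    doc_id, file_no, url = line.strip().split("\t")
    new_urls[doc_id] = url
    inv_map[url] = file_no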
Example #6
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("f", "the quick brown fox jumps over the lazy dog", Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("f", "the quick red fox jumps over the sleepy cat", Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #7
    def testUpdate(self):

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
        doc.add(
            Field("city", "St. Petersburg", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Example #8
File: plush.py Project: jjguy/plush
 def openStore(self, store_dir):
     """Open a lucene store."""
     if self._connected:
         self.closeStore()
     if store_dir == 'dummy':
         directory = RAMDirectory()
         self.initDummyStore(directory)
         store_path = store_dir
     else:
         store_path = os.path.abspath(store_dir)
         try:
             directory = SimpleFSDirectory(File(store_path)) #TODO , False)
         except JavaError:
             print "Error: %s Not found." % store_path
             return
     try:
         self.searcher = IndexSearcher(directory)
     except JavaError:
         print "Error: '%s' is not a valid lucene store." % store_path
         return
     print 'Opening store: %s' % store_path
     self.directory = directory
     self.store_path = store_path
     # TODO - TaxonomyReader??
     self.index_reader = IndexReader.open(directory)
     self.fields = self.getFieldNames()
     self.fields.sort()
     self._connected = True
Example #9
File: cnanalyzer.py Project: fay/wt
def c():
    from apps.wantown import dao
    from apps.wantown.models import Entry,Category
    entries = Entry.objects.all()
    from dot.matrixmapper import MatrixMapper
    STOP_WORDS = [u'a', u'an', u'and', u'are', u'as', u'at', u'be', u'but', u'by', u'for', u'if', u'in', u'into', 
              u'is', u'it', u'no', u'not', u'of', u'on', u'or', u'such', u'that', u'the', u'their', u'then',
              u'there', u'these', u'they', u'this', u'to', u'was', u'will', u'with',
              u'you',u'your',u'we',u'he',u'him',u'how',u'where',
              # added by myself
              u'i',u'been',u'about',u'们',u'这',u'那',u'的',u'己',u'个',u'我',u'你',u'很',u'了',u'是',u'以',u'过',u'一',u'么',u'没',u'在']
    mapper = MatrixMapper(STOP_WORDS)
    ireader = IndexReader.open(STORE_DIR)
    for i in range(len(entries)):
        try:
            doc = ireader.document(i)
            link = doc.get('link')
            entry = dao.get_by_link(link, Entry)
            category = mapper.build([doc])
            weight = 0
            if category:
                cat = category[0].text
                weight = category[0].label_weight
            else:
                cat = '其他'
            entry.category = dao.save_category(cat,weight,'s')
            entry.save()
        except Exception,e:
            print i,e
Example #10
File: lingo.py Project: fay/wt
def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'title'))
    a = ireader.terms()
    rownames = []
    # column names: the text of each term
    colnames = []
    # term-frequency matrix
    data = []
    ireader.document(- 1)
    i = 0
    while a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            if term.text() == '':
                print 'ok'
                break
            i = i+1
            if i == 1000:
                break
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                # pad the frequency with 0 for documents that do not contain the current term
                if lastdoc < docs.doc():
                    id = docs.doc()
                    for j in range(id - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
            data.append(vector)
    ireader.close()
    return colnames, data
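One caveat in the loop above: lastdoc is never advanced, so the zero-padding is only correct for the first matching document. A hedged rewrite of the inner loop (same variables, just tracking the last seen doc id):

            # sketch of a fix: advance lastdoc so the padding follows the doc ids
            vector = []
            lastdoc = 0
            while docs.next():
                doc_id = docs.doc()
                vector.extend([0] * (doc_id - lastdoc))  # docs without the term get 0
                vector.append(docs.freq())
                lastdoc = doc_id + 1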
Example #11
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #12
File: clusters.py Project: fay/wt
def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'title'))
    a = ireader.terms()
    rownames = []
    colnames = []
    data = []
    print dir(a)
    print dir(ireader)
    ireader.document(- 1)
    while 0 and a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                if lastdoc < docs.doc():
                    id = docs.doc()
                    for j in range(id - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
            data.append(vector)  
    return colnames, data
Example #13
def build_lda_corpus(index_folder, paths_index_file,
                     dictionary_file, ldac_file, min_frequency, 
                     min_word_len, max_word_len=20):
    '''
    The main function that does the job! 
    
    '''
    initVM()  
    store = SimpleFSDirectory(File(index_folder))
    index_reader = IndexReader.open(store)

    # Stores the file paths index (for LDA)
    _store_file_paths_index(index_reader, paths_index_file) 
    
    # Creates the dictionary 
    _create_dictionary(index_reader, dictionary_file, min_frequency, 
                       min_word_len, max_word_len)

    # Creates the corpus 
    dictionary = corpora.Dictionary().load(dictionary_file)      
    # doesn't load the corpus into the memory! 
    corpus_memory_friendly = _TextCorpus(dictionary, index_reader) 
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, 
                                 id2word=dictionary)
    
    logging.info('The Enron corpus building is completed.')
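build_lda_corpus depends on three helpers that are not reproduced in this listing (_store_file_paths_index, _create_dictionary and _TextCorpus). As a rough illustration of the streaming idea, a gensim-style corpus that reads one stored field per Lucene document could look like the sketch below; the class body and the "contents" field name are assumptions:

# minimal sketch, assuming each document stores its text in a "contents" field
# and that the gensim dictionary was built from the same index
class _TextCorpus(object):
    def __init__(self, dictionary, index_reader):
        self.dictionary = dictionary
        self.index_reader = index_reader

    def __iter__(self):
        # yield one bag-of-words vector per document, never holding
        # the whole corpus in memory
        for doc_id in xrange(self.index_reader.maxDoc()):
            text = self.index_reader.document(doc_id).get("contents") or ""
            yield self.dictionary.doc2bow(text.lower().split())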
Example #14
def deleteOldDocuments(*args):
	
	now = datetime.datetime.now() - datetime.timedelta(hours=6)
	IndexReader = writer.getReader()
	
	for i in xrange(IndexReader.maxDoc()):  # maxDoc() is a count, not an iterable
		
		if IndexReader.isDeleted(i):
			continue
			
		doc = IndexReader.document(i)
		date = doc.get("creation_date")	
		realDate = datetime.datetime.strptime(str(date), "%a %b %d %H:%M:%S")
		
		if now > realDate:
			IndexReader.deleteDocument(i)
			writer.optimize()
			writer.commit()	
Example #15
 def runDrillDown(self):
     # open readers
     taxo = DirectoryTaxonomyReader(self.taxoDir)
     indexReader = IndexReader.open(self.indexDir, True)
     facetRes = SimpleSearcher.searchWithDrillDown(indexReader, taxo)
     # close readers
     taxo.close()
     indexReader.close()
     # return result
     return facetRes
Example #16
 def runSimple(self):
     # open readers
     taxo = DirectoryTaxonomyReader(self.taxoDir)
     indexReader = IndexReader.open(self.indexDir, True)
     # returns List<FacetResult>
     facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo)
     # close readers
     taxo.close()
     indexReader.close()
     # return result
     return facetRes
Example #17
    def testDeleteBeforeIndexMerge(self):

        reader = IndexReader.open(self.dir, False)
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(2, reader.numDocs())
        reader.deleteDocument(1)

        self.assert_(reader.isDeleted(1))
        self.assert_(reader.hasDeletions())
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()

        reader = IndexReader.open(self.dir, True)

        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()
Example #18
 def loadtermFreq(self, context):
     word_filter = lambda w: (len(w) > 3) and (w.isalpha()) and (w.lower() not in nltk.corpus.stopwords.words('english'))
     try:
         reader = IndexReader.open(context.ramIndex,True)
         wordList =[]
         termenum = reader.terms()
         while termenum.next():
             wordList.append(termenum.term().text())
         self.termList = filter(word_filter,wordList)
     except Exception,e:
         print 'Unable to read Ram Index',e
Example #19
File: plush.py Project: jjguy/plush
 def do_describe_store(self, message):
     """\d[escribe_store]
     General information about the store."""
     if not self.checkStore():
         return
     nb_docs = self.numDocs()
     max_doc = self.maxDoc()
     directory = self.directory
     print "Directory info"
     print "--------------"
     print "* Directory path             : %s" % self.store_path
     store_size = getTreeSize(self.store_path)
     print "* Directory size             : %s" % readableSize(store_size)
     print "* Directory current version  : %s" % (
         IndexReader.getCurrentVersion(directory))
     print "* Number of docs             : %s (max doc num: %s)" % (
         nb_docs, max_doc)
     print "* Number of fields           : %d" % len(self.fields)
     if nb_docs:
         print "* Average doc size           : %s" % (
             readableSize(store_size / float(nb_docs)))
     try:
         last_modified = datetime.fromtimestamp(
             IndexReader.lastModified(directory)/1000.0)
         last_modified = last_modified.isoformat()
     except ValueError:
         last_modified = "Unknown"
     print "* Index last modified        : %s" % last_modified
     print "* Index status               :",
     # TODO
     #if IndexReader.isLocked(directory):
     #    print "LOCKED"
     #else:
     #    print "unlocked"
     print "* Has deletions              :",
     if self.index_reader.hasDeletions():
         print "YES"
     else:
         print "no"
     print "* Directory implementation   : %s" % (
         directory.getClass().getName())
Example #20
 def process(self,context):
     self.unigramList = context.termList
     self.ramreader = IndexReader.open(context.ramIndex,True)
     self.ramsearcher = IndexSearcher(context.ramIndex)
     self.N = self.ramreader.numDocs()
     self.m = len(self.unigramList)
     self.createTermDocumentMatrix()
     self.ramsearcher.close()
     self.ramreader.close()
     context.termdocumentmatrix = self.termdocumentMatrix
     print 'finished creating term document matrix'
     self.context = context
Example #21
    def testDeleteAfterIndexMerge(self):

        reader = IndexReader.open(self.dir, False)
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(2, reader.numDocs())
        reader.deleteDocument(1)
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.optimize()
        writer.close()

        reader = IndexReader.open(self.dir, True)

        self.assert_(not reader.isDeleted(1))
        self.assert_(not reader.hasDeletions())
        self.assertEqual(1, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()
Example #22
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Open a reader over the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and keep up to 50,000 results
        topDocs = searcher.search(query,50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Example #23
    def calculateWeight(self,context):
        #try:
            self.termList = context.termList            
            ramreader = IndexReader.open(context.ramIndex,True)
            store = SimpleFSDirectory(File(context.luceneDir))
            storereader = IndexReader.open(store)
            searcher = IndexSearcher(store)
            ramsearcher = IndexSearcher(context.ramIndex)
            # Number of documents in the collection    
            N = storereader.numDocs()
            # Number of relevant documents            
            R = ramreader.numDocs()
            analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)

            
            for w in self.termList:       
                searchString= "'" + w + "'"
                query = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(searchString)
                # Number of relevant document having the term
                #r = ramsearcher.docFreq(Term("contents",w))
                hits = ramsearcher.search(query,self.MAX)
                r = hits.totalHits    
                # Number of documents having the term
                #n = searcher.docFreq(Term("contents",w))
                query = QueryParser(Version.LUCENE_CURRENT,context.searchField,analyzer).parse(searchString)
                hits = searcher.search(query,self.MAX)
                n = hits.totalHits
                if (R-r) > 0 and (n-r) > 0 and (N-n-R+r) > 0:
                    weight = (r/(R-r))/(((n-r)/(N-n-R+r)))
                else:
                    weight =0
                if weight > self.weightThreshold:            
                    self.gramList.append([w,weight])
            searcher.close()
            ramsearcher.close()
            storereader.close()
            ramreader.close()
        #except Exception,e:
        #    print 'error',e
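The value computed above is essentially the unsmoothed relevance odds ratio r(N-n-R+r) / ((R-r)(n-r)); note that with Python 2 integers the two inner divisions truncate. A float version of the same weight (a sketch, not part of the original class):

# hedged sketch of the weight with float division
def relevance_weight(r, R, n, N):
    # r: relevant docs containing the term, R: all relevant docs,
    # n: all docs containing the term, N: docs in the collection
    if (R - r) > 0 and (n - r) > 0 and (N - n - R + r) > 0:
        return (float(r) / (R - r)) / (float(n - r) / (N - n - R + r))
    return 0.0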
            
Example #24
    def buildCategoryVectors(self):

        reader = IndexReader.open(self.directory, True)

        for id in xrange(reader.maxDoc()):
            doc = reader.document(id)
            category = doc.get("category")
            vectorMap = self.categoryMap.get(category, None)
            if vectorMap is None:
                vectorMap = self.categoryMap[category] = {}

            termFreqVector = reader.getTermFreqVector(id, "subject")
            self.addTermFreqToMap(vectorMap, termFreqVector)
Example #26
    def testDeleteAfterIndexMerge(self):

        reader = IndexReader.open(self.dir, False)
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(2, reader.numDocs())
        reader.deleteDocument(1)
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.commit()
        writer.close()

        reader = IndexReader.open(self.dir, True)
        deletedDocs = MultiFields.getDeletedDocs(reader)

        self.assert_(deletedDocs is None or not deletedDocs.get(1))
        self.assert_(not reader.hasDeletions())
        self.assertEqual(1, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()
Example #27
    def testDeleteBeforeIndexMerge(self):

        reader = IndexReader.open(self.dir, False)
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(2, reader.numDocs())

        reader.deleteDocument(1)
        deletedDocs = MultiFields.getDeletedDocs(reader)

        self.assert_(deletedDocs is not None and deletedDocs.get(1))
        self.assert_(reader.hasDeletions())
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()

        reader = IndexReader.open(self.dir, True)

        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()
Example #28
    def query(indexName, queryFile, runName):
        indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
        indSearcher = IndexSearcher(indReader)
        ir = indSearcher.getIndexReader()

        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))

        f = open('results-'+runName, 'w')

        while(True):
            id = queryFile.readline()

            if id == "":
                break

            id = id.replace("C","")
            id = id.replace("\n","")

            queryString = queryFile.readline()
            queryString = queryString.replace("?","")
            queryString = queryString.replace("*","")
            queryString = queryString.replace("-","_")
            queryString = queryString.replace("\n","")

            query = qp.parse(queryString)

            queryFile.readline()

            returnedDocs = 1000
            collector = TopScoreDocCollector.create(returnedDocs, True)

            indSearcher.search(query, collector)

            hits = collector.topDocs().scoreDocs

            size = len(hits)
            print "Total hits for query " +id+ ": "+str(size)

            i = 0
            for hit in hits:        
                docId = hits[i].doc
                score = hits[i].score
                doc = ir.document(docId)
                j = i + 1
                f.write(id + " 0 " + doc.get('id') + " " + str(j) + " " + str(score) +" " + runName +"\n")
                i+=1

        f.close()
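query() expects queryFile to already be open and to hold three lines per topic: a C-prefixed id, the query text, and a separator line that is read and discarded. It writes TREC-style result rows to results-<runName>. A hypothetical driver (the index path, topic file and run name are made up), assuming the method is callable as a plain function:

if __name__ == '__main__':
    topics = open('topics.txt')
    try:
        query('wikipedia-index', topics, 'baseline')
    finally:
        topics.close()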
Example #29
    def delete_old(self, index):
        existing_ids = set([book.id for book in Book.objects.all()])

        reader = IndexReader.open(index.index, False)
        searcher = IndexSearcher(reader)
        try:
            num = searcher.docFreq(Term('is_book', 'true'))
            docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
            for result in docs.scoreDocs:
                stored = searcher.doc(result.doc)
                book_id = int(stored.get('book_id'))
                if not book_id in existing_ids:
                    print "book id %d doesn't exist." % book_id
                    index.remove_book(book_id)
        finally:
            searcher.close()
            reader.close()
Example #30
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
Example #31
    def main(cls, argv):
        
        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return
        
        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(Field("word", word,
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("t9", cls.t9(word),
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("length", str(len(word)),
                             Field.Store.NO, Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.commit()
        writer.close()

        reader.close()
Example #32
    def main(cls, argv):

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        reader = IndexReader.open(directory, True)
        blt = BooksLikeThis(reader)

        for id in xrange(reader.maxDoc()):
            if reader.isDeleted(id):
                continue
            doc = reader.document(id)
            print ''
            print doc.get("title").encode('utf-8')

            docs = blt.docsLike(id, doc, 10)
            if not docs:
                print "  None like this"
            else:
                for doc in docs:
                    print " ->", doc.get("title").encode('utf-8')
Example #34
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)

    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
Example #35
    def testUpdate(self):

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia",
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", "St. Petersburg has lots of bridges",
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", "St. Petersburg",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.commit()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Example #36
directory = RAMDirectory()
iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                      True, IndexWriter.MaxFieldLength.LIMITED)
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be index"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t,
                  Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
    iwriter.addDocument(doc)
iwriter.optimize()
iwriter.close()

ireader = IndexReader.open(directory, True)

tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

for (t,f,i) in zip(tpv.getTerms(),tpv.getTermFrequencies(),xrange(100000)):
    print 'term %s' % t
    print '  freq: %i' % f
    try:
        print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
    except:
        print '  no pos'
    try:
        print '  off: ' + \
              str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                   for o in tpv.getOffsets(i)])
    except:
        print '  no off'
Example #37
 def label_assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
     
     # tokenize the labels with the CJK analyzer
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
     for i in range(len(labels)):
         if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
             label_term.append([])
             continue
         #print labels[i].text,labels[i].id
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []
         for token in stream:
             if term_row.has_key(token.term()):
                 # weighting
                 termdocs = ireader.termDocs(Term('summary', token.term()))
                 count = 0
                 span = 0
                 terms.append(token.term())
                 while termdocs.next():
                     count += termdocs.freq()
                     span += 1
                 weight = labels[i].label_weight
                 #if float(span)/ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                     #weight = 0
                 labelmatrix[term_row[token.term()]][i] = weight
         label_term.append(terms)
     termmatrix = array(all)
     termmatrix = transpose(termmatrix)
     #for i in range(len(labelmatrix[0])):
         #for j in range(len(termmatrix[0])):
     
     # rows are docs, columns are labels
     #p = self.product(termmatrix,labelmatrix)
     d = dot(termmatrix, labelmatrix)
     result = d / (norm(labelmatrix) * norm(termmatrix))
     doc_label = []
     for i in range(len(result)):
         m = - 1
         index = - 1
         group = []
         for j in range(len(result[i])):
             if result[i][j] > 0:
                 labels[j].id = result[i][j]
                 group.append(labels[j])
         # Substring objects sort by id, which is exactly what we rely on here
         group.sort()
         group.reverse()
         max_label = group[0]
         # i:doc number(just occur position in the docs)
         # label id
         # label score
         # if the label itself never appears in the current doc
         if not max_label.doc_freq.has_key(i):
             #print 'oringial:',labels[index].text
             count = 0
             overlap = ''
             for k in label_term[index]:
                 if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                     overlap = k
                     print k
                     count += 1
             # at least one overlapping term, and its length is >= 2
             if count == 1 and len(overlap) >= 2 :
                 new_label = pextractor.Substring()
                 new_label.text = overlap
                 new_label.id = m
                 doc_label.append(group[0])
                 continue
                     
         #labels[index].id = m
         doc_label.append(group[0])
     return doc_label
Example #38
 def assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     term_doc_freq = {}
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
         """
              TODO: give extra weight to terms that come from the title
         """
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
     #for k,v in term_doc_freq.items():
      #   if v> 3:
       #      print k,v
      # tokenize the labels with the CJK analyzer
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
      # doc -> label: the labels assigned to each doc
     all_weight_table = {}
      # label -> doc: the docs assigned to each label
     label_doc = []
     label_doc_map = {}
     for i in range(len(labels)):
         nonzero_table = []
          # the weight products of this label against every doc
         weight_table = []
         
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []            
         c = 0
         weight_row = {}
         nonzero_index = []  
         is_incomplete = False
         for token in stream:
             term = token.term()#token.decode('utf-8')#
             #print term
             if term_row.has_key(term):
                 row = term_row[term]
                 terms.append(term)
                 docs_with_current_term = all[row]
                 for j in range(len(docs_with_current_term)):
                     if docs_with_current_term[j] != 0:                                            
                         if c == 0:
                             nonzero_index.append(j)
                         if c == 0 or j in nonzero_index:
                             weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight 
                         else:
                              # add 1 so the weight product cannot collapse to 0; for docs that
                              # were in nonzero_index at first but dropped out later, multiply by
                              # -100 so the minimal product marks this label as unsuitable for the doc
                             weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (- 100)
                      # docs that were not in nonzero_index at first but appear later on
                     elif docs_with_current_term[j] == 0 and j in nonzero_index:
                          # add 1 so the weight product cannot collapse to 0
                         weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (- 100)
                 c += 1
             else:
                 is_incomplete = True
         label_term.append(terms)
          # bugfix: if, after tokenization, not every term of this label occurs among the document terms, drop the label
         if is_incomplete:
             weight_row = {}
                 
                 
         for doc, weight in weight_row.items():
             last = all_weight_table.get(doc)                
             if weight > 0:
                 if not label_doc_map.has_key(labels[i].text):    
                     kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                     #label_doc.append([ 0,labels[i].text,[]])
                     label_doc.append([ 0,labels[i].text,0])
                     label_doc_map[labels[i].text] = len(label_doc) - 1
                 new_label = pextractor.Substring()
                 new_label.text = labels[i].text
                 new_label.id = weight
                 if last:
                     all_weight_table[doc].append(new_label)
                 else:
                     all_weight_table[doc] = [new_label]
                 #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                 label_doc[label_doc_map[labels[i].text]][2] += 1
                 label_doc[label_doc_map[labels[i].text]][0] += weight
                 
                 #try:
                  #   category = dao.save_category(labels[i].text, weight, 'd')
                   #  entry = self.entries[doc]
                    # ec = dao.save_entry_cat(entry, category, weight)
                 #except Exception,e:
                  #   print e
                 
                 #if last:
                  #   all_weight_table[doc].append(ec)
                 #else:
                  #   all_weight_table[doc] = [ec]
              # if the doc already has a label, compare weights and keep the larger one, i.e. pick the maximum-weight label
             #if last:
             #    if last.id < weight and weight > 0:
              #       labels[i].id = weight
               #      all_weight_table[doc] = labels[i]
             #else:
              #   labels[i].id = weight
               #  all_weight_table[doc] = labels[i]
     label_doc.sort(reverse=True)
     for k, v in all_weight_table.items():
         v.sort(reverse=True)
             
      # the map keys are consecutive integers and hash in ascending order, so the returned values are already sorted
     thread = SaveLabelsThread(all_weight_table,label_doc,self.entries,self.query)
     thread.start()
     return all_weight_table,label_doc
Example #39
File: cnanalyzer.py Project: fay/wt
    def __init__(self, stopWords=None):

        if stopWords is None:
            self.stopWords = StopAnalyzer.ENGLISH_STOP_WORDS
        else:
            self.stopWords = stopWords

    def tokenStream(self, fieldName, reader):

        return StopFilter(LowerCaseFilter(LetterTokenizer(reader)),
                          self.stopWords)
if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    ireader = IndexReader.open(STORE_DIR)
    iwriter = IndexWriter(directory, StandardAnalyzer(), True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方", "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
    
    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
        print 'term %s' % t
Example #40
FIELD_CONTENTS = "contents"
FIELD_PATH = "path"
#QUERY_STRING = "lucene and restored"
QUERY_STRING = sys.argv[1]
STORE_DIR = "/home/kanaujia/lucene_index"

if __name__ == '__main__':
    initVM()
    print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Open a reader over the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)
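The snippet stops right after parsing. A hedged continuation in the style of Example #22 (the top-50 cutoff and the printing are assumptions, not part of the original script):

    # run the query and print the stored path of each of the top 50 hits
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get(FIELD_PATH)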
Example #41
                        addedDocs = IndexHelper.index(f, writer)
                        noDocs += addedDocs

                        nrFiles += 1

                    except IOError:
                        print "File " + filename + " does not exist. Skipping..."

                writer.close()

                print str(nrFiles) + " files containing " + str(noDocs) + " documents added to index "

            
        elif sys.argv[1] == 'read':
            reader = IndexReader.open(SimpleFSDirectory(File("senses-gh95")))

            doc = reader.document(0)
            content = doc.getValues("content")
            id = doc.getValues("id")
            print content

            nrDocs = reader.numDocs()
            print "Number of docs: "+str(nrDocs)
            print "Doc 1: "+str(id[0])

            #Print all terms (takes some time :-) )
            #terms = reader.terms()
            #while(terms.next()):
            #    t = terms.term()
            #    freq = terms.docFreq()
Example #42
    def testIndexReader(self):

        reader = IndexReader.open(self.dir, True)
        self.assertEqual(len(self.keywords), reader.maxDoc())
        self.assertEqual(len(self.keywords), reader.numDocs())
        reader.close()
    def getIndexReader(self):

        return IndexReader.open(self.getDirectory())