def main(cls, argv):
    query = TermQuery(Term("f", "ipsum"))
    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
    result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

    stdout.write("<html>")
    stdout.write("<style>\n")
    stdout.write(".highlight {\n")
    stdout.write(" background: yellow\n")
    stdout.write("}\n")
    stdout.write("</style>")
    stdout.write("<body>")
    stdout.write(result)
    stdout.write("</body></html>\n")
    stdout.flush()
def post(self):
    q = self.get_argument("query")
    # self.write(key)
    # def query(query):
    #     query = self.get_argument("q")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    items = []
    rQ = []
    #for key, value in doc_urls.iteritems():
    #    print (key, value)
    for hit in hits.scoreDocs:
        #items.append({'score': hit.score, 'doc': hit.doc, 'blah': hit.toString(),
        #              'url': doc_urls[str(hit.doc)]})
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
    self.render("index.html", title="Results", items=items, query=q)
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)

    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and append
        # to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def fn():
    env.attachCurrentThread()
    start = datetime.now()
    IndexFiles(sys.argv[1], "index", StandardAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start
def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__shared_state:
        self.jccvm = lucene.initVM()
        self.index = SimpleFSDirectory(
            lucene.File(settings.lucene_index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
def testPrefixQuery(self):
    parser = QueryParser(Version.LUCENE_CURRENT, "category",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    parser.setLowercaseExpandedTerms(False)
    print parser.parse("/Computers/technology*").toString("category")
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def createIndex():
    # initialize lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def names():
    lst = []
    search = "spax"  # request.form['product']
    lucene.initVM()
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    #print lst
    data = {"products": lst}
    if request.method == 'POST':
        return jsonify(data)
    else:
        return jsonify(data)
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
def open(self, name, txn, **kwds):
    super(IndexContainer, self).open(name, txn, **kwds)
    if kwds.get('create', False):
        directory = self.getDirectory()
        indexWriter = IndexWriter(directory, StandardAnalyzer(), True)
        indexWriter.close()
        directory.close()
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        #print 'Inside First Except'
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        return
    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        #e = sys.exc_info()[0]
        #print e
        MAX = 2
        hits = searcher.search(query, MAX)
        #print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        #print 'total hits'
        #print hits.totalHits
        if hits.totalHits > 0:
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
            analyzer.close()
            searcher.close()
            return
        else:
            #print '-----------'
            #print tweetID
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1
            analyzer.close()
            searcher.close()
            return
        #for hit in hits.scoreDocs:
        #    print hit.score, hit.doc, hit.toString()
        #    doc = searcher.doc(hit.doc)
        #    print doc.get("text").encode("utf-8")
    except lucene.JavaError:
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        searcher.close()
        return
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term

    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        file = open('../features.txt', 'r')
        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line

        querywordlist = query.split()
        featureQueryList = []
        productQueryList = []
        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "
            print featureQuery

        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "

        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
def testPhraseQuery(self):
    analyzer = StandardAnalyzer(Version.LUCENE_24)
    q = QueryParser(Version.LUCENE_24, "field",
                    analyzer).parse('"This is Some Phrase*"')
    self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

    q = QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    #print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # create parser for the user-submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs

    return scoreDocs
def main(cls, argv):
    print "SimpleAnalyzer"
    cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                     "The quick brown fox....")

    print "\n----"
    print "StandardAnalyzer"
    cls.displayTokensWithFullDetails(
        StandardAnalyzer(Version.LUCENE_CURRENT),
        "I'll e-mail you at [email protected]")
def commitIndexWriter(self, writer):
    directory = writer.getDirectory()
    writer.close()

    dbDirectory = self.getDirectory()
    dbWriter = IndexWriter(dbDirectory, StandardAnalyzer(), False)
    dbWriter.setUseCompoundFile(False)
    dbWriter.addIndexes([directory])
    directory.close()
    dbWriter.close()
    dbDirectory.close()
def testWithQueryParser(self):
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        self.synonymAnalyzer).parse('"fox jumps"')
    topDocs = self.searcher.search(query, 50)
    # in Lucene 1.9, position increments are no longer ignored
    self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse('"fox jumps"')
    topDocs = self.searcher.search(query, 50)
    self.assertEqual(1, topDocs.totalHits, "*whew*")
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir,
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.optimize()
    writer.close()
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
def build_perfield_analyzer(index_fields):
    """
    This function creates a PerFieldAnalyzerWrapper, which allows us to
    associate different Lucene Analyzers with specific fields in our
    Lucene index.
    """
    analyzer = PerFieldAnalyzerWrapper(
        StandardAnalyzer(Version.LUCENE_CURRENT))

    for (index_name, analyzer_type) in index_fields.iteritems():
        if analyzer_type == "standard":
            continue
        analyzer.addAnalyzer(index_name, ANALYZER_LOOKUP.get(analyzer_type))

    return analyzer
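A minimal usage sketch for build_perfield_analyzer above (not from the original source). It assumes the module-level ANALYZER_LOOKUP maps type names such as "keyword" and "whitespace" to already-constructed analyzers; the field names and index path are purely illustrative, and fields marked "standard" keep the wrapped StandardAnalyzer default.

def demo_perfield_analyzer():
    # hypothetical field-to-analyzer-type mapping; these keys are assumed
    # to exist in ANALYZER_LOOKUP
    index_fields = {
        "id": "keyword",        # assumed: exact-match field
        "tags": "whitespace",   # assumed: split on whitespace only
        "body": "standard",     # falls through to the wrapped StandardAnalyzer
    }
    analyzer = build_perfield_analyzer(index_fields)
    writer = IndexWriter(SimpleFSDirectory(File("/tmp/perfield-index")), analyzer,
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("body", "hello per-field analysis", Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("tags", "lucene analyzers demo", Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()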
def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)

    print "Found %d document(s) (in %s) that matched query '%s':" % (
        len(hits), duration, q)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)

    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()

    return numIndexed
def find(self, query, indir):
    lucene.initVM()
    INDEXDIR = indir
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "<default field>",
                           lucene_analyzer).parse("text:" + query + " OR title:" + query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "\nHits: ", total_hits.totalHits, "\n"
    for hit in total_hits.scoreDocs:
        print "Hit Score:", "%.4f" % hit.score, \
            "Department:", lucene_searcher.doc(hit.doc).get("department").encode("utf-8"), \
            "Title:", lucene_searcher.doc(hit.doc).get("title").encode("utf-8")
        print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
def luceneIndexer(docdir, indir):
    """Index documents from a directory."""
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
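A small follow-up sketch (not from the original code) showing how documents written by luceneIndexer above can be read back; because the "text" field is stored with Field.Store.YES, its raw content is retrievable. The index path is whatever was passed as indir, and the JVM is assumed to be initialized already.

def verify_index(indir):
    # open the freshly built index and dump the first few stored documents
    reader = IndexReader.open(SimpleFSDirectory(File(indir)))
    print "documents in index:", reader.numDocs()
    for i in range(min(reader.numDocs(), 5)):
        doc = reader.document(i)
        print i, doc.get("text")[:80].encode("utf-8")
    reader.close()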
def someMethod(self):
    directory = RAMDirectory()

    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)

    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)

    writer.addDocument(doc, analyzer)

    expression = "some query"

    query = QueryParser.parse(expression, "contents", analyzer)

    parser = QueryParser("contents", analyzer)
    query = parser.parseQuery(expression)
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    self.init = False

    # Initialize document and index it
    document = Document()
    document.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
def __init__(self, STOP_WORDS=StopAnalyzer.ENGLISH_STOP_WORDS):
    self.pe = pextractor.PhraseExtractor()
    self.STOP_WORDS = STOP_WORDS
    self.analyzer = StandardAnalyzer(STOP_WORDS)
    self.entries = []
class MatrixMapper(object):

    def __init__(self, STOP_WORDS=StopAnalyzer.ENGLISH_STOP_WORDS):
        self.pe = pextractor.PhraseExtractor()
        self.STOP_WORDS = STOP_WORDS
        self.analyzer = StandardAnalyzer(STOP_WORDS)
        self.entries = []

    def get_cs_by_lucene_doc(self, docs, context):
        doc_size = len(docs)
        lucene_ids = []
        categories = []
        for id in range(doc_size):
            link = docs[id].get("link")
            lucene_ids.append(int(docs[id].get("id")))
            entry = dao.get_by_link(link, Entry)
            self.entries.append(entry)
            # TODO boost title field
            summary = entry.summary[:200]
            #if entry.category != '其他':
            #    categories.append(entry.category)
            stream = self.analyzer.tokenStream("summary", StringReader(summary))
            for s in stream:
                context.tokens.append(s.term())
                context.token_types.append(s.type())
            stream = self.analyzer.tokenStream("title", StringReader(entry.title))
            for s in stream:
                context.title_field.append(len(context.tokens))
                context.tokens.append(s.term())
                context.token_types.append(s.type())
            context.term_doc_range.append(len(context.tokens))
        #print 'tokens:', len(context.tokens)
        return self.pe.extract(context), lucene_ids, categories

    def add2matrix(self, tpv, all, term_row, lucene_ids, i, term_doc_freq):
        for (t, f) in zip(tpv.getTerms(), tpv.getTermFrequencies()):
            term = [0 for j in range(len(lucene_ids))]
            new = False
            if not term_row.has_key(t):
                term_row[t] = len(term_row)
                new = True
            row = term_row[t]
            if new:
                term[i] = f  #self.tfidf(len(tpv.getTerms()), f, total_terms, dtf)
                all.append(term)
            else:
                all[row][i] = f
            term_doc_freq[t] = term_doc_freq.get(t, 0) + 1

    def product(self, termmatrix, labelmatrix):
        """Very inefficient; deprecated."""
        row = []
        for i in range(len(termmatrix)):
            col = []
            for j in range(len(labelmatrix[0])):
                # k is the column index of a and the row index of b
                e = []
                for k in range(len(termmatrix[0])):
                    c = termmatrix[i][k]
                    d = labelmatrix[k][j]
                    e.append(c * d)
                col.append(sum(e))
            row.append(col)
        return row

    def assign(self, docs, labels, lucene_ids):
        """Replaces the label_assign method."""
        term_row = {}
        all = []
        ireader = IndexReader.open(STORE_DIR)
        total_terms = 0
        term_doc_freq = {}
        for i in range(len(lucene_ids)):
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
            # TODO: give extra weight to terms that belong to the title
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
        #for k, v in term_doc_freq.items():
        #    if v > 3:
        #        print k, v

        # tokenize the labels
        analyzer = CJKAnalyzer()
        labelmatrix = zeros((len(all), len(labels)))
        label_term = []
        # doc -> label: the labels assigned to each doc
        all_weight_table = {}
        # label -> doc: the docs covered by each label
        label_doc = []
        label_doc_map = {}
        for i in range(len(labels)):
            nonzero_table = []
            # products of the weights between one label and every doc
            weight_table = []
            stream = analyzer.tokenStream('', StringReader(labels[i].text))
            terms = []
            c = 0
            weight_row = {}
            nonzero_index = []
            is_incomplete = False
            for token in stream:
                term = token.term()  #token.decode('utf-8')
                #print term
                if term_row.has_key(term):
                    row = term_row[term]
                    terms.append(term)
                    docs_with_current_term = all[row]
                    for j in range(len(docs_with_current_term)):
                        if docs_with_current_term[j] != 0:
                            if c == 0:
                                nonzero_index.append(j)
                            if c == 0 or j in nonzero_index:
                                weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight
                            else:
                                # add 1 so the weight product cannot be 0;
                                # for a doc that was in nonzero_index at first but stops matching later,
                                # multiply by -100 to minimize the product, meaning this label does not fit this doc
                                weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (- 100)
                        # for a doc that was not in nonzero_index at first but matches later
                        elif docs_with_current_term[j] == 0 and j in nonzero_index:
                            # add 1 so the weight product cannot be 0
                            weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (- 100)
                    c += 1
                else:
                    is_incomplete = True
            label_term.append(terms)
            # bugfix: if, after tokenization, not all of the label's terms occur among the
            # terms of the docs, give up on the current label and discard it
            if is_incomplete:
                weight_row = {}
            for doc, weight in weight_row.items():
                last = all_weight_table.get(doc)
                if weight > 0:
                    if not label_doc_map.has_key(labels[i].text):
                        kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                        #label_doc.append([0, labels[i].text, []])
                        label_doc.append([0, labels[i].text, 0])
                        label_doc_map[labels[i].text] = len(label_doc) - 1
                    new_label = pextractor.Substring()
                    new_label.text = labels[i].text
                    new_label.id = weight
                    if last:
                        all_weight_table[doc].append(new_label)
                    else:
                        all_weight_table[doc] = [new_label]
                    #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                    label_doc[label_doc_map[labels[i].text]][2] += 1
                    label_doc[label_doc_map[labels[i].text]][0] += weight
                #try:
                #    category = dao.save_category(labels[i].text, weight, 'd')
                #    entry = self.entries[doc]
                #    ec = dao.save_entry_cat(entry, category, weight)
                #except Exception, e:
                #    print e
                #if last:
                #    all_weight_table[doc].append(ec)
                #else:
                #    all_weight_table[doc] = [ec]
                # if the doc already has a label, compare the existing doc-label weight with
                # the current one and keep the larger, i.e. pick the label with maximum weight
                #if last:
                #    if last.id < weight and weight > 0:
                #        labels[i].id = weight
                #        all_weight_table[doc] = labels[i]
                #else:
                #    labels[i].id = weight
                #    all_weight_table[doc] = labels[i]
        label_doc.sort(reverse=True)
        for k, v in all_weight_table.items():
            v.sort(reverse=True)
        # the map keys are consecutive integers, so hashing lays them out in ascending
        # order and the returned values are already sorted
        thread = SaveLabelsThread(all_weight_table, label_doc, self.entries, self.query)
        thread.start()
        return all_weight_table, label_doc

    def label_assign(self, docs, labels, lucene_ids):
        """Deprecated."""
        term_row = {}
        all = []
        ireader = IndexReader.open(STORE_DIR)
        total_terms = 0
        for i in range(len(lucene_ids)):
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i)
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i)

        # tokenize the labels
        analyzer = CJKAnalyzer()
        labelmatrix = zeros((len(all), len(labels)))
        label_term = []
        for i in range(len(labels)):
            if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
                label_term.append([])
                continue
            #print labels[i].text, labels[i].id
            stream = analyzer.tokenStream('', StringReader(labels[i].text))
            terms = []
            for token in stream:
                if term_row.has_key(token.term()):
                    # weighting
                    termdocs = ireader.termDocs(Term('summary', token.term()))
                    count = 0
                    span = 0
                    terms.append(token.term())
                    while termdocs.next():
                        count += termdocs.freq()
                        span += 1
                    weight = labels[i].label_weight
                    #if float(span) / ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                    #    weight = 0
                    labelmatrix[term_row[token.term()]][i] = weight
            label_term.append(terms)
        termmatrix = array(all)
        termmatrix = transpose(termmatrix)
        #for i in range(len(labelmatrix[0])):
        #for j in range(len(termmatrix[0])):
        # rows are docs, columns are labels
        #p = self.product(termmatrix, labelmatrix)
        d = dot(termmatrix, labelmatrix)
        result = d / (norm(labelmatrix) * norm(termmatrix))
        doc_label = []
        for i in range(len(result)):
            m = - 1
            index = - 1
            group = []
            for j in range(len(result[i])):
                if result[i][j] > 0:
                    labels[j].id = result[i][j]
                    group.append(labels[j])
            # Substring sorts by id, which is exactly what we rely on here
            group.sort()
            group.reverse()
            max_label = group[0]
            # i: doc number (its position among the docs)
            # label id
            # label score
            # if the label itself does not occur in the current doc
            if not max_label.doc_freq.has_key(i):
                #print 'oringial:', labels[index].text
                count = 0
                overlap = ''
                for k in label_term[index]:
                    if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                        overlap = k
                        print k
                        count += 1
                # at least one overlapping term, and its length is at least 2
                if count == 1 and len(overlap) >= 2:
                    new_label = pextractor.Substring()
                    new_label.text = overlap
                    new_label.id = m
                    doc_label.append(group[0])
                    continue
            #labels[index].id = m
            doc_label.append(group[0])
        return doc_label

    def tfidf(self, terms_length, tf, docs_length, dtf):
        """Compute the tf-idf value."""
        # results[i].freq - v may be 0, so subtract tf/2 instead
        return (tf / float(terms_length)) * abs(math.log((dtf - tf / 2.0) / (docs_length * 10)))

    def build(self, docs, query):
        self.entries = []
        self.query = query
        context = Context()
        results, lucene_ids, categories = self.get_cs_by_lucene_doc(docs, context)
        results_size = len(results)
        doc_size = len(docs)
        stroplabels = {}
        for i in range(results_size):
            # drop single function words such as the '在' in '在阿里巴巴';
            # similar cases are '...上', '...里' and '和...'
            stream2 = tokenizer.tokenize(results[i].text.encode('utf-8'))
            if len(stream2) == 2 and len(results[i].text) >= 3:
                if (len(stream2[0].decode('utf-8')) == 1 or len(stream2[1].decode('utf-8')) == 1):
                    results[i].text = len(stream2[0].decode('utf-8')) == 1 and stream2[1].decode('utf-8') or stream2[0].decode('utf-8')
                elif (len(stream2[1].decode('utf-8')) - len(stream2[0].decode('utf-8'))) == 1 \
                        and len(stream2[1].decode('utf-8')) == len(results[i].text):
                    results[i].text = stream2[0].decode('utf-8')
            if stroplabels.has_key(results[i].text):
                same_label_name = stroplabels[results[i].text]
                results[i].freq += same_label_name.freq
                for k, v in same_label_name.doc_freq.items():
                    if results[i].doc_freq.has_key(k):
                        results[i].doc_freq[k] += v
                    else:
                        results[i].doc_freq[k] = v
            stroplabels[results[i].text] = results[i]
        results = stroplabels.values()
        results_size = len(results)

        # phrase-doc matrix
        pdmatrix = zeros((results_size, doc_size))
        #for i in results:
        #    print i.text, i.freq
        d = len(context.tokens)
        if results_size == 0:
            print 'no frequent phrase!'
            return {}, []
        for i in range(results_size):
            doc_freq = results[i].doc_freq
            index = context.suffix[results[i].id]
            # if performance becomes a problem, title_field could be turned into a map
            is_title = False
            if index in context.title_field:
                is_title = True
            for k, v in doc_freq.items():
                # k is doc_id, v is doc freq
                # doc_length: the number of terms in the containing doc
                doc_length = context.term_doc_range[k] - (k and context.term_doc_range[k - 1])
                pdmatrix[i][k] = self.tfidf(doc_length, v, d, results[i].freq) * (is_title and TITLE_FIELD_BOOST or 1)
            results[i].label_weight = norm(pdmatrix[i])
            #if stroplabels.has_key(results[i].text):
            #    stroplabels[results[i].text].label_weight += results[i].label_weight
        if 1:
            # SVD (singular value decomposition)
            u, s, v = linalg.svd(pdmatrix)
            # probe for the rank of the matrix
            rank = 1
            for k in range(1, len(s)):
                if norm(s[:k]) / norm(s) >= CLT:
                    rank = k
                    break
            # take the first `rank` columns of u, pick the largest component and
            # remember the corresponding label index
            maxes = {}
            lo = {}
            for j in range(rank):
                m = - 100
                index = - 1
                for i in range(len(u)):
                    if u[i][j] > 0:
                        lo[i] = u[i][j]
                    if u[i][j] > m:
                        m = u[i][j]
                        # index is the corresponding label index
                        index = i
                # if the label was seen before, average the values
                if maxes.has_key(index):
                    maxes[index] = (maxes[index] + m) / 2.0
                else:
                    # store the maximum
                    maxes[index] = m
            labels = []
            # k is the label index, v the maximum value
            for k, v in maxes.items():
                #print results[k].text, k, v
                #sub = pextractor.Substring()
                #sub.text = results[k].text
                temp = re.sub('[0-9 ]', '', results[k].text)
                if temp == '':
                    results[k].text = temp
                results[k].is_candicate_label = True
                results[k].label_weight = v
                labels.append(results[k])
        return self.assign(docs, results, lucene_ids)
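A tiny illustrative check (not from the original source) of the tf-idf variant implemented by MatrixMapper.tfidf above. The numbers are made up purely to show how the arguments map onto the formula (tf / terms_length) * |log((dtf - tf/2) / (docs_length * 10))|.

import math

def tfidf_example():
    terms_length = 120    # assumed: number of terms in the containing doc
    tf = 4                # assumed: phrase frequency within that doc
    docs_length = 2000    # assumed: total number of tokens across all docs
    dtf = 9               # assumed: total frequency of the phrase across all docs
    value = (tf / float(terms_length)) * abs(math.log((dtf - tf / 2.0) / (docs_length * 10)))
    print "tf-idf:", value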
def a():
    import os
    #loader = BSDDictLoader()
    #dic = loader.load()
    words_dict = {}
    from dot.searcher import Searcher, STORE_DIR
    from apps.wantown import dao
    from apps.wantown.models import Entry
    searcher = Searcher()
    hits = searcher.search("java")
    docs = []
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        docs.append(doc)
    entries = []
    all = ''
    from dot.context import Context, Token
    context = Context()
    import re
    #all = re.sub('[0-9:;;/\(\)\t\[\]]()\**#&', '', all)
    #all = re.sub('[ +=-]', ' ', all)
    analyzer = StandardAnalyzer()
    # doc id
    id = 0
    allToken = []
    allText = []
    pureText = ''
    c = 0
    docRange = {}
    for doc in docs[0:100]:
        link = doc.get("link")
        entry = dao.get_by_link(link, Entry)
        entries.append(entry.summary)
        all = entry.summary[:200] + entry.title
        pureText += all
        tokenType = []
        last_type = ''
        #all = """提起电吉他演奏,就必须提到布鲁斯音乐;提起最伟大的吉他演奏大师,人们首先会想到的是 Jimi Hendrix,但是说起依然在世的最伟大的吉他演奏家,名字只有一个——Eric Clapton爵士。自从上个世纪60年代布鲁斯摇滚乐以及布鲁斯吉他演奏成为了主流摇滚风格之后,在这种来源于黑人音乐的吉他演奏中,在所有除黑色外其他肤色的布鲁斯吉他演奏家之中,传奇人物Eric Clapton毫无疑问是其中最杰出的一位。在与Eric Clapton同时代的所有艺术家纷纷的离开人世,或者失去了原有的歌迷号召力之后,Eric Clapton是所有当年这些艺术家中为数不多的既然保持着自己高超的演奏技术以及强大的市场号召力的艺术家。
        #
        #Eric Clapton为人谦逊,在与其他出色的吉他演奏者比如Jimi Hendrix,B.B. King,Duane Allman,甚至后辈Stevie Ray Vaughan相比较的时候他总是非常谦恭,在与B.B. King以及Bob Dylan等人同台的时候他总是举止非常礼让,他是最有绅士风度的流行音乐家之一。同时,作为世界上最著名的吉他大师,Eric Clapton还经常热心的帮助包括英国著名流行音乐家Sting,Bon Jovi乐队主音吉他手Richie Sambora在内的其他一些音乐家去录制专辑或者拍摄音乐录影带,并且经常为一些音乐家担任吉他手作伴奏。Eric Clapton曾经协助过Bob Dylan,Aretha Franklin,Joe Cocker,Ringo Starr,Freddie King,Roger Waters等等近百位艺术家的专辑录制。
        #"""
        stream = analyzer.tokenStream("fieldname", StringReader(all))
        for s in stream:
            #if (last_type == '<ALPHANUM>' or last_type == '<HOST>') and (s.type() == '<ALPHANUM>' or s.type() == '<HOST>'):
            #    all.append(' ')
            #    pass
            #last_type = s.type()
            token = Token()
            token.text = s.termText()
            token.offset = s.termLength()
            token.doc = id
            allToken.append(token)
            allText.append(s.term())
            print dir(s)
            c += 1
        docRange[len(allText)] = id
        #all = sorted(all, cmp=lambda x, y: cmp(x.termText(), y.termText()))
        id += 1
    context.tokens = allText
    #context.tokens.sort()
    #for i in context.tokens:
    #    print i
    #print s
    context.text = ''
    context.token_types = tokenType
    context.docs = entries
    context.term_doc_range = docRange
    print len(all)
    from dot.lingo import pextractor
    import time
    start = time.time()
    #pe = pextractor.PhraseExtractor()
    #results = pe.extract(context)
    count = 0
    r = docRange.keys()
    r.sort()
    if 0:
        for i in results:
            if len(i.text) > 1 and i.freq > 2 and len(i.text) < 20:
                id = i.id - 1
                lcp = context.lcp[id + 1]
                for f in range(i.freq):
                    begin = context.suffix[id]
                    end = context.suffix[id] + lcp
                    for j in range(len(r)):
                        if begin < r[j]:
                            break
                    doc = docRange[r[j]]
                    #print context.tokens[begin:end], i.freq, begin, doc
                    if end > r[j]:
                        print 'not in the same doc'
                    id += 1
                #print i.text.strip(), i.freq, i.doc_freq
    #print (time.time() - start)
    from dot.matrixmapper import MatrixMapper
    mapper = MatrixMapper()
    mapper.build(docs[:100])
    #print pureText
    import sys
    from dot.lingo import suffixsorter as ss
    #for i in range(len(context.suffix)):
    #    s = pe.list2str(context.tokens)
    #    sys.stdout.write('%d\t%d\t%s\n' % (context.suffix[i], context.lcp[i], context.tokens[context.suffix[i]:context.suffix[i] + 10]))
    #dm = getDictManager()
    #words_dict = featurex.tf_idf(entries, dm.seg_dict)
    #doc1 = featurex.Document(entries.encode('utf-8'), dm)
    #doc2 = featurex.Document(entries[0].encode('utf-8'), dm)
    #for i in words_dict.values():
    #    print i.word, i.frequency, i.feature_value, i.tfidf
    #print similitude_doc_cos(doc1, doc2)
    """
    ibm jdk 3 {3: 3}
    不同 3 {4: 2, 7: 1}
    使用 3 {8: 2, 7: 1}
    可以 10 {8: 3, 3: 2, 4: 2, 7: 3}
    处理 3 {8: 3}
    好的 3 {8: 1, 7: 2}
    字体 5 {8: 2, 4: 3}
    已经 4 {9: 1, 3: 1, 4: 1, 7: 1}
    平滑 4 {8: 1, 4: 3}
    应用 3 {8: 1, 4: 2}
    手机上 3 {7: 3}
    文本 3 {8: 3}
    游戏 4 {7: 4}
    环境 3 {1: 1, 3: 2}
    的java 6 {1: 1, 2: 1, 5: 1, 7: 3}
    的文 3 {8: 3}
    设置 5 {4: 5}
    软件 3 {1: 1, 7: 2}
    运行 3 {1: 1, 7: 2}
    """