# -*- coding: utf-8 -*-
# Assumed PyLucene flat-namespace imports; STORE_DIR, pextractor, dao and
# SaveLabelsThread are defined elsewhere in this project.
from lucene import (IndexReader, IndexWriter, StandardAnalyzer, CJKAnalyzer,
                    Document, Field, Term, TermPositionVector, RAMDirectory,
                    StringReader)
from numpy import zeros, array, transpose, dot
from numpy.linalg import norm


def loadterms():
    """Build a term-frequency matrix from the 'summary' field of the index."""
    ireader = IndexReader.open(STORE_DIR)
    a = ireader.terms()
    # colnames holds the text (Chinese or English) of each term
    colnames = []
    # data is the term-frequency matrix: one row per term, one column per doc
    data = []
    i = 0
    while a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            if term.text() == '':
                break
            i += 1
            if i == 1000:  # only keep the first 1000 summary terms
                break
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                # fill in a frequency of 0 for documents that do not
                # contain the current term
                doc_id = docs.doc()
                while lastdoc < doc_id:
                    vector.append(0)
                    lastdoc += 1
                vector.append(docs.freq())
                lastdoc = doc_id + 1
            # pad trailing documents so every row has maxDoc() entries
            while lastdoc < ireader.maxDoc():
                vector.append(0)
                lastdoc += 1
            data.append(vector)
    ireader.close()
    return colnames, data
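# A minimal usage sketch (hypothetical helper, not part of the original
# module). It assumes STORE_DIR points at an existing index with a 'summary'
# field; colnames and the rows of data line up by position.
def _loadterms_demo():
    colnames, data = loadterms()
    for term_text, freqs in zip(colnames, data):
        # each row is the term's per-document frequency vector
        print '%s appears in %d of %d docs' % (
            term_text, len([f for f in freqs if f]), len(freqs))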
def label_assign(self, docs, labels, lucene_ids):
    """Assign each document the best-scoring label (cosine-style scoring)."""
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)
    # tokenize the labels
    analyzer = CJKAnalyzer()
    labelmatrix = zeros((len(all), len(labels)))
    label_term = []
    for i in range(len(labels)):
        if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
            label_term.append([])
            continue
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        for token in stream:
            if term_row.has_key(token.term()):
                # weighting
                termdocs = ireader.termDocs(Term('summary', token.term()))
                count = 0
                span = 0
                terms.append(token.term())
                while termdocs.next():
                    count += termdocs.freq()
                    span += 1
                weight = labels[i].label_weight
                # disabled heuristic: zero the weight of non-ASCII terms that
                # span at least 18% of the documents
                #if float(span) / ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                #    weight = 0
                labelmatrix[term_row[token.term()]][i] = weight
        label_term.append(terms)
    termmatrix = transpose(array(all))  # rows are docs, columns are terms
    # rows of result are docs, columns are labels
    d = dot(termmatrix, labelmatrix)
    result = d / (norm(labelmatrix) * norm(termmatrix))
    doc_label = []
    for i in range(len(result)):
        group = []
        for j in range(len(result[i])):
            if result[i][j] > 0:
                labels[j].id = result[i][j]
                group.append((result[i][j], j))  # keep the label index with its score
        if not group:  # no label scored above zero for this doc
            continue
        group.sort(reverse=True)
        m, index = group[0]  # best score and the index of the winning label
        max_label = labels[index]
        # if the label itself never occurs in the current doc, fall back to
        # the term it shares with the doc
        if not max_label.doc_freq.has_key(i):
            count = 0
            overlap = ''
            for k in label_term[index]:
                if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                    overlap = k
                    count += 1
            # exactly one shared term, of length >= 2
            if count == 1 and len(overlap) >= 2:
                new_label = pextractor.Substring()
                new_label.text = overlap
                new_label.id = m
                doc_label.append(new_label)
                continue
        doc_label.append(max_label)
    return doc_label
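# Toy sketch of the scoring step above, with made-up numbers (hypothetical
# helper, not part of the original module): the doc/label score is the dot
# product of the doc-term and term-label matrices, scaled by the product of
# their Frobenius norms (what numpy.linalg.norm returns for 2-D arrays).
def _scoring_demo():
    termmatrix = array([[1, 0, 2],    # doc 0 term frequencies
                        [0, 3, 0]])   # doc 1 term frequencies
    labelmatrix = array([[0.5, 0.0],  # weight of term 0 in each label
                         [0.0, 1.0],
                         [0.5, 0.0]])
    result = dot(termmatrix, labelmatrix) / (norm(labelmatrix) * norm(termmatrix))
    print result  # result[i][j]: how well label j fits doc i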
def assign(self, docs, labels, lucene_ids):
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    term_doc_freq = {}
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
        # TODO: boost terms that belong to the title
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
    # tokenize the labels
    analyzer = CJKAnalyzer()
    label_term = []
    all_weight_table = {}  # doc -> the labels assigned to that doc
    label_doc = []         # label -> [summed weight, text, covered doc count]
    label_doc_map = {}
    for i in range(len(labels)):
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        c = 0               # number of the label's terms processed so far
        weight_row = {}     # doc id -> accumulated label/doc weight
        nonzero_index = []  # docs that contain the label's first term
        is_incomplete = False
        for token in stream:
            term = token.term()
            if term_row.has_key(term):
                row = term_row[term]
                terms.append(term)
                docs_with_current_term = all[row]
                for j in range(len(docs_with_current_term)):
                    if docs_with_current_term[j] != 0:
                        if c == 0:
                            nonzero_index.append(j)
                        if c == 0 or j in nonzero_index:
                            weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight
                        else:
                            # the doc has this term but lacked the first one;
                            # add 1 so the product cannot be 0, then multiply
                            # by -100 to force the minimum weight, i.e. the
                            # label does not fit this doc
                            weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (-100)
                    elif docs_with_current_term[j] == 0 and j in nonzero_index:
                        # the doc had the first term but lacks this one;
                        # add 1 so the product cannot be 0
                        weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (-100)
                c += 1
            else:
                is_incomplete = True
        label_term.append(terms)
        # bugfix: if not every term of the tokenized label occurs among the
        # documents' terms, discard the label entirely
        if is_incomplete:
            weight_row = {}
        for doc, weight in weight_row.items():
            last = all_weight_table.get(doc)
            if weight > 0:
                if not label_doc_map.has_key(labels[i].text):
                    kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                    label_doc.append([0, labels[i].text, 0])
                    label_doc_map[labels[i].text] = len(label_doc) - 1
                new_label = pextractor.Substring()
                new_label.text = labels[i].text
                new_label.id = weight
                if last:
                    all_weight_table[doc].append(new_label)
                else:
                    all_weight_table[doc] = [new_label]
                label_doc[label_doc_map[labels[i].text]][2] += 1
                label_doc[label_doc_map[labels[i].text]][0] += weight
    label_doc.sort(reverse=True)
    # the keys of all_weight_table are consecutive integers stored in
    # ascending order, so it remains to sort each doc's labels by weight
    for k, v in all_weight_table.items():
        v.sort(reverse=True)
    thread = SaveLabelsThread(all_weight_table, label_doc, self.entries, self.query)
    thread.start()
    return all_weight_table, label_doc
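# Hedged consumption sketch (hypothetical driver, not part of the original
# module; 'clusterer' stands in for whatever object provides assign(), and
# docs/labels/lucene_ids come from the surrounding pipeline). assign()
# returns all_weight_table (doc index -> its labels, best first) and
# label_doc ([summed weight, text, doc count] rows, best label first).
def _assign_demo(clusterer, docs, labels, lucene_ids):
    all_weight_table, label_doc = clusterer.assign(docs, labels, lucene_ids)
    for total_weight, text, doc_count in label_doc[:5]:
        print '%s: %d docs, total weight %.2f' % (text, doc_count, total_weight)
    for doc_index, doc_labels in all_weight_table.items():
        best = doc_labels[0]  # labels are sorted by descending weight
        print 'doc %d -> %s (%.2f)' % (doc_index, best.text, best.id)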
if __name__ == '__main__':
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, StandardAnalyzer(), True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方",
          "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
    for i, (t, f) in enumerate(zip(tpv.getTerms(), tpv.getTermFrequencies())):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                                   for o in tpv.getOffsets(i)])
        except:
            print '  no offsets'