Example #1
import sys
from lucene import initVM, Version, \
    WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, StandardAnalyzer, \
    CJKAnalyzer
# AnalyzerUtils comes from the Lucene in Action sample code (lia
# package); a minimal stand-in is sketched after this example.
from lia.analysis.AnalyzerUtils import AnalyzerUtils


class AnalyzerDemo(object):

    examples = ["http://www.baidu.com/ www.baidu.com",
                "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')]
    
    analyzers = [WhitespaceAnalyzer(),
                 SimpleAnalyzer(),
                 StopAnalyzer(Version.LUCENE_CURRENT),
                 StandardAnalyzer(Version.LUCENE_CURRENT),
                 CJKAnalyzer(Version.LUCENE_CURRENT)]

    @classmethod
    def main(cls, argv):

        # Use the embedded example strings unless command-line
        # arguments were given, in which case analyze those instead.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    @classmethod
    def analyze(cls, text):

        print 'Analyzing "%s"' %(text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" %(name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

if __name__ == '__main__':
    # start the JVM before touching any Lucene class (PyLucene 3.x API)
    initVM()
    AnalyzerDemo.main(sys.argv)
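AnalyzerDemo relies on AnalyzerUtils.displayTokens from the Lucene in
Action sample code. If that package is not at hand, a minimal stand-in
is easy to sketch; this version assumes the enumeration-style
TokenStream API that Examples #2 and #3 below also use (iterating the
stream and calling token.term()), so treat it as a sketch rather than
the shipped helper.

from lucene import StringReader

class AnalyzerUtils(object):

    @classmethod
    def displayTokens(cls, analyzer, text):
        # run the analyzer over the text and print each token inline
        stream = analyzer.tokenStream('contents', StringReader(text))
        for token in stream:
            print '[%s]' % token.term(),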
Example #2
 def label_assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i)
     
     # Tokenize each label
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
     for i in range(len(labels)):
         if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
             label_term.append([])
             continue
         #print labels[i].text,labels[i].id
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []
         for token in stream:
             if term_row.has_key(token.term()):
                 # weighting
                 termdocs = ireader.termDocs(Term('summary', token.term()))
                 count = 0
                 span = 0
                 terms.append(token.term())
                 while termdocs.next():
                     count += termdocs.freq()
                     span += 1
                 weight = labels[i].label_weight
                 #if float(span)/ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                     #weight = 0
                 labelmatrix[term_row[token.term()]][i] = weight
         label_term.append(terms)
     termmatrix = array(all)
     termmatrix = transpose(termmatrix)
     #for i in range(len(labelmatrix[0])):
         #for j in range(len(termmatrix[0])):
     
     # rows are docs, columns are labels
     #p = self.product(termmatrix,labelmatrix)
     d = dot(termmatrix, labelmatrix)
     result = d / (norm(labelmatrix) * norm(termmatrix))
     doc_label = []
     for i in range(len(result)):
         m = -1
         index = -1
         group = []
         for j in range(len(result[i])):
             if result[i][j] > 0:
                 labels[j].id = result[i][j]
                 group.append(labels[j])
         # Substring objects sort by id (the score), so sort and
         # reverse to put the highest-weighted label first
         if not group:
             continue
         group.sort()
         group.reverse()
         max_label = group[0]
         # i: the doc's position in the docs list; each label carries
         # its id and score
         # If the label itself never occurs in the current doc, fall
         # back to a term it shares with the doc
         if not max_label.doc_freq.has_key(i):
             #print 'original:', labels[index].text
             count = 0
             overlap = ''
             for k in label_term[index]:
                 if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                     overlap = k
                     print k
                     count += 1
             # require at least one overlapping term of length >= 2
             if count == 1 and len(overlap) >= 2:
                 new_label = pextractor.Substring()
                 new_label.text = overlap
                 new_label.id = m
                 doc_label.append(new_label)
                 continue
                     
         #labels[index].id = m
         doc_label.append(group[0])
     return doc_label
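A note on the scoring step: result = d / (norm(labelmatrix) * norm(termmatrix))
divides every dot product by the product of the two matrices' norms, a
single global rescaling rather than a per-pair cosine similarity. If a
true cosine were intended, a sketch (hypothetical helper; assumes norm
is numpy.linalg.norm, so norm(matrix) is the Frobenius norm) would
normalize each doc row against each label column:

from numpy import dot
from numpy.linalg import norm

def cosine_scores(termmatrix, labelmatrix):
    # termmatrix: docs x terms, labelmatrix: terms x labels
    scores = dot(termmatrix, labelmatrix).astype(float)
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            denom = norm(termmatrix[i]) * norm(labelmatrix[:, j])
            if denom != 0:
                scores[i][j] /= denom
    return scores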
Example #3
 def assign(self, docs, labels, lucene_ids):
     term_row = {}
     all = []
     ireader = IndexReader.open(STORE_DIR)
     total_terms = 0
     term_doc_freq = {}
     for i in range(len(lucene_ids)):
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
         """
             TODO:给属于标题的term加权
         """
         tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
         self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
     #for k, v in term_doc_freq.items():
     #    if v > 3:
     #        print k, v
     # Tokenize each label
     analyzer = CJKAnalyzer()
     labelmatrix = zeros((len(all), len(labels)))
     label_term = []
     # doc -> label: the labels assigned to each doc
     all_weight_table = {}
     # label -> doc: the docs matched by each label
     label_doc = []
     label_doc_map = {}
     for i in range(len(labels)):
         nonzero_table = []
         # weight products of this label against every doc
         weight_table = []
         
         stream = analyzer.tokenStream('', StringReader(labels[i].text))
         terms = []            
         c = 0
         weight_row = {}
         nonzero_index = []  
         is_incomplete = False
         for token in stream:
             term = token.term()
             #print term
             if term_row.has_key(term):
                 row = term_row[term]
                 terms.append(term)
                 docs_with_current_term = all[row]
                 for j in range(len(docs_with_current_term)):
                     if docs_with_current_term[j] != 0:                                            
                         if c == 0:
                             nonzero_index.append(j)
                         if c == 0 or j in nonzero_index:
                             weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight 
                         else:
                             # add 1 so the weight product cannot be 0;
                             # a doc that was in nonzero_index at first but
                             # stops matching is multiplied by -100, making
                             # its product minimal: this label does not
                             # apply to that doc
                             weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (-100)
                     # a doc that was not in nonzero_index at first
                     # but matches a later term of the label
                     elif docs_with_current_term[j] == 0 and j in nonzero_index:
                         # add 1 so the weight product cannot be 0
                         weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (-100)
                 c += 1
             else:
                 is_incomplete = True
         label_term.append(terms)
         # bugfix: if any term of this label is missing from the docs'
         # term set after tokenization, the label is incomplete; drop it
         if is_incomplete:
             weight_row = {}
                 
                 
         for doc, weight in weight_row.items():
             last = all_weight_table.get(doc)                
             if weight > 0:
                 if not label_doc_map.has_key(labels[i].text):    
                     kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                     #label_doc.append([ 0,labels[i].text,[]])
                     label_doc.append([ 0,labels[i].text,0])
                     label_doc_map[labels[i].text] = len(label_doc) - 1
                 new_label = pextractor.Substring()
                 new_label.text = labels[i].text
                 new_label.id = weight
                 if last:
                     all_weight_table[doc].append(new_label)
                 else:
                     all_weight_table[doc] = [new_label]
                 #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                 label_doc[label_doc_map[labels[i].text]][2] += 1
                 label_doc[label_doc_map[labels[i].text]][0] += weight
                 
                 #try:
                  #   category = dao.save_category(labels[i].text, weight, 'd')
                   #  entry = self.entries[doc]
                    # ec = dao.save_entry_cat(entry, category, weight)
                 #except Exception,e:
                  #   print e
                 
                 #if last:
                  #   all_weight_table[doc].append(ec)
                 #else:
                  #   all_weight_table[doc] = [ec]
             # If the doc already has a label, compare the stored
             # doc-label weight with the current one and keep the
             # larger-weighted label
             #if last:
             #    if last.id < weight and weight > 0:
              #       labels[i].id = weight
               #      all_weight_table[doc] = labels[i]
             #else:
              #   labels[i].id = weight
               #  all_weight_table[doc] = labels[i]
     label_doc.sort(reverse=True)
     for k, v in all_weight_table.items():
         v.sort(reverse=True)
             
     # Because the map keys are consecutive integers, the hash layout
     # stores them in ascending order, so the returned values are
     # already sorted
     thread = SaveLabelsThread(all_weight_table,label_doc,self.entries,self.query)
     thread.start()
     return all_weight_table,label_doc
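Both label_assign() and assign() lean on pextractor.Substring ordering:
group.sort() and v.sort(reverse=True) sort label objects by the score
stashed in their id field. The real class lives in the project's
pextractor module; a minimal Python 2 sketch of the ordering those
sorts require would be:

class Substring(object):

    def __init__(self, text='', id=0):
        self.text = text
        self.id = id        # reused by the callers as the label score
        self.doc_freq = {}  # doc position -> term frequency

    def __cmp__(self, other):
        # Python 2 ordering hook: sort by score
        return cmp(self.id, other.id)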
Example #4
File: cnanalyzer.py Project: fay/wt
class StopAnalyzer2(PythonAnalyzer):

    def __init__(self, stopWords=None):
        # JCC extension classes must initialize their Java superclass
        super(StopAnalyzer2, self).__init__()

        if stopWords is None:
            self.stopWords = StopAnalyzer.ENGLISH_STOP_WORDS
        else:
            self.stopWords = stopWords

    def tokenStream(self, fieldName, reader):

        return StopFilter(LowerCaseFilter(LetterTokenizer(reader)),
                          self.stopWords)
if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    # index with the CJKAnalyzer under test
    iwriter = IndexWriter(directory, analyzer, True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方", "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
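The snippet ends after casting the term vector. To inspect what the
analyzer produced, the pre-4.0 term-vector API exposes the terms with
their frequencies and positions; a short continuation sketch under the
same API assumptions as the code above:

    # dump the indexed terms, their frequencies, and their positions
    terms = tpv.getTerms()
    freqs = tpv.getTermFrequencies()
    for i in range(len(terms)):
        positions = list(tpv.getTermPositions(i))
        print terms[i], freqs[i], positions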