class PhraseExtractor(object):
    """Extract candidate phrases ("complete substrings") from a token sequence.

    Builds suffix arrays and LCP arrays over the token sequence and over its
    reverse, derives right-complete substrings from the former and
    left-complete substrings from the latter, then intersects the two sets:
    a substring that is both left- and right-complete is a candidate
    phrase/label.
    """

    def __init__(self):
        # Part-of-speech dictionary used to filter out function words
        # (adverbs, conjunctions, pronouns, ...) from the candidates.
        self.loader = CixingDictLoader()
        self.dict = self.loader.load()

    def extract(self, context):
        """Return the candidate Substring objects found in context.tokens.

        ``context.tokens`` may be a plain string, or a list of mixed
        Chinese/English tokens.  Side effects: stores the forward suffix
        array in ``context.suffix`` and its LCP array in ``context.lcp``.
        """
        text = context.tokens
        text_inversed = text[::-1]
        suffix = suffixsorter.build_suffix_array(text)
        context.suffix = suffix
        # Suffix array of the reversed text: right-complete substrings of
        # the reversed text are left-complete substrings of the original.
        suffix_inversed = suffixsorter.build_suffix_array(text_inversed)
        lcp = suffixsorter.calculateLcp(text, suffix)
        context.lcp = lcp
        lcp_inversed = suffixsorter.calculateLcp(text_inversed, suffix_inversed)

        # Right-complete substrings.
        rcs = self.rcs(lcp, context)
        # Sort rcs by id.  This restores suffix-array (lexicographic) order,
        # which the stack in self.rcs() may have perturbed slightly.
        rcs.sort(key=lambda s: s.id)

        # Left-complete substrings.
        # NOTE(review): self.rcs() consults context.suffix / token_types,
        # which belong to the FORWARD text, while lcp_inversed indexes the
        # reversed text -- the indices look mismatched for this call;
        # confirm against the original intent before relying on the
        # ALPHANUM special case in the reversed pass.
        lcs = self.rcs(lcp_inversed, context)

        rcs_ordered = []
        for item in rcs:
            start = suffix[item.id]
            rcs_ordered.append(text[start:start + lcp[item.id]])
        lcs_ordered = []
        for item in lcs:
            start = suffix_inversed[item.id]
            lcs_ordered.append(
                text_inversed[start:start + lcp_inversed[item.id]][::-1])
        # rcs_ordered is already sorted (it follows suffix-array order);
        # only the left-complete side needs sorting before the merge.
        lcs_ordered.sort()
        return self.intersect_lcs_rcs(rcs, lcs, rcs_ordered, lcs_ordered,
                                      context)

    def rcs(self, lcp, context):
        """Return right-complete substrings via one stack pass over ``lcp``.

        Each result is a Substring(id, freq) where ``id`` indexes the suffix
        array: the substring is tokens[suffix[id]:suffix[id] + lcp[id]] and
        ``freq`` is its occurrence count.
        """
        N = len(lcp)
        result = []
        # Pre-sized stack of open Substring groups; sp is the top index.
        stack = [None] * max(N - 1, 0)
        sp = -1
        i = 1
        while i < N:
            if sp < 0:
                # Empty stack: open a new group when the common prefix is
                # long enough -- or is a single alphanumeric token of at
                # least 3 characters -- and not longer than MAX_PHRASE_LEN.
                if ((lcp[i] == 1
                     and context.token_types[context.suffix[i]] == '<ALPHANUM>'
                     and len(context.tokens[context.suffix[i]]) >= 3)
                        or lcp[i] >= MIN_PHRASE_LEN) \
                        and lcp[i] <= MAX_PHRASE_LEN:
                    sp += 1
                    stack[sp] = Substring(id=i, freq=2)
                i += 1
            else:
                r = stack[sp].id
                if lcp[r] < lcp[i]:
                    # A strictly longer common prefix: a new (nested)
                    # substring group begins here.
                    sp += 1
                    stack[sp] = Substring(id=i, freq=2)
                    i += 1
                elif lcp[r] == lcp[i]:
                    # Equal LCP means the same substring occurs once more
                    # (the suffix array is lexicographically ordered).
                    stack[sp].freq += 1
                    i += 1
                else:
                    # lcp[r] > lcp[i]: the group on top of the stack is
                    # complete -- emit it to the result.
                    result.append(stack[sp])
                    f = stack[sp].freq
                    sp -= 1
                    if sp >= 0:
                        # Every occurrence of the popped (longer) substring
                        # also contains the enclosing (shorter) one.
                        stack[sp].freq = stack[sp].freq + f - 1
                    if MIN_PHRASE_LEN <= lcp[i] <= MAX_PHRASE_LEN and sp < 0:
                        sp += 1
                        stack[sp] = Substring(id=i, freq=2 + f - 1)
                        i += 1
        return result

    def list2str(self, lst):
        """Join a list of tokens into a single string.

        A space is inserted between two consecutive alphanumeric tokens;
        a lone non-alphanumeric character immediately preceding the first
        alphanumeric token is dropped.  Strings pass through unchanged.
        """
        # Already a string: return it as-is.
        if isinstance(lst, str):
            return lst
        s = ''
        is_alpha = False   # previous token contained [0-9a-zA-Z]
        lastchar = ''
        count = 0
        for k in lst:
            # Tokens may be plain strings or objects carrying .text.
            if not isinstance(k, str):
                k = k.text
            tt = re.search('[0-9a-zA-Z]', k)
            # Separate two adjacent alphanumeric tokens with a space.
            if is_alpha and tt:
                s += " "
            # Drop a single non-alphanumeric leading character when the
            # second token is the first alphanumeric one.
            if not is_alpha and tt and count == 1:
                if len(lastchar) == 1:
                    s = s[1:]
            is_alpha = tt
            lastchar = k
            s += k
            count += 1
        return s

    def intersect_lcs_rcs(self, rcs, lcs, ordered_rcs, ordered_lcs, context):
        """Merge-intersect the left- and right-complete substring lists.

        Both ordered_* lists are sorted, so a linear merge finds substrings
        that are both left- and right-complete.  Returns the matching
        Substring objects from ``rcs`` (rcs is already sorted while lcs is
        not), with .text set and per-document occurrence counts accumulated
        into .doc_freq.
        """
        i = 0
        j = 0
        results = []
        tdr = context.term_doc_range
        while i < len(ordered_lcs) and j < len(ordered_rcs):
            # The ordered_* entries may be token lists, which are mutable
            # and cannot serve as dict keys -- normalize both sides to
            # plain strings before comparing.
            l = self.list2str(ordered_lcs[i])
            r = self.list2str(ordered_rcs[j])
            if l == r:
                # Skip function words (adverbs, conjunctions, pronouns...).
                if l in self.dict:
                    i += 1
                    j += 1
                    continue
                # Keep labels short: overly long all-Chinese candidates are
                # likely ad/boilerplate text repeated across RSS articles.
                if len(re.sub('[a-zA-Z0-9]', '', l)) >= MAX_CHINESE_LABEL_LEN:
                    i += 1
                    j += 1
                    continue
                rcs[j].text = l
                # Count this substring's occurrences per document: its freq
                # occurrences occupy consecutive suffix-array slots starting
                # one position before rcs[j].id.
                pos = rcs[j].id - 1
                plen = context.lcp[pos + 1]
                for _ in range(rcs[j].freq):
                    begin = context.suffix[pos]
                    # Map the occurrence's start offset to a document via
                    # the cumulative term ranges in tdr.
                    n = 0
                    for n in range(len(tdr)):
                        if begin < tdr[n]:
                            break
                    doc_id = n
                    rcs[j].doc_freq[doc_id] = rcs[j].doc_freq.get(doc_id, 0) + 1
                    pos += 1
                # Q: Why return rcs's substrings rather than lcs's?
                # A: rcs is sorted already while lcs is not.
                results.append(rcs[j])
                i += 1
                j += 1
            elif l < r:
                i += 1
            else:
                j += 1
        return results