def save(para={}, match=False):
    ''' Save the results; `match` indicates whether to match the dictionary directly '''
    src_fp = r'../COAE2011_Corpus_All_Text/'      # r'./test' #
    kind = ['seg4_fin', 'seg4_ent', 'seg4_dig']   # ['temp'] #
    suf_save = '_pku3'            # suffix of the output directory
    union_format = lambda x: x    # no format conversion by default
    if len(para) != 0:
        kind = ['ict_fin', 'ict_ent', 'ict_dig']  # use the ict file sources
        suf_save = '_pku3_ict'
        union_format = format_rev
    pos_N = ['ns', 'nz', 'nh', 'ni', 'n', 'j', 'ws', 'nl', 'nt']
    pos_A = ['a', 'b']
    for f in kind:
        src_p = os.path.join(src_fp, f)
        odif = Dir_File(src_p)
        sentences = odif.key_value(model='rb')  #; return sentences
        data = {}
        # print '==== len: %d ====' % len(sentences), sentences
        # TODO refactor the PatternDrag usage: construct the instance once outside
        # the for loop and only feed it the newly added data inside the loop
        for k in sentences.keys():
            v = union_format(sentences[k])  #; return v
            # print '==== src_fPath: %s/%s, lines-sum: %d' % (src_p, k, len(v))
            emotion = PatternDrag(v, pos_N, pos_A)
            emotion.monitor_params(para)        # adjust parameters
            tmp = emotion.emotion_pattern()     #; return tmp
            data[k] = emotion.format_emo(tmp)   #; return data[k]
        dst_p = os.path.join(src_fp, ''.join([f, suf_save]))  # one result file per category
        print "==== save():: write back data, path: %s,\n==== data-size: %d" % (dst_p, len(data))
        odif.key_value(data, 'wb', dst_p)
    return
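# Usage sketch: any non-empty `para` switches the input set from the seg4_* to the
# ict_* sources and is forwarded to PatternDrag.monitor_params(). The 'window' key
# below is a hypothetical example -- the keys monitor_params() accepts are not
# defined in this file.
if __name__ == '__main__':
    save()                    # seg4_* sources -> *_pku3 result files
    save(para={'window': 2})  # ict_* sources  -> *_pku3_ict result files (key assumed)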
def __init__(self, Pdir, words, codes='gbk'):
    ''' Load the content of a file or directory into a list '''
    assert isinstance(Pdir, str) and len(words) > 0, "ERROR: parameter type"
    self.dir = Pdir
    self.list = []        # three-level list: file -> line -> column
    self.files = []       # file list collected while walking the directory
    self.freq = {}        # frequency units {word: f}, aligned with self.list
    self.modify = False   # flag: list/file data has been modified
    # self.sentence = []  # two-level list; level one holds segmented sentences
    self.size = 0         # total word count
    self.data_size = 0    # sum() of the external collection
    self.PMIwords = list(set(words))  # word set for similarity computation
    self.PMIfactors = {}  # denominators of the PMI words, square root not yet taken
    self.content = {}     # word contexts (list first, then dict); PMI feature vectors
    self.simi = {}        # similarities of the target words
    self.window = 2       # context window size
    self.decomp = True    # flag: keys carry no POS, Chinese characters only
    self.codes = codes
    # if len(data) != 0: return None
    tdir = Dir_File(Pdir)
    if os.path.isdir(Pdir):
        temp = tdir.key_value(model='rb')  # returns a dict; split K/V into lists
        self.files = temp.keys()
        self.list = [temp[f] for f in self.files]
    else:
        self.files = [Pdir]
        self.list = [tdir.oper_file(Pdir)]
    for i in range(len(self.list)):
        l = [l for l in self.list[i] if len(l) > 2]  # keep only units longer than 2
        self.list[i] = l
    # check that the encodings agree; requires the first unit of the list to be non-empty
    assert words[0].decode(self.codes) and self.list[0][0][0].decode(self.codes), \
        "init::ERROR: PMIwords or file's code-type different"
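# The fields above (window, content, PMIfactors, simi) point at the usual
# PMI-weighted context-vector similarity. A minimal sketch of that computation,
# assuming raw corpus counts are already available (the helper names below are
# illustrative, not part of this class):
import math

def pmi(co_occur, freq_w, freq_c, total):
    ''' Pointwise mutual information from raw counts:
        PMI(w, c) = log( p(w, c) / (p(w) * p(c)) ) '''
    return math.log(float(co_occur) * total / (freq_w * freq_c))

def cosine_simi(vec_a, vec_b):
    ''' Cosine similarity of two sparse {context_word: PMI} vectors; the vector
        norms play the role of the per-word denominators kept in self.PMIfactors. '''
    shared = set(vec_a) & set(vec_b)
    dot = sum(vec_a[c] * vec_b[c] for c in shared)
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)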
def save(ictF=False):
    ''' Match the dictionary directly and save the results '''
    from util_file import Dir_File
    from t3_pku import format_rev
    # load the dictionaries; set the output file names
    rneg = [r'./test/remark_neg.txt', r'./test/feel_neg.txt']; wneg = 'hit_neg.txt'
    rpos = [r'./test/remark_pos.txt', r'./test/feel_pos.txt']; wpos = 'hit_pos.txt'
    para = {'code': 'gbk', 'num': 3, 'wpos': 0, 'mpos': 1, 'len': 2, 'dict_type': '',
            'npos': ['n', 'nd', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz'], 'vpos': ['v']}
    mpos = 1; punc = 'w'  # parameters for multi_match()
    src_fp = r'../COAE2011_Corpus_All_Text/'     # r'./test' #
    kind = ['seg4_fin', 'seg4_ent', 'seg4_dig']  # ['temp'] #
    union_format = lambda x: x  # no format conversion by default
    if ictF:
        kind = ['ict_dig', 'ict_fin', 'ict_ent']  # use the ict file sources
        wneg = 'hit_neg_hit.txt'
        wpos = 'hit_pos_hit.txt'
        para['npos'] = ['n', 'nr', 'nr1', 'nr2', 'nrj', 'nrf', 'ns', 'nsf', 'nt', 'nz', 'nl', 'ng']
        para['vpos'] = ['v', 'vn', 'vf', 'vx', 'vi', 'vl', 'vg']
        union_format = format_rev
    # pos_N = ['ns','nz','nh','ni','n','j','ws','nl','nt']
    # pos_A = ['a','b']
    for f in kind:
        src_p = os.path.join(src_fp, f)
        saveN = os.path.join(src_fp, '_'.join([f, wneg]))
        saveP = os.path.join(src_fp, '_'.join([f, wpos]))  #; print src_p, saveN, saveP; return
        para['dict_type'] = 'n'
        m_neg = SegmentFilter(rneg, saveN, None, para)
        para['dict_type'] = 'p'
        m_pos = SegmentFilter(rpos, saveP, None, para)
        odif = Dir_File(src_p)
        sentences = odif.key_value(model='rb')  #; return sentences
        data = {}
        for k in sentences.keys():
            v = union_format(sentences[k])  #; print k #; return v
            m_neg.multi_match(v, punc, fname=k)
            m_pos.multi_match(v, punc, fname=k)
            # if k == "D08934.txt": return [m_pos, sentences[k]]
        m_neg.save()
        print "---- save: %s, data-size: %d ----" % (m_pos.spath, len(m_pos.hit))
        m_pos.save()
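# Usage sketch: both corpus layouts back to back. The output file names follow from
# '_'.join([f, wneg]) above, e.g. seg4_fin_hit_neg.txt or ict_fin_hit_neg_hit.txt.
if __name__ == '__main__':
    save()           # seg4_* sources
    save(ictF=True)  # ict_* sources with the richer ICTCLAS POS sets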
class UtilWork:
    ''' Load answers into a list, following the COAE-gold-task1 file structure '''
    def __init__(self, fpath):
        self.path = fpath
        self.list = []
        self.content = Dir_File(self.path)  # file I/O
        self.filter = LineOperator()

    def oper_file(self, model='rb'):
        ''' Read the file and convert it into a two-level list '''
        self.list = self.content.oper_file(self.path, model=model)
        # self.list = self.filter(data=self.list)  # yields a two-level list

    def oper_dir(self, pdir='', model='rb'):
        ''' Operate on a directory '''
        pass

    def para_monitor(self, news=[]):
        pass
class SegmentFilter:
    ''' Matches segmented text against a single-polarity (n|p) dictionary (with length
    filtering), using 2-gram matching; hits are stored as [pre1, pre2, term, suf1, suf2]
    with the Docid appended at the end '''
    def __init__(self, dict_path, save_pt, line=None, params=None):
        # default initialisation parameters
        para = {'dict_type': 'n', 'code': 'gbk', 'num': 3, 'wpos': 0, 'mpos': 1,
                'len': 2, 'npos': ['n'], 'vpos': ['v']}
        if params:
            para = update_value(para, params, True)  # merge the user parameters
        self.codes = para['code']           # file encoding
        self.num = para['num']              # number of words kept before/after a hit
        self.wpos = para['wpos']            # index of the word within a list atom
        self.mpos = para['mpos']            # index of the POS within a list atom
        self.len_dict = para['len']         # lower bound on dictionary-entry byte length
        self.npos = para['npos']            # noun POS tags accepted for dictionary hits
        self.vpos = para['vpos']            # verb POS tags
        self.dict_type = para['dict_type']  # dictionary polarity
        self.spath = save_pt                # output path for the hit results
        self.line = line                    # sentence(s) to analyse
        if self.line:
            self.__normal()
        self.dict = []                      # dictionary content
        self.hit = []                       # hit results
        # file helper; defaults to the parent directory of the output file
        self.oper = Dir_File(os.path.dirname(save_pt))
        if line:  # encoding check
            self.line[0][self.wpos].decode(self.codes)
        if isinstance(dict_path, list):
            fps = [file(i, 'rb') for i in dict_path]
        else:
            fps = [file(dict_path, 'rb')]
        for l in fps:
            # drop entries at or below the length threshold (filters single characters)
            tlines = [one.strip() for one in l.readlines() if len(one.strip()) > self.len_dict]
            l.close()
            assert tlines[0].decode(self.codes), "!!!! init::ERROR try decode failed !!!!"
            self.dict.extend(tlines)

    def update(self, line, save=None):
        if not isinstance(line, list) or len(line) == 0:
            print "!!!! update::WARN data's size too small !!!!"
            return False
        line[0][self.wpos].decode(self.codes)  # encoding check
        self.line = line
        self.__normal()
        if save and isinstance(save, str):
            self.spath = save

    def save(self, path=None, mode='ab'):
        ''' Write the two-level hit list to disk '''
        if len(self.hit) == 0:
            print "!!!! save::WARN hit-data is empty !!!!"
            return
        self.hit.append(['', ''])  # blank atom so append-mode writes add a line break
        if path:
            self.oper.oper_file(path, mode, self.hit)
        else:
            self.oper.oper_file(self.spath, mode, self.hit)
        self.hit = []  # clear the buffer

    def __normal(self):
        ''' Normalise the data: drop units shorter than 2 '''
        if len(self.line) == 0:
            return False
        self.line = [l for l in self.line if len(l) > 1]
        return True

    def __pos_filter(self, atom):
        ''' Filter by the N/V POS sets. TODO: compute per-class confidence '''
        if len(self.npos) == 0 or len(self.vpos) == 0:
            print "!!!! __pos_filter::ERROR POS-values is un-initialed !!!!"
            return False  # skip POS validation
        if len(atom) > self.mpos and (atom[self.mpos] in self.npos or atom[self.mpos] in self.vpos):
            if self.dict_type == "p":  # for V|N, filter out positive polarity
                return True
        return False

    def __segment_combine(self, wid, sz, size=6, punc='w', apos='a'):
        ''' Merge a word whose byte length (not character count) < size with its
        predecessor, rewrite its POS, and retry the match '''
        if wid == 0 or sz == 1 or \
           punc in self.line[wid-1][self.mpos] or \
           len(self.line[wid][self.wpos]) > size:
            return False
        tword = ''.join([self.line[wid-1][self.wpos], self.line[wid][self.wpos]])
        if tword in self.dict:
            self.line[wid][self.wpos] = tword
            self.line[wid][self.mpos] = apos
            return True
        return False

    def match(self, punc='w', fname=''):
        ''' Match the sentences (a two-level list) against the dictionary; mpos is the
        POS index within an atom; fname is appended to each matched atom '''
        if len(self.line) == 0:
            print "!!!! match::ERROR data is empty !!!!", fname
            return False
        assert self.__normal(), "!!!! normal::ERROR !!!!"
        sz = len(self.line)
        hit_pos = -1  # word id of the previous match
        for i in range(sz):
            atom = self.line[i][:]  #; print atom[self.wpos], atom; return
            if self.__pos_filter(atom):
                continue  # filtered out
            if atom[self.wpos] in self.dict:
                hit_pos = i
                self.line[i].append(fname)  # append the source file name
                beg = 0
                if i > self.num:
                    beg = i - self.num
                # look for the nearest punctuation before the hit
                tmp = [l for l in range(beg, i) if punc in self.line[l][self.mpos]]
                if len(tmp) != 0:
                    beg = tmp[-1] + 1  # start of the clause
                end = i + self.num
                if end > sz:
                    end = sz
                tmp = [l for l in range(i+1, end) if punc in self.line[l][self.mpos]]
                if len(tmp) != 0:
                    end = tmp[0]  # end of the clause
                self.hit.extend(self.line[beg:end])
                #; print "hit get one, beg: %d, end: %d" % (beg, end), atom, self.line[beg:end]
                self.hit.append([])  # blank line between hits
            # segmentation fix-up: only try merged-word hits when neither of the two
            # previous words hit the dictionary, merging at most one word on each side
            elif i and (i - hit_pos) > 2 and self.__segment_combine(i, sz, size=6, punc=punc):
                hit_pos = i
                print "combine hit ONE: %d, " % i, self.line[i],
                beg = i - 2
                if punc in self.line[beg][self.mpos]:
                    tmp = [self.line[i]]
                else:
                    tmp = [self.line[beg], self.line[i]]
                end = i + 1
                if end < sz and punc not in self.line[end][self.mpos]:
                    tmp.append(self.line[end])
                self.hit.extend(tmp)
                self.hit.append([])
        print "++++ match::OVER file: %s, hit-result words: %d ++++" % (fname, len(self.hit))
        return True

    def multi_match(self, data=[], punc='w', fname=''):
        ''' Multi-sentence version of match() '''
        if len(data) == 0:
            print "!!!! multi_match::WARN data is empty !!!!"
            return False
        pos = -1
        for i in range(len(data)):  # find the first non-empty atom to test the list depth
            if len(data[i]) != 0:
                pos = i
                break
        if pos == -1:
            return False  # data is empty
        if not isinstance(data[pos][0], list):
            self.line = data[pos:]
        else:  # flatten the sentence-level list by one dimension
            tmp = []
            for l in data[pos:]:
                tmp.extend(l)
            self.line = tmp
        self.match(punc, fname)
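# Illustrative standalone run on one pre-segmented sentence. The dictionary path
# reuses the one from save() above; the tokens, output path, and doc id are made up,
# and a hit assumes '失望' actually appears in remark_neg.txt. Each atom is
# [word, POS], matching the default wpos=0 / mpos=1.
def _demo_segment_filter():
    sf = SegmentFilter(r'./test/remark_neg.txt', r'./test/hit_demo.txt')
    sentence = [['产品', 'n'], ['质量', 'n'], ['令人', 'v'], ['失望', 'v'],
                [',', 'wp'], ['不', 'd'], ['推荐', 'v']]
    sf.multi_match([sentence], punc='w', fname='D00001.txt')
    # a hit appends 'D00001.txt' to the matched atom and copies the surrounding
    # clause (bounded by punctuation, at most self.num words each side) into sf.hit
    sf.save()  # appends sf.hit to ./test/hit_demo.txt and clears the buffer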