def breakSentencesByYear(sentence, FILE):
    """Echo a sentence and write it to ``FILE`` as one CRLF-terminated record.

    NOTE(review): despite its name, this variant performs no year-based
    splitting. An unconditional ``return`` directly after the write used to
    short-circuit the splitting code that followed it; that unreachable code
    has been removed, so the observable behaviour is unchanged.

    :param sentence: text to emit; printed as given and written to ``FILE``
        after ``strip()``.
    :param FILE: any object exposing ``write`` (e.g. an open text file).
    :return: None
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(sentence)
    # Two separate writes are kept so the sink sees the same call sequence
    # as before (payload first, then the line terminator).
    FILE.write(sentence.strip())
    FILE.write('\r\n')
if __name__ == "__main__":
    # Scratch experiment: tokenise a sample career-history string and group
    # the tokens around '#' markers. Nothing is printed or stored outside
    # the module-level variables below.
    string = u"1993.07—1997.07市长(其间:1994.09—1997.01 中共中央党校经济管理专业在职研究生学习 1996.09—1997.07 中共中央党校中青年干部培训班学习)"
    # Alternative sample input:
    # string = u"1969年12月参加工作,南京工学院(现东南大学)本科毕业,中央党校研究生学历,高级经济师。"

    # Turn every "<two digits>年" into "<two digits>#" so the year suffix
    # becomes a marker.
    for hit in re.findall(u'\d{2}年', string):
        string = string.replace(hit, hit[:-1] + u'#')

    # The em dash between two dates is mapped to the same marker.
    seg_list = jieba.cut(string.strip().replace(u'—', u'#'))

    flag = 0            # 0 until the first marker has been seen, then 1
    mdStack = Stack()
    tempData = []
    tempStr = ''
    for token in seg_list:
        # NOTE(review): the original also had a guard
        # `token == ' ' and token == u''`, which can never be true, so blank
        # tokens are NOT skipped; that behaviour is kept here.
        if token != '#':
            mdStack.push(token)
            continue
        if flag == 0:
            # The first marker only arms the flag.
            flag += 1
            continue
        # Every later marker: take the token right before the marker off the
        # stack, then drain the rest.
        # NOTE(review): `temp` is never pushed back and `tempData` is never
        # reset, so drained tokens accumulate (and get re-reversed) across
        # markers — confirm whether that is intended.
        temp = mdStack.pop()
        while not mdStack.empty():
            tempData.append(mdStack.pop())
        tempData.reverse()
        tempStr = '#'.join(tempData)
def breakSentencesByYear(sentence):
    """Split a sentence into token groups at year/date markers.

    Every ``<two digits>年`` or ``<two digits>.`` in ``sentence`` has its
    last character replaced by ``#``. The stripped result is tokenised with
    ``jieba.cut(..., HMM=True)`` and the tokens are collected on a ``Stack``:

    * the first ``#`` only arms a flag;
    * while armed, a token containing one of ``-``, ``至``, ``到`` or
      U+2014 disarms it again, so the ``#`` that follows is treated like a
      first marker rather than a split point;
    * a ``#`` seen while armed closes the current group: the token just
      before the marker is held back, everything else on the stack is
      appended as one list to ``paramDIC['sentence']``, and the held-back
      token starts the next group.

    Whatever is left after the last token is appended as a final group
    (possibly an empty list).

    :param sentence: text to split.
    :return: None; the groups are appended to the module-level
        ``paramDIC['sentence']`` (defined outside this function).
    """

    def _drain(stack):
        # Pop until empty, then reverse the popped items (push order for a
        # LIFO stack).
        drained = []
        while not stack.empty():
            drained.append(stack.pop())
        drained.reverse()
        return drained

    for hit in re.findall(u'\d{2}年|\d{2}\.', sentence):
        sentence = sentence.replace(hit, hit[:-1] + u'#')

    seg_list = jieba.cut(sentence.strip(), HMM=True)
    mdStack = Stack()
    armed = 0  # only ever 0 or 1
    for token in seg_list:
        # NOTE(review): the original guard `token == ' ' and token == u''`
        # could never be true, so blank tokens are kept, exactly as before.
        if token != '#':
            # A range connector after a marker disarms the flag, so the
            # marker that follows does not split.
            if armed == 1 and re.search(u'[-至到\u2014]', token):
                armed = 0
            mdStack.push(token)
            continue

        if armed == 0:
            armed += 1
            continue

        # Marker while armed: close the current group.
        # NOTE(review): the original had a bare `flag == 0` expression here
        # (a comparison, not an assignment), so the flag stays armed; that
        # behaviour is preserved.
        carried = mdStack.pop()  # token right before the marker
        paramDIC['sentence'].append(_drain(mdStack))
        mdStack.push(carried)

    # Flush whatever remains as the last group.
    paramDIC['sentence'].append(_drain(mdStack))