def breakSentencesByYear(sentence, FILE):
    """Echo ``sentence`` and write its stripped form to ``FILE`` as one record.

    NOTE: despite its name, this variant performs no year-based splitting.
    The sentence is passed through unchanged apart from stripping the
    surrounding whitespace before it is written.

    :param sentence: text to emit; must provide ``strip()``.
    :param FILE: destination object exposing ``write()``.
    :return: None
    """
    # The single-argument call form prints the same text under Python 2
    # (parenthesised expression) and Python 3 (function call).
    print(sentence)
    FILE.write(sentence.strip())
    # Each record is terminated with an explicit CRLF.
    FILE.write('\r\n')
def breakSentencesByYear(sentence):
    """Split a sentence into token groups at year markers.

    Every two-digit run followed by ``年`` or ``.`` has that trailing
    character replaced by ``#``. The stripped text is then tokenised with
    ``jieba.cut`` and the tokens are collected on a stack:

    * the first ``#`` of a group is stored as ``年`` and opens the group;
    * a later ``#`` closes the group: every token except the one directly
      before the marker is appended, in reading order, as one list to
      ``paramDIC["sentence"]``, and that last token (with ``年`` appended)
      starts the next group;
    * a token containing ``-``, ``至``, ``到`` or an em dash while a group is
      open re-arms the "first marker" state, so the next ``#`` is stored as
      ``年`` instead of closing the group.

    Whatever is left on the stack after the last token is appended as the
    final group (an empty list if nothing is left).

    :param sentence: text to split.
    :return: None; groups are appended to ``paramDIC["sentence"]``.
    """
    # Explicit backslashes keep the pattern value unchanged while avoiding
    # invalid string escapes such as "\d".
    for item in re.findall(u"\\d{2}年|\\d{2}\\.", sentence):
        sentence = sentence.replace(item, item[:-1] + u"#")

    mdStack = Stack()

    def drain():
        """Empty ``mdStack`` and return its items in insertion order."""
        items = []
        while not mdStack.empty():
            items.append(mdStack.pop())
        items.reverse()
        return items

    # flag is 1 while a '#' marker has been seen for the current group.
    flag = 0
    for li in jieba.cut(sentence.strip(), HMM=True):
        if li != "#":
            # A range connector inside an open group means the next marker
            # belongs to the same group rather than closing it.
            if flag == 1 and re.search(u"[-至到\u2014]", li):
                flag = 0
            mdStack.push(li)
        elif flag == 0:
            mdStack.push(u"年")
            flag = 1
        else:
            # The token directly before the marker (presumably the year
            # digits) belongs to the next group, so hold it back.
            temp = mdStack.pop()
            paramDIC["sentence"].append(drain())
            # flag stays at 1: the held-back token already opens the next
            # group, so the following marker must close it.
            mdStack.push(temp + u"年")

    paramDIC["sentence"].append(drain())
def breakSentencesByYear(sentence):
    """Split a sentence into token groups at year markers.

    Every two-digit run followed by ``年`` or ``.`` has that trailing
    character replaced by ``#``. The stripped text is then tokenised with
    ``jieba.cut`` and the tokens are collected on a stack; the ``#`` markers
    themselves are never stored:

    * the first ``#`` of a group only opens the group;
    * a later ``#`` closes the group: every token except the one directly
      before the marker is appended, in reading order, as one list to
      ``paramDIC['sentence']``, and that last token starts the next group;
    * a token containing ``-``, ``至``, ``到`` or an em dash while a group is
      open re-arms the "first marker" state, so the next ``#`` does not
      close the group.

    Whatever is left on the stack after the last token is appended as the
    final group (an empty list if nothing is left).

    :param sentence: text to split.
    :return: None; groups are appended to ``paramDIC['sentence']``.
    """
    # Explicit backslashes keep the pattern value unchanged while avoiding
    # invalid string escapes such as "\d".
    for item in re.findall(u'\\d{2}年|\\d{2}\\.', sentence):
        sentence = sentence.replace(item, item[:-1] + u'#')

    mdStack = Stack()

    def drain():
        """Empty ``mdStack`` and return its items in insertion order."""
        items = []
        while not mdStack.empty():
            items.append(mdStack.pop())
        items.reverse()
        return items

    # flag is 1 while a '#' marker has been seen for the current group.
    flag = 0
    for li in jieba.cut(sentence.strip(), HMM=True):
        if li != '#':
            # A range connector inside an open group means the next marker
            # belongs to the same group rather than closing it.
            if flag == 1 and re.search(u'[-至到\u2014]', li):
                flag = 0
            mdStack.push(li)
        elif flag == 0:
            flag = 1
        else:
            # The token directly before the marker (presumably the year
            # digits) belongs to the next group, so hold it back.
            temp = mdStack.pop()
            paramDIC['sentence'].append(drain())
            # flag stays at 1: the held-back token already opens the next
            # group, so the following marker must close it.
            mdStack.push(temp)

    paramDIC['sentence'].append(drain())
# print("Default Mode: " + "//".join(seg_list)) # 精确模式 # 断句 # divList = [li for li in seg_list] flag = 0 mdStack = Stack() tempData = [] tempStr = '' for li in seg_list: if li == ' ' and li == u'': pass if li != '#': mdStack.push(li) elif li == '#' and flag == 0: flag += 1 elif li == "#" and flag > 0: temp = mdStack.pop() # 抛出#号前一个 while not mdStack.empty(): tempData.append(mdStack.pop()) tempData.reverse() tempStr = '#'.join(tempData) print tempStr print '==========' tempData = [] mdStack.push(temp) tempData = [] while not mdStack.empty(): tempData.append(mdStack.pop()) tempData.reverse() tempStr = '#'.join(tempData) print tempStr