Example #1
0
def breakSentencesByYear(sentence, FILE):
    '''
    Echo a sentence and append it to an output file as one CRLF-terminated line.

    The sentence is printed unchanged to stdout; the copy written to ``FILE``
    has its leading and trailing whitespace stripped.

    NOTE: despite the name, no year-based splitting happens here. The body
    used to return unconditionally right after writing the sentence, so the
    segmentation code that followed it could never run. That unreachable code
    has been removed; the observable behavior is unchanged.

    :param sentence: text to record; must provide ``strip()``.
    :param FILE: writable file-like object; only ``write()`` is called on it.
    :return: None
    '''
    # The parenthesized single-argument form behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print(sentence)
    FILE.write(sentence.strip())
    FILE.write('\r\n')
Example #2
0
if __name__ == "__main__":
    # Ad-hoc experiment: mark date boundaries in a sample string with '#',
    # tokenize it with jieba, and group the tokens between '#' markers.
    # Sample input containing several "YYYY.MM—YYYY.MM" date ranges.
    string = u"1993.07—1997.07市长(其间:1994.09—1997.01 中共中央党校经济管理专业在职研究生学习 1996.09—1997.07 中共中央党校中青年干部培训班学习)"
    # Alternative sample input using the "YYYY年" form (disabled):
    # string = u"1969年12月参加工作,南京工学院(现东南大学)本科毕业,中央党校研究生学历,高级经济师。"
    # seg_list = jieba.cut(string.strip())
    # print("Default Mode: " + "//".join(seg_list))  # exact mode

    # Handle the \d{2}年 case: replace the trailing 年 with the '#' marker.
    for item in re.findall(u'\d{2}年', string):
        string = string.replace(item, item[:-1] + u'#')
    # Every em dash also becomes a '#' marker before tokenizing.
    seg_list = jieba.cut(string.strip().replace(u'—', u'#'))
    # print("Default Mode: " + "//".join(seg_list))  # exact mode
    # Split into groups at the '#' markers.
    # divList = [li for li in seg_list]
    flag = 0  # becomes 1 once the first '#' token has been seen
    mdStack = Stack()  # tokens collected since the last flush
    tempData = []
    tempStr = ''
    for li in seg_list:
        # NOTE(review): this condition can never be true (li cannot equal both
        # ' ' and u''), and the body is a no-op anyway; presumably meant to
        # skip blank tokens -- confirm intent.
        if li == ' ' and li == u'':
            pass
        if li != '#':
            mdStack.push(li)
        elif li == '#' and flag == 0:
            # The first marker only arms the flag; nothing is flushed.
            flag += 1
        elif li == "#" and flag > 0:
            temp = mdStack.pop()  # take off the token right before the '#'
            # Drain the rest of the stack (newest first), then restore the
            # original token order.
            while not mdStack.empty():
                tempData.append(mdStack.pop())
            tempData.reverse()
            # NOTE(review): in the visible code temp is never pushed back and
            # tempData is never cleared, so later flushes accumulate; the
            # snippet looks truncated here -- confirm against the full file.
            tempStr = '#'.join(tempData)
Example #3
0
def breakSentencesByYear(sentence):
    '''
    Split a sentence into token groups at year markers.

    Every two digits followed by ``年`` or ``.`` get that trailing character
    replaced by ``#``. The text is then tokenized with ``jieba.cut`` and the
    tokens are grouped using the ``#`` tokens as boundaries. Each group is
    appended, as a list of tokens in original order, to
    ``paramDIC['sentence']``. The ``#`` tokens themselves are never stored.

    :param sentence: text to split.
    :return: None; results are appended to the module-level ``paramDIC``.
    '''
    # Turn "NN年" / "NN." into "NN#". A single substitution is equivalent to
    # the former findall + replace-per-match loop, because every occurrence of
    # such a three-character string is itself a match of the pattern.
    sentence = re.sub(u'(\\d{2})[年.]', u'\\1#', sentence)

    # flag is 1 while a '#' has been seen that was not followed by a range
    # separator; in that state the next '#' closes the current group.
    flag = 0
    mdStack = Stack()
    for li in jieba.cut(sentence.strip(), HMM=True):
        if li != '#':
            # A range separator after a marker means the next '#' belongs to
            # the end of the same date range, so it must not close the group.
            if flag == 1 and re.search(u'[-至到\u2014]', li):
                flag = 0
            mdStack.push(li)
        elif flag == 0:
            flag = 1
        else:
            # flag deliberately stays at 1 here: the '#' just consumed is the
            # marker of the token that opens the next group. (The former
            # ``flag == 0`` statement at this point was a comparison with no
            # effect and has been removed.)
            temp = mdStack.pop()  # token right before '#': opens the next group
            tempData = []
            while not mdStack.empty():
                tempData.append(mdStack.pop())
            tempData.reverse()  # popping yields newest-first; restore order
            paramDIC['sentence'].append(tempData)
            mdStack.push(temp)

    # Flush whatever is left as the final group.
    tempData = []
    while not mdStack.empty():
        tempData.append(mdStack.pop())
    tempData.reverse()
    paramDIC['sentence'].append(tempData)
Example #4
0
def breakSentencesByYear(sentence, FILE):
    '''
    Echo a sentence and append it to an output file as one CRLF-terminated line.

    The sentence is printed unchanged to stdout; the copy written to ``FILE``
    has its leading and trailing whitespace stripped.

    NOTE: despite the name, no year-based splitting happens here. The body
    used to return unconditionally right after writing the sentence, so the
    segmentation code that followed it could never run. That unreachable code
    has been removed; the observable behavior is unchanged.

    :param sentence: text to record; must provide ``strip()``.
    :param FILE: writable file-like object; only ``write()`` is called on it.
    :return: None
    '''
    # The parenthesized single-argument form behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print(sentence)
    FILE.write(sentence.strip())
    FILE.write('\r\n')
Example #5
0
def breakSentencesByYear(sentence):
    '''
    Split a sentence into token groups at year markers.

    Every two digits followed by ``年`` or ``.`` get that trailing character
    replaced by ``#``. The text is then tokenized with ``jieba.cut`` and the
    tokens are grouped using the ``#`` tokens as boundaries. Each group is
    appended, as a list of tokens in original order, to
    ``paramDIC['sentence']``. The ``#`` tokens themselves are never stored.

    :param sentence: text to split.
    :return: None; results are appended to the module-level ``paramDIC``.
    '''
    # Turn "NN年" / "NN." into "NN#". A single substitution is equivalent to
    # the former findall + replace-per-match loop, because every occurrence of
    # such a three-character string is itself a match of the pattern.
    sentence = re.sub(u'(\\d{2})[年.]', u'\\1#', sentence)

    # flag is 1 while a '#' has been seen that was not followed by a range
    # separator; in that state the next '#' closes the current group.
    flag = 0
    mdStack = Stack()
    for li in jieba.cut(sentence.strip(), HMM=True):
        if li != '#':
            # A range separator after a marker means the next '#' belongs to
            # the end of the same date range, so it must not close the group.
            if flag == 1 and re.search(u'[-至到\u2014]', li):
                flag = 0
            mdStack.push(li)
        elif flag == 0:
            flag = 1
        else:
            # flag deliberately stays at 1 here: the '#' just consumed is the
            # marker of the token that opens the next group. (The former
            # ``flag == 0`` statement at this point was a comparison with no
            # effect and has been removed.)
            temp = mdStack.pop()  # token right before '#': opens the next group
            tempData = []
            while not mdStack.empty():
                tempData.append(mdStack.pop())
            tempData.reverse()  # popping yields newest-first; restore order
            paramDIC['sentence'].append(tempData)
            mdStack.push(temp)

    # Flush whatever is left as the final group.
    tempData = []
    while not mdStack.empty():
        tempData.append(mdStack.pop())
    tempData.reverse()
    paramDIC['sentence'].append(tempData)
Example #6
0
if __name__ == "__main__":
    # Ad-hoc experiment: mark date boundaries in a sample string with '#',
    # tokenize it with jieba, and group the tokens between '#' markers.
    # Sample input containing several "YYYY.MM—YYYY.MM" date ranges.
    string = u"1993.07—1997.07市长(其间:1994.09—1997.01 中共中央党校经济管理专业在职研究生学习 1996.09—1997.07 中共中央党校中青年干部培训班学习)"
    # Alternative sample input using the "YYYY年" form (disabled):
    # string = u"1969年12月参加工作,南京工学院(现东南大学)本科毕业,中央党校研究生学历,高级经济师。"
    # seg_list = jieba.cut(string.strip())
    # print("Default Mode: " + "//".join(seg_list))  # exact mode

    # Handle the \d{2}年 case: replace the trailing 年 with the '#' marker.
    for item in re.findall(u'\d{2}年', string):
        string = string.replace(item, item[:-1] + u'#')
    # Every em dash also becomes a '#' marker before tokenizing.
    seg_list = jieba.cut(string.strip().replace(u'—', u'#'))
    # print("Default Mode: " + "//".join(seg_list))  # exact mode
    # Split into groups at the '#' markers.
    # divList = [li for li in seg_list]
    flag = 0  # becomes 1 once the first '#' token has been seen
    mdStack = Stack()  # tokens collected since the last flush
    tempData = []
    tempStr = ''
    for li in seg_list:
        # NOTE(review): this condition can never be true (li cannot equal both
        # ' ' and u''), and the body is a no-op anyway; presumably meant to
        # skip blank tokens -- confirm intent.
        if li == ' ' and li == u'':
            pass
        if li != '#':
            mdStack.push(li)
        elif li == '#' and flag == 0:
            # The first marker only arms the flag; nothing is flushed.
            flag += 1
        elif li == "#" and flag > 0:
            temp = mdStack.pop()  # take off the token right before the '#'
            # Drain the rest of the stack (newest first), then restore the
            # original token order.
            while not mdStack.empty():
                tempData.append(mdStack.pop())
            tempData.reverse()
            # NOTE(review): in the visible code temp is never pushed back and
            # tempData is never cleared, so later flushes accumulate; the
            # snippet looks truncated here -- confirm against the full file.
            tempStr = '#'.join(tempData)