コード例 #1
0
def extract_sentece():
    """Dump one converted sentence per corpus line into ``sentence.txt``.

    The training and testing segmented CSV files are read in that order.
    For every line, the text after the first comma is passed to
    ``get_sentence`` and the result is written, newline-terminated, to
    ``./Data/corpus/sentence.txt`` (UTF-8).

    Raises:
        ValueError: if a line contains no comma.
    """
    corpus_lines = read_lines('./Data/corpus/training.seg.csv')
    corpus_lines += read_lines('./Data/corpus/testing.seg.csv')
    with codecs.open('./Data/corpus/sentence.txt', 'w',
                     encoding='utf-8') as out:
        for raw in corpus_lines:
            # Drop everything up to and including the first comma.
            payload = raw[raw.index(',') + 1:]
            out.write('%s\n' % get_sentence(payload))
コード例 #2
0
ファイル: model_dc.py プロジェクト: Babyzpj/NLP
def init_result():
    """Collect the second comma-separated field from each fold's result file.

    For every fold ``i`` in ``range(config.KFOLD)`` the file
    ``./Data/result/best_<i>`` is read and the field at index 1 of each
    comma-split line is kept.

    Returns:
        A list with one inner list per fold, in fold order.

    Raises:
        IndexError: if a line contains no comma.
    """
    return [
        [row.split(',')[1]
         for row in read_lines('./Data/result/best_%d' % fold)]
        for fold in range(config.KFOLD)
    ]
コード例 #3
0
ファイル: prepare_data.py プロジェクト: Babyzpj/NLP
def init_voc():
    """Build the word, POS-tag and label vocabularies from the corpus.

    Lines from ``config.TRAIN_PATH`` and ``config.TEST_PATH`` are combined.
    On each line the text after the first comma is split on single spaces
    into ``word/tag`` items; each item is split at its last ``/``.
    The collected words and tags are handed to ``create_dictionary``, followed
    by a fixed label set ``"1"`` .. ``"11"``.

    Raises:
        ValueError: if a line has no comma or an item has no ``/``.
    """
    corpus = read_lines(config.TRAIN_PATH)
    corpus += read_lines(config.TEST_PATH)

    all_words = []  # every word occurrence, in corpus order
    all_tags = []   # the POS tag paired with each word
    for row in corpus:
        tagged_text = row[row.index(',') + 1:]
        for token in tagged_text.split(' '):
            # Cut at the LAST slash so the word part may itself contain '/'.
            cut = token.rindex('/')
            all_words.append(token[:cut])
            all_tags.append(token[cut + 1:])

    # Word vocabulary.
    create_dictionary(
        all_words,
        config.WORD_VOC_PATH,
        start=config.WORD_VOC_START,
        min_count=5,
        sort=True,
        lower=True,
        overwrite=True)

    # POS-tag vocabulary.
    create_dictionary(
        all_tags,
        config.TAG_VOC_PATH,
        start=config.TAG_VOC_START,
        sort=True,
        lower=False,
        overwrite=True)

    # Label vocabulary: the strings "1" through "11".
    label_names = list(map(str, range(1, 12)))
    create_dictionary(
        label_names,
        config.LABEL_VOC_PATH,
        start=0,
        overwrite=True)
コード例 #4
0
ファイル: load_data.py プロジェクト: Babyzpj/NLP
def load_train_data(word_voc, tag_voc, label_voc):
    """Load the training data.

    Reads the lines of ``config.TRAIN_PATH`` and passes them, together with
    the three vocabularies, to ``init_data``.

    Args:
        word_voc: dict, word vocabulary.
        tag_voc: dict, POS-tag vocabulary.
        label_voc: dict, label vocabulary.

    Returns:
        Whatever ``init_data`` returns for the training lines.
    """
    train_lines = read_lines(config.TRAIN_PATH)
    return init_data(train_lines, word_voc, tag_voc, label_voc)
コード例 #5
0
ファイル: load_data.py プロジェクト: Babyzpj/NLP
def load_test_data(word_voc, tag_voc, label_voc):
    """Load the test data, discarding the third value from ``init_data``.

    Args:
        word_voc: dict, word vocabulary.
        tag_voc: dict, POS-tag vocabulary.
        label_voc: dict, label vocabulary.

    Returns:
        A 2-tuple holding the first two of the three values that
        ``init_data`` produces for the lines of ``config.TEST_PATH``.
    """
    test_lines = read_lines(config.TEST_PATH)
    encoded = init_data(test_lines, word_voc, tag_voc, label_voc)
    # init_data must yield exactly three items; the last one is not returned.
    sents, pos, _unused = encoded
    return sents, pos
コード例 #6
0
def init_voc():
    """Build the word, POS-tag and label vocabularies from the training file.

    Only ``config.TRAIN_PATH`` is read. Each line is used as-is (no leading
    comma-separated field is stripped), split on single spaces into
    ``word/tag`` items, and each item is split at its last ``/``.
    The collected words and tags are handed to ``create_dictionary``, followed
    by a fixed label set ``"1"`` .. ``"9"``.

    Raises:
        ValueError: if an item contains no ``/``.
    """
    all_words = []  # every word occurrence, in corpus order
    all_tags = []   # the POS tag paired with each word
    for row in read_lines(config.TRAIN_PATH):
        for token in row.split(' '):
            # '/' separates a word from its POS tag; cut at the LAST one.
            cut = token.rindex('/')
            all_words.append(token[:cut])
            all_tags.append(token[cut + 1:])

    # Word vocabulary. Per the original author's note, WORD_VOC_PATH is a
    # pkl file holding words (not word vectors), as an index/word mapping.
    create_dictionary(
        all_words,
        config.WORD_VOC_PATH,
        start=config.WORD_VOC_START,
        min_count=1,
        sort=True,
        lower=True,
        overwrite=True)

    # POS-tag vocabulary. Per the original author's note, TAG_VOC_PATH is a
    # pkl file holding tags, and TAG_VOC_START is the first index used.
    create_dictionary(
        all_tags,
        config.TAG_VOC_PATH,
        start=config.TAG_VOC_START,
        sort=True,
        lower=False,
        overwrite=True)

    # Label vocabulary: the strings "1" through "9". The original author
    # notes these stand for the nine BioNLP event (trigger-word) types.
    label_names = list(map(str, range(1, 10)))
    create_dictionary(
        label_names,
        config.LABEL_VOC_PATH,
        start=0,
        overwrite=True)
def extract_sentece():
    """Write one converted sentence per input line to ``only_sentence.txt``.

    Reads ``F:\\PubMedSpider\\sample\\new_together.txt``, passes every line
    unchanged to ``get_sentence`` and writes each result, newline-terminated,
    to ``F:\\PubMedSpider\\sample\\only_sentence.txt`` (UTF-8). The output
    file is created or overwritten.
    """
    tagged_lines = read_lines('F:\\PubMedSpider\\sample\\new_together.txt')
    with codecs.open('F:\\PubMedSpider\\sample\\only_sentence.txt',
                     'w',
                     encoding='utf-8') as out:
        for tagged in tagged_lines:
            # No prefix is stripped here; if the tagged input gains a leading
            # field, cut it off before calling get_sentence.
            out.write('%s\n' % get_sentence(tagged))