def extract_sentence(train_seg_path, test_seg_path, col_sep=','):
    """Collect sentences from the segmented train and test files.

    Lines without ``col_sep`` are skipped. For the rest, the text after the
    first separator is passed through ``get_sentence`` and joined into a
    single string.

    :param train_seg_path: path of the segmented train file
    :param test_seg_path: path of the segmented test file
    :param col_sep: separator between the label column and the sentence
    :return: list of joined sentence strings
    """
    all_lines = read_lines(train_seg_path) + read_lines(test_seg_path)
    sentences = []
    for raw in all_lines:
        if col_sep not in raw:
            continue
        sep_pos = raw.index(col_sep)
        tagged = raw[sep_pos + 1:]
        sentences.append(''.join(get_sentence(tagged)))
    return sentences
Example #2
0
def _load_data(path, col_sep='\t', word_sep=' ', pos_sep='/'):
    """Load labelled, segmented data from *path*.

    Each line is expected to look like
    ``label<col_sep>word/pos<word_sep>word/pos...``.

    :param path: input file path, read via ``read_lines``
    :param col_sep: separator between the label column and the sentence
    :param word_sep: separator between ``word/pos`` items
    :param pos_sep: separator between a word and its POS tag
    :return: tuple ``(word_lst, pos_lst, label_lst)`` of flat lists
    """
    lines = read_lines(path, col_sep)
    word_lst = []
    pos_lst = []
    label_lst = []
    for line in lines:
        # Skip malformed lines instead of raising ValueError from index();
        # matches the guard used by extract_sentence elsewhere in this file.
        if col_sep not in line:
            continue
        index = line.index(col_sep)
        label = line[:index]
        if pos_sep in label:
            label = label.split(pos_sep)[0]
        # BUG FIX: the original called list.extend(label), which iterates a
        # string label character by character; append keeps it as one item.
        label_lst.append(label)
        sentence = line[index + 1:]
        # Split sentence into word/pos pairs, dropping items with an empty side.
        for item in sentence.split(word_sep):
            if pos_sep not in item:
                continue
            r_index = item.rindex(pos_sep)
            w, p = item[:r_index], item[r_index + 1:]
            if w and p:
                word_lst.append(w)
                pos_lst.append(p)
    return word_lst, pos_lst, label_lst
Example #3
0
def generate_vocab(sentence_path, min_count=0, lower=False, sort=True):
    """Build word->id and id->word mappings from a sentence file.

    :param sentence_path: file of whitespace-tokenized sentences
    :param min_count: keep only words occurring at least this many times
    :param lower: if True, lowercase sentences before counting.
        BUG FIX: the original condition was inverted — it lowercased
        only when ``lower`` was False.
    :param sort: if True, ids are assigned in descending-frequency order
    :return: tuple ``(word_ids, ids_word)`` of dicts
    """
    sentences = read_lines(sentence_path)

    word_dict = defaultdict(int)
    for sentence in sentences:
        if lower:
            sentence = sentence.lower()
        for word in sentence.split():
            # defaultdict(int) makes the original membership test redundant.
            word_dict[word] += 1

    items = word_dict.items()
    if sort:
        items = sorted(items, key=lambda kv: kv[1], reverse=True)
    result = [word for word, count in items if count >= min_count]

    word_ids = {word: index for index, word in enumerate(result)}
    ids_word = {index: word for index, word in enumerate(result)}
    return word_ids, ids_word
Example #4
0
def get_text_vocab(train_seg_x_path, train_seg_target_path):
    """Return the vocabularies (sets of whitespace-split tokens) of two files.

    :param train_seg_x_path: path of the segmented source file
    :param train_seg_target_path: path of the segmented target file
    :return: tuple ``(train_word_set, target_word_set)``
    """
    lines_train = read_lines(train_seg_x_path)
    lines_target = read_lines(train_seg_target_path)

    print('train_x lines: %d' % len(lines_train))
    print('train_target lines: %d' % len(lines_target))

    # set.update makes the original per-word membership test unnecessary.
    train_word_set = set()
    for sentence in lines_train:
        train_word_set.update(sentence.split())

    target_word_set = set()
    for sentence in lines_target:
        target_word_set.update(sentence.split())
    return train_word_set, target_word_set
Example #5
0
def extract_sentence(train_seg_x_path,
                     train_seg_target_path,
                     test_seg_x_path,
                     col_sep='\t'):
    """Concatenate all lines of the three segmented files into one list.

    Lines are returned verbatim; ``col_sep`` is kept only for interface
    compatibility with callers (the original col_sep splitting was
    commented-out dead code and has been removed).

    :param train_seg_x_path: path of the segmented train source file
    :param train_seg_target_path: path of the segmented train target file
    :param test_seg_x_path: path of the segmented test source file
    :param col_sep: unused; retained for backward compatibility
    :return: list of all lines from the three files, in order
    """
    print('read %s...' % train_seg_x_path)
    lines = read_lines(train_seg_x_path)
    print('read %s...' % train_seg_target_path)
    lines += read_lines(train_seg_target_path)
    print('read %s...' % test_seg_x_path)
    lines += read_lines(test_seg_x_path)
    # The original copied `lines` element by element into `ret`; returning
    # the accumulated list directly is equivalent.
    return lines
Example #6
0
def train_reader(path, word_vocab, pos_vocab, label_vocab, col_sep='\t'):
    """Load train data and convert it via ``_init_data``.

    :param path: train file path
    :param word_vocab: word vocabulary mapping (passed through to _init_data)
    :param pos_vocab: POS vocabulary mapping (passed through to _init_data)
    :param label_vocab: label vocabulary mapping (passed through to _init_data)
    :param col_sep: column separator used when reading lines
    :return: whatever ``_init_data`` returns for the train file
    """
    lines = read_lines(path, col_sep)
    return _init_data(lines, word_vocab, pos_vocab, label_vocab, col_sep=col_sep)
Exemple #7
0
def test_reader(path, word_vocab, pos_vocab, label_vocab, col_sep='\t'):
    """Load test data via ``_init_data``; labels are discarded.

    :param path: test file path
    :param word_vocab: word vocabulary mapping (passed through to _init_data)
    :param pos_vocab: POS vocabulary mapping (passed through to _init_data)
    :param label_vocab: label vocabulary mapping (passed through to _init_data)
    :param col_sep: column separator used when reading lines
    :return: tuple ``(sentences, pos)`` — the first two of _init_data's results
    """
    lines = read_lines(path, col_sep)
    sentences, pos, _labels = _init_data(lines,
                                         word_vocab,
                                         pos_vocab,
                                         label_vocab,
                                         col_sep=col_sep)
    return sentences, pos