from collections import defaultdict

# read_lines, get_sentence and _init_data are helpers defined elsewhere in this module.


def extract_sentence(train_seg_path, test_seg_path, col_sep=','):
    """Collect the sentence part (text after the label) from the train and test files."""
    ret = []
    lines = read_lines(train_seg_path)
    lines += read_lines(test_seg_path)
    for line in lines:
        if col_sep in line:
            # split off the label at the first separator
            index = line.index(col_sep)
            word_tag = line[index + 1:]
            sentence = ''.join(get_sentence(word_tag))
            ret.append(sentence)
    return ret

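# A minimal sketch of the line format extract_sentence expects. The sample line
# and the tag-stripping below are assumptions inferred from the parsing logic;
# get_sentence is the module helper assumed to do the same job.
def _demo_extract_sentence_line():
    line = '1,word1/n word2/v'          # hypothetical "label,word/tag ..." line
    index = line.index(',')             # label ends at the first separator
    word_tag = line[index + 1:]         # -> 'word1/n word2/v'
    words = [item.rsplit('/', 1)[0] for item in word_tag.split()]
    assert words == ['word1', 'word2']  # what ''.join(get_sentence(...)) would join
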
def _load_data(path, col_sep='\t', word_sep=' ', pos_sep='/'):
    """Load words, POS tags and labels from a labeled, segmented file."""
    lines = read_lines(path, col_sep)
    word_lst = []
    pos_lst = []
    label_lst = []
    for line in lines:
        index = line.index(col_sep)
        label = line[:index]
        if pos_sep in label:
            label = label.split(pos_sep)[0]
        # one label per line; extend() would split a string label into characters
        label_lst.append(label)
        # word and pos
        sentence = line[index + 1:]
        word_pos_list = sentence.split(word_sep)
        word, pos = [], []
        for item in word_pos_list:
            if pos_sep in item:
                r_index = item.rindex(pos_sep)
                w, p = item[:r_index], item[r_index + 1:]
                if w == '' or p == '':
                    continue
                word.append(w)
                pos.append(p)
        word_lst.extend(word)
        pos_lst.extend(pos)
    return word_lst, pos_lst, label_lst

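# Sketch of the "<label><col_sep><word>/<pos> ..." line format _load_data
# parses; the sample line is an assumption based on the parsing logic above.
def _demo_load_data_line():
    line = 'POSITIVE\tgood/a movie/n'   # hypothetical tab-separated sample
    index = line.index('\t')
    label = line[:index]                # -> 'POSITIVE'
    pairs = [item.rsplit('/', 1) for item in line[index + 1:].split(' ')]
    assert label == 'POSITIVE'
    assert pairs == [['good', 'a'], ['movie', 'n']]
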
def generate_vocab(sentence_path, min_count=0, lower=False, sort=True):
    """Build word->id and id->word mappings from a sentence file."""
    sentences = read_lines(sentence_path)
    word_dict = defaultdict(int)
    for sentence in sentences:
        # fixed: lowercase only when lower=True (the condition was inverted)
        sentence = sentence.lower() if lower else sentence
        for word in sentence.split():
            word_dict[word] += 1
    if sort:
        # most frequent words get the smallest ids
        word_tuple = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
        result = [word for word, count in word_tuple if count >= min_count]
    else:
        result = [word for word, count in word_dict.items() if count >= min_count]
    word_ids = dict((word, index) for index, word in enumerate(result))
    ids_word = dict((index, word) for index, word in enumerate(result))
    return word_ids, ids_word

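# Usage sketch for the mappings generate_vocab returns; sentence_to_ids and the
# unk_id convention are assumptions added here for illustration, not part of
# this module's API. In practice a reserved UNK index would be added to the
# vocab first, since id 0 otherwise belongs to the most frequent word.
def sentence_to_ids(sentence, word_ids, unk_id=0):
    """Map a whitespace-tokenized sentence to vocab ids, using unk_id for OOV words."""
    return [word_ids.get(word, unk_id) for word in sentence.split()]
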
def get_text_vocab(train_seg_x_path, train_seg_target_path):
    """Collect the word sets of the source and target training files."""
    lines_train = read_lines(train_seg_x_path)
    lines_target = read_lines(train_seg_target_path)
    print('train_x lines: %d' % len(lines_train))
    print('train_target lines: %d' % len(lines_target))
    train_word_set = set()
    for sentence in lines_train:
        train_word_set.update(sentence.split())
    target_word_set = set()
    for sentence in lines_target:
        target_word_set.update(sentence.split())
    return train_word_set, target_word_set

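# Usage sketch on top of get_text_vocab; vocab_overlap is an assumption added
# here for illustration, e.g. to gauge how much of the target side is covered
# by the source vocabulary.
def vocab_overlap(train_word_set, target_word_set):
    """Count words shared by both sides and words seen only on the target side."""
    shared = train_word_set & target_word_set
    target_only = target_word_set - train_word_set
    return len(shared), len(target_only)
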
def extract_sentence(train_seg_x_path, train_seg_target_path, test_seg_x_path, col_sep='\t'):
    """Concatenate the lines of the source, target and test files.

    Note: this redefines extract_sentence above; if both are needed in the
    same module, one of them should be renamed. col_sep is currently unused
    (the label-splitting variant of this function was dropped).
    """
    ret = []
    print('read %s...' % train_seg_x_path)
    lines = read_lines(train_seg_x_path)
    print('read %s...' % train_seg_target_path)
    lines += read_lines(train_seg_target_path)
    print('read %s...' % test_seg_x_path)
    lines += read_lines(test_seg_x_path)
    ret.extend(lines)
    return ret

def train_reader(path, word_vocab, pos_vocab, label_vocab, col_sep='\t'):
    """
    Load train data.
    :param path: train file path
    :param word_vocab: word -> id mapping
    :param pos_vocab: POS tag -> id mapping
    :param label_vocab: label -> id mapping
    :param col_sep: column separator between label and sentence
    :return: data prepared by _init_data
    """
    return _init_data(read_lines(path, col_sep), word_vocab, pos_vocab,
                      label_vocab, col_sep=col_sep)

def test_reader(path, word_vocab, pos_vocab, label_vocab, col_sep='\t'):
    """
    Load test data.
    :param path: test file path
    :param word_vocab: word -> id mapping
    :param pos_vocab: POS tag -> id mapping
    :param label_vocab: label -> id mapping
    :param col_sep: column separator between label and sentence
    :return: sentences and POS sequences (labels are dropped)
    """
    sentences, pos, _ = _init_data(read_lines(path, col_sep), word_vocab,
                                   pos_vocab, label_vocab, col_sep=col_sep)
    return sentences, pos
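

if __name__ == '__main__':
    # End-to-end usage sketch. All paths are hypothetical placeholders, and
    # read_lines/_init_data are the helpers this module already relies on.
    word_vocab, _ = generate_vocab('data/sentences.txt', min_count=2)
    pos_vocab, _ = generate_vocab('data/pos.txt')
    label_vocab, _ = generate_vocab('data/labels.txt')
    train_data = train_reader('data/train.txt', word_vocab, pos_vocab, label_vocab)
    test_sentences, test_pos = test_reader('data/test.txt', word_vocab, pos_vocab, label_vocab)
    print('test sentences: %d' % len(test_sentences))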