def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    # The split file holds a single line of space-separated 0/1 flags,
    # one per sentence: 0 -> training set, 1 -> validation set.
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)
    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list, pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train, sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid, sents_valid)
    return data_train, data_valid
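# A minimal usage sketch for __load_data (the paths below are hypothetical;
# the function is module-private, so it would be called from within this module):
#
#   data_train, data_valid = __load_data(
#       'data/dep_tags.txt', 'data/pos_tags.txt',
#       'data/sents.json', 'data/train_valid_split.txt')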
def __gen_word_cnts_file(tok_texts_file, output_file):
    # Imported locally, so the module does not require pandas unless this helper runs.
    import pandas as pd

    texts = datautils.read_lines(tok_texts_file)
    word_cnts_dict = dict()
    total_word_cnt = 0
    for i, sent_text in enumerate(texts):
        # Only even-indexed lines are counted; the file apparently
        # interleaves two kinds of lines.
        if i % 2 == 1:
            continue
        words = sent_text.split()
        total_word_cnt += len(words)
        for w in words:
            cnt = word_cnts_dict.get(w, 0)
            word_cnts_dict[w] = cnt + 1

    # Sort words by descending count and attach each word's relative frequency.
    word_cnt_tups = list(word_cnts_dict.items())
    word_cnt_tups.sort(key=lambda x: -x[1])
    word_cnt_rate_tups = list()
    for w, cnt in word_cnt_tups:
        word_cnt_rate_tups.append((w, cnt, cnt / total_word_cnt))

    df = pd.DataFrame(word_cnt_rate_tups, columns=['word', 'cnt', 'p'])
    with open(output_file, 'w', encoding='utf-8', newline='\n') as fout:
        df.to_csv(fout, index=False, float_format='%.5f')
    print(total_word_cnt)
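# A self-contained sketch of the counting logic above (standard library only;
# the toy lines are made up). It mirrors the word/cnt/p rows that
# __gen_word_cnts_file writes to its CSV:
#
#   from collections import Counter
#   texts = ['the battery life is great', 'skip me', 'the screen is great']
#   kept = [t for i, t in enumerate(texts) if i % 2 == 0]  # even lines only
#   cnts = Counter(w for t in kept for w in t.split())
#   total = sum(cnts.values())
#   rows = [(w, c, c / total) for w, c in cnts.most_common()]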
def __run_with_mined_rules(mine_tool, rule_patterns_file, term_hit_rate_file, dep_tags_file,
                           pos_tags_file, sent_texts_file, filter_terms_vocab_file,
                           term_hit_rate_thres=0.6, output_result_file=None, sents_file=None):
    l1_rules, l2_rules = ruleutils.load_rule_patterns_file(rule_patterns_file)
    # Keep only terms whose hit rate is above the threshold.
    term_vocab = ruleutils.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)
    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sent_texts = datautils.read_lines(sent_texts_file)
    filter_terms_vocab = set(datautils.read_lines(filter_terms_vocab_file))
    # opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(
            zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        # L1 rules match directly on the dependency/POS tags; the terms they
        # extract are also fed into the L2 rules below.
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = ruleutils.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        for p in l2_rules:
            terms_new = ruleutils.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab, l1_terms_new)
            terms.update(terms_new)
        # Additionally match terms from the mined term vocabulary.
        terms_new = mine_tool.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab)
        terms.update(terms_new)
        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if output_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, output_result_file)

    if sents_file is not None:
        sents = datautils.load_json_objs(sents_file)
        # aspect_terms_true = utils.aspect_terms_list_from_sents(sents)
        terms_list_true = mine_tool.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(
            terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, sent_texts)
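# A usage sketch for __run_with_mined_rules (all paths are hypothetical;
# mine_tool is whatever rule-mining helper object this repo constructs
# elsewhere). Passing output_result_file writes the per-sentence extracted
# terms; passing sents_file triggers evaluation against the gold terms:
#
#   __run_with_mined_rules(
#       mine_tool, 'data/rule_patterns.txt', 'data/term_hit_rates.txt',
#       'data/dep_tags.txt', 'data/pos_tags.txt', 'data/sent_texts.txt',
#       'data/filter_terms_vocab.txt', term_hit_rate_thres=0.6,
#       output_result_file='output/rule_results.txt', sents_file='data/sents.json')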
def get_weak_label_data(vocab, true_terms_file, tok_texts_file, task):
    terms_true_list = datautils.load_json_objs(true_terms_file)
    tok_texts = datautils.read_lines(tok_texts_file)
    # print(len(terms_true_list), tok_texts_file, len(tok_texts))
    # Report the mismatched sizes before the assertion fires.
    if len(terms_true_list) != len(tok_texts):
        print(len(terms_true_list), len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    # Index 0 is reserved for out-of-vocabulary words.
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, terms_true)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # Fixed seed so the split is reproducible; the last 2000 permuted
    # samples are held out for validation.
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]

    aspect_true_list, opinion_true_list = None, None
    if task != 'opinion':
        aspect_true_list = terms_true_list_valid
    if task != 'aspect':
        opinion_true_list = terms_true_list_valid
    valid_data = ValidData(None, label_seq_list_valid, word_idx_seq_list_valid, None,
                           tok_texts_valid, aspect_true_list, opinion_true_list)
    return train_data, valid_data
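# A usage sketch for get_weak_label_data (file paths are hypothetical; vocab
# is assumed to be an iterable of word strings, e.g. one word per line):
#
#   vocab = datautils.read_lines('data/vocab.txt')
#   train_data, valid_data = get_weak_label_data(
#       vocab, 'data/terms_true.json', 'data/tok_texts.txt', task='aspect')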
def get_data_semeval(train_sents_file, train_tok_text_file, train_valid_split_file,
                     test_sents_file, test_tok_text_file, vocab, n_train, task):
    # Same split-file format as above: one line of 0/1 flags, 0 -> train, 1 -> valid.
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = datautils.load_json_objs(train_sents_file)
    # texts = utils.read_lines(train_tok_text_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train, sents_valid, tok_texts_valid = list(), list(), list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts, word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    # A negative n_train keeps all training samples.
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]
    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid, word_span_seqs_valid, vocab, task)

    sents_test = datautils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    print('get test')
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test, vocab, task)

    return train_data, valid_data, test_data
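# A usage sketch for get_data_semeval (paths are hypothetical; judging from
# the checks in get_weak_label_data, task is presumably 'aspect' or 'opinion'):
#
#   train_data, valid_data, test_data = get_data_semeval(
#       'data/train_sents.json', 'data/train_tok_texts.txt',
#       'data/train_valid_split.txt', 'data/test_sents.json',
#       'data/test_tok_texts.txt', vocab, n_train=-1, task='aspect')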
def __init__(self, opinion_terms_vocab_file):
    self.opinion_terms_vocab = set(datautils.read_lines(opinion_terms_vocab_file))