Example #1
def __gen_aspect_noun_filter_dict_file(sents_file, tok_texts_file,
                                       pos_tags_file, common_words_file,
                                       dst_file):
    sents = utils.load_json_objs(sents_file)
    tok_texts = utils.read_lines(tok_texts_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    term_sys_cnts, term_hit_cnts = dict(), dict()
    for sent_idx, (sent, tok_text,
                   pos_tags) in enumerate(zip(sents, tok_texts,
                                              pos_tags_list)):
        sent_words = tok_text.split(' ')
        noun_phrases = rules.rec_rule1(sent_words, pos_tags, None)
        term_objs = sent.get('terms', list())
        terms_true = {term_obj['term'].lower() for term_obj in term_objs}
        for n in noun_phrases:
            sys_cnt = term_sys_cnts.get(n, 0)
            term_sys_cnts[n] = sys_cnt + 1
            if n in terms_true:
                hit_cnt = term_hit_cnts.get(n, 0)
                term_hit_cnts[n] = hit_cnt + 1

    common_words = utils.read_lines(common_words_file)
    filter_terms = set(common_words)
    for term, sys_cnt in term_sys_cnts.items():
        hit_cnt = term_hit_cnts.get(term, 0)
        # print(term, hit_cnt, sys_cnt)
        if hit_cnt / sys_cnt < 0.4:
            filter_terms.add(term)

    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for t in filter_terms:
            fout.write('{}\n'.format(t))
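The 0.4 cutoff above keeps a noun phrase only when its hit rate, i.e. the share of its extractions that match an annotated term, reaches 40%. A minimal, self-contained sketch of that filter with made-up counts:

# Sketch of the hit-rate filter above; the terms and counts are invented.
term_sys_cnts = {'battery life': 10, 'the thing': 10}
term_hit_cnts = {'battery life': 9, 'the thing': 1}

filter_terms = set()
for term, sys_cnt in term_sys_cnts.items():
    hit_rate = term_hit_cnts.get(term, 0) / sys_cnt
    if hit_rate < 0.4:
        filter_terms.add(term)  # 'the thing' is filtered, 'battery life' is kept

print(filter_terms)  # {'the thing'}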
Example #2
def __run_with_mined_rules(mine_helper,
                           rule_patterns_file,
                           term_hit_rate_file,
                           dep_tags_file,
                           pos_tags_file,
                           sent_texts_file,
                           filter_terms_vocab_file,
                           term_hit_rate_thres=0.6,
                           dst_result_file=None,
                           sents_file=None):
    l1_rules, l2_rules = rulescommon.load_rule_patterns_file(
        rule_patterns_file)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file,
                                            term_hit_rate_thres)

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_texts_file)
    filter_terms_vocab = set(utils.read_lines(filter_terms_vocab_file))
    # opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(
            zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = rulescommon.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        for p in l2_rules:
            terms_new = rulescommon.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab,
                l1_terms_new)
            terms.update(terms_new)

        terms_new = mine_helper.get_terms_by_matching(dep_tag_seq, pos_tag_seq,
                                                      sent_text, term_vocab)
        terms.update(terms_new)

        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        # aspect_terms_true = utils.aspect_terms_list_from_sents(sents)
        terms_list_true = mine_helper.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(terms_sys_list, terms_list_true,
                                       dep_tags_list, pos_tags_list,
                                       sent_texts)
Example #3
def __load_data(dep_tags_file, pos_tags_file, sents_file,
                train_valid_split_file):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)

    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list,
                                                   pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train,
                              sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid,
                              sents_valid)
    return data_train, data_valid
Example #4
def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)

    i = 0
    fields = []
    while len(lines[i]) > 0:
        m = re.match(r'^(.*):\s(\d+)-(\d+)\sor\s(\d+)-(\d+)$', lines[i])
        if m is None:
            break
        field = [m.group(1)] + [int(m.group(x)) for x in range(2, 6)]
        fields.append(field)
        i += 1

    i += 1
    ticket = [int(x) for x in lines[i].split(',')]
    i += 2

    tickets = []
    while i < len(lines):
        tickets.append([int(x) for x in lines[i].split(',')])
        i += 1

    p1 = part1(fields, ticket, tickets)
    print("Part1: ", p1)
    p2 = part2(fields, ticket, tickets)
    print("Part2: ", p2)
Example #5
def solution(file):
    print("Input: ", file)
    data = ut.read_lines(file, True)

    rules = {}
    for i, line in enumerate(data):
        if len(line.strip()) == 0:
            break
        r = line.split(':')
        if len(r[1]) == 4:
            rules[int(r[0])] = [0, r[1][2]]
        else:
            rules[int(r[0])] = [1, [x.strip() for x in r[1].split('|')]]

    messages = data[i + 1:]

    memo = {}
    p1 = part1(rules, memo, messages)
    print("Part1: ", p1)
    p2 = part2(rules, memo, messages)
    print("Part2: ", p2)
Example #6
def read_sents_to_word_idx_seqs(tok_texts_file, word_idx_dict):
    texts = utils.read_lines(tok_texts_file)
    word_idx_seq_list = list()
    for sent_text in texts:
        words = sent_text.strip().split(' ')
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])
    return word_idx_seq_list
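Unknown words fall back to index 0 here. A tiny sketch of that lookup with an invented vocabulary:

# Sketch of the word-index lookup above; the vocabulary and sentence are made up.
word_idx_dict = {'good': 1, 'screen': 2}
sent_text = 'good screen but bad battery'
word_idx_seq = [word_idx_dict.get(w, 0) for w in sent_text.strip().split(' ')]
print(word_idx_seq)  # [1, 2, 0, 0, 0] -- out-of-vocabulary words map to 0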
Example #7
def load_train_data_bert_ol(sents_file, train_valid_split_file,
                            valid_bert_tokens_file):
    from utils.utils import read_lines

    aspect_terms_list, opinion_terms_list = datautils.load_terms_list(
        sents_file, True)

    tvs_line = read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    # token_seqs_train = datautils.read_tokens_file(train_bert_tokens_file)
    token_seqs_valid = datautils.read_tokens_file(valid_bert_tokens_file)

    aspect_terms_list_train, aspect_terms_list_valid = list(), list()
    opinion_terms_list_train, opinion_terms_list_valid = list(), list()

    assert len(tvs_arr) == len(aspect_terms_list)
    for i, tvs_label in enumerate(tvs_arr):
        if tvs_label == 0:
            aspect_terms_list_train.append(aspect_terms_list[i])
            opinion_terms_list_train.append(opinion_terms_list[i])
        else:
            aspect_terms_list_valid.append(aspect_terms_list[i])
            opinion_terms_list_valid.append(opinion_terms_list[i])

    data_valid = ValidDataBertOL(token_seqs_valid, aspect_terms_list_valid,
                                 opinion_terms_list_valid)
    return len(aspect_terms_list_train), data_valid
Example #8
def solution(file):
    print("Input: ", file)
    data = ut.read_lines(file, True)

    tiles = []
    i = 0
    tile = None
    while i < len(data):
        line = data[i]
        if line.startswith('Tile'):
            tile = Tile()
            tile.bits = []
            tile.id = int(line.split()[1][0:-1])
            for j in range(10):
                i += 1
                line = data[i]
                tile.bits = tile.bits + [
                    x for x in line.replace('.', '0').replace('#', '1')
                ]
            tiles.append(tile)
        i += 1

    p1 = part1(tiles)
    print("Part1: ", p1)
    p2 = part2(tiles)
Example #9
def solution(file):
    print("Input:", file)
    data = ut.read_lines(file, True)

    rules = []
    for d in data:
        i = 0
        dirs = []

        while i < len(d):
            if d[i:i + 2] in ('nw', 'ne', 'sw', 'se'):
                dirs.append(d[i:i + 2])
                i += 2
            elif d[i] in ('w', 'e'):
                dirs.append(d[i])
                i += 1
        rules.append(dirs)

    p1 = part1(rules)
    part2(p1[0], p1[1])
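The parser above walks each string left to right, taking two-letter directions before single letters. A standalone sketch of the same tokenization on a made-up path string:

# Sketch of the direction tokenizer above, run on an invented path string.
d = 'esenee'
i, dirs = 0, []
while i < len(d):
    if d[i:i + 2] in ('nw', 'ne', 'sw', 'se'):
        dirs.append(d[i:i + 2])
        i += 2
    elif d[i] in ('w', 'e'):
        dirs.append(d[i])
        i += 1
print(dirs)  # ['e', 'se', 'ne', 'e']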
Example #10
def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)

    p1 = part1b(lines, 6)
    print("Part1: ", p1)

    p2 = part2b(lines, 6)
    print("Part2: ", p2)
Example #11
def __rule_result_differ():
    idxs_rule = utils.read_lines('d:/data/aspect/semeval14/rules-correct.txt')
    idxs_neu = utils.read_lines('d:/data/aspect/semeval14/lstmcrf-correct.txt')
    idxs_rule = [int(idx) for idx in idxs_rule]
    idxs_neu = [int(idx) for idx in idxs_neu]
    print(idxs_rule)
    print(idxs_neu)
    idxs_rule_only = list()
    for i in idxs_rule:
        if i not in idxs_neu:
            idxs_rule_only.append(i)
    idxs_neu_only = list()
    for i in idxs_neu:
        if i not in idxs_rule:
            idxs_neu_only.append(i)
    print(idxs_rule_only)
    print(len(idxs_rule_only))
    print(idxs_neu_only)
    print(len(idxs_neu_only))
Example #12
def solution(file, do1, do2):
    print("Input: ", file)
    data = ut.read_lines(file)

    if do1:
        p1 = part1(data)
        print("Part1: ", p1)

    if do2:
        p2 = part2(data)
        print("Part2: ", p2)
Example #13
def gen_train_valid_sample_idxs_file(tok_texts_file, n_valid_samples,
                                     output_file):
    tok_texts = utils.read_lines(tok_texts_file)
    n_samples = len(tok_texts)
    np.random.seed(3719)
    perm = np.random.permutation(n_samples)
    n_train = n_samples - n_valid_samples
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]
    with open(output_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(idx) for idx in idxs_train])))
        fout.write('{}\n'.format(' '.join([str(idx) for idx in idxs_valid])))
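The split is reproducible because of the fixed seed. A minimal sketch of the same permutation split with toy sizes:

import numpy as np

# Sketch of the seeded train/valid index split above; the sizes are invented.
n_samples, n_valid_samples = 10, 3
np.random.seed(3719)
perm = np.random.permutation(n_samples)
n_train = n_samples - n_valid_samples
idxs_train, idxs_valid = perm[:n_train], perm[n_train:]
print(len(idxs_train), len(idxs_valid))  # 7 3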
Example #14
def solution(file):
    print("Input: ", file)

    lines = ut.read_lines(file)
    nums = [int(x) for x in lines]

    p1 = part1(nums)
    print("Part1: ", p1)

    p2 = part2(nums)
    print("Part2: ", p2)
Example #15
def __merge_train_test(train_sents_file, test_sents_file, train_valid_split_file, dst_sents_file, dst_datasplit_file):
    train_sents = utils.load_json_objs(train_sents_file)
    test_sents = utils.load_json_objs(test_sents_file)
    all_sents = train_sents + test_sents
    utils.save_json_objs(all_sents, dst_sents_file)

    train_valid_split_labels = utils.read_lines(train_valid_split_file)[0]
    train_valid_split_labels = [int(v) for v in train_valid_split_labels.split(' ')]
    all_data_split_labels = train_valid_split_labels + [2 for _ in range(len(test_sents))]
    with open(dst_datasplit_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(v) for v in all_data_split_labels])))
Example #16
def __get_manual_feat(tok_texts_file, terms_file):
    tok_texts = utils.read_lines(tok_texts_file)
    terms_list = utils.load_json_objs(terms_file)
    feat_list = list()
    for terms_true, tok_text in zip(terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = modelutils.label_sentence(words, terms_true)
        feat_seq = np.zeros([len(label_seq), 3], np.int32)
        for i, v in enumerate(label_seq):
            feat_seq[i][v] = 1
        feat_list.append(feat_seq)
    return feat_list
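Each label value selects one of three feature columns. A small standalone sketch of that one-hot encoding with a made-up label sequence:

import numpy as np

# Sketch of the one-hot feature construction above; the label sequence is invented.
label_seq = [0, 1, 2, 0]
feat_seq = np.zeros([len(label_seq), 3], np.int32)
for i, v in enumerate(label_seq):
    feat_seq[i][v] = 1
print(feat_seq)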
Example #17
def solution(file):
    print("Input: ", file)

    lines = ut.read_lines(file)
    dirs = []
    for line in lines:
        dirs.append([line[0], int(line[1:])])

    p1 = part1(dirs)
    print("Part1: ", p1)

    p2 = part2(dirs)
    print("Part2: ", p2)
Example #18
def main():
    if len(sys.argv) < 2:
        print('Provide input file name')
        exit(-1)
    if len(sys.argv) < 3:
        print('Provide preamble length')
        exit(-1)

    lines = ut.read_lines(sys.argv[1])
    nums = [int(x) for x in lines]

    value = part1(nums, int(sys.argv[2]))
    part2(nums, value)
Example #19
def solution(file):
    print("Input: ", file)

    lines = ut.read_lines(file)
    start = int(lines[0])
    buses = lines[1].split(',')

    p1 = part1(start, buses)
    print("Part1: ", p1)

    #p2 = part2(buses)
    #p2 = part2_crt(buses)
    p2 = part2_again(buses)
    print("Part2: ", p2)
Example #20
def solution(file):
    print("Input: ", file)

    lines = ut.read_lines(file)
    rows = []
    rows.append(list('.' * (len(lines[0]) + 2)))
    for line in lines:
        rows.append(list('.' + line + '.'))
    rows.append(list('.' * (len(lines[0]) + 2)))

    p1 = part1(rows)
    print("Part1: ", p1)

    p2 = part2(rows)
    print("Part2: ", p2)
Example #21
def main():
    if len(sys.argv) < 2:
        print('Provide input file name')
        exit(-1)

    lines = ut.read_lines(sys.argv[1])
    prog  = []
    for line in lines:
        m = re.match(r'^(\w+)\s+(.)(\d+)$', line)
        sign = 1 if m.group(2) == '+' else -1
        inst = [m.group(1), sign * int(m.group(3)), False]
        prog.append(inst)

    # Deep copy prog because Python
    part1([p[:] for p in prog])
Example #22
def solution(file):
    print("Input:", file)
    data = ut.read_lines(file, True)
    foods = []
    for line in data:
        temp = (line.replace('(contains', ':').replace(',', ' ')
                .replace(')', ' ').strip().split(':'))
        food = Food()
        food.ings = set(temp[0].split())
        food.alls = set(temp[1].split())
        foods.append(food)

    xxx_ingredients = set.union(*[f.ings for f in foods])
    eng_allergens = set.union(*[f.alls for f in foods])
    eng_all_to_xxx_ings = {
        a: set.intersection(*[f.ings for f in foods if a in f.alls])
        for a in eng_allergens
    }
    xxx_allergens = set.union(*[eng_all_to_xxx_ings[a] for a in eng_allergens])

    # Part 1
    safe = xxx_ingredients.difference(xxx_allergens)
    print('Part1:', sum([len(safe.intersection(f.ings)) for f in foods]))

    # Part 2
    # Map of ingredients identified as allergens
    eng_all_to_ing = {}
    while True:
        # Find allergens which can be only one of the ingredients.
        pairs = {a: i for (a, i) in eng_all_to_xxx_ings.items() if len(i) == 1}
        # Assume zero or more matches.
        for p in pairs:
            eng_all_to_ing[p] = list(pairs[p])[0]
        # Zero means we're done - or the system of constraints can't be resolved.
        if len(pairs) == 0:
            break
        # Remove all the found ingredients from the food items for the next go around.
        ings = set.union(*[pairs[a] for a in pairs])
        eng_all_to_xxx_ings = {
            a: i.difference(ings)
            for (a, i) in eng_all_to_xxx_ings.items()
        }

    print('Part2:', ','.join([i for i in sorted(eng_all_to_ing)]))
    print('Part2:',
          ','.join([eng_all_to_ing[i] for i in sorted(eng_all_to_ing)]))
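The loop above repeatedly pins down allergens that have exactly one candidate ingredient left and removes that ingredient from the other candidate sets. A standalone sketch of the same elimination with made-up data:

# Sketch of the constraint-elimination loop above; the candidate sets are invented.
all_to_ings = {'dairy': {'mxmxvkd', 'fvjkl'}, 'soy': {'fvjkl'}}
resolved = {}
while True:
    pairs = {a: i for (a, i) in all_to_ings.items() if len(i) == 1}
    for p in pairs:
        resolved[p] = list(pairs[p])[0]
    if len(pairs) == 0:
        break
    found = set.union(*[pairs[a] for a in pairs])
    all_to_ings = {a: i.difference(found) for (a, i) in all_to_ings.items()}
print(resolved)  # {'soy': 'fvjkl', 'dairy': 'mxmxvkd'}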
Example #23
def __opinion_rule_insight(dep_tags_file,
                           pos_tags_file,
                           sent_text_file,
                           terms_vocab,
                           dst_result_file=None,
                           sents_file=None):
    print('loading data ...')
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_text_file)
    assert len(dep_tags_list) == len(sent_texts)
    assert len(pos_tags_list) == len(dep_tags_list)
    print('done.')
    opinions_sys_list = list()
    for sent_idx, sent_text in enumerate(sent_texts):
        dep_tags = dep_tags_list[sent_idx]
        pos_tags = pos_tags_list[sent_idx]
        assert len(dep_tags) == len(pos_tags)

        opinion_terms = set()
        # used rule2 and __match_terms to pretrain
        # terms_new = opinionrules.rule1(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = opinionrules.rule2(dep_tags, pos_tags)
        opinion_terms.update(terms_new)
        # terms_new = opinionrules.rule4(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = __match_terms(sent_text, terms_vocab)
        opinion_terms.update(terms_new)
        opinions_sys_list.append(opinion_terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(opinions_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        opinions_true_list = list()
        for sent in sents:
            opinions_true_list.append(
                [t.lower() for t in sent.get('opinions', list())])
        correct_sent_idxs = __evaluate(opinions_sys_list, opinions_true_list,
                                       dep_tags_list, pos_tags_list,
                                       sent_texts)
Example #24
def load_train_data_bert(bert_embed_file, sents_file, train_valid_split_file):
    from utils.utils import read_lines

    token_seqs, token_embed_seqs = __load_bert_embed_data(bert_embed_file)
    aspect_terms_list, opinion_terms_list = datautils.load_terms_list(
        sents_file, True)

    tvs_line = read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    token_seqs_train, token_seqs_valid = list(), list()
    token_embed_seqs_train, token_embed_seqs_valid = list(), list()
    aspect_terms_list_train, aspect_terms_list_valid = list(), list()
    opinion_terms_list_train, opinion_terms_list_valid = list(), list()

    assert len(tvs_arr) == len(token_seqs)
    for i, tvs_label in enumerate(tvs_arr):
        if tvs_label == 0:
            token_seqs_train.append(token_seqs[i])
            token_embed_seqs_train.append(token_embed_seqs[i])
            aspect_terms_list_train.append(aspect_terms_list[i])
            opinion_terms_list_train.append(opinion_terms_list[i])
        else:
            token_seqs_valid.append(token_seqs[i])
            token_embed_seqs_valid.append(token_embed_seqs[i])
            aspect_terms_list_valid.append(aspect_terms_list[i])
            opinion_terms_list_valid.append(opinion_terms_list[i])

    cnt_miss = 0
    label_seqs_train = list()
    for i, (aspect_terms, opinion_terms) in enumerate(
            zip(aspect_terms_list_train, opinion_terms_list_train)):
        y = datautils.label_sentence(token_seqs_train[i], aspect_terms,
                                     opinion_terms)
        # if len(aspect_terms) - np.count_nonzero(y == 1) > 0:
        #     print(aspect_terms)
        label_seqs_train.append(y)
        cnt_miss += len(aspect_terms) - np.count_nonzero(y == 1)
    print(cnt_miss, 'missed')
    data_train = TrainDataBert(label_seqs_train, token_embed_seqs_train)

    data_valid = get_valid_data(token_embed_seqs_valid, token_seqs_valid,
                                aspect_terms_list_valid,
                                opinion_terms_list_valid)
    return data_train, data_valid
Example #25
def __missing_terms():
    opinion_terms_file = 'd:/data/aspect/semeval14/opinion-terms-full.txt'
    opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))
    train_sents = utils.load_json_objs(config.SE15R_FILES['train_sents_file'])
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])
    train_terms = set()
    test_terms = dict()
    for s in train_sents:
        for t in s['opinions']:
            train_terms.add(t.lower())
    for s in test_sents:
        for t in s['opinions']:
            cnt = test_terms.get(t.lower(), 0)
            test_terms[t.lower()] = cnt + 1
            # test_terms.add(t.lower())
    for t, cnt in test_terms.items():
        if t not in train_terms:
            print(t, cnt, t in opinion_terms_vocab)
Example #26
def __rand_laptops(n_sents):
    tok_texts_file = config.AMAZON_TOK_TEXTS_FILE
    aspect_terms_file = config.AMAZON_RM_TERMS_FILE
    opinion_terms_file = config.AMAZON_TERMS_TRUE4_FILE

    dst_tok_texts_file = 'd:/data/amazon/rand-laptops/laptops-tok-texts-{}.txt'.format(
        n_sents)
    dst_at_file = 'd:/data/amazon/rand-laptops/laptops-aspect-terms-{}.txt'.format(
        n_sents)
    dst_ot_file = 'd:/data/amazon/rand-laptops/laptops-opinion-terms-{}.txt'.format(
        n_sents)

    tok_texts = utils.read_lines(tok_texts_file)
    n_sents_total = len(tok_texts)
    rand_perm = np.random.permutation(n_sents_total)
    rand_idxs = rand_perm[:n_sents]
    __write_lines(tok_texts_file, dst_tok_texts_file, rand_idxs)
    __write_lines(aspect_terms_file, dst_at_file, rand_idxs)
    __write_lines(opinion_terms_file, dst_ot_file, rand_idxs)
Example #27
def get_bboxes(img, gt_path):
    h, w = img.shape[0:2]
    lines = read_lines(gt_path)
    bboxes = []
    tags = []
    for line in lines:
        line = remove_all(line, '\xef\xbb\xbf')
        gt = split(line, ',')

        x1 = int(gt[0])
        y1 = int(gt[1])

        bbox = [int(gt[i]) for i in range(4, 32)]
        bbox = np.asarray(bbox) + ([x1 * 1.0, y1 * 1.0] * 14)
        bbox = np.asarray(bbox) / ([w * 1.0, h * 1.0] * 14)

        bboxes.append(bbox)
        tags.append(True)
    return np.array(bboxes), tags
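Each 14-point box is first shifted by the (x1, y1) offset and then normalized by the image width and height. A minimal numpy sketch of that step with made-up values:

import numpy as np

# Sketch of the offset-then-normalize step above, for one 14-point polygon on a
# made-up 100x50 image; the coordinate values are invented.
w, h = 100, 50
x1, y1 = 10, 20
bbox = np.arange(28, dtype=np.float64)      # 14 (x, y) offsets, made up
bbox = bbox + ([x1 * 1.0, y1 * 1.0] * 14)   # shift to absolute coordinates
bbox = bbox / ([w * 1.0, h * 1.0] * 14)     # scale by image width and height
print(bbox[:4])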
Example #28
def get_data_amazon_ao(vocab, aspect_terms_file, opinion_terms_file,
                       tok_texts_file):
    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    assert len(aspect_terms_list) == len(tok_texts)
    assert len(opinion_terms_list) == len(tok_texts)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for aspect_terms, opinion_terms, tok_text in zip(aspect_terms_list,
                                                     opinion_terms_list,
                                                     tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, aspect_terms, opinion_terms)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]

    aspects_list_valid = [aspect_terms_list[idx] for idx in idxs_valid]
    opinions_list_valid = [opinion_terms_list[idx] for idx in idxs_valid]
    valid_data = ValidData(label_seq_list_valid, word_idx_seq_list_valid,
                           tok_texts_valid, aspects_list_valid,
                           opinions_list_valid)

    return train_data, valid_data
Example #29
def get_data_semeval(train_sents_file, train_tok_text_file,
                     train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = utils.load_json_objs(train_sents_file)
    # texts = utils.read_lines(train_tok_text_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train = list(), list()
    sents_valid, tok_texts_valid = list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts,
                                     word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]

    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid,
                                  word_span_seqs_valid, vocab, task)

    sents_test = utils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test,
                                 vocab, task)
    return train_data, valid_data, test_data
Example #30
def __process_hl04():
    filenames = utils.read_lines(config.DATA_FILE_LIST_FILE_HL04)
    reviews, sents, sents_text = list(), list(), list()
    for filename in filenames:
        tmp_revs, tmp_sents = __process_huliu04_file(filename, len(reviews))
        reviews += tmp_revs
        sents += tmp_sents

    with open(config.SENT_TEXT_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for s in sents:
            assert '\n' not in s['text']
            fout.write('{}\n'.format(s['text']))

    with open(config.REVIEWS_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for r in reviews:
            fout.write('{}\n'.format(json.dumps(r, ensure_ascii=False)))

    with open(config.SENTS_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for s in sents:
            fout.write('{}\n'.format(json.dumps(s, ensure_ascii=False)))