Example #1
def parse_ground_truth_triples(df):
    """Collect the ground-truth (head, relation, tail) triples for each dev document."""
    ground_truth_triples = []

    num_dev_documents = len(load_documents(cf.DEV_FILENAME))

    # Map each document index to its list of triples.
    ground_truth_triples_dict = {k: [] for k in range(num_dev_documents)}

    # Each row holds a sentence index and an (s1, r, s2) triple, stored as
    # whitespace-separated token strings.
    for row in df.itertuples():
        sent_index = int(getattr(row, 'index'))
        head = getattr(row, 's1').split()
        rel = str(getattr(row, 'r')).split()
        tail = getattr(row, 's2').split()
        if sent_index not in ground_truth_triples_dict:
            ground_truth_triples_dict[sent_index] = []
        ground_truth_triples_dict[sent_index].append([head, rel, tail])

    # Flatten the dict into a list of per-document triple lists, ordered by index.
    for k in range(num_dev_documents):
        ground_truth_triples.append(list(ground_truth_triples_dict[k]))

    return ground_truth_triples
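
A small usage sketch, assuming the dev CSV read with pd.read_csv(cf.DEV_FILENAME) provides the index, s1, r and s2 columns that the getattr calls above expect; cf and load_documents are helpers from the surrounding project.

# Hypothetical usage sketch; the column names mirror the getattr calls above.
import pandas as pd

dev_df = pd.read_csv(cf.DEV_FILENAME)   # columns: index, s1, r, s2
triples_per_doc = parse_ground_truth_triples(dev_df)
print(len(triples_per_doc), 'documents,',
      sum(len(ts) for ts in triples_per_doc), 'triples')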
Example #2
def main(opts):

    if len(opts) == 0:
        raise ValueError("Usage: evaluate.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering or bbn."
        )

    cf.load_config(dataset)

    datasets = {}

    # 1. Read the train and dev datasets from their CSV files.
    datasets['dev'] = pd.read_csv(cf.DEV_FILENAME)
    datasets['train'] = pd.read_csv(cf.TRAIN_FILENAME, encoding='utf-8')

    # 2. Load documents
    documents = {}
    documents['train'] = load_documents(cf.TRAIN_DOCUMENTS_FILENAME)
    documents['dev'] = load_documents(cf.DEV_DOCUMENTS_FILENAME)

    # 3. Build a dataset and data loader for each split (train, dev).
    data_loaders = {}
    for ds_name, df in datasets.items():
        logger.info("Building %s dataset...", ds_name)
        built_dataset = build_dataset(df, ds_name, documents[ds_name])
        data_loader = DataLoader(built_dataset,
                                 batch_size=cf.BATCH_SIZE,
                                 pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully.", ds_name)

    logger.info("Saving data loaders to file...")

    save_obj_to_pkl_file(data_loaders['train'], 'data loader (train)',
                         cf.ASSET_FOLDER + '/data_loader_train.pkl')
    save_obj_to_pkl_file(data_loaders['dev'], 'data loader (dev)',
                         cf.ASSET_FOLDER + '/data_loader_dev.pkl')
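
The usage message suggests the script is invoked as evaluate.py <dataset>; a minimal entry point under that assumption would forward the command-line arguments after the script name.

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])   # e.g. python evaluate.py bbn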
Example #3
def get_encoded_data(args):
    global agree_words, disagree_words, agree_indices, disagree_indices
    # %% load data
    # load sentence data
    sents, labels = load_sentences(domain=args.domain)

    # load sentiment lexicon
    lexicon = load_lexicon()
    pos_words = [word for word in lexicon if lexicon[word] == 1]
    neg_words = [word for word in lexicon if lexicon[word] == 0]
    lex_labels = [1] * len(pos_words) + [0] * len(neg_words)
    lex_word_seqs = pos_words + neg_words

    # load document data
    mdsd_domain = 'dvd' if args.domain == 'dvds' else args.domain
    doc_texts, doc_labels, _ = load_documents(domains=(mdsd_domain,))  # just one domain, ignore domain labels

    ## build vocabulary
    counter = Counter()
    word_seqs = []
    doc_word_seqs = []
    doc_word_sseqs = []
    # tokenize to words
    for sent in sents:
        word_seqs.append(my_tokenize(sent))  # [[w1, w2, ...], ...]
    for doc in doc_texts:
        doc_word_seqs.append(my_tokenize(doc))
        sent_seqs = []
        for sent in sent_tokenize(doc):
            sent_seqs.append(my_tokenize(sent))
        doc_word_sseqs.append(sent_seqs)  # [[[w11, w12, ...], [w21, w22, ...], ...], ...]
    # collect length statistics and update the word counts
    lens = []
    doc_lens = []
    doc_sentlens = []
    doc_wordlens = []
    for word_seq in word_seqs:
        counter.update(word_seq)
        lens.append(len(word_seq))
    for word in lexicon.keys():
        counter.update([word])
    for doc_word_seq in doc_word_seqs:
        # counter.update(doc_word_seq)
        doc_lens.append(len(doc_word_seq))
    for sent_seqs in doc_word_sseqs:
        doc_sentlens.append(len(sent_seqs))
        for sent_seq in sent_seqs:
            counter.update(sent_seq)
            doc_wordlens.append(len(sent_seq))
    percentage = 98
    maxlen = int(np.percentile(lens, percentage))
    doc_maxlen_sent = int(np.percentile(doc_sentlens, percentage))  # sentence cap per doc
    doc_maxlen_word = int(np.percentile(doc_wordlens, percentage))  # word cap per sentence
    doc_maxlen_word = max(maxlen, doc_maxlen_word)

    # the vocabulary
    min_freq = 3
    word2index = dict()
    idx = 2  # start from 2, 0 as <PAD>, 1 as <OOV>
    for word_count in counter.most_common():
        if word_count[1] >= min_freq or word_count[0] in lexicon:
            word2index[word_count[0]] = idx
            idx += 1
    n_words = len(word2index) + 2  # +2 for <PAD> (index 0) and <OOV> (index 1)
    print('words:', len(word2index))

    print('[agree] words:')
    for word in agree_words:
        if word in word2index:
            agree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('[disagree] words:')
    for word in disagree_words:
        if word in word2index:
            disagree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('agree: {}\ndisagree: {}'.format(agree_indices, disagree_indices))

    # %% data encoding ====================================================================
    # sentence data, plus its CV-fold splits
    seqs = []
    for words in word_seqs:
        seqs.append([word2index.get(word, 1) for word in words])
    padded_seqs_bak = pad_sequences(seqs, maxlen=doc_maxlen_word, padding='post', truncating='post')
    labels_bak = np.asarray(labels, dtype=int)
    print('sent:', padded_seqs_bak.shape, labels_bak.shape)

    # CV-fold split for sentence data
    kf = StratifiedKFold(n_splits=CV, shuffle=True)
    padded_seqs_trains = dict()
    padded_seqs_tests = dict()
    labels_trains = dict()
    labels_tests = dict()
    print('{} fold train/test splitting'.format(CV))
    for cv, (train_idx, test_idx) in enumerate(kf.split(padded_seqs_bak, labels_bak)):
        padded_seqs_trains[cv] = padded_seqs_bak[train_idx]
        padded_seqs_tests[cv] = padded_seqs_bak[test_idx]
        labels_trains[cv] = labels_bak[train_idx]
        labels_tests[cv] = labels_bak[test_idx]

    # lexicon data: each lexicon word becomes a length-1 sequence
    lex_seqs = []
    for word in lex_word_seqs:
        lex_seqs.append([word2index.get(word, 1)])
    lex_padded_seqs = pad_sequences(lex_seqs, maxlen=1, padding='post', truncating='post')
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (all):', lex_padded_seqs.shape, lex_labels.shape)

    # doc data (hierarchical), padding from word to sent
    n_samples = len(doc_word_sseqs)
    doc_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent, doc_maxlen_word), dtype=int)
    for i, sseq_1doc in enumerate(doc_word_sseqs):
        for j, seq_1doc in enumerate(sseq_1doc):
            if j < doc_maxlen_sent:
                for k, word in enumerate(seq_1doc):
                    if k < doc_maxlen_word:
                        doc_padded_seqs[i, j, k] = word2index.get(word, 1)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (all):', doc_padded_seqs.shape, doc_labels.shape)

    # relation data for doc (internal sents) (agree & disagree)
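    # The loop below labels each sentence j >= 1 by its first token: +1 if that token
    # is an "agree" cue word, -1 if it is a "disagree" cue word, and 0 otherwise.
    # Sentence 0 is skipped, presumably because the cue marks the relation of a
    # sentence to the one preceding it.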
    count_agree, count_disagree = 0, 0
    doc_rel_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent), dtype=int)
    for i in range(0, n_samples):
        for j in range(1, doc_maxlen_sent):
            if doc_padded_seqs[i, j, 0] in agree_indices:
                doc_rel_padded_seqs[i, j] = 1
                count_agree += 1
            if doc_padded_seqs[i, j, 0] in disagree_indices:
                doc_rel_padded_seqs[i, j] = -1
                count_disagree += 1
    print(' - doc sent-rel (all):', doc_rel_padded_seqs.shape)
    print(' - doc sent-rel (all): agree: {}, disagree: {}'.format(count_agree, count_disagree))

    ## sub-sample from lexicon and documents
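    # Both auxiliary sources are down-sampled to the number of labelled sentences
    # (train + test of fold 0); balanced_subsample2/balanced_subsample3 are project
    # helpers which, judging by their names, keep the label distribution balanced.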
    print('sub-sampling:')
    # doc data sub-sample
    n_samples = len(padded_seqs_trains[0]) + len(padded_seqs_tests[0])
    doc_padded_seqs, doc_rel_padded_seqs, doc_labels = balanced_subsample3(
        doc_padded_seqs, doc_rel_padded_seqs, doc_labels, subsample_num=n_samples)
    doc_padded_seqs = np.asarray(doc_padded_seqs)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (sampled):', doc_padded_seqs.shape, doc_labels.shape)

    # lex data sub-sample
    lex_padded_seqs, lex_labels = balanced_subsample2(lex_padded_seqs, lex_labels, subsample_num=n_samples)
    lex_padded_seqs = np.asarray(lex_padded_seqs)
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (sampled):', lex_padded_seqs.shape, lex_labels.shape)
    ddata = {
        'n_samples': n_samples,
        'n_words': n_words,
        'doc_maxlen_word': doc_maxlen_word,
        'doc_maxlen_sent': doc_maxlen_sent,
        'word2index': word2index,
        'padded_seqs_trains': padded_seqs_trains,
        'labels_trains': labels_trains,
        'padded_seqs_tests': padded_seqs_tests,
        'labels_tests': labels_tests,
        'lex_padded_seqs': lex_padded_seqs,
        'lex_labels': lex_labels,
        'doc_padded_seqs': doc_padded_seqs,
        'doc_labels': doc_labels,
        'doc_rel_padded_seqs': doc_rel_padded_seqs,
    }
    return ddata
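
A minimal sketch of how the returned dictionary might be consumed, assuming the module-level globals used above (agree_words, disagree_words, CV, etc.) are defined as in the original file; SimpleNamespace is only a stand-in for the real args object, of which just the domain attribute is read.

# Hypothetical usage sketch; SimpleNamespace stands in for the real argument object.
from types import SimpleNamespace

args = SimpleNamespace(domain='dvds')
ddata = get_encoded_data(args)

for cv in range(len(ddata['padded_seqs_trains'])):   # one dict entry per CV fold
    X_train, y_train = ddata['padded_seqs_trains'][cv], ddata['labels_trains'][cv]
    X_test, y_test = ddata['padded_seqs_tests'][cv], ddata['labels_tests'][cv]
    print('fold', cv, X_train.shape, y_train.shape, X_test.shape, y_test.shape)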