        order = sorted(data.tags.keys())
        base, claim = embed.process(data.p_data, data.w_data)

        # Concatenate the base embedding with both claim embeddings for
        # each instance, in a fixed key order.
        inputs = np.array(
            [base[i] + claim[i][0] + claim[i][1] for i in order])

        tags = np.array([data.tags[i] for i in order])
        # tags = ku.to_categorical(tags, 2)
        return inputs, tags, order
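
# A minimal, hypothetical sketch of a classifier that the (inputs, tags)
# arrays built above could feed; `build_model`, the layer sizes, and the
# Keras backend are illustrative assumptions, not the project's network.
def build_model(input_dim):
    from tensorflow import keras

    model = keras.Sequential([
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(2, activation='softmax'),  # two tag classes
    ])
    # Categorical crossentropy matches the one-hot conversion hinted at by
    # the commented-out ku.to_categorical(tags, 2) line above.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model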


if __name__ == "__main__":
    """Sample demo of neural net"""
    import args
    try:
        dataset = SemEvalData(file=args.train_file())
    except FileNotFoundError:
        # Couldn't find the dataset; its location must be specified.
        print(
            'Could not find dataset at', args.train_file(),
            '\nPlease set the parent dir (-d), training dir (-trd),'
            ' and dataset location (-tr) and execute script again.')
        raise SystemExit

    # Some preprocessing steps for dataset.
    dataset.expand_contraction()
    dataset.remove_common()
    dataset.add_bag_words()

    # Train, predict, and show as csv.
    embedder = WordEmbedder(load=args.google_file())
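
    # The example is truncated here. A purely illustrative sketch of the
    # "train, predict, and show as csv" step the comment above describes;
    # every name below (vectorize, build_model, the CSV columns) is an
    # assumption, not recovered code:
    #
    #     inputs, tags, order = vectorize(dataset, embedder)
    #     model = build_model(inputs.shape[1])
    #     model.fit(inputs, ku.to_categorical(tags, 2), epochs=10)
    #     preds = model.predict(inputs).argmax(axis=1)
    #     pd.DataFrame({'id': order, 'tag': preds}).to_csv(
    #         'output.csv', index=False)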

Example #2

    def __add__(self, other):
        # Reconstructed opening (the snippet begins mid-method): the name,
        # the empty constructor, and the tag merge are presumed from the
        # fold-joining `+` in the demo below.
        ds = SemEvalData()

        tags = self.tags.copy()
        tags.update(other.tags)
        ds.tags = tags

        p_data = self.p_data.copy()
        p_data.update(other.p_data)
        ds.p_data = p_data

        w_data = self.w_data.copy()
        w_data.update(other.w_data)
        ds.w_data = w_data

        return ds


if __name__ == '__main__':
    """Simple demonstration of capabilities of the dataset."""
    import args
    dataset = SemEvalData(file=args.train_file())

    dataset.remove_stop_words()
    dataset.tag_pos()
    dataset.add_ngrams()
    # print(dataset.w_data['13319707_476_A1DJNUJZN8FE7N'])

    # print(len(dataset.pretext))
    # print('\n'.join(str(x) for x in dataset.folds(9)))

    folded_datasets, orders = dataset.datasets_from_folds(10)
    test = folded_datasets[0] + folded_datasets[1]
    # print('\n'.join(str(len(o)) for o in orders))
    print(list(sorted(folded_datasets[0].tags.keys())))
    print(list(sorted(folded_datasets[1].tags.keys())))
    print(list(sorted(test.tags.keys())))
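
# A hypothetical k-fold cross-validation loop assembled from the pieces the
# demo above exercises; only datasets_from_folds() and dataset addition come
# from the original code, while `cross_validate` itself and the evaluation
# placeholder are illustrative assumptions.
def cross_validate(dataset, k=10):
    folds, orders = dataset.datasets_from_folds(k)
    for i in range(k):
        held_out = folds[i]
        train = None
        for j, fold in enumerate(folds):
            if j == i:
                continue  # skip the held-out fold
            train = fold if train is None else train + fold
        # ... fit a model on `train`, score it on `held_out` ...
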
Example #3
""" compare output.csv tags with the train-full.txt data """
""" author: wardac """

import args
import pandas as pd

# Gold data: tab-separated; record field 1 is the instance id, field 4 the
# gold tag (field 0 is the DataFrame index added by to_records()).
raw_data = pd.read_csv(args.train_file(), sep='\t').to_records()
tags = {x[1]: x[4] for x in raw_data}

# Predictions: comma-separated; field 1 is the instance id, field 2 the
# predicted tag.
pred_data = pd.read_csv('output.csv', sep=',').to_records()
preds = {x[1]: x[2] for x in pred_data}

correct = 0
for key in tags:
    if key in preds:
        # print(key, '\t', tags[key], '\t', preds[key])
        if tags[key] == preds[key]:
            correct += 1
    else:
        print(key, 'key not found')

print('accuracy', correct / len(tags))
lost = len(tags) - len(preds)
if lost:
    print(lost, 'key(s) lost')
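
# Optional extension (not part of the original script): per-tag accuracy,
# computed from the same `tags` and `preds` dicts built above.
from collections import Counter

gold_counts = Counter(tags.values())
correct_counts = Counter(tags[k] for k in tags
                         if k in preds and tags[k] == preds[k])
for tag in sorted(gold_counts):
    print(tag, correct_counts[tag], '/', gold_counts[tag])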