# NOTE(review): the statements down to `return` are the tail of a function
# whose `def` line is outside this view; `data` and `embed` are presumably its
# parameters -- confirm against the full file.
    # Fix a single, sorted key order so that the feature rows, the labels and
    # the returned ids all line up index-for-index.
    order = sorted(data.tags.keys())
    base, claim = embed.process(data.p_data, data.w_data)
    # One row per key: base[i] combined with both entries of claim[i] via `+`.
    # Whether `+` concatenates or adds depends on what embed.process returns,
    # which is not visible here -- TODO confirm.
    # NOTE(review): the local name `input` shadows the builtin of the same name.
    input = np.array(
        list(base[i] + claim[i][0] + claim[i][1] for i in order))
    tags = np.array([data.tags[i] for i in order])
    # tags = ku.to_categorical(tags, 2)
    return input, tags, order


if __name__ == "__main__":
    """Sample demo of neural net"""
    import args
    try:
        dataset = SemEvalData(file=args.train_file())
    except FileNotFoundError:
        # Couldn't find dataset, will need to specify.
        # NOTE(review): the two adjacent string literals below are joined
        # without a space, so the message reads "(-trd),and dataset".
        print(
            'Could not find dataset at', args.train_file(),
            '\nPlease set the parent dir (-d), training dir (-trd),'
            'and dataset location (-tr) and execute script again.')
        # NOTE(review): a bare SystemExit ends the process with exit status 0
        # even though this is a failure path.
        raise SystemExit

    # Some preprocessing steps for dataset.
    dataset.expand_contraction()
    dataset.remove_common()
    dataset.add_bag_words()

    # Train, predict, and show as csv.
    # Only the embedder construction is visible in this view; the training and
    # prediction steps announced above presumably follow.
    embedder = WordEmbedder(load=args.google_file())
# NOTE(review): the statements down to `return ds` are the tail of a method
# whose header is outside this view (presumably `__add__`, given the `+` used
# in the demo below); `ds` is created in the part that is not shown.
        # Merge on copies so that self.p_data / self.w_data are not the objects
        # being updated. Assuming these are dicts, entries from `other` win
        # when a key exists in both -- TODO confirm the container type.
        p_data = self.p_data.copy()
        p_data.update(other.p_data)
        ds.p_data = p_data
        w_data = self.w_data.copy()
        w_data.update(other.w_data)
        ds.w_data = w_data
        return ds


if __name__ == '__main__':
    """Simple demonstration of capabilities of the dataset."""
    import args
    # Load the training file and run three preprocessing steps on it.
    dataset = SemEvalData(file=args.train_file())
    dataset.remove_stop_words()
    dataset.tag_pos()
    dataset.add_ngrams()
    # print(dataset.w_data['13319707_476_A1DJNUJZN8FE7N'])
    # print(len(dataset.pretext))
    # print('\n'.join(str(x) for x in dataset.folds(9)))
    # Split into folds (argument 10), then recombine the first two with `+`.
    folded_datasets, orders = dataset.datasets_from_folds(10)
    test = folded_datasets[0] + folded_datasets[1]
    # print('\n'.join(str(len(o)) for o in orders))
    # Print the sorted tag keys of each of the two folds and of their sum so
    # the three lists can be compared by eye.
    print(list(sorted(folded_datasets[0].tags.keys())))
    print(list(sorted(folded_datasets[1].tags.keys())))
    print(list(sorted(test.tags.keys())))
""" compare output.csv tags with the train-full.txt data """ """ author: wardac """ import args import pandas as pd raw_data = pd.read_csv(args.train_file(), sep='\t').to_records() tags = {x[1]: x[4] for x in raw_data} pred_data = pd.read_csv('output.csv', sep=',').to_records() preds = {x[1]: x[2] for x in pred_data} correct = 0 for key in tags.keys(): if key in preds: # print(key,'\t',tags[key],'\t',preds[key]) if tags[key] == preds[key]: correct = correct + 1 else: print(key, 'key not found') print('accuracy', correct / len(tags)) lost = len(tags) - len(preds) if lost: print(lost, 'key(s) lost')