def test_predict(self):
    lines = sys.stdin.readlines()
    reader = IOBReader(lines)
    extractors = []
    params = {
        'epochs': 100,
        'learning_rate': 0.01,
        'window_size': 5,
        'name_model': 'model_we.ckpt',
        'word_embeddings_file': 'data/vectors.txt'
    }
    classifier = WordEmbeddingsClassifier(reader, extractors, WordEmbeddingsEstimator, **params)
    predicted = classifier.predict()
    labels_idx_rev = {v: k for k, v in reader.vocabulary[reader.getPosition('LABEL')].items()}
    i = 0
    for line in lines:
        line = line.strip()
        if line:
            print '%s\t%s\t%s' % (line.split()[0], line.split()[1], labels_idx_rev[predicted[i]])
            i += 1
        else:
            print
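# Sketch of the CoNLL-style IOB data test_predict expects on stdin and the three-column
# output it prints (token, second column, predicted label). The tokens and tags below are
# invented for illustration; only the column layout is taken from the code above:
#
#   stdin:              stdout:
#   John    NNP         John    NNP    B-PER
#   lives   VBZ         lives   VBZ    O
#   in      IN          in      IN     O
#   Rome    NNP         Rome    NNP    B-LOC
#
# A blank line separates sentences and is echoed as a blank line; any extra columns
# in the input (e.g. a gold label) are ignored when printing.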
def main():
    parser = argparse.ArgumentParser(description='Named Entity Recognition with TensorFlow')
    subparsers = parser.add_subparsers()

    parser_train = subparsers.add_parser('train')
    parser_train.set_defaults(which='train')
    parser_train.add_argument('-e', '--epochs', help='epochs number', type=int, required=True)
    parser_train.add_argument('-l', '--learning-rate', help='learning rate', type=float, required=True)
    parser_train.add_argument('-o', '--optimizer', help='optimizer', type=str, required=True, choices=OPTIMIZERS.keys())

    parser_tag = subparsers.add_parser('tag')
    parser_tag.set_defaults(which='tag')

    parser_collect_data = subparsers.add_parser('collect')
    parser_collect_data.set_defaults(which='collect')
    parser_collect_data.add_argument('-d', '--directory', help='directory', type=str, required=True)
    parser_collect_data.add_argument('-i', '--input-file', help='input file', type=str, required=False)

    parser_score = subparsers.add_parser('score')
    parser_score.set_defaults(which='score')
    parser_score.add_argument('-p', '--predicted', help='predicted file', type=str, required=True)
    parser_score.add_argument('-g', '--gold', help='gold file', type=str, required=True)

    # common arguments
    for p in (parser_train, parser_tag):
        p.add_argument('-m', '--model', help='model file', type=str, required=True)
        p.add_argument('-r', '--reader-file', help='reader file', type=str, required=True)
        p.add_argument('-w', '--word-embeddings', help='word embeddings', type=str, required=False)
        p.add_argument('-et', '--word-embeddings-type', help='word embeddings type', type=str, required=False)
        p.add_argument('-i', '--input-file', help='input file', type=str, required=False)
        p.add_argument('-t', '--type', help='estimator type', type=str, required=True, choices=ESTIMATORS.keys())
        p.add_argument('-nl', '--num-layers', help='number of layers for the multi RNN estimator', type=int, required=False)
        p.add_argument('-f', '--feats-conf', help='add the feats in the conf number', type=int, required=False)

    args = parser.parse_args()

    try:
        infile = open(args.input_file) if args.input_file is not None else sys.stdin
    except IOError as e:
        print >> sys.stderr, 'cannot open %s: %s' % (args.input_file, e)
        sys.exit(1)

    print args

    if args.which == 'collect':
        with infile as f:
            for line in f:
                domain, agro, categories = line.strip().split('\t')
                # TODO: skipping multi-categories
                if ',' in categories:
                    continue
                cfile = os.path.join(args.directory, domain[0], domain[1], domain[2], domain, 'content.txt')
                try:
                    content = open(cfile).read()
                except IOError:
                    print >> sys.stderr, '%s not found in %s' % (domain, cfile)
                    continue
                words = ' '.join([word.strip() for word in content.split()])
                if words:
                    print '%s\t%s\t%s' % (domain, categories, words)
    elif args.which == 'train':
        max_size = 500000
        word_embeddings = Word2VecReader(open(args.word_embeddings)).read()
        reader = WebContentReader(infile, separator='\t')
        examples, labels = reader.read()
        label_extractor = LabelExtractor(reader.getPosition('LABEL'), one_hot=False)
        X, y = reader.map2idx(examples, labels, [], label_extractor, word_embeddings)
        _X = []
        for x, _ in X:  # we don't want feats from extractors
            # pad or truncate every example to a fixed length of max_size
            if len(x) < max_size:
                x = np.lib.pad(x, (0, max_size - len(x)), 'constant', constant_values=0)
            else:
                x = x[:max_size]
            _X.append(x)
        X = np.array(_X)
        y = np.array(y)

        # print the label distribution
        d = {}
        for yy in y:
            try:
                d[yy] += 1
            except KeyError:
                d[yy] = 1
        print d

        X = preprocessing.StandardScaler().fit_transform(X)
        classifier = skflow.TensorFlowDNNClassifier(hidden_units=[100, 200, 150, 100],
                                                    n_classes=len(label_extractor.vocabulary))
        #classifier = skflow.TensorFlowLinearRegressor()
        #classifier.fit(X, y, logdir='log')
        classifier.fit(X, y)

        #### test
        lines = open('data/fine-it-test').readlines()
        reader = WebContentReader(lines, separator='\t')
        examples, labels = reader.read()
        label_extractor = LabelExtractor(reader.getPosition('LABEL'), one_hot=False)
        X, y = reader.map2idx(examples, labels, [], label_extractor, word_embeddings)
        _X = []
        for x, _ in X:  # we don't want feats from extractors
            if len(x) < max_size:
                x = np.lib.pad(x, (0, max_size - len(x)), 'constant', constant_values=0)
            else:
                x = x[:max_size]
            _X.append(x)
        X = np.array(_X)
        y = np.array(y)
        predicted = classifier.predict(X)
        score = metrics.accuracy_score(y, predicted)
        labels_idx_rev = {v: k for k, v in reader.vocabulary[reader.getPosition('LABEL')].items()}
        i = 0
        for line in lines:
            line = line.strip()
            if line:
                print '%s\t%s' % (line.split()[0], labels_idx_rev[predicted[i]])
                i += 1
            else:
                print
        print("Accuracy: %f" % score)
    elif args.which == 'tag':
        lines = sys.stdin.readlines()
        reader = WebContentReader(lines, separator='\t')
        extractors = []
        params = {
            'name_model': args.model,
            'word_embeddings_file': args.word_embeddings,
            'reader_file': args.reader_file,
            'num_layers': args.num_layers
        }
        classifier = WordEmbeddingsClassifier(reader, extractors, ESTIMATORS[args.type], **params)
        predicted = classifier.predict()
        print >> sys.stderr, 'l predicted', len(predicted), 'l lines', len(lines)
        labels_idx_rev = {v: k for k, v in reader.vocabulary[reader.getPosition('LABEL')].items()}
        i = 0
        for line in lines:
            line = line.strip()
            if line:
                print '%s\t%s' % (line.split()[0], labels_idx_rev[predicted[i]])
                i += 1
            else:
                print
    elif args.which == 'score':
        gold_dict = {}
        for line in open(args.gold):
            domain, label = line.strip().split('\t')[:2]
            gold_dict[domain] = label
        y_true = []
        y_pred = []
        for line in open(args.predicted):
            domain, label = line.strip().split('\t')[:2]
            y_pred.append(int(label))
            y_true.append(int(gold_dict[domain]))
        print f1_score(y_true, y_pred, average='macro')
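# Sketch of the file layout the 'score' branch above assumes: both --gold and --predicted
# are tab-separated, one domain per line, with an integer class id in the second column.
# The domains and ids below are invented for illustration:
#
#   example.com    3
#   foo.org        0
#   bar.it         7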
def main():
    parser = argparse.ArgumentParser(description='Named Entity Recognition with TensorFlow')
    subparsers = parser.add_subparsers()

    parser_train = subparsers.add_parser('train')
    parser_train.set_defaults(which='train')
    parser_train.add_argument('-e', '--epochs', help='epochs number', type=int, required=True)
    parser_train.add_argument('-l', '--learning-rate', help='learning rate', type=float, required=True)
    parser_train.add_argument('-o', '--optimizer', help='optimizer', type=str, required=True, choices=OPTIMIZERS.keys())

    parser_tag = subparsers.add_parser('tag')
    parser_tag.set_defaults(which='tag')

    parser_collect_data = subparsers.add_parser('collect')
    parser_collect_data.set_defaults(which='collect')
    parser_collect_data.add_argument('-d', '--directory', help='directory', type=str, required=True)
    parser_collect_data.add_argument('-i', '--input-file', help='input file', type=str, required=False)

    parser_score = subparsers.add_parser('score')
    parser_score.set_defaults(which='score')
    parser_score.add_argument('-p', '--predicted', help='predicted file', type=str, required=True)
    parser_score.add_argument('-g', '--gold', help='gold file', type=str, required=True)

    # common arguments
    for p in (parser_train, parser_tag):
        p.add_argument('-m', '--model', help='model file', type=str, required=True)
        p.add_argument('-r', '--reader-file', help='reader file', type=str, required=True)
        p.add_argument('-w', '--word-embeddings', help='word embeddings', type=str, required=False)
        p.add_argument('-et', '--word-embeddings-type', help='word embeddings type', type=str, required=False)
        p.add_argument('-i', '--input-file', help='input file', type=str, required=False)
        p.add_argument('-t', '--type', help='estimator type', type=str, required=True, choices=ESTIMATORS.keys())
        p.add_argument('-nl', '--num-layers', help='number of layers for the multi RNN estimator', type=int, required=False)
        p.add_argument('-f', '--feats-conf', help='add the feats in the conf number', type=int, required=False)

    args = parser.parse_args()

    try:
        infile = open(args.input_file) if args.input_file is not None else sys.stdin
    except IOError as e:
        print >> sys.stderr, 'cannot open %s: %s' % (args.input_file, e)
        sys.exit(1)

    if args.which == 'collect':
        with infile as f:
            for line in f:
                domain, agro, categories = line.strip().split('\t')
                # TODO: skipping multi-categories
                if ',' in categories:
                    continue
                cfile = os.path.join(args.directory, domain[0], domain[1], domain[2], domain, 'content.txt')
                try:
                    content = open(cfile).read()
                except IOError:
                    print >> sys.stderr, '%s not found in %s' % (domain, cfile)
                    continue
                words = ' '.join([word.strip() for word in content.split()])
                if words:
                    print '%s\t%s\t%s' % (domain, categories, words)
    elif args.which == 'train':
        reader = WebContentReader(infile, separator='\t')
        params = {
            'epochs': args.epochs,
            'learning_rate': args.learning_rate,
            'name_model': args.model,
            'word_embeddings_file': args.word_embeddings,
            'reader_file': args.reader_file,
            'optimizer': OPTIMIZERS[args.optimizer],
            'num_layers': args.num_layers
        }
        classifier = WordEmbeddingsClassifier(reader, [], ESTIMATORS[args.type], **params)
        classifier.train()
    elif args.which == 'tag':
        lines = sys.stdin.readlines()
        reader = WebContentReader(lines, separator='\t')
        extractors = []
        params = {
            'name_model': args.model,
            'word_embeddings_file': args.word_embeddings,
            'reader_file': args.reader_file,
            'num_layers': args.num_layers
        }
        classifier = WordEmbeddingsClassifier(reader, extractors, ESTIMATORS[args.type], **params)
        predicted = classifier.predict()
        print >> sys.stderr, 'l predicted', len(predicted), 'l lines', len(lines)
        labels_idx_rev = {v: k for k, v in reader.vocabulary[reader.getPosition('LABEL')].items()}
        i = 0
        for line in lines:
            line = line.strip()
            if line:
                print '%s\t%s' % (line.split()[0], labels_idx_rev[predicted[i]])
                i += 1
            else:
                print
    elif args.which == 'score':
        gold_dict = {}
        for line in open(args.gold):
            domain, label = line.strip().split('\t')[:2]
            gold_dict[domain] = label
        y_true = []
        y_pred = []
        for line in open(args.predicted):
            domain, label = line.strip().split('\t')[:2]
            y_pred.append(int(label))
            y_true.append(int(gold_dict[domain]))
        print f1_score(y_true, y_pred, average='macro')
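# Sketch of the data format produced by 'collect' and read back by 'train'/'tag' through
# WebContentReader: one site per line, tab-separated domain, category label and the
# whitespace-normalised page text. Domains, labels and text below are invented:
#
#   example.com    3    welcome to our farm shop fresh vegetables and fruit ...
#   foo.org        0    industrial pumps and valves product catalogue ...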
def main():
    parser = argparse.ArgumentParser(description='Named Entity Recognition with TensorFlow')
    subparsers = parser.add_subparsers()

    parser_train = subparsers.add_parser('train')
    parser_train.set_defaults(which='train')
    parser_train.add_argument('-e', '--epochs', help='epochs number', type=int, required=True)
    parser_train.add_argument('-l', '--learning-rate', help='learning rate', type=float, required=True)
    parser_train.add_argument('-o', '--optimizer', help='optimizer', type=str, required=True, choices=OPTIMIZERS.keys())

    parser_tag = subparsers.add_parser('tag')
    parser_tag.set_defaults(which='tag')

    # common arguments
    for p in (parser_train, parser_tag):
        p.add_argument('-m', '--model', help='model file', type=str, required=True)
        p.add_argument('-r', '--reader-file', help='reader file', type=str, required=True)
        p.add_argument('-w', '--word-embeddings', help='word embeddings', type=str, required=False)
        p.add_argument('-et', '--word-embeddings-type', help='word embeddings type', type=str, required=False)
        p.add_argument('-i', '--input-file', help='input file', type=str, required=False)
        p.add_argument('-t', '--type', help='estimator type', type=str, required=True, choices=ESTIMATORS.keys())
        p.add_argument('-wi', '--window', help='context window size', type=int, required=True)
        p.add_argument('-nl', '--num-layers', help='number of layers for the multi RNN estimator', type=int, required=False)
        p.add_argument('-f', '--feats-conf', help='add the feats in the conf number', type=int, required=False)

    args = parser.parse_args()

    infile = args.input_file if args.input_file is not None else sys.stdin

    if args.which == 'train':
        reader = IOBReader(infile, separator='\t')
        extractors = []
        if args.feats_conf is not None and args.feats_conf != 0:
            # hand-crafted features on top of the word embeddings
            extractors = [
                FieldExtractor(reader.getPosition('FORM')),
                FieldExtractor(reader.getPosition('POS')),
                CapitalExtractor(reader.getPosition('FORM'))
            ]
        params = {
            'epochs': args.epochs,
            'learning_rate': args.learning_rate,
            'window_size': args.window,
            'name_model': args.model,
            'word_embeddings_file': args.word_embeddings,
            'reader_file': args.reader_file,
            'optimizer': OPTIMIZERS[args.optimizer],
            'num_layers': args.num_layers
        }
        classifier = WordEmbeddingsClassifier(reader, extractors, ESTIMATORS[args.type], **params)
        classifier.train()
    elif args.which == 'tag':
        lines = sys.stdin.readlines()
        reader = IOBReader(lines)
        extractors = []
        if args.feats_conf is not None and args.feats_conf != 0:
            extractors = [
                FieldExtractor(reader.getPosition('FORM')),
                FieldExtractor(reader.getPosition('POS')),
                CapitalExtractor(reader.getPosition('FORM'))
            ]
        params = {
            'window_size': args.window,
            'name_model': args.model,
            'word_embeddings_file': args.word_embeddings,
            'reader_file': args.reader_file,
            'num_layers': args.num_layers
        }
        classifier = WordEmbeddingsClassifier(reader, extractors, ESTIMATORS[args.type], **params)
        predicted = classifier.predict()
        print >> sys.stderr, len(predicted), len(lines)
        labels_idx_rev = {v: k for k, v in reader.vocabulary[reader.getPosition('LABEL')].items()}
        i = 0
        for line in lines:
            line = line.strip()
            if line:
                print '%s\t%s\t%s' % (line.split()[0], line.split()[1], labels_idx_rev[predicted[i]])
                i += 1
            else:
                print
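# Sketch of typical invocations of this CLI, assuming the module is run as a script.
# The script name 'ner.py' and the 'sgd'/'rnn' keys are placeholders: the real choices
# come from the OPTIMIZERS and ESTIMATORS dicts defined elsewhere in the project.
#
#   python ner.py train -e 10 -l 0.01 -o sgd -t rnn -wi 2 \
#       -m model.ckpt -r reader.pkl -w data/vectors.txt -i train.iob
#
#   python ner.py tag -t rnn -wi 2 -m model.ckpt -r reader.pkl \
#       -w data/vectors.txt < test.iob > predictions.iob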