Example 1
 # Scan the tokenized abstract left to right: when a keyword sequence matches,
 # print a "B" line for its first token and "I" lines for the rest, and (when
 # have_dic is set) record the Japanese keyword jkeyword against each token in d.
 j = 0
 while j < len(abstract):
     found = False
     for k in range(len(keywords)):
         keyword = keywords[k]
         jkeyword = jkeywords[k]
         keyword_len = len(keyword)
         if keyword_len > 0 and keyword == abstract[j:j +
                                                    keyword_len]:
             if have_dic:
                 if keyword[0] in d:
                     f, s = d[keyword[0]]
                     s.add(jkeyword)
                     d[keyword[0]] = f, s
                 else:
                     d[keyword[0]] = print_features(
                         keyword[0]), set([jkeyword])
             print "B\t" + print_features(
                 keyword[0]) + "\t" + jkeyword
             #print keyword[0] + "\tB\t" + str(i+1) + "\t" + str(k+1)
             for l in keyword[1:]:
                 if have_dic:
                     if l in d:
                         f, s = d[l]
                         s.add(jkeyword)
                         d[l] = f, s
                     else:
                         d[l] = print_features(l), set([jkeyword])
                 print "I\t" + print_features(l) + "\t" + jkeyword
             #print l + "\tI\t" + str(i+1) + "\t" + str(k+1)
             found = True
             j += keyword_len
Example 2
# BIO-tag each tokenized abstract: tokens covered by a matched keyword are
# printed as B (first token) or I (following tokens); all other tokens as O.
for i in range(len(lines)):
    if i % 1000 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if 'EKYWD' in line and 'EABST' in line:
        abstract = line['EABST']
        keywords = re.split('\t', line['EKYWD'])
        abstract = word_tokenize(abstract)
        keywords = [word_tokenize(keyword) for keyword in keywords]
        j = 0
        while j < len(abstract):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                if keyword_len > 0 and keyword == abstract[j:j + keyword_len]:
                    print "B\t" + print_features(keyword[0])
                    for l in keyword[1:]:
                        print "I\t" + print_features(l)
                    found = True
                    j += keyword_len
                if found:
                    break
            if j >= len(abstract):
                break
            print "O\t" + print_features(abstract[j])
            j += 1

sys.stderr.write("Finished\n")
Example 3
 # BIO-tag the abstract as before, but when have_dic is set and the dictionary d
 # already holds a feature string for a token, print that cached string instead
 # of recomputing it with print_features.
 abstract = line["EABST"]
 keywords = re.split("\t", line["EKYWD"])
 abstract = word_tokenize(abstract)
 keywords = [word_tokenize(keyword) for keyword in keywords]
 j = 0
 while j < len(abstract):
     found = False
     for k in range(len(keywords)):
         keyword = keywords[k]
         keyword_len = len(keyword)
         if keyword_len > 0 and keyword == abstract[j : j + keyword_len]:
             if have_dic:
                 if keyword[0] in d:
                     print "B\t" + d[keyword[0]]
                 else:
                     print "B\t" + print_features(keyword[0])
             else:
                 print "B\t" + print_features(keyword[0])
             # print "B\t" + print_features(keyword[0]) + "\t" + jkeyword
             # print keyword[0] + "\tB\t" + str(i+1) + "\t" + str(k+1)
             for l in keyword[1:]:
                 if have_dic:
                     if l in d:
                         print "I\t" + d[l]
                     else:
                         print "I\t" + print_features(l)
                 else:
                     print "I\t" + print_features(l)
                 # print "I\t" + print_features(l) + "\t" + jkeyword
             # print l + "\tI\t" + str(i+1) + "\t" + str(k+1)
             found = True
Example 4
 # Same matching loop as Example 1: print B/I lines for matched keyword tokens
 # and record the Japanese keyword for each token in d when have_dic is set.
 keywords = [word_tokenize(keyword) for keyword in keywords]
 j = 0
 while j < len(abstract):
     found = False
     for k in range(len(keywords)):
         keyword = keywords[k]
         jkeyword = jkeywords[k]
         keyword_len = len(keyword)
         if keyword_len > 0 and keyword == abstract[j:j+keyword_len]:
             if have_dic:
                 if keyword[0] in d:
                     f, s = d[keyword[0]]
                     s.add(jkeyword)
                     d[keyword[0]] = f, s
                 else:
                     d[keyword[0]] = print_features(keyword[0]), set([jkeyword])
             print "B\t" + print_features(keyword[0]) + "\t" + jkeyword
             #print keyword[0] + "\tB\t" + str(i+1) + "\t" + str(k+1)
             for l in keyword[1:]:
                 if have_dic:
                     if l in d:
                         f, s = d[l]
                         s.add(jkeyword)
                         d[l] = f, s
                     else:
                         d[l] = print_features(l), set([jkeyword])
                 print "I\t" + print_features(l) + "\t" + jkeyword
             #print l + "\tI\t" + str(i+1) + "\t" + str(k+1)
             found = True
             j += keyword_len
         if found:
             break
Example 5
# Build a dictionary d mapping every token of an English keyword (EKYWD) to its
# feature string and the set of Japanese keywords (KYWD) it occurs in.
for i in range(len(lines)):
    if i % 1000 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if "EKYWD" in line and "EABST" in line and "KYWD" in line:
        keywords = re.split("\t", line["EKYWD"])
        jkeywords = re.split("\t", line["KYWD"])
        if len(keywords) == len(jkeywords):
            keywords = [word_tokenize(keyword) for keyword in keywords]
            for k in range(len(keywords)):
                keyword = keywords[k]
                jkeyword = jkeywords[k]
                for word in keyword:
                    if word in d:
                        f, s = d[word]
                        s.add(jkeyword)
                        d[word] = f, s
                    else:
                        d[word] = print_features(word), set([jkeyword])

sys.stderr.write("Printing dict\n")
for k, (f, s) in d.iteritems():
    line = k + "\t"
    line += f
    for j in s:
        line += "\t" + j
    print line
sys.stderr.write("Finished\n");

Example 6
# Read tab-separated term pairs (Japanese term in the second field, English term
# in the third) and build d: English token -> set of Japanese terms whose
# translation contains that token.
if len(args) == 0:
    f = sys.stdin
else:
    f = open(args[0])

d = dict()
i = 0
for line in f:
    if i % 20000 == 0:
        sys.stderr.write("Entry: " + str(i) + "\n")
    i += 1
    line = re.split('\t', line)
    jterm = line[1]
    eterm = word_tokenize(line[2])
    for token in eterm:
        if token in d:
            d[token].add(jterm)
        else:
            d[token] = set([jterm])

sys.stderr.write(str(i) + "\n")
sys.stderr.write(str(len(d)) + "\n")

for k, v in d.iteritems():
    sys.stdout.write(k)
    sys.stdout.write("\t" + print_features(k))
    for j in v:
        sys.stdout.write('\t' + j)
    sys.stdout.write('\n')
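
The loop above assumes each input line carries at least three tab-separated fields, with the Japanese term in the second field (line[1]) and its English translation in the third (line[2]); the first field is ignored. A purely illustrative input line (not taken from any real data) would be:

000123<TAB>機械翻訳<TAB>machine translation

from which both "machine" and "translation" would be mapped to the set {"機械翻訳"}.
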
Example 7
def main():
    model_choices = [
        'PerceptronModel', 'SoftmaxRegressionModel', 'ConvNetModel'
    ]
    solver_choices = [
        'PerceptronSolver', 'GradientDescentSolver',
        'StochasticGradientDescentSolver',
        'MinibatchStochasticGradientDescentSolver'
    ]
    data_choices = [
        'tinyMnistDataset', 'medMnistDataset', 'largeMnistDataset',
        'mnistDataset', 'datasetA', 'datasetB'
    ]
    parser = argparse.ArgumentParser(
        description='Input the arguments to train the neural net.')
    parser.add_argument('-m',
                        '--model',
                        choices=model_choices,
                        default='SoftmaxRegressionModel',
                        help='Perceptron or neural net model')
    parser.add_argument('-s',
                        '--solver',
                        choices=solver_choices,
                        default='MinibatchStochasticGradientDescentSolver',
                        help='Solver to train the model')
    parser.add_argument(
        '-d',
        '--data',
        choices=data_choices,
        default='medMnistDataset',
        help='Dataset to use for training',
    )
    parser.add_argument(
        '-f',
        '--weight_file',
        default=None,
        help='File name (.npz) of weights to use to initialize the model')
    parser.add_argument('-i',
                        '--iterations',
                        default=10,
                        type=int,
                        help='Maximum iterations to run training')
    parser.add_argument('-l',
                        '--learning_rate',
                        nargs='+',
                        default=[0.001],
                        type=float,
                        help='Learning rate to use for the solver')
    parser.add_argument(
        '-b',
        '--batch_size',
        nargs='+',
        default=[32],
        type=int,
        help=
        'Minibatch size to use when iterating the training and validation data'
    )
    parser.add_argument('-u',
                        '--momentum',
                        nargs='+',
                        default=[0.0],
                        type=float,
                        help='Momentum to use for the solver')
    parser.add_argument('-w',
                        '--weight_decay',
                        default=1e-3,
                        type=float,
                        help='Coefficient for l2 regularization on the loss')
    parser.add_argument('-bn',
                        '--batch_norm',
                        action='store_true',
                        help='Batch normalization')
    parser.add_argument('--no-shuffle',
                        action='store_true',
                        help='Disables shuffling of data')
    parser.add_argument('--no-graphics',
                        action='store_true',
                        help='Turns off plots')
    parser.add_argument(
        '-p',
        '--plot_interval',
        default=100,
        type=int,
        help='Only plot only every this often (in terms of iterations)')
    parser.add_argument('--print_features',
                        action='store_true',
                        help='Print high weight features')
    parser.add_argument(
        '--feature_extractor',
        choices=['enhancedFeatureExtractor', 'basicFeatureExtractor'],
        help='Feature extractor function to use for mnist images')
    args = parser.parse_args()

    # Parse args and print information
    if args.model == 'PerceptronModel':
        args.solver = 'PerceptronSolver'
    print("data:\t\t" + args.data)
    print("model:\t\t" + args.model)
    print("solver:\t\t" + args.solver)

    train_data, val_data, test_data = get_data(args)

    # Load weights if applicable
    if args.weight_file is not None:
        print("loading parameter values from %s" % args.weight_file)
        init_param_values_file = np.load(args.weight_file)
        init_param_values = [
            init_param_values_file['arr_%d' % i]
            for i in range(len(init_param_values_file.files))
        ]
    else:
        init_param_values = None

    # train and validate
    hyperparams = [args.learning_rate, args.momentum, args.batch_size]
    if all(len(hyperparam) == 1 for hyperparam in hyperparams):
        # train and validate using a single set of hyperparameters
        model = get_model(args, train_data)
        if init_param_values is not None:
            model.set_param_values(init_param_values)
        solver = get_solver(args)
        print("Training...")
        solver.solve(*(train_data + val_data +
                       [model, pacman_display_callback(train_data)]))
    else:  # do hyperparameter search
        # cartesian product of hyperparameters
        hyperparams = list(itertools.product(*hyperparams))
        model, best_hyperparams = search_hyperparams(
            *(train_data + val_data + zip(*hyperparams)),
            iterations=args.iterations,
            model_class=get_model_class(args),
            init_param_values=init_param_values,
            use_bn=args.batch_norm)
        print(
            'Best model is trained with these hyperparameters: learning_rate=%r, momentum=%r, batch_size=%r'
            % tuple(best_hyperparams))

    if (args.print_features and args.model == 'PerceptronModel'
            and 'mnist' in args.data.lower()):
        for l in model.legal_labels:
            highest_weighted_features = model.find_high_weight_features(l)
            features.print_features(highest_weighted_features)

    if 'mnist' in args.data.lower() and args.feature_extractor is not None:

        def get_data_labels_pred(data):
            features = data[0]
            labels = np.argmax(data[1], axis=-1)
            predictions = model.classify(features)
            return features, labels, predictions

        trainData, trainLabels, trainPredictions = get_data_labels_pred(
            train_data)
        validationData, validationLabels, validationPredictions = get_data_labels_pred(
            val_data)
        features.analysis(model, trainData, trainLabels, trainPredictions,
                          validationData, validationLabels,
                          validationPredictions)

    print("Computing accuracies")
    # compute training accuracy only for small datasets (otherwise computing this is too slow)
    if train_data[0].shape[0] <= 10000:
        print("Train accuracy: %.1f%%" % (100.0 * model.accuracy(*train_data)))
    print("Validation accuracy: %.1f%%" % (100.0 * model.accuracy(*val_data)))
    print("Test accuracy: %.1f%%" % (100.0 * model.accuracy(*test_data)))
    raw_input('Press enter to exit')
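
Run as a script (the file name below is only a placeholder), the options above also support a hyperparameter search: passing more than one value to --learning_rate, --momentum, or --batch_size sends execution down the itertools.product branch, e.g.:

python train_net.py --model SoftmaxRegressionModel \
    --solver MinibatchStochasticGradientDescentSolver \
    --data medMnistDataset --iterations 20 \
    --learning_rate 0.001 0.01 --batch_size 32 64
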
Example 8
 # BIO-tag the abstract and append a POS feature to every token; when the POS is
 # in taking_pos and the token has a dictionary entry, append d[word] as well.
 j = 0
 while j < len(abstract):
     found = False
     for k in range(len(keywords)):
         keyword = keywords[k]
         keyword_len = len(keyword)
         if keyword_len > 0 and keyword == abstract[j:j+keyword_len]:
             for l in range(keyword_len):
                 this_word = keyword[l]
                 this_pos = pos_abstract[j+l][1]
                 out = ""
                 if l == 0:
                     out += "B\t"
                 else:
                     out += "I\t"
                 out += print_features(this_word)
                 out += "\tPOS=" + this_pos
                 if this_pos in taking_pos and this_word in d:
                     out += "\t" + d[this_word]
                 print out
             found = True
             j += keyword_len
         if found:
             break
     if j >= len(abstract):
         break
     this_word = abstract[j]
     this_pos = pos_abstract[j][1]
     out = "O\t" + print_features(this_word)
     out += "\tPOS=" + this_pos
     if this_pos in taking_pos and this_word in d:
         out += "\t" + d[this_word]
     print out
     j += 1
Example 9
 # Same POS-augmented tagging loop as Example 8 (formatting differences only).
 j = 0
 while j < len(abstract):
     found = False
     for k in range(len(keywords)):
         keyword = keywords[k]
         keyword_len = len(keyword)
         if keyword_len > 0 and keyword == abstract[j:j + keyword_len]:
             for l in range(keyword_len):
                 this_word = keyword[l]
                 this_pos = pos_abstract[j + l][1]
                 out = ""
                 if l == 0:
                     out += "B\t"
                 else:
                     out += "I\t"
                 out += print_features(this_word)
                 out += "\tPOS=" + this_pos
                 if this_pos in taking_pos and this_word in d:
                     out += "\t" + d[this_word]
                 print out
             found = True
             j += keyword_len
         if found:
             break
     if j >= len(abstract):
         break
     this_word = abstract[j]
     this_pos = pos_abstract[j][1]
     out = "O\t" + print_features(this_word)
     out += "\tPOS=" + this_pos
     if this_pos in taking_pos and this_word in d:
         out += "\t" + d[this_word]
     print out
     j += 1
Example 10
# As in Example 5: build and print d, mapping each English keyword token to its
# feature string and the set of Japanese keywords containing it.
d = dict()

for i in range(len(lines)):
    if i % 1000 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if "EKYWD" in line and "EABST" in line and "KYWD" in line:
        keywords = re.split("\t", line["EKYWD"])
        jkeywords = re.split("\t", line["KYWD"])
        if len(keywords) == len(jkeywords):
            keywords = [word_tokenize(keyword) for keyword in keywords]
            for k in range(len(keywords)):
                keyword = keywords[k]
                jkeyword = jkeywords[k]
                for word in keyword:
                    if word in d:
                        f, s = d[word]
                        s.add(jkeyword)
                        d[word] = f, s
                    else:
                        d[word] = print_features(word), set([jkeyword])

sys.stderr.write("Printing dict\n")
for k, (f, s) in d.iteritems():
    line = k + "\t"
    line += f
    for j in s:
        line += "\t" + j
    print line
sys.stderr.write("Finished\n")