def main(argv):
  """Train a binary perceptron text classifier and save its weights as JSON.

  Reads labelled documents (one file per document) from the four
  positive/negative train/test directories given on the command line,
  trains for `training_iterations` epochs over the shuffled training
  set, prints train/test error counts per epoch, and dumps the learned
  per-token weights to the model file.

  Args:
    argv: command-line argument list (e.g. ``sys.argv[1:]``).

  NOTE: `vocabulary`, `vocab_size`, `training_iterations`,
  `learning_rate` and `perceptron_utils` come from the enclosing module.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('-pp', '--positive_dir_test', required=True)
  parser.add_argument('-nn', '--negative_dir_test', required=True)
  parser.add_argument('-p', '--positive_dir', required=True)
  parser.add_argument('-n', '--negative_dir', required=True)
  parser.add_argument('-m', '--model', required=True)
  parser.add_argument('-s', '--filesuffix', default='.txt')

  # BUG FIX: argv was accepted but silently ignored; pass it through so
  # callers can supply their own argument vector (None falls back to
  # sys.argv inside argparse).
  opts = parser.parse_args(argv)

  def labeled_files(directory, label):
    # os.path.join is portable, unlike manual "/" concatenation.
    return [(os.path.join(directory, f), label)
            for f in os.listdir(directory)
            if f.endswith(opts.filesuffix)]

  # Read training data: positives labelled 1, negatives 0.
  files_labels = (labeled_files(opts.positive_dir, 1) +
                  labeled_files(opts.negative_dir, 0))
  random.shuffle(files_labels)
  filelist = [fl[0] for fl in files_labels]

  # Read test data (never trained on, only scored each epoch).
  files_labels_test = (labeled_files(opts.positive_dir_test, 1) +
                       labeled_files(opts.negative_dir_test, 0))

  # Vocabulary is built from the training files only; all weights start at 0.
  vocab = vocabulary(filelist, vocab_size)
  weights = dict.fromkeys(vocab, 0.0)

  for i in range(training_iterations):
    error_count = 0
    for input_file, desired_output in files_labels:
      features = set(perceptron_utils.filtered_tokens(input_file, vocab))
      result = perceptron_utils.prediction(features, weights)
      error = desired_output - result
      if error != 0:
        error_count += 1
        # Classic perceptron update: nudge every active feature's weight
        # toward the desired label.
        for f in features:
          weights[f] += learning_rate * error
    print('-' * 20)
    # Consistent Python 3 print() calls (the original mixed the py2
    # print statement with print()); xrange above became range.
    print("iteration: ", i + 1)
    print("train errors: ", error_count)
    error_count = 0
    for input_file, desired_output in files_labels_test:
      features = set(perceptron_utils.filtered_tokens(input_file, vocab))
      result = perceptron_utils.prediction(features, weights)
      if desired_output != result:
        error_count += 1
    print("test errors: ", error_count)

  print("number of train instances: ", len(files_labels))
  print("number of test instances: ", len(files_labels_test))

  with open(opts.model, 'w') as modelfile:
    json.dump(weights, modelfile)
# Example no. 2
# 0
def main(argv):
    """Train a binary perceptron text classifier and save its weights as JSON.

    Reads labelled documents (one file per document) from the four
    positive/negative train/test directories given on the command line,
    trains for `training_iterations` epochs over the shuffled training
    set, prints train/test error counts per epoch, and dumps the learned
    per-token weights to the model file.

    Args:
        argv: command-line argument list (e.g. ``sys.argv[1:]``).

    NOTE: `vocabulary`, `vocab_size`, `training_iterations`,
    `learning_rate` and `perceptron_utils` come from the enclosing module.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-pp', '--positive_dir_test', required=True)
    parser.add_argument('-nn', '--negative_dir_test', required=True)
    parser.add_argument('-p', '--positive_dir', required=True)
    parser.add_argument('-n', '--negative_dir', required=True)
    parser.add_argument('-m', '--model', required=True)
    parser.add_argument('-s', '--filesuffix', default='.txt')

    # BUG FIX: argv was accepted but silently ignored; pass it through so
    # callers can supply their own argument vector (None falls back to
    # sys.argv inside argparse).
    opts = parser.parse_args(argv)

    def labeled_files(directory, label):
        # os.path.join is portable, unlike manual "/" concatenation.
        return [(os.path.join(directory, f), label)
                for f in os.listdir(directory)
                if f.endswith(opts.filesuffix)]

    # Read training data: positives labelled 1, negatives 0.
    files_labels = (labeled_files(opts.positive_dir, 1) +
                    labeled_files(opts.negative_dir, 0))
    random.shuffle(files_labels)
    filelist = [fl[0] for fl in files_labels]

    # Read test data (never trained on, only scored each epoch).
    files_labels_test = (labeled_files(opts.positive_dir_test, 1) +
                         labeled_files(opts.negative_dir_test, 0))

    # Vocabulary is built from the training files only; all weights start at 0.
    vocab = vocabulary(filelist, vocab_size)
    weights = dict.fromkeys(vocab, 0.0)

    for i in range(training_iterations):
        error_count = 0
        for input_file, desired_output in files_labels:
            features = set(perceptron_utils.filtered_tokens(input_file, vocab))
            result = perceptron_utils.prediction(features, weights)
            error = desired_output - result
            if error != 0:
                error_count += 1
                # Classic perceptron update: nudge every active feature's
                # weight toward the desired label.
                for f in features:
                    weights[f] += learning_rate * error
        print('-' * 20)
        # Consistent Python 3 print() calls (the original mixed the py2
        # print statement with print()); xrange above became range.
        print("iteration: ", i + 1)
        print("train errors: ", error_count)
        error_count = 0
        for input_file, desired_output in files_labels_test:
            features = set(perceptron_utils.filtered_tokens(input_file, vocab))
            result = perceptron_utils.prediction(features, weights)
            if desired_output != result:
                error_count += 1
        print("test errors: ", error_count)

    print("number of train instances: ", len(files_labels))
    print("number of test instances: ", len(files_labels_test))

    with open(opts.model, 'w') as modelfile:
        json.dump(weights, modelfile)
def main(argv):
    """Evaluate a saved perceptron model on a labelled document set.

    Loads JSON token weights from --model, classifies every matching
    file in the positive/negative directories (labelled 1 and 0
    respectively), and prints the misclassification count.

    Args:
        argv: command-line argument list (e.g. ``sys.argv[1:]``).

    NOTE: `perceptron_utils` comes from the enclosing module.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-pp', '--positive_dir', required=True)
    parser.add_argument('-nn', '--negative_dir', required=True)
    parser.add_argument('-m', '--model', required=True)
    parser.add_argument('-s', '--filesuffix', default='.txt')
    # BUG FIX: argv was accepted but silently ignored; pass it through
    # (None falls back to sys.argv inside argparse).
    opts = parser.parse_args(argv)

    # os.path.join is portable, unlike manual "/" concatenation.
    files_labels = \
      [(os.path.join(opts.positive_dir, f), 1)
       for f in os.listdir(opts.positive_dir) if f.endswith(opts.filesuffix)] + \
      [(os.path.join(opts.negative_dir, f), 0)
       for f in os.listdir(opts.negative_dir) if f.endswith(opts.filesuffix)]

    with open(opts.model, 'r') as modelfile:
        weights = json.load(modelfile)
    # The model's feature set doubles as the token-filtering vocabulary.
    vocab = set(weights)

    error_count = 0
    for input_file, desired_output in files_labels:
        features = set(perceptron_utils.filtered_tokens(input_file, vocab))
        result = perceptron_utils.prediction(features, weights)
        if desired_output != result:
            error_count += 1
    # Python 3 print() calls (original used the py2 print statement).
    print("test errors: ", error_count)
    print("number of instances: ", len(files_labels))
def main(argv):
  """Evaluate a saved perceptron model on a labelled document set.

  Loads JSON token weights from --model, classifies every matching file
  in the positive/negative directories (labelled 1 and 0 respectively),
  and prints the misclassification count.

  Args:
    argv: command-line argument list (e.g. ``sys.argv[1:]``).

  NOTE: `perceptron_utils` comes from the enclosing module.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('-p', '--positive_dir', required=True)
  parser.add_argument('-n', '--negative_dir', required=True)
  parser.add_argument('-m', '--model', required=True)
  parser.add_argument('-s', '--filesuffix', default='.txt')
  # BUG FIX: argv was accepted but silently ignored; pass it through
  # (None falls back to sys.argv inside argparse).
  opts = parser.parse_args(argv)

  # os.path.join is portable, unlike manual "/" concatenation.
  files_labels = \
    [(os.path.join(opts.positive_dir, f), 1)
     for f in os.listdir(opts.positive_dir) if f.endswith(opts.filesuffix)] + \
    [(os.path.join(opts.negative_dir, f), 0)
     for f in os.listdir(opts.negative_dir) if f.endswith(opts.filesuffix)]

  with open(opts.model, 'r') as modelfile:
    weights = json.load(modelfile)
  # The model's feature set doubles as the token-filtering vocabulary.
  vocab = set(weights)

  error_count = 0
  for input_file, desired_output in files_labels:
    features = set(perceptron_utils.filtered_tokens(input_file, vocab))
    result = perceptron_utils.prediction(features, weights)
    if desired_output != result:
      error_count += 1
  # Python 3 print() calls (original used the py2 print statement).
  print("test errors: ", error_count)
  print("number of instances: ", len(files_labels))