Example #1
def main(args):
  if os.path.isdir(args.input):
    input_path = os.path.join(args.input, 'PragerFeats')
  else:
    input_path = args.input

  if args.output:
    output_path = args.output
  else:
    output_path = input_path + '.scanner'

  # display paths
  logger.info("input path: {0}".format(input_path))
  logger.info("output path: {0}".format(output_path))

  nb_features = read_features(input_path)
  tk_nextmove, tk_output = build_scanner(nb_features)
  scanner = tk_nextmove, tk_output, nb_features

  with open(output_path, 'w') as f:
    cPickle.dump(scanner, f)
  logger.info("wrote scanner to {0}".format(output_path))
Example #2
    args = parser.parse_args()
    if not (args.domain or args.lang) or (args.domain and args.lang):
        parser.error("exactly one of domain (-d) or language (-l) must be specified")

    if args.features:
        feature_path = args.features
    else:
        feature_path = os.path.join(args.model, "DFfeats")

    bucketlist_path = os.path.join(args.model, "bucketlist")

    if not os.path.exists(feature_path):
        parser.error("{0} does not exist".format(feature_path))

    bucketlist = map(str.strip, open(bucketlist_path))
    features = read_features(feature_path)

    if args.domain:
        index_path = os.path.join(args.model, "domain_index")
        suffix = ".domain"
    elif args.lang:
        index_path = os.path.join(args.model, "lang_index")
        suffix = ".lang"
    else:
        raise ValueError("no event specified")

    if args.weights:
        weights_path = args.weights
    else:
        weights_path = os.path.join(args.model, "IGweights" + suffix + (".bin" if args.binarize else ""))
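The manual XOR check at the top of this example could also be expressed with argparse's built-in mutually exclusive groups; a sketch (the store_true actions are an assumption, since the original parser definition is not shown):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-d", "--domain", action="store_true")
group.add_argument("-l", "--lang", action="store_true")
args = parser.parse_args()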
Example #3
if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats")
  parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE")
  args = parser.parse_args()

  if os.path.isdir(args.input):
    input_path = os.path.join(args.input, 'LDfeats')
  else:
    input_path = args.input

  if args.output:
    output_path = args.output
  else:
    output_path = input_path + '.scanner'

  # display paths
  if not SILENT:
    print "input path:", input_path
    print "output path:", output_path

  nb_features = read_features(input_path)
  tk_nextmove, tk_output = build_scanner(nb_features)
  scanner = tk_nextmove, tk_output, nb_features

  with open(output_path, 'w') as f:
    cPickle.dump(scanner, f)
  if not SILENT:
    print "wrote scanner to {0}".format(output_path)
Example #4
  if args.output:
    makedir(args.output)
    out_dir = args.output
  else:
    out_dir = model_dir

  langs = sorted(all_langs)

  # display paths
  print "languages({1}): {0}".format(langs, len(langs))
  print "model path:", model_dir
  print "feature path:", feat_path
  print "output path:", out_dir
  print "temp (buckets) path:", buckets_dir

  feats = read_features(feat_path)

  indexer = CorpusIndexer(args.corpus, langs=langs)
  items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
  if len(items) == 0:
    raise ValueError("found no files!")

  print "will process {0} features across {1} paths".format(len(feats), len(items))

  # produce a scanner over all the features
  tk_nextmove, tk_output = build_scanner(feats)

  # Generate a class map over all the languages we are dealing with
  cm = generate_cm([ (l,p) for d,l,p in items], len(langs))

  # Compute P(t|C)
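generate_cm itself is not shown here; a plausible sketch of the class map it builds from the (lang, path) pairs above (a boolean instances-by-languages membership matrix; the exact layout is an assumption):

import numpy as np

def generate_cm_sketch(pairs, num_langs):
  # pairs is [(lang_index, path), ...]: one row per instance,
  # one column per language, True where the instance belongs
  cm = np.zeros((len(pairs), num_langs), dtype=bool)
  for row, (lang, _path) in enumerate(pairs):
    cm[row, lang] = True
  return cm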
Example #5
            "exactly one of domain(-d) or language (-l) must be specified")

    if args.features:
        feature_path = args.features
    else:
        feature_path = os.path.join(args.model, 'DFfeats')

    if args.buckets:
        bucketlist_paths = args.buckets
    else:
        bucketlist_paths = [os.path.join(args.model, 'bucketlist')]

    if not os.path.exists(feature_path):
        parser.error('{0} does not exist'.format(feature_path))

    features = read_features(feature_path)

    if args.domain:
        index_path = os.path.join(args.model, 'domain_index')
        suffix = '.domain'
    elif args.lang:
        index_path = os.path.join(args.model, 'lang_index')
        suffix = '.lang'
    else:
        raise ValueError("no event specified")

    if args.weights:
        weights_path = args.weights
    else:
        weights_path = os.path.join(
            args.model,
            'IGweights' + suffix + ('.bin' if args.binarize else ''))
Example #6
        with open(index_path, "w") as f:
            writer = csv.writer(f)
            writer.writerows(items)

    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(model_dir, "buckets")
    makedir(buckets_dir)

    bucketlist_path = os.path.join(model_dir, "bucketlist")
    index_path = os.path.join(model_dir, "paths")

    if args.ld_feats:
        # LD features are pre-specified. We are basically just building the NB model.
        LDfeats = read_features(args.ld_feats)

    else:
        # LD features not pre-specified, so we compute them.

        # Tokenize
        DFfeats = None
        print "will tokenize %d files" % len(items)
        # TODO: Custom tokenizer if doing custom first-pass features
        if args.df_feats:
            print "reading custom features from:", args.df_feats
            DFfeats = read_features(args.df_feats)
            print "building tokenizer for custom list of {0} features".format(len(DFfeats))
            tk = Scanner(DFfeats)
        elif args.word:
            print "using word tokenizer"
Example #7
  if args.output:
    makedir(args.output)
    out_dir = args.output
  else:
    out_dir = model_dir

  langs = sorted(all_langs)

  # display paths
  print "languages({1}): {0}".format(langs, len(langs))
  print "model path:", model_dir
  print "feature path:", feat_path
  print "output path:", out_dir
  print "temp (buckets) path:", buckets_dir

  feats = read_features(feat_path)

  indexer = CorpusIndexer(args.corpus, langs=langs)
  items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
  if len(items) == 0:
    raise ValueError("found no files!")

  print "will process {0} features across {1} paths".format(len(feats), len(items))

  # produce a scanner over all the features
  tk_nextmove, tk_output = build_scanner(feats)

  # Generate a class map over all the languages we are dealing with
  cm = generate_cm([ (l,p) for d,l,p in items], len(langs))

  # Compute P(t|C)
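The snippet ends just before the P(t|C) computation. A hedged sketch of that step for a multinomial naive Bayes model, assuming fm is an instances-by-features count matrix and cm is the class map above (the names and add-one smoothing are assumptions, not the library's exact code):

import numpy as np

def learn_ptc_sketch(fm, cm):
  # per-class term counts: features x classes
  prod = np.dot(fm.T, cm)
  # Laplace (add-one) smoothing, then log-space for naive Bayes
  ptc = (prod + 1.0) / (prod.sum(0) + prod.shape[0])
  return np.log(ptc)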
Example #8
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input",
        metavar="INPUT",
        help=
        "build a scanner for INPUT. If input is a directory, read INPUT/LDfeats"
    )
    parser.add_argument("-o",
                        "--output",
                        help="output scanner to OUTFILE",
                        metavar="OUTFILE")
    args = parser.parse_args()

    if os.path.isdir(args.input):
        input_path = os.path.join(args.input, 'LDfeats')
    else:
        input_path = args.input

    if args.output:
        output_path = args.output
    else:
        output_path = input_path + '.scanner'

    # display paths
    print "input path:", input_path
    print "output path:", output_path

    nb_features = read_features(input_path)
    tk_nextmove, tk_output = build_scanner(nb_features)
    scanner = tk_nextmove, tk_output, nb_features

    with open(output_path, 'w') as f:
        cPickle.dump(scanner, f)
    print "wrote scanner to {0}".format(output_path)
Example #9
        with open(index_path, 'w') as f:
            writer = csv.writer(f)
            writer.writerows(items)

    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(model_dir, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = os.path.join(model_dir, 'bucketlist')
    index_path = os.path.join(model_dir, 'paths')

    if args.ld_feats:
        # LD features are pre-specified. We are basically just building the NB model.
        LDfeats = read_features(args.ld_feats)

    else:
        # LD features not pre-specified, so we compute them.

        # Tokenize
        DFfeats = None
        print "will tokenize %d files" % len(items)
        # TODO: Custom tokenizer if doing custom first-pass features
        if args.df_feats:
            print "reading custom features from:", args.df_feats
            DFfeats = read_features(args.df_feats)
            print "building tokenizer for custom list of {0} features".format(
                len(DFfeats))
            tk = Scanner(DFfeats)
        elif args.word:
            print "using word tokenizer"
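The index written with csv.writer at the top of this example can be read back the same way; a small sketch (the row layout is assumed from the writerows call above):

    import csv

    with open(index_path) as f:
        items = [tuple(row) for row in csv.reader(f)]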