Ejemplo n.º 1
0
def main():
  """Label input text with a pickled model and write one row per labelled item.

  Command-line options select the input text, the pickled model file, the
  output file, whether hand labels should be used (``--handlabel``) and a
  debug flag that is forwarded to ``hkmc.prepdata``.

  Each output row is ``ln<TAB>offset<TAB>label``, where ``ln`` and ``offset``
  come from the info record paired with the label and are formatted as
  integers.
  """
  parser = argparse.ArgumentParser(description="label data given model file",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input text file")
  parser.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'), default=sys.stdin, help="input model file")
  parser.add_argument("--handlabel", "-H", action='store_true', default=False, help="use hand labels if available")
  parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file")
  parser.add_argument("--debug", "-d", action='store_true', default=False, help="debug mode")

  try:
    args = parser.parse_args()
  except IOError as msg:
    # parser.error() prints the message and exits, so args is always bound below.
    parser.error(str(msg))

  infile = prepfile(args.infile, 'r')
  modelfile = prepfile(args.modelfile, 'rb')
  outfile = prepfile(args.outfile, 'w')

  # NOTE: unpickling can execute arbitrary code; only load trusted model files.
  fullmodel = pickle.load(modelfile)

  settings = fullmodel['settings']
  features, tokfeatures = hkmc.prepfeatures(settings)

  data, info, _datamap = hkmc.prepdata(infile, features, tokfeatures, args.debug, settings, dv=fullmodel['feats'])

  model = fullmodel['model']
  labels = model.handlabeldata(data) if args.handlabel else model.labeldata(data)

  # Builtin zip works on both Python 2 and 3; izip exists only in Python 2.
  for label, theinfo in zip(labels, info):
    outfile.write("%d\t%d\t%s\n" % (theinfo['ln'], theinfo['offset'], label))
Ejemplo n.º 2
0
def main():
    """Replace hierarchy code paths with hand labels found in the model tree.

    Every input line must hold exactly three tab-separated fields; the third
    is a dot-separated code path.  Starting at ``fullmodel['model']``, the
    path is followed through ``children`` and the first ``handlabel`` that is
    neither ``None`` nor ``"r"`` replaces the code path in the output row.
    When no such label is met, the row is written with the code path intact.
    """
    cli = argparse.ArgumentParser(
        description="replace hierarchy labels with hand labels, if present",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                     default=sys.stdin, help="input classified file")
    cli.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'),
                     default=sys.stdin, help="input model file")
    cli.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'),
                     default=sys.stdout, help="output file")
    cli.add_argument("--debug", "-d", action='store_true', default=False,
                     help="debug mode")

    try:
        args = cli.parse_args()
    except IOError as msg:
        # error() reports the problem and exits the process.
        cli.error(str(msg))

    infile = prepfile(args.infile, 'r')
    modelfile = prepfile(args.modelfile, 'rb')
    outfile = prepfile(args.outfile, 'w')

    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    fullmodel = pickle.load(modelfile)

    # The returned feature lists are not used in this script.
    _features, _tokfeatures = hkmc.prepfeatures(fullmodel['settings'])

    for record in infile:
        ln, cn, codestr = record.strip().split('\t')
        node = fullmodel['model']
        # NOTE(review): the node reached after the final code is never
        # inspected for a hand label -- confirm this is intended.
        for code in codestr.split('.'):
            hand = node.handlabel
            if hand is not None and hand != "r":
                label = hand
                break
            node = node.children[int(code)]
        else:
            # No usable hand label on the path: keep the original codes.
            label = codestr
        row = '\t'.join((ln, cn, label))
        outfile.write(row + "\n")
Ejemplo n.º 3
0
def main():
  """Add hand labels to a pickled model using gold-annotated data.

  The input text is turned into data and info records with ``hkmc.prepdata``.
  For every info record the gold label is looked up in the gold file at
  whitespace-separated token ``offset`` of line ``ln``.  The resulting label
  array is passed to the model's ``classifydata`` together with the optional
  annotation file and the ``--thresh`` purity threshold.  The whole model
  dictionary is re-pickled to ``--outfile`` when one is given.

  Raises:
    IndexError: if the gold file has no token at an ``ln``/``offset`` that
      the info records refer to.
  """
  parser = argparse.ArgumentParser(description="add hand labels on model file given annotated data",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'), default=sys.stdin, help="input model file")
  parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input untokenized text")
  parser.add_argument("--goldfile", "-g", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input gold labels")
  parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('wb'), default=None, help="output model file")
  parser.add_argument("--annfile", "-a", nargs='?', type=argparse.FileType('w'), default=None, help="output annotation file")
# TODO!  parser.add_argument("--refine", "-r", action='store_true', default=False, help="dynamically refine")
  parser.add_argument("--debug", "-d", action='store_true', default=False, help="debug mode")
  parser.add_argument("--thresh", "-t", type=float, default=0.75, help="how pure a class has to be")

  try:
    args = parser.parse_args()
  except IOError as msg:
    # parser.error() prints the message and exits, so args is always bound below.
    parser.error(str(msg))

  modelfile = prepfile(args.modelfile, 'rb')
  infile = prepfile(args.infile, 'r')
  goldfile = prepfile(args.goldfile, 'r')
  outfile = prepfile(args.outfile, 'wb') if args.outfile is not None else None
  annfile = prepfile(args.annfile, 'w') if args.annfile is not None else None

  # NOTE: unpickling can execute arbitrary code; only load trusted model files.
  fullmodel = pickle.load(modelfile)

  settings = fullmodel['settings']
  features, tokfeatures = hkmc.prepfeatures(settings)

  data, info, _datamap = hkmc.prepdata(infile, features, tokfeatures, args.debug, settings, dv=fullmodel['feats'])

  # split() without arguments already discards surrounding whitespace.
  golddata = [line.split() for line in goldfile]
  goldlabels = np.array([golddata[block['ln']][block['offset']] for block in info])

  fullmodel['model'].classifydata(data, info, goldlabels, annfile, thresh=args.thresh)

  if outfile is not None:
    pickle.dump(fullmodel, outfile)
Ejemplo n.º 4
0
def main():
    """Run the model's hand-labelling step and optionally save the model.

    Loads a pickled model dictionary, calls ``handLabel`` on its ``'model'``
    entry with the fixed list of label codes, the optional annotation file and
    the ``--refine`` flag, prints a notice when that call returns a truthy
    value, and re-pickles the dictionary to ``--outfile`` when one was given.
    """
    cli = argparse.ArgumentParser(
        description="add or modify hand labels on model file ",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    option_table = (
        (("--modelfile", "-m"),
         dict(nargs="?", type=argparse.FileType("rb"), default=sys.stdin, help="input model file")),
        (("--outfile", "-o"),
         dict(nargs="?", type=argparse.FileType("wb"), default=None, help="output model file")),
        (("--annfile", "-a"),
         dict(nargs="?", type=argparse.FileType("w"), default=None, help="output annotation file")),
        (("--refine", "-r"),
         dict(action="store_true", default=False, help="dynamically refine")),
        (("--debug", "-d"),
         dict(action="store_true", default=False, help="debug mode")),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    try:
        args = cli.parse_args()
    except IOError as msg:
        # error() reports the problem and exits the process.
        cli.error(str(msg))

    modelfile = prepfile(args.modelfile, "rb")
    # Both output files are optional and stay None when not requested.
    outfile = None
    if args.outfile is not None:
        outfile = prepfile(args.outfile, "wb")
    annfile = None
    if args.annfile is not None:
        annfile = prepfile(args.annfile, "w")

    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    fullmodel = pickle.load(modelfile)

    # The returned feature lists are not used in this script.
    _features, _tokfeatures = hkmc.prepfeatures(fullmodel["settings"])

    label_choices = [
        ("AA", "attach both sides (noop)"),
        ("DD", "disconnect both sides"),
        ("AD", "disconnect right only"),
        ("DA", "disconnect left only"),
        ("AN", "attach left, newline right"),
        ("DN", "disconnect left, newline right"),
    ]
    stopped_early = fullmodel["model"].handLabel(
        values=label_choices, annfile=annfile, refine=args.refine
    )
    if stopped_early:
        print("Stopping early")

    if outfile is None:
        return
    pickle.dump(fullmodel, outfile)
Ejemplo n.º 5
0
def main():
  """Swap hierarchy code paths for hand labels stored in a pickled model tree.

  Input lines must consist of exactly three tab-separated fields, the last
  being a dot-separated code path.  The path is walked from
  ``fullmodel['model']`` through ``children``; the first ``handlabel`` that
  is neither ``None`` nor ``"r"`` is written in place of the code path.
  Lines without such a label are written with the code path unchanged.
  """
  parser = argparse.ArgumentParser(
    description="replace hierarchy labels with hand labels, if present",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  file_options = (
    ("--infile", "-i", 'r', sys.stdin, "input classified file"),
    ("--modelfile", "-m", 'rb', sys.stdin, "input model file"),
    ("--outfile", "-o", 'w', sys.stdout, "output file"),
  )
  for longflag, shortflag, mode, fallback, helptext in file_options:
    parser.add_argument(longflag, shortflag, nargs='?', type=argparse.FileType(mode),
                        default=fallback, help=helptext)
  parser.add_argument("--debug", "-d", action='store_true', default=False, help="debug mode")

  try:
    args = parser.parse_args()
  except IOError as msg:
    # error() reports the problem and exits the process.
    parser.error(str(msg))

  infile = prepfile(args.infile, 'r')
  modelfile = prepfile(args.modelfile, 'rb')
  outfile = prepfile(args.outfile, 'w')

  # NOTE: unpickling can execute arbitrary code; only load trusted models.
  fullmodel = pickle.load(modelfile)

  # The returned feature lists are not used in this script.
  _features, _tokfeatures = hkmc.prepfeatures(fullmodel['settings'])

  def resolve(codestr):
    """Return the first usable hand label along codestr, else codestr itself."""
    node = fullmodel['model']
    # NOTE(review): the node reached after the last code is never checked
    # for a hand label -- confirm this is intended.
    for code in codestr.split('.'):
      if node.handlabel is not None and node.handlabel != "r":
        return node.handlabel
      node = node.children[int(code)]
    return codestr

  for line in infile:
    ln, cn, codestr = line.strip().split('\t')
    outfile.write('\t'.join((ln, cn, resolve(codestr))) + "\n")
Ejemplo n.º 6
0
def main():
  """Cluster the prepared input data and write a per-cluster report.

  Builds a settings dictionary from the command-line options, prepares
  features and data through ``hkmc``, and fits either ``MiniBatchKMeans``
  (default, with ``n_clusters`` taken from ``--kclusters``) or ``DBSCAN``
  (``--dbscan``, with ``eps`` 0.2).  When ``--paramnames`` is given, the model
  keyword arguments are replaced by the name/value pairs from
  ``--paramnames`` and ``--paramvals``, with every value converted to float.

  For each distinct label the report holds a ``label<TAB>count`` line followed
  by one line per member with its formatted context and its ``'feats'`` entry.
  The report goes to ``--tontfile``, or to standard output when that option is
  not given (previously the script crashed in that case).
  """
  parser = argparse.ArgumentParser(description="k means clustering for periods. see unitok/scripts/learntok for some inspiration",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input file")
  #parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('wb'), default=None, help="output file")
  parser.add_argument("--tontfile", "-t", nargs='?', type=argparse.FileType('w'), default=None, help="test on train output file")
  parser.add_argument("--unicodepossibles", "-u", action='store_true', default=False, help="interpret possibles list as unicode class prefixes")
  parser.add_argument("--kclusters", "-k", default=2, type=int, help="number of clusters per layer")
#  parser.add_argument("--clean", "-c", action='store_true', default=False, help="clean model training (no tont)")
  parser.add_argument("--layers", "-y", default=2, type=int, help="number of layers")
  parser.add_argument("--minclustersize", "-z", default=10.0, type=float, help="no cluster splitting below this pct of training data")
  parser.add_argument("--leftcontext", "-l", default=5, type=int, help="make features for this number of previous characters")
  parser.add_argument("--rightcontext", "-r", default=0, type=int, help="make features for this number of next characters")
  parser.add_argument("--nochar", "-n", action='store_false', dest='charfeature', default=True,  help="no character features (class only)")
  parser.add_argument("--possibles", "-p", nargs='+', default=['.'], help="set of characters to possibly split on")
  parser.add_argument("--handlabel", "-H", action='store_true', default=False, help="do hand labeling after training")
  parser.add_argument("--dbscan", action='store_true', default=False, help="try dbscan instead of kmeans")
  parser.add_argument("--debug", "-d", action='store_true', default=False, help="debug mode")
  parser.add_argument("--banned", nargs='+', default=[], help='tok-based features to remove')
  parser.add_argument("--paramnames", nargs='+', default=[], help='algorithm parameter names')
  parser.add_argument("--paramvals", nargs='+', default=[], help='algorithm parameter values')

  try:
    args = parser.parse_args()
  except IOError as msg:
    # parser.error() prints the message and exits, so args is always bound below.
    parser.error(str(msg))

  # zip() would silently drop unmatched names or values, so reject them early.
  if len(args.paramnames) != len(args.paramvals):
    parser.error("--paramnames and --paramvals must have the same number of entries")

  infile = prepfile(args.infile, 'r')
  # The report below is always written, so fall back to stdout instead of
  # failing on a None file handle when --tontfile is omitted.
  tontfile = prepfile(args.tontfile if args.tontfile is not None else sys.stdout, 'w')

  settings = {
    'kclusters': args.kclusters,
    'layers': args.layers,
    'minclustersize': args.minclustersize,
    'leftcontext': args.leftcontext,
    'rightcontext': args.rightcontext,
    'possibles': args.possibles,
    'unicodepossibles': args.unicodepossibles,
    'charfeature': args.charfeature,
    'banned': args.banned,
  }

  features, tokfeatures = hkmc.prepfeatures(settings)

  data, info, _datamap = hkmc.prepdata(infile, args.possibles, features, tokfeatures, args.debug, isTargetPunc=args.unicodepossibles)

  if args.debug:
    print(data)

  if args.dbscan:
    modeltype = DBSCAN
    modelkwargs = {'eps': 0.2}
  else:
    modeltype = MiniBatchKMeans
    modelkwargs = {'n_clusters': args.kclusters}

  # Explicit parameters replace the defaults entirely; all values become floats.
  if args.paramnames:
    modelkwargs = dict(zip(args.paramnames, map(float, args.paramvals)))
  print(modelkwargs)
  modelTree = ModelTree(modeltype, data, info, modelkwargs=modelkwargs)

  labels = modelTree.model.fit_predict(modelTree.data)
  for label in set(labels):
    subinfo = modelTree.info[labels == label]
    tontfile.write("%s\t%d\n" % (label, len(subinfo)))
    for elem in subinfo:
      tontfile.write("%s\t%s\t%s\n" % (label, hkmc.formatContext(elem), str(elem['feats'])))
Ejemplo n.º 7
0
def main():
    """Hand-label a pickled model and write it back out on request.

    The pickled dictionary is loaded from ``--modelfile``; its ``'model'``
    entry's ``handLabel`` method receives the fixed label codes, the optional
    annotation file and the ``--refine`` flag.  A truthy return value prints
    "Stopping early".  The dictionary is pickled to ``--outfile`` when that
    option was supplied.
    """
    cli = argparse.ArgumentParser(
        description="add or modify hand labels on model file ",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'),
                     default=sys.stdin, help="input model file")
    cli.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('wb'),
                     default=None, help="output model file")
    cli.add_argument("--annfile", "-a", nargs='?', type=argparse.FileType('w'),
                     default=None, help="output annotation file")
    cli.add_argument("--refine", "-r", action='store_true', default=False,
                     help="dynamically refine")
    cli.add_argument("--debug", "-d", action='store_true', default=False,
                     help="debug mode")

    try:
        args = cli.parse_args()
    except IOError as msg:
        # error() reports the problem and exits the process.
        cli.error(str(msg))

    modelfile = prepfile(args.modelfile, 'rb')
    # Optional outputs remain None unless the option was given.
    if args.outfile is None:
        outfile = None
    else:
        outfile = prepfile(args.outfile, 'wb')
    if args.annfile is None:
        annfile = None
    else:
        annfile = prepfile(args.annfile, 'w')

    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    fullmodel = pickle.load(modelfile)

    # The returned feature lists are not used in this script.
    _features, _tokfeatures = hkmc.prepfeatures(fullmodel['settings'])

    label_choices = [
        ('AA', 'attach both sides (noop)'),
        ('DD', 'disconnect both sides'),
        ('AD', 'disconnect right only'),
        ('DA', 'disconnect left only'),
        ('AN', 'attach left, newline right'),
        ('DN', 'disconnect left, newline right'),
    ]
    tree = fullmodel['model']
    stopped_early = tree.handLabel(values=label_choices,
                                   annfile=annfile,
                                   refine=args.refine)
    if stopped_early:
        print("Stopping early")

    if outfile is None:
        return
    pickle.dump(fullmodel, outfile)
Ejemplo n.º 8
0
def main():
    """Label input text with a pickled model and write one row per item.

    Command-line options select the input text, the pickled model file, the
    output file, whether hand labels should be used (``--handlabel``) and a
    debug flag that is forwarded to ``hkmc.prepdata``.

    Each output row is ``ln<TAB>offset<TAB>label``, where ``ln`` and
    ``offset`` come from the info record paired with the label and are
    formatted as integers.
    """
    parser = argparse.ArgumentParser(
        description="label data given model file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input text file")
    parser.add_argument("--modelfile",
                        "-m",
                        nargs='?',
                        type=argparse.FileType('rb'),
                        default=sys.stdin,
                        help="input model file")
    parser.add_argument("--handlabel",
                        "-H",
                        action='store_true',
                        default=False,
                        help="use hand labels if available")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file")
    parser.add_argument("--debug",
                        "-d",
                        action='store_true',
                        default=False,
                        help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        # parser.error() prints the message and exits, so args is always
        # bound below.
        parser.error(str(msg))

    infile = prepfile(args.infile, 'r')
    modelfile = prepfile(args.modelfile, 'rb')
    outfile = prepfile(args.outfile, 'w')

    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    fullmodel = pickle.load(modelfile)

    settings = fullmodel['settings']
    features, tokfeatures = hkmc.prepfeatures(settings)

    data, info, _datamap = hkmc.prepdata(infile,
                                         features,
                                         tokfeatures,
                                         args.debug,
                                         settings,
                                         dv=fullmodel['feats'])

    model = fullmodel['model']
    if args.handlabel:
        labels = model.handlabeldata(data)
    else:
        labels = model.labeldata(data)

    # Builtin zip works on both Python 2 and 3; izip exists only in Python 2.
    for label, theinfo in zip(labels, info):
        outfile.write("%d\t%d\t%s\n" %
                      (theinfo['ln'], theinfo['offset'], label))