Esempio n. 1
0
def main():
    """Print the ten most common method names for every class in the data.

    Reads the file given by ``--traindata`` through ``corpus.read``. The
    result is iterated as a sequence of documents, each of which yields
    ``(class_name, method_name)`` pairs. Occurrences are tallied per class
    and written to stdout: first the number of distinct classes, then for
    each class its name, its ten most frequent methods with their counts,
    and a blank separator line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--traindata', help='training data file', required=True)
    args = parser.parse_args()

    with open(args.traindata) as f:
        data = corpus.read(f)

    # class name -> Counter mapping method name to its number of occurrences.
    class_counts = defaultdict(Counter)
    for doc in data:
        for class_name, method_name in doc:
            class_counts[class_name][method_name] += 1

    # Single-argument print(...) calls and .items() behave the same on
    # Python 2.7 and Python 3, so the script runs under either interpreter.
    print('{} classes'.format(len(class_counts)))
    for class_name, counts in class_counts.items():
        print(class_name)
        print(counts.most_common(10))
        # print('') rather than print(): under Python 2 a bare print()
        # would output "()" instead of an empty line.
        print('')
Esempio n. 2
0
def main():
    """Train a character language model and run the PYP sampler.

    Command-line arguments:
        --traindata      path to training data (required)
        --discount       discount parameter for PYP (required, float)
        --strength       strength parameter for PYP (required, float)
        --niter          number of iterations of sampling (default 10)
        --char_lm_order  order of character language model (default 10)

    The parameters must satisfy ``strength > -discount``; otherwise the
    program exits with a usage error (status 2) before any data is read.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('--traindata', help='path to training data', required=True)
    parser.add_argument('--discount', help='discount parameter for PYP', required=True, type=float)
    parser.add_argument('--strength', help='strength parameter for PYP', required=True, type=float)
    parser.add_argument('--niter', help='number of iterations of sampling', type=int, default=10)
    parser.add_argument('--char_lm_order', help='order of character language model', type=int, default=10)
    args = parser.parse_args()

    # Validate up front with parser.error instead of an assert: asserts are
    # stripped under `python -O`, and checking here avoids training the
    # character LM only to reject the arguments afterwards. The negated
    # comparison keeps the original acceptance condition exactly (NaN is
    # still rejected).
    if not args.strength > -args.discount:
        parser.error(
            '--strength must be greater than -discount '
            '(got strength={}, discount={})'.format(args.strength, args.discount))

    with open(args.traindata) as f:
        # Only the second element of the read result is used here.
        _, identifiers = corpus.read(f)

    # The character LM is trained on the distinct identifiers only.
    char_lm = SRILMWrapper()
    char_lm.train(set(identifiers), args.char_lm_order, 'wbdiscount')
    # The trained n-gram file backs the base distribution handed to the PYP.
    base = CharLM(char_lm.ngram_file)
    model = PYP(args.discount, args.strength, base)
    run_sampler(model, identifiers, args.niter)