def main():
    """Print per-class method-name frequency statistics for a training corpus.

    Command line:
        --traindata  path to the training data file (required)

    Output: the number of distinct classes, then for each class its name,
    its 10 most common method names (with counts), and a blank line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--traindata', help='training data file', required=True)
    args = parser.parse_args()

    with open(args.traindata) as f:
        data = corpus.read(f)

    # class name -> Counter mapping method name -> occurrence count
    # (defaultdict(Counter) is equivalent to, and simpler than,
    #  defaultdict(lambda: Counter()))
    class_counts = defaultdict(Counter)
    for doc in data:
        for class_name, method_name in doc:
            class_counts[class_name][method_name] += 1

    print('{} classes'.format(len(class_counts)))
    # .items() instead of Python-2-only .iteritems(); print as a function
    # so the script runs on Python 3.
    for class_name, counts in class_counts.items():
        print(class_name)
        print(counts.most_common(n=10))
        print()
def main():
    """Train a Pitman-Yor process model over identifiers from a corpus.

    Command line:
        --traindata       path to training data (required)
        --discount        discount parameter for the PYP (required)
        --strength        strength parameter for the PYP (required)
        --niter           number of sampling iterations (default 10)
        --char_lm_order   order of the character language model (default 10)

    Trains an SRILM character n-gram model over the unique identifiers to
    serve as the PYP base distribution, then runs the sampler.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('--traindata', help='path to training data', required=True)
    parser.add_argument('--discount', help='discount parameter for PYP', required=True, type=float)
    parser.add_argument('--strength', help='strength parameter for PYP', required=True, type=float)
    parser.add_argument('--niter', help='number of iterations of sampling', type=int, default=10)
    parser.add_argument('--char_lm_order', help='order of character language model', type=int, default=10)
    args = parser.parse_args()

    # PYP validity constraint: strength > -discount. Report this as a
    # usage error rather than an assert, which would be silently stripped
    # under `python -O` and would crash with an unhelpful traceback.
    if not args.strength > -args.discount:
        parser.error('strength must be greater than -discount')

    with open(args.traindata) as f:
        _, identifiers = corpus.read(f)

    # Character LM over the set of unique identifiers, used as the PYP
    # base distribution. 'wbdiscount' selects Witten-Bell smoothing in SRILM.
    char_lm = SRILMWrapper()
    char_lm.train(set(identifiers), args.char_lm_order, 'wbdiscount')
    base = CharLM(char_lm.ngram_file)

    model = PYP(args.discount, args.strength, base)
    run_sampler(model, identifiers, args.niter)