def main(argv=None):
    '''Command-line entry point: scan one input file and print the result as JSON.

    The positional ``input`` file is read as UTF-8.  With ``--type html`` the
    content is first passed through ``extract_text``.  The text is tokenized
    and handed to ``patternScan`` together with the optional category /
    family / indicator filters; whatever ``patternScan`` returns is written
    to stdout as JSON indented by 4 spaces.

    argv: list of command-line arguments (without the program name).
        None (the default) lets argparse fall back to ``sys.argv[1:]``.

    Exits via SystemExit (raised by argparse) on ``-h`` or invalid arguments.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument(
        '-c', '--category', required=False,
        help='major category of rules to apply (default: use all)')
    parser.add_argument(
        '-f', '--family', required=False,
        help='minor category of rules to apply (default: use all)')
    parser.add_argument(
        '-i', '--indicator', required=False,
        help='Indicate precise rule as X.Y.Z')
    parser.add_argument(
        '-t', '--type', required=False, default='text',
        help='input file type', choices=('text', 'html'))
    # NOTE: --verbose is accepted for CLI compatibility but is not consulted
    # anywhere in this function.
    parser.add_argument(
        '-v', '--verbose', required=False,
        help='verbose', action='store_true')
    # Honour the argv parameter so main() can be driven programmatically;
    # previously it was ignored and sys.argv was always parsed.
    args = parser.parse_args(argv)

    with codecs.open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    if args.type == 'html':
        # Imported lazily so plain-text runs do not need the HTML extractor.
        from pymod.htmlextract import extract_text
        text = extract_text(text)

    tokens = list(Tokenizer(text).genTokens())
    result = patternScan(tokens,
                         category=args.category,
                         family=args.family,
                         indicator=args.indicator)
    # sys.stdout.write emits the same bytes as the Python 2 print-chevron
    # statement while also working on Python 3.
    sys.stdout.write(json.dumps(result, indent=4) + "\n")
def main(argv=None):
    '''Command-line entry point: scan one input file and print the result as JSON.

    The positional ``input`` file is read as UTF-8.  With ``--type html`` the
    content is first passed through ``extract_text``.  The text is tokenized
    and handed to ``patternScan`` together with the optional category /
    family / indicator filters; whatever ``patternScan`` returns is written
    to stdout as JSON indented by 4 spaces.

    argv: list of command-line arguments (without the program name).
        None (the default) lets argparse fall back to ``sys.argv[1:]``.

    Exits via SystemExit (raised by argparse) on ``-h`` or invalid arguments.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument(
        '-c', '--category', required=False,
        help='major category of rules to apply (default: use all)')
    parser.add_argument(
        '-f', '--family', required=False,
        help='minor category of rules to apply (default: use all)')
    parser.add_argument(
        '-i', '--indicator', required=False,
        help='Indicate precise rule as X.Y.Z')
    parser.add_argument(
        '-t', '--type', required=False, default='text',
        help='input file type', choices=('text', 'html'))
    # NOTE: --verbose is accepted for CLI compatibility but is not consulted
    # anywhere in this function.
    parser.add_argument(
        '-v', '--verbose', required=False,
        help='verbose', action='store_true')
    # Honour the argv parameter so main() can be driven programmatically;
    # previously it was ignored and sys.argv was always parsed.
    args = parser.parse_args(argv)

    with codecs.open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    if args.type == 'html':
        # Imported lazily so plain-text runs do not need the HTML extractor.
        from pymod.htmlextract import extract_text
        text = extract_text(text)

    tokens = list(Tokenizer(text).genTokens())
    result = patternScan(tokens,
                         category=args.category,
                         family=args.family,
                         indicator=args.indicator)
    # sys.stdout.write emits the same bytes as the Python 2 print-chevron
    # statement while also working on Python 3.
    sys.stdout.write(json.dumps(result, indent=4) + "\n")
def applyClassifier(self, input):
    '''Classify one piece of text and report the positive-class probability.

    input: the text to classify, or '-' to read the text from stdin.

    Returns a dict with keys "input" (the text as received, before any HTML
    extraction), "class" (self.positiveClass) and "prob" (the probability
    the classifier's distribution assigns to the "pos" label).
    '''
    label = self.positiveClass
    rule = self.indicator

    # '-' is the conventional placeholder for "read the payload from stdin".
    raw = sys.stdin.read() if input == '-' else input

    if self.inputType == 'html':
        from pymod.htmlextract import extract_text
        payload = extract_text(raw)
    else:
        payload = raw

    banner = "applying %s %s classifier" % (label, rule)
    with timeblock(banner, self.verbose):
        distribution = self.classifier.prob_classify(payload)

    return {
        "input": raw,
        "class": label,
        "prob": distribution.prob("pos"),
    }