def __init__(self, geo_coder, sponsor_coder, conn, model, feature_space):
    """Store collaborators and build a single-process feature pipeline.

    Args:
        geo_coder: geographic coder used by geo feature generation.
        sponsor_coder: sponsor coder collaborator.
        conn: database connection handle.
        model: trained classification model.
        feature_space: feature-space mapping used by the model.
    """
    self.geo_coder = geo_coder
    self.sponsor_coder = sponsor_coder
    self.conn = conn
    self.model = model
    self.feature_space = feature_space

    # Same active generator set used by main(): unigram text bag,
    # simple entity text, geo, and sponsor features (all forced).
    generators = [
        entity_text_bag_feature_generator.unigram_feature_generator(force=True),
        simple_entity_text_feature_generator.simple_entity_text_feature_generator(force=True),
        gen_geo_features.geo_feature_generator(force=True),
        sponsor_feature_generator.SponsorFeatureGenerator(force=True),
    ]
    # NOTE(review): unlike main(), Pipe is constructed here without an
    # instances argument — presumably Pipe defaults it; confirm.
    self.pipe = Pipe(generators, num_processes=1)
def _build_feature_generators():
    """Return the feature generators applied to every instance.

    Shared by both the 'serialize' and 'add' subcommands so the two code
    paths cannot drift apart. The active set is: unigram text bag, simple
    entity text, geo, and sponsor features (all forced). Inactive
    generators kept for reference below.
    """
    # Previously disabled generators (left for reference):
    #   wikipedia_categories_feature_generator(depth=2, distinguish_levels=False, force=True)
    #   entity_text_bag_feature_generator.bigram_feature_generator(force=True)
    #   calais_feature_generator.CalaisFeatureGenerator(force=True)
    #   prefix_feature_generator.PrefixFeatureGenerator(force=True, prefixes=['O&M', 'for'])
    return [
        entity_text_bag_feature_generator.unigram_feature_generator(force=True),
        simple_entity_text_feature_generator.simple_entity_text_feature_generator(force=True),
        gen_geo_features.geo_feature_generator(force=True),
        sponsor_feature_generator.SponsorFeatureGenerator(force=True),
    ]


def main():
    """Build ('serialize') or extend ('add') a folder of pickled instances.

    serialize: pulls positive/negative entities from the database, labels
    them 1/0, runs them through the feature pipe, and pickles the result.
    add: reloads previously pickled instances and re-runs the pipe.
    """
    # Typo fixed: 'pickeled' -> 'pickled'.
    parser = argparse.ArgumentParser(description='get pickled instances')
    subparsers = parser.add_subparsers(dest='subparser_name', help='sub-command help')

    parser_serialize = subparsers.add_parser('serialize', help='pickle instances')
    parser_serialize.add_argument('--data_folder', required=True,
                                  help='path to output pickled files')
    parser_serialize.add_argument('--threads', type=int, default=mp.cpu_count(),
                                  help='number of threads to run in parallel')
    parser_serialize.add_argument('--positivefile', required=True,
                                  help='file containing entities identified as earmarks')
    parser_serialize.add_argument('--negativefile', required=True,
                                  help='file containing negative example entities')

    parser_add = subparsers.add_parser('add', help='add to pickled instances')
    parser_add.add_argument('--data_folder', required=True,
                            help='path to output pickled files')
    parser_add.add_argument('--threads', type=int, default=1,
                            help='number of threads to run in parallel')

    args = parser.parse_args()
    logging.info("pid: " + str(os.getpid()))

    if args.subparser_name == "serialize":
        positive_entities = read_entities_file(args.positivefile)
        negative_entities = read_entities_file(args.negativefile)
        logging.info("Pulling entities from database")
        # Label earmark entities 1, negative examples 0.
        positive_instance = get_instances_from_entities(
            get_entity_objects(positive_entities, args.threads), 1, args.threads)
        negative_instance = get_instances_from_entities(
            get_entity_objects(negative_entities, args.threads), 0, args.threads)
        instances = positive_instance + negative_instance
    elif args.subparser_name == "add":
        instances = load_instances(args.data_folder)
    else:
        # Without this, a missing sub-command crashed later with NameError.
        parser.error("a sub-command is required: serialize or add")

    logging.info("Creating pipe")
    feature_generators = _build_feature_generators()
    pipe = Pipe(feature_generators, instances, num_processes=args.threads)
    logging.info("Pushing into pipe")
    pipe.push_all_parallel()
    logging.info("Start Serializing")
    serialize_instances(pipe.instances, args.data_folder)
    logging.info("Done!")