コード例 #1
0
    def __init__(self, geo_coder, sponsor_coder, conn, model, feature_space):
        """Keep references to the collaborators and build the feature pipe.

        All five arguments are stored unchanged on the instance under the
        same names. A ``Pipe`` limited to a single process is then created
        over a fixed list of feature generators and kept as ``self.pipe``.
        """
        # Collaborators are stored as-is; nothing is validated or copied here.
        self.geo_coder, self.sponsor_coder = geo_coder, sponsor_coder
        self.conn = conn
        self.model, self.feature_space = model, feature_space

        # Generators are instantiated in this order, each with force=True
        # (presumably forcing feature regeneration -- confirm against the
        # generator implementations).
        feature_generators = []
        feature_generators.append(
            entity_text_bag_feature_generator.unigram_feature_generator(force=True)
        )
        feature_generators.append(
            simple_entity_text_feature_generator.simple_entity_text_feature_generator(force=True)
        )
        feature_generators.append(gen_geo_features.geo_feature_generator(force=True))
        feature_generators.append(sponsor_feature_generator.SponsorFeatureGenerator(force=True))

        self.pipe = Pipe(feature_generators, num_processes=1)
コード例 #2
0
def _build_feature_generators():
    """Return a new list of the feature generators used by every sub-command.

    The generators are created in a fixed order (unigram text bag, simple
    entity text, geo, sponsor), each with ``force=True``.
    """
    # NOTE: the wikipedia-categories, bigram, Calais and prefix generators
    # were disabled (commented out) in the original code and stay disabled.
    return [
        entity_text_bag_feature_generator.unigram_feature_generator(force=True),
        simple_entity_text_feature_generator.simple_entity_text_feature_generator(force=True),
        gen_geo_features.geo_feature_generator(force=True),
        sponsor_feature_generator.SponsorFeatureGenerator(force=True),
    ]


def _load_serialize_instances(args):
    """Build the instance list for the ``serialize`` sub-command.

    Both entity files are read first; the positive entities are then turned
    into instances with label 1 and the negative ones with label 0. The
    result is the positive instances followed by the negative instances.
    """
    positive_entities = read_entities_file(args.positivefile)
    negative_entities = read_entities_file(args.negativefile)
    logging.info("Pulling entities from database")
    positive_instances = get_instances_from_entities(
        get_entity_objects(positive_entities, args.threads), 1, args.threads)
    negative_instances = get_instances_from_entities(
        get_entity_objects(negative_entities, args.threads), 0, args.threads)
    return positive_instances + negative_instances


def main():
    """Command-line entry point: build instances, run the pipe, serialize.

    Sub-commands:
      serialize -- build labelled instances from ``--positivefile`` and
                   ``--negativefile``.
      add       -- load existing instances from ``--data_folder``.

    In both cases the instances are pushed through a ``Pipe`` of feature
    generators using ``--threads`` processes and the pipe's instances are
    written to ``--data_folder``.

    Exits with a usage error (status 2) when no sub-command is given.
    """
    parser = argparse.ArgumentParser(description='get pickeled instances')
    subparsers = parser.add_subparsers(dest='subparser_name', help='sub-command help')

    parser_serialize = subparsers.add_parser('serialize', help='pickle instances')
    parser_serialize.add_argument('--data_folder', required=True,
                                  help='path to output pickled files')
    parser_serialize.add_argument('--threads', type=int, default=mp.cpu_count(),
                                  help='number of threads to run in parallel')
    parser_serialize.add_argument('--positivefile', required=True,
                                  help='file containing entities identified as earmarks')
    parser_serialize.add_argument('--negativefile', required=True,
                                  help='file containing negative example entities')

    parser_add = subparsers.add_parser('add', help='add to pickled instances')
    parser_add.add_argument('--data_folder', required=True,
                            help='path to output pickled files')
    parser_add.add_argument('--threads', type=int, default=1,
                            help='number of threads to run in parallel')

    args = parser.parse_args()

    # On Python 3 a sub-command is optional by default, so subparser_name can
    # be None. Fail with a usage message instead of crashing later on
    # variables that were never assigned.
    if args.subparser_name not in ("serialize", "add"):
        parser.error("a sub-command is required: choose 'serialize' or 'add'")

    logging.info("pid: %s", os.getpid())

    if args.subparser_name == "serialize":
        instances = _load_serialize_instances(args)
    else:
        instances = load_instances(args.data_folder)

    logging.info("Creating pipe")
    pipe = Pipe(_build_feature_generators(), instances, num_processes=args.threads)
    logging.info("Pushing into pipe")
    pipe.push_all_parallel()
    logging.info("Start Serializing")
    serialize_instances(pipe.instances, args.data_folder)
    logging.info("Done!")