예제 #1
0
def _add_common_arguments(subparser):
    """Register the options shared by every sub-command on *subparser*."""
    subparser.add_argument('--data', required=True, help='path to output pickled files')
    subparser.add_argument('--threads', type=int, default=1, help='number of threads to run in parallel')


def _save_instances(instances, path):
    """Serialize *instances* to *path*, logging the start and end of the step."""
    logging.info("Start Serializing")
    serialize_instances(instances, path)
    logging.info("Done!")


def _run_serialize(args):
    """Build instances from the database, run the feature pipes and serialize them."""
    earmark_ids = list(get_earmarks_from_db())
    logging.info("Got %d earmarks", len(earmark_ids))

    entity_ids = list(get_entities_from_db())
    logging.info("Got %d entities", len(entity_ids))

    instances = get_matching_instances(
        entity_ids, earmark_ids, get_earmark_entity_tuples(), args.threads
    )
    logging.info("Got %d instances", len(instances))

    # NOTE(review): both pipes below run with num_processes=1 even though
    # --threads is accepted by this sub-command; confirm that is intentional.
    logging.info("Creating pipe")
    fgs = [
        JaccardFeatureGenerator(),
    ]
    pipe = Pipe(fgs, instances, num_processes=1)
    logging.info("Pushing into pipe")
    pipe.push_all_parallel()

    # Second pass: group by earmark and document, then derive ranking and
    # difference features from the two Jaccard features computed above.
    pairs = [
        ("JACCARD_FG", "JACCARD_FG_max_inferred_name_jaccard"),
        ("JACCARD_FG", "JACCARD_FG_max_cell_jaccard"),
    ]
    fgs = [
        RankingFeatureGenerator(pairs=pairs),
        DifferenceFeatureGenerator(pairs=pairs),
    ]
    grouper = InstancesGrouper(['earmark_id', 'document_id'])
    pipe = BlocksPipe(grouper, fgs, pipe.instances, num_processes=1)
    pipe.push_all_parallel()

    _save_instances(pipe.instances, args.data)


def _run_add(args):
    """Load serialized instances, add infix features and write them back to the same path."""
    instances = load_instances(args.data)
    logging.info("Creating pipe")

    fgs = [
        InfixFeatureGenerator(),
    ]
    pipe = Pipe(fgs, instances, num_processes=args.threads)
    logging.info("Pushing into pipe")
    pipe.push_all_parallel()

    _save_instances(pipe.instances, args.data)


def main():
    """Command-line entry point with two sub-commands.

    ``serialize`` builds instances from the database, runs the feature
    generators over them and serializes the result to ``--data``.
    ``add`` loads the instances stored at ``--data``, runs an additional
    feature generator and serializes them back to the same location.

    Exits with an argparse usage error (``SystemExit``) when no sub-command
    is given, instead of silently doing nothing.
    """
    parser = argparse.ArgumentParser(description='get pickeled instances')
    subparsers = parser.add_subparsers(dest='subparser_name', help='sub-command help')
    # Sub-commands are optional by default on Python 3; without this a bare
    # invocation would parse successfully and fall through both branches.
    subparsers.required = True

    _add_common_arguments(subparsers.add_parser('serialize', help='pickle instances'))
    _add_common_arguments(subparsers.add_parser('add', help='add to pickled instances'))

    args = parser.parse_args()
    logging.info("pid: %s", os.getpid())

    if args.subparser_name == "serialize":
        _run_serialize(args)
    elif args.subparser_name == "add":
        _run_add(args)
예제 #2
0
def folder_to_scipy(folder_path, feature_space=None):
    """Load the instances stored at *folder_path* and convert them to a matrix.

    The instances are read with ``load_instances`` and handed to
    ``pipe.instances_to_matrix`` with ``dense=True``; *feature_space* is
    forwarded unchanged (``None`` by default).

    Returns the three values produced by ``pipe.instances_to_matrix`` as a
    tuple ``(x, y, feature_space)`` -- presumably the feature matrix, the
    labels and the feature space used; confirm against that helper.
    """
    matrix, labels, space = pipe.instances_to_matrix(
        load_instances(folder_path),
        feature_space=feature_space,
        dense=True,
    )
    return matrix, labels, space