Example #1
import logging

# Assumes get_instance, the *FeatureGenerator classes, Pipe, and
# serialize_instances are defined elsewhere in this project.
def serialize_student_group(students, data_folder):
    # Build one instance per unordered pair of students
    # (equivalent to itertools.combinations(students, 2)).
    instances = []
    for i in range(len(students)):
        for j in range(i + 1, len(students)):
            instances.append(get_instance(students[i], students[j]))
    logging.info("Created %d instances" %(len(instances)))
    if len(instances) == 0:
        logging.warn("FAILED TO GENERATE INSTANCES!")
        return
    fgs = [
        IsSameFeatureGenerator(fields=[
            'ZipCode', 'Gender', 'Language', 'HomeLanguage', 'BirthCountry',
            'Race', 'Food', 'ESL', 'LEP', 'SpecialED', 'CatchmentSchool',
            'ThisGradeSchoolKey',
        ]),
        AbsoluteDifferenceFeatureGenerator(fields=[
            'GPA', 'EighthMathISAT', 'EighthReadingISAT', 'AttendanceRate',
        ]),
        DistanceFeatureGenerator(),
        OtherFeaturesFeatureGenerator(),
    ]
    pipe = Pipe(fgs, instances, num_processes=1)
    pipe.push_all_parallel()
    serialize_instances(pipe.instances, data_folder)
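
# A minimal usage sketch (assumptions: a hypothetical load_students() helper
# returning records with the fields listed above, and an existing output dir):
#
#     students = load_students()
#     serialize_student_group(students, 'data/students/')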
    """
Example #2
import argparse
import logging
import os

# Assumes the feature generators, Pipe/BlocksPipe, InstancesGrouper, and the
# get_*/load_*/serialize_* helpers are provided elsewhere in this project.
def main():
    parser = argparse.ArgumentParser(description='get pickled instances')
    subparsers = parser.add_subparsers(dest='subparser_name', help='sub-command help')

    parser_serialize = subparsers.add_parser('serialize', help='pickle instances')
    parser_serialize.add_argument('--data', required=True, help='path to output pickled files')
    parser_serialize.add_argument('--threads', type=int, default=1, help='number of threads to run in parallel')

    parser_add = subparsers.add_parser('add', help='add to pickled instances')
    parser_add.add_argument('--data', required=True, help='path to output pickled files')
    parser_add.add_argument('--threads', type=int, default=1, help='number of threads to run in parallel')
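
    # Example invocations (the script name is a placeholder):
    #   python pipeline.py serialize --data out/ --threads 4
    #   python pipeline.py add --data out/ --threads 4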

    args = parser.parse_args()
    logging.info("pid: " + str(os.getpid()))

        
    if args.subparser_name == "serialize":
        
        earmark_ids = list(get_earmarks_from_db())
        logging.info("Got %d earmarks" % len(earmark_ids))

        entity_ids = list(get_entities_from_db())
        logging.info("Got %d entities" % len(entity_ids))

        instances = get_matching_instances(entity_ids, earmark_ids, get_earmark_entity_tuples(), args.threads)
        logging.info("Got %d instances" % len(instances))

        logging.info("Creating pipe")
        fgs = [
            JaccardFeatureGenerator(),
        ]
        pipe = Pipe(fgs, instances, num_processes=1)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()

        # Group by earmark and document:
        pairs = [("JACCARD_FG", "JACCARD_FG_max_inferred_name_jaccard"), ("JACCARD_FG", "JACCARD_FG_max_cell_jaccard")]
        fgs = [
            RankingFeatureGenerator(pairs=pairs),
            DifferenceFeatureGenerator(pairs=pairs),
        ]
        grouper = InstancesGrouper(['earmark_id', 'document_id'])
        pipe = BlocksPipe(grouper, fgs, pipe.instances, num_processes=1)
        pipe.push_all_parallel()

        # Serialize
        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")



    elif args.subparser_name == "add":
        instances = load_instances(args.data)
        logging.info("Creating pipe")


        fgs = [
            InfixFeatureGenerator()
        ]

        pipe = Pipe(fgs, instances, num_processes=args.threads)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()

        # Serialize
        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")