# Fan the sentences out over `numOfProcesses` worker processes. Each worker
# runs `parse` on its own slice and is handed the shared output file plus a
# lock (presumably used by `parse` to serialise its writes -- confirm there).
sentsPerProc = len(sentences) // numOfProcesses
processes = []
lock = Lock()
# Buffering argument 0 opens the file unbuffered (Python 2 semantics).
# The `with` block guarantees the handle is closed even if starting or
# joining a worker raises.
with open(test_json, 'w', 0) as test_json_file:
    for i in range(numOfProcesses):
        if i == numOfProcesses - 1:
            # The last worker also takes the remainder left by the floor division.
            chunk = sentences[i * sentsPerProc:]
        else:
            chunk = sentences[i * sentsPerProc:(i + 1) * sentsPerProc]
        p = Process(target=parse, args=(chunk, test_json_file, lock))
        p.start()
        processes.append(p)
    # Wait for every worker before the file is closed and read downstream.
    for proc in processes:
        proc.join()

print('Start feature extraction')
pipeline(train_json, indir + '/brown', outdir)
filter(outdir + '/feature.map', outdir + '/train_x.txt',
       outdir + '/feature.txt', outdir + '/train_x_new.txt')
pipeline_test(test_json, indir + '/brown', outdir + '/feature.txt',
              outdir + '/type.txt', outdir)
supertype(outdir)
distribution(outdir)

# Perform no pruning to generate training data
print('Start training and test data generation')
feature_number = get_number(outdir + '/feature.txt')
type_number = get_number(outdir + '/type.txt')
prune(outdir, outdir, 'no', feature_number, type_number)

# Generate type-type correlation
print('Start type correlation calculation')
share_entity(indir + '/type_entities.txt', outdir + '/type.txt',
             outdir + '/type_type_kb.txt')
# Relation-mention ("rm") feature extraction: every pipeline stage below is
# invoked with requireEmType=False and isEntityMention=False.
print('Start rm feature extraction')

brown_dir = indir + '/brown'
feature_map_path = outdir + '/feature.map'
feature_list_path = outdir + '/feature.txt'
type_list_path = outdir + '/type.txt'
rm_flags = {'requireEmType': False, 'isEntityMention': False}

pipeline(train_json, brown_dir, outdir, **rm_flags)
pipeline_qa(qa_json, brown_dir, feature_map_path, type_list_path, outdir, **rm_flags)

# Run the frequency filter over the train features first, then the QA
# features; both calls are given the same feature.map / feature.txt paths.
for raw_name, filtered_name in (('/train_x.txt', '/train_x_new.txt'),
                                ('/qa_x.txt', '/qa_x_new.txt')):
    filter(feature_map_path, outdir + raw_name, feature_list_path,
           outdir + filtered_name, feature_freq_threshold)

pipeline_test(test_json, brown_dir, feature_list_path, type_list_path, outdir, **rm_flags)

### Perform no pruning to generate training data
print('Start rm training and test data generation')
feature_number = get_number(feature_list_path)