Exemple #1
0
    sentsPerProc = int(math.floor(len(sentences)*1.0/numOfProcesses))
    processes = []
    lock = Lock()
    test_json_file = open(test_json, 'w', 0)
    for i in range(numOfProcesses):
        if i == numOfProcesses - 1:
            p = Process(target=parse, args=(sentences[i*sentsPerProc:], test_json_file, lock))
        else:
            p = Process(target=parse, args=(sentences[i*sentsPerProc:(i+1)*sentsPerProc], test_json_file, lock))
        p.start()
        processes.append(p)
    for proc in processes:
        proc.join()
    test_json_file.close()
    print 'Start feature extraction'
    pipeline(train_json, indir + '/brown', outdir)
    filter(outdir+'/feature.map', outdir+'/train_x.txt', outdir+'/feature.txt', outdir+'/train_x_new.txt')
    pipeline_test(test_json, indir + '/brown', outdir+'/feature.txt',outdir+'/type.txt', outdir)
    supertype(outdir)
    distribution(outdir)

    # Perform no pruning to generate training data
    print 'Start training and test data generation'
    feature_number = get_number(outdir+'/feature.txt')
    type_number = get_number(outdir+'/type.txt')
    prune(outdir, outdir, 'no', feature_number, type_number)

    # Generate type-type correlation
    print 'Start type correlation calculation'
    share_entity(indir + '/type_entities.txt', outdir + '/type.txt', outdir + '/type_type_kb.txt')
    print 'Start rm feature extraction'
    pipeline(train_json,
             indir + '/brown',
             outdir,
             requireEmType=False,
             isEntityMention=False)
    pipeline_qa(qa_json,
                indir + '/brown',
                outdir + '/feature.map',
                outdir + '/type.txt',
                outdir,
                requireEmType=False,
                isEntityMention=False)
    filter(outdir + '/feature.map', outdir + '/train_x.txt',
           outdir + '/feature.txt', outdir + '/train_x_new.txt',
           feature_freq_threshold)
    filter(outdir + '/feature.map', outdir + '/qa_x.txt',
           outdir + '/feature.txt', outdir + '/qa_x_new.txt',
           feature_freq_threshold)
    pipeline_test(test_json,
                  indir + '/brown',
                  outdir + '/feature.txt',
                  outdir + '/type.txt',
                  outdir,
                  requireEmType=False,
                  isEntityMention=False)

    ### Perform no pruning to generate training data
    print 'Start rm training and test data generation'
    feature_number = get_number(outdir + '/feature.txt')