Example #1
def crossvalidation(folds_folder, number_of_folds, combination_name_indexes,
                    qrels, summary_file):

    torch.multiprocessing.set_start_method("spawn")

    lrs = [0.01, 0.001]
    batch_sizes = [3]
    epochs = [5, 10, 17]
    # epochs = [1]
    momentums = [0.9]
    # dropouts = [0.2,0.5]
    scores = {}
    models = {}
    evaluator = eval(metrics=["map", "ndcg", "P.2", "P.5"])  # project-defined evaluator class, not the Python builtin eval
    test_trec_file = "NN_test_trec_file.txt"
    for fold in range(1, number_of_folds + 1):
        print("in fold:", fold)
        models[fold] = {}
        scores[fold] = {}
        training_folder = folds_folder + str(fold) + "/train/"
        validation_folder = folds_folder + str(fold) + "/validation/"
        test_folder = folds_folder + str(fold) + "/test/"
        validation_results_folder = folds_folder + str(
            fold) + "/validation_results/"
        if not os.path.exists(validation_results_folder):
            os.makedirs(validation_results_folder)
        current_labels_file = "labels_fold_" + str(fold) + ".pkl"
        for lr in lrs:
            for epoch in epochs:
                for momentum in momentums:
                    for batch_size in batch_sizes:
                        model_name = "_".join((str(lr), str(epoch),
                                               str(momentum), str(batch_size)))
                        model, model_file = train_model(
                            lr, momentum, current_labels_file, training_folder,
                            batch_size, epoch, fold)
                        results = predict_folder_content(
                            validation_folder, model)
                        trec_file_name = validation_results_folder + "NN_" + model_name + ".txt"
                        evaluator.create_trec_eval_file_nn(
                            results, combination_name_indexes["val"][fold],
                            trec_file_name)
                        score = evaluator.run_trec_eval(trec_file_name, qrels)
                        scores[fold][model_name] = float(score)
                        models[fold][model_name] = model_file
        best_model = max(scores[fold].items(), key=operator.itemgetter(1))[0]
        print("chosen model on fold", fold, ":", best_model)
        test_model = torch.load(models[fold][best_model])
        results = predict_folder_content(test_folder, test_model)
        evaluator.create_trec_eval_file_nn(
            results, combination_name_indexes["test"][fold], test_trec_file,
            True)
    final_trec_file = evaluator.order_trec_file(test_trec_file)
    run_bash_command("rm " + test_trec_file)
    evaluator.run_trec_eval_on_test(summary_file=summary_file,
                                    qrels=qrels,
                                    method="NN",
                                    trec_file=final_trec_file)
def create_tfidf_features_and_features_file(features_dir, index_path,
                                            sentence_file, top_doc_files,
                                            input_query, past_winners_file,
                                            key):
    query = input_query + key
    command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp indri.jar Main " + index_path + " " + sentence_file + " " + top_doc_files + " " + past_winners_file + " " + query
    print(run_bash_command(command))
    command = "mv doc*_* " + features_dir
    run_bash_command(command)
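The run_bash_command helper used throughout these examples is not part of the listing. A minimal sketch of what it presumably looks like, assuming it simply shells out via subprocess and returns the captured output (the body below is an inference, not the project's source):

import subprocess

def run_bash_command(command):
    # shell=True because callers pass full command strings (pipes, globs, redirects)
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out, _ = process.communicate()
    return out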
Example #3
def run():
    for i in [j / 10 for j in range(11)]:
        run_name1 = str(i)
        command = "rm -r /lv_local/home/sgregory/auto_seo/new_merged_index*"
        run_bash_command(command)
        command = "nohup python pagerank_experiment_platform.py 1 " + run_name1 + " &"
        run_bash_command(command)
        while True:
            if os.path.isfile("stop.stop_1" + run_name1.replace(".", "")):
                break
def merge_indexes_for_experiments(index1, index2, merged_index):
    if os.path.isdir(merged_index):
        print("merged index exists, deleting the index")
        run_bash_command("rm -r " + merged_index)
        print("deletion of old merged index is done")
    command = '/home/greg/indri_test/bin/dumpindex ' + merged_index + ' merge ' + index1 + ' ' + index2
    print("merging command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("merging out command:", out)
    return merged_index
    def run_model(self, test_file, fold, trees, leaves,
                  model_path):  # TODO: add to main functionality + test file
        # score_file = self.code_base_path+"lm_scores/"+str(fold)+"/score" + str(trees)+"_"+str(leaves)
        score_file = "lm_score/" + str(fold) + "/score" + str(
            trees) + "_" + str(leaves)
        if not os.path.exists(os.path.dirname(score_file)):
            os.makedirs(os.path.dirname(score_file))
        run_bash_command('touch ' + score_file)
        command = self.java_path + " -jar " + self.jar_path + " -load " + model_path + " -rank " + test_file + " -score " + score_file
        run_bash_command(command)
        return score_file
def run_model(test_file, run_name=""):
    java_path = "/home/greg/jdk1.8.0_181/bin/java"
    jar_path = "/home/greg/SEO_CODE/model_running/RankLib.jar"
    score_file = "scores_winners/scores_of_seo_run" + run_name
    if not os.path.exists("scores_winners/"):
        os.makedirs("scores_winners/")
    features = test_file
    model_path = "/home/greg/auto_seo/CrossValidationUtils/model_bot_group"
    run_bash_command('touch ' + score_file)
    command = java_path + " -jar " + jar_path + " -load " + model_path + " -rank " + features + " -score " + score_file
    out = run_bash_command(command)
    print(out)
    return score_file
def create_features(reference_docs, past_winners_file_index, doc_ids_file,
                    index_path, top_docs, doc_text):
    print("loading w2v model")
    model = load_model()
    print("loading done")
    final_features_dir = "sentence_feature_files_test/"
    features_file = final_features_dir + "new_data_sentence_features_test"
    features_dir = "sentence_feature_values_test/"
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)
    if not os.path.exists(final_features_dir):
        os.makedirs(final_features_dir)
    total_working_set_file = "total_working_set_file_test"
    run_bash_command("touch " + total_working_set_file)
    for key in reference_docs:
        past_winners_file = past_winners_file_index[key]
        for query in reference_docs[key]:
            print("working on", query)
            doc = reference_docs[key][query]
            print("working on", doc)
            print("top_doc_file is created")
            top_docs_file = create_top_docs_per_ref_doc(
                top_docs, key, doc, query)
            sentence_file_name, sentences_index = create_sentence_file(
                top_docs_file, doc, query, key, doc_text)
            if len(sentences_index[query + key][doc]) < 2:
                continue
            print("sentence_file is created")
            working_set_file = create_sentence_working_set(
                doc, sentence_file_name, query, key)
            run_bash_command("cat " + working_set_file + " >> " +
                             total_working_set_file)
            print("sentence working-set is created")
            create_w2v_features(sentence_file_name, top_docs_file,
                                doc_ids_file, past_winners_file, model, query,
                                key)
            print("created seo w2v features")
            # create_coherency_features(sentences_index,doc,query,model,key)
            # print("created coherency features")
            create_tfidf_features_and_features_file(features_dir, index_path,
                                                    sentence_file_name,
                                                    top_docs_file, query,
                                                    past_winners_file, key)
            print("created tf-idf features")
    print("creating all features")
    create_features_from_dir(features_dir, features_file,
                             total_working_set_file)
    return features_file
Example #8
    def check(self):
        """Check pipeline run results."""
        if self._run_pipeline:
            ###### Monitor Job ######
            try:
                start_time = datetime.now()
                response = self._client.wait_for_run_completion(
                    self._run_id, self._test_timeout
                )
                succ = (response.run.status.lower() == 'succeeded')
                end_time = datetime.now()
                elapsed_time = (end_time - start_time).seconds
                utils.add_junit_test(
                    self._test_cases, 'job completion', succ,
                    'waiting for job completion failure', elapsed_time
                )
            finally:
                ###### Output Argo Log for Debugging ######
                workflow_json = self._client._get_workflow_json(self._run_id)
                workflow_id = workflow_json['metadata']['name']
                print("Argo Workflow Name: ", workflow_id)
                argo_log, _ = utils.run_bash_command(
                    'argo logs {} -n {}'.format(
                        workflow_id, self._namespace
                    )
                )
                print('=========Argo Workflow Log=========')
                print(argo_log)

            if not succ:
                utils.write_junit_xml(
                    self._test_name, self._result, self._test_cases
                )
                exit(1)

            ###### Validate the results for specific test cases ######
            if self._testname == 'xgboost_training_cm':
                # For xgboost sample, check its confusion matrix.
                cm_tar_path = './confusion_matrix.tar.gz'
                utils.get_artifact_in_minio(
                    workflow_json, 'confusion-matrix', cm_tar_path,
                    'mlpipeline-ui-metadata'
                )
                with tarfile.open(cm_tar_path) as tar_handle:
                    file_handles = tar_handle.getmembers()
                    assert len(file_handles) == 1

                    with tar_handle.extractfile(file_handles[0]) as f:
                        cm_data = f.read()
                        utils.add_junit_test(
                            self._test_cases, 'confusion matrix format',
                            (len(cm_data) > 0),
                            'the confusion matrix file is empty'
                        )

        ###### Delete Job ######
        #TODO: add deletion when the backend API offers the interface.

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(self._test_name, self._result, self._test_cases)
def merge_indices(new_index, new_index_name):
    path_to_folder = '/lv_local/home/sgregory/Bots/'
    command = '/lv_local/home/sgregory/indri/bin/dumpindex ' + new_index_name + ' merge ' + new_index + ' ' + '/lv_local/home/sgregory/cluewebindex'
    print("merging command:", command)
    out = run_bash_command(command)
    print("merging out command:", out)
    return new_index_name
def delete_doc_from_index(index, doc, dic, run_name=""):
    did = dic[doc]
    command = '/lv_local/home/sgregory/indri_test/bin/dumpindex ' + index + ' delete ' + did
    print("deleting command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("deleting out command:", out)
def create_features_file_original(features_dir,
                                  index_path,
                                  queries_file,
                                  new_features_file,
                                  run_name=""):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    command = "/home/greg/auto_seo/past_winners/LTRFeatures " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=/home/greg/auto_seo/SentenceRanking/working_set' + run_name + ' -workingSetFormat=trec'
    print(command)
    out = run_bash_command(command)
    print(out)
    # command='/home/greg/auto_seo/past_winners/Cent ' + queries_file + ' -index=' + index_path + ' -useWorkingSet=true -workingSetFile=/home/greg/auto_seo/SentenceRanking/working_set'+run_name + ' -workingSetFormat=trec'
    # print(command)
    # out = run_bash_command(command)
    # print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl /home/greg/auto_seo/past_winners/generate.pl " + features_dir + " /home/greg/auto_seo/SentenceRanking/working_set" + run_name
    print(command)
    out = run_bash_command(command)
    print(out)
    command = "mv features " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)
Example #12
    def run_svm_light_model(self, test_file, model_file, fold):
        predictions_folder = "svm_light_score/" + str(fold) + "/"
        if not os.path.exists(predictions_folder):
            os.makedirs(predictions_folder)
        predictions_file = predictions_folder + os.path.basename(model_file)
        command = "./svm-predict " + test_file + " " + model_file + " " + predictions_file
        out = run_bash_command(command)
        print(out)
        return predictions_file
Example #13
    def learn_svm_rank_model(self, train_file, fold, C):
        models_folder = "svm_rank_models/" + str(fold) + "/"
        if not os.path.exists(models_folder):
            os.makedirs(models_folder)
        model_file = models_folder + "model_" + str(C) + ".txt"
        command = "./svm_rank_learn -c " + str(C) + " " + train_file + " " + model_file
        out = run_bash_command(command)
        # print(out)
        return model_file
Example #14
    def check(self):
        """ Check the pipeline running results of the notebook sample. """
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(
            test_cases, 'test script execution', (self._exit_code == '0'),
            'test script failure with exit code: ' + self._exit_code)

        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileNotFoundError('Default config not found:{}'.format(ose))
        else:
            test_timeout = raw_args['test_timeout']

        if self._run_pipeline:
            experiment = self._experiment_name
            ###### Initialization ######
            client = Client(host=self._host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(
                experiment_name=experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion', succ,
                                     'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                print("Argo Workflow Name: ", workflow_id)
                argo_log, _ = utils.run_bash_command(
                    'argo logs {} -n {}'.format(workflow_id, self._namespace))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
Example #15
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    params = {}
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Example #16
    def learn_svm_light_model(self, train_file, fold, C, number_of_queries):
        models_folder = "svm_light_models/" + str(fold) + "/"
        if not os.path.exists(models_folder):
            os.makedirs(models_folder)
        model_file = models_folder + "model_" + str(C) + ".txt"
        # command = "./svm_learn -z p -c " + str(C) + " -m 70000 " + train_file + " " + model_file
        command = "./svm-train -c " + str(C) + " " + train_file + " " + model_file
        out = run_bash_command(command)
        print(out)
        return model_file
def init_top_doc_vectors(top_docs, doc_ids, model):
    top_docs_vectors = {}
    for query in top_docs:
        docs = top_docs[query]
        command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp /home/greg/auto_seo/scripts/indri.jar DocStems ~/mergedindex \"" + " ".join([doc_ids[d.rstrip()].strip() for d in docs]) + "\""
        print(command)
        print(run_bash_command(command))
        top_docs_vectors[query] = []
        # read docsForVectors (presumably produced by the DocStems call above); the file
        # handle gets its own name so it does not shadow the docs list above
        with open("/home/greg/auto_seo/SentenceRanking/docsForVectors") as doc_file:
            for doc in doc_file:
                top_docs_vectors[query].append(get_document_vector(doc, model))
    return top_docs_vectors
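The get_document_vector helper called above is also not shown in the listing. A plausible sketch, under the assumption that model is a gensim Word2Vec model and that a document vector is the average of the word vectors of its in-vocabulary terms:

import numpy as np

def get_document_vector(doc_text, model):
    # assumed behavior: mean of the word2vec vectors of the document's known terms
    terms = doc_text.split()
    vectors = [model.wv[term] for term in terms if term in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)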
    def create_model_LambdaMart(self,
                                number_of_trees,
                                number_of_leaves,
                                train_file,
                                fold,
                                test=False):

        if test:
            add = "test"
        else:
            add = ""
        # model_path = self.model_base_path+str(fold) +"/" + add +'model_' + str(number_of_trees) + "_" + str(number_of_leaves)
        model_path = "lm_models/" + str(fold) + "/" + add + 'model_' + str(
            number_of_trees) + "_" + str(number_of_leaves)

        if not os.path.exists(os.path.dirname(model_path)):
            os.makedirs(os.path.dirname(model_path))
        command = self.java_path + ' -jar ' + self.jar_path + ' -train ' + train_file + \
                  ' -ranker 6 -metric2t NDCG@5 -tree ' + str(number_of_trees) + \
                  ' -leaf ' + str(number_of_leaves) + ' -save ' + model_path
        print("command = ", command)
        run_bash_command(command)
        return model_path
Example #19
def create_tfidf_features_and_features_file(sentence_working_set,
                                            features_file, features_dir,
                                            index_path, sentence_file,
                                            top_doc_files, query,
                                            past_winners_file):
    command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp indri.jar Main " + index_path + " " + sentence_file + " " + top_doc_files + " " + past_winners_file + " " + query
    print(run_bash_command(command))
    command = "mv doc*_* " + features_dir
    run_bash_command(command)
    command = "perl " + params.sentence_feature_creator + " " + features_dir + " " + sentence_working_set
    run_bash_command(command)
    command = "mv features " + features_file
    run_bash_command(command)
def merge_indices(new_index, run_name="", new_index_name=""):
    path_to_folder = '/home/greg/auto_seo'
    if new_index_name == "":
        new_index_name = path_to_folder + '/new_merged_index' + run_name
    # print("deleting old merged index repository")
    # command = "rm -r "+path_to_folder+'/new_merged_index*'
    # print("delete command = ",command)
    # run_bash_command(command)
    # print("delete finished")
    command = '/home/greg/indri_test/bin/dumpindex ' + new_index_name + ' merge ' + new_index + ' ' + params.corpus_path_56
    print("merging command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("merging out command:", out)
    # run_command(command)
    return new_index_name
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(
        test_cases, 'test script execution', (args.exit_code == '0'),
        'test script failure with exit code: ' + args.exit_code)

    if args.experiment is not None:
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=args.experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=1000,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, 1200)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, args.result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
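The parse_arguments helper these test drivers call is not included in the listing either. A rough sketch of what it might define, where every flag name is an assumption inferred from the attributes the code reads (args.input, args.result, args.output, args.namespace, args.testname, args.experiment, args.exit_code):

import argparse

def parse_arguments():
    # hypothetical argument set; only the attribute names are taken from the examples above
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='path of the generated pipeline yaml')
    parser.add_argument('--result', help='path of the junit xml result file')
    parser.add_argument('--output', help='output location for the pipeline run')
    parser.add_argument('--namespace', default='kubeflow', help='cluster namespace to use')
    parser.add_argument('--testname', help='name of the sample under test')
    parser.add_argument('--experiment', default=None, help='experiment whose runs are checked')
    parser.add_argument('--exit-code', dest='exit_code', help='exit code of the sample test script')
    return parser.parse_args()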
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Get experiments ######
    list_experiments_response = client.list_experiments(page_size=100)
    for experiment in list_experiments_response.experiments:
        if experiment.name == args.experiment:
            experiment_id = experiment.id

    ###### Get runs ######
    import kfp_run
    resource_reference_key_type = kfp_run.models.api_resource_type.ApiResourceType.EXPERIMENT
    resource_reference_key_id = experiment_id
    list_runs_response = client.list_runs(
        page_size=1000,
        resource_reference_key_type=resource_reference_key_type,
        resource_reference_key_id=resource_reference_key_id)

    ###### Check all runs ######
    for run in list_runs_response.runs:
        run_id = run.id
        response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure')

        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

        if not succ:
            utils.write_junit_xml(test_name, args.result, test_cases)
            exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def add_docs_to_index(index, run_name=""):
    """
    Parse the trectext file given, and create an index.
    """
    path_to_folder = '/lv_local/home/sgregory/auto_seo'
    indri_build_index = '/lv_local/home/sgregory/indri_test/bin/IndriBuildIndex'
    corpus_path = params.new_trec_text_file + run_name
    corpus_class = 'trectext'
    memory = '1G'
    stemmer = 'krovetz'
    os.popen('mkdir -p ' + path_to_folder)
    if not os.path.exists(path_to_folder + "/index/"):
        os.makedirs(path_to_folder + "/index/")
    command = indri_build_index + ' -corpus.path=' + corpus_path + ' -corpus.class=' + corpus_class + ' -index=' + index + ' -memory=' + memory + ' -stemmer.name=' + stemmer
    print(command)
    out = run_bash_command(command)
    print(out)
    return index
Example #24
    def check(self):
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(test_cases, 'test script execution',
                             (self._exit_code == '0'),
                             'test script failure with exit code: '
                             + self._exit_code)

        if self._experiment is not None:  # Bypassing dsl type check sample.
            ###### Initialization ######
            host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
            client = Client(host=host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(experiment_name=self._experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=_RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, _TEST_TIMEOUT)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion',
                                     succ, 'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace, workflow_id))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
def create_index(trec_text_file, run_name=""):
    """
    Parse the trectext file given, and create an index.
    """
    path_to_folder = '/home/greg/auto_seo'
    indri_build_index = '/home/greg/indri_test/bin/IndriBuildIndex'
    corpus_path = trec_text_file
    corpus_class = 'trectext'
    memory = '1G'
    index = path_to_folder + "/index/new_index" + run_name
    stemmer = 'krovetz'
    os.popen('mkdir -p ' + path_to_folder)
    if not os.path.isdir(path_to_folder + "/index/"):
        os.makedirs(path_to_folder + "/index/")
    command = indri_build_index + ' -corpus.path=' + corpus_path + ' -corpus.class=' + corpus_class + ' -index=' + index + ' -memory=' + memory + ' -stemmer.name=' + stemmer
    print(command)
    out = run_bash_command(command)
    print(out)
    return index
    def run_model_on_test(self, test_file, fold):
        trees, leaves = self.chosen_model_per_fold[fold]
        score_file = "lm_score/" + str(fold) + "/score" + str(
            trees) + "_" + str(leaves)
        if not os.path.exists(os.path.dirname(score_file)):
            os.makedirs(os.path.dirname(score_file))
        run_bash_command('touch ' + score_file)

        model_path = "lm_models/" + str(fold) + "/model_" + str(
            trees) + "_" + str(leaves)
        command = self.java_path + " -jar " + self.jar_path + " -load " + model_path + " -rank " + test_file + " -score " + score_file
        run_bash_command(command)
        return score_file
Example #27
def create_features_file_sentence_exp(features_dir, index_path, queries_file,
                                      new_features_file, working_set):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    command = params.ltr_features_script + " " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=' + working_set + ' -workingSetFormat=trec'
    print(command)
    out = run_bash_command(command)
    print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl " + params.features_generator_script_path + " " + features_dir + " " + working_set
    print(command)
    out = run_bash_command(command)
    print(out)
    command = "mv features_ " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)
def create_features_file(features_dir, index_path, queries_file,
                         new_features_file, add_remove_file, run_name,
                         working_set):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    # command= params.ltr_features_script+" "+ queries_file + ' -stream=doc -index=' + index_path + ' -repository='+ index_path +' -useWorkingSet=true -workingSetFile='+ params.working_set_file+run_name + ' -workingSetFormat=trec'
    command = " java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp /home/greg/auto_seo/scripts/indri.jar LTRFeaturesCreator " + add_remove_file + " " + working_set + " docIDs"
    print(command)
    out = run_bash_command(command)
    print(out)
    # command=params.cent_script+' ' + queries_file + ' -index=' + index_path + ' -useWorkingSet=true -workingSetFile='+ params.working_set_file+run_name + ' -workingSetFormat=trec'
    # print(command)
    # out = run_bash_command(command)
    # print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl " + params.features_generator_script_path + " " + features_dir + " " + working_set + " " + run_name
    print(command)
    out = run_bash_command(command)
    print(out)
Example #29
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Kubeflow Sample Test'

    ###### Initialization ######
    client = Client()

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'kubeflow sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'kubeflow_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/flower/eval15.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/flower/train30.csv',
        'hidden-layer-size': '10,5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    # TODO: remove the namespace dependency or make it configurable.
    argo_log, _ = utils.run_bash_command(
        'argo logs -n kubeflow -w {}'.format(workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    #   confusion matrix should show three columns for the flower data
    #     target, predicted, count
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusionmatrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = json.load(f)
        utils.add_junit_test(
            test_cases, 'confusion matrix format',
            (len(cm_data['outputs'][0]['schema']) == 3),
            'the column number of the confusion matrix output is not equal to three'
        )

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def move_feature_file(feature_file, run_name):
    command = 'mv ' + feature_file + ' ' + feature_file + run_name
    run_bash_command(command)
    print("feature file moved")