Ejemplo n.º 1
0
class InterfaceWrapper:
    """Command-line front end that dispatches to the per-stage interface scripts.

    The operation to run is read from the parsed command line (key
    ``"operation"``).  Every stage is launched as a separate
    ``python <_prefix>/<script> <args>`` process via :meth:`invoke`.

    The code is written so that it runs unchanged on Python 2 and Python 3
    and prints exactly what the original Python 2 ``print`` statements did.
    """

    def __init__(self):
        self.arg_obj = ArgParser()

    def show_help(self):
        """Print the usage summary for all supported operations, then exit.

        Raises:
            SystemExit: always (raised by ``sys.exit()``).
        """
        print(":( not enough params")
        print("usage: python run.py -operation <operation_name> <parameters for operation>")
        print("******supported operations******")
        # The explicit `+ " " +` reproduces the separator the Python 2 print
        # statement inserted between comma-separated items.
        print("(1) operation name: sampling" + " "
              + " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>")

        print("(2) operation name: extract_features" + " "
              + " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset  <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")

        print("(3) operation name: modeling" + " "
              + "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>" + " "
              + "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>")

        print("(4) operation name: classification" + " "
              + " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>")

        # sys.exit() instead of the `exit` helper, which the `site` module
        # only installs for interactive use.
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the handler for the requested operation.

        Shows the help text (and exits) when no ``operation`` argument was
        given.
        """
        self.arg_obj.parse(sys.argv)
        banner = '*' * 100
        print(banner)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(banner)

        if "operation" not in self.arg_obj.args:
            self.show_help()

        handlers = (
            (SAMPLING, self.run_sampling),
            (FEATURE_EXTRACTION, self.run_feature_extraction),
            (MODELING, self.run_modeling),
            (CLASSIFICATION, self.run_classification),
            (DEFAULT, self.run_default_flow),
        )
        # Every entry is checked in order (no early break), matching the
        # original sequence of independent `if` statements.
        # NOTE(review): an operation that matches none of the constants is
        # silently ignored -- confirm whether it should show the help text.
        for operation_name, handler in handlers:
            if self.arg_obj.args["operation"] == operation_name:
                handler()

    def _launch_script(self, script_name, stage_label, banner=True):
        """Build the command line for one stage script, echo it and run it.

        Args:
            script_name: file name of the interface script under ``_prefix``.
            stage_label: human-readable stage name used in the log line.
            banner: when true, surround the log line with rows of asterisks.
        """
        _cmd = "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()
        separator = '*' * 100
        if banner:
            print(separator)
        print("run: Invoking " + stage_label + ":\n" + _cmd)
        if banner:
            print(separator)
        self.invoke(_cmd)

    def run_sampling(self):
        """Launch ``sampler_interface.py`` with the current arguments."""
        self._launch_script("sampler_interface.py", "Sampling")

    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py`` with the current arguments."""
        # This stage is logged without the asterisk banner, as before.
        self._launch_script("feature_extraction_interface.py", "Feature Extraction", banner=False)

    def run_modeling(self):
        """Launch ``modeler_interface.py`` with the current arguments."""
        self._launch_script("modeler_interface.py", "Modeling")

    def run_classification(self):
        """Launch ``classification_interface.py`` with the current arguments."""
        self._launch_script("classification_interface.py", "Classification")

    def check_params(self):
        """Show the help text (and exit) unless every default-flow parameter is present.

        The mapper/reducer ``*_params`` arguments are optional and therefore
        not checked.
        """
        required = (
            "fe_mapper",
            "fe_reducer",
            "train_dataset",
            "train_size",
            "test_size",
            "cl_mapper",
            "cl_reducer",
        )
        if any(key not in self.arg_obj.args for key in required):
            self.show_help()

    def run_default_flow(self):
        """Run feature extraction, then modeling, then classification.

        The existence of the directories ``./feature_set_for_modeling`` and
        ``./trained_models`` is used as the signal that the preceding stage
        produced output.
        """
        self.check_params()

        self.run_feature_extraction()

        # Proceed to modeling only if feature extraction left its output directory.
        if os.path.exists("./feature_set_for_modeling"):
            print("Launching modeler with extracted feature set...")
            self.run_modeling()
            # Point the classification stage at the model file named after
            # the training-set size.
            self.arg_obj.args["model"] = "trained_models/" + str(self.arg_obj.args["train_size"]) + "_output.model"
        else:
            print("unable to find the directory 'feature_set_for_modeling'")

        # NOTE(review): classification runs whenever ./trained_models exists,
        # even if the feature-extraction check above failed -- confirm intended.
        if os.path.exists("./trained_models"):
            print("Launching classification with trained model from ./trained_models")
            self.run_classification()
        else:
            print("unable to find the directory 'trained_models'")

    def invoke(self, cmd):
        """Run ``cmd`` through the shell and return the value of ``os.system``."""
        return os.system(cmd)
class FeatureExtractionInterface:
    """Drives the Hadoop-streaming feature extraction job.

    The steps (see :meth:`launch`) are: optional sampling, clearing the HDFS
    input/output directories, uploading the training set, running the
    streaming job and merging its output into ``feature_set_for_modeling/``.

    NOTE(review): ``feature_extraction_mapper``, ``feature_extraction_reducer``
    and ``training_set_size`` are initialised to ``None`` and nothing in this
    class assigns them; presumably the caller sets them before calling
    :meth:`launch` -- confirm.  With the mapper/reducer left as ``None`` the
    job command cannot be built (string + None raises ``TypeError``).
    """

    def __init__(self, data_location=None):
        self.arg_obj = ArgParser()

        self.data_location = data_location
        self.sampler = None
        self.feature_extraction_mapper = None
        self.feature_extraction_reducer = None
        self.mapper_param = None
        self.reducer_param = None
        self.training_set_size = None
        self.test_set_size = None

    def show_help(self):
        """Print the usage line and exit.

        Raises:
            SystemExit: always (raised by ``sys.exit()``).
        """
        # Imported locally so the method does not depend on a module-level
        # `import sys` being present in this file.
        import sys

        print(":( not enough params")
        print("usage: python feature_extraction_interface.py -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")
        sys.exit()

    def check_params(self):
        """Show the help text (and exit) unless all required parameters are present.

        Returns ``None`` when every required key is in ``self.arg_obj.args``;
        otherwise the process exits via :meth:`show_help`.  The mapper/reducer
        ``*_params`` arguments are optional and therefore not checked.
        """
        required = ("fe_mapper", "fe_reducer", "train_dataset", "train_size", "test_size")
        if any(key not in self.arg_obj.args for key in required):
            self.show_help()

    def invoke_sampling(self):
        """Run ``sampler_interface.py`` with the current arguments, then wait 5 s.

        The sampler itself is selected by whatever ``-sampler`` argument is
        contained in ``self.arg_obj.get_string()``.
        """
        self.check_params()

        _cmd = "python " + _prefix + "/" + "sampler_interface.py " + self.arg_obj.get_string()
        os.system(_cmd)
        time.sleep(5)

    def remove_dataset_dir_on_hdfs(self):
        """Delete the job's input directory on HDFS, then wait 5 s."""
        self.check_params()
        _cmd = "hadoop fs -rmr /user/hadoop/feature_extraction_input"
        os.system(_cmd)
        time.sleep(5)

    def remove_output_dir_on_hdfs(self):
        """Delete the job's output directory on HDFS, then wait 5 s."""
        self.check_params()
        _cmd = "hadoop fs -rmr /user/hadoop/feature_extraction_output"
        os.system(_cmd)
        time.sleep(5)

    def load_data_set_on_hdfs(self):
        """Upload ``Train/train_set_w_tags`` to the HDFS input directory, then wait 10 s."""
        self.check_params()
        _cmd = "hadoop fs -put Train/train_set_w_tags /user/hadoop/feature_extraction_input/"
        os.system(_cmd)
        time.sleep(10)

    def _build_feature_extraction_command(self):
        """Return the ``hadoop jar ...`` command line for the streaming job.

        The optional ``mapper_param`` / ``reducer_param`` strings are appended
        inside the quoted ``-mapper`` / ``-reducer`` values when they are set.
        """
        _cmd = "hadoop jar /home/hadoop/contrib/streaming/hadoop-streaming-1.0.3.jar -input /user/hadoop/feature_extraction_input -mapper '" + self.feature_extraction_mapper

        if self.mapper_param is not None:
            _cmd = _cmd + " " + self.mapper_param

        _cmd = _cmd + "' -file " + self.feature_extraction_mapper + " -reducer '" + self.feature_extraction_reducer

        if self.reducer_param is not None:
            _cmd = _cmd + " " + self.reducer_param

        _cmd = _cmd + "' -file " + self.feature_extraction_reducer + " -file glossextractionengine.mod -output /user/hadoop/feature_extraction_output -jobconf mapred.job.name='GlossExtractionEngine:FeatureExtraction'"
        return _cmd

    def start_feature_extraction_job(self):
        """Run the Hadoop-streaming feature extraction job, then wait 5 s."""
        self.check_params()
        print("Launching map-reduce feature extraction task...")

        os.system(self._build_feature_extraction_command())
        time.sleep(5)
        print("feature extraction task completed.")

    def export_output_from_hdfs(self):
        """Merge the job output from HDFS into a local feature-set file.

        The file is ``./feature_set_for_modeling/<training_set_size>_output.txt``;
        a previous file of that name is removed first.
        """
        self.check_params()

        # Create the local output directory directly instead of spawning a
        # shell just to run `mkdir`.
        if not os.path.exists("feature_set_for_modeling"):
            os.mkdir("feature_set_for_modeling")

        output_file = "./feature_set_for_modeling/" + str(self.training_set_size) + "_output.txt"

        # Remove any previous version so it is replaced by the fresh merge.
        if os.path.exists(output_file):
            print("FeatureExtractionInterface: File already exists.. removing it : " + output_file)
            os.remove(output_file)

        _cmd = "hadoop fs -getmerge /user/hadoop/feature_extraction_output " + output_file
        os.system(_cmd)
        print("Saved output[Feature set for modeling] at : feature_set_for_modeling/" + str(self.training_set_size) + "_output.txt")

    def launch(self):
        """Run the whole pipeline: sample (optional), reset HDFS, upload, extract, export."""
        self.check_params()
        # Sampling is only run when a sampler was requested on the command line.
        if "sampler" in self.arg_obj.args:
            self.invoke_sampling()

        self.remove_dataset_dir_on_hdfs()
        self.remove_output_dir_on_hdfs()
        self.load_data_set_on_hdfs()
        self.start_feature_extraction_job()
        self.export_output_from_hdfs()
Ejemplo n.º 3
0
class InterfaceWrapper:
    """Command-line front end that dispatches to the per-stage interface scripts.

    The operation to run is read from the parsed command line (key
    ``"operation"``).  Every stage is launched as a separate
    ``python <_prefix>/<script> <args>`` process via :meth:`invoke`.

    The code is written so that it runs unchanged on Python 2 and Python 3
    and prints exactly what the original Python 2 ``print`` statements did.
    """

    def __init__(self):
        self.arg_obj = ArgParser()

    def show_help(self):
        """Print the usage summary for all supported operations, then exit.

        Raises:
            SystemExit: always (raised by ``sys.exit()``).
        """
        print(":( not enough params")
        print("usage: python run.py -operation <operation_name> <parameters for operation>")
        print("******supported operations******")
        # The explicit `+ " " +` reproduces the separator the Python 2 print
        # statement inserted between comma-separated items.
        print("(1) operation name: sampling" + " "
              + " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>")

        print("(2) operation name: extract_features" + " "
              + " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset  <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")

        print("(3) operation name: modeling" + " "
              + "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>" + " "
              + "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>")

        print("(4) operation name: classification" + " "
              + " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>")

        # sys.exit() instead of the `exit` helper, which the `site` module
        # only installs for interactive use.
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the handler for the requested operation.

        Shows the help text (and exits) when no ``operation`` argument was
        given.
        """
        self.arg_obj.parse(sys.argv)
        banner = '*' * 100
        print(banner)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(banner)

        if "operation" not in self.arg_obj.args:
            self.show_help()

        handlers = (
            (SAMPLING, self.run_sampling),
            (FEATURE_EXTRACTION, self.run_feature_extraction),
            (MODELING, self.run_modeling),
            (CLASSIFICATION, self.run_classification),
            (DEFAULT, self.run_default_flow),
        )
        # Every entry is checked in order (no early break), matching the
        # original sequence of independent `if` statements.
        # NOTE(review): an operation that matches none of the constants is
        # silently ignored -- confirm whether it should show the help text.
        for operation_name, handler in handlers:
            if self.arg_obj.args["operation"] == operation_name:
                handler()

    def _launch_script(self, script_name, stage_label, banner=True):
        """Build the command line for one stage script, echo it and run it.

        Args:
            script_name: file name of the interface script under ``_prefix``.
            stage_label: human-readable stage name used in the log line.
            banner: when true, surround the log line with rows of asterisks.
        """
        _cmd = "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()
        separator = '*' * 100
        if banner:
            print(separator)
        print("run: Invoking " + stage_label + ":\n" + _cmd)
        if banner:
            print(separator)
        self.invoke(_cmd)

    def run_sampling(self):
        """Launch ``sampler_interface.py`` with the current arguments."""
        self._launch_script("sampler_interface.py", "Sampling")

    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py`` with the current arguments."""
        # This stage is logged without the asterisk banner, as before.
        self._launch_script("feature_extraction_interface.py", "Feature Extraction", banner=False)

    def run_modeling(self):
        """Launch ``modeler_interface.py`` with the current arguments."""
        self._launch_script("modeler_interface.py", "Modeling")

    def run_classification(self):
        """Launch ``classification_interface.py`` with the current arguments."""
        self._launch_script("classification_interface.py", "Classification")

    def check_params(self):
        """Show the help text (and exit) unless every default-flow parameter is present.

        The mapper/reducer ``*_params`` arguments are optional and therefore
        not checked.
        """
        required = (
            "fe_mapper",
            "fe_reducer",
            "train_dataset",
            "train_size",
            "test_size",
            "cl_mapper",
            "cl_reducer",
        )
        if any(key not in self.arg_obj.args for key in required):
            self.show_help()

    def run_default_flow(self):
        """Run feature extraction, then modeling, then classification.

        The existence of the directories ``./feature_set_for_modeling`` and
        ``./trained_models`` is used as the signal that the preceding stage
        produced output.
        """
        self.check_params()

        self.run_feature_extraction()

        # Proceed to modeling only if feature extraction left its output directory.
        if os.path.exists("./feature_set_for_modeling"):
            print("Launching modeler with extracted feature set...")
            self.run_modeling()
            # Point the classification stage at the model file named after
            # the training-set size.
            self.arg_obj.args["model"] = "trained_models/" + str(self.arg_obj.args["train_size"]) + "_output.model"
        else:
            print("unable to find the directory 'feature_set_for_modeling'")

        # NOTE(review): classification runs whenever ./trained_models exists,
        # even if the feature-extraction check above failed -- confirm intended.
        if os.path.exists("./trained_models"):
            print("Launching classification with trained model from ./trained_models")
            self.run_classification()
        else:
            print("unable to find the directory 'trained_models'")

    def invoke(self, cmd):
        """Run ``cmd`` through the shell and return the value of ``os.system``."""
        return os.system(cmd)