def __init__(self, data_location=None):
        """Set up the argument parser and start with every job setting unset.

        data_location is stored exactly as given (None by default); the
        sampler, mapper/reducer scripts, their parameters and the data set
        sizes all start as None.
        """
        self.arg_obj = ArgParser()
        self.data_location = data_location

        # nothing has been configured yet
        self.sampler = None
        self.feature_extraction_mapper = self.feature_extraction_reducer = None
        self.mapper_param = self.reducer_param = None
        self.training_set_size = self.test_set_size = None
 def __init__(self, data_location=None, model_file=None):
     """Set up the argument parser and start with every job setting unset.

     data_location (the local location of the test dataset) and model_file
     are stored exactly as given; the classification mapper/reducer scripts
     and their parameters all start as None.
     """
     self.arg_obj = ArgParser()
     self.data_location, self.model_file = data_location, model_file

     # nothing has been configured yet
     self.classification_mapper = self.classification_reducer = None
     self.mapper_param = self.reducer_param = None
class SamplingInterface:
    """Command-line front end that runs a dynamically loaded sampler.

    The parsed command line must provide every key listed in
    ``_REQUIRED_PARAMS``; see ``show_help`` for the usage text.
    """

    # keys that must all be present in the parsed arguments
    _REQUIRED_PARAMS = ("sampler", "positive", "negative", "train_size",
                        "test_size")

    def __init__(self):
        self.arg_obj = ArgParser()

    # method to show help message
    def show_help(self):
        """Print the usage text and terminate the process (raises SystemExit)."""
        print(":( not enough params")
        print(" usage: python sample_interface.py -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>")
        # sys.exit() rather than exit(): the latter is an interactive-shell
        # helper injected by the site module and is not always available
        sys.exit()

    # method that checks if required parameters are there or not
    def check_params(self):
        """Verify that every required parameter was supplied.

        Returns True when all required parameters are present.  When any is
        missing the help text is shown and the process exits, so this never
        returns False.
        """
        print("sampler checking: %s" % (self.arg_obj.args,))
        if any(key not in self.arg_obj.args for key in self._REQUIRED_PARAMS):
            self.show_help()
        else:
            return True

    def sample(self):
        """Parse ``sys.argv``, load the requested sampler and generate datasets.

        The sampler class named by ``-sampler`` is loaded through
        DynamicClassLoader, instantiated with the positive and negative
        source files, and asked to generate the datasets with the given
        train/test sizes (passed through exactly as parsed).
        """
        self.arg_obj.parse(sys.argv)
        # check if required parameters are provided (exits when they are not)
        self.check_params()

        args = self.arg_obj.args
        sampler_implementation = args["sampler"]
        positive_source_file = args["positive"]
        negative_source_file = args["negative"]
        train_set_size = args["train_size"]
        test_set_size = args["test_size"]

        # this dynamically loads the concrete implementation of sampler
        sampler_class = DynamicClassLoader().load(sampler_implementation)
        print("SamplingInterface:Instantiating: %s" % (sampler_class,))

        # instantiating the sampler
        sampler = sampler_class(positive_source_file, negative_source_file)
        sampler.generateDatasets(train_set_size, test_set_size)

        print("SamplingStub: sampling done.")
class ModelingStub:
    """Command-line front end that trains and saves POSContextSequenceModeler models.

    Accepts either a single feature set file (together with ``-model_name``)
    or a directory whose ``.txt`` files are each turned into a model.
    """

    def __init__(self):
        self.arg_obj = ArgParser()

    # method that checks if required parameters are there or not
    def check_params(self):
        """Return True when ``feature_set_location`` was supplied, else False."""
        print("modeler checking: %s" % (self.arg_obj.args,))
        return "feature_set_location" in self.arg_obj.args

    # method to show error message
    def show_help(self):
        """Print the usage text and terminate the process (raises SystemExit)."""
        print(":( not enough params")
        print("usage: python modeler_interface.py -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>")
        print("==or==")
        print("usage: python modeler_interface.py -feature_set_location <feature_set_location_directory>")
        # sys.exit() rather than exit(): the latter is an interactive-shell
        # helper injected by the site module and is not always available
        sys.exit()

    # start the modeling using the feature set
    def model(self):
        """Parse ``sys.argv`` and train/save model(s) from the feature set location.

        * file: requires ``-model_name``; trains one model and saves it under
          that name in ``trained_models``.
        * directory: trains one model per ``.txt`` file; each model is named
          after the part of the file name before its first dot, plus
          ``.model``.
        * anything else: reports the problem instead of silently doing nothing.

        Missing mandatory arguments show the help text and exit.
        """
        self.arg_obj.parse(sys.argv)

        if not self.check_params():
            self.show_help()

        args = self.arg_obj.args
        location = args["feature_set_location"]

        if os.path.isfile(location):
            # given a file path, the model name to save as is mandatory
            if "model_name" not in args:
                self.show_help()

            modeler = POSContextSequenceModeler(feature_set_location=location)
            modeler.train()
            modeler.save_model(name=args["model_name"], location="trained_models")

            print("ModelingStub: modeling done for given feature set file.")

        elif os.path.isdir(location):
            print("ModelingStub: looking into feature set directory...")

            for file_name in os.listdir(location):
                # only feature set files with .txt extension are modeled
                if not file_name.endswith(".txt"):
                    continue

                model_name = file_name.split(".")[0] + ".model"

                modeler = POSContextSequenceModeler(
                    feature_set_location=os.path.join(location, file_name))
                modeler.train()
                print("ModelingStub: trained the model.about to save.")
                modeler.save_model(name=model_name, location="trained_models")
                print("ModelingStub: modeling done for: %s" % (file_name,))

            print("ModelingStub: modeling done for all files in directory provided.")

        else:
            print("ModelingStub: feature set location is neither a file nor a directory: %s" % (location,))
 def __init__(self):
     """Create the ArgParser instance this object keeps in ``self.arg_obj``."""
     self.arg_obj = ArgParser()
Example #6
0
 def __init__(self):
     """Create the ArgParser instance this object keeps in ``self.arg_obj``."""
     self.arg_obj = ArgParser()
Example #7
0
class ModelingStub:
    """Command-line front end that trains and saves POSContextSequenceModeler models.

    Accepts either a single feature set file (together with ``-model_name``)
    or a directory whose ``.txt`` files are each turned into a model.
    """

    def __init__(self):
        self.arg_obj = ArgParser()

    # method that checks if required parameters are there or not
    def check_params(self):
        """Return True when ``feature_set_location`` was supplied, else False."""
        print("modeler checking: %s" % (self.arg_obj.args,))
        return "feature_set_location" in self.arg_obj.args

    # method to show error message
    def show_help(self):
        """Print the usage text and terminate the process (raises SystemExit)."""
        print(":( not enough params")
        print("usage: python modeler_interface.py -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>")
        print("==or==")
        print("usage: python modeler_interface.py -feature_set_location <feature_set_location_directory>")
        # sys.exit() rather than exit(): the latter is an interactive-shell
        # helper injected by the site module and is not always available
        sys.exit()

    # start the modeling using the feature set
    def model(self):
        """Parse ``sys.argv`` and train/save model(s) from the feature set location.

        * file: requires ``-model_name``; trains one model and saves it under
          that name in ``trained_models``.
        * directory: trains one model per ``.txt`` file; each model is named
          after the part of the file name before its first dot, plus
          ``.model``.
        * anything else: reports the problem instead of silently doing nothing.

        Missing mandatory arguments show the help text and exit.
        """
        self.arg_obj.parse(sys.argv)

        if not self.check_params():
            self.show_help()

        args = self.arg_obj.args
        location = args["feature_set_location"]

        if os.path.isfile(location):
            # given a file path, the model name to save as is mandatory
            if "model_name" not in args:
                self.show_help()

            modeler = POSContextSequenceModeler(feature_set_location=location)
            modeler.train()
            modeler.save_model(name=args["model_name"],
                               location="trained_models")

            print("ModelingStub: modeling done for given feature set file.")

        elif os.path.isdir(location):
            print("ModelingStub: looking into feature set directory...")

            for file_name in os.listdir(location):
                # only feature set files with .txt extension are modeled
                if not file_name.endswith(".txt"):
                    continue

                model_name = file_name.split(".")[0] + ".model"

                modeler = POSContextSequenceModeler(
                    feature_set_location=os.path.join(location, file_name))
                modeler.train()
                print("ModelingStub: trained the model.about to save.")
                modeler.save_model(name=model_name,
                                   location="trained_models")
                print("ModelingStub: modeling done for: %s" % (file_name,))

            print("ModelingStub: modeling done for all files in directory provided.")

        else:
            print("ModelingStub: feature set location is neither a file nor a directory: %s" % (location,))
class FeatureExtractionInterface:
    """Drives the Hadoop streaming feature extraction job from the command line.

    ``launch`` runs the whole sequence: optional sampling, clearing the HDFS
    input/output directories, uploading the data set, running the streaming
    job and exporting the merged result to ``feature_set_for_modeling``.
    The mapper/reducer attributes are None after construction and are
    expected to be filled in by the caller before the job is started.
    """

    # keys that must all be present in the parsed arguments;
    # mapper and reducer params are optional and therefore not listed
    _REQUIRED_PARAMS = ("fe_mapper", "fe_reducer", "train_dataset",
                        "train_size", "test_size")

    def __init__(self, data_location=None):
        self.arg_obj = ArgParser()

        self.data_location = data_location
        self.sampler = None
        self.feature_extraction_mapper = None
        self.feature_extraction_reducer = None
        self.mapper_param = None
        self.reducer_param = None
        self.training_set_size = None
        self.test_set_size = None

    # method to show help message
    def show_help(self):
        """Print the usage text and terminate the process (raises SystemExit)."""
        print(":( not enough params")
        print("usage: python feature_extraction_interface.py -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")
        # raise SystemExit directly rather than calling exit(): the latter is
        # an interactive-shell helper injected by the site module
        raise SystemExit()

    # method that checks if required parameters are there or not
    def check_params(self):
        """Show the help text and exit unless every required parameter is present.

        Returns None when all required parameters were supplied.
        """
        if any(key not in self.arg_obj.args for key in self._REQUIRED_PARAMS):
            self.show_help()

    # method that invokes sampling- it assumes that positive instances file is named 'positive_instances' and negative instances file is named 'negative_instances'
    def invoke_sampling(self):
        """Run ``sampler_interface.py`` with the full argument string."""
        self.check_params()
        # NOTE(review): the original picked a default sampler
        # ("lib.sampler.random_sampler.RandomSampler") when -sampler was
        # absent, but never used the value; the sampler script only ever
        # receives the raw argument string below.
        _cmd = "python " + _prefix + "/" + "sampler_interface.py " + self.arg_obj.get_string()
        os.system(_cmd)
        # pause before the next step -- presumably to let the sampler's
        # output settle; TODO confirm
        time.sleep(5)

    # method that removes the dataset directory on HDFS
    def remove_dataset_dir_on_hdfs(self):
        """Delete the feature extraction input directory on HDFS."""
        self.check_params()
        os.system("hadoop fs -rmr /user/hadoop/feature_extraction_input")
        time.sleep(5)

    # method that removes the output directory on HDFS
    def remove_output_dir_on_hdfs(self):
        """Delete the feature extraction output directory on HDFS."""
        self.check_params()
        os.system("hadoop fs -rmr /user/hadoop/feature_extraction_output")
        time.sleep(5)

    # method that loads the dataset into HDFS
    def load_data_set_on_hdfs(self):
        """Upload ``Train/train_set_w_tags`` into the HDFS input directory."""
        self.check_params()
        os.system("hadoop fs -put Train/train_set_w_tags /user/hadoop/feature_extraction_input/")
        time.sleep(10)

    # method that starts the feature extraction job
    def start_feature_extraction_job(self):
        """Build and run the Hadoop streaming command for feature extraction.

        The mapper and reducer scripts are each quoted together with their
        optional parameter string and shipped to the job with ``-file``.
        """
        self.check_params()
        print("Launching map-reduce feature extraction task...")

        _cmd = "hadoop jar /home/hadoop/contrib/streaming/hadoop-streaming-1.0.3.jar -input /user/hadoop/feature_extraction_input -mapper '" + self.feature_extraction_mapper

        # use parameters for mapper job if they are provided
        if self.mapper_param is not None:
            _cmd = _cmd + " " + self.mapper_param

        _cmd = _cmd + "' -file " + self.feature_extraction_mapper + " -reducer '" + self.feature_extraction_reducer

        # use parameters for reducer job if they are provided
        if self.reducer_param is not None:
            _cmd = _cmd + " " + self.reducer_param

        _cmd = _cmd + "' -file " + self.feature_extraction_reducer + " -file glossextractionengine.mod -output /user/hadoop/feature_extraction_output -jobconf mapred.job.name='GlossExtractionEngine:FeatureExtraction'"

        os.system(_cmd)
        time.sleep(5)
        print("feature extraction task completed.")

    # method that exports the result of feature extraction from HDFS to local file system
    def export_output_from_hdfs(self):
        """Merge the job output from HDFS into ``feature_set_for_modeling``.

        The local file is named ``<training_set_size>_output.txt``; a
        previous file with that name is removed first.
        """
        self.check_params()

        # create the output directory for the feature extraction job
        if not os.path.exists("feature_set_for_modeling"):
            os.mkdir("feature_set_for_modeling")

        output_name = str(self.training_set_size) + "_output.txt"
        output_path = "./feature_set_for_modeling/" + output_name

        # remove previous version of the feature set file in the output directory
        if os.path.exists(output_path):
            print("FeatureExtractionInterface: File already exists.. removing it : %s" % output_path)
            os.remove(output_path)

        # get the merged output from HDFS
        os.system("hadoop fs -getmerge /user/hadoop/feature_extraction_output " + output_path)
        print("Saved output[Feature set for modeling] at : feature_set_for_modeling/" + output_name)

    # method to perform sequence of operations before launching a map-reduce job for feature extraction
    def launch(self):
        """Run the complete feature extraction sequence.

        Sampling is only invoked when ``-sampler`` was supplied.
        """
        self.check_params()
        if "sampler" in self.arg_obj.args:
            # interact with sampling interface for sampling
            self.invoke_sampling()

        self.remove_dataset_dir_on_hdfs()
        self.remove_output_dir_on_hdfs()
        self.load_data_set_on_hdfs()
        # start the feature extraction job
        self.start_feature_extraction_job()
        self.export_output_from_hdfs()
Example #9
0
class InterfaceWrapper:
    """Entry point that dispatches ``-operation`` to the matching interface script.

    Every operation is executed by launching the corresponding
    ``*_interface.py`` script with the full argument string.
    """

    # keys required for the default flow; mapper and reducer params are
    # optional and therefore not listed
    _DEFAULT_FLOW_PARAMS = ("fe_mapper", "fe_reducer", "train_dataset",
                            "train_size", "test_size", "cl_mapper",
                            "cl_reducer")

    def __init__(self):
        self.arg_obj = ArgParser()

    # method to show help message
    def show_help(self):
        """Print the usage text for every operation and terminate the process."""
        def emit(*parts):
            # parts are joined with one space, matching the output of the
            # original comma-separated print statements
            print(" ".join(parts))

        emit(":( not enough params")
        emit("usage: python run.py -operation <operation_name> <parameters for operation>")
        emit("******supported operations******")
        emit("(1) operation name: sampling", " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>")

        emit("(2) operation name: extract_features", " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset  <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")

        emit("(3) operation name: modeling", "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>", "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>")

        emit("(4) operation name: classification", " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>")

        # sys.exit() rather than exit(): the latter is an interactive-shell
        # helper injected by the site module and is not always available
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the requested operation.

        Shows the help text and exits when ``-operation`` is missing or is
        not one of the supported operations.
        """
        self.arg_obj.parse(sys.argv)
        banner = '*' * 100
        print(banner)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(banner)

        if "operation" not in self.arg_obj.args:
            self.show_help()

        operation = self.arg_obj.args["operation"]

        if operation == SAMPLING:
            self.run_sampling()
        elif operation == FEATURE_EXTRACTION:
            self.run_feature_extraction()
        elif operation == MODELING:
            self.run_modeling()
        elif operation == CLASSIFICATION:
            self.run_classification()
        elif operation == DEFAULT:
            self.run_default_flow()
        else:
            # previously an unknown operation was silently ignored
            print("run: unsupported operation: %s" % (operation,))
            self.show_help()

    def _script_command(self, script_name):
        """Return the shell command that runs script_name with all arguments."""
        return "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()

    # method to run sampling
    def run_sampling(self):
        """Launch ``sampler_interface.py``."""
        _cmd = self._script_command("sampler_interface.py")
        print('*' * 100)
        print("run: Invoking Sampling:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to run feature extraction
    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py``."""
        _cmd = self._script_command("feature_extraction_interface.py")
        print("run: Invoking Feature Extraction:\n%s" % (_cmd,))
        self.invoke(_cmd)

    # method to run modeling
    def run_modeling(self):
        """Launch ``modeler_interface.py``."""
        _cmd = self._script_command("modeler_interface.py")
        print('*' * 100)
        print("run: Invoking Modeling:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to run the classification
    def run_classification(self):
        """Launch ``classification_interface.py``."""
        _cmd = self._script_command("classification_interface.py")
        print('*' * 100)
        print("run: Invoking Classification:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to check if all the necessary parameters are provided for default flow
    def check_params(self):
        """Show the help text and exit unless all default-flow parameters are present.

        Returns None when every required parameter was supplied.
        """
        if any(key not in self.arg_obj.args
               for key in self._DEFAULT_FLOW_PARAMS):
            self.show_help()

    # method to run default behavior- here framework handles everything
    # user just needs to provide required parameters
    def run_default_flow(self):
        """Run feature extraction -> modeling -> classification in sequence.

        Success of a stage is judged only by the existence of its output
        directory (``feature_set_for_modeling`` / ``trained_models``), not by
        the exit status of the launched script.
        """
        self.check_params()

        # launch feature extraction
        self.run_feature_extraction()

        # if feature extraction was successful then proceed for modeling
        if os.path.exists("./feature_set_for_modeling"):
            print("Launching modeler with extracted feature set...")
            self.run_modeling()
            # set the model parameter for the classification flow
            self.arg_obj.args["model"] = "trained_models/" + str(self.arg_obj.args["train_size"]) + "_output.model"
        else:
            print("unable to find the directory 'feature_set_for_modeling'")

        # if the modeling was successful then proceed for classification
        if os.path.exists("./trained_models"):
            print("Launching classification with trained model from ./trained_models")
            # launch classification
            self.run_classification()
        else:
            print("unable to find the directory 'trained_models'")

    # method to invoke the commands
    def invoke(self, cmd):
        """Run cmd through the shell; the exit status is ignored."""
        os.system(cmd)
Example #10
0
class InterfaceWrapper:
    """Entry point that dispatches ``-operation`` to the matching interface script.

    Every operation is executed by launching the corresponding
    ``*_interface.py`` script with the full argument string.
    """

    # keys required for the default flow; mapper and reducer params are
    # optional and therefore not listed
    _DEFAULT_FLOW_PARAMS = ("fe_mapper", "fe_reducer", "train_dataset",
                            "train_size", "test_size", "cl_mapper",
                            "cl_reducer")

    def __init__(self):
        self.arg_obj = ArgParser()

    # method to show help message
    def show_help(self):
        """Print the usage text for every operation and terminate the process."""
        def emit(*parts):
            # parts are joined with one space, matching the output of the
            # original comma-separated print statements
            print(" ".join(parts))

        emit(":( not enough params")
        emit("usage: python run.py -operation <operation_name> <parameters for operation>")
        emit("******supported operations******")
        emit("(1) operation name: sampling", " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>")

        emit("(2) operation name: extract_features", " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params  <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset  <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")

        emit("(3) operation name: modeling", "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>", "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>")

        emit("(4) operation name: classification", " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>")

        # sys.exit() rather than exit(): the latter is an interactive-shell
        # helper injected by the site module and is not always available
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the requested operation.

        Shows the help text and exits when ``-operation`` is missing or is
        not one of the supported operations.
        """
        self.arg_obj.parse(sys.argv)
        banner = '*' * 100
        print(banner)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(banner)

        if "operation" not in self.arg_obj.args:
            self.show_help()

        operation = self.arg_obj.args["operation"]

        if operation == SAMPLING:
            self.run_sampling()
        elif operation == FEATURE_EXTRACTION:
            self.run_feature_extraction()
        elif operation == MODELING:
            self.run_modeling()
        elif operation == CLASSIFICATION:
            self.run_classification()
        elif operation == DEFAULT:
            self.run_default_flow()
        else:
            # previously an unknown operation was silently ignored
            print("run: unsupported operation: %s" % (operation,))
            self.show_help()

    def _script_command(self, script_name):
        """Return the shell command that runs script_name with all arguments."""
        return "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()

    # method to run sampling
    def run_sampling(self):
        """Launch ``sampler_interface.py``."""
        _cmd = self._script_command("sampler_interface.py")
        print('*' * 100)
        print("run: Invoking Sampling:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to run feature extraction
    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py``."""
        _cmd = self._script_command("feature_extraction_interface.py")
        print("run: Invoking Feature Extraction:\n%s" % (_cmd,))
        self.invoke(_cmd)

    # method to run modeling
    def run_modeling(self):
        """Launch ``modeler_interface.py``."""
        _cmd = self._script_command("modeler_interface.py")
        print('*' * 100)
        print("run: Invoking Modeling:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to run the classification
    def run_classification(self):
        """Launch ``classification_interface.py``."""
        _cmd = self._script_command("classification_interface.py")
        print('*' * 100)
        print("run: Invoking Classification:\n%s" % (_cmd,))
        print('*' * 100)
        self.invoke(_cmd)

    # method to check if all the necessary parameters are provided for default flow
    def check_params(self):
        """Show the help text and exit unless all default-flow parameters are present.

        Returns None when every required parameter was supplied.
        """
        if any(key not in self.arg_obj.args
               for key in self._DEFAULT_FLOW_PARAMS):
            self.show_help()

    # method to run default behavior- here framework handles everything
    # user just needs to provide required parameters
    def run_default_flow(self):
        """Run feature extraction -> modeling -> classification in sequence.

        Success of a stage is judged only by the existence of its output
        directory (``feature_set_for_modeling`` / ``trained_models``), not by
        the exit status of the launched script.
        """
        self.check_params()

        # launch feature extraction
        self.run_feature_extraction()

        # if feature extraction was successful then proceed for modeling
        if os.path.exists("./feature_set_for_modeling"):
            print("Launching modeler with extracted feature set...")
            self.run_modeling()
            # set the model parameter for the classification flow
            self.arg_obj.args["model"] = "trained_models/" + str(
                self.arg_obj.args["train_size"]) + "_output.model"
        else:
            print("unable to find the directory 'feature_set_for_modeling'")

        # if the modeling was successful then proceed for classification
        if os.path.exists("./trained_models"):
            print("Launching classification with trained model from ./trained_models")
            # launch classification
            self.run_classification()
        else:
            print("unable to find the directory 'trained_models'")

    # method to invoke the commands
    def invoke(self, cmd):
        """Run cmd through the shell; the exit status is ignored."""
        os.system(cmd)