def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Feed in an essay set to get a trained feature extractor and classifier.

    essays - an essay set object; its ``_score`` attribute supplies the
        training targets.
    algorithm - kept for backward compatibility. The value passed in is
        ignored: the algorithm is always re-derived below from the scores
        (more than 5 entries returned by util_functions.f7 -> regression,
        otherwise classification).

    Returns a tuple ``(f, clf, cv_error_results)``: the FeatureExtractor whose
    dictionaries were initialized from ``essays``, the fitted model, and the
    value returned by ``get_cv_error``.
    """
    f = feature_extractor.FeatureExtractor()
    f.initialize_dictionaries(essays)
    train_feats = f.gen_feats(essays)

    # ``numpy.int`` was only an alias for the builtin ``int`` and was removed
    # in NumPy 1.24; the builtin yields the same dtype on every version.
    set_score = numpy.asarray(essays._score, dtype=int)

    # NOTE(review): f7 presumably de-duplicates the scores -- confirm in
    # util_functions.
    if len(util_functions.f7(list(set_score))) > 5:
        algorithm = util_functions.AlgorithmTypes.regression
    else:
        algorithm = util_functions.AlgorithmTypes.classification

    clf, clf2 = get_algorithms(algorithm)

    # Cross-validation is run on the second estimator with the raw scores,
    # not the integer-cast array.
    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        # Overwrite the first two targets with 1 and 0 and retry, so the
        # sample contains more than one distinct label.
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
    """
    Feed in an essay set to get a trained feature extractor and classifier.

    essays - an essay set object; its ``_score`` attribute supplies the
        training targets.
    type - kept for backward compatibility. The value passed in is ignored:
        the algorithm is always chosen below from the scores (more than 5
        entries returned by util_functions.f7 -> regression, otherwise
        classification).

    Returns a tuple ``(f, clf, cv_error_results)``: the FeatureExtractor whose
    dictionaries were initialized from ``essays``, the fitted model, and the
    value returned by ``get_cv_error``.
    """
    f = feature_extractor.FeatureExtractor()
    f.initialize_dictionaries(essays)
    train_feats = f.gen_feats(essays)

    # ``numpy.int`` was only an alias for the builtin ``int`` and was removed
    # in NumPy 1.24; the builtin yields the same dtype on every version.
    set_score = numpy.asarray(essays._score, dtype=int)

    # A separate local is used instead of rebinding the ``type`` parameter,
    # which already shadows the builtin of the same name.
    # NOTE(review): f7 presumably de-duplicates the scores -- confirm in
    # util_functions.
    if len(util_functions.f7(list(set_score))) > 5:
        selected_algorithm = util_functions.AlgorithmTypes.regression
    else:
        selected_algorithm = util_functions.AlgorithmTypes.classification

    clf, clf2 = get_algorithms(selected_algorithm)

    # Cross-validation is run on the second estimator with the raw scores,
    # not the integer-cast array.
    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        # Overwrite the first two targets with 1 and 0 and retry, so the
        # sample contains more than one distinct label.
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
def select_algorithm(score_list):
    """
    Decide what algorithm to use (regression or classification).

    score_list - the scores to inspect; it is passed through ``list()``
        before counting.

    Returns ``util_functions.AlgorithmTypes.regression`` when the score list
    has more than 5 unique score points and
    ``util_functions.AlgorithmTypes.classification`` otherwise. If counting
    the score points raises, regression is returned as the fallback.
    """
    try:
        # Count the number of unique score points in the score list
        if len(util_functions.f7(list(score_list))) > 5:
            algorithm = util_functions.AlgorithmTypes.regression
        else:
            algorithm = util_functions.AlgorithmTypes.classification
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt and SystemExit
        # are no longer swallowed; ordinary errors still fall back.
        algorithm = util_functions.AlgorithmTypes.regression

    return algorithm
def select_algorithm(score_list):
    """
    Choose between regression and classification for a list of scores.

    score_list - the scores to inspect; it is converted with ``list()``
        before the unique score points are counted.

    Returns ``util_functions.AlgorithmTypes.regression`` for more than 5
    unique score points, ``util_functions.AlgorithmTypes.classification`` for
    5 or fewer. Any ordinary error raised while counting falls back to
    regression.
    """
    try:
        # Count the number of unique score points in the score list
        unique_points = util_functions.f7(list(score_list))
        if len(unique_points) > 5:
            algorithm = util_functions.AlgorithmTypes.regression
        else:
            algorithm = util_functions.AlgorithmTypes.classification
    except Exception:
        # Was a bare ``except``, which also trapped KeyboardInterrupt and
        # SystemExit; only ordinary exceptions trigger the fallback now.
        algorithm = util_functions.AlgorithmTypes.regression

    return algorithm
def create(text, score, prompt_string):
    """
    Creates a machine learning model from input text, associated scores, and a prompt.

    text - A list of strings containing the text of the essays
    score - a list of integers containing score values
    prompt_string - the common prompt for the set of essays

    Returns a results dictionary with the keys 'errors', 'success',
    'cv_kappa', 'cv_mean_absolute_error', 'feature_ext', 'classifier',
    'algorithm', 'score', 'text' and 'prompt'. 'success' is True only when
    feature extraction and model creation completed; otherwise 'errors'
    lists what went wrong. Failures are reported through the dictionary and
    the log rather than raised.
    """
    # Initialize a results dictionary to return
    results = {
        'errors': [],
        'success': False,
        'cv_kappa': 0,
        'cv_mean_absolute_error': 0,
        'feature_ext': "",
        'classifier': "",
        'algorithm': util_functions.AlgorithmTypes.classification,
        'score': score,
        'text': text,
        'prompt': prompt_string,
    }

    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        # No exception is being handled here, so log.exception would attach
        # an empty "NoneType: None" traceback; log a plain error instead.
        log.error(msg)
        return results

    # Decide what algorithm to use (regression or classification)
    try:
        # Count the number of unique score points in the score list
        if len(util_functions.f7(list(score))) > 5:
            algorithm = util_functions.AlgorithmTypes.regression
        else:
            algorithm = util_functions.AlgorithmTypes.classification
    except Exception:
        algorithm = util_functions.AlgorithmTypes.regression

    try:
        # Create an essay set object that encapsulates all the essays and
        # alternate representations (tokens, etc)
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except Exception:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        # Without an essay set the next step could only fail on the unbound
        # e_set and record a second, misleading error, so stop here.
        return results

    try:
        # Gets features from the essay set and computes error
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
            e_set, type=algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['algorithm'] = algorithm
        results['success'] = True
    except Exception:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

    return results
def create(text, score, prompt_string):
    """
    Creates a machine learning model from input text, associated scores, and a prompt.

    text - A list of strings containing the text of the essays
    score - a list of integers containing score values
    prompt_string - the common prompt for the set of essays

    Returns a results dictionary. 'success' starts as False and is set to
    True only after feature extraction and model creation finish; on any
    failure a message is appended to 'errors' and logged, and the dictionary
    is returned instead of an exception being raised. 'cv_kappa',
    'cv_mean_absolute_error', 'feature_ext', 'classifier' and 'algorithm'
    are filled in from the trained model; 'score', 'text' and 'prompt' echo
    the inputs.
    """
    # Initialize a results dictionary to return
    results = {
        'errors': [],
        'success': False,
        'cv_kappa': 0,
        'cv_mean_absolute_error': 0,
        'feature_ext': "",
        'classifier': "",
        'algorithm': util_functions.AlgorithmTypes.classification,
        'score': score,
        'text': text,
        'prompt': prompt_string,
    }

    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        # log.exception is only meaningful inside an ``except`` block; here
        # there is no active exception, so use log.error.
        log.error(msg)
        return results

    # Decide what algorithm to use (regression or classification)
    try:
        # Count the number of unique score points in the score list
        if len(util_functions.f7(list(score))) > 5:
            chosen_algorithm = util_functions.AlgorithmTypes.regression
        else:
            chosen_algorithm = util_functions.AlgorithmTypes.classification
    except Exception:
        chosen_algorithm = util_functions.AlgorithmTypes.regression

    try:
        # Create an essay set object that encapsulates all the essays and
        # alternate representations (tokens, etc)
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except Exception:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        # e_set is unbound at this point; continuing would only add a second
        # error caused by that, hiding the real failure. Return early.
        return results

    try:
        # Gets features from the essay set and computes error
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
            e_set, type=chosen_algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['algorithm'] = chosen_algorithm
        results['success'] = True
    except Exception:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

    return results