def getTrainingData(text_features=True):
    """Load the training file, extract features, and persist their probabilities.

    Parameters:
        text_features: when True, build similarity (text) features from
            low-frequency tokens only; when False, build POS-based features
            from every token.

    Returns:
        The feature-probability mapping produced by
        findProbabilityForFeatures (also saved to disk and pretty-printed).
    """
    # Read and parse the raw training file.
    raw = loadFile(dir_path + training_file)
    parsed = getDataFromFile(raw)
    token_ner, token_pos = parseTrainingData(parsed)

    if text_features:
        # Similarity classifier: restrict to words with count less than 3.
        rare_tokens = findLowFrequencyWord(token_ner)
        state_features = findFeaturesForText(rare_tokens)
        feature_type = "text_features"
    else:
        # POS classifier: every token participates.
        state_features = findFeaturesForPOS(token_pos, token_ner)
        feature_type = "pos_features"

    # Convert raw feature counts into probabilities, persist, and echo them.
    feature_probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(feature_probabilities, feature_type)
    pprint(feature_probabilities)
    return feature_probabilities
def getTrainingData(text_features=True):
    """Load the training data, derive state features, and save their probabilities.

    NOTE(review): this file contains more than one ``getTrainingData``
    definition; at import time the last one defined wins — confirm whether
    these lines belong to separate modules.

    Parameters:
        text_features: True selects the similarity (text) feature set built
            from low-frequency tokens; False selects the POS feature set
            built from all tokens.

    Returns:
        The feature-probability mapping (also written to disk and printed).
    """
    # Load and parse the training corpus.
    file_handle = loadFile(dir_path + training_file)
    training_data = getDataFromFile(file_handle)
    token_ner, token_pos = parseTrainingData(training_data)

    # Pick the feature extraction strategy and its on-disk label together.
    if text_features:
        # Only words with count less than 3 feed the similarity classifier.
        state_features, feature_type = (
            findFeaturesForText(findLowFrequencyWord(token_ner)),
            "text_features",
        )
    else:
        # All words feed the POS-based classifier.
        state_features, feature_type = (
            findFeaturesForPOS(token_pos, token_ner),
            "pos_features",
        )

    # Turn counts into probabilities, persist under the chosen label, echo.
    feature_probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(feature_probabilities, feature_type)
    pprint(feature_probabilities)
    return feature_probabilities
def getTrainingData():
    """Load and process the training data for the classifier.

    NOTE(review): another ``getTrainingData`` signature exists in this file;
    verify these definitions come from different modules.

    Returns:
        A 2-tuple ``(training_data, largest_key_size)`` where
        ``training_data`` is the processed mapping produced by
        processTrainingData and ``largest_key_size`` is the length of its
        longest key as reported by getMaxLengthKey.
    """
    # Read the raw training file and split it into context/POS/NER streams.
    handle = loadFile(dir_path + training_file)
    context, pos, ner = parseTrainingData(getDataFromFile(handle))

    # Build the processed training structure and measure its widest key.
    processed = processTrainingData(context, pos, ner)
    return processed, getMaxLengthKey(processed)
def getTestData(HMM=False):
    """Load the test data from the appropriate directory.

    Parameters:
        HMM: when True, parse the file with the HMM-specific parser
            (parseTestDataHMM); otherwise use parseTestData.

    Returns:
        A 3-tuple ``(context, pos, index)`` from whichever parser ran.
    """
    # Read and decode the raw test file.
    handle = loadFile(dir_path + test_file)
    test_data = getDataFromFile(handle)

    # Both parsers share the same (context, pos, index) return shape,
    # so only the parser choice depends on the flag.
    parser = parseTestDataHMM if HMM else parseTestData
    context, pos, index = parser(test_data)
    return context, pos, index
def getTrainingData(HMM=False):
    """Load the training data from the appropriate directory.

    NOTE(review): this name is defined multiple times across these lines;
    confirm each definition belongs to a distinct module.

    Parameters:
        HMM: when True, return the raw ``(context, pos, ner)`` triple from
            parseTrainingDataHMM; when False, return the processed
            structure from processTrainingData.

    Returns:
        Either the processed training data (HMM=False) or a
        ``(context, pos, ner)`` 3-tuple (HMM=True).
    """
    # Read and decode the raw training file.
    handle = loadFile(dir_path + training_file)
    training_data = getDataFromFile(handle)

    # HMM callers get the parsed triple untouched.
    if HMM:
        return parseTrainingDataHMM(training_data)

    # Non-HMM callers get the fully processed training structure.
    context, pos, ner = parseTrainingData(training_data)
    return processTrainingData(context, pos, ner)