def getTrainingData(text_features=True):
    """Load the training data and build feature probabilities.

    Args:
        text_features: when True, build similarity (text) features from
            low-frequency tokens; otherwise build POS-based features.

    Returns:
        The per-feature probability mapping (also persisted to disk).
    """
    # Read the raw training file and split it into NER/POS token maps.
    handle = loadFile(dir_path + training_file)
    raw_data = getDataFromFile(handle)
    token_ner, token_pos = parseTrainingData(raw_data)

    if text_features:
        # Similarity-based classifier: only words with count less than 3.
        rare_token_ner = findLowFrequencyWord(token_ner)
        state_features = findFeaturesForText(rare_token_ner)
        feature_type = "text_features"
    else:
        # POS-based classifier: every word is considered.
        state_features = findFeaturesForPOS(token_pos, token_ner)
        feature_type = "pos_features"

    # Turn the raw feature counts into probabilities and persist them.
    feature_probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(feature_probabilities, feature_type)
    pprint(feature_probabilities)
    return feature_probabilities
def getTrainingData(text_features=True):
    """Load the training data from the appropriate directory.

    Args:
        text_features: True selects the similarity (text) feature set built
            from low-frequency tokens; False selects POS-based features.

    Returns:
        Dict of feature probabilities, which is also saved to disk.
    """
    # Parse the training file into parallel NER and POS token structures.
    training_file_handle = loadFile(dir_path + training_file)
    parsed = getDataFromFile(training_file_handle)
    token_ner, token_pos = parseTrainingData(parsed)

    if not text_features:
        # POS classifier uses all words.
        state_features = findFeaturesForPOS(token_pos, token_ner)
        feature_type = "pos_features"
    else:
        # Similarity classifier uses only words seen fewer than 3 times.
        low_frequency_tokens = findLowFrequencyWord(token_ner)
        state_features = findFeaturesForText(low_frequency_tokens)
        feature_type = "text_features"

    # Compute per-feature probabilities, persist, and echo for inspection.
    feature_probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(feature_probabilities, feature_type)
    pprint(feature_probabilities)
    return feature_probabilities
def processTrainingData(context, pos, ner):
    """Build a lookup table from each token to its POS and NE tags.

    Args:
        context: sequence of tokens.
        pos: part-of-speech tag per token (parallel to ``context``).
        ner: named-entity tag per token (parallel to ``context``).

    Returns:
        OrderedDefaultdict mapping token -> {'POS': tag, 'NE': tag}.
    """
    training_data = OrderedDefaultdict(dict)
    # NOTE: skipping tokens with an empty NE tag was tried and reduced the
    # accuracy down to 34%, so every token is kept.
    for index, token in enumerate(context):
        training_data[token] = {'POS': pos[index], 'NE': ner[index]}
    pprint(training_data)
    return training_data
def processTrainingData(context, pos, ner):
    """Given the context, part of speeches and the named entity tags,
    create a data structure that simplifies look-ups.

    Args:
        context: sequence of tokens.
        pos: POS tag for each token, index-aligned with ``context``.
        ner: NE tag for each token, index-aligned with ``context``.

    Returns:
        OrderedDefaultdict keyed by token with ``{'POS': ..., 'NE': ...}``
        values.
    """
    lookup = OrderedDefaultdict(dict)
    # Filtering on non-empty NE tags was attempted previously but reduced
    # the accuracy down to 34%; keep all tokens.
    for position, word in enumerate(context):
        lookup[word] = {'POS': pos[position], 'NE': ner[position]}
    pprint(lookup)
    return lookup