# Beispiel #1 (Example #1)
# 0
def getTrainingData(text_features=True):
    """
    Load the training data, build classifier features and persist their
    probabilities.

    Args:
        text_features: when True, build similarity (text) features from
            low-frequency words only; otherwise build POS features from
            every word.

    Returns:
        The feature-probability mapping produced by
        findProbabilityForFeatures (also saved to disk and pretty-printed).
    """
    # Load the data present in the training file
    f = loadFile(dir_path + training_file)
    training_data = getDataFromFile(f)
    token_ner, token_pos = parseTrainingData(training_data)

    # Both branches assign state_features and feature_type, so no
    # placeholder initialisation is needed.
    if text_features:
        # Considering only words with count less than 3 for the
        # similarity based classifier
        low_frequency_token_ner = findLowFrequencyWord(token_ner)
        state_features = findFeaturesForText(low_frequency_token_ner)
        feature_type = "text_features"
    else:
        # Considering all words for the POS based classifier
        state_features = findFeaturesForPOS(token_pos, token_ner)
        feature_type = "pos_features"

    # Finding the probabilities for the features, then persisting them
    feature_probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(feature_probabilities, feature_type)
    pprint(feature_probabilities)
    return feature_probabilities
def getTrainingData(text_features=True):
    """
        Loads the training data from the appropriate directory
    """
    # Read and parse the raw training file.
    handle = loadFile(dir_path + training_file)
    raw_data = getDataFromFile(handle)
    token_ner, token_pos = parseTrainingData(raw_data)

    if text_features:
        # Similarity-based classifier: restrict to rarely-seen words.
        rare_tokens = findLowFrequencyWord(token_ner)
        state_features = findFeaturesForText(rare_tokens)
        feature_type = "text_features"
    else:
        # POS-based classifier considers every word.
        state_features = findFeaturesForPOS(token_pos, token_ner)
        feature_type = "pos_features"

    # Turn the collected features into probabilities, save and show them.
    probabilities = findProbabilityForFeatures(state_features)
    saveFeaturesToDisk(probabilities, feature_type)
    pprint(probabilities)
    return probabilities
def processTrainingData(context, pos, ner):
    """
    Build a lookup table from each context word to its part-of-speech
    and named-entity tags.

    Args:
        context: sequence of words.
        pos: part-of-speech tag for each word (parallel to context).
        ner: named-entity tag for each word (parallel to context).

    Returns:
        OrderedDefaultdict mapping word -> {'POS': tag, 'NE': tag}.
        Later duplicates of a word overwrite earlier entries.
    """
    training_data = OrderedDefaultdict(dict)

    # NOTE: skipping tokens with an empty NE tag (if ner[i] != "")
    # reduced the accuracy down to 34%, so every token is kept.
    for i, word in enumerate(context):
        training_data[word] = {'POS': pos[i], 'NE': ner[i]}

    pprint(training_data)
    return training_data
def processTrainingData(context, pos, ner):
    '''
        Map every word in the context to its part-of-speech and
        named-entity tag so later look ups are simple
    '''
    lookup = OrderedDefaultdict(dict)

    # Keep every token: filtering on empty NE tags hurt accuracy (34%).
    index = 0
    for word in context:
        lookup[word] = {'POS': pos[index], 'NE': ner[index]}
        index += 1

    pprint(lookup)
    return lookup