def getTrainingContextData():
    """Parse the training XML file into per-word-type context data.

    NOTE(review): a second definition of ``getTrainingContextData`` is
    visible later in this file; if both remain at module level the later
    one replaces this one -- confirm which is intended.

    Returns:
        An ``OrderedDict`` keyed by word type (the ``item`` attribute of
        each child of the parsed root).  Each value is a nested
        ``defaultdict`` whose ``'training'`` entry maps an instance id to a
        dict with the keys ``"Sense"`` (list of ``senseid`` values),
        ``"Pre-Context"`` and ``"Post-Context"`` (the results of
        ``preProcessContextData`` on the whitespace-split text before and
        after the ``<head>`` element).

    Raises:
        AttributeError: if an instance has no ``<context>`` element or the
            context has no ``<head>`` element (assuming
            ``initializeXMLParser`` yields ElementTree-style elements).
    """
    training_data = OrderedDict()

    # Initialising the xml parser for the training set
    training_root = initializeXMLParser(dir_path + training_file)

    # Grabbing one word type at a time
    for word_type_xml in training_root:
        word_type = word_type_xml.attrib['item']
        training_data[word_type] = defaultdict(lambda: defaultdict(dict))

        # Grabbing the instance id and its list of senses
        for word_instance in word_type_xml:
            instance = word_instance.attrib['id']
            senses = [
                answer.attrib['senseid']
                for answer in word_instance.findall('answer')
            ]

            # ``text`` / ``tail`` are None (not "") when there is no text
            # before / after <head>, so fall back to an empty string
            # instead of crashing on ``None.split()``.
            context = word_instance.find('context')
            pre_context = (context.text or "").split()
            post_context = (context.find('head').tail or "").split()

            # Pre-processing the pre-context and post context
            # TODO: Check why this is reducing the accuracy of the model by 1%
            pre_context = preProcessContextData(pre_context)
            post_context = preProcessContextData(post_context)

            training_data[word_type]['training'][instance] = {
                "Sense": senses,
                "Pre-Context": pre_context,
                "Post-Context": post_context,
            }

    return training_data
def getTrainingContextData():
    """Parse the training XML file and split each word type's data.

    For every word type in the training file, the instances are collected
    under a ``'training'`` entry and the resulting structure is then handed
    to ``createValidationData``, whose return value is stored for that word
    type.

    Returns:
        An ``OrderedDict`` keyed by word type (the ``item`` attribute of
        each child of the parsed root).  Each value is whatever
        ``createValidationData`` returns when given a nested
        ``defaultdict`` whose ``'training'`` entry maps an instance id to a
        dict with the keys ``"Sense"``, ``"Pre-Context"`` and
        ``"Post-Context"`` -- presumably the same structure with part of
        the training instances moved to a validation entry; confirm
        against ``createValidationData``.

    Raises:
        AttributeError: if an instance has no ``<context>`` element or the
            context has no ``<head>`` element (assuming
            ``initializeXMLParser`` yields ElementTree-style elements).
    """
    training_data = OrderedDict()

    # Initialising the xml parser for the training set
    training_root = initializeXMLParser(dir_path + training_file)

    # Grabbing one word type at a time
    for word_type_xml in training_root:
        word_type = word_type_xml.attrib['item']
        training_data[word_type] = defaultdict(lambda: defaultdict(dict))

        # Grabbing the instance id and its list of senses
        for word_instance in word_type_xml:
            instance = word_instance.attrib['id']
            senses = [
                answer.attrib['senseid']
                for answer in word_instance.findall('answer')
            ]

            # ``text`` / ``tail`` are None (not "") when there is no text
            # before / after <head>, so fall back to an empty string
            # instead of crashing on ``None.split()``.
            context = word_instance.find('context')
            pre_context = (context.text or "").split()
            post_context = (context.find('head').tail or "").split()

            # Pre-processing the pre-context and post context
            pre_context = preProcessContextData(pre_context)
            post_context = preProcessContextData(post_context)

            # Every instance starts out as training data; the split into
            # training and validation happens once per word type below.
            training_data[word_type]['training'][instance] = {
                "Sense": senses,
                "Pre-Context": pre_context,
                "Post-Context": post_context,
            }

        # Choosing a random set of training data as the validation data
        training_data[word_type] = createValidationData(training_data[word_type])

    return training_data
def getTestContextData(test_data):
    """Add the test-set contexts from the test XML file to ``test_data``.

    Args:
        test_data: mapping indexed as ``test_data[word_type]['test']``.
            It is mutated in place.  An entry must be obtainable for every
            word type in the test file -- presumably this is the structure
            built by ``getTrainingContextData``; confirm against the
            caller.

    Returns:
        The same ``test_data`` object, where
        ``test_data[word_type]['test'][instance_id]`` is a dict with the
        keys ``"Pre-Context"`` and ``"Post-Context"`` (the results of
        ``preProcessContextData`` on the whitespace-split text before and
        after the ``<head>`` element).

    Raises:
        AttributeError: if an instance has no ``<context>`` element or the
            context has no ``<head>`` element (assuming
            ``initializeXMLParser`` yields ElementTree-style elements).
    """
    # Initialising the xml parser for the test set
    test_root = initializeXMLParser(dir_path + test_file)

    # Grabbing one word type at a time
    for word_type_xml in test_root:
        word_type = word_type_xml.attrib['item']

        # Grabbing the instance id and its surrounding context
        for word_instance in word_type_xml:
            instance = word_instance.attrib['id']

            # ``text`` / ``tail`` are None (not "") when there is no text
            # before / after <head>, so fall back to an empty string
            # instead of crashing on ``None.split()``.
            context = word_instance.find('context')
            pre_context = (context.text or "").split()
            post_context = (context.find('head').tail or "").split()

            pre_context = preProcessContextData(pre_context)
            post_context = preProcessContextData(post_context)

            test_data[word_type]['test'][instance] = {
                "Pre-Context": pre_context,
                "Post-Context": post_context,
            }

    return test_data