def train_from_file(ifile, file_type):
    """Read gold parses from a CoNLL/CoNLL-U file and collect the oracle
    transition log for every projective sentence."""
    dictionaries = []
    sentences = []
    with open(ifile, 'r') as fr:
        sentence_index = 0
        sentence = []
        nonprojs = []  # (sentence index, error message) pairs; kept for debugging
        # Append a blank line so the final sentence is flushed even when the
        # file lacks a trailing newline (parse_dp below does the same).
        for line in fr.readlines() + ['\n']:
            split_line = line.split()
            if len(split_line) == 10:
                if file_type == 'conll':
                    word = get_word_from_conll(split_line)
                elif file_type == 'conllu':
                    if split_line[0] == '#':
                        continue
                    word = get_word_from_conllu(split_line)
                word['deprel'] = 'dep'
                sentence.append(word)
            else:
                # Blank or malformed line: close off the current sentence.
                # Guarding on `sentence` rather than the always-true
                # `sentence_index >= 0` avoids parsing an empty sentence
                # when blank lines are consecutive.
                if sentence:
                    parser = DependencyParser(sentence)
                    try:
                        log = parser.get_transitions()
                        dictionaries.append(log)
                        sentences.append(sentence)
                    except NonProjectiveParseError as nppe:
                        nonprojs.append((sentence_index, str(nppe)))
                sentence_index += 1
                sentence = []
    return dictionaries, sentences
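A minimal usage sketch for train_from_file; the file name is a placeholder, and the helpers it relies on (get_word_from_conllu, DependencyParser, NonProjectiveParseError) are assumed to be defined elsewhere in the host project.

# Hypothetical usage -- 'train.conllu' is a placeholder path.
transition_logs, gold_sentences = train_from_file('train.conllu', 'conllu')
print("Collected transition logs for %d projective sentences" % len(gold_sentences))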
def parse_dp(dp_file, wts_file, surp_file, k, ofile, num_sent=None):
    """Parse a CoNLL file with a trained MaxEnt model; write per-word
    surprisal values to surp_file and the predicted parses to ofile."""
    parse_stats = {
        'correct': 0.0,
        'total': 0.0,
        'correct_trans_avlbl': 0.0,
        'all_correct': 0.0,
        'all_correct_total': 0.0,
        'correct_label': 0.0,
        'sent_correct': 0.0,
        'num_sents': 0.0
    }
    maxent = MaxEnt(wts_file, len(ArcEagerState.transition_types))
    with open(dp_file, 'r') as fr, \
            open(surp_file, 'w') as fw, \
            open(ofile, 'w') as fo:
        sentence_index = 0
        fw.write('item\troi\tword\tsurprisal\tretrieval\n')
        sentence = []
        # Append a blank line so the final sentence is flushed even when
        # the file lacks a trailing newline.
        for line in fr.readlines() + ['\n']:
            split_line = line.split()
            if len(split_line) == 10:
                word = get_word_from_conll(split_line)
                sentence.append(word)
            else:
                if sentence_index % 5 == 0:
                    print("Sent %s" % sentence_index)
                if sentence:
                    parser = DependencyParser(sentence)
                    parse = parser.best_parse(maxent, k)
                    for key in parse_stats:
                        if key in parse:
                            parse_stats[key] += parse[key]
                    parse_stats['num_sents'] += 1
                    # Compare this sentence's own counts, not the running
                    # totals, when deciding whether it was parsed perfectly.
                    parse_stats['sent_correct'] += 1 if (
                        parse.get('total', 0) -
                        parse.get('correct', 0) < 0.5) else 0
                    for pair in parse['surprisal']:
                        fw.write(
                            str(sentence_index + 1) + '\t' +
                            '\t'.join([str(x) for x in pair]) + '\n')
                    for w in sentence:
                        w['parent'] = parse['parent'][
                            w['index']]['index']
                        fo.write(write_to_conll(w) + '\n')
                    fo.write('\n')
                sentence_index += 1
                sentence = []
                if num_sent and (sentence_index >= num_sent):
                    break
    return parse_stats
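A hedged usage sketch for parse_dp; every path and the beam width are placeholders, and the accuracy line assumes 'correct' and 'total' count per-word attachment decisions, which the accumulation above suggests but does not guarantee.

# Hypothetical usage -- all file names and the beam width k are placeholders.
stats = parse_dp('test.conll', 'model.wts', 'surprisal.tsv', 8, 'parsed.conll')
if stats['total']:
    # Assumes 'correct'/'total' count individual attachment decisions.
    print("Attachment accuracy: %.3f" % (stats['correct'] / stats['total']))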
Example #4
def generateCompFiles(model_type, num_iterations, features_cutoff):
    if model_type == "basic":
        print("Generating Basic Model Competition Predictions")
        data = DependencyDataReader(train_file)
        basic_features = FullBasicFeatures(data, features_cutoff)
        basic_features.initialize_vector()
        basic_model = DependencyParser(basic_features, num_iterations)
        generateCompTagging(comp_file, basic_model)
    elif model_type == "advanced":
        print("Generating Advanced Model Competition Predictions")
        data = DependencyDataReader(all_file)
        advanced_features = AdvancedFeatures(data, features_cutoff)
        advanced_features.initialize_vector()
        advanced_model = DependencyParser(advanced_features, num_iterations)
        generateCompTagging(comp_file, advanced_model)
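A short invocation sketch; the iteration count and cutoff below are placeholder values, and train_file, all_file, and comp_file are module-level globals assumed to be defined by the host script.

# Hypothetical invocation -- 50 iterations and a zero cutoff are placeholders.
generateCompFiles("advanced", num_iterations=50, features_cutoff=0)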
Example #6
def main():
    global_timer = Timer("Total Runtime")
    if len(argv) == 3:
        NUM_ITERATIONS = int(argv[1])
        FEATURES_CUTOFF = int(argv[2])
    elif len(argv) == 2:
        NUM_ITERATIONS = int(argv[1])
        FEATURES_CUTOFF = 0
    else:
        NUM_ITERATIONS = 20
        FEATURES_CUTOFF = 0
    evaluate_per_iteration = False
    pretrained_weights = None

    time = Timer('Data reader')
    train_data = DependencyDataReader(train_file)
    time.stop()
    print("Number of sentences:", train_data.get_num_sentences())
    time = Timer('Advanced Features')
    features = AdvancedFeatures(train_data, FEATURES_CUTOFF)
    features.initialize_vector()
    time.stop()
    print("Number of Features:", features.getFeaturesVectorLength())
    model = DependencyParser(features, pretrained_weights)
    if pretrained_weights is None:
        model.fit(NUM_ITERATIONS, evaluate_per_iteration)
    results = [
        "Number of Iterations: " + str(NUM_ITERATIONS),
        "Feature Cutoff: " + str(FEATURES_CUTOFF)
    ]
    model.predict(train_data)
    results.append(str(model.evaluate(train_data)))
    test_data = DependencyDataReader(test_file)
    print("Number of sentences:", test_data.get_num_sentences())
    model.predict(test_data)
    results.append(str(model.evaluate(test_data)))
    global_timer.stop()
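Since main() reads its hyperparameters from argv, a conventional entry-point guard (a sketch; the original script may wire this up differently) would run it as:

if __name__ == '__main__':
    # Assumes `from sys import argv` and the Timer / reader / parser
    # classes are imported at module level, as the bodies above imply.
    main()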