def train_from_file(ifile, file_type):
    """Read gold parses from a CoNLL/CoNLL-U treebank and extract oracle transitions.

    Parameters:
        ifile: path to the treebank file (10 whitespace-separated fields per
            token, blank line between sentences).
        file_type: 'conll' or 'conllu'.

    Returns:
        (dictionaries, sentences): per-sentence transition logs from
        DependencyParser.get_transitions() and the matching word lists.
        Non-projective sentences are skipped (recorded locally in
        `nonprojs` with their index and error message).

    Raises:
        ValueError: if file_type is not 'conll' or 'conllu' (previously an
            unknown type left `word` unbound and crashed with
            UnboundLocalError on the first token line).
    """
    if file_type not in ('conll', 'conllu'):
        raise ValueError(
            "file_type must be 'conll' or 'conllu', got %r" % (file_type,))
    dictionaries = []
    sentences = []
    nonprojs = []
    sentence = []
    sentence_index = 0
    with open(ifile, 'r') as fr:
        # Sentinel blank line flushes the final sentence even when the file
        # does not end with one (parse_dp uses the same trick; previously the
        # last sentence was silently dropped here).
        for line in fr.readlines() + ['\n']:
            # CoNLL-U comment lines ('# sent_id = ...') rarely split into
            # exactly 10 fields, so previously they fell through to the
            # separator branch and flushed the sentence mid-way. Skip them
            # before the field-count test.
            if file_type == 'conllu' and line.startswith('#'):
                continue
            split_line = line.split()
            if len(split_line) == 10:
                if file_type == 'conll':
                    word = get_word_from_conll(split_line)
                else:
                    word = get_word_from_conllu(split_line)
                    # Collapse every dependency label to the generic 'dep',
                    # as the original code did for CoNLL-U input.
                    word['deprel'] = 'dep'
                sentence.append(word)
            elif sentence:
                # Blank (or malformed) line terminates the current sentence.
                # The `elif sentence` guard also skips empty sentences from
                # leading/consecutive blank lines, which previously reached
                # DependencyParser([]).
                parser = DependencyParser(sentence)
                try:
                    dictionaries.append(parser.get_transitions())
                    sentences.append(sentence)
                except NonProjectiveParseError as nppe:
                    nonprojs.append((sentence_index, str(nppe)))
                sentence_index += 1
                sentence = []
    return dictionaries, sentences
def parse_dp(dp_file, wts_file, surp_file, k, ofile, num_sent=None):
    """Parse sentences from a CoNLL file and write surprisal + parse output.

    Parameters:
        dp_file: input CoNLL file (10 whitespace-separated fields per token,
            blank line between sentences).
        wts_file: weights file used to construct the MaxEnt transition
            classifier.
        surp_file: output TSV path; header is 'item  roi  word  surprisal
            retrieval', one row per entry of parse['surprisal'].
        k: passed through to DependencyParser.best_parse (presumably a beam
            width — TODO confirm against best_parse).
        ofile: output CoNLL path; predicted heads are written into each
            word's 'parent' field before serialization.
        num_sent: optional cap on the number of sentences processed.

    Returns:
        parse_stats: dict of running totals (correct, total, ...,
        sent_correct, num_sents) accumulated over all parsed sentences.

    NOTE(review): an identical definition of parse_dp appears later in this
    file; at import time the later copy overrides this one — confirm which
    is intended and delete the other.
    """
    # Running totals across all sentences; floats so later ratio math
    # divides cleanly under Python 2.
    parse_stats = {
        'correct': 0.0,
        'total': 0.0,
        'correct_trans_avlbl': 0.0,
        'all_correct': 0.0,
        'all_correct_total': 0.0,
        'correct_label': 0.0,
        'sent_correct': 0.0,
        'num_sents': 0.0
    }
    maxent = MaxEnt(wts_file, len(ArcEagerState.transition_types))
    with open(dp_file, 'r') as fr:
        with open(surp_file, 'w') as fw:
            with open(ofile, 'w') as fo:
                sentence_index = 0
                fw.write('item\troi\tword\tsurprisal\tretrieval\n')
                sentence = []
                nonprojs = []  # NOTE(review): never appended to in this copy
                # Sentinel '\n' flushes the last sentence even when the
                # input lacks a trailing blank line.
                for line in fr.readlines() + ['\n']:
                    split_line = line.split()
                    if len(split_line) == 10:
                        # Token line: accumulate into the current sentence.
                        word = get_word_from_conll(split_line)
                        sentence.append(word)
                    else:
                        # Separator line: parse the accumulated sentence.
                        # Progress message every 5 sentences (Python 2 print).
                        if sentence_index % 5 == 0:
                            print "Sent %s" % sentence_index
                        if sentence_index >= 0:  # always true; kept as-is
                            parser = DependencyParser(sentence)
                            parse = parser.best_parse(maxent, k)
                            # Fold per-sentence counts into running totals.
                            for key in parse_stats:
                                if key in parse:
                                    parse_stats[key] = parse_stats.get(
                                        key, 0) + parse[key]
                            parse_stats['num_sents'] += 1
                            # NOTE(review): this compares *cumulative*
                            # totals, so once any sentence has an error no
                            # later sentence is ever counted as correct —
                            # a per-sentence delta may be intended; confirm.
                            parse_stats['sent_correct'] += 1 if (
                                parse_stats['total'] -
                                parse_stats['correct'] < 0.5) else 0
                            # One surprisal row per region of interest,
                            # keyed by 1-based sentence number.
                            for pair in parse['surprisal']:
                                fw.write(
                                    str(sentence_index + 1) + '\t' +
                                    '\t'.join([str(x) for x in pair]) + '\n')
                            if len(sentence):
                                # Copy predicted heads back onto the words,
                                # then emit the sentence in CoNLL form.
                                for w in sentence:
                                    w['parent'] = parse['parent'][
                                        w['index']]['index']
                                    fo.write(write_to_conll(w) + '\n')
                                fo.write('\n')
                        sentence_index += 1
                        sentence = []
                        if num_sent and (sentence_index >= num_sent):
                            break
    return parse_stats
def generateCompFiles(model_type, num_iterations, features_cutoff):
    """Generate competition prediction files for the requested model.

    Parameters:
        model_type: "basic" (trains on train_file with FullBasicFeatures) or
            "advanced" (trains on all_file with AdvancedFeatures).
        num_iterations: training iterations passed to DependencyParser.
        features_cutoff: minimum-count cutoff passed to the feature builder.

    Raises:
        ValueError: on an unrecognized model_type (previously an unknown
            value was a silent no-op, which hid typos from callers).
    """
    # Only the data source and feature class differ between the two models;
    # select those per branch and share the build/tag tail below.
    if model_type == "basic":
        print("Generating Basic Model Competition Predictions")
        data = DependencyDataReader(train_file)
        features = FullBasicFeatures(data, features_cutoff)
    elif model_type == "advanced":
        print("Generating Advanced Model Competition Predictions")
        data = DependencyDataReader(all_file)
        features = AdvancedFeatures(data, features_cutoff)
    else:
        raise ValueError("Unknown model_type: %r" % (model_type,))
    features.initialize_vector()
    model = DependencyParser(features, num_iterations)
    generateCompTagging(comp_file, model)
def parse_dp(dp_file, wts_file, surp_file, k, ofile, num_sent=None):
    """Run the trained MaxEnt parser over a CoNLL file.

    Writes one surprisal row per region of interest to surp_file, the
    predicted parses in CoNLL form to ofile, and returns a dict of
    accumulated evaluation counts.

    Parameters:
        dp_file: input CoNLL file (10 fields per token, blank-line separated).
        wts_file: weights file for the MaxEnt transition classifier.
        surp_file: output TSV of per-word surprisal/retrieval values.
        k: forwarded to DependencyParser.best_parse.
        ofile: output CoNLL file with predicted heads in 'parent'.
        num_sent: optional limit on the number of sentences parsed.
    """
    stat_keys = ('correct', 'total', 'correct_trans_avlbl', 'all_correct',
                 'all_correct_total', 'correct_label', 'sent_correct',
                 'num_sents')
    parse_stats = dict.fromkeys(stat_keys, 0.0)
    maxent = MaxEnt(wts_file, len(ArcEagerState.transition_types))
    with open(dp_file, 'r') as fr:
        with open(surp_file, 'w') as fw:
            with open(ofile, 'w') as fo:
                fw.write('item\troi\tword\tsurprisal\tretrieval\n')
                idx = 0
                words = []
                nonprojs = []  # retained from original; unused here
                # Trailing sentinel line guarantees the last sentence flushes.
                all_lines = fr.readlines()
                all_lines.append('\n')
                for raw in all_lines:
                    fields = raw.split()
                    if len(fields) == 10:
                        words.append(get_word_from_conll(fields))
                        continue
                    # Separator: parse the sentence gathered so far.
                    if idx % 5 == 0:
                        print("Sent %s" % idx)
                    if idx >= 0:
                        parse = DependencyParser(words).best_parse(maxent, k)
                        # Accumulate whichever counters this parse reports.
                        for key in parse_stats:
                            if key in parse:
                                parse_stats[key] = (
                                    parse_stats.get(key, 0) + parse[key])
                        parse_stats['num_sents'] += 1
                        if parse_stats['total'] - parse_stats['correct'] < 0.5:
                            parse_stats['sent_correct'] += 1
                        for pair in parse['surprisal']:
                            row = [str(idx + 1)] + [str(x) for x in pair]
                            fw.write('\t'.join(row) + '\n')
                        if len(words):
                            for w in words:
                                w['parent'] = (
                                    parse['parent'][w['index']]['index'])
                                fo.write(write_to_conll(w) + '\n')
                            fo.write('\n')
                    idx += 1
                    words = []
                    if num_sent and (idx >= num_sent):
                        break
    return parse_stats
def main():
    """Train the advanced dependency parser and evaluate on train and test sets."""
    overall_timer = Timer("Total Runtime")

    # Command line: [iterations [features-cutoff]]; anything else falls back
    # to 20 iterations with no cutoff.
    if len(argv) == 3:
        num_iterations, features_cutoff = int(argv[1]), int(argv[2])
    elif len(argv) == 2:
        num_iterations, features_cutoff = int(argv[1]), 0
    else:
        num_iterations, features_cutoff = 20, 0

    evaluate_each_iteration = False
    pretrained_weights = None

    read_timer = Timer('Data reader')
    train_data = DependencyDataReader(train_file)
    read_timer.stop()
    print("Number of sentences:", train_data.get_num_sentences())

    feat_timer = Timer('Advanced Features')
    features = AdvancedFeatures(train_data, features_cutoff)
    features.initialize_vector()
    feat_timer.stop()
    print("Number of Features:", features.getFeaturesVectorLength())

    model = DependencyParser(features, pretrained_weights)
    # Only fit from scratch when no pretrained weights were supplied.
    if pretrained_weights is None:
        model.fit(num_iterations, evaluate_each_iteration)

    results = [
        "Number of Iterations: " + str(num_iterations),
        "Feature Cutoff: " + str(features_cutoff)
    ]
    model.predict(train_data)
    results.append(str(model.evaluate(train_data)))

    test_data = DependencyDataReader(test_file)
    print("Number of sentences:", test_data.get_num_sentences())
    model.predict(test_data)
    results.append(str(model.evaluate(test_data)))

    overall_timer.stop()