def runPart(train_file, test_file, best_answer, language):
    train_set = parse_data(train_file)
    test_set = parse_data(test_file)

    # for k in range(len(EXPERIMENTS)):
    #     print "Experiment {0}".format(k)
    #     ops = {"STOPWORDS": EXPERIMENTS[k][0],
    #            "PUNCTUATION": EXPERIMENTS[k][1],
    #            "BAGOFWORDS": EXPERIMENTS[k][2],
    #            "COLLOCATION": EXPERIMENTS[k][3],
    #            "PARTOFSPEECH": EXPERIMENTS[k][4]}
    #
    #     B.run(train_set, test_set, language, best_answer, ops)
    #     evaluate_part(2)

    B.run(train_set, test_set, language, best_answer)
    evaluate_part(2)
def import_data(data_file):
    counter = 0
    with open(data_file) as f:
        for row in parse_data(f):
            question = QuestionModel(**json2db(row))
            db.session.add(question)
            db.session.commit()
            counter += 1
            print('{} committed'.format(counter))
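Committing inside the loop opens one transaction per row, which is safe but slow for large files. If the rows are independent, a single commit at the end does the same import in one transaction. The following is a hypothetical sketch reusing the same helpers; parse_data, json2db, QuestionModel, and db are assumed to come from the surrounding project:

def import_data_batched(data_file):
    # hypothetical variant: stage every row, then commit once
    with open(data_file) as f:
        rows = [QuestionModel(**json2db(row)) for row in parse_data(f)]
    db.session.add_all(rows)  # SQLAlchemy: add many objects in one call
    db.session.commit()       # single transaction for the whole file
    print('{} committed'.format(len(rows)))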
def test_parse_marker(self):
    marker = parse_data(Marker, self.marker_dummy)
    for key in self.marker_dummy:
        self.assertEqual(marker.__dict__[key], self.marker_dummy[key])
def test_bad_data(self):
    self.assertIsNone(parse_data(Marker, self.bad_marker_dummy))
def test_bad_data_format_marker_class(self):
    self.assertEqual(parse_data(Marker, self.bad_marker_dummy), None)
def test_class_without_parse_method(self):
    self.assertEqual(parse_data(self.NoParsedObject, self.marker_dummy), None)
from pathlib import Path

def test_gp_mut(generations=20):
    data_path = Path('./containerfs/tmp/cetdl1772small.dat')
    training_data = parse_data(data_path)
    gpobj = GP(POP_SIZE, training_data, mutation_method='branch_replacement')
    gpobj.run(generations)
def runPart(train_file, test_file, knn_answer, svm_answer, language):
    train_set = parse_data(train_file)
    test_set = parse_data(test_file)
    A.run(train_set, test_set, language, knn_answer, svm_answer)
def test_parse_marker(self):
    marker = parse_data(Marker, self.marker_dummy)
    for key, value in self.marker_dummy.items():  # iteritems() is Python 2 only
        self.assertEqual(getattr(marker, key), value)
import A
import B
from main import parse_data

eng_train = parse_data('data/English-train.xml')
eng_test = parse_data('data/English-dev.xml')
cata_train = parse_data('data/Catalan-train.xml')
cata_test = parse_data('data/Catalan-dev.xml')
span_train = parse_data('data/Spanish-train.xml')
span_test = parse_data('data/Spanish-dev.xml')

# A.run(train, test, language, knn_file, svm_file)
A.run(eng_train, eng_test, 'English', 'KNN-English.answer', 'SVM-English.answer')
A.run(cata_train, cata_test, 'Catalan', 'KNN-Catalan.answer', 'SVM-Catalan.answer')
A.run(span_train, span_test, 'Spanish', 'KNN-Spanish.answer', 'SVM-Spanish.answer')

# B.run(train, test, language, answer)
B.run(eng_train, eng_test, 'English', 'Best-English.answer')
B.run(cata_train, cata_test, 'Catalan', 'Best-Catalan.answer')
B.run(span_train, span_test, 'Spanish', 'Best-Spanish.answer')
def test_data_null(self):
    self.assertIsNone(parse_data(Marker, None))
def test_data_null(self):
    self.assertEqual(parse_data(Marker, None), None)
def test_class_null(self):
    self.assertEqual(parse_data(None, self.marker_dummy), None)
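Taken together, these tests pin down a contract for parse_data: it returns None for a null class, a null payload, malformed data, or a class that lacks a parse method, and otherwise returns an instance whose attributes mirror the input dict. The real implementation is not part of this listing; the following is a minimal sketch that satisfies the tests, assuming each model class (such as Marker) exposes a parse classmethod that copies fields onto a new instance:

def parse_data(cls, data):
    """Build an instance of cls from raw data; return None on any failure."""
    if cls is None or data is None:
        return None
    parse = getattr(cls, 'parse', None)
    if parse is None:
        return None  # class without a parse method
    try:
        return parse(data)
    except (TypeError, ValueError, KeyError):
        return None  # malformed payload


class Marker(object):
    # assumed shape of the model class; the real Marker is project-specific
    @classmethod
    def parse(cls, data):
        obj = cls()
        obj.__dict__.update(data)  # mirror every input field as an attribute
        return obj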
import couchdb

from main import parse_data

port = couchdb.client.Server('https://*****:*****@fiatjaf.iriscouch.com')['portao']

for row in port.view('webapp/only-raw', include_docs=True):
    print(row)
    print(row.doc)
    data = parse_data(row.value)
    data['_rev'] = row.doc.rev  # keep the current revision so the save does not conflict
    print(data)
    port[row.id] = data
import codecs
import sys

import main


def getFrequentSense(lexelt, sense_dict):
    '''
    Return the most frequent sense of a word (lexelt) in the training set
    '''
    sense = ''
    try:
        sense = sense_dict[lexelt]
    except KeyError:
        pass
    return sense


def most_frequent_sense(data, sense_dict, language):
    outfile = codecs.open(language + '.baseline', encoding='utf-8', mode='w')
    for lexelt, instances in sorted(data.items(),
                                    key=lambda d: main.replace_accented(d[0].split('.')[0])):
        for instance in sorted(instances, key=lambda d: int(d[0].split('.')[-1])):
            instance_id = instance[0]
            sid = getFrequentSense(lexelt, sense_dict)
            outfile.write(main.replace_accented(
                lexelt + ' ' + instance_id + ' ' + sid + '\n'))
    outfile.close()


if __name__ == '__main__':
    data_path = '/home/595/Homework3/data/'
    if len(sys.argv) != 2:
        print('Usage: python baseline.py <language>')
        sys.exit(0)
    language = sys.argv[1]
    train_file = data_path + language + '-train.xml'
    dev_file = data_path + language + '-dev.xml'
    train = main.parse_data(train_file)
    test = main.parse_data(dev_file)
    sense_dict = build_dict(train)
    most_frequent_sense(test, sense_dict, language)
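build_dict is called above but not included in this excerpt. A possible sketch, assuming parse_data returns a mapping from lexelt to a list of instance tuples whose last element is the instance's sense id:

from collections import Counter, defaultdict


def build_dict(data):
    """Map each lexelt to its most frequent sense id in the training data."""
    counts = defaultdict(Counter)
    for lexelt, instances in data.items():
        for instance in instances:
            counts[lexelt][instance[-1]] += 1  # assumed: sense id is the last field
    return {lexelt: senses.most_common(1)[0][0]
            for lexelt, senses in counts.items()}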
import main
from chromosome import Chromosome

main.parse_data()
fitness = main.fitness_function_procedure
chromosome = Chromosome(['1', '101', '10', '10'], fitness)
chromosome.calculate_value()