def cross_validate(): references, corpus = pre.get_input(main._TEST_FILE, labeled=True) sim_vectors, labels = mod.model(references, corpus, labeled=True) sim_vectors = sum(sim_vectors, []) folds = [] fold_sizes = [len(sim_vectors) / _FOLDS for _ in range(_FOLDS)] rest = len(sim_vectors) % _FOLDS for i in range(rest): fold_sizes[i] += 1 indices = range(len(sim_vectors)) rd.shuffle(indices) for i in range(_FOLDS): folds.append(indices[:fold_sizes[i]]) indices = indices[fold_sizes[i]:] pred = lm.LogisticRegression() brier = [] for index, fold in enumerate(folds): train_indices = set(range(len(sim_vectors))) - set(fold) train = [sim_vectors[i] for i in train_indices] train_target = [labels[i] for i in train_indices] pred.fit(train, train_target) test = np.array([list(sim_vectors[i]) for i in fold]) test_target = [labels[i] for i in fold] test_pred = [p[1] for p in pred.predict_proba(test)] brier.append(1.0/len(test) * sum([(test_target[i] - test_pred[i]) ** 2 for i in range(len(test))])) print 'Brier score' print brier print sum(brier) / len(brier) data = np.array(sim_vectors) classes = np.array(labels) pred = lm.LogisticRegression() scores = cross_validation.cross_val_score(pred, data, classes, 'accuracy', cv=5) print 'Accuracy' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'precision', cv=5) print 'Precision' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'recall', cv=5) print 'Recall' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'f1', cv=5) print 'f1' print scores print sum(scores) / len(scores)
def probabilistic_disambiguation(print_time=False):
    """ Performs probabilistic disambiguation, including training phase,
        for a set of references.

        Observations:
        - The returned values are serialized since they are the same between
        different iterations.

        Args:
            print_time: whether time elapsed during this function execution
        should be printed.

        Returns:
            A blocked list of references to be ranked and a list of
        probabilistic matrices, one for each block.
    """
    time_i = time.time()
    if os.path.isfile(_PKL_PROBS) and os.path.isfile(_PKL_REFS):
        # Cached results exist: load them instead of retraining.
        # Binary mode ('rb'/'wb') is required for pickle portability.
        with open(_PKL_REFS, 'rb') as pkl_file:
            references = pkl.load(pkl_file)
        with open(_PKL_PROBS, 'rb') as pkl_file:
            probs = pkl.load(pkl_file)
    else:
        references, corpus = pre.get_input(_TRAINING_FILE, labeled=True)
        pred = lear.train(references, corpus)
        references, corpus = pre.get_input(_TEST_FILE, limit=_INPUT_LIMIT)
        # pkl.dump returns None; the original assigned its result to
        # `probs`, which was dead code. Serialize, then compute probs.
        with open(_PKL_REFS, 'wb') as pkl_file:
            pkl.dump(references, pkl_file)
        probs = lear.test(references, corpus, pred)
        with open(_PKL_PROBS, 'wb') as pkl_file:
            pkl.dump(probs, pkl_file)
    time_f = time.time()
    if print_time:
        with open(_TIME_FILE, 'a') as time_file:
            print >> time_file, 'PROBABILISTIC_DISAMBIGUATION'
            print >> time_file, time_f - time_i
    return references, probs
def test_modeling_unlabeled(self):
    """ Tests the correct composition of the result in modeling function,
        since the singular functions are tested. For unlabeled case.
    """
    references, corpus = pre.get_input(self.testfilename)
    sim_vectors = mod.model([references[0]], corpus)
    truth = [[(0.6506800700017669, 2.5, 1, 0.8571428571428571, 0),
              (0.6506800700017669, 3.5, 2, 0.0, 0),
              (1.0, 4.5, 1, 0.0, 0)]]
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(sim_vectors, truth)
def test_get_base_partitioning(self):
    """ Tests the get_base_partitioning function. """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    merged = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], merged]
    pred = lear.train(references, corpus)
    probs = lear.test(references, corpus, pred)
    matrices = part.get_probability_matrices(references, probs)
    distances = [part.transform_distance_matrix(m) for m in matrices]
    expected = ([0, 0, 0], 1)
    self.assertEqual(part.get_base_partitioning(distances[0]), expected)
def test_modeling_labeled(self):
    """ Tests the correct composition of the result in modeling function,
        since the singular functions are tested. For labeled case.
    """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    result = mod.model([references[0] + references[2]], corpus, labeled=True)
    truth = ([[(0.6506800700017669, 2.5, 1, 0.8571428571428571, 0),
               (0.6506800700017669, 3.5, 2, 0.0, 0),
               (0.0, 0, 0, 0.0, 0),
               (1.0, 4.5, 1, 0.0, 0),
               (0.0, 0, 0, 0.0, 0),
               (0.0, 0, 0, 0.0, 0)]],
             [1, 1, 0, 1, 0, 0])
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(result, truth)
def test_testing(self):
    """ Tests the test phase.

        Observations:
        - Assumes the error on the training set is zero, which is used
        in the test phase.
    """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    merged = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], merged]
    pred = lear.train(references, corpus)
    sim_vectors, classes = mod.model(references, corpus, labeled=True)
    probabilities = sum(lear.test(references, corpus, pred), [])
    rounded = [round(prob) for prob in probabilities]
    self.assertEqual(rounded, classes)
def test_training(self):
    """ Tests the training phase.

        Observations:
        - Assumes there is no error in the training, which is not always
        the case. However, it works for small inputs and is a reasonable
        approximation test.
    """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    merged = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], merged]
    pred = lear.train(references, corpus)
    sim_vectors, classes = mod.model(references, corpus, labeled=True)
    flattened = sum(sim_vectors, [])
    predictions = pred.predict(flattened).tolist()
    self.assertEqual(predictions, classes)
def test_testing(self):
    """ Tests the test phase.

        Observations:
        - Assumes the error on the training set is zero, which is used
        in the test phase.
    """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    grouped_block = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3],
                  grouped_block]
    pred = lear.train(references, corpus)
    sim_vectors, classes = mod.model(references, corpus, labeled=True)
    test_probs = sum(lear.test(references, corpus, pred), [])
    self.assertEqual([round(p) for p in test_probs], classes)
def test_get_probability_matrix(self):
    """ Tests the get_probability_matrix function. """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    merged = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], merged]
    pred = lear.train(references, corpus)
    probs = lear.test(references, corpus, pred)
    expected = [
        [[1.0, 0.89441830645266462, 0.95097107828998639],
         [0.89441830645266462, 1.0, 0.97565300931621723],
         [0.95097107828998639, 0.97565300931621723, 1.0]],
        [[1.0, 0.73429831405564638],
         [0.73429831405564638, 1.0]],
        [[1.0, 0.60153464586560224],
         [0.60153464586560224, 1.0]],
        [[1.0, 0.4067916074419064, 0.4067916074419064],
         [0.4067916074419064, 1.0, 0.4067916074419064],
         [0.4067916074419064, 0.4067916074419064, 1.0]]]
    self.assertEqual(part.get_probability_matrices(references, probs),
                     expected)
def test_get_probability_matrix(self):
    """ Tests the get_probability_matrix function. """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    combined = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], combined]
    pred = lear.train(references, corpus)
    probs = lear.test(references, corpus, pred)
    matrices = part.get_probability_matrices(references, probs)
    truth = [
        [[1.0, 0.89441830645266462, 0.95097107828998639],
         [0.89441830645266462, 1.0, 0.97565300931621723],
         [0.95097107828998639, 0.97565300931621723, 1.0]],
        [[1.0, 0.73429831405564638],
         [0.73429831405564638, 1.0]],
        [[1.0, 0.60153464586560224],
         [0.60153464586560224, 1.0]],
        [[1.0, 0.4067916074419064, 0.4067916074419064],
         [0.4067916074419064, 1.0, 0.4067916074419064],
         [0.4067916074419064, 0.4067916074419064, 1.0]]]
    self.assertEqual(matrices, truth)
def test_get_input_unlabeled(self):
    """ Tests function get_input for unlabeled case. """
    result = pre.get_input(self.testfilename)
    truth = (
        [[Reference(0, 'm jones',
                    'symbol intersect detect method improv spatial '
                    'intersect join',
                    ['e rundensteiner', 'y huang'], 'geoinformatica', None),
          Reference(1, 'matthew c jones',
                    'improv spatial intersect join symbol intersect detect',
                    ['e rundensteiner', 'h kuno', 'p marron', 'v taube',
                     'y ra'],
                    'sigmodels.intern manag data', None),
          Reference(2, 'matthew c jones',
                    'view materi techniqu complex hirarch object',
                    ['e rundensteiner', 'y huang'],
                    'ssd symposium larg spatial databas', None)],
         [Reference(3, 'mike w miller', 'domin draw bipartit graph',
                    ['l berg'],
                    'sigucc special interest group univers comput servic',
                    None),
          Reference(4, 'mike w miller', 'rel compromis statist databas', [],
                    'sigucc special interest group univers comput servic',
                    None)],
         [Reference(5, 'c chen', 'formal approach scenario analysi',
                    ['d kung', 'j samuel', 'j gao', 'p hsia', 'y toyoshima'],
                    'ieee softwar', None)],
         [Reference(6, 'jane j robinson', 'discours code clue context', [],
                    'acl meet the associ comput linguist', None),
          Reference(7, 'jane j robinson', 'diagram grammar dialogu', [],
                    'cooper interfac inform system', None)],
         [Reference(8, 'a gupta', 'iri h java distanc educ',
                    ['a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
                     'j wild', 'k maly', 's ghanem', 'x zhu'],
                    'acm journal educ resourc comput', None)],
         # NOTE: the title below is two adjacent literals with no space,
         # i.e. '...registfile' — preserved from the expected fixture.
         [Reference(9, 'mary d brown',
                    'intern redund represent limit bypass support pipelin '
                    'adder regist' 'file',
                    ['y patt'],
                    'proceed the th ieee intern symposium high '
                    'perform comput architectur hpca intern symposium high '
                    'perform comput architectur talk slide', None)]],
        ['m jones', 'e rundensteiner', 'y huang', 'matthew c jones',
         'e rundensteiner', 'h kuno', 'p marron', 'v taube', 'y ra',
         'matthew c jones', 'e rundensteiner', 'y huang', 'mike w miller',
         'l berg', 'mike w miller', 'c chen', 'd kung', 'j samuel', 'j gao',
         'p hsia', 'y toyoshima', 'jane j robinson', 'jane j robinson',
         'a gupta', 'a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
         'j wild', 'k maly', 's ghanem', 'x zhu', 'mary d brown', 'y patt'])
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(result, truth)
def cross_validate(): references, corpus = pre.get_input(main._TEST_FILE, labeled=True) sim_vectors, labels = mod.model(references, corpus, labeled=True) sim_vectors = sum(sim_vectors, []) folds = [] fold_sizes = [len(sim_vectors) / _FOLDS for _ in range(_FOLDS)] rest = len(sim_vectors) % _FOLDS for i in range(rest): fold_sizes[i] += 1 indices = range(len(sim_vectors)) rd.shuffle(indices) for i in range(_FOLDS): folds.append(indices[:fold_sizes[i]]) indices = indices[fold_sizes[i]:] pred = lm.LogisticRegression() brier = [] for index, fold in enumerate(folds): train_indices = set(range(len(sim_vectors))) - set(fold) train = [sim_vectors[i] for i in train_indices] train_target = [labels[i] for i in train_indices] pred.fit(train, train_target) test = np.array([list(sim_vectors[i]) for i in fold]) test_target = [labels[i] for i in fold] test_pred = [p[1] for p in pred.predict_proba(test)] brier.append(1.0 / len(test) * sum([(test_target[i] - test_pred[i])**2 for i in range(len(test))])) print 'Brier score' print brier print sum(brier) / len(brier) data = np.array(sim_vectors) classes = np.array(labels) pred = lm.LogisticRegression() scores = cross_validation.cross_val_score(pred, data, classes, 'accuracy', cv=5) print 'Accuracy' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'precision', cv=5) print 'Precision' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'recall', cv=5) print 'Recall' print scores print sum(scores) / len(scores) scores = cross_validation.cross_val_score(pred, data, classes, 'f1', cv=5) print 'f1' print scores print sum(scores) / len(scores)
def test_get_input_unlabeled(self):
    """ Tests function get_input for unlabeled case. """
    result = pre.get_input(self.testfilename)
    truth = (
        [[Reference(0, 'm jones',
                    'symbol intersect detect method improv spatial '
                    'intersect join',
                    ['e rundensteiner', 'y huang'], 'geoinformatica', None),
          Reference(1, 'matthew c jones',
                    'improv spatial intersect join symbol intersect detect',
                    ['e rundensteiner', 'h kuno', 'p marron', 'v taube',
                     'y ra'],
                    'sigmodels.intern manag data', None),
          Reference(2, 'matthew c jones',
                    'view materi techniqu complex hirarch object',
                    ['e rundensteiner', 'y huang'],
                    'ssd symposium larg spatial databas', None)],
         [Reference(3, 'mike w miller', 'domin draw bipartit graph',
                    ['l berg'],
                    'sigucc special interest group univers comput servic',
                    None),
          Reference(4, 'mike w miller', 'rel compromis statist databas', [],
                    'sigucc special interest group univers comput servic',
                    None)],
         [Reference(5, 'c chen', 'formal approach scenario analysi',
                    ['d kung', 'j samuel', 'j gao', 'p hsia', 'y toyoshima'],
                    'ieee softwar', None)],
         [Reference(6, 'jane j robinson', 'discours code clue context', [],
                    'acl meet the associ comput linguist', None),
          Reference(7, 'jane j robinson', 'diagram grammar dialogu', [],
                    'cooper interfac inform system', None)],
         [Reference(8, 'a gupta', 'iri h java distanc educ',
                    ['a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
                     'j wild', 'k maly', 's ghanem', 'x zhu'],
                    'acm journal educ resourc comput', None)],
         # NOTE: the title below is two adjacent literals with no space,
         # i.e. '...registfile' — preserved from the expected fixture.
         [Reference(9, 'mary d brown',
                    'intern redund represent limit bypass support pipelin '
                    'adder regist' 'file',
                    ['y patt'],
                    'proceed the th ieee intern symposium high '
                    'perform comput architectur hpca intern symposium high '
                    'perform comput architectur talk slide', None)]],
        ['m jones', 'e rundensteiner', 'y huang', 'matthew c jones',
         'e rundensteiner', 'h kuno', 'p marron', 'v taube', 'y ra',
         'matthew c jones', 'e rundensteiner', 'y huang', 'mike w miller',
         'l berg', 'mike w miller', 'c chen', 'd kung', 'j samuel', 'j gao',
         'p hsia', 'y toyoshima', 'jane j robinson', 'jane j robinson',
         'a gupta', 'a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
         'j wild', 'k maly', 's ghanem', 'x zhu', 'mary d brown', 'y patt'])
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(result, truth)
ktok = 1 for t in xrange(1, min(k, n - k) + 1): ntok *= n ktok *= t n -= 1 return ntok // ktok else: return 0 def bell_number(n): if n == 1: return 1 elif n in BELL_NUMBERS: return BELL_NUMBERS[n] else: bell = 0 for i in range(n): bell += choose(n-1, i) * bell_number(n-1) BELL_NUMBERS[n] = bell return bell if __name__ == '__main__': blocks, _ = pre.get_input('data/data.dat') blocks_bells = [] for block in blocks: blocks_bells.append(bell_number(len(block))) #print [len(block) for block in blocks] #print blocks_bells print math.log10(sum(blocks_bells))