def cross_validate():
  references, corpus = pre.get_input(main._TEST_FILE, labeled=True)
  sim_vectors, labels = mod.model(references, corpus, labeled=True)
  sim_vectors = sum(sim_vectors, [])

  folds = []
  fold_sizes = [len(sim_vectors) / _FOLDS for _ in range(_FOLDS)]
  rest = len(sim_vectors) % _FOLDS
  for i in range(rest):
    fold_sizes[i] += 1
  indices = range(len(sim_vectors))
  rd.shuffle(indices)
  for i in range(_FOLDS):
    folds.append(indices[:fold_sizes[i]])
    indices = indices[fold_sizes[i]:]
  
  pred = lm.LogisticRegression()
  brier = []
  for index, fold in enumerate(folds):
    train_indices = set(range(len(sim_vectors))) - set(fold)
    train = [sim_vectors[i] for i in train_indices]
    train_target = [labels[i] for i in train_indices]
    pred.fit(train, train_target)
    test = np.array([list(sim_vectors[i]) for i in fold])
    test_target = [labels[i] for i in fold]
    test_pred = [p[1] for p in pred.predict_proba(test)]
    brier.append(1.0/len(test) * sum([(test_target[i] - test_pred[i]) ** 2 for i
        in range(len(test))]))
  print 'Brier score'
  print brier
  print sum(brier) / len(brier)

  data = np.array(sim_vectors)
  classes = np.array(labels)
  pred = lm.LogisticRegression()

  scores = cross_validation.cross_val_score(pred, data, classes, 'accuracy', 
      cv=5)
  print 'Accuracy'
  print scores
  print sum(scores) / len(scores)

  scores = cross_validation.cross_val_score(pred, data, classes, 'precision',
      cv=5)
  print 'Precision'
  print scores
  print sum(scores) / len(scores)

  scores = cross_validation.cross_val_score(pred, data, classes, 'recall', 
      cv=5)
  print 'Recall'
  print scores
  print sum(scores) / len(scores)

  scores = cross_validation.cross_val_score(pred, data, classes, 'f1', cv=5)
  print 'f1'
  print scores
  print sum(scores) / len(scores)
def probabilistic_disambiguation(print_time=False):
  """ Performs probabilistic disambiguation, including the training phase,
      for a set of references.

  Observations:
    - The returned values are serialized since they are the same between
      different iterations.

  Args:
    print_time: whether time elapsed during this function execution should be
      printed.

  Returns:
    A blocked list of references to be ranked and a list of probabilistic
    matrices, one for each block.
  """
  time_i = time.time()

  if os.path.isfile(_PKL_PROBS) and os.path.isfile(_PKL_REFS):
    # Cached results exist: load both pickles instead of recomputing.
    # Bug fix: pickle files are opened in binary mode ('rb'/'wb' below);
    # text mode breaks on Windows and on binary pickle protocols.
    with open(_PKL_REFS, 'rb') as pkl_file:
      references = pkl.load(pkl_file)
    with open(_PKL_PROBS, 'rb') as pkl_file:
      probs = pkl.load(pkl_file)
  else:
    references, corpus = pre.get_input(_TRAINING_FILE, labeled=True)
    pred = lear.train(references, corpus)
    references, corpus = pre.get_input(_TEST_FILE, limit=_INPUT_LIMIT)
    # pkl.dump returns None; the original assigned it to probs, which was
    # misleading dead code.
    with open(_PKL_REFS, 'wb') as pkl_file:
      pkl.dump(references, pkl_file)
    probs = lear.test(references, corpus, pred)
    with open(_PKL_PROBS, 'wb') as pkl_file:
      pkl.dump(probs, pkl_file)

  time_f = time.time()
  if print_time:
    with open(_TIME_FILE, 'a') as time_file:
      print >> time_file, 'PROBABILISTIC_DISAMBIGUATION'
      print >> time_file, time_f - time_i

  return references, probs
 def test_modeling_unlabeled(self):
   """ Checks that the modeling function assembles its result correctly for
       the unlabeled case; the individual pieces are tested separately. """
   references, corpus = pre.get_input(self.testfilename)
   expected = [[(0.6506800700017669, 2.5, 1, 0.8571428571428571, 0),
       (0.6506800700017669, 3.5, 2, 0.0, 0),
       (1.0, 4.5, 1, 0.0, 0)]]
   actual = mod.model([references[0]], corpus)
   self.assertEquals(actual, expected)
 def test_get_base_partitioning(self):
   """ Tests the get_base_partitioning function. """
   references, corpus = pre.get_input(self.testfilename, labeled=True)
   # Keep blocks 0, 1 and 3, and merge blocks 2, 4 and 5 into one block.
   merged = references[2] + references[4] + references[5]
   references = [references[0], references[1], references[3], merged]
   pred = lear.train(references, corpus)
   probs = lear.test(references, corpus, pred)
   matrices = part.get_probability_matrices(references, probs)
   distances = [part.transform_distance_matrix(m) for m in matrices]
   self.assertEqual(part.get_base_partitioning(distances[0]), ([0, 0, 0], 1))
 def test_modeling_labeled(self):
   """ Checks that the modeling function assembles its result correctly for
       the labeled case; the individual pieces are tested separately. """
   references, corpus = pre.get_input(self.testfilename, labeled=True)
   expected_vectors = [[(0.6506800700017669, 2.5, 1, 0.8571428571428571, 0),
       (0.6506800700017669, 3.5, 2, 0.0, 0),
       (0.0, 0, 0, 0.0, 0),
       (1.0, 4.5, 1, 0.0, 0),
       (0.0, 0, 0, 0.0, 0),
       (0.0, 0, 0, 0.0, 0)]]
   expected_labels = [1, 1, 0, 1, 0, 0]
   actual = mod.model([references[0] + references[2]], corpus, labeled=True)
   self.assertEquals(actual, (expected_vectors, expected_labels))
  def test_testing(self):
    """ Tests the test phase.

    Observations:
      - Assumes the error in the training set is zero, which the testing
        phase relies on.
    """
    references, corpus = pre.get_input(self.testfilename, labeled=True)
    # Keep blocks 0, 1 and 3, and merge blocks 2, 4 and 5 into one block.
    merged = references[2] + references[4] + references[5]
    references = [references[0], references[1], references[3], merged]
    pred = lear.train(references, corpus)
    sim_vectors, classes = mod.model(references, corpus, labeled=True)
    flat_probs = sum(lear.test(references, corpus, pred), [])
    self.assertEqual([round(p) for p in flat_probs], classes)
 def test_training(self):
   """ Tests the training phase.

     Observations:
       - Assumes there is no error in the training, which is not always the
         case; it holds for small inputs and is a reasonable approximation
         test.
   """
   references, corpus = pre.get_input(self.testfilename, labeled=True)
   # Keep blocks 0, 1 and 3, and merge blocks 2, 4 and 5 into one block.
   merged = references[2] + references[4] + references[5]
   references = [references[0], references[1], references[3], merged]
   pred = lear.train(references, corpus)
   sim_vectors, classes = mod.model(references, corpus, labeled=True)
   flat_vectors = sum(sim_vectors, [])
   self.assertEqual(pred.predict(flat_vectors).tolist(), classes)
Esempio n. 8
0
    def test_testing(self):
        """ Tests the test phase.

        Observations:
          - Assumes the error in the training set is zero, which the testing
            phase relies on.
        """
        references, corpus = pre.get_input(self.testfilename, labeled=True)
        # Keep blocks 0, 1 and 3; merge blocks 2, 4 and 5 into one block.
        merged = references[2] + references[4] + references[5]
        references = [references[0], references[1], references[3], merged]
        pred = lear.train(references, corpus)
        sim_vectors, classes = mod.model(references, corpus, labeled=True)
        flat_probs = sum(lear.test(references, corpus, pred), [])
        self.assertEqual([round(p) for p in flat_probs], classes)
 def test_get_probability_matrix(self):
     """ Tests the get_probability_matrix function. """
     references, corpus = pre.get_input(self.testfilename, labeled=True)
     # Regroup the fixture blocks: keep blocks 0, 1 and 3 as-is and merge
     # blocks 2, 4 and 5 into a single fourth block.
     references = [references[0], references[1], references[3]] + \
         [references[2] + references[4] + references[5]]
     pred = lear.train(references, corpus)
     probs = lear.test(references, corpus, pred)
     # One symmetric matrix per block with a unit diagonal; the off-diagonal
     # values are pairwise probabilities pinned by the fixture file.
     self.assertEqual(
         part.get_probability_matrices(references, probs),
         [[[1.0, 0.89441830645266462, 0.95097107828998639],
           [0.89441830645266462, 1.0, 0.97565300931621723],
           [0.95097107828998639, 0.97565300931621723, 1.0]],
          [[1.0, 0.73429831405564638], [0.73429831405564638, 1.0]],
          [[1.0, 0.60153464586560224], [0.60153464586560224, 1.0]],
          [[1.0, 0.4067916074419064, 0.4067916074419064],
           [0.4067916074419064, 1.0, 0.4067916074419064],
           [0.4067916074419064, 0.4067916074419064, 1.0]]])
 def test_get_probability_matrix(self):
   """ Tests the get_probability_matrix function. """
   references, corpus = pre.get_input(self.testfilename, labeled=True)
   # Regroup the fixture blocks: keep blocks 0, 1 and 3 as-is and merge
   # blocks 2, 4 and 5 into a single fourth block.
   references = [references[0], references[1], references[3]] + \
       [references[2] + references[4] + references[5]]
   pred = lear.train(references, corpus)
   probs = lear.test(references, corpus, pred)
   # One symmetric matrix per block with a unit diagonal; the off-diagonal
   # values are pairwise probabilities pinned by the fixture file.
   self.assertEqual(part.get_probability_matrices(references, probs),[
       [[1.0, 0.89441830645266462, 0.95097107828998639],
        [0.89441830645266462, 1.0, 0.97565300931621723],
        [0.95097107828998639, 0.97565300931621723, 1.0]],
       [[1.0, 0.73429831405564638],
        [0.73429831405564638, 1.0]],
       [[1.0, 0.60153464586560224],
        [0.60153464586560224, 1.0]],
       [[1.0, 0.4067916074419064, 0.4067916074419064],
        [0.4067916074419064, 1.0, 0.4067916074419064],
        [0.4067916074419064, 0.4067916074419064, 1.0]]])
  def test_get_input_unlabeled(self):
    """ Tests function get_input for unlabeled case. """
    result = pre.get_input(self.testfilename)
    # Expected value: a pair of (1) the references grouped into blocks and
    # (2) the flat list of every author and coauthor name in order of
    # appearance; all values are pinned by the fixture file.
    truth = (
      [[Reference(0, 'm jones', 
          'symbol intersect detect method improv spatial intersect join', 
          ['e rundensteiner', 'y huang'], 'geoinformatica', None),
        Reference(1, 'matthew c jones', 
            'improv spatial intersect join symbol intersect detect', 
            ['e rundensteiner', 'h kuno', 'p marron', 'v taube', 'y ra'], 
            'sigmodels.intern manag data', None),
        Reference(2, 'matthew c jones',
            'view materi techniqu complex hirarch object', ['e rundensteiner',
            'y huang'], 'ssd symposium larg spatial databas', None)],
      [Reference(3, 'mike w miller', 'domin draw bipartit graph', 
          ['l berg'], 'sigucc special interest group univers comput servic',
          None),
        Reference(4, 'mike w miller', 'rel compromis statist databas', 
            [], 'sigucc special interest group univers comput servic', None)],
      [Reference(5, 'c chen', 'formal approach scenario analysi',
          ['d kung', 'j samuel', 'j gao', 'p hsia', 'y toyoshima'],
          'ieee softwar', None)],
      [Reference(6, 'jane j robinson', 'discours code clue context', [], 
          'acl meet the associ comput linguist', None),
        Reference(7, 'jane j robinson', 'diagram grammar dialogu', [],
            'cooper interfac inform system', None)],
      [Reference(8, 'a gupta', 'iri h java distanc educ', ['a gonzalez', 
          'a hamid', 'c overstreet', 'h wahab', 'j wild', 'k maly', 's ghanem',
          'x zhu'], 'acm journal educ resourc comput', None)],
      [Reference(9, 'mary d brown',
          'intern redund represent limit bypass support pipelin adder regist'
          'file', ['y patt'], 'proceed the th ieee intern symposium high '
          'perform comput architectur hpca intern symposium high perform '
          'comput architectur talk slide', None)]],

    ['m jones', 'e rundensteiner', 'y huang', 'matthew c jones', 
        'e rundensteiner', 'h kuno', 'p marron', 'v taube', 'y ra', 
        'matthew c jones', 'e rundensteiner', 'y huang', 'mike w miller',
        'l berg', 'mike w miller', 'c chen', 'd kung', 'j samuel', 'j gao',
        'p hsia', 'y toyoshima', 'jane j robinson', 'jane j robinson',
        'a gupta', 'a gonzalez', 'a hamid', 'c overstreet', 'h wahab', 'j wild',
        'k maly', 's ghanem', 'x zhu', 'mary d brown', 'y patt'])
    self.assertEquals(result, truth)
Esempio n. 12
0
def cross_validate():
    references, corpus = pre.get_input(main._TEST_FILE, labeled=True)
    sim_vectors, labels = mod.model(references, corpus, labeled=True)
    sim_vectors = sum(sim_vectors, [])

    folds = []
    fold_sizes = [len(sim_vectors) / _FOLDS for _ in range(_FOLDS)]
    rest = len(sim_vectors) % _FOLDS
    for i in range(rest):
        fold_sizes[i] += 1
    indices = range(len(sim_vectors))
    rd.shuffle(indices)
    for i in range(_FOLDS):
        folds.append(indices[:fold_sizes[i]])
        indices = indices[fold_sizes[i]:]

    pred = lm.LogisticRegression()
    brier = []
    for index, fold in enumerate(folds):
        train_indices = set(range(len(sim_vectors))) - set(fold)
        train = [sim_vectors[i] for i in train_indices]
        train_target = [labels[i] for i in train_indices]
        pred.fit(train, train_target)
        test = np.array([list(sim_vectors[i]) for i in fold])
        test_target = [labels[i] for i in fold]
        test_pred = [p[1] for p in pred.predict_proba(test)]
        brier.append(1.0 / len(test) * sum([(test_target[i] - test_pred[i])**2
                                            for i in range(len(test))]))
    print 'Brier score'
    print brier
    print sum(brier) / len(brier)

    data = np.array(sim_vectors)
    classes = np.array(labels)
    pred = lm.LogisticRegression()

    scores = cross_validation.cross_val_score(pred,
                                              data,
                                              classes,
                                              'accuracy',
                                              cv=5)
    print 'Accuracy'
    print scores
    print sum(scores) / len(scores)

    scores = cross_validation.cross_val_score(pred,
                                              data,
                                              classes,
                                              'precision',
                                              cv=5)
    print 'Precision'
    print scores
    print sum(scores) / len(scores)

    scores = cross_validation.cross_val_score(pred,
                                              data,
                                              classes,
                                              'recall',
                                              cv=5)
    print 'Recall'
    print scores
    print sum(scores) / len(scores)

    scores = cross_validation.cross_val_score(pred, data, classes, 'f1', cv=5)
    print 'f1'
    print scores
    print sum(scores) / len(scores)
Esempio n. 13
0
 def test_get_input_unlabeled(self):
     """ Tests function get_input for unlabeled case. """
     result = pre.get_input(self.testfilename)
     # Expected value: a pair of (1) the references grouped into blocks and
     # (2) the flat list of every author and coauthor name in order of
     # appearance; all values are pinned by the fixture file.
     truth = ([
         [
             Reference(
                 0, 'm jones',
                 'symbol intersect detect method improv spatial intersect join',
                 ['e rundensteiner', 'y huang'], 'geoinformatica', None),
             Reference(
                 1, 'matthew c jones',
                 'improv spatial intersect join symbol intersect detect', [
                     'e rundensteiner', 'h kuno', 'p marron', 'v taube',
                     'y ra'
                 ], 'sigmodels.intern manag data', None),
             Reference(2, 'matthew c jones',
                       'view materi techniqu complex hirarch object',
                       ['e rundensteiner', 'y huang'],
                       'ssd symposium larg spatial databas', None)
         ],
         [
             Reference(
                 3, 'mike w miller', 'domin draw bipartit graph',
                 ['l berg'],
                 'sigucc special interest group univers comput servic',
                 None),
             Reference(
                 4, 'mike w miller', 'rel compromis statist databas', [],
                 'sigucc special interest group univers comput servic',
                 None)
         ],
         [
             Reference(
                 5, 'c chen', 'formal approach scenario analysi',
                 ['d kung', 'j samuel', 'j gao', 'p hsia', 'y toyoshima'],
                 'ieee softwar', None)
         ],
         [
             Reference(6, 'jane j robinson', 'discours code clue context',
                       [], 'acl meet the associ comput linguist', None),
             Reference(7, 'jane j robinson', 'diagram grammar dialogu', [],
                       'cooper interfac inform system', None)
         ],
         [
             Reference(8, 'a gupta', 'iri h java distanc educ', [
                 'a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
                 'j wild', 'k maly', 's ghanem', 'x zhu'
             ], 'acm journal educ resourc comput', None)
         ],
         [
             Reference(
                 9, 'mary d brown',
                 'intern redund represent limit bypass support pipelin adder regist'
                 'file', ['y patt'],
                 'proceed the th ieee intern symposium high '
                 'perform comput architectur hpca intern symposium high perform '
                 'comput architectur talk slide', None)
         ]
     ], [
         'm jones', 'e rundensteiner', 'y huang', 'matthew c jones',
         'e rundensteiner', 'h kuno', 'p marron', 'v taube', 'y ra',
         'matthew c jones', 'e rundensteiner', 'y huang', 'mike w miller',
         'l berg', 'mike w miller', 'c chen', 'd kung', 'j samuel', 'j gao',
         'p hsia', 'y toyoshima', 'jane j robinson', 'jane j robinson',
         'a gupta', 'a gonzalez', 'a hamid', 'c overstreet', 'h wahab',
         'j wild', 'k maly', 's ghanem', 'x zhu', 'mary d brown', 'y patt'
     ])
     self.assertEquals(result, truth)
    ktok = 1
    for t in xrange(1, min(k, n - k) + 1):
      ntok *= n
      ktok *= t
      n -= 1
    return ntok // ktok
  else:
    return 0


def bell_number(n):
  """ Returns the n-th Bell number (count of partitions of an n-element set),
      memoizing results in the module-level BELL_NUMBERS dict.

  Uses the standard recurrence B(n) = sum_{i=0}^{n-1} C(n-1, i) * B(i).

  Args:
    n: non-negative integer.

  Returns:
    The n-th Bell number as an integer.
  """
  if n <= 1:
    # B(0) = B(1) = 1. (The original returned 0 for n == 0, which is wrong.)
    return 1
  elif n in BELL_NUMBERS:
    return BELL_NUMBERS[n]
  else:
    # Bug fix: the recurrence multiplies C(n-1, i) by B(i), not by B(n-1).
    # The old code computed 2^(n-1) * B(n-1) (e.g. 8 instead of 5 for n=3).
    bell = 0
    for i in range(n):
      bell += choose(n-1, i) * bell_number(i)
    BELL_NUMBERS[n] = bell
    return bell

if __name__ == '__main__':
  blocks, _ = pre.get_input('data/data.dat')
  blocks_bells = []
  for block in blocks:
    blocks_bells.append(bell_number(len(block)))
  #print [len(block) for block in blocks]
  #print blocks_bells
  print math.log10(sum(blocks_bells))