Ejemplo n.º 1
0
def load_instances(instance_strs, word_vectors):
  '''Parse training examples and count their weighted internal nodes.

  Args:
    instance_strs: iterable of strings, one training example each
    word_vectors: an instance of vec.wordvector; passed unchanged to
      Instance.parse_from_str

  Return:
    A pair (instances, total_internal_node):
      instances: list of Instance objects, in the order of instance_strs
      total_internal_node: sum over all instances of
        (len(instance.words) - 1) * instance.freq
  '''
  parsed = [Instance.parse_from_str(text, word_vectors)
            for text in instance_strs]
  # Each instance contributes (word count - 1) nodes, weighted by its freq.
  node_total = sum((len(item.words) - 1) * item.freq for item in parsed)
  return parsed, node_total
Ejemplo n.º 2
0
def load_instances(instance_strs, word_vectors):
  '''Build Instance objects from raw strings and total their internal nodes.

  Args:
    instance_strs: iterable of strings; each string is a training example
    word_vectors: an instance of vec.wordvector, forwarded as-is to
      Instance.parse_from_str

  Return:
    (instances, total_internal_node) where
      instances: a list of Instance, one per input string
      total_internal_node: the sum of (len(instance.words) - 1) * instance.freq
        taken over every parsed instance
  '''
  examples = [Instance.parse_from_str(raw, word_vectors)
              for raw in instance_strs]
  # Per-example node counts, each scaled by the example's freq attribute.
  weighted_nodes = ((len(ex.words) - 1) * ex.freq for ex in examples)
  return examples, sum(weighted_nodes)
Ejemplo n.º 3
0
def load_instances(instance_strs, word_vectors):
    """Parse training examples and tally their weighted internal nodes.

    Args:
        instance_strs: iterable of strings, each one a training example
        word_vectors: an instance of vec.wordvector, handed unchanged to
            Instance.parse_from_str

    Return:
        A pair (instances, total_internal_node):
            instances: a list of Instance, in input order
            total_internal_node: sum over all instances of
                (len(instance.words) - 1) * instance.freq
    """
    parsed = [
        Instance.parse_from_str(line, word_vectors) for line in instance_strs
    ]
    # A phrase of n words becomes a single phrase vector after n-1 merges,
    # so it has n-1 internal nodes; each count is weighted by the freq.
    internal_nodes = sum(
        (len(entry.words) - 1) * entry.freq for entry in parsed
    )
    return parsed, internal_nodes
 
 print >> stderr, 'load RAE parameters...'
 theta = unpickle(theta_file)
 rae = RecursiveAutoencoder.build(theta, embsize)
   
 total_cost = 0
 total_instance_num = 0
 total_internal_node_num = 0
 
 print '='*63
 print '%20s %20s %20s' % ('all', 'avg/node', 'internal node')
 print '-'*63
 
 with Reader(phrases_file) as reader, Writer(output_file) as writer:
   for phrase in reader:
     instance = Instance.parse_from_str(phrase, word_vectors)
     words_embedded = word_vectors[instance.words]
     root_node, cost = rae.forward(words_embedded)
     # print "root node: ",root_node
     # vec = root_node.p.T[0] # convert n*1 vector to common vector
     vec=[]
     for node in root_node:
       vec.append(node.p.T[0])
     # for nodes in root_node:
     #   print "n:",nodes.p.shape
     # continue
     # writer.write(' '.join([str(vec[i]) for i in range(vec.size)]))
     for j in range(len(vec)): 
       v=vec[j]
       writer.write(' '.join([str(v[i]) for i in range(v.size)]))
       if not j==(len(vec)-1):
Ejemplo n.º 5
0
    print >> stderr, 'load RAE parameters...'
    theta = unpickle(theta_file)
    rae = RecursiveAutoencoder.build(theta, embsize)

    total_cost = 0
    total_instance_num = 0
    total_internal_node_num = 0

    print '=' * 63
    print '%20s %20s %20s' % ('all', 'avg/node', 'internal node')
    print '-' * 63

    with Reader(phrases_file) as reader, Writer(output_file) as writer:
        for phrase in reader:
            instance = Instance.parse_from_str(phrase, word_vectors)
            words_embedded = word_vectors[instance.words]
            root_node, cost = rae.forward(words_embedded)
            vec = root_node.p.T[0]  # convert n*1 vector to common vector
            writer.write(' '.join([str(vec[i]) for i in range(vec.size)]))
            writer.write('\n')

            internal_node_num = len(instance.words) - 1
            if internal_node_num > 0:
                print '%20.8f, %20.8f, %18d' % (cost, cost / internal_node_num,
                                                internal_node_num)
            else:
                print '%20.8f, %20.8f, %18d' % (cost, cost, 0)

            total_cost += cost
            total_instance_num += 1