Esempio n. 1
0
def load_instances(instances_lines, word_vectors):
    '''Build ReorderInstance objects from raw training lines

    Args:
        instances_lines: iterable of strings, one training example each
        word_vectors: an instance of vec.wordvector, handed to the parser

    Return:
        a list of ReorderInstance whose preWords and aftWords are both
        non-empty; instances with either side empty are discarded
    '''
    # Parse every line before filtering, so all lines are parsed even if
    # some of the resulting instances are later dropped.
    parsed = []
    for raw_line in instances_lines:
        parsed.append(ReorderInstance.paser_from_str(raw_line, word_vectors))

    kept = []
    for instance in parsed:
        if len(instance.preWords) == 0 or len(instance.aftWords) == 0:
            continue
        kept.append(instance)

    return kept
Esempio n. 2
0
def prepare_data(word_vectors=None, dataFile=None, unlabelFile=None):
    '''Prepare training data, handing one data file to each worker

    Uses the names ``rank``, ``comm``, ``worker_num`` and ``Reader`` from the
    enclosing module. ``comm`` is driven through ``bcast``/``send``/``recv``/
    ``barrier`` (presumably an mpi4py communicator -- confirm against the
    module header). Every rank must call this function, because the
    broadcasts and the barrier are collective.

    Args:
        word_vectors: an instance of vec.wordvector; the value given on
            rank 0 is broadcast and replaces whatever other ranks passed
        dataFile: sequence of raw training files, read on rank 0 only.
            Entry ``i`` is parsed by rank ``i``, so it needs at least
            ``worker_num`` entries
        unlabelFile: optional raw unlabeled file, read on rank 0 only; each
            rank uses its own value of this argument to decide whether the
            unlabeled instances are returned

    Return:
        instances: a list of ReorderInstance built from this rank's file
        instances_of_Unlabel: the unlabeled instances, or None when
            unlabelFile is None
        word_vectors
    '''
    if rank == 0:
        comm.bcast(word_vectors, root=0)

        instances_of_Unlabel = []
        if unlabelFile is not None:
            with Reader(unlabelFile) as unlabel_reader:
                lines_of_Unlabel = list(unlabel_reader)

            instances_of_Unlabel = [
                ReorderInstance.paser_from_unlabeled_str(line, word_vectors)
                for line in lines_of_Unlabel
            ]
            # Keep only instances that have words on both sides.
            instances_of_Unlabel = [
                inst for inst in instances_of_Unlabel
                if len(inst.preWords) != 0 and len(inst.aftWords) != 0
            ]

        # Broadcast unconditionally (an empty list when there is no
        # unlabeled file) so the matching bcast on other ranks never hangs.
        comm.bcast(instances_of_Unlabel, root=0)

        # One list of raw lines per data file, in dataFile order.
        instance_lines = []
        for path in dataFile:
            with Reader(path) as reader:
                instance_lines.append(list(reader))

        # Rank i receives the raw lines of file i and parses them itself.
        for i in range(1, worker_num):
            comm.send(instance_lines[i], dest=i)
        comm.barrier()

        instances = load_instances(instance_lines[0], word_vectors)

        # Drop the raw lines of every file once rank 0 has parsed its own.
        del instance_lines
    else:
        # obj is passed explicitly as None: non-root ranks contribute no
        # payload, and recent mpi4py releases require the argument
        # (assuming comm is an mpi4py communicator).
        word_vectors = comm.bcast(None, root=0)
        instances_of_Unlabel = comm.bcast(None, root=0)

        instances_lines = comm.recv(source=0)
        comm.barrier()

        instances = load_instances(instances_lines, word_vectors)

    if unlabelFile is None:
        return instances, None, word_vectors
    return instances, instances_of_Unlabel, word_vectors