def load_instances(instances_lines, word_vectors):
    '''Parse raw training lines into ReorderInstance objects.

    Args:
        instances_lines: iterable of strings, one training example each
        word_vectors: handed unchanged to ReorderInstance.paser_from_str
            (an instance of vec.wordvector)

    Return:
        instances: a list of the parsed instances, in input order, without
            those whose preWords or aftWords is empty
    '''
    # Parse lazily: an instance that fails the filter below is released
    # immediately instead of being kept in a second, unfiltered list.
    parsed = (ReorderInstance.paser_from_str(line, word_vectors)
              for line in instances_lines)
    return [inst for inst in parsed
            if len(inst.preWords) != 0 and len(inst.aftWords) != 0]
def prepare_data(word_vectors=None, dataFile=None, unlabelFile=None):
    '''Prepare the training data of the calling rank.

    Rank 0 reads all files, broadcasts the word vectors and the unlabeled
    instances, and sends the lines of data file i to rank i. Every rank
    then parses its own lines with load_instances.
    NOTE(review): comm / rank / worker_num are module-level names,
    presumably an mpi4py communicator and its rank / size - confirm.

    Args:
        word_vectors: an instance of vec.wordvector. Only the value given
            on rank 0 is used; the other ranks replace theirs with the
            broadcast one.
        dataFile: iterable of raw training file paths, read on rank 0 only.
            File i feeds rank i, so at least worker_num entries are needed
            (IndexError otherwise); extra files are read but not used.
            A single path string is NOT accepted: it would be iterated
            character by character.
        unlabelFile: optional path of a file with unlabeled examples, read
            on rank 0 only. Each rank uses its own value of this argument
            to decide whether the unlabeled instances are returned.

    Return:
        instances: a list of ReorderInstance built from this rank's lines
        instances_of_Unlabel: the unlabeled instances (without those whose
            preWords or aftWords is empty), or None when unlabelFile is None
        word_vectors
    '''
    if rank == 0:
        comm.bcast(word_vectors, root=0)

        unlabeled = []
        if unlabelFile is not None:
            with Reader(unlabelFile) as reader:
                unlabeled_lines = list(reader)
            parsed = (ReorderInstance.paser_from_unlabeled_str(line, word_vectors)
                      for line in unlabeled_lines)
            unlabeled = [inst for inst in parsed
                         if len(inst.preWords) != 0 and len(inst.aftWords) != 0]
        # Broadcast even when the list is empty: the other ranks always
        # take part in this second bcast.
        comm.bcast(unlabeled, root=0)

        # shards[i] holds the raw lines of the i-th data file.
        shards = []
        for path in dataFile:
            with Reader(path) as reader:
                shards.append(list(reader))

        for dest in range(1, worker_num):
            comm.send(shards[dest], dest=dest)

        # The other shards have already been handed to comm.send; drop them
        # so only this rank's lines stay referenced while parsing.
        own_lines = shards[0]
        del shards

        comm.barrier()
        instances = load_instances(own_lines, word_vectors)
    else:
        # Same order of communicator calls as on rank 0.
        word_vectors = comm.bcast(root=0)
        unlabeled = comm.bcast(root=0)
        own_lines = comm.recv(source=0)
        comm.barrier()
        instances = load_instances(own_lines, word_vectors)

    if unlabelFile is not None:
        return instances, unlabeled, word_vectors
    return instances, None, word_vectors