def run(self, walk_filename, representation_size, window_size, model_filename):
    '''
    Training
    ----------------------------------------------
    Parameters:
        walk_filename: file in which to store the random walk paths
            type: str
        representation_size: dimensionality of the representation space
            type: int
        window_size: window size
            type: int
        model_filename: file in which to store the trained model
            type: str
    '''
    num_walks = self.random_walk.num_paths * len(self.graph)
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * self.random_walk.path_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking....")
    walks = self.random_walk.generate(self.graph)
    DataWriter.write_walks(walk_filename, walks)

    walks_corpus = DataReader.load_walks(walk_filename)
    model = Skipgram(sentences=walks_corpus,
                     size=representation_size,
                     window=window_size,
                     min_count=0,
                     trim_rule=None,
                     workers=self.workers)

    # model.wv.save_word2vec_format(walk_filename[:8], )
    model.save(model_filename)
    print('Done.')
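# The Skipgram class used above is not shown in this snippet. Below is a minimal
# sketch of what such a wrapper usually looks like, assuming it is a thin subclass
# of gensim's Word2Vec (3.x API, which accepts `size=` and `trim_rule=`) that
# forces skip-gram training; the project's actual class may differ.
from gensim.models import Word2Vec

class Skipgram(Word2Vec):
    '''Word2Vec configured for skip-gram training over random-walk "sentences".'''

    def __init__(self, **kwargs):
        kwargs['sg'] = 1  # skip-gram rather than CBOW
        kwargs['hs'] = 1  # hierarchical softmax
        super(Skipgram, self).__init__(**kwargs)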
def write_dataset(dataset: List[Dict[str, Any]], partitions: List[str], fractions: List[float], output_folder: str):
    # Initialize writers and counters
    writers: Dict[str, DataWriter] = dict()
    label_counters: Dict[str, Counter] = dict()
    for partition in partitions:
        writer = DataWriter(os.path.join(output_folder, partition),
                            chunk_size=5000,
                            file_prefix='data',
                            file_suffix='jsonl.gz')
        writers[partition] = writer
        label_counters[partition] = Counter()

    # Write all samples
    for sample in dataset:
        partition = get_partition(partitions, fractions)
        writers[partition].add(sample)
        label_counters[partition][sample[OUTPUT]] += 1

    # Close all writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
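# get_partition is not defined in this snippet. A minimal sketch follows, assuming
# it simply draws a partition name at random according to the given fractions; the
# name and behavior are assumptions, not the project's actual helper.
import random
from typing import List

def get_partition(partitions: List[str], fractions: List[float]) -> str:
    # Walk the cumulative distribution and return the first partition whose
    # cumulative fraction exceeds the random draw.
    r = random.random()
    cumulative = 0.0
    for partition, fraction in zip(partitions, fractions):
        cumulative += fraction
        if r < cumulative:
            return partition
    return partitions[-1]  # guard against floating-point round-off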
def tokenize_dataset(input_folder: str, output_folder: str, chunk_size: int):
    make_dir(output_folder)

    data_writers = {
        TRAIN: DataWriter(os.path.join(output_folder, TRAIN), file_prefix='data', file_suffix='jsonl.gz', chunk_size=chunk_size),
        VALID: DataWriter(os.path.join(output_folder, VALID), file_prefix='data', file_suffix='jsonl.gz', chunk_size=chunk_size),
        TEST: DataWriter(os.path.join(output_folder, TEST), file_prefix='data', file_suffix='jsonl.gz', chunk_size=chunk_size)
    }
    partition_counters = {TRAIN: Counter(), VALID: Counter(), TEST: Counter()}

    for i, (sample, partition) in enumerate(data_generator(input_folder)):
        data_writers[partition].add(sample)
        partition_counters[partition][sample[OUTPUT]] += 1

        if (i + 1) % chunk_size == 0:
            print('Wrote {0} samples.'.format(i + 1), end='\r')
    print()

    # Close all writers
    for writer in data_writers.values():
        writer.close()

    print(partition_counters)
def create_dataset(output_folder: str, num_samples: int, seq_length: int):
    with DataWriter(output_folder, file_prefix='data', file_suffix='jsonl.gz', chunk_size=5000) as writer:
        for sample_id in range(num_samples):
            # Randomly generate input points
            xs = np.sort(np.random.uniform(low=MIN_VALUE, high=MAX_VALUE, size=(seq_length, )))

            # Assign the label uniformly at random
            label = 1 if np.random.uniform(low=0.0, high=1.0) < 0.5 else 0

            if label == 1:
                # Cubic
                inputs = np.power(xs, 3)
            else:
                # Linear
                inputs = xs

            sample = {
                INPUTS: np.expand_dims(inputs, axis=-1).astype(float).tolist(),
                OUTPUT: label,
                SAMPLE_ID: sample_id
            }
            writer.add(sample)
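# Example invocation of create_dataset via a small argparse driver. The flag names
# and defaults here are illustrative assumptions; the project's actual entry point
# may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--output-folder', type=str, required=True)
    parser.add_argument('--num-samples', type=int, default=10000)
    parser.add_argument('--seq-length', type=int, default=50)
    args = parser.parse_args()

    create_dataset(output_folder=args.output_folder,
                   num_samples=args.num_samples,
                   seq_length=args.seq_length)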
def write_dataset(input_folder: str, output_folder: str, series: str):
    input_path = os.path.join(input_folder, 'RightWhaleCalls_{0}.arff'.format(series.upper()))

    if series == TRAIN:
        writers = {
            TRAIN: DataWriter(os.path.join(output_folder, TRAIN), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE),
            VALID: DataWriter(os.path.join(output_folder, VALID), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE)
        }
        label_counters = {TRAIN: Counter(), VALID: Counter()}
    else:
        writers = {
            TEST: DataWriter(os.path.join(output_folder, TEST), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE)
        }
        label_counters = {TEST: Counter()}

    for index, sample in enumerate(data_generator(input_path)):
        # Randomly split the training series into train and validation partitions
        if series == TRAIN:
            partition = TRAIN if random.random() < TRAIN_FRAC else VALID
        else:
            partition = TEST

        writers[partition].add(sample)
        label_counters[partition][sample[OUTPUT]] += 1

        if (index + 1) % CHUNK_SIZE == 0:
            print('Completed {0} samples'.format(index + 1), end='\r')
    print()

    # Close all writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
def merge_datasets(folders: List[str], output_folder: str, file_prefix: str, file_suffix: str, chunk_size: int):
    with DataWriter(output_folder, file_prefix=file_prefix, file_suffix=file_suffix, chunk_size=chunk_size) as writer:
        data_files = chain(*(iterate_files(folder, pattern=f'.*{file_suffix}') for folder in folders))

        sample_id = 0
        for data_file in data_files:
            for sample in read_by_file_suffix(data_file):
                # Re-assign sample ids so they are unique across the merged dataset
                sample[SAMPLE_ID] = sample_id
                writer.add(sample)
                sample_id += 1

                if sample_id % chunk_size == 0:
                    print('Completed {0} samples.'.format(sample_id), end='\r')
        print()
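# Example of merging two chunked datasets into a single folder with fresh sample
# ids. The folder paths below are placeholders, not paths from the project.
merge_datasets(folders=['data/part_a', 'data/part_b'],
               output_folder='data/merged',
               file_prefix='data',
               file_suffix='jsonl.gz',
               chunk_size=5000)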
def tokenize_dataset(input_file: str, output_folder: str, window_size: int, noise: float, reps: int, num_features: Optional[int]):
    with DataWriter(output_folder, file_prefix='data', chunk_size=10000, file_suffix='jsonl.gz') as writer:
        label_counter: Counter = Counter()

        for index, sample in enumerate(dataset_iterator(input_file, window_size, noise=noise, reps=reps, num_features=num_features)):
            label_counter[sample[OUTPUT]] += 1
            writer.add(sample)

            if (index + 1) % 500 == 0:
                print('Completed {0} samples.'.format(index + 1), end='\r')
        print()

    print(label_counter)
def write_dataset(data: np.ndarray, output_folder: str, series: str):
    # Create the data writers
    if series == TRAIN:
        writers = {
            TRAIN: DataWriter(os.path.join(output_folder, TRAIN), file_prefix='data', chunk_size=CHUNK_SIZE, file_suffix='jsonl.gz'),
            VALID: DataWriter(os.path.join(output_folder, VALID), file_prefix='data', chunk_size=CHUNK_SIZE, file_suffix='jsonl.gz')
        }
        label_counters = {TRAIN: Counter(), VALID: Counter()}
    else:
        writers = {
            TEST: DataWriter(os.path.join(output_folder, TEST), file_prefix='data', chunk_size=CHUNK_SIZE, file_suffix='jsonl.gz')
        }
        label_counters = {TEST: Counter()}

    sample_id = 0
    for index, features in enumerate(data):
        # The first column holds the label; the rest are the input sequence
        label = int(features[0])
        input_features = features[1:].reshape(-1, 1).astype(float).tolist()

        # Get the data partition
        if series == TRAIN:
            partition = TRAIN if random.random() < TRAIN_FRAC else VALID
        else:
            partition = TEST

        # Split each sequence into overlapping windows and add each window
        # to the corresponding data writer
        for i in range(0, len(input_features) - WINDOW_SIZE + 1, STRIDE):
            sample = {
                SAMPLE_ID: sample_id,
                OUTPUT: label,
                INPUTS: input_features[i:i + WINDOW_SIZE],
            }

            writers[partition].add(sample)
            label_counters[partition][label] += 1
            sample_id += 1

        if (index + 1) % CHUNK_SIZE == 0:
            print('Completed {0} samples.'.format(index + 1), end='\r')
    print()

    # Close all data writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
def split_dataset(input_folder: str, output_folder: str, fractions: List[float], file_prefix: str, chunk_size: int, file_type: str):
    assert len(fractions) == len(PARTITIONS), 'Must provide enough fractions to account for all partitions'
    assert file_type in FILE_TYPES, f'Invalid file type: {file_type}'

    # Make output folder if necessary
    make_dir(output_folder)

    # Create the data manager
    data_manager = get_data_manager(input_folder, SAMPLE_ID, DATA_FIELDS, extension=file_type)
    data_manager.load()

    data_iterator = data_manager.iterate(should_shuffle=False, batch_size=chunk_size)
    num_samples = data_manager.length

    # Get folders for each partition
    train_folder = os.path.join(output_folder, TRAIN)
    valid_folder = os.path.join(output_folder, VALID)
    test_folder = os.path.join(output_folder, TEST)

    # Track counts per partition
    partition_counters: Counter = Counter()

    # Create data writers
    if file_type == 'npz':
        partition_writers = {
            TRAIN: NpzDataWriter(train_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, sample_id_name=SAMPLE_ID, data_fields=DATA_FIELDS, mode='w'),
            VALID: NpzDataWriter(valid_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, sample_id_name=SAMPLE_ID, data_fields=DATA_FIELDS, mode='w'),
            TEST: NpzDataWriter(test_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, sample_id_name=SAMPLE_ID, data_fields=DATA_FIELDS, mode='w')
        }
    else:
        partition_writers = {
            TRAIN: DataWriter(train_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, mode='w'),
            VALID: DataWriter(valid_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, mode='w'),
            TEST: DataWriter(test_folder, file_prefix=file_prefix, file_suffix=file_type, chunk_size=chunk_size, mode='w')
        }

    # Write to chunked files
    for index, sample in enumerate(data_iterator):
        partition_index = get_partition_index(sample, fractions)
        partition_folder = PARTITIONS[partition_index]

        partition_writers[partition_folder].add(sample)
        partition_counters[partition_folder] += 1

        if (index + 1) % chunk_size == 0:
            print(f'Completed {index + 1}/{num_samples} samples.', end='\r')
    print()

    # Flush any remaining data samples
    for writer in partition_writers.values():
        writer.flush()

    # Print out metrics and save metadata
    print('====== RESULTS ======')

    total = sum(partition_counters.values())
    metadata: Dict[str, Dict[str, float]] = dict()
    for series in PARTITIONS:
        count = partition_counters[series]
        frac = count / total
        metadata[series] = dict(count=count, frac=frac)
        print(f'{series.capitalize()}: {count} ({frac:.03f})')

    metadata_file = os.path.join(output_folder, 'metadata.json')
    save_by_file_suffix(metadata, metadata_file)
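# get_partition_index is not shown above. One plausible implementation, sketched
# here purely as an assumption, hashes the sample id so the split is deterministic
# across runs and maps the hash into the cumulative fractions. SAMPLE_ID is the
# same field name constant used throughout this module.
import hashlib
from typing import Any, Dict, List

def get_partition_index(sample: Dict[str, Any], fractions: List[float]) -> int:
    digest = hashlib.md5(str(sample[SAMPLE_ID]).encode('utf-8')).hexdigest()
    point = (int(digest, 16) % 10000) / 10000.0  # pseudo-random value in [0, 1)

    cumulative = 0.0
    for index, fraction in enumerate(fractions):
        cumulative += fraction
        if point < cumulative:
            return index
    return len(fractions) - 1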
parser.add_argument('--output-folder', type=str, required=True)
parser.add_argument('--chunk-size', type=int, default=10000)
args = parser.parse_args()

# Read the labels
labels = np.loadtxt(os.path.join(args.input_folder, LABELS_FILE))

labels_dict: Dict[LabelKey, int] = dict()
for entry in labels:
    key = LabelKey(user_id=int(entry[1]), exp_id=int(entry[0]), begin=int(entry[3]), end=int(entry[4]))
    labels_dict[key] = int(entry[2])

# Create the output data writers
make_dir(args.output_folder)
writers = {
    TRAIN: DataWriter(os.path.join(args.output_folder, TRAIN), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
    VALID: DataWriter(os.path.join(args.output_folder, VALID), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
    TEST: DataWriter(os.path.join(args.output_folder, TEST), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
}

# Initialize counters
counters = {TRAIN: Counter(), VALID: Counter(), TEST: Counter()}

# We load all data files first to prevent redundant loading
print('Loading Input files...')
data_files = load_data_files(labels_dict, args.input_folder)
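# LabelKey is referenced above but not defined in this fragment. It is used both as
# a keyword-constructed record and as a dictionary key, so it is presumably a
# namedtuple along these lines (a sketch, not the project's actual definition):
from collections import namedtuple

LabelKey = namedtuple('LabelKey', ['user_id', 'exp_id', 'begin', 'end'])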