Example #1
    def run(self, walk_filename, representation_size, window_size,
            model_filename):
        '''
        Training
        ----------------------------------------------
        Parameters:
            walk_filename: file in which the random-walk paths are stored
                type: str
            representation_size: size of the representation space
                type: int
            window_size: window size
                type: int
            model_filename: file in which the trained model is stored
                type: str
        '''
        num_walks = self.random_walk.num_paths * len(self.graph)
        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * self.random_walk.path_length
        print("Data size (walks*length): {}".format(data_size))

        print("Walking....")
        walks = self.random_walk.generate(self.graph)
        DataWriter.write_walks(walk_filename, walks)

        walks_corpus = DataReader.load_walks(walk_filename)
        model = Skipgram(sentences=walks_corpus,
                         size=representation_size,
                         window=window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=self.workers)
        # model.wv.save_word2vec_format(walk_filename[:8], )
        model.save(model_filename)
        print('Terminal.')
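
Once training finishes, the saved embeddings can be reloaded and queried. A minimal sketch, assuming Skipgram subclasses gensim's Word2Vec (the sentences/size/window/min_count/workers arguments suggest this) and that the walks store node IDs as strings; the path and node ID below are illustrative:

from gensim.models import Word2Vec

# Reload the model saved above (path is illustrative) and inspect one node embedding.
model = Word2Vec.load('deepwalk.model')
vector = model.wv['0']                           # vector of length representation_size for node '0'
neighbours = model.wv.most_similar('0', topn=5)  # nearest nodes in embedding space
print(vector.shape, neighbours)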
Example #2
def write_dataset(dataset: List[Dict[str, Any]], partitions: List[str],
                  fractions: List[float], output_folder: str):
    # Initialize writers and counters
    writers: Dict[str, DataWriter] = dict()
    label_counters: Dict[str, Counter] = dict()
    for partition in partitions:
        writer = DataWriter(os.path.join(output_folder, partition),
                            chunk_size=5000,
                            file_prefix='data',
                            file_suffix='jsonl.gz')
        writers[partition] = writer

        label_counters[partition] = Counter()

    # Write all samples
    for sample in dataset:
        partition = get_partition(partitions, fractions)
        writers[partition].add(sample)
        label_counters[partition][sample[OUTPUT]] += 1

    # Close all writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
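
get_partition is not shown in this snippet; a hypothetical sketch of such a helper, drawing a partition name at random with probabilities given by fractions (the real implementation may differ):

import random
from typing import List

def get_partition(partitions: List[str], fractions: List[float]) -> str:
    # Hypothetical helper: pick a partition with probability equal to its fraction.
    threshold = random.random()
    cumulative = 0.0
    for partition, fraction in zip(partitions, fractions):
        cumulative += fraction
        if threshold < cumulative:
            return partition
    return partitions[-1]  # guard against rounding when the fractions sum to 1.0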
Example #3
def tokenize_dataset(input_folder: str, output_folder: str, chunk_size: int):
    make_dir(output_folder)
    data_writers = {
        TRAIN:
        DataWriter(os.path.join(output_folder, TRAIN),
                   file_prefix='data',
                   file_suffix='jsonl.gz',
                   chunk_size=chunk_size),
        VALID:
        DataWriter(os.path.join(output_folder, VALID),
                   file_prefix='data',
                   file_suffix='jsonl.gz',
                   chunk_size=chunk_size),
        TEST:
        DataWriter(os.path.join(output_folder, TEST),
                   file_prefix='data',
                   file_suffix='jsonl.gz',
                   chunk_size=chunk_size)
    }

    partition_counters = {TRAIN: Counter(), VALID: Counter(), TEST: Counter()}

    for i, (sample, partition) in enumerate(data_generator(input_folder)):
        data_writers[partition].add(sample)
        partition_counters[partition][sample[OUTPUT]] += 1

        if (i + 1) % chunk_size == 0:
            print('Wrote {0} samples.'.format(i + 1), end='\r')
    print()

    for writer in data_writers.values():
        writer.close()

    print(partition_counters)
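
The three writers above differ only in their partition constant, so the same mapping can be built more compactly; a sketch reusing the classes and constants from this example:

# Equivalent, more compact construction of the writers and counters above.
data_writers = {
    partition: DataWriter(os.path.join(output_folder, partition),
                          file_prefix='data',
                          file_suffix='jsonl.gz',
                          chunk_size=chunk_size)
    for partition in (TRAIN, VALID, TEST)
}
partition_counters = {partition: Counter() for partition in (TRAIN, VALID, TEST)}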
Example #4
def create_dataset(output_folder: str, num_samples: int, seq_length: int):
    with DataWriter(output_folder,
                    file_prefix='data',
                    file_suffix='jsonl.gz',
                    chunk_size=5000) as writer:

        for sample_id in range(num_samples):
            # Randomly generate input points
            xs = np.sort(
                np.random.uniform(low=MIN_VALUE,
                                  high=MAX_VALUE,
                                  size=(seq_length, )))

            label = 1 if np.random.uniform(low=0.0, high=1.0) < 0.5 else 0

            if label == 1:  # Cubic
                inputs = np.power(xs, 3)
            else:  # Linear
                inputs = xs

            sample = {
                INPUTS: np.expand_dims(inputs, axis=-1).astype(float).tolist(),
                OUTPUT: label,
                SAMPLE_ID: sample_id
            }
            writer.add(sample)
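
DataWriter is used as a context manager here, so the final partial chunk is presumably flushed when the with-block exits. To sanity-check the output, the chunks can be read back; a minimal sketch assuming the writer names its chunks data*.jsonl.gz and emits gzip-compressed JSON lines (assumptions based on the file_prefix/file_suffix arguments, not confirmed by the snippet):

import glob
import gzip
import json
import os

def iterate_samples(output_folder: str):
    # Hypothetical read-back helper: yield every sample written above.
    for path in sorted(glob.glob(os.path.join(output_folder, 'data*.jsonl.gz'))):
        with gzip.open(path, 'rt', encoding='utf-8') as fh:
            for line in fh:
                yield json.loads(line)

num_written = sum(1 for _ in iterate_samples('synthetic'))  # folder name is illustrative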
Example #5
def write_dataset(input_folder: str, output_folder: str, series: str):
    input_path = os.path.join(input_folder, 'RightWhaleCalls_{0}.arff'.format(series.upper()))

    if series == TRAIN:
        writers = {
            TRAIN: DataWriter(os.path.join(output_folder, TRAIN), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE),
            VALID: DataWriter(os.path.join(output_folder, VALID), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE)
        }

        label_counters = {
            TRAIN: Counter(),
            VALID: Counter()
        }
    else:
        writers = {
            TEST: DataWriter(os.path.join(output_folder, TEST), file_prefix='data', file_suffix='jsonl.gz', chunk_size=CHUNK_SIZE)
        }

        label_counters = {
            TEST: Counter()
        }

    for index, sample in enumerate(data_generator(input_path)):
        if series == TRAIN:
            if random.random() < TRAIN_FRAC:
                partition = TRAIN
            else:
                partition = VALID
        else:
            partition = TEST

        writers[partition].add(sample)
        label_counters[partition][sample[OUTPUT]] += 1

        if (index + 1) % CHUNK_SIZE == 0:
            print('Completed {0} samples'.format(index + 1), end='\r')
    print()

    # Close all writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
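
Because the TRAIN/VALID assignment relies on random.random(), repeated runs of the TRAIN series produce different splits; seeding the module-level generator before the call makes the partition reproducible:

import random

random.seed(42)  # illustrative seed; set once before write_dataset for a repeatable split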
Example #6
def merge_datasets(folders: List[str], output_folder: str, file_prefix: str,
                   file_suffix: str, chunk_size: int):
    with DataWriter(output_folder,
                    file_prefix=file_prefix,
                    file_suffix=file_suffix,
                    chunk_size=chunk_size) as writer:

        data_files = chain(*(iterate_files(folder, pattern=f'.*{file_suffix}')
                             for folder in folders))

        sample_id = 0
        for data_file in data_files:
            for sample in read_by_file_suffix(data_file):
                sample[SAMPLE_ID] = sample_id
                writer.add(sample)
                sample_id += 1

                if sample_id % chunk_size == 0:
                    print('Completed {0} samples.'.format(sample_id),
                          end='\r')
        print()
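
Each merged sample gets a fresh SAMPLE_ID so identifiers remain unique across the combined output. A possible invocation, with illustrative folder names:

# Combine two previously written datasets into a single chunked folder.
merge_datasets(folders=['dataset_a', 'dataset_b'],
               output_folder='merged',
               file_prefix='data',
               file_suffix='jsonl.gz',
               chunk_size=5000)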
Example #7
def tokenize_dataset(input_file: str, output_folder: str, window_size: int,
                     noise: float, reps: int, num_features: Optional[int]):

    with DataWriter(output_folder,
                    file_prefix='data',
                    chunk_size=10000,
                    file_suffix='jsonl.gz') as writer:

        label_counter: Counter = Counter()

        for index, sample in enumerate(
                dataset_iterator(input_file,
                                 window_size,
                                 noise=noise,
                                 reps=reps,
                                 num_features=num_features)):
            label_counter[sample[OUTPUT]] += 1
            writer.add(sample)

            if (index + 1) % 500 == 0:
                print('Completed {0} samples.'.format(index + 1), end='\r')

    print()
    print(label_counter)
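
An illustrative invocation; every argument value below is made up, only the signature comes from the snippet:

tokenize_dataset(input_file='sensor_readings.csv',
                 output_folder='tokenized',
                 window_size=20,
                 noise=0.01,
                 reps=2,
                 num_features=None)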
Example #8
def write_dataset(data: np.ndarray, output_folder: str, series: str):
    # Create the data writers
    if series == TRAIN:
        writers = {
            TRAIN:
            DataWriter(os.path.join(output_folder, TRAIN),
                       file_prefix='data',
                       chunk_size=CHUNK_SIZE,
                       file_suffix='jsonl.gz'),
            VALID:
            DataWriter(os.path.join(output_folder, VALID),
                       file_prefix='data',
                       chunk_size=CHUNK_SIZE,
                       file_suffix='jsonl.gz')
        }

        label_counters = {TRAIN: Counter(), VALID: Counter()}
    else:
        writers = {
            TEST:
            DataWriter(os.path.join(output_folder, TEST),
                       file_prefix='data',
                       chunk_size=CHUNK_SIZE,
                       file_suffix='jsonl.gz')
        }

        label_counters = {TEST: Counter()}

    sample_id = 0
    for index, features in enumerate(data):
        label = int(features[0])
        input_features = features[1:].reshape(-1, 1).astype(float).tolist()

        # Get the data partition
        if series == TRAIN:
            if random.random() < TRAIN_FRAC:
                partition = TRAIN
            else:
                partition = VALID
        else:
            partition = TEST

        # Create the sample and add to corresponding data writer
        for i in range(0, len(input_features) - WINDOW_SIZE + 1, STRIDE):
            sample = {
                SAMPLE_ID: sample_id,
                OUTPUT: label,
                INPUTS: input_features[i:i + WINDOW_SIZE],
            }

            writers[partition].add(sample)
            label_counters[partition][label] += 1
            sample_id += 1

        if (index + 1) % CHUNK_SIZE == 0:
            print('Completed {0} samples.'.format(index + 1), end='\r')

    print()

    # Close all data writers
    for writer in writers.values():
        writer.close()

    print(label_counters)
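
The inner loop turns each input row into overlapping windows, so one row contributes floor((len(input_features) - WINDOW_SIZE) / STRIDE) + 1 samples. A small check with illustrative constants:

# Illustrative numbers only; the real WINDOW_SIZE / STRIDE are defined elsewhere.
num_features, window_size, stride = 187, 50, 25
num_windows = (num_features - window_size) // stride + 1
print(num_windows)  # 6 windows per row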
Example #9
def split_dataset(input_folder: str, output_folder: str,
                  fractions: List[float], file_prefix: str, chunk_size: int,
                  file_type: str):
    assert len(fractions) == len(PARTITIONS), \
        'Must provide enough fractions to account for all partitions'
    assert file_type in FILE_TYPES, f'Invalid file type: {file_type}'

    # Make output folder if necessary
    make_dir(output_folder)

    # Create the data manager
    data_manager = get_data_manager(input_folder,
                                    SAMPLE_ID,
                                    DATA_FIELDS,
                                    extension=file_type)
    data_manager.load()
    data_iterator = data_manager.iterate(should_shuffle=False,
                                         batch_size=chunk_size)
    num_samples = data_manager.length

    # Get folders for each partition
    train_folder = os.path.join(output_folder, TRAIN)
    valid_folder = os.path.join(output_folder, VALID)
    test_folder = os.path.join(output_folder, TEST)

    # Track counts per partition
    partition_counters: Counter = Counter()

    # Create data writers
    if file_type == 'npz':
        partition_writers = {
            TRAIN:
            NpzDataWriter(train_folder,
                          file_prefix=file_prefix,
                          file_suffix=file_type,
                          chunk_size=chunk_size,
                          sample_id_name=SAMPLE_ID,
                          data_fields=DATA_FIELDS,
                          mode='w'),
            VALID:
            NpzDataWriter(valid_folder,
                          file_prefix=file_prefix,
                          file_suffix=file_type,
                          chunk_size=chunk_size,
                          sample_id_name=SAMPLE_ID,
                          data_fields=DATA_FIELDS,
                          mode='w'),
            TEST:
            NpzDataWriter(test_folder,
                          file_prefix=file_prefix,
                          file_suffix=file_type,
                          chunk_size=chunk_size,
                          sample_id_name=SAMPLE_ID,
                          data_fields=DATA_FIELDS,
                          mode='w')
        }
    else:
        partition_writers = {
            TRAIN:
            DataWriter(train_folder,
                       file_prefix=file_prefix,
                       file_suffix=file_type,
                       chunk_size=chunk_size,
                       mode='w'),
            VALID:
            DataWriter(valid_folder,
                       file_prefix=file_prefix,
                       file_suffix=file_type,
                       chunk_size=chunk_size,
                       mode='w'),
            TEST:
            DataWriter(test_folder,
                       file_prefix=file_prefix,
                       file_suffix=file_type,
                       chunk_size=chunk_size,
                       mode='w')
        }

    # Write to chunked files
    for index, sample in enumerate(data_iterator):
        partition_index = get_partition_index(sample, fractions)
        partition_folder = PARTITIONS[partition_index]

        partition_writers[partition_folder].add(sample)
        partition_counters[partition_folder] += 1

        if (index + 1) % chunk_size == 0:
            print(f'Completed {index + 1}/{num_samples} samples.', end='\r')
    print()

    # Flush any remaining data samples
    for writer in partition_writers.values():
        writer.flush()

    # Print out metrics and save metadata
    print('====== RESULTS ======')
    total = sum(partition_counters.values())
    metadata: Dict[str, Dict[str, float]] = dict()
    for series in PARTITIONS:
        count = partition_counters[series]
        frac = count / total
        metadata[series] = dict(count=count, frac=frac)

        print(f'{series.capitalize()}: {count} ({frac:.03f})')

    metadata_file = os.path.join(output_folder, 'metadata.json')
    save_by_file_suffix(metadata, metadata_file)
    parser.add_argument('--output-folder', type=str, required=True)
    parser.add_argument('--chunk-size', type=int, default=10000)
    args = parser.parse_args()

    # Read the labels
    labels = np.loadtxt(os.path.join(args.input_folder, LABELS_FILE))

    labels_dict: Dict[LabelKey, int] = dict()
    for entry in labels:
        key = LabelKey(user_id=int(entry[1]), exp_id=int(entry[0]), begin=int(entry[3]), end=int(entry[4]))
        labels_dict[key] = int(entry[2])

    # Create the output data writers
    make_dir(args.output_folder)
    writers = {
        TRAIN: DataWriter(os.path.join(args.output_folder, TRAIN), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
        VALID: DataWriter(os.path.join(args.output_folder, VALID), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
        TEST: DataWriter(os.path.join(args.output_folder, TEST), file_prefix='data', file_suffix='jsonl.gz', chunk_size=args.chunk_size),
    }

    # Initialize counters
    counters = {
        TRAIN: Counter(),
        VALID: Counter(),
        TEST: Counter()
    }

    # We load all data files first to prevent redundant loading
    print('Loading Input files...')
    data_files = load_data_files(labels_dict, args.input_folder)