Exemple #1
0
    def __init__(self, params):
        """Index the mixture and source .wav files found under ``params['path']``.

        Files are grouped by the name of the directory that contains them.
        Each group holds one ``'mix'`` path, one ``'vocl'`` path and a list of
        ``'inst'`` paths. ``self.number_if_samples`` counts every file seen in
        both the ``Mixtures`` and ``Sources`` sub-directories.

        Args:
            params: mapping that must provide the dataset root under ``'path'``.
        """
        super().__init__()

        def _blank_entry():
            # A fresh record per group so the 'inst' list is never shared.
            return {'mix': None, 'vocl': None, 'inst': []}

        self.samples = defaultdict(_blank_entry)
        root = params['path']
        self.number_if_samples = 0

        mixtures_dir = os.path.join(root, 'Mixtures')
        for wav in iterate_files(mixtures_dir, '.wav'):
            group = os.path.basename(os.path.dirname(wav))
            self.samples[group]['mix'] = wav
            self.number_if_samples += 1

        sources_dir = os.path.join(root, 'Sources')
        for wav in iterate_files(sources_dir, '.wav'):
            group = os.path.basename(os.path.dirname(wav))
            entry = self.samples[group]
            if wav.endswith('vocals.wav'):
                entry['vocl'] = wav
                role = 'vocl'
            else:
                # Every non-vocal source counts as an instrument stem.
                entry['inst'].append(wav)
                role = 'inst'
            self.add_frames_num(role, wav)
            self.number_if_samples += 1

        # Freeze into a plain dict so later lookups of unknown keys raise
        # instead of silently creating empty entries.
        self.samples = dict(self.samples)
Exemple #2
0
def pretrain():
    """Pre-train the self-supervised model on the files under ``../wikipedia/AA``.

    The model is compiled once and then fitted on each input file in turn,
    with masked inputs produced by ``get_masked``. Metrics are reported through
    a wandb run, and a checkpoint is written after every file as well as once
    at the end.

    Returns:
        The trained model.
    """
    units = 400
    batch_size = 128
    maxlen = 80

    model = self_supervized_model(units)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=8e-5),
        metrics='accuracy',
    )

    run = wandb.init(
        project="dotter",
        group="pretrain",
        tags=[],
        config={
            'batch_size': batch_size,
            'maxlen': maxlen,
            'units': units,
            'model': model,
        },
    )

    wandb_callback = wandb.keras.WandbCallback(
        log_batch_frequency=50,
        save_model=False,
        log_weights=False,
    )

    with run:
        for corpus_file in utils.iterate_files(["../wikipedia/AA"]):
            # Last path component, used to name the per-file checkpoint.
            stem = corpus_file.rsplit('/', 1)[-1]
            raw_y = load_plaintext(corpus_file, maxlen)
            utils.shuffle_in_unison(raw_y)
            x, y = get_masked(raw_y, 0.3)
            model.fit(
                x,
                y,
                batch_size=batch_size,
                validation_split=0.1,
                callbacks=[wandb_callback],
            )
            # NOTE(review): a '.h5' suffix is combined with save_format='tf';
            # confirm which on-disk format is actually intended.
            model.save(f'{pretrain_path}/{stem}.h5', save_format='tf')

    model.save(model_name, save_format='tf')
    return model
Exemple #3
0
def diacritize_all(sysname):
    """Run ``diacritize`` with ``sysname`` on every file found under ``basepath``.

    Each result is written to the path obtained by replacing ``'expected'``
    with ``sysname`` in the input path; missing parent directories are
    created on the way.

    Args:
        sysname: system name passed to ``diacritize`` and substituted into the
            output path.
    """
    # Trailing blanks plus a carriage return keep the progress line in place
    # and wipe leftovers from a longer, previously printed name.
    progress_tail = ' ' * 30 + '\r'

    for source_path in utils.iterate_files([basepath]):
        print(source_path, end=progress_tail, flush=True)

        result = diacritize(sysname, source_path)

        target_path = source_path.replace('expected', sysname)
        target_dir = Path(target_path).parent
        target_dir.mkdir(parents=True, exist_ok=True)

        with open(target_path, 'w', encoding='utf8') as sink:
            sink.write(result)
Exemple #4
0
    def __init__(self, params):
        """Index the .wav files under ``params['path']`` by containing directory.

        Within each directory a file ending in ``mix.wav`` is stored as the
        mixture, one ending in ``source-01.wav`` as the instrumental, and any
        other .wav as the vocal track. Frame counts are registered for the
        instrumental and vocal files only.

        Args:
            params: mapping that must provide the dataset root under ``'path'``.
        """
        super().__init__()
        self.samples = defaultdict(
            lambda: dict.fromkeys(('mix', 'vocl', 'inst')))
        root = params['path']

        for wav in iterate_files(root, '.wav'):
            group = os.path.basename(os.path.dirname(wav))
            entry = self.samples[group]
            if wav.endswith('mix.wav'):
                # Mixtures are recorded but not counted towards frame totals.
                entry['mix'] = wav
                continue
            role = 'inst' if wav.endswith('source-01.wav') else 'vocl'
            entry[role] = wav
            self.add_frames_num(role, wav)

        # Convert to a plain dict so unknown keys no longer auto-create entries.
        self.samples = dict(self.samples)
Exemple #5
0
    def __init__(self, params):
        """Index the .wav files under ``params['path']/separation``.

        Files are grouped by the lower-cased name of their grandparent
        directory. Within a group each file is classified by its lower-cased
        base name: vocal if ``self._is_vocal_name`` accepts it, mixture if the
        name contains ``'mix'``, instrumental otherwise. Frame counts are
        registered for vocal and instrumental files only.

        Args:
            params: mapping that must provide the dataset root under ``'path'``.
        """
        super().__init__()

        def _blank_entry():
            return {'mix': [], 'vocl': [], 'inst': []}

        self.samples = defaultdict(_blank_entry)
        root = params['path']
        # Initialised here; not updated anywhere in this constructor.
        self.net_vocals = 0
        self.net_insts = 0

        separation_dir = os.path.join(root, 'separation')
        for wav in iterate_files(separation_dir, '.wav'):
            group_dir = os.path.dirname(os.path.dirname(wav))
            group = os.path.basename(group_dir).lower()
            leaf = os.path.basename(wav).lower()

            # The vocal check takes precedence over the 'mix' substring check.
            if self._is_vocal_name(leaf):
                role = 'vocl'
            elif 'mix' in leaf:
                role = 'mix'
            else:
                role = 'inst'

            self.samples[group][role].append(wav)
            if role != 'mix':
                self.add_frames_num(role, wav)

        self.samples = dict(self.samples)
Exemple #6
0
 def __init__(self, params):
     """Split the .wav files under ``params['path']`` into vocal and instrumental.

     A file is vocal when its sidecar ``.txt`` (same name, extension swapped)
     contains a line that is exactly ``voi`` after stripping whitespace. When
     no sidecar exists, the file is vocal if its path contains the bracketed
     tag ``[voi]``. Everything else is instrumental.

     Args:
         params: mapping that must provide the dataset root under ``'path'``.
     """
     super().__init__()
     path = params['path']
     self.vocl = []
     self.inst = []
     # Raw string: '\[' is not a valid escape in a normal string literal.
     reg = re.compile(r'\[(.*?)\]')
     for a in iterate_files(path, '.wav'):
         txt_filename = a[:-4] + '.txt'
         # Keep the try body to the open/read only, so a FileNotFoundError
         # raised elsewhere is never mistaken for a missing sidecar file.
         try:
             with open(txt_filename) as txt_file:
                 is_vocal = 'voi' in map(str.strip, txt_file)
         except FileNotFoundError:
             # No sidecar: fall back to bracketed tags embedded in the path.
             is_vocal = 'voi' in reg.findall(a)

         if is_vocal:
             self.vocl.append(a)
             self.add_frames_num('vocl', a)
         else:
             self.inst.append(a)
             self.add_frames_num('inst', a)
Exemple #7
0
def collect_tokens(paths: Iterable[str]):
    """Tokenize the concatenated contents of every file found under ``paths``.

    Args:
        paths: locations handed to ``utils.iterate_files``.

    Returns:
        Whatever ``tokenize`` returns for the chained per-file streams.
    """
    filenames = utils.iterate_files(paths)
    # Files are opened lazily, one at a time, as the chain is consumed.
    per_file_streams = map(iterate_file, filenames)
    return tokenize(itertools.chain.from_iterable(per_file_streams))
Exemple #8
0
def read_corpora(base_paths):
    """Load every file found under ``base_paths`` fully into memory.

    Args:
        base_paths: locations handed to ``utils.iterate_files``.

    Returns:
        A list of ``(filename, items)`` pairs, where ``items`` is the list of
        everything ``hebrew.iterate_file`` yields for that file.
    """
    corpora = []
    for filename in utils.iterate_files(base_paths):
        items = list(hebrew.iterate_file(filename))
        corpora.append((filename, items))
    return corpora