Example #1
def start_download():
    corporas = tools.get_min_max(str(args.corpora), 1)
    tools.printer(-3)
    tools.printer(1, 'Downloader\n')
    corporas = downloader.check_corpora(language_data, corporas, 0)
    downloader.download(current_dir, language_path + 'datasets/', main_db,
                        language_data[0], corporas)
Example #2
def close_db(which=0, leave=False):
    if which in (0, 1):
        language_data[4].close()
    if which in (0, 2):
        main_db[0].close()

    if leave:
        tools.printer(-3)
        exit(1)
Example #3
def checking_clean_corpora():
    if not os.path.isfile(language_path + 'clean_raw'):
        # propagate the result so callers can test it; falling through
        # here would implicitly return None
        return create_clean_corpora()

    clean_size = round(
        os.path.getsize(language_path + 'clean_raw') / 1000000000, 3)
    tools.printer(2, 'found clean corpora')
    tools.printer(0, 'size', str(clean_size) + 'GB')
    return True
Example #4
def multi_cleaner(inputs):
    # inputs: [startrange, endrange, source_path, target_path, process_index]
    if args.max < 1:
        args.max = 1

    index = 0
    startrange = inputs[0]
    endrange = inputs[1]

    failer = 0
    success = 0
    counter = endrange - startrange
    all_sentences = []
    show_index = 100000
    if inputs[4] == 0:
        # only the first worker reports progress; max() avoids modulo by zero
        show_index = max(1, round(counter / 200))

    with open(inputs[2], 'r') as f:
        for text in f:
            # half-open range, so neighbouring workers never clean the same line
            if startrange <= index < endrange:

                if not args.noclean:
                    cleaned = alphabet.sentences_cleaner(
                        prepare, str(text), language_data[0], args.upper,
                        num_activ, args.lower)
                else:
                    cleaned = str(text)
                    if not args.upper:
                        cleaned = cleaned.lower()

                if cleaned is False:
                    failer += 1
                else:
                    success += 1
                    all_sentences.append(str(cleaned))

                if index % show_index == 0 and inputs[4] == 0:
                    tools.printer(
                        0,
                        '[' + str(round((index / counter) * 100, 2)) + '%]',
                    )

                # flush in batches of args.max lines to bound memory use
                if len(all_sentences) > args.max:
                    tools.create_file(all_sentences, inputs[3], 'a')
                    all_sentences = []

            index += 1

    if len(all_sentences) > 0:
        tools.create_file(all_sentences, inputs[3], 'a')
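
For reference, multi_cleaner is normally driven by file_cleaner (Examples #9 and #10). Called by hand, the positional inputs list looks like this; the paths are placeholders, not files from the project:

# inputs = [startrange, endrange, source_path, target_path, process_index]
multi_cleaner([0, 500, 'languages/de/clean_raw', 'languages/de/cleaned', 0])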
Example #5
def error_flag():
    if sec_input == '':
        tools.printer(8, 'no audio path')
        array = [
            tools.get_inputs([['wav_path', 'audio path from db', 1],
                              ['audios_id', 'audio id from db', 5]], '', True,
                             True)
        ]
    else:
        array = [sec_input]
    try:
        # numeric input refers to an audios_id ...
        sql_test = "select audios_id from audios where audios_id=" + str(
            int(array[0]))
        sql = "update audios set errors=9 where audios_id=" + str(
            int(array[0]))
    except ValueError:
        # ... anything else is treated as a wav_path
        sql_test = "select audios_id from audios where wav_path='" + str(
            array[0]) + "'"
        sql = "update audios set errors=9 where wav_path='" + str(
            array[0]) + "'"

    if str(array[0]) == 'unflag':
        tools.printer(22, 'all errors are unflagged', array[0])
        main_db[1].execute('update audios set errors=0')
        main_db[0].commit()
    else:
        main_db[1].execute(sql_test)
        test = main_db[1].fetchall()
        if len(test) > 0:
            tools.printer(22, 'error flagged', array[0])
            main_db[1].execute(sql)
            main_db[0].commit()
        else:
            tools.printer(9, 'not found', array[0])
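
The SQL above is assembled by string concatenation, which breaks as soon as a path contains a quote. A minimal sketch of the same lookup with placeholder binding, assuming main_db is a (connection, cursor) pair from Python's sqlite3 module (the snippets never show the driver, so that is an assumption) and with flag_error as a hypothetical helper name:

def flag_error(main_db, key):
    # hypothetical variant of error_flag(): match by audios_id when the
    # input is numeric, otherwise by wav_path
    conn, cur = main_db
    try:
        column, value = 'audios_id', int(key)
    except ValueError:
        column, value = 'wav_path', str(key)
    # only the value is user input; the column name is one of two literals
    cur.execute("select audios_id from audios where %s=?" % column, (value,))
    if cur.fetchall():
        cur.execute("update audios set errors=9 where %s=?" % column, (value,))
        conn.commit()
        return True
    return False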
Example #6
def model_check():
    new_path = model_path
    new_model = current_model
    tools.printer(11, 'training', new_model)
    if os.path.isdir(new_path):
        tools.printer(8, 'already trainings file for', new_model)
        u_input = tools.get_inputs([['d', 'delete and recreate', 0],
                                    ['c', 'choose another trainings name', 0]],
                                   '', True, True)
        if u_input == 'd':
            tools.delete_all(new_path)
        elif u_input == 'c':
            while True:
                new_model = input("         new name for training\n\n--:")
                # treat 'q'/'Q' as quit before using the input as a model name
                if new_model in ['q', 'Q']:
                    return [False]
                new_path = current_dir + '/languages/' + str(
                    language_data[0]) + '/training/' + new_model + '/'
                if not os.path.isdir(new_path):
                    break
                print()
                tools.printer(8, 'already trainings file for', new_model)
        else:
            # 'q', 'Q' or any other answer aborts
            return [False]

    tools.create_folder(current_dir, new_model, 'create', language_data[0])

    tools.printer(-3, '', '', True, new_path + 'info/training.txt', True,
                  False)
    return [True, new_model, new_path]
Example #7
def create_clean_corpora(append=False):
    tools.printer(99, 'need to download clean corpora')
    corporas = downloader.check_corpora(language_data, '[\'0\']', 1)
    if len(corporas) == 0:
        tools.printer(8, 'no clean corpora', str(language_data[0]))
        tools.printer(0, 'trying to crawl one')
        tools.printer(-3)
        crawler.crawl_corpora(args.process, language_data[0], language_path,
                              10, append)

    else:
        tools.printer(2, 'downloading clean corpora', '')
        downloader.download_clean(corporas[0][0], language_path)
    return True
Example #8
def create_trie():
    clean = checking_clean_corpora()
    if not clean:
        return False

    clean_path = language_path + 'clean_raw'
    clean_model = model_path + "clean"
    file_cleaner(clean_path, clean_model)
    crawler.sort_remove_duplicates(clean_model)
    tools.printer(
        0,
        '\n   -----------------------------------------\n   creating trie and lm.binary\n\n\n',
        '')
    # build the shell command once, print it for the user, then run it
    cmd = ('bash toolbox/make_trie.sh "' + str(deepspeech_dir[0]) + '" "' +
           str(model_path) + '" 3')
    print(cmd)
    os.system(cmd)
    return os.path.isfile(model_path + 'trie') or DEPRECATED_0_7
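
os.system with a hand-quoted command string breaks if model_path ever contains a double quote. A hedged alternative sketch using subprocess.run, which passes the same three script arguments as a list so no shell quoting is needed (run_make_trie is an illustrative helper, not part of the tool):

import subprocess

def run_make_trie(deepspeech_dir, model_path):
    # same call as above: bash toolbox/make_trie.sh <deepspeech_dir> <model_path> 3
    subprocess.run(
        ['bash', 'toolbox/make_trie.sh',
         str(deepspeech_dir), str(model_path), '3'],
        check=True)  # raise CalledProcessError on a non-zero exit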
Example #9
def file_cleaner(path, targetpath):
    # count the lines so the work can be split evenly across the processes
    counter = 0
    all_process = []

    with open(path, 'r') as f:
        for _ in f:
            counter += 1

    cpus = get_process_count()

    # hand each worker a half-open [start, end) slice;
    # the last worker also takes whatever the rounding left over
    steps = round(counter / cpus)
    current = 0
    for p in range(cpus):
        start = current
        current += steps
        end = counter if p + 1 == cpus else current
        all_process.append([start, end, path, targetpath, p])

    tools.printer(2, 'cleaning', path)
    tools.printer(2, '        ', targetpath)
    tools.printer(2, 'processing', cpus)
    with Pool(cpus) as p:
        return p.map(multi_cleaner, all_process)
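
A purely illustrative check, not part of the tool, that the slicing above hands every line index 0..counter-1 to exactly one worker, which is why multi_cleaner treats its range as half-open:

def check_slices(counter, cpus):
    steps = round(counter / cpus)
    current, seen = 0, []
    for p in range(cpus):
        start = current
        current += steps
        end = counter if p + 1 == cpus else current
        # a worker only ever sees indices that exist in the file
        seen.extend(i for i in range(start, end) if i < counter)
    assert sorted(seen) == list(range(counter)), (counter, cpus)

for n, c in [(1003, 4), (35, 10), (2, 8)]:
    check_slices(n, c)   # uneven splits, rounding up and down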
Example #10
def file_cleaner(path, targetpath):
    # count lines via a generator; the with-block closes the file again
    with open(path, 'r', encoding='utf-8') as f:
        counter = sum(1 for _ in f)
    all_process = []

    cpus = get_process_count()

    steps = round(counter / cpus)
    current = 0
    for p in range(cpus):
        start = current
        current += steps
        end = counter if p + 1 == cpus else current
        all_process.append([start, end, path, targetpath, p])

    tools.printer(2, 'cleaning', path)
    tools.printer(2, '        ', targetpath)
    tools.printer(2, 'processing', cpus)
    with Pool(cpus) as p:
        return p.map(multi_cleaner, all_process)
Example #11
def test_sentences():
    if sec_input == '':
        tools.printer(8, 'no test input')
        array = [
            tools.get_inputs([['', 'sentences or path', -1]], '', True, True)
        ]

    if os.path.isfile(sec_input):
        array = tools.get_file(sec_input, True)
    elif sec_input != '':
        array = [sec_input]

    print()
    for arr in array:
        cleaned = alphabet.sentences_cleaner(prepare, str(arr),
                                             language_data[0], args.upper,
                                             num_activ, args.lower)
        if cleaned is False:
            tools.printer(8, arr + '\n')
        else:
            tools.printer(2, arr)
            tools.printer(22, cleaned + '\n')
Example #12
def clean_sentences(all_sentences, deep_create=False, analyze=False):

    # get overall statistics
    if analyze:
        sizes = []
        durations = []
        wordcounts = []
        words_sec = []
        letters = []
        letters_sec = []

    if all_sentences[0] == 0:  # 0 - array of sentences
        set_sentences = all_sentences[1]
    elif all_sentences[0] == 1:  # 1 - sql query
        query = create_query()
        main_db[1].execute(
            "SELECT dataset,wav_path,size,text,duration,wordcount,lettercount,lettersec,wordsec from audios where "
            + str(query))
        set_sentences = main_db[1].fetchall()

    counter = len(set_sentences)

    index = 0
    failer = 0
    success = 0
    cleaned = ''
    cleaned_sentences = []
    cleaned_raw = []
    dropped = []
    datasets = []
    # progress roughly every 5%; max() avoids modulo by zero on tiny inputs
    show_index = max(1, round(counter / 20))
    for set_s in set_sentences:

        if index % show_index == 0:
            tools.printer(0, '[' + str(round(
                (index / counter) * 100, 2)) + '%]',
                          str(index) + '/' + str(counter))
        if deep_create:
            text = set_s[3]
            data_dir = str(set_s[1])
            if not os.path.isfile(data_dir):
                # the audio file is gone: drop this row instead of cleaning it
                failer += 1
                dropped.append(str(text))
                index += 1
                continue
        else:
            text = set_s

        if not args.noclean:
            cleaned = alphabet.sentences_cleaner(prepare, str(text),
                                                 language_data[0], args.upper,
                                                 num_activ, args.lower)
        else:
            cleaned = str(text).replace('\n', ' ')
            if not args.upper:
                cleaned = cleaned.lower()

        if cleaned is False:
            failer += 1
            dropped.append(str(text))
        else:
            success += 1
            if analyze:
                if set_s[0] not in datasets:
                    datasets.append(set_s[0])
                sizes.append(set_s[2])
                durations.append(set_s[4])
                wordcounts.append(set_s[5])
                letters.append(set_s[6])
                letters_sec.append(set_s[7])
                words_sec.append(set_s[8])

            if deep_create:
                cleaned_sentences.append(data_dir + ',' + str(set_s[2]) + ',' +
                                         cleaned)
                cleaned_raw.append(str(cleaned))
            else:
                cleaned_sentences.append(str(cleaned))

        index += 1

    print()

    # exit if no sentences
    if deep_create and len(cleaned_sentences) == 0:
        tools.printer(9, 'no sentences found or all failed', '')
        tools.printer(88, 'try other parameters', '')
        tools.printer(99, '"python3 deepspeech_cleaner.py --help" for help', '')
        close_db(2, True)

    if analyze:
        tools.printer(11, 'info:', '', True, model_path + 'info/training.txt')
        tools.printer(2, 'corpora', '-'.join(datasets), True,
                      model_path + 'info/training.txt')
        tools.get_size(sizes, 'size', model_path + "info/training.txt")
        all_duration = tools.get_size(durations, 'duration',
                                      model_path + "info/training.txt")
        all_words = tools.get_size(wordcounts, 'words',
                                   model_path + "info/training.txt")
        all_letters = tools.get_size(letters, 'letters',
                                     model_path + "info/training.txt")
        tools.printer(2, 'words per sec', round(all_words / all_duration, 2),
                      True, model_path + 'info/training.txt')
        tools.printer(2, 'letters per sec', round(all_letters / all_duration,
                                                  2), True,
                      model_path + 'info/training.txt')

    return [cleaned_sentences, cleaned_raw, dropped]
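
For orientation, the two input modes as the other snippets use them; [1] is what create_train_files passes, while the [0, ...] form follows from the branch at the top of the function:

# mode 1: pull the rows from the audios table via create_query()
cleaned_csv, cleaned_raw, dropped = clean_sentences([1], True, True)

# mode 0: clean an in-memory list of sentences
cleaned_csv, cleaned_raw, dropped = clean_sentences([0, ['first line', 'second line']])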
Example #13
def create_train_files():
    tools.printer(22, 'wordlength', args.wordlength, True,
                  model_path + 'info/training.txt')
    tools.printer(22, 'numbers', args.numbers, True,
                  model_path + 'info/training.txt')
    tools.printer(22, 'upper', args.upper, True,
                  model_path + 'info/training.txt')
    tools.printer(22, 'lower', args.lower, True,
                  model_path + 'info/training.txt')
    tools.printer(-3)
    # get sentences from db and clean them
    newlines = clean_sentences([1], True, True)
    tri_sentences = newlines[1]
    dropped = newlines[2]
    newlines = newlines[0]
    maxlines = len(newlines)

    # 15% of the rows for test, the same amount for dev, the rest for train
    test = int(round(maxlines * 0.15))
    random.shuffle(newlines)
    test_files = ['wav_filename,wav_filesize,transcript']
    train_files = ['wav_filename,wav_filesize,transcript']
    dev_files = ['wav_filename,wav_filesize,transcript']
    for line in newlines:
        if len(test_files) <= test:
            test_files.append(line)
        elif len(dev_files) <= test:
            dev_files.append(line)
        else:
            train_files.append(line)

    tools.printer(2, 'all files', str(len(newlines)), True,
                  model_path + 'info/training.txt')
    tools.printer(2, 'train files', str(len(train_files)), True,
                  model_path + 'info/training.txt')
    tools.printer(2, 'test files', str(len(test_files)), True,
                  model_path + 'info/training.txt')
    tools.printer(2, 'dev files', str(len(dev_files)), True,
                  model_path + 'info/training.txt')
    # add all sentences
    tools.create_file(tri_sentences, model_path + "clean")
    tools.create_file(tri_sentences, model_path + "info/cleaned_sentences")
    tools.create_file(dropped, model_path + "info/dropped_sentences")
    tools.create_file(newlines, model_path + "all.csv")
    tools.create_file(dev_files, model_path + "dev.csv")
    tools.create_file(train_files, model_path + "train.csv")
    tools.create_file(test_files, model_path + "test.csv")

    if not args.notrie:
        if not create_trie():
            tools.printer(9, 'couldn\'t create lm.binary and trie',
                          language_data[0])
            tools.printer(8, 'use -nt to skip lm.binary /trie creation',
                          language_data[0])
            return False

    # array with arguments for start_train.sh
    raw_conf = "earlystop_nsteps,train_batch_size,dev_batch_size,test_batch_size,n_hidden,learning_rate,dropout_rate,display_step,epoch,validation_step,decoder_library_path,batch_size,n_steps,summary_secs,dropout_rate2,dropout_rate3,dropout_rate4,dropout_rate5,dropout_rate6,relu_clip,early_stop,estop_mean_thresh,estop_std_thresh,beam_width,lm_weight,beta1,beta2,epsilon,valid_word_count_weight,limit_train,limit_dev,limit_test,export_batch_size,use_seq_length,log_level,max_to_keep "
    splits = raw_conf.split(',')

    # get argument values from db
    main_db[1].execute("select " + str(raw_conf) +
                       " from configs where name='" + str('default') + "'")
    index = 0
    raw_conf = main_db[1].fetchall()
    config = []

    # set the right paths for the generated shell script
    varis = ('\n\n\n'
             'lm_trie_path="' + model_path + 'trie" \n'
             'lm_binary_path="' + model_path + 'lm.binary" \n'
             'checkpoint_dir="' + model_path + 'checkpoints" \n'
             'export_dir="' + model_path + 'model_export"\n'
             'alphabet="' + model_path + 'alphabet.txt"\n'
             'test="' + model_path + 'test.csv"\n'
             'dev="' + model_path + 'dev.csv"\n'
             'train="' + model_path + 'train.csv"')

    cmd = '\n\n\ncd "' + str(
        deepspeech_dir[0]
    ) + '"\n\npython3 DeepSpeech.py --lm_trie_path "$lm_trie_path" --lm_binary_path "$lm_binary_path" --checkpoint_dir "$checkpoint_dir" --export_dir "$export_dir" --alphabet_config_path "$alphabet" --train_files "$train" --dev_files "$dev" --test_files "$test"'

    # determine value types and add them to the script variables
    letter_re = re.compile(r'[a-zA-Z]+')
    scinot_re = re.compile(r'1e-')
    for conf in raw_conf[0]:
        value = str(conf).strip()

        conf = str(splits[index]).strip()
        config.append([conf, value])
        if letter_re.search(value):
            # booleans and scientific notation stay unquoted
            if value in ('True', 'False') or scinot_re.search(value):
                varis += '\n' + conf + '=' + value
                cmd += ' --' + conf + ' $' + conf
            else:
                varis += '\n' + conf + '="' + value + '"'
                cmd += ' --' + conf + ' "$' + conf + '"'
        else:
            varis += '\n' + conf + '=' + value
            cmd += ' --' + conf + ' $' + conf
        index += 1

    varis += cmd
    tools.printer(-4, varis, '', True, model_path + 'start_train.sh', True,
                  False)

    tools.printer(-3)
    if args.notrie:
        tools.printer(8, 'path for your own trie/lm.binary', model_path)
        tools.printer(0, "otherwise the training script won't work\n")
    tools.printer(2, 'start training with')
    tools.printer(-1, '          ↓        ')
    tools.printer(-1, 'bash "' + model_path + 'start_train.sh"')

    return True
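
The split above is deterministic for a given maxlines: 15% of the rows go to test, the same count to dev, and the remainder to train, with one CSV header line on top of each file. A quick check of the arithmetic:

maxlines = 10000
test = int(round(maxlines * 0.15))   # 1500 rows each for test and dev
train = maxlines - 2 * test          # 7000 rows left for train
assert (test, train) == (1500, 7000)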
Example #14
        sec_input = ''
    elif len(args.mode) > 1:
        sec_input = ' '.join(args.mode[1:])

    if mode != 'help':
        current_dir = tools.check_dir()
        language_data = alphabet.get_default_lang(args.lang, True)
        main_db = alphabet.get_db(language_data)
        deepspeech_dir = tools.check_deepspeech(language_data)
        if deepspeech_dir[0] is False:
            mode = '_'
        else:

            tools.printer(-3, '', '')
            tools.printer(1, 'Language', language_data[2])
            tools.printer(0, '', language_data[0])
            language_path = current_dir + '/languages/' + str(
                language_data[0]) + '/'
            # check if num2words got language support
            num_activ = alphabet.check_num2words(language_data[0])
            prepare = get_prepare()

    if mode == 'create':

        tools.printer(-3)
        tools.printer(1, 'Creator\n')
        sql_minmax = [['size', []], ['duration', []], ['bitrate', []],
                      ['samplerate', []], ['channels', []], ['wordcount', []],
                      ['wordsec', []], ['lettercount', []], ['lettersec', []],