def convert_full_set(path, pattern, new_ext="wav", **kwargs):
    pattern = os.path.join(path, pattern)
    audio_files = glob.glob(pattern)
    for af in tqdm.tqdm(audio_files):
        base, ext = os.path.splitext(af)
        wav = base + os.path.extsep + new_ext
        convert.to_wave(af, wav, **kwargs)
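A typical call, assuming the snippet's `convert.to_wave` helper is importable and using a placeholder corpus root and glob pattern, might look like:

# hypothetical usage: convert every .sph file under a corpus root to .wav
convert_full_set("/data/corpus", "*/*/*.sph", new_ext="wav")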
Example #2
def main(label_csv: str, audio_dir: str, json_path: str) -> None:
    """
    Reads the label_csv and writes the labels, audio_path, and duration
    to the path in json_path. 
    """

    with open(json_path, 'w') as fid:
        with open(label_csv, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            
            for line in reader:
                assert len(line) == 2, f"row with {line[0]} does not have exactly 2 elements"
                src_ext = "m4a"
                src_filename = line[0] + os.path.extsep + src_ext
                src_audio_path = os.path.join(audio_dir, src_filename)

                dst_ext = "wav"
                dst_filename = line[0] + os.path.extsep + dst_ext
                dst_audio_path = os.path.join(audio_dir, dst_filename)

                to_wave(src_audio_path, dst_audio_path)

                labels = line[1]
                labels = process_labels(labels)
                
                duration = wav_duration(dst_audio_path)

                datum = {'text': labels,
                         'duration': duration,
                         'audio': dst_audio_path}
                
                json.dump(datum, fid)
                fid.write("\n")
Example #3
    def extract_samples(self, save_dir: str):
        """
        Extracts the wav files from the directories and copies them into the noise_dir.
        The audio files in the "SCAFE_48k" data subset are sampled at 48 kHz and should be
        converted to 16 kHz; the if-statement in the for-loop performs this conversion.

        Assumptions:
            - The directory structure of the zip files will not change
        """
        pattern = "*/*.wav"
        high_res_audio = {"SCAFE"}
        all_wav_paths = glob.glob(os.path.join(save_dir, pattern))

        print("Extracting and removing sample files...")
        for wav_path in tqdm.tqdm(all_wav_paths):
            filename = os.path.basename(wav_path)
            dirname = os.path.basename(os.path.dirname(wav_path))
            dst_filename = "{}_{}".format(dirname, filename)
            dst_wav_path = os.path.join(self.feed_model_dir, dst_filename)
            if os.path.exists(dst_wav_path):
                print(f"{dst_wav_path} exists. Skipping...")
                continue
            # if the wavs are high resolution, down-convert to 16 kHz
            if dirname in high_res_audio:
                to_wave(wav_path, dst_wav_path)
            # if not high-res, just copy
            else:
                copyfile(wav_path, dst_wav_path)
Example #4
def convert_full_set(path, pattern, new_ext="wav", **kwargs):
    """Function from Awni's original codebase that is used to convert TIMIT
    """
    pattern = os.path.join(path, pattern)
    audio_files = glob.glob(pattern)
    for af in tqdm.tqdm(audio_files):
        base, ext = os.path.splitext(af)
        wav = base + os.path.extsep + new_ext
        convert.to_wave(af, wav, **kwargs)
Example #5
def convert_full_set(path, pattern, new_ext="wav", **kwargs):
    """
    To convert from other audio formats to wav
    """
    pattern = os.path.join(path, pattern)
    audio_files = glob.glob(pattern)
    for af in tqdm.tqdm(audio_files):
        # split off the extension, e.g. /ff/ff/ff.wav is split into '/ff/ff/ff' and '.wav'
        base, ext = os.path.splitext(af)
        # os.path.extsep is the extension separator, i.e. '.'
        wav = base + os.path.extsep + new_ext
        convert.to_wave(af, wav, **kwargs)
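`to_wave` itself is not shown in these examples; since the caller in Example #7 below catches `subprocess.CalledProcessError`, a plausible sketch is a thin wrapper around a command-line converter such as ffmpeg (this is an assumption, not the original helper):

import subprocess

def to_wave(src_path, dst_path, sample_rate=16000, channels=1):
    """Hypothetical sketch: convert src_path to a mono 16 kHz wav via ffmpeg."""
    cmd = ["ffmpeg", "-y", "-i", src_path,
           "-ar", str(sample_rate), "-ac", str(channels), dst_path]
    # check=True raises subprocess.CalledProcessError on a non-zero exit code
    subprocess.run(cmd, check=True)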
Example #6
def resample(audio_dir: str, target_samp_rate: int) -> None:
    """
    Resamples all of the audio files in audio_dir to the target sample rate.
    Arguments
        audio_dir (str): the audio directory whose files will be resampled
        target_samp_rate (int): the sample rate the files will be resampled to
    """

    assert os.path.exists(audio_dir), "audio directory does not exist"
    out_dir = os.path.join(audio_dir, "resampled")
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    extensions = ["*.wav", "*.mp3", "*.aiff", "*.flac"]
    audio_files = list()
    for ext in extensions:
        pattern = os.path.join(audio_dir, ext)
        audio_files.extend(glob.glob(pattern))
    
    for audio_fn in audio_files: 
        filename = os.path.splitext(os.path.basename(audio_fn))[0]
        wav_file = filename + os.path.extsep + "wav"
        out_path = os.path.join(out_dir, wav_file)
        convert.to_wave(audio_fn, out_path)
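Assuming `convert` is the same helper module used in the earlier examples, a call might look like the line below; note that, as written, `target_samp_rate` is only documented and the output rate is whatever `convert.to_wave` defaults to:

# "/data/noise_clips" is a placeholder directory, not from the original code
resample("/data/noise_clips", 16000)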
Example #7
    def download_dataset(self):
        """
        This method loops through the firestore document database using paginated queries based on
        the document id. It filters out documents where `target != guess` if `self.target_eq_guess` is True
        and saves the audio file and target text into separate files. 
        """

        PROJECT_ID = 'speak-v2-2a1f1'
        QUERY_LIMIT = 2000  # max size of query
        SAMPLES_PER_QUERY = 200  # number of random samples downloaded per query
        AUDIO_EXT = '.m4a'  # extension of downloaded audio
        audio_dir = os.path.join(self.output_dir, "audio")
        os.makedirs(audio_dir, exist_ok=True)

        # verify and set the credentials
        CREDENTIAL_PATH = "/home/dzubke/awni_speech/speak-v2-2a1f1-d8fc553a3437.json"
        assert os.path.exists(
            CREDENTIAL_PATH
        ), "Credential file does not exist or is in the wrong location."
        # set the environment variable that `firebase_admin.credentials` will use
        # note: use os.environ (not os.putenv) so the change is visible to this process
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIAL_PATH

        # initialize the credentials and firebase db client
        cred = credentials.ApplicationDefault()
        firebase_admin.initialize_app(cred, {'projectId': PROJECT_ID})
        db = firestore.client()

        # create the data-label path and initialize the tsv headers
        date = datetime.date.today().isoformat()
        self.data_label_path = os.path.join(self.output_dir,
                                            "eval2-v4_data_" + date + ".tsv")
        self.metadata_path = os.path.join(
            self.output_dir, "eval2-v4_metadata_" + date + ".json")

        # re-calculate the constraints in the `config` as integer counts based on the `dataset_size`
        self.constraints = {
            name: int(self.constraints[name] * self.num_examples)
            for name in self.constraints.keys()
        }
        # constraint_names will help to ensure the dict keys created later are consistent.
        constraint_names = list(self.constraints.keys())
        print("constraints: ", self.constraints)

        # id_counter keeps track of the counts for each speaker, lesson, and line ids
        id_counter = {name: dict() for name in constraint_names}

        # create a mapping from record_id to lesson, line, and speaker ids
        disjoint_ids_map = get_record_ids_map(metadata_path, constraint_names)

        # create a dict of sets of all the ids in the disjoint datasets that will not
        # be included in the filtered dataset
        disjoint_id_sets = {name: set() for name in self.disjoint_id_names}
        for disj_dataset_path in self.disjoint_datasets:
            disj_dataset = read_data_json(disj_dataset_path)
            # extracts the record_ids from the excluded datasets
            record_ids = [
                path_to_id(example['audio']) for example in disj_dataset
            ]
            # loop through each record id
            for record_id in record_ids:
                # loop through each id_name and update the disjoint_id_sets
                for disjoint_id_name, disjoint_id_set in disjoint_id_sets.items(
                ):
                    disjoint_id_set.add(
                        disjoint_ids_map[record_id][disjoint_id_name])

        # creating a date range from `self.days_from_today` in the correct format
        now = datetime.datetime.utcnow()
        day_delta = datetime.timedelta(days=self.days_from_today)
        day_range = now - day_delta
        day_range = day_range.isoformat("T") + "Z"

        with open(self.data_label_path, 'w', newline='\n') as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter='\t')
            header = [
                "id", "target", "guess", "lessonId", "target_sentence",
                "lineId", "uid", "redWords_score", "date"
            ]
            tsv_writer.writerow(header)

            # create the first query based on the constant QUERY_LIMIT
            rec_ref = db.collection(u'recordings')

            # this is the final record_id that was downloaded from the speak training set
            speak_train_last_id = 'SR9TIlF8bSWApZa1tqEBIHOQs5z1-1583920255'

            next_query = rec_ref\
                .order_by(u'id')\
                .start_after({u'id': speak_train_last_id})\
                .limit(QUERY_LIMIT)

            # loop through the queries until the example_count is at least the num_examples
            example_count = 0
            # get the ids from the training and test sets to ensure the downloaded set is disjoint
            train_test_set = self.get_train_test_ids()

            while example_count < self.num_examples:
                print(f"another loop with {example_count} examples written")
                # convert the generator to a list to retrieve the last doc_id
                docs = list(
                    map(lambda x: self._doc_trim_to_dict(x),
                        next_query.stream()))

                try:
                    # this id will be used to start the next query
                    last_id = docs[-1]['id']
                # if the docs list is empty, there are no new documents
                # and an IndexError will be raised and break the while loop
                except IndexError:
                    print("Exiting while loop")
                    break

                # selects a random sample of `SAMPLES_PER_QUERY` from the total queries
                #docs = random.sample(docs, SAMPLES_PER_QUERY)

                for doc in docs:
                    # if num_examples is reached, break
                    if example_count >= self.num_examples:
                        break

                    target = process_text(doc['info']['target'])

                    # check that the speaker, target-sentence, and record_id are disjoint
                    if doc['user']['uid'] not in disjoint_id_sets['speaker']\
                    and target not in disjoint_id_sets['target_sentence']\
                    and doc['id'] not in train_test_set:
                        # set `self.target_eq_guess` to True in `__init__` if you want
                        # to filter by `target` == `guess`
                        if self.target_eq_guess:
                            # process the target and guess and remove apostrophes for comparison
                            guess = process_text(doc['result']['guess'])
                            # removing apostrophes for comparison
                            target_no_apostrophe = target.replace("'", "")
                            guess_no_apostrophe = guess.replace("'", "")
                            # if targ != guess, skip the record
                            if target_no_apostrophe != guess_no_apostrophe:
                                continue

                        # if `True`, constraints on the downloaded records will be checked
                        if self.check_constraints:
                            # create a mapping to feed into `check_update_constraints`
                            record_ids_map = {
                                doc['id']: {
                                    'lesson': doc['info']['lessonId'],
                                    'target_sentence':
                                    target,  # using processed target as id 
                                    'speaker': doc['user']['uid']
                                }
                            }
                            pass_constraint = check_update_contraints(
                                doc['id'], record_ids_map, id_counter,
                                self.constraints)
                            # if the record doesn't pass the constraints, continue to the next record
                            if not pass_constraint:
                                continue

                        # save the audio file from the link in the document
                        audio_url = doc['result']['audioDownloadUrl']
                        audio_path = os.path.join(audio_dir,
                                                  doc['id'] + AUDIO_EXT)

                        # convert the downloaded file to .wav format
                        # usually, this conversion is done in the preprocessing step,
                        # but some eval sets don't need PER labels, so this removes the need
                        # to preprocess the eval set.
                        base, raw_ext = os.path.splitext(audio_path)
                        # replace the original extension with `.wav`
                        wav_path = base + os.path.extsep + "wav"
                        # if the wave file doesn't exist, convert to wav
                        if not os.path.exists(wav_path):
                            try:
                                to_wave(audio_path, wav_path)
                            except subprocess.CalledProcessError:
                                # if the file can't be converted, skip the file by continuing
                                logging.info(
                                    f"Process Error converting file: {audio_path}"
                                )
                                continue

                        # save the target in a tsv row
                        # tsv header: "id", "target", "guess", "lessonId", "target_sentence", "lineId", "uid", "redWords_score", "date"
                        tsv_row = [
                            doc['id'],
                            doc['info']['target'],
                            doc['result']['guess'],
                            doc['info']['lessonId'],
                            target,  # using this to replace lineId
                            doc['info']['lineId'],
                            doc['user']['uid'],
                            doc['result']['score'],
                            doc['info']['date']
                        ]
                        tsv_writer.writerow(tsv_row)
                        # save all the metadata in a separate file
                        #with open(self.metadata_path, 'a') as jsonfile:
                        #    json.dump(doc, jsonfile)
                        #    jsonfile.write("\n")

                        example_count += 1

                # create the next query starting after the last_id
                next_query = (rec_ref
                    .order_by(u'id')
                    .start_after({u'id': last_id})
                    .limit(QUERY_LIMIT)
                )
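Stripped of the filtering and bookkeeping, the paginated-query pattern used above reduces to the sketch below; the collection name and page size are placeholders, and `doc.to_dict()` stands in for the class's `_doc_trim_to_dict` helper:

def iterate_collection(db, collection_name="recordings", page_size=2000):
    """Sketch: yield Firestore documents in id order, one page at a time."""
    coll_ref = db.collection(collection_name)
    query = coll_ref.order_by(u'id').limit(page_size)
    while True:
        docs = list(query.stream())
        if not docs:
            # an empty page means the collection is exhausted
            break
        for doc in docs:
            yield doc.to_dict()
        # start the next page after the last id of the current page
        last_id = docs[-1].to_dict()['id']
        query = coll_ref.order_by(u'id').start_after({u'id': last_id}).limit(page_size)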