Example 1
def filter_datasets_by_stt(data_paths: List[str], metadata_path: str,
                           stt_path: str, save_path: str) -> None:
    """This function takes in a list of datasets paths `data_paths`, combines them, and
    filters out the examples where the transcript does not equal the transcript from Google's
    speech-to-text API saved at `stt_path`. The filtered dataset (which is a filtered superset
    of all of the datasets in `data_paths`) is written to `save_path`. 

    Args:
        data_paths: list of dataset paths to combine and filter into the output
        metadata_path: path to speak metadata tsv file
        stt_path: path to speech-to-text saved output from `stt_on_datasets` function
        save_path: path where filtered examples will be saved
    """
    data_dict = combine_sort_datasets(data_paths)
    metadata = get_record_ids_map(metadata_path, has_url=True)
    stt_data = read_data_json(stt_path)
    filtered_data = list()

    count = {"total": 0, "filtered": 0}
    for datum in stt_data:
        audio_id = path_to_id(datum['audio_path'])
        spk_trans = process_text(metadata[audio_id]['target_sentence'])
        ggl_trans = process_text(datum['transcript'])
        count['total'] += 1

        if spk_trans == ggl_trans:
            count['filtered'] += 1
            filtered_data.append(data_dict[datum['audio_path']])

    write_data_json(filtered_data, save_path)
    print(f"number of total de-duplicated examples: {count['total']}")
    print(f"number of filtered examples: {count['filtered']}")
Example 2
    def singleprocess_download(self, docs_list: list):
        """
        Downloads the audio file associated with each record and writes the transcript and various metadata to a tsv file.
        Args:
            docs_list - List[firebase-document]: a list of document references
        """

        AUDIO_EXT = ".m4a"
        TXT_EXT = ".txt"
        audio_dir = os.path.join(self.output_dir, "audio")

        with open(self.data_label_path, 'a') as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter='\t')
            # keep only the documents where `target` == `guess`
            for doc in docs_list:

                doc_dict = doc.to_dict()
                original_target = doc_dict['info']['target']

                # some of the guesses don't include apostrophes,
                # so the filter criterion does not use apostrophes
                target = process_text(doc_dict['info']['target'])
                target_no_apostrophe = target.replace("'", "")

                guess = process_text(doc_dict['result']['guess'])
                guess_no_apostrophe = guess.replace("'", "")

                if target_no_apostrophe == guess_no_apostrophe:
                    # save the audio file from the link in the document
                    audio_url = doc_dict['result']['audioDownloadUrl']
                    audio_save_path = os.path.join(audio_dir,
                                                   doc_dict['id'] + AUDIO_EXT)
                    try:
                        urllib.request.urlretrieve(audio_url,
                                                   filename=audio_save_path)
                    except (ValueError, urllib.error.URLError) as e:
                        print(
                            f"unable to download url: {audio_url} due to exception: {e}"
                        )
                        continue

                    # save the target in a tsv row
                    # tsv header: "id", "text", "lessonId", "lineId", "uid", "score", "date"
                    tsv_row = [
                        doc_dict['id'], original_target,
                        doc_dict['info']['lessonId'],
                        doc_dict['info']['lineId'], doc_dict['user']['uid'],
                        doc_dict['result']['score'], doc_dict['info']['date']
                    ]
                    tsv_writer.writerow(tsv_row)
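The apostrophe-insensitive comparison above reappears in later examples. A small standalone sketch of that criterion, assuming `process_text` handles the remaining normalization (e.g. lower-casing and punctuation), might look like:

def target_matches_guess(target: str, guess: str) -> bool:
    # Compare the processed target and guess with apostrophes stripped,
    # since some guesses omit them (assumes `process_text` does the rest
    # of the normalization).
    target_clean = process_text(target).replace("'", "")
    guess_clean = process_text(guess).replace("'", "")
    return target_clean == guess_clean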
Example 3
    def multiprocess_download(self, doc: dict, audio_dir: str,
                              metadata_path: str, audio_ext: str):
        """
        Takes in a single document, records its metadata, and downloads its audio if the
        recording meets the criterion. Used by a multiprocessing pool.

        Args:
            doc (dict): dict of the record to be processed. A dict is passed as it needs to be pickled.
            audio_dir (str): directory where audio is saved
            metadata_path (str): path where metadata tsv file will be saved
            audio_ext (str): extension of the saved audio file
        Returns:
            None - write to file
        """
        with open(metadata_path, 'a') as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter='\t')
            # process the target and guess text to see if they are equal;
            # some of the guesses don't include apostrophes, so processing removes apostrophes
            target = process_text(doc['info']['target'], remove_apost=True)
            guess = process_text(doc['result']['guess'], remove_apost=True)

            if target == guess:

                # if True, save the audio file from the link in the document
                if self.download_audio:
                    audio_url = doc['result']['audioDownloadUrl']
                    audio_save_path = os.path.join(audio_dir,
                                                   doc['id'] + audio_ext)
                    try:
                        urllib.request.urlretrieve(audio_url,
                                                   filename=audio_save_path)
                    except (ValueError, URLError) as e:
                        print(
                            f"unable to download url: {audio_url} due to exception: {e}"
                        )

                # save the target and metadata in a tsv row
                # tsv header: "id", "target", "lessonId", "lineId", "uid", "redWords Score", "date"
                tsv_row = [
                    doc['id'], doc['info']['target'], doc['info']['lessonId'],
                    doc['info']['lineId'], doc['user']['uid'],
                    doc['result']['score'], doc['info']['date']
                ]
                # if not downloading the audio file, add the url to the metadata
                if not self.download_audio:
                    tsv_row.append(doc['result']['audioDownloadUrl'])
                tsv_writer.writerow(tsv_row)
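The docstring notes that this method is used by a multiprocessing pool. A sketch of what such a driver might look like is below; the pool size, the `functools.partial` binding, and the `run_multiprocess_download` name are assumptions rather than part of the source, and concurrent appends to the same tsv file can interleave rows unless writes are serialized.

import functools
import multiprocessing

def run_multiprocess_download(downloader, docs, audio_dir, metadata_path, audio_ext=".m4a"):
    # Bind the fixed arguments so the pool only has to pass each document dict.
    worker = functools.partial(
        downloader.multiprocess_download,
        audio_dir=audio_dir,
        metadata_path=metadata_path,
        audio_ext=audio_ext,
    )
    with multiprocessing.Pool(processes=8) as pool:
        pool.map(worker, docs)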
Example 4
    def download_dataset(self):
        """
        This method doesn't download an audio dataset like the other Downloader classes do.
        It writes the metadata of a set of recordings to a tsv file.
        """

        PROJECT_ID = 'speak-v2-2a1f1'
        QUERY_LIMIT = 2000  # max size of query
        SAMPLES_PER_QUERY = 200  # number of random samples downloaded per query

        # verify and set the credentials
        CREDENTIAL_PATH = "/home/dzubke/awni_speech/speak-v2-2a1f1-d8fc553a3437.json"
        assert os.path.exists(
            CREDENTIAL_PATH
        ), "Credential file does not exist or is in the wrong location."
        # set the environment variable that `firebase_admin.credentials` will use
        os.putenv("GOOGLE_APPLICATION_CREDENTIALS", CREDENTIAL_PATH)

        # initialize the credentials and firebase db client
        cred = credentials.ApplicationDefault()
        firebase_admin.initialize_app(cred, {'projectId': PROJECT_ID})
        db = firestore.client()
        # select the recordings collection
        rec_ref = db.collection(u'recordings')

        data_label_path = os.path.join(self.output_dir,
                                       "speak-test_metadata_2020-12-09.tsv")

        with open(data_label_path, 'w', newline='\n') as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter='\t')
            header = [
                "id", "target", "guess", "lessonId", "lineId", "uid",
                "redWords_score", "date"
            ]
            tsv_writer.writerow(header)

            # take the record_ids in batches of 10 because
            # the firestore `in` operator can accept a list of at most 10 elements
            for idx_10 in range(0, len(self.record_ids), 10):

                batch_10_ids = self.record_ids[idx_10:idx_10 + 10]

                next_query = rec_ref.where(u'id', u'in', batch_10_ids)

                for doc in next_query.stream():
                    doc = doc.to_dict()

                    target = process_text(doc['info']['target'])

                    tsv_writer.writerow([
                        doc['id'], doc['info']['target'],
                        doc['result']['guess'], doc['info']['lessonId'],
                        doc['info']['lineId'], doc['user']['uid'],
                        doc['result']['score'], doc['info']['date']
                    ])
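The 10-id batching above exists because Firestore's `in` operator accepts at most 10 values per query. That slicing can be pulled out into a small generator; a sketch (the helper name is an assumption):

from typing import Iterator, List

def batch_record_ids(record_ids: List[str], batch_size: int = 10) -> Iterator[List[str]]:
    # Firestore's `in` operator accepts at most 10 values per query,
    # so yield the record ids in slices of `batch_size`.
    for start in range(0, len(record_ids), batch_size):
        yield record_ids[start:start + batch_size]

The loop above could then iterate with `for batch_10_ids in batch_record_ids(self.record_ids):`.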
Example 5
def stt_on_sample(data_path: str,
                  metadata_path: str,
                  save_path: str,
                  stt_provider: str = 'ibm') -> None:
    """Pulls a random sample of audio files from `data_path` and calls a speech-to-text API to 
    get transcript predictions. The STT output is formated and written to `save_path` along 
    with the files's transcript from `metadata_path`. 

    Args:
        data_path: path to training json 
        metadata_path: path to metadata tsv containing transcript
        save_path: path where output txt will be saved
        stt_provider: name of company providing STT model
    """
    random.seed(0)
    SAMPLE_SIZE = 100

    data = read_data_json(data_path)
    data_sample = random.choices(data, k=SAMPLE_SIZE)  # note: random.choices samples with replacement
    print(f"sampling {len(data_sample)} samples from {data_path}")

    # mapping from audio_id to transcript
    metadata = get_record_ids_map(metadata_path, has_url=True)

    client = get_stt_client(stt_provider)

    preds_with_two_trans = set()
    match_trans_entries = list()  # output list for matching transcripts
    diff_trans_entries = list()  # output list for non-matching transcripts
    for datum in data_sample:
        audio_path = datum['audio']
        audio_id = path_to_id(audio_path)
        id_plus_dir = os.path.join(*audio_path.split('/')[-2:])

        response = get_stt_response(audio_path, client, stt_provider)
        resp_dict = format_response_dict(audio_path, response, stt_provider)

        ggl_trans = process_text(resp_dict['transcript'])
        apl_trans = process_text(metadata[audio_id]['target_sentence'])
        out_txt = format_txt_from_dict(resp_dict, apl_trans, id_plus_dir)

        if apl_trans == ggl_trans:
            match_trans_entries.append(out_txt)
        else:
            diff_trans_entries.append(out_txt)

    with open(save_path, 'w') as fid:
        for entries in [diff_trans_entries, match_trans_entries]:
            fid.write("-" * 10 + '\n')
            for entry in entries:
                fid.write(entry + '\n\n')
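A minimal call might look like the sketch below; the paths are hypothetical placeholders and the provider string must be one that `get_stt_client` recognizes.

# Hypothetical usage sketch; paths are placeholders.
stt_on_sample(
    data_path="data/speak_train.json",
    metadata_path="data/speak_metadata.tsv",
    save_path="data/stt_sample_output.txt",
    stt_provider="ibm",  # the default; other providers depend on get_stt_client
)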
Example 6
def assess_speak_train(dataset_paths: List[str],
                       metadata_path: str,
                       out_dir: str,
                       use_json: bool = True) -> None:
    """This function creates counts of the speaker, lesson, and line ids in a speak training dataset
    Args:
        dataset_path (str): path to speak training.json dataset
        metadata_path (str): path to tsv file that contains speaker, line, and lesson ids 
        out_dir (str): directory where plots and txt files will be saved
        use_json (bool): if true, the data will be read from a training.json file
    Returns:
        None
    """


    def _increment_key(in_dict, key): 
        in_dict[key] = in_dict.get(key, 0) + 1


    # this will read the data from a metadata.tsv file
    if not use_json:
        # count dictionaries for the lessons, lines, and users (speakers)
        lesson_dict, line_dict, user_dict, target_dict = {}, {}, {}, {}
        # create count_dicts for each
        with open(metadata_path, 'r') as tsv_file: 
            tsv_reader = csv.reader(tsv_file, delimiter='\t')
            header = next(tsv_reader) 
            print(header) 
            for row in tsv_reader: 
                _increment_key(lesson_dict, row[2]) 
                _increment_key(line_dict, row[3]) 
                _increment_key(user_dict, row[4]) 
                _increment_key(target_dict, process_text(row[1])) 

        # collect the labels and count dicts used by the plotting loop below
        constraint_names = ['lesson', 'line', 'speaker', 'target_sent']
        counter = {
            "lesson": lesson_dict, 
            "line": line_dict, 
            "speaker": user_dict,
            "target_sent": target_dict
        }

    # reading from a training.json file supported by a metadata.tsv file
    if use_json:
        # create mapping from record_id to speaker, line, and lesson ids
        rec_ids_map = dict()
        constraint_names = ['lesson', 'line', 'speaker', 'target_sent']
        counter = {name: dict() for name in constraint_names}
        with open(metadata_path, 'r') as tsv_file: 
            tsv_reader = csv.reader(tsv_file, delimiter='\t')
            # header: id, text, lessonId, lineId, uid(speaker_id), score, date
            header = next(tsv_reader)
            for row in tsv_reader:
                rec_ids_map[row[0]]= {
                        constraint_names[0]: row[2],   # lesson
                        constraint_names[1]: row[3],    # line
                        constraint_names[2]: row[4],    # speaker
                        constraint_names[3]: process_text(row[1]),  # target-sentence
                        "date": row[6]                  # date
                }

        total_date_counter = dict()
        # `unq_date_counter` keeps track of the unique ids seen per month
        unq_date_counter = {name: dict() for name in constraint_names}
        # iterate through the datasets
        for dataset_path in dataset_paths:
            dataset = read_data_json(dataset_path)
            print(f"dataset {path_to_id(dataset_path)} size is: {len(dataset)}")

            # iterate through the examples in the dataset
            for xmpl in dataset:
                rec_id = path_to_id(xmpl['audio'])
                date =  rec_ids_map[rec_id]['date']
                # the date has the format 2020-09-10T04:24:03.073Z, so splitting on '-'
                # and joining the first two elements yields `2020-09`
                yyyy_mm_date = '-'.join(date.split('-')[:2])
                _increment_key(total_date_counter, yyyy_mm_date)

                # iterate through the constraints and update the id counters
                for name in constraint_names:
                    constraint_id = rec_ids_map[rec_id][name]
                    _increment_key(counter[name], constraint_id)
                    update_unq_date_counter(
                        unq_date_counter, 
                        name, 
                        constraint_id,
                        yyyy_mm_date
                    )

                
    # create the plots
    fig, axs = plt.subplots(1,len(constraint_names))
    fig.set_size_inches(8, 6)

    # plot and calculate stats of the count_dicts
    for ax, name in zip(axs, constraint_names):
        plot_count(ax, counter[name], name)
        print(f"{name} stats")
        print_stats(counter[name])
        print()
    
    # ensure the `out_dir` directory exists
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, os.path.basename(out_dir))
    print("out_path: ", out_path)
    plt.savefig(out_path + "_count_plot.png")
    plt.close()

    # plot the total_date histogram
    fig, ax = plt.subplots(1,1)
    dates = sorted(total_date_counter.keys())
    date_counts = [total_date_counter[date] for date in dates]
    ax.plot(range(len(date_counts)), date_counts)
    plt.xticks(range(len(date_counts)), dates, rotation=60)
    #ax.set_title(label)
    #ax.set_xlabel(f"unique {label}")
    #ax.set_ylabel(f"utterance per {label}")
    #ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values));
    ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    plt.tight_layout()

    plt.savefig(out_path + "_date_count.png")
    plt.close()

    # plot the unique ids
    for name in constraint_names:
        fig, ax = plt.subplots(1,1)
        date_counts = []
        dates = sorted(unq_date_counter[name].keys())
        total_count = sum([unq_date_counter[name][date]['count'] for date in dates])
        cumulative_count = 0
        for date in dates:
            cumulative_count += unq_date_counter[name][date]['count'] 
            date_counts.append(round(cumulative_count/total_count, 2))
        
        ax.plot(range(len(date_counts)), date_counts)
        plt.xticks(range(len(date_counts)), dates, rotation=60)
        ax.set_title(name)
        ax.set_xlabel(f"Date")
        ax.set_ylabel(f"% of total unique ID's")
        #ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values));
        #ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values));
        plt.tight_layout()

        plt.savefig(out_path + f"_unq_cum_date_{name}.png")
        plt.close()


    # sort the ids for each constraint and write them to a txt file
    for name in counter:
        sorted_ids = sorted(list(counter[name].keys()))

        with open(f"{out_path}_{name}.txt", 'w') as fid:
            for ids in sorted_ids:
                fid.write(ids+"\n")
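This function depends on `update_unq_date_counter`, which is not shown here. One plausible implementation, consistent with how the counter is consumed by the cumulative plots above (each constraint name maps a month string to a `count` of ids first seen in that month), is sketched below; this is an assumption, not the original helper.

def update_unq_date_counter(counter: dict, name: str, constraint_id: str, date: str) -> None:
    # Possible implementation (assumption): for each constraint `name`, keep a
    # per-month entry holding the set of ids first seen in that month and a
    # `count` field matching what the cumulative-percentage plots read.
    month = counter[name].setdefault(date, {"ids": set(), "count": 0})
    already_seen = any(constraint_id in entry["ids"] for entry in counter[name].values())
    if not already_seen:
        month["ids"].add(constraint_id)
        month["count"] += 1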
Example 7
    def download_dataset(self):
        """
        This method loops through the firestore document database using paginated queries based on
        the document id. It filters out documents where `target != guess` if `self.target_eq_guess` is True
        and saves the audio file and target text into separate files. 
        """

        PROJECT_ID = 'speak-v2-2a1f1'
        QUERY_LIMIT = 2000  # max size of query
        SAMPLES_PER_QUERY = 200  # number of random samples downloaded per query
        AUDIO_EXT = '.m4a'  # extension of downloaded audio
        audio_dir = os.path.join(self.output_dir, "audio")
        os.makedirs(audio_dir, exist_ok=True)

        # verify and set the credentials
        CREDENTIAL_PATH = "/home/dzubke/awni_speech/speak-v2-2a1f1-d8fc553a3437.json"
        assert os.path.exists(
            CREDENTIAL_PATH
        ), "Credential file does not exist or is in the wrong location."
        # set the environment variable that `firebase_admin.credentials` will use
        os.putenv("GOOGLE_APPLICATION_CREDENTIALS", CREDENTIAL_PATH)

        # initialize the credentials and firebase db client
        cred = credentials.ApplicationDefault()
        firebase_admin.initialize_app(cred, {'projectId': PROJECT_ID})
        db = firestore.client()

        # create the data-label path and initialize the tsv headers
        date = datetime.date.today().isoformat()
        self.data_label_path = os.path.join(self.output_dir,
                                            "eval2-v4_data_" + date + ".tsv")
        self.metadata_path = os.path.join(
            self.output_dir, "eval2-v4_metadata_" + date + ".json")

        # re-calculate the constraints in the `config` as integer counts based on `self.num_examples`
        self.constraints = {
            name: int(self.constraints[name] * self.num_examples)
            for name in self.constraints.keys()
        }
        # constraint_names will help to ensure the dict keys created later are consistent.
        constraint_names = list(self.constraints.keys())
        print("constraints: ", self.constraints)

        # id_counter keeps track of the counts for each speaker, lesson, and line ids
        id_counter = {name: dict() for name in constraint_names}

        # create a mapping from record_id to lesson, line, and speaker ids
        disjoint_ids_map = get_record_ids_map(metadata_path, constraint_names)

        # create a dict of sets of all the ids in the disjoint datasets that will not
        # be included in the filtered dataset
        disjoint_id_sets = {name: set() for name in self.disjoint_id_names}
        for disj_dataset_path in self.disjoint_datasets:
            disj_dataset = read_data_json(disj_dataset_path)
            # extracts the record_ids from the excluded datasets
            record_ids = [
                path_to_id(example['audio']) for example in disj_dataset
            ]
            # loop through each record id
            for record_id in record_ids:
                # loop through each id_name and update the disjoint_id_sets
                for disjoint_id_name, disjoint_id_set in disjoint_id_sets.items(
                ):
                    disjoint_id_set.add(
                        disjoint_ids_map[record_id][disjoint_id_name])

        # create a date range from `self.days_from_today` in the correct format
        now = datetime.datetime.utcnow()
        day_delta = datetime.timedelta(days=self.days_from_today)
        day_range = now - day_delta
        day_range = day_range.isoformat("T") + "Z"

        with open(self.data_label_path, 'w', newline='\n') as tsv_file:
            tsv_writer = csv.writer(tsv_file, delimiter='\t')
            header = [
                "id", "target", "guess", "lessonId", "target_sentence",
                "lineId", "uid", "redWords_score", "date"
            ]
            tsv_writer.writerow(header)

            # create the first query based on the constant QUERY_LIMIT
            rec_ref = db.collection(u'recordings')

            # this is the final record_id that was downloaded from the speak training set
            speak_train_last_id = 'SR9TIlF8bSWApZa1tqEBIHOQs5z1-1583920255'

            next_query = rec_ref\
                .order_by(u'id')\
                .start_after({u'id': speak_train_last_id})\
                .limit(QUERY_LIMIT)

            # loop through the queries until the example_count is at least the num_examples
            example_count = 0
            # get the ids from the training and test sets to ensure the downloaded set is disjoint
            train_test_set = self.get_train_test_ids()

            while example_count < self.num_examples:
                print(f"another loop with {example_count} examples written")
                # convert the generator to a list to retrieve the last doc_id
                docs = list(
                    map(lambda x: self._doc_trim_to_dict(x),
                        next_query.stream()))

                try:
                    # this id will be used to start the next query
                    last_id = docs[-1]['id']
                # if the docs list is empty, there are no new documents
                # and an IndexError will be raised and break the while loop
                except IndexError:
                    print("Exiting while loop")
                    break

                # selects a random sample of `SAMPLES_PER_QUERY` from the total queries
                #docs = random.sample(docs, SAMPLES_PER_QUERY)

                for doc in docs:
                    # if num_examples is reached, break
                    if example_count >= self.num_examples:
                        break

                    target = process_text(doc['info']['target'])

                    # check that the speaker, target-sentence, and record_id are disjoint
                    if doc['user']['uid'] not in disjoint_id_sets['speaker']\
                    and target not in disjoint_id_sets['target_sentence']\
                    and doc['id'] not in train_test_set:
                        # set `self.target_eq_guess` to True in `__init__` if you want
                        # to filter by `target` == `guess`
                        if self.target_eq_guess:
                            # process the guess; apostrophes are removed from both the
                            # target and guess for comparison, since some guesses omit them
                            guess = process_text(doc['result']['guess'])
                            target_no_apostrophe = target.replace("'", "")
                            guess_no_apostrophe = guess.replace("'", "")
                            # if targ != guess, skip the record
                            if target_no_apostrophe != guess_no_apostrophe:
                                continue

                        # if `self.check_constraints` is True, constraints on the downloaded records are checked
                        if self.check_constraints:
                            # create a mapping to feed into `check_update_constraints`
                            record_ids_map = {
                                doc['id']: {
                                    'lesson': doc['info']['lessonId'],
                                    'target_sentence':
                                    target,  # using processed target as id 
                                    'speaker': doc['user']['uid']
                                }
                            }
                            pass_constraint = check_update_contraints(
                                doc['id'], record_ids_map, id_counter,
                                self.constraints)
                            # if the record doesn't pass the constraints, continue to the next record
                            if not pass_constraint:
                                continue

                        # save the audio file from the link in the document
                        audio_url = doc['result']['audioDownloadUrl']
                        audio_path = os.path.join(audio_dir,
                                                  doc['id'] + AUDIO_EXT)

                        # convert the downloaded file to .wav format.
                        # usually this conversion is done in the preprocessing step,
                        # but some eval sets don't need PER labels, so converting here
                        # removes the need to preprocess the eval set.
                        base, raw_ext = os.path.splitext(audio_path)
                        # build the output path with a `.wav` extension
                        wav_path = base + os.path.extsep + "wav"
                        # if the wave file doesn't exist, convert to wav
                        if not os.path.exists(wav_path):
                            try:
                                to_wave(audio_path, wav_path)
                            except subprocess.CalledProcessError:
                                # if the file can't be converted, skip the file by continuing
                                logging.info(
                                    f"Process Error converting file: {audio_path}"
                                )
                                continue

                        # save the target and metadata in a tsv row
                        # tsv header: "id", "target", "guess", "lessonId", "target_sentence", "lineId", "uid", "redWords_score", "date"
                        tsv_row = [
                            doc['id'],
                            doc['info']['target'],
                            doc['result']['guess'],
                            doc['info']['lessonId'],
                            target,  # processed target, written as the target_sentence column
                            doc['info']['lineId'],
                            doc['user']['uid'],
                            doc['result']['score'],
                            doc['info']['date']
                        ]
                        tsv_writer.writerow(tsv_row)
                        # save all the metadata in a separate file
                        #with open(self.metadata_path, 'a') as jsonfile:
                        #    json.dump(doc, jsonfile)
                        #    jsonfile.write("\n")

                        example_count += 1

                # create the next query starting after the last_id
                next_query = (rec_ref
                    .order_by(u'id')
                    .start_after({u'id': last_id})
                    .limit(QUERY_LIMIT)
                )
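The core of this download loop is Firestore pagination: order by `id`, start after the last document of the previous page, and cap each page at `QUERY_LIMIT`. That pattern can be isolated into a generator; a sketch, assuming the same `firestore` client and a plain `to_dict()` conversion in place of `_doc_trim_to_dict`:

def paginate_recordings(db, start_after_id: str, page_size: int = 2000):
    # Yield pages of recording documents (as dicts) ordered by `id`,
    # starting after `start_after_id`; stop when a page comes back empty.
    rec_ref = db.collection(u'recordings')
    last_id = start_after_id
    while True:
        query = (rec_ref
                 .order_by(u'id')
                 .start_after({u'id': last_id})
                 .limit(page_size))
        docs = [doc.to_dict() for doc in query.stream()]
        if not docs:
            break
        yield docs
        last_id = docs[-1]['id']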