def __init__(
        self,
        spect_file,
        spect_label_file,
        snippet_outdir,
        window_size=256,
        logfile=None,
    ):
        '''
        Load one spectrogram together with its label file,
        then hand both to chop_spectrogram() so that the
        snippets end up in snippet_outdir.

        @param spect_file: The spectrogram file we want to chop;
            loaded via np.load()
        @type spect_file: str
        @param spect_label_file: The spectrogram label file with
            0/1 binary labels. If it cannot be loaded, all-zero
            labels (one per entry along the spectrogram's first
            dimension) are used instead
        @type spect_label_file: str
        @param snippet_outdir: where the snippets are to be placed.
            This is likely to be either in the training / test folder
            of data. Created, including parents, if it does not exist
        @type snippet_outdir: str
        @param window_size: passed through unchanged to
            chop_spectrogram(); presumably the snippet width
        @type window_size: int
        @param logfile: where to log
        @type logfile: {None|str}
        @raise ValueError: if the spectrogram file cannot be loaded
        '''

        self.log = LoggingService(logfile=logfile,
                                  msg_identifier="chop_spectrograms")

        # If snippet_outdir's path does not exist
        # yet, create all dirs along the path:
        if not os.path.exists(snippet_outdir):
            os.makedirs(snippet_outdir)

        # Open the spect file and corresponding labels.
        # Catch Exception rather than using a bare except,
        # so that KeyboardInterrupt/SystemExit still propagate,
        # and keep the underlying cause attached:
        try:
            spectrogram = np.load(spect_file)
        except Exception as e:
            raise ValueError("Invalid spectrogram file") from e

        # A missing/unreadable label file is tolerated
        # on purpose: fall back to all-zero labels.
        try:
            spect_labels = np.load(spect_label_file)
        except Exception:
            print("No label file! Using empty labels")
            spect_labels = np.zeros(spectrogram.shape[0])

        # Extract the spectrogram name so that we can
        # label the chopped spectrograms:
        file_family = FileFamily(spect_file)
        spect_root_name = file_family.file_root

        self.chop_spectrogram(spectrogram, spect_labels, window_size,
                              spect_root_name, snippet_outdir)
Ejemplo n.º 2
0
    def __init__(self,
                 text_file,
                 text_fields,
                 outfiles_dir=None,
                 record_id_col='id',
                 stopword_language='english',
                 ngram_len=3,
                 separator=','):
        '''
        Remember the analysis settings, work out where the
        word statistics and ngram files go, and then call
        generate_outfiles().

        @param text_file: path to the input file
        @type text_file: str
        @param text_fields: stored as self.text_fields
        @type text_fields: presumably a list of column names; confirm
            against generate_outfiles()
        @param outfiles_dir: directory for the result files. If None,
            results are placed next to text_file. Created if it
            does not exist; if it exists but is not a directory
            the process exits with status 1
        @type outfiles_dir: {None|str}
        @param record_id_col: stored as self.record_id_col
        @type record_id_col: str
        @param stopword_language: stored as self.stopword_language
        @type stopword_language: str
        @param ngram_len: stored as self.ngram_len
        @type ngram_len: int
        @param separator: stored as self.separator
        @type separator: str
        '''

        self.log = LoggingService()
        if outfiles_dir is not None:
            if not os.path.exists(outfiles_dir):
                os.mkdir(outfiles_dir)
            elif not os.path.isdir(outfiles_dir):
                print(f"Outfile directory is a file: {outfiles_dir}; quitting")
                sys.exit(1)

        self.text_file = text_file
        self.outfiles_dir = outfiles_dir
        self.text_fields = text_fields
        self.record_id_col = record_id_col
        self.stopword_language = stopword_language
        self.ngram_len = ngram_len
        self.separator = separator

        self.create_wordnet_nltk_pos_xlation()

        # Generate outfile names for word stats,
        # and ngrams:
        if outfiles_dir is None:
            # Output result files to same dir as infile:
            (infile_path, _ext) = os.path.splitext(text_file)
            wordstats_path = f"{infile_path}{self.word_file_suffix}.csv"
            ngrams_path = f"{infile_path}{self.ngram_file_suffix}.csv"
        else:
            # Outdir different from infile's. Both paths
            # must be set here; ngrams_path used to be
            # missing, which raised NameError below:
            infile_name = os.path.basename(text_file)
            wordstats_path = os.path.join(
                outfiles_dir,
                f"{infile_name}{self.word_file_suffix}.csv")
            ngrams_path = os.path.join(
                outfiles_dir,
                f"{infile_name}{self.ngram_file_suffix}.csv")

        self.log.info(f"Infile: {text_file}")
        self.log.info(f"Word stats will be in {wordstats_path}")
        self.log.info(f"NGrams will be in {ngrams_path}")
        self.generate_outfiles(text_file, wordstats_path, ngrams_path)
Ejemplo n.º 3
0
    def test_training_script_start_cmd(self):
        '''
        Build a launcher from the testing world map, ask it
        for the start command of the training script, and
        print that command. No assertions are made.
        '''

        launcher = TrainScriptLauncher(unittesting=True)
        world_map_file = os.path.join(self.curr_dir,
                                      'world_map_for_testing.json')
        raw_world_map = launcher.read_world_map(world_map_file)
        localized_world_map = self.substitute_local_hostname(raw_world_map)
        launcher.config = self.config
        launcher.build_compute_landscape(localized_world_map)

        # Instance vars that the launcher's constructor
        # would normally set:
        launcher.WORLD_SIZE = 4
        launcher.MASTER_PORT = self.config.getint('Parallelism', 'master_port')
        launcher.log = LoggingService()

        # All ranks are zero for this single-process scenario:
        rank = local_rank = min_rank_this_machine = 0
        # The entry of this machine in the world map:
        gpus_to_use = 2

        script_args = dict(
            MASTER_ADDR='127.0.0.1',
            MASTER_PORT=5678,
            RANK=rank,
            LOCAL_RANK=local_rank,
            WORLD_SIZE=4,
            MIN_RANK_THIS_MACHINE=min_rank_this_machine,
            config=self.config_path,
        )
        launch_args = {
            'training_script': os.path.join(self.curr_dir,
                                            '../birds_train_parallel')
        }

        print(
            launcher.training_script_start_cmd(rank, local_rank,
                                               gpus_to_use,
                                               min_rank_this_machine,
                                               launch_args,
                                               script_args))
class SqliteDataset(FrozenDataset):
    '''
    Takes path to a CSV file prepared by ****????****
    Columns: id,advertiser,page,leaning,tokens,ids
    Sample:
      (10,'Biden','http://...','left',"['[CLS],'Joe','runs',...'[SEP]']",'[[114 321 ...],[4531 ...]])
    
    Tokens is a stringified array of array of tokens.
    Length: sequence size (e.g. 128)
    Length: as many are there are lines of sample ads.
    Ids are a stringified arrays of arrays of ints. Each
      int is an index into BERT vocab. 
    Length: sequence size (e.g. 128)
    
    Result in sqlite DB; acts as iterator, has len()
    '''

    SEQUENCE_LEN = 128
    TEXT_COL_NAME = 'text'
    LABEL_COL_NAME = 'label'
    IDS_COL_NAME = 'tok_ids'

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(
        self,
        csv_or_sqlite_path,
        label_mapping,
        sequence_len=None,
        text_col_name=None,
        label_col_name=None,
    ):
        '''
        A dataset for the context of Bert training.
        One usually interacts with an instance of this
        class through a BertFeederDataloader instance
        (see bert_feeder_dataloader.py).

        Instances can act as a stream of input sentences,
        or be a dict-like data source.
        For the dict-like behavior:

            my_dataset[row_num]

        For the stream behavior: treat my_dataset as an
        iterator.

        An additional feature is the option for integrated
        train/validation/test splits. Calling split_dataset()
        internally produces input queues that feed three
        iterators. Callers switch between these iterators via
        the switch_to_split() method. The splits can be reset
        to their beginnings using the reset() method.

        If csv_or_sqlite_path ends in '.csv', the CSV file is
        imported into an Sqlite database that sits next to it
        (same path, extension '.sqlite'). An already existing
        database at that location is deleted first, without
        asking. Any other path is opened as an existing Sqlite
        database that must contain a table Samples.

        The CSV file can have arbitrary columns; only two are
        required: a column with the raw text to be processed
        through a BERT model, and a column with the true labels.
        The column names default to

          SqliteDataset.TEXT_COL_NAME
          SqliteDataset.LABEL_COL_NAME

        These defaults can be changed in the __init__() call
        or in the class variable init.

        The label_mapping must map the textual labels in the
        CSV file to integers 0,1,...

        Ex CSV:

          id,     message,       page,    leaning

         165,"We are the..." ,http://...,  left
            ,"Foo is bar..." ,   ...    ,  right
                    ...

        In this example the important cols are 'message', and
        'leaning'; the label_mapping might be:

            OrderedDict({'right'   : 0,
                         'left'    : 1,
                         'neutral' : 2})

        Sequence length is the maximum number of text input
        tokens into the model in one input sentence. A
        typical number is 128. If input texts in the CSV are
        longer than sequence_len, one or more additional input
        sentences are constructed with the same label as the
        long-text row.

        @param csv_or_sqlite_path: path to a CSV file to import,
            or to an existing Sqlite database
        @type csv_or_sqlite_path: str
        @param label_mapping: mapping from text labels to ints
        @type label_mapping: OrderedDict({str : int})
        @param sequence_len: width of BERT model input sentences
            in number of tokens. Only used when importing a CSV file
        @type sequence_len: {None|int}
        @param text_col_name: CSV column that holds text to process
        @type text_col_name: {None|str}
        @param label_col_name: CSV column that holds labels.
        @type label_col_name: {None|str}
        @raise IOError: if csv_or_sqlite_path does not exist
        '''

        self.log = LoggingService()

        if text_col_name is None:
            self.text_col_name = self.TEXT_COL_NAME
        else:
            self.text_col_name = text_col_name

        if label_col_name is None:
            self.label_col_name = self.LABEL_COL_NAME
        else:
            # This branch used to overwrite self.text_col_name
            # and left self.label_col_name unset:
            self.label_col_name = label_col_name

        self.label_mapping = label_mapping

        if not os.path.exists(csv_or_sqlite_path):
            raise IOError(f"Data source {csv_or_sqlite_path} does not exist.")

        is_csv_source = csv_or_sqlite_path.endswith('.csv')

        if is_csv_source:
            # Remove any existing sqlite db that goes
            # with this CSV file:
            (file_path, _ext) = os.path.splitext(csv_or_sqlite_path)
            sqlite_path = file_path + '.sqlite'
            if os.path.exists(sqlite_path):
                os.remove(sqlite_path)
            # Fill the sqlite db with records, each
            # containing sample_id, tok_ids, label, attention_mask.
            # None values for the optional args are turned
            # into the class defaults by process_csv_file():
            self.db = self.process_csv_file(csv_or_sqlite_path, sqlite_path,
                                            sequence_len, text_col_name,
                                            label_col_name)

        else:
            self.db = sqlite3.connect(csv_or_sqlite_path)
            self.db.row_factory = sqlite3.Row

        num_samples_row = next(
            self.db.execute('''SELECT COUNT(*) AS num_samples from Samples'''))
        num_samples = num_samples_row['num_samples']
        # process_csv_file() numbers its sample_id values
        # 0..n-1, which is what these ids are matched against.
        # NOTE(review): a pre-existing db is assumed to follow
        # the same numbering; confirm for externally built dbs.
        self.sample_ids = list(range(num_samples))

        # Make a preliminary train queue with all the
        # sample ids. If split_dataset() is called later,
        # this queue will be replaced:
        self.train_queue = deque(self.sample_ids)
        self.curr_queue = self.train_queue
        self.saved_queues = {}
        # Again: this saved_queues entry will be
        # replaced upon a split:
        self.saved_queues['train'] = self.train_queue.copy()
        self.num_samples = len(self.train_queue)

    #------------------------------------
    # get_datasplit
    #-------------------

    def get_datasplit(self, split_id):
        if split_id == 'train':
            return self.train_frozen_dataset
        elif split_id == 'validate':
            return self.validate_frozen_dataset
        elif split_id == 'test':
            return self.test_frozen_dataset
        else:
            raise ValueError(
                "Only train, validate, and test are valid split ids.")

    #------------------------------------
    # switch_to_split
    #-------------------

    def switch_to_split(self, split_id):

        if split_id == 'train':
            self.curr_queue = self.train_queue
        elif split_id == 'validate':
            self.curr_queue = self.val_queue
        elif split_id == 'test':
            self.curr_queue = self.test_queue
        else:
            raise ValueError(
                f"Dataset ID must be one of train/validate/test; was {split_id}"
            )

    #------------------------------------
    # curr_split_id
    #-------------------

    def curr_split_id(self):
        if self.curr_queue == self.train_queue:
            return 'train'
        if self.curr_queue == self.val_queue:
            return 'validate'
        if self.curr_queue == self.test_queue:
            return 'test'
        raise ValueError("Bad curr_queue")

    #------------------------------------
    # reset
    #-------------------

    def reset(self, split_id=None):
        '''
        Sets the dataset's queue to the beginning.
        If dataset_id is None, resets the current
        split.
                
        @param split_id:
        @type split_id:
        '''

        # After replenishing the requested
        # queue, check whether that queue was
        # self.curr_queue. If so, change self.curr_queue
        # to point to the new, refilled queue. Else
        # self.curr_queue remains unchanged:

        if split_id == 'train':
            old_train = self.train_queue
            self.train_queue = self.saved_queues['train'].copy()
            if self.curr_queue == old_train:
                self.curr_queue = self.train_queue

        elif split_id == 'validate':
            old_val = self.val_queue
            self.val_queue = self.saved_queues['validate'].copy()
            if self.curr_queue == old_val:
                self.curr_queue = self.val_queue

        elif split_id == 'test':
            old_test = self.test_queue
            self.test_queue = self.saved_queues['test'].copy()
            if self.curr_queue == old_test:
                self.curr_queue = self.test_queue

        else:
            raise ValueError(
                f"Dataset ID must be one of train/validate/test; was {split_id}"
            )

    #------------------------------------
    # process_csv_file
    #-------------------

    def process_csv_file(self, csv_path, sqlite_path, sequence_len,
                         text_col_name, label_col_name):
        '''
        Create an sqlite db containing table
        'Samples' with cols
           sample_id  int
           tok_ids    str  e.g. '[254,456,...]'
           attention_mask str   e.g. [1,0,0,1,...]
           label      int

        CSV file must contain at least a column
        called self.text_col_name and self.label_col_name.
        Rows for which next_csv_row() returns None are skipped.
        sample_id values are assigned consecutively from 0.

        Side effects: sets self.sequence_len, self.text_col_name,
        self.label_col_name, self.text_augmenter, self.reader,
        and self.queued_samples.

        @param csv_path: path to the CSV file to import
        @type csv_path: str
        @param sqlite_path: path of the Sqlite db to fill
        @type sqlite_path: str
        @param sequence_len: number of tokens per input sentence;
            None selects self.SEQUENCE_LEN
        @type sequence_len: {None|int}
        @param text_col_name: CSV column holding the text;
            None selects self.TEXT_COL_NAME
        @type text_col_name: {None|str}
        @param label_col_name: CSV column holding the label;
            None selects self.LABEL_COL_NAME
        @type label_col_name: {None|str}
        @return: a database (connection) instance
        @rtype: sqlite3.Connection
        '''

        # Set defaults where needed:
        if sequence_len is None:
            sequence_len = self.SEQUENCE_LEN
        if text_col_name is None:
            text_col_name = self.TEXT_COL_NAME
        if label_col_name is None:
            label_col_name = self.LABEL_COL_NAME

        self.sequence_len = sequence_len
        self.text_col_name = text_col_name
        self.label_col_name = label_col_name

        # Facility to tokenize and otherwise
        # convert samples to formats ready for
        # BERT:
        self.text_augmenter = TextAugmenter(sequence_len)

        # Get number of CSV lines; only used for progress
        # messages. Counted in Python rather than by parsing
        # 'wc -l' output, which broke on paths containing
        # spaces and on platforms without wc:
        with open(csv_path, 'r') as line_count_fd:
            num_csv_lines = sum(1 for _line in line_count_fd)

        db = sqlite3.connect(sqlite_path)
        db.row_factory = sqlite3.Row

        db.execute('''DROP TABLE IF EXISTS Samples''')
        db.execute('''
                   CREATE TABLE Samples (
                      sample_id int primary key,
                      tok_ids text,
                      attention_mask text,
                      label int
                      )
                   ''')

        # Values are bound as parameters instead of being
        # pasted into the SQL text:
        insert_cmd = '''
                     INSERT INTO Samples (sample_id,
                                          tok_ids,
                                          attention_mask,
                                          label
                                          )
                      VALUES (?, ?, ?, ?)
                     '''
        num_processed = 0
        try:
            # newline='' is what the csv module asks for, so
            # that newlines inside quoted fields survive:
            with open(csv_path, 'r', newline='') as csv_fd:
                self.reader = csv.DictReader(csv_fd)
                # Some texts are partitioned into
                # multiple rows, if they exceed
                # sequence_len. A queue to manage
                # them:
                self.queued_samples = deque()
                while True:
                    # Next dict with 'tok_ids', 'label', 'attention_mask'.
                    # Will throw StopIteration when done:
                    try:
                        row_dict = self.next_csv_row()
                    except StopIteration:
                        break
                    if row_dict is None:
                        # An error in the CSV file; next_csv_row()
                        # already wrote an error msg. Keep going
                        continue
                    db.execute(insert_cmd,
                               (num_processed,
                                str(row_dict['tok_ids']),
                                str(row_dict['attention_mask']),
                                # int() also covers numpy ints,
                                # which sqlite3 cannot bind:
                                int(row_dict['label'])))
                    num_processed += 1
                    # Cap on the number of imported rows
                    # while the module-level TESTING flag is set:
                    if TESTING:
                        if num_processed >= 10000:
                            db.commit()
                            break
                    if num_processed % 1000 == 0:
                        db.commit()
                        self.log.info(
                            f"Processed {num_processed}/{num_csv_lines} CSV records"
                        )
        finally:
            # Keep whatever was imported, even on error:
            db.commit()

        return db

    #------------------------------------
    # __next__
    #-------------------

    def __next__(self):
        try:
            next_sample_id = self.curr_queue.popleft()
        except IndexError:
            raise StopIteration

        res = self.db.execute(f'''
                               SELECT sample_id, tok_ids,attention_mask,label
                                FROM Samples 
                               WHERE sample_id = {next_sample_id}
                             ''')
        row = next(res)
        return self.clean_row_res(dict(row))

    #------------------------------------
    # __getitem__
    #-------------------

    def __getitem__(self, indx):
        '''
        Return indx'th row from the db.
        The entire queue is always used,
        rather than the remaining queue
        after some popleft() ops. 
        
        @param indx:
        @type indx:
        '''

        ith_sample_id = self.saved_queues[self.curr_split_id()][indx]
        res = self.db.execute(f'''
                               SELECT sample_id, tok_ids,attention_mask,label
                                FROM Samples 
                               WHERE sample_id = {ith_sample_id}
                             ''')
        # Return the (only result) row:
        row = next(res)
        return self.clean_row_res(dict(row))

    #------------------------------------
    # __iter__
    #-------------------

    def __iter__(self):
        '''
        The dataset is its own iterator: __next__() pops
        sample ids off the current split's queue.
        '''
        return self

    #------------------------------------
    # __len__
    #-------------------

    def __len__(self):
        '''
        Return length of the current split. Use
        switch_to_split() before calling this
        method to get another split's length.
        The length of the entire queue is returned,
        not just what remains after calls to next()
        '''
        return len(self.saved_queues[self.curr_split_id()])

    #------------------------------------
    # next_csv_row
    #-------------------

    def next_csv_row(self):
        '''
        Returns a dict 'ids', 'label', 'attention_mask'
        '''

        # Still have a row left from a previouse
        # chopping?
        if len(self.queued_samples) > 0:
            return (self.queued_samples.popleft())

        # No pending samples from previously
        # found texts longer than sequence_len:
        row = next(self.reader)
        try:
            txt = row[self.text_col_name]
        except KeyError:
            msg = (
                f"\nCSV file does not have a column named '{self.text_col_name}'\n"
                "You can invoke bert_train_parallel.py with --text\n"
                "to specify col name for text, and --label to speciy\n"
                "name of label column.")
            self.log.err(msg)
            raise ValueError(msg)

        # Tokenize the text of the row (the ad):
        # If the ad is longer than self.SEQUENCE_LEN,
        # then multiple rows are returned.
        # Each returned 'row' is a dict containing
        # just the key self.IDS_COL. Its value is
        # an array of ints: each being an index into
        # the BERT vocab.
        #
        # The ids will already be padded. Get
        #   [{'ids' : [1,2,...]},
        #    {'ids' : [30,64,...]}
        #        ...
        #   ]

        # Get list of dicts: {'tokens' : ['[CLS]','foo',...'[SEP]'],
        #                     'ids'    : [2545, 352, ]
        #                    }
        # dicts. Only one if text is <= sequence_len, else
        # more than one:
        id_dicts = self.text_augmenter.fit_one_row_to_seq_len(txt)

        # Add label. Same label even if given text was
        # chopped into multiple rows b/c the text exceeded
        # sequence_len:

        try:
            label = row[self.label_col_name]
        except KeyError:
            msg = f"CSV file does not have col {self.label_col_name}" + '\n' +\
                    "You can invoke bert_train_parallel.py with --label"
            self.log.err(msg)
            raise ValueError(msg)

        try:
            label_encoding = self.label_mapping[label]
        except KeyError:
            # A label in the CSV file that was not
            # anticipated in the caller's label_mapping dict
            self.log.err(f"Unknown label encoding: {label}")
            return

        for id_dict in id_dicts:
            id_dict['label'] = label_encoding

        # Create a mask of 1s for each token followed by 0s for padding
        for ids_dict in id_dicts:
            ids_seq = id_dict[self.IDS_COL_NAME]
            #seq_mask = [float(i>0) for i in seq]
            seq_mask = [int(i > 0) for i in ids_seq]
            ids_dict['attention_mask'] = seq_mask

        # We now have a list of dicts, each with three
        # keys: 'ids','label','attention_mask'
        if len(id_dicts) > 1:
            self.queued_samples.extend(id_dicts[1:])
        return id_dicts[0]

    #------------------------------------
    # split_dataset
    #-------------------

    def split_dataset(self,
                      sample_ids_or_df=None,
                      train_percent=0.8,
                      val_percent=0.1,
                      test_percent=0.1,
                      save_to_db=True,
                      random_seed=1845):
        '''
        Splits dataset into train, validation, and 
        test sets at the given proportions. One of the
        proportions may be set to None. In that case
        only two splits will be created. Randomly permutes
        samples before splitting
        
        The sample_ids_or_df may be a list of of
        indices into the sqlite db of sample rows from
        the original CSV file, or a dataframe in which 
        each row corresponds to a sample row from the 
        original CSV. If None, uses what this instance
        already knows. If in doubt, let it default.
        
        Creates a deque (a queue) for each split, and
        saves copies of each in a dict (saved_queues).
        Returns a triplet with the queues. 
        
        @param sample_ids_or_df: list of sqlite sample_id, or dataframe
        @type sample_ids_or_df: {list|pandas.dataframe}
        @param train_percent: percentage of samples for training
        @type train_percent: float
        @param val_percent: percentage of samples for validation
        @type val_percent: float
        @param test_percent: percentage of samples for testing
        @type test_percent: float
        @param save_to_db: whether or not to save the indices that
            define each split in the Sqlite db
        @type save_to_db: bool
        @param random_seed: seed for permuting dataset before split
        @type random_seed: int
        '''

        if sample_ids_or_df is None:
            sample_ids_or_df = self.sample_ids

        # Deduce third portion, if one of the
        # splits is None:
        if train_percent is None:
            if val_percent is None or test_percent is None:
                raise ValueError(
                    "Two of train_percent/val_percent/test_percent must be non-None"
                )
            train_percent = 1 - val_percent - test_percent
        elif val_percent is None:
            if train_percent is None or test_percent is None:
                raise ValueError(
                    "Two of train_percent/val_percent/test_percent must be non-None"
                )
            val_percent = 1 - train_percent - test_percent
        elif test_percent is None:
            if train_percent is None or val_percent is None:
                raise ValueError(
                    "Two of train_percent/val_percent/test_percent must be non-None"
                )
            test_percent = 1 - train_percent - val_percent

        if train_percent + val_percent + test_percent != 1.0:
            raise ValueError(
                "Values for train_percent/val_percent/test_percent must add to 1.0"
            )

        np.random.seed(random_seed)
        if type(sample_ids_or_df) == DataFrame:
            sample_indices = list(sample_ids_or_df.index)
        else:
            sample_indices = sample_ids_or_df

        perm = np.random.permutation(sample_indices)
        # Permutations returns a list of arrays:
        #   [[12],[40],...]; turn into simple list of ints:
        num_samples = len(perm)

        train_end = int(train_percent * num_samples)
        validate_end = int(val_percent * num_samples) + train_end
        self.train_queue = deque(perm[:train_end])
        self.val_queue = deque(perm[train_end:validate_end])
        self.test_queue = deque(perm[validate_end:])

        self.curr_queue = self.train_queue

        if save_to_db:
            self.save_queues(self.train_queue, self.val_queue, self.test_queue)

        self.saved_queues = {}
        self.saved_queues['train'] = self.train_queue.copy()
        self.saved_queues['validate'] = self.val_queue.copy()
        self.saved_queues['test'] = self.test_queue.copy()

        self.train_frozen_dataset = FrozenDataset(self.log, self.db, 'train',
                                                  self.saved_queues['train'],
                                                  self.label_mapping,
                                                  self.sample_ids)

        self.validate_frozen_dataset = FrozenDataset(
            self.log, self.db, 'validate', self.saved_queues['validate'],
            self.label_mapping, self.sample_ids)

        self.test_frozen_dataset = FrozenDataset(self.log, self.db, 'test',
                                                 self.saved_queues['test'],
                                                 self.label_mapping,
                                                 self.sample_ids)

    #------------------------------------
    # clean_row_res
    #-------------------

    def clean_row_res(self, row):
        '''
        Given a row object returned from sqlite, 
        turn tok_ids and attention_mask into real
        np arrays, rather than their original str
        
        @param row:
        @type row:
        '''

        # tok_ids are stored as strings:
        row['tok_ids'] = self.to_np_array(row['tok_ids'])
        row['attention_mask'] = self.to_np_array(row['attention_mask'])
        return row
        return (self.train_queue, self.val_queue, self.test_queue)

    #------------------------------------
    # save_queues
    #-------------------

    def save_queues(self, train_queue, val_queue, test_queue):

        self.db.execute('DROP TABLE IF EXISTS TrainQueue')
        self.db.execute('CREATE TABLE TrainQueue (sample_id int)')

        self.db.execute('DROP TABLE IF EXISTS ValidateQueue')
        self.db.execute('CREATE TABLE ValidateQueue (sample_id int)')

        self.db.execute('DROP TABLE IF EXISTS TestQueue')
        self.db.execute('CREATE TABLE TestQueue (sample_id int)')

        # Turn [2,4,6,...] into tuples: [(2,),(4,),(6,),...]
        train_tuples = [(int(sample_id), ) for sample_id in train_queue]
        self.db.executemany("INSERT INTO TrainQueue VALUES(?);", train_tuples)

        val_tuples = [(int(sample_id), ) for sample_id in val_queue]
        self.db.executemany("INSERT INTO ValidateQueue VALUES(?);", val_tuples)

        test_tuples = [(int(sample_id), ) for sample_id in test_queue]
        self.db.executemany("INSERT INTO TestQueue VALUES(?);", test_tuples)

        self.db.commit()

    #------------------------------------
    # save_dict_to_table
    #-------------------

    def save_dict_to_table(self, table_name, the_dict, delete_existing=False):
        '''
        Given a dict, save it to a table in the underlying
        database.
        
        If the table exists, action depends on delete_existing.
        If True, the table is deleted first. Else the dict values
        are added as rows. 
        
        It is the caller's responsibility to ensure that:
        
           - Dict values are db-appropriate data types: int, float, etc.
           - The table name is a legal Sqlite table name  
        
        @param table_name: name of the table
        @type table_name: str
        @param dict: col/value information to store
        @type dict: {str : <any-db-appropriate>}
        '''
        if delete_existing:
            self.db.execute(f'''DROP TABLE IF EXISTS {table_name}''')
            self.db.execute(
                f'''CREATE TABLE {table_name} ('key_col' varchar(255),
                                                          'val_col' varchar(255));'''
            )
            self.db.commit()

        insert_vals = list(the_dict.items())
        self.db.executemany(f"INSERT INTO {table_name} VALUES(?,?);",
                            insert_vals)
        self.db.commit()

    #------------------------------------
    # query_yes_no
    #-------------------

    def query_yes_no(self, question, default='yes'):
        '''
        Put a yes/no question to the user on stdout, read
        the reply via input(), and keep asking until the
        reply is usable.

        @param question: text presented to the user
        @type question: str
        @param default: the presumed answer if the user just
            hits <Enter>: "yes", "no", or None (meaning an
            answer is required of the user)
        @type default: {None|str}
        @return: True for "yes", False for "no"
        @rtype: bool
        @raise ValueError: if default is none of the above
        '''
        if default is None:
            prompt = " [y/n] "
        elif default in ("yes", "no"):
            # Upper case letter marks the default answer:
            prompt = " [Y/n] " if default == "yes" else " [y/N] "
        else:
            raise ValueError("invalid default answer: '%s'" % default)

        answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

        while True:
            sys.stdout.write(question + prompt)
            reply = input().lower()
            if reply == '' and default is not None:
                return answers[default]
            if reply in answers:
                return answers[reply]
            sys.stdout.write("Please respond with 'yes' or 'no' "
                             "(or 'y' or 'n').\n")
Ejemplo n.º 5
0
class TextAnalyzer(object):
    '''
    Reads a delimited text file record by record and writes two
    .csv files: one with per-word statistics, and one with the
    ngrams found in the selected text columns. All work is
    triggered from the constructor.
    '''

    # How many records to process before
    # a progress report:
    PROGRESS_EVERY = 1000

    # Language names; not referenced by the methods
    # of this class that are visible here.
    stopWordLanguageList = [
        'arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish',
        'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian',
        'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian',
        'slovene', 'spanish', 'swedish', 'tajik', 'turkish'
    ]
    # Class-level default; the constructor shadows it with
    # an instance attribute of the same name.
    stopword_language = 'english'

    # Appended to the output file root to form the names
    # of the word stats and ngrams files, respectively:
    word_file_suffix = '_wordstats'
    ngram_file_suffix = '_ngrams'

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 text_file,
                 text_fields,
                 outfiles_dir=None,
                 record_id_col='id',
                 stopword_language='english',
                 ngram_len=3,
                 separator=','):
        '''
        Store the configuration, derive the two output file
        paths, and immediately generate both output files.

        @param text_file: path to the delimited input file
        @type text_file: str
        @param text_fields: names of the input columns whose
            text is to be analyzed
        @type text_fields: [str]
        @param outfiles_dir: directory for the output files. If
            None, the files are placed next to the input file. A
            missing directory is created; if the path exists but
            is not a directory, the process exits with status 1.
        @type outfiles_dir: {None|str}
        @param record_id_col: name of the input column that holds
            the record identifier
        @type record_id_col: str
        @param stopword_language: language passed to the stemmer
            and used to select the stopword list
        @type stopword_language: str
        @param ngram_len: number of words per ngram
        @type ngram_len: int
        @param separator: column delimiter of the input file
        @type separator: str
        '''

        self.log = LoggingService()
        if outfiles_dir is not None:
            if not os.path.exists(outfiles_dir):
                os.mkdir(outfiles_dir)
            elif not os.path.isdir(outfiles_dir):
                print(f"Outfile directory is a file: {outfiles_dir}; quitting")
                sys.exit(1)

        self.text_file = text_file
        self.outfiles_dir = outfiles_dir
        self.text_fields = text_fields
        self.record_id_col = record_id_col
        self.stopword_language = stopword_language
        self.ngram_len = ngram_len
        self.separator = separator

        self.create_wordnet_nltk_pos_xlation()

        # Generate outfile names for word stats and ngrams.
        # Both names share one root, so that neither path
        # can be left undefined on either branch:
        if outfiles_dir is None:
            # Output result files to same dir as infile;
            # the infile's extension is dropped:
            (out_root, _ext) = os.path.splitext(text_file)
        else:
            # Outdir different from infile's. The infile's
            # full base name, extension included, is the root:
            out_root = os.path.join(outfiles_dir,
                                    os.path.basename(text_file))

        wordstats_path = f"{out_root}{self.word_file_suffix}.csv"
        ngrams_path = f"{out_root}{self.ngram_file_suffix}.csv"

        self.log.info(f"Infile: {text_file}")
        self.log.info(f"Word stats will be in {wordstats_path}")
        self.log.info(f"NGrams will be in {ngrams_path}")
        self.generate_outfiles(text_file, wordstats_path, ngrams_path)

    #------------------------------------
    # generate_outfiles
    #-------------------

    def generate_outfiles(self, text_file, word_stats_path, ngrams_path):
        '''
        Generates both word stats and ngrams outfile.

        Each input record is tokenized once per configured text
        field; the cleaned tokens are handed to write_ngrams()
        and write_word_stats().

        Assumptions about inst var initializations:
            o text_fields,
            o record_id_col,
            o stopword_language,
            o ngram_len
            o pos_tag_map
            o separator

        Side effects: sets self.word_col_names, self.stem,
        self.lemmatize, and self.all_stopwords.

        @param text_file: path to the delimited input file
        @type text_file: str
        @param word_stats_path: output path for word stats
        @type word_stats_path: str
        @param ngrams_path: output path for ngrams
        @type ngrams_path: str
        '''

        # Column layout of the word stats .csv file:
        word_stats_cols = [
            self.record_id_col, 'word', 'stem', 'lemmatized', 'pos',
            'sent_neg', 'sent_neu', 'sent_pos', 'sent_compound',
            'stop_word', 'word_number'
        ]

        # The ngrams col header depends on the
        # number of words per ngram requested:
        #   id,Word1,Word2,...,full_ngram,ngram_sentiment,ngram_number
        self.word_col_names = [
            f"Word{indx+1}" for indx in range(self.ngram_len)
        ]
        heading = [self.record_id_col]
        heading.extend(self.word_col_names)
        heading.extend(['full_ngram', 'ngram_sentiment', 'ngram_number'])

        # Context managers guarantee that whichever files were
        # successfully opened get closed, and that a failing
        # open() surfaces as its own error:
        with open(word_stats_path, 'w', newline='') as word_stats_fd, \
                open(ngrams_path, 'w', newline='') as ngram_fd:

            word_stats_writer = csv.DictWriter(word_stats_fd, word_stats_cols)
            word_stats_writer.writeheader()

            ngrams_writer = csv.DictWriter(ngram_fd, heading)
            ngrams_writer.writeheader()

            # The NLTK tools:
            self.log.info("Creating NLTK tool instances...")
            tokenizer = TweetTokenizer(preserve_case=False)
            stemmer = SnowballStemmer(self.stopword_language)
            # Speed up stemming:
            self.stem = lru_cache(maxsize=50000)(stemmer.stem)
            lemmatizer = WordNetLemmatizer()
            # Speed up lemmatization by caching:
            self.lemmatize = lru_cache(maxsize=50000)(lemmatizer.lemmatize)
            # Create the sentiment analyzer for some basic sentiment tests.
            sentiment_analyzer = SentimentIntensityAnalyzer()
            self.log.info("Done creating NLTK tool instances.")

            # Set of all stopwords:
            self.all_stopwords = set(stopwords.words(self.stopword_language))

            # Keep tokens that *start* with a word character
            # or apostrophe. This drops pure punctuation tokens,
            # but keeps contractions, such as "can't":
            spec_char_pat = re.compile(r"[\w']+")

            # How many records since last progress
            # report:
            records_since_prog_rep = 0

            record_num = 0
            # newline='' lets the csv module do its own
            # newline handling, as the csv docs recommend:
            with open(text_file, 'r', newline='') as in_fd:
                csv_reader = csv.DictReader(in_fd, delimiter=self.separator)
                for row_dict in csv_reader:
                    record_num += 1
                    records_since_prog_rep += 1
                    for txt_field in self.text_fields:
                        text = row_dict[txt_field]
                        # Short rows yield None for missing columns:
                        if text is None:
                            continue
                        # Tokenize:
                        token_arr = tokenizer.tokenize(text)
                        # Remove punctuation:
                        clean_token_arr = [
                            kept_token for kept_token in token_arr
                            if spec_char_pat.match(kept_token) is not None
                        ]
                        # NOTE(review): ngram rows are keyed by the running
                        # record number, while write_word_stats() keys its
                        # rows by the value of the record id column.
                        # Confirm this difference is intended.
                        self.write_ngrams(record_num, clean_token_arr,
                                          ngrams_writer, sentiment_analyzer,
                                          self.ngram_len)
                        self.write_word_stats(record_num, row_dict, text,
                                              clean_token_arr,
                                              word_stats_writer,
                                              sentiment_analyzer)
                    # Time for progress report?
                    if records_since_prog_rep >= self.PROGRESS_EVERY:
                        self.log.info(
                            f"Processed {record_num} input file records.")
                        records_since_prog_rep = 0

    #------------------------------------
    # write_word_stats
    #-------------------

    def write_word_stats(self, record_num, row_dict, text, clean_token_arr,
                         word_stats_writer, sentiment_analyzer):
        '''
        Compute and write to file all word stats
        of one record. One output row is written per token.
        The sentiment scores are computed once for the whole
        text and repeated on every row.

        Assumes self.stem, self.lemmatize, self.all_stopwords,
        and self.pos_tag_map have been initialized.

        @param record_num: record count of this record
        @type record_num: int
        @param row_dict: dict returned from csv reader; one key
            per column
        @type row_dict: {str : str}
        @param text: text to analyze; one field of one in file record
        @type text: str
        @param clean_token_arr: tokenized array of the text
        @type clean_token_arr: [str]
        @param word_stats_writer: csv dict writer for output
        @type word_stats_writer: csv.DictWriter
        @param sentiment_analyzer: sentiment analysis NLTK instance
        @type sentiment_analyzer: SentimentIntensityAnalyzer
        '''

        # First, get sentiment neg/neu/pos/compound for
        # the text:
        sent_dict = sentiment_analyzer.polarity_scores(text)

        stem_arr = [self.stem(word) for word in clean_token_arr]
        pos_tuples = nltk.pos_tag(clean_token_arr)
        # pos_tag_map is keyed by the *first letter* of the
        # tagger's tags ('J', 'V', 'R'). Looking up the full
        # tag, such as 'JJ' or 'VBD', would always fall through
        # to the noun default:
        lem_arr = [
            self.lemmatize(word, pos=self.pos_tag_map[pos[:1]])
            for (word, pos) in pos_tuples
        ]
        pos_arr = [pos for (_word, pos) in pos_tuples]
        stopword_status_arr = [
            word in self.all_stopwords for word in clean_token_arr
        ]

        rec_id = row_dict[self.record_id_col]
        # Finally, put the rows together, a row for each token:
        for (i, token) in enumerate(clean_token_arr):
            out_row = {
                self.record_id_col: rec_id,
                'word': token,
                'stem': stem_arr[i],
                'lemmatized': lem_arr[i],
                'pos': pos_arr[i],
                'sent_neg': sent_dict['neg'],
                'sent_neu': sent_dict['neu'],
                'sent_pos': sent_dict['pos'],
                'sent_compound': sent_dict['compound'],
                'stop_word': stopword_status_arr[i],
                # NOTE(review): column is called word_number, but
                # receives the record count, not the token index i
                # (write_ngrams stores the index in ngram_number).
                # Left unchanged; confirm which is intended.
                'word_number': record_num,
            }
            word_stats_writer.writerow(out_row)

    #------------------------------------
    # write_ngrams
    #-------------------

    def write_ngrams(self, row_id, clean_token_arr, ngram_writer,
                     nltkSentiment, ngram_len):
        '''
        Write one output row for every ngram of the given
        token sequence. Assumes self.word_col_names has been
        initialized with one column name per ngram position.

        @param row_id: value to store in the record id column
        @type row_id: {int|str}
        @param clean_token_arr: tokens from which to build ngrams
        @type clean_token_arr: [str]
        @param ngram_writer: csv dict writer for output
        @type ngram_writer: csv.DictWriter
        @param nltkSentiment: sentiment analyzer; its compound
            polarity score of the joined ngram is stored
        @type nltkSentiment: SentimentIntensityAnalyzer
        @param ngram_len: number of words per ngram
        @type ngram_len: int
        '''
        for (seq_num, gram) in enumerate(ngrams(clean_token_arr, ngram_len)):
            phrase = ' '.join(gram)
            # One column per word of the ngram:
            record = dict(zip(self.word_col_names, gram))
            record['full_ngram'] = phrase
            scores = nltkSentiment.polarity_scores(phrase)
            record['ngram_sentiment'] = scores['compound']
            # Position of this ngram within the token sequence:
            record['ngram_number'] = str(seq_num)
            record[self.record_id_col] = row_id
            ngram_writer.writerow(record)

    #------------------------------------
    # create_wordnet_nltk_pos_xlation
    #-------------------

    def create_wordnet_nltk_pos_xlation(self):
        '''
        Wordnet has different part of speech tags
        than NLTK. Build a dict from one to the other.
        When in doubt, make pos be a noun.
        '''
        self.pos_tag_map = defaultdict(lambda: 'n')
        self.pos_tag_map['J'] = 'a'
        self.pos_tag_map['V'] = 'v'
        self.pos_tag_map['R'] = 'r'
# Example no. 6
# 0
class Spectrogrammer(object):
    '''
    Create and manipulate spectrograms corresponding to provided
    .wav files. Given a list of .wav files, allows for the
    preparation of spectrograms and / or the corresponding
    label masks. All requested work is done by the constructor.

    NOTE: of the class constants below, only NFFT and
    DEFAULT_FRAMERATE are referenced by the methods visible in
    this class; the constructor's parameter defaults repeat the
    other values as literals.
    '''

    # Default specifications
    NFFT = 4096  # was 3208 # We want a frequency resolution of 1.95 Hz
    # Used by the elephant people but not us! If NFFT is less than
    # Pad_to does something special!
    PAD_TO = NFFT

    OVERLAP = 1 / 2
    HOP_LENGTH = 800  # Want second resolution of 0.1 seconds
    # Maximum frequency retained in spectrograms
    MAX_FREQ = 150  # This we can honestly consider decreasing. But let us leave it for now!
    # Primarily taken from reading in a .wav file!
    # Fallback used when no framerate is passed to the constructor:
    DEFAULT_FRAMERATE = 8000

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(
        self,
        infiles,
        actions,
        outdir=None,  # Output to same dir as files are located
        normalize=False,  # We may want to consider using this!
        framerate=None,  # this by default will be found from the .wav file
        min_freq=0,  # Hz
        max_freq=150,  # Hz
        nfft=4096,
        pad_to=4096,
        hop=800,
        logfile=None,
    ):
        '''
        Process every file in infiles according to actions.
        A file that cannot be processed is reported and skipped;
        the remaining files are still processed.

        @param infiles: Files to spectrogram identified by the
            corresponding .wav. A single file is accepted as well.
        @type infiles: {str | [str]}
        @param actions: the tasks to accomplish:
            {spectro|melspectro|labelmask|copyraven}
            NOTE: copy raven is used simply to copy over the raven gt label .txt
            file if we are moving the spectrogram to a new location
            NOTE: only 'spectro' and 'labelmask' are acted upon by
            this constructor.
        @type actions: [str]
        @param outdir: if provided, everything that is created is written
            to this directory. If None, is written to the directory of the file
            on which computed was based. This is a good default!
        @type outdir {None | str}
        @param normalize: whether or not to normalize the signal to be within 16 bits.
            NOTE: currently not used by this constructor.
        @type normalize: bool
        @param framerate: framerate of the recording. Normally
            obtained from the wav file itself: the framerate read
            from each .wav file replaces this value.
        @type framerate: int
        @param min_freq: min frequency in the processed spectrogram
        @type min_freq: int
        @param max_freq: max frequency in the processed spectrogram
        @type max_freq: int
        @param nfft: window width
        @type nfft: int
        @param pad_to: length to which each window is padded for the fft
        @type pad_to: int
        @param hop: number of samples between successive windows
        @type hop: int
        @param logfile: destination for log. Default: display
        @type logfile: {None|str}
        '''

        # Set up instance variables related to spectrogram generation
        self.nfft = nfft
        self.pad_to = pad_to
        self.hop = hop
        self.min_freq = min_freq
        self.max_freq = max_freq

        # Output directory
        self.outdir = outdir

        if logfile is None:
            self.log = LoggingService()
        else:
            self.log = LoggingService(logfile, msg_identifier="spectrogrammer")

        if not isinstance(infiles, list):
            infiles = [infiles]

        # Depending on what caller wants us to do,
        # different arguments must be passed. Make
        # all those checks up front to avoid caller waiting
        # a long time for processing to be done only to fail
        # at the end:
        if not self._ensure_prerequisites(infiles, actions, framerate, nfft,
                                          outdir):
            return

        # Prepare the desired component of each .wav file
        for infile in infiles:
            # Super basic file checking
            if not os.path.exists(infile):
                print(f"File {infile} does not exist.")
                continue

            # Get the file_root and names related to
            # the infile in our file naming scheme.
            # Useful for associating .wav and .txt files
            file_family = FileFamily(infile)

            # Output the files to the same path as input
            # Note this allows self.outdir to change for each file
            if outdir is None:
                self.outdir = file_family.path

            # Start by trying to read the .wav file - the backbone of everything!
            try:
                self.log.info(f"Reading wav file {infile}...")
                (self.framerate, samples) = wavfile.read(infile)
                self.log.info(f"Done reading wav file {infile}.")
            except Exception as e:
                self.log.warn(f"Cannot process .wav file: {repr(e)}")
                # Continue onto the next file; this
                # has been seen with corrupted .wav files
                continue

            # Generate and process the full spectrogram
            if 'spectro' in actions:
                try:
                    spect, times = self.make_spectrogram(samples)
                except Exception as e:
                    print(f"Cannot create spectrogram for {infile}: {repr(e)}")
                    # Like all other per-file failures: skip just
                    # this file, rather than abandoning the batch
                    continue

                # Save the spectrogram
                spectro_outfile = os.path.join(self.outdir,
                                               file_family.spectro)
                np.save(spectro_outfile, spect)
                # Save the time mask
                times_outfile = os.path.join(self.outdir,
                                             file_family.time_labels)
                np.save(times_outfile, times)

            if 'labelmask' in actions:
                # Get label mask with 1s at time periods with an elephant call.
                # NOTE(review): the time mask is loaded from the path
                # FileFamily reports, while it is saved above to
                # self.outdir; confirm these agree when outdir is given.
                time_file = file_family.fullpath(AudioType.TIME)
                try:
                    times = np.load(time_file)
                except Exception:
                    print(
                        f"Have not created the necessary time mask file for the spectrogram {infile}"
                    )
                    continue

                raven_file = file_family.fullpath(AudioType.LABEL)
                label_mask = self.create_label_mask_from_raven_table(
                    times, raven_file)
                np.save(os.path.join(self.outdir, file_family.mask),
                        label_mask)

    #------------------------------------
    # _ensure_prerequisites
    #-------------------

    def _ensure_prerequisites(self, infiles, actions, framerate, nfft, outdir):
        # Prerequisites:
        if outdir is not None and not os.path.exists(outdir):
            os.makedirs(outdir)

        if 'labelmask' in actions:
            # Need true framerate and spectrogram bin size
            # to compute time ranges in spectrogram:
            if nfft is None:
                self.log.warn(
                    f"Assuming default time bin nfft of {self.NFFT}!\n"
                    "If this is wrong, label allignment will be wrong")
            if framerate is None:
                self.log.warn(
                    f"Assuming default framerate of {self.DEFAULT_FRAMERATE}!\n"
                    "If this is wrong, label allignment will be wrong")

        if 'spectro' in actions or 'melspectro' in actions:
            if not any(filename.endswith('.wav') for filename in infiles):
                self.log.err(
                    "For creating a spectrogram, a .wav file must be provided")
                return False

            if framerate is not None:
                self.log.warn(
                    f"Framerate was provided, but will be ignore: using framerate from .wav file."
                )

        if framerate is None:
            self.framerate = self.DEFAULT_FRAMERATE

        if type(infiles) != list:
            infiles = [infiles]

        return True

    #------------------------------------
    # make_spectrogram
    #-------------------

    def make_spectrogram(self, raw_audio, chunk_size=1000):
        '''
        Given data, compute a spectrogram. To avoid slow memory
        issues build the spectrogram in a stream-like fashion
        where we build it in chunks of chunk_size spectrogram
        frames. Audio shorter than one chunk is handled as a
        single chunk.

        Only frequencies up to self.max_freq are retained;
        self.min_freq is not applied here.

        Assumptions:
            o self.framerate contains the data framerate
            o self.nfft contains the window size
            o self.hop contains the hop size for fft
            o self.pad_to contains the zero-padding that
              we add if self.pad_to > self.nfft
            o self.max_freq contains the highest frequency to keep

        @param raw_audio: the time/amplitude data
        @type raw_audio: np.array([float])
        @param chunk_size: controls the incremental size used to
            build up the spectrogram
        @type chunk_size: int
        @return: (spectrogram of shape (time, freq), time slices array)
        @rtype: (np.array([float]), np.array([float]))
        '''

        # The time_labels will be in seconds. But
        # they will be fractions of a second, like
        #   0.256, ... 1440
        self.log.info("Creating spectrogram...")

        # Number of raw audio frames needed
        # to generate chunk_size spect frames:
        len_chunk = (chunk_size - 1) * self.hop + self.nfft
        # To simulate one continuous sliding window, the next
        # chunk does not start at the end of the previous one,
        # but at (previous end - nfft) + hop. **THIS IS KEY** to
        # properly simulate the full spectrogram creation process:
        chunk_advance = len_chunk - self.nfft + self.hop
        # Time gap between successive spectrogram frames:
        frame_gap = self.hop / self.framerate
        num_samples = raw_audio.shape[0]

        # Chunks are collected, and concatenated once at the end:
        spect_chunks = []
        time_chunks = []

        def add_chunk(samples):
            [spectrum, freqs, t] = ml.specgram(samples,
                                               NFFT=self.nfft,
                                               Fs=self.framerate,
                                               noverlap=(self.nfft - self.hop),
                                               window=ml.window_hanning,
                                               pad_to=self.pad_to)
            # Cut out unneeded high frequencies:
            spectrum = spectrum[(freqs <= self.max_freq)]
            if time_chunks:
                # Shift t to start at 0, then offset the new times
                # by the last frame's time + the gap between frames:
                t = t - t[0] + time_chunks[-1][-1] + frame_gap
            spect_chunks.append(spectrum)
            time_chunks.append(t)

        start_chunk = 0
        iteration = 0
        print("Approx number of chunks:", int(num_samples / len_chunk))
        while start_chunk + len_chunk < num_samples:
            if iteration % 100 == 0:
                print("Chunk number " + str(iteration))
            add_chunk(raw_audio[start_chunk:start_chunk + len_chunk])
            start_chunk += chunk_advance
            iteration += 1

        # One final chunk for whatever remains at the end. For
        # short audio this is the only chunk:
        add_chunk(raw_audio[start_chunk:])

        final_spec = np.concatenate(spect_chunks, axis=1)
        slice_times = np.concatenate(time_chunks)

        self.log.info("Done creating spectrogram.")

        # This we may actually want to do! Log transform!!!
        # Transformer for magnitude to power dB:
        #amp_to_dB_transformer = torchaudio.transforms.AmplitudeToDB()
        #freq_time_dB_tensor = amp_to_dB_transformer(torch.Tensor(freq_time))

        # Note transpose the spectrogram to be of shape - (time, freq)
        return final_spec.T, slice_times

    #------------------------------------
    # make_mel_spectrogram
    #-------------------
    # NEED TO UPDATE THIS STUFF!!!!!
    def make_mel_spectrogram(self, sig_t, framerate):
        '''
        Compute a mel spectrogram of the given signal, and
        convert its values to dB.

        NOTE(review): this method reads self.n_fft,
        self.win_length, and self.hop_length. The constructor
        visible in this class sets self.nfft, self.pad_to, and
        self.hop instead, so unless those attributes are set
        elsewhere, calling this method raises AttributeError.

        @param sig_t: the signal to transform; presumably a
            tensor of audio samples -- TODO confirm
        @type sig_t:
        @param framerate: sample rate of the signal
        @type framerate: int
        @return: (mel band indices, result of
            DSPUtils.compute_timeticks(), mel spectrogram in dB)
        @rtype: (np.array([int]), <see DSPUtils>, <tensor>)
        '''

        # Get tensor (128 x num_samples), where 128
        # is the default number of mel bands. Can
        # change in call to MelSpectrogram:
        mel_spec_t = transforms.MelSpectrogram(
            sample_rate=framerate,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)(sig_t)

        # Turn energy values to db of max energy
        # in the spectrogram:
        mel_spec_db_t = transforms.AmplitudeToDB()(mel_spec_t)

        # The unpacking requires the spectrogram to be 2D:
        (num_mel_bands, _num_timebins) = mel_spec_t.shape

        # Number of columns in the spectrogram:
        num_time_label_choices = DSPUtils.compute_timeticks(
            framerate, mel_spec_db_t)
        # Enumeration of the mel bands to use as y-axis labels:
        freq_labels = np.array(range(num_mel_bands))

        return (freq_labels, num_time_label_choices, mel_spec_db_t)

    #------------------------------------
    # create_label_mask_from_raven_table
    #-------------------

    def create_label_mask_from_raven_table(self, time_mask, label_txt_file):
        '''
        Given a raven label table and a time_mask corresponding
        to the times for each spectrogram column, create a mask
        a mask file with 1s where the spectrogram time bins
        would match labels, and 0s elsewhere.
        
        Label files are of the form:
        
            Col1    ...    Begin Time (s)    End Time (s)    ... Coln
             foo             6.326            4.653              bar
                    ...

        @param time_mask: time mask representing the spectrogram
            time bins for its columns
        @type time_mask: np.array<float>
        @param label_txt_file: either a path to a label file,
            or an open fd to such a file:
        @type label_txt_file: {str|file-like}
        '''

        # Start with an all-zero label mask:
        label_mask = np.zeros(len(time_mask), dtype=int)
        try:
            if type(label_txt_file) == str:
                fd = open(label_txt_file, 'rt')  #, encoding='latin1') #???
            else:
                fd = label_txt_file

            reader = csv.DictReader(fd, delimiter='\t')
            for (start_bin_idx, end_bin_idx) in self._get_label_indices(
                    reader, time_mask, label_txt_file):

                # Fill the mask with 1s in the just-computed range:
                label_mask[start_bin_idx:end_bin_idx] = 1

        finally:
            # Only close an fd that we may have
            # opened in this method. Fds passed
            # in remain open for caller to close:
            if type(label_txt_file) == str:
                fd.close()

        return label_mask

    #------------------------------------
    # _get_label_indices
    #-------------------

    def _get_label_indices(self, reader, label_times, label_txt_file):

        if type(label_times) != np.ndarray:
            label_times = np.array(label_times)

        file_offset_key = 'File Offset (s)'
        begin_time_key = 'Begin Time (s)'
        end_time_key = 'End Time (s)'

        # Get each el call time range spec in the labels:
        for label_dict in reader:
            try:
                #start_time = float(row['File Offset (s)'])
                begin_time = float(label_dict[file_offset_key])
                call_length = float(label_dict[end_time_key]) - float(
                    label_dict[begin_time_key])
                end_time = begin_time + call_length
            except KeyError:
                raise IOError(
                    f"Raven label file {label_txt_file} does not contain one "
                    f"or both of keys '{begin_time_key}', {end_time_key}'")

            if end_time < begin_time:
                self.log.err(
                    f"Bad label: end label less than begin label: {end_time} < {begin_time}"
                )
                continue

            if begin_time > label_times[-1]:
                self.log.err(
                    f"Bad label: begin label after end of recording: {begin_time} > {label_times[-1]}"
                )
                continue
            if end_time < label_times[0]:
                self.log.err(
                    f"Bad label: end label before start of recording: {end_time} < {label_times[0]}"
                )
                continue

            # To deal very loosely with noise around the boundaries
            # let us be on the stricter end of setting the bounds.
            # Find the lower and upper time boarders that do not
            # include the start/end times and then shrink these
            # boarders in by 1.

            pre_begin_indices = np.nonzero(label_times < begin_time)[0]
            # Is label start time beyond end of recording?
            if len(pre_begin_indices) == 0:
                start_bin_idx = 0
            else:
                # Make the bounds a bit tighter!
                start_bin_idx = pre_begin_indices[-1] + 1

            # Similarly with end time:
            post_end_indices = np.nonzero(label_times > end_time)[0]
            if len(post_end_indices) == 0:
                # Label end time is beyond recording. Just
                # go up to the end:
                end_bin_idx = len(label_times)
            else:
                # Similar, make bounds a bit tighter
                end_bin_idx = post_end_indices[0] - 1

            yield (start_bin_idx, end_bin_idx)

    #------------------------------------
    # plot
    #-------------------
    # Move this to a visualization class!
    def plot(self, times, spectrum, label_mask, vert_lines, title='My Title'):
        '''
        Show a spectrogram in dB, with two vertical marker
        lines, and plot the label mask over time in a second
        set of axes. Blocks in plt.show().

        @param times: time of each spectrogram time bin
        @type times: np.array<float>
        @param spectrum: the spectrogram; it is transposed before
            display, so presumably of shape (time, freq) -- TODO confirm
        @type spectrum: np.array<float>
        @param label_mask: one value per entry of times
        @type label_mask: np.array<int>
        @param vert_lines: two indices into times at which
            to draw vertical lines
        @type vert_lines: (int, int)
        @param title: title of the spectrogram plot
        @type title: str
        '''
        # Convert to dB:
        new_features = 10 * np.log10(spectrum).T
        # Color range: from 2 std below the mean to 6 std above
        # the mean, clipped to the range of the data itself:
        min_dbfs = new_features.flatten().mean()
        max_dbfs = new_features.flatten().mean()
        min_dbfs = np.maximum(new_features.flatten().min(),
                              min_dbfs - 2 * new_features.flatten().std())
        max_dbfs = np.minimum(new_features.flatten().max(),
                              max_dbfs + 6 * new_features.flatten().std())

        fig = plt.figure()
        # NOTE(review): ax is laid out on a 1x1 grid, ax2 on a 2x1
        # grid of the same figure, so the two presumably overlap.
        # Confirm that this is the intended layout.
        ax = fig.add_subplot(1, 1, 1)
        ax2 = fig.add_subplot(2, 1, 1)
        # NOTE(review): frequencies is only used by the
        # commented-out pcolormesh call below.
        frequencies = np.arange(new_features.shape[0])
        #ax.pcolormesh(times, frequencies, new_features)
        # The y extent is hard-coded to 0..150, rather
        # than taken from self.max_freq:
        ax.imshow(new_features,
                  cmap="magma_r",
                  vmin=min_dbfs,
                  vmax=max_dbfs,
                  interpolation='none',
                  origin="lower",
                  aspect="auto",
                  extent=[times[0], times[times.shape[0] - 1], 0, 150])
        print(times[vert_lines[0]], times[vert_lines[1]])
        ax.axvline(x=times[vert_lines[0]], color='r', linestyle='-')
        ax.axvline(x=times[vert_lines[1]], color='r', linestyle='-')
        ax.set_title(title)
        ax.set_xlabel('Time')
        ax.set_ylabel('Frequency')

        ax2.plot(times, label_mask)

        # Make the plot appear in a specified location on the screen
        # NOTE(review): wm_geometry() looks Tk specific; presumably
        # this fails with other GUI backends -- confirm. The value
        # of geom is not used.
        mngr = plt.get_current_fig_manager()
        geom = mngr.window.geometry()
        mngr.window.wm_geometry("+400+150")
        plt.show()

    #------------------------------------
    # make_time_freq_seqs
    #-------------------

    def make_time_freq_seqs(self, max_freq, spect):

        # Num rows is num of frequency bands.
        # Num cols is number of time ticks:
        (num_freqs, num_times) = spect.shape
        # Ex: if max freq is 150Hz, and the number of
        # freq ticks on the y axis is 77, then each
        # tick is worth 150Hz/77 = 1.95Hz

        freq_band = max_freq / num_freqs
        freq_scale = list(np.arange(0, max_freq, freq_band))
        time_scale = list(np.arange(num_times))
        return (freq_scale, time_scale)

    #------------------------------------
    # get_label_filename
    #-------------------

    def get_label_filename(self, spect_numpy_filename):
        '''
        Given the file name of a numpy spectrogram 
        file of the forms:
           nn03a_20180817_neg-features_10.npy
           nn03a_20180817_features_10.npy
           
        create the corresponding numpy label mask file
        name:
           nn03a_20180817_label_10.npy
           
        
           
        @param spect_numpy_filename:
        @type spect_numpy_filename:
        '''
        # Check extension:
        (_fullname, ext) = os.path.splitext(spect_numpy_filename)
        if ext != '.npy':
            raise ValueError("File needs to be a .npy file.")

        # Maybe a dir is included, maybe not:
        dirname = os.path.dirname(spect_numpy_filename)
        filename = os.path.basename(spect_numpy_filename)

        try:
            (loc_code, date, _file_content_type,
             id_num_plus_rest) = filename.split('_')
        except ValueError:
            raise ValueError(
                f"File name {spect_numpy_filename} does not have exactly four components."
            )
        label_filename = f"{loc_code}_{date}_labels_{id_num_plus_rest}"
        full_new_name = os.path.join(dirname, label_filename)
        return full_new_name
# Example no. 7
# 0
class Spectrogrammer(object):
    '''
    Create and manipulate spectrograms from a given folder of spectrogram
    files and (potentially) .txt label files. Depending on the set of input
    actions, it can generate spectrograms from .wav files, generate the
    corresponding 0/1 spectrogram labelings, generate spectrogram
    time-slicings, and copy over the ground truth (gt) labeling.
    '''

    # Default specifications
    NFFT = 4096  # was 3208 # We want a frequency resolution of 1.95 Hz
    # Used by the elephant people but not us! If NFFT is less than
    # Pad_to does something special!
    PAD_TO = NFFT

    OVERLAP = 1 / 2
    HOP_LENGTH = 800  # Want second resolution of 0.1 seconds
    # Maximum frequency retained in spectrograms
    MAX_FREQ = 150  # This we can honestly consider decreasing. But let us leave it for now!
    MIN_FREQ = 0
    # Primarily taken from reading in a .wav file!
    DEFAULT_FRAMERATE = 8000
    # We should not be using these!!

    #------------------------------------
    # Constructor
    #-------------------
    """
        Active thoughts: It would be nice to have functionality for dealing with a whole folder,
        a list of files, etc. However, this should not be the job of the spectrogramer. I like
        the idea of being passed a list of the files that we want to do stuff too! The question is
        how should this be formatted?? This is my thinking!! We want to allow several options
            - Create just spectrograms
            - create just labelmasks
            - create spectrograms and label masks
            - copy just the raven file.
        For this need flexability in accepting a collection of .wav and .txt files. Here is what 
        we shall do! Let us a list of generic files like Andreas does and then do as we want!
    """
    def __init__(
        self,
        infiles,
        actions,
        outdir=None,  # if None Output to same dir as files are located
        normalize=False,  # We may want to consider using this!
        to_db=False,
        min_freq=0,  # Hz 
        max_freq=150,  # Hz
        nfft=4096,
        pad_to=4096,
        hop=800,
        framerate=8000,
        logfile=None,
    ):
        '''
        Store the spectrogram parameters, then immediately process
        every file in infiles according to the requested actions.
        All results are written to disk as .npy files whose names
        are taken from the input file's FileFamily.

        Per input file:
            o <name>.wav with 'spectro' in actions: a spectrogram
              and its time labels are computed and saved.
              'melspectro' is not implemented yet.
            o <name>_marginal.txt with 'marginal_labelmask' in
              actions: a 0/1 label mask that excludes marginal
              calls is saved.
            o <name>.txt with 'labelmask' in actions: a 0/1 label
              mask is saved. 'copyraven' is not implemented yet.

        Files that do not exist, or that cannot be processed, are
        reported and skipped.

        The normalize and to_db arguments are accepted, but are
        not used by this constructor.

        @param infiles: List of files to process (containing potentially .wav and .txt).
            A single file name is accepted as well.
        @type infiles: {[str] | str}
        @param actions: the tasks to accomplish: 
            {spectro|melspectro|labelmask|copyraven|marginal_labelmask}
            NOTE: copy raven is used simply to copy over the raven gt label .txt
            file if we are moving the spectrogram to a new location
        @type actions: {[str] | str}
        @param outdir: if provided, everything that is created is written
            to this directory, which is created if needed. If None, 
            output is written to the directory of the respective input file.
        @type outdir {None | str}
        @param normalize: whether or not to normalize the signal to be within 16 bits
        @type normalize: bool
        @param to_db: whether to convert to db scale (log scale essentially)
        @type to_db: bool
        @param min_freq: min frequency in the processed spectrogram
        @type min_freq: int
        @param max_freq: max frequency in the processed spectrogram
        @type max_freq: int
        @param nfft: window width
        @type nfft: int
        @param pad_to: stored as self.pad_to for spectrogram creation
        @type pad_to: int
        @param hop: stored as self.hop for spectrogram creation
        @type hop: int
        @param framerate: framerate of the recording. Normally 
            obtained from the wav file itself.
        @type framerate: int
        @param logfile: destination for log. Default: display
        @type logfile: {None|str}
        '''

        # Set up instance variables related to spectrogram generation
        self.nfft = nfft
        self.pad_to = pad_to
        self.hop = hop
        self.min_freq = min_freq
        self.max_freq = max_freq
        self.framerate = framerate

        # Output directory; re-assigned for each input file
        # when no outdir was given:
        self.outdir = outdir

        if logfile is None:
            self.log = LoggingService()
        else:
            self.log = LoggingService(logfile, msg_identifier="spectrogrammer")

        # Allow a single file to be passed without a wrapping list:
        if not isinstance(infiles, (list, tuple)):
            infiles = [infiles]
        # A bare string would turn the membership tests below
        # into substring tests ('spectro' in 'melspectro'):
        if isinstance(actions, str):
            actions = [actions]

        # Step through the input files and process them depending on
        # their file signature and actions specified as args.
        for infile in infiles:
            # Super basic file checking
            if not os.path.exists(infile):
                print(f"File {infile} does not exist.")
                continue

            print("Processing file:", infile, " - With actions:", actions)
            # Get the file_root and names related to
            # the infile in our file naming scheme.
            # Note this is useful for associating
            # .wav and .txt files
            file_family = FileFamily(infile)

            # Output the files to the same path as input
            # Note this allows self.outdir to change for each file
            if outdir is None:
                self.outdir = file_family.path

            # Make sure the directory we are about to write to
            # exists. Must test self.outdir, not the outdir argument,
            # which may be None. An empty path means the current dir:
            if self.outdir and not os.path.exists(self.outdir):
                os.makedirs(self.outdir, exist_ok=True)

            # Process wav file if spectro / melspectro in actions
            if infile.endswith('.wav') and ('spectro' in actions
                                            or 'melspectro' in actions):
                try:
                    self.log.info(f"Reading wav file {infile}...")
                    (_, samples) = wavfile.read(infile)
                    self.log.info(f"Done reading wav file {infile}.")
                except Exception as e:
                    self.log.warn(f"Cannot process .wav file: {repr(e)}")
                    # Move on to the next file; this has been
                    # seen with corrupted .wav files
                    continue

                if 'spectro' in actions:
                    try:
                        spect, times = self.make_spectrogram(samples)
                    except Exception as e:  # This likely will not happen
                        print(
                            f"Cannot create spectrogram for {infile}: {repr(e)}"
                        )
                        # Nothing to save for this file:
                        continue

                    # Save the spectrogram
                    spectro_outfile = os.path.join(self.outdir,
                                                   file_family.spectro)
                    np.save(spectro_outfile, spect)
                    # Save the time mask
                    times_outfile = os.path.join(self.outdir,
                                                 file_family.time_labels)
                    np.save(times_outfile, times)
                else:  # melspectro
                    print("TODO")

            # Process marginal label files. Must be tested before
            # the general .txt case, which would also match:
            elif (infile.endswith('_marginal.txt')
                  and 'marginal_labelmask' in actions):
                # Generate the 0/1 spectrogram labels where we now
                # exclude marginal calls
                wav_file = file_family.fullpath(AudioType.WAV)
                if not os.path.exists(wav_file):
                    print(
                        f"File {wav_file} does not exist so we cannot generate label mask"
                    )
                    continue

                label_mask = self.create_label_mask_from_raven_table(
                    wav_file, infile, exclude_marginal=True)
                if label_mask is None:
                    print(
                        f"Issue generating {infile} due to error in .wav file")
                    continue

                np.save(os.path.join(self.outdir, file_family.marginal_mask),
                        label_mask)

            # Process label files
            elif infile.endswith('.txt') and ('labelmask' in actions
                                              or 'copyraven' in actions):
                # Generate the 0/1 spectrogram labels
                if 'labelmask' in actions:
                    # If we have a label file there should be a corresponding wav file
                    # in the same folder!
                    wav_file = file_family.fullpath(AudioType.WAV)
                    if not os.path.exists(wav_file):
                        print(
                            f"File {wav_file} does not exist so we cannot generate label mask"
                        )
                        continue

                    label_mask = self.create_label_mask_from_raven_table(
                        wav_file, infile)
                    if label_mask is None:
                        print(
                            f"Issue generating {infile} due to error in .wav file"
                        )
                        continue

                    np.save(os.path.join(self.outdir, file_family.mask),
                            label_mask)

                if 'copyraven' in actions:
                    print("TODO")

    #------------------------------------
    # make_spectrogram
    #-------------------

    def make_spectrogram(self, raw_audio, chunk_size=1000):
        '''
        Given data, compute a spectrogram. To avoid slow memory
        issues build the spectrogram in a stream-like fashion,
        chunk_size spectrogram frames at a time. Audio that is
        shorter than one chunk is handled in a single pass.

        Assumptions:
            o self.framerate contains the data framerate
            o self.nfft contains the window size
            o self.hop contains the hop size for fft
            o self.pad_to contains the zero-padding that 
              we add if self.pad_to > self.nfft
            o self.max_freq contains the highest frequency to retain
             
        Returns a two-tuple: the 2D spectrogram array, transposed
        to shape (time, freq), and the array of segment times.

        @param raw_audio: the time/amplitude data
        @type raw_audio: np.array([float])
        @param chunk_size: number of spectrogram frames that are
            computed per call to the fft
        @type chunk_size: int
        @return: (spectrogram, time slices array) 
        @rtype: (np.array([float][time x freq]), np.array([float]))
        '''

        self.log.info("Creating spectrogram...")

        # Number of raw audio frames needed to generate
        # chunk_size spectrogram frames:
        len_chunk = (chunk_size - 1) * self.hop + self.nfft
        # To simulate one continuous sliding window, the next chunk
        # does not start at the end of the previous one, but at
        # (previous end - nfft) + hop:
        chunk_stride = len_chunk - self.nfft + self.hop
        # Time gap between two adjacent spectrogram frames:
        frame_gap = self.hop / self.framerate
        num_samples = raw_audio.shape[0]

        def spectrum_of(samples):
            # Spectrogram of one chunk, cut down to
            # the frequencies of interest:
            [spectrum, freqs, t] = ml.specgram(samples,
                                               NFFT=self.nfft,
                                               Fs=self.framerate,
                                               noverlap=(self.nfft - self.hop),
                                               window=ml.window_hanning,
                                               pad_to=self.pad_to)
            return (spectrum[(freqs <= self.max_freq)], t)

        def continue_times(t, earlier_times):
            # The first chunk keeps its times. Later chunks are
            # shifted to start at 0, and are then offset by the
            # last frame's time + the time gap between frames:
            if not earlier_times:
                return t
            return t - t[0] + earlier_times[-1][-1] + frame_gap

        # Collect the pieces, and concatenate once at the end:
        spect_parts = []
        time_parts = []
        start_chunk = 0
        iteration = 0
        print("Approx number of chunks:", int(num_samples / len_chunk))
        while start_chunk + len_chunk < num_samples:
            if (iteration % 100 == 0):
                print("Chunk number " + str(iteration))
            (spectrum, t) = spectrum_of(
                raw_audio[start_chunk:start_chunk + len_chunk])
            time_parts.append(continue_times(t, time_parts))
            spect_parts.append(spectrum)

            start_chunk += chunk_stride
            iteration += 1

        # Do one final chunk for whatever remains at the end. For
        # short recordings this is the only chunk:
        (spectrum, t) = spectrum_of(raw_audio[start_chunk:])
        time_parts.append(continue_times(t, time_parts))
        spect_parts.append(spectrum)

        final_spec = np.concatenate(spect_parts, axis=1)
        slice_times = np.concatenate(time_parts)

        self.log.info("Done creating spectrogram.")

        # Note transpose the spectrogram to be of shape - (time, freq)
        return final_spec.T, slice_times

    #------------------------------------
    # make_mel_spectrogram
    #-------------------
    # NEED TO UPDATE THIS STUFF!!!!!
    def make_mel_spectrogram(self, sig_t, framerate):
        '''
        Compute a mel spectrogram of the given signal, scaled to dB.

        Uses the window width (self.nfft) and hop size (self.hop)
        that were set in the constructor.

        NOTE(review): the method carried a "needs updating" remark
        and is not called by the constructor; confirm parameters
        against actual use before relying on it.

        @param sig_t: the audio signal; presumably a 1D tensor of 
            samples, since the result is unpacked as 
            (bands, time bins) -- TODO confirm
        @param framerate: sample rate of the signal
        @type framerate: int
        @return: (freq_labels, num_time_label_choices, mel_spec_db_t):
            the mel band numbers 0..num_bands-1, the result of
            DSPUtils.compute_timeticks(), and the dB scaled
            mel spectrogram
        '''

        # Get tensor (num_mel_bands x num_timebins). The number
        # of mel bands is left at the MelSpectrogram default.
        # The instance holds its fft settings in self.nfft and 
        # self.hop; the window length is the fft width:
        mel_spec_t = transforms.MelSpectrogram(
            sample_rate=framerate,
            n_fft=self.nfft,
            win_length=self.nfft,
            hop_length=self.hop)(sig_t)

        # Turn energy values to db:
        mel_spec_db_t = transforms.AmplitudeToDB()(mel_spec_t)

        (num_mel_bands, _num_timebins) = mel_spec_t.shape

        # Time tick information for the spectrogram columns:
        num_time_label_choices = DSPUtils.compute_timeticks(
            framerate, mel_spec_db_t)
        # Enumeration of the mel bands to use as y-axis labels:
        freq_labels = np.array(range(num_mel_bands))

        return (freq_labels, num_time_label_choices, mel_spec_db_t)

    #------------------------------------
    # create_label_mask_from_raven_table
    #-------------------

    def create_label_mask_from_raven_table(self,
                                           wav_file,
                                           label_file,
                                           exclude_marginal=False):
        '''
        Given a .wav recording, plus a manually created 
        selection table as produced by the Raven program,  
        create a mask with 1s where the spectrogram 
        time bins would match labels, and 0s elsewhere.
        
        Label files are tab-separated, with a header line:
        
            Col1    ...    Begin Time (s)    End Time (s)    ... Coln
             foo             4.653            6.326              bar
                    ...

        @param wav_file: the name of a .wav recording file
        @type wav_file: {str}
        @param label_file: label file as produced with Raven
        @type label_file: {str}
        @param exclude_marginal: Flag indicating we want to exclude calls
            labeled as marginal.
        @type exclude_marginal: bool
        @return: the label mask with one entry per spectrogram 
            time bin, or None if no time ticks could be obtained 
            for the .wav file
        @rtype: {np.array([int]) | None}
        '''

        # The x-axis time labels that a spectrogram would have:
        time_tick_secs_labels = DATAUtils.time_ticks_from_wav(
            wav_file, hop_length=self.hop, nfft=self.nfft)
        # No time ticks; presumably the wav file failed to read:
        if time_tick_secs_labels is None:
            return None

        # Start with an all-zero label mask:
        label_mask = np.zeros(len(time_tick_secs_labels), dtype=int)

        # The context manager closes the file even if reading fails,
        # and a failing open() surfaces as its own error. The csv
        # module asks for newline='' on files it reads:
        with open(label_file, 'r', newline='') as fd:
            reader = csv.DictReader(fd, delimiter='\t')
            for (start_bin_idx, end_bin_idx) in self._get_label_indices(
                    reader, time_tick_secs_labels, label_file,
                    exclude_marginal):

                # Fill the mask with 1s in the just-computed range:
                label_mask[start_bin_idx:end_bin_idx] = 1

        return label_mask

    #------------------------------------
    # _get_label_indices
    #-------------------

    def _get_label_indices(self,
                           reader,
                           label_times,
                           label_txt_file,
                           exclude_marginal=False):

        if type(label_times) != np.ndarray:
            label_times = np.array(label_times)

        file_offset_key = 'File Offset (s)'
        begin_time_key = 'Begin Time (s)'
        end_time_key = 'End Time (s)'
        marginal_key = "Marginal"

        # Get each el call time range spec in the labels:
        num_marginal_calls = 1
        for label_dict in reader:
            # Check if we want to skip this call
            if exclude_marginal and label_dict[marginal_key] == "yes":
                print("Skipping marginal call number:", num_marginal_calls)
                num_marginal_calls += 1
                continue

            try:
                begin_time = float(label_dict[file_offset_key])
                call_length = float(label_dict[end_time_key]) - float(
                    label_dict[begin_time_key])
                end_time = begin_time + call_length
            except KeyError:
                raise IOError(
                    f"Raven label file {label_txt_file} does not contain one "
                    f"or both of keys '{begin_time_key}', {end_time_key}'")

            if end_time < begin_time:
                self.log.err(
                    f"Bad label: end label less than begin label: {end_time} < {begin_time}"
                )
                continue

            if begin_time > label_times[-1]:
                self.log.err(
                    f"Bad label: begin label after end of recording: {begin_time} > {label_times[-1]}"
                )
                continue
            if end_time < label_times[0]:
                self.log.err(
                    f"Bad label: end label before start of recording: {end_time} < {label_times[0]}"
                )
                continue

            # To deal very loosely with noise around the boundaries
            # let us be on the stricter end of setting the bounds.
            # Find the lower and upper time boarders that do not
            # include the start/end times and then shrink these
            # boarders in by 1.

            # Get all of the indeces that have time less than the start time
            pre_begin_indices = np.nonzero(label_times < begin_time)[0]
            if len(pre_begin_indices) == 0:
                start_bin_idx = 0
            else:
                # Make the bounds a bit tighter by adding one to the last
                # index with the time < begin_time
                start_bin_idx = pre_begin_indices[-1] + 1

            # Similarly with end time:
            post_end_indices = np.nonzero(label_times > end_time)[0]
            if len(post_end_indices) == 0:
                # Label end time is beyond recording. Just
                # go up to the end:
                end_bin_idx = len(label_times)
            else:
                # Similar, make bounds a bit tighter
                end_bin_idx = post_end_indices[0] - 1

            yield (start_bin_idx, end_bin_idx)
Ejemplo n.º 8
0
    def __init__(
        self,
        infiles,
        actions,
        outdir=None,  # if None Output to same dir as files are located
        normalize=False,  # We may want to consider using this!
        to_db=False,
        min_freq=0,  # Hz 
        max_freq=150,  # Hz
        nfft=4096,
        pad_to=4096,
        hop=800,
        framerate=8000,
        logfile=None,
    ):
        '''
        Store the spectrogram parameters, then immediately process
        every file in infiles according to the requested actions.
        All results are written to disk as .npy files whose names
        are taken from the input file's FileFamily.

        Per input file:
            o <name>.wav with 'spectro' in actions: a spectrogram
              and its time labels are computed and saved.
              'melspectro' is not implemented yet.
            o <name>_marginal.txt with 'marginal_labelmask' in
              actions: a 0/1 label mask that excludes marginal
              calls is saved.
            o <name>.txt with 'labelmask' in actions: a 0/1 label
              mask is saved. 'copyraven' is not implemented yet.

        Files that do not exist, or that cannot be processed, are
        reported and skipped.

        The normalize and to_db arguments are accepted, but are
        not used by this constructor.

        @param infiles: List of files to process (containing potentially .wav and .txt).
            A single file name is accepted as well.
        @type infiles: {[str] | str}
        @param actions: the tasks to accomplish: 
            {spectro|melspectro|labelmask|copyraven|marginal_labelmask}
            NOTE: copy raven is used simply to copy over the raven gt label .txt
            file if we are moving the spectrogram to a new location
        @type actions: {[str] | str}
        @param outdir: if provided, everything that is created is written
            to this directory, which is created if needed. If None, 
            output is written to the directory of the respective input file.
        @type outdir {None | str}
        @param normalize: whether or not to normalize the signal to be within 16 bits
        @type normalize: bool
        @param to_db: whether to convert to db scale (log scale essentially)
        @type to_db: bool
        @param min_freq: min frequency in the processed spectrogram
        @type min_freq: int
        @param max_freq: max frequency in the processed spectrogram
        @type max_freq: int
        @param nfft: window width
        @type nfft: int
        @param pad_to: stored as self.pad_to for spectrogram creation
        @type pad_to: int
        @param hop: stored as self.hop for spectrogram creation
        @type hop: int
        @param framerate: framerate of the recording. Normally 
            obtained from the wav file itself.
        @type framerate: int
        @param logfile: destination for log. Default: display
        @type logfile: {None|str}
        '''

        # Set up instance variables related to spectrogram generation
        self.nfft = nfft
        self.pad_to = pad_to
        self.hop = hop
        self.min_freq = min_freq
        self.max_freq = max_freq
        self.framerate = framerate

        # Output directory; re-assigned for each input file
        # when no outdir was given:
        self.outdir = outdir

        if logfile is None:
            self.log = LoggingService()
        else:
            self.log = LoggingService(logfile, msg_identifier="spectrogrammer")

        # Allow a single file to be passed without a wrapping list:
        if not isinstance(infiles, (list, tuple)):
            infiles = [infiles]
        # A bare string would turn the membership tests below
        # into substring tests ('spectro' in 'melspectro'):
        if isinstance(actions, str):
            actions = [actions]

        # Step through the input files and process them depending on
        # their file signature and actions specified as args.
        for infile in infiles:
            # Super basic file checking
            if not os.path.exists(infile):
                print(f"File {infile} does not exist.")
                continue

            print("Processing file:", infile, " - With actions:", actions)
            # Get the file_root and names related to
            # the infile in our file naming scheme.
            # Note this is useful for associating
            # .wav and .txt files
            file_family = FileFamily(infile)

            # Output the files to the same path as input
            # Note this allows self.outdir to change for each file
            if outdir is None:
                self.outdir = file_family.path

            # Make sure the directory we are about to write to
            # exists. Must test self.outdir, not the outdir argument,
            # which may be None. An empty path means the current dir:
            if self.outdir and not os.path.exists(self.outdir):
                os.makedirs(self.outdir, exist_ok=True)

            # Process wav file if spectro / melspectro in actions
            if infile.endswith('.wav') and ('spectro' in actions
                                            or 'melspectro' in actions):
                try:
                    self.log.info(f"Reading wav file {infile}...")
                    (_, samples) = wavfile.read(infile)
                    self.log.info(f"Done reading wav file {infile}.")
                except Exception as e:
                    self.log.warn(f"Cannot process .wav file: {repr(e)}")
                    # Move on to the next file; this has been
                    # seen with corrupted .wav files
                    continue

                if 'spectro' in actions:
                    try:
                        spect, times = self.make_spectrogram(samples)
                    except Exception as e:  # This likely will not happen
                        print(
                            f"Cannot create spectrogram for {infile}: {repr(e)}"
                        )
                        # Nothing to save for this file:
                        continue

                    # Save the spectrogram
                    spectro_outfile = os.path.join(self.outdir,
                                                   file_family.spectro)
                    np.save(spectro_outfile, spect)
                    # Save the time mask
                    times_outfile = os.path.join(self.outdir,
                                                 file_family.time_labels)
                    np.save(times_outfile, times)
                else:  # melspectro
                    print("TODO")

            # Process marginal label files. Must be tested before
            # the general .txt case, which would also match:
            elif (infile.endswith('_marginal.txt')
                  and 'marginal_labelmask' in actions):
                # Generate the 0/1 spectrogram labels where we now
                # exclude marginal calls
                wav_file = file_family.fullpath(AudioType.WAV)
                if not os.path.exists(wav_file):
                    print(
                        f"File {wav_file} does not exist so we cannot generate label mask"
                    )
                    continue

                label_mask = self.create_label_mask_from_raven_table(
                    wav_file, infile, exclude_marginal=True)
                if label_mask is None:
                    print(
                        f"Issue generating {infile} due to error in .wav file")
                    continue

                np.save(os.path.join(self.outdir, file_family.marginal_mask),
                        label_mask)

            # Process label files
            elif infile.endswith('.txt') and ('labelmask' in actions
                                              or 'copyraven' in actions):
                # Generate the 0/1 spectrogram labels
                if 'labelmask' in actions:
                    # If we have a label file there should be a corresponding wav file
                    # in the same folder!
                    wav_file = file_family.fullpath(AudioType.WAV)
                    if not os.path.exists(wav_file):
                        print(
                            f"File {wav_file} does not exist so we cannot generate label mask"
                        )
                        continue

                    label_mask = self.create_label_mask_from_raven_table(
                        wav_file, infile)
                    if label_mask is None:
                        print(
                            f"Issue generating {infile} due to error in .wav file"
                        )
                        continue

                    np.save(os.path.join(self.outdir, file_family.mask),
                            label_mask)

                if 'copyraven' in actions:
                    print("TODO")
    def __init__(
        self,
        csv_or_sqlite_path,
        label_mapping,
        sequence_len=None,
        text_col_name=None,
        label_col_name=None,
    ):
        '''
        A dataset for the context of Bert training.        
        One usually interacts with an instance of this
        class through a BertFeederDataloader instance
        (see bert_feeder_dataloader.py).
        
        This class is a subclass of the torch.util.Dataset
        class, and behaves as such. It can act as a stream
        of input sentences, or be a dict-like data source.
        For the dict-like behavior: 
        
            my_dataset[row_num]
            
        For the stream behavior: treat my_dataset as an
        iterator. 
        
        An additional feature is the option for integrated
        train/validation/test splits. Calling split_dataset()
        internally produces input queues that feed three 
        iterators. Callers switch between these iterators via
        the switch_to_split() method. The splits can be reset
        to their beginnings using the reset() method.
        
        Takes either a CSV file or an existing Sqlite database.
        From a CSV file an Sqlite database is generated that 
        holds the integer indexes of the collection
        vocab into the BERT vocab, the tokens, and the
        labels. The CSV file can have arbitrary columns;
        only two are required: a column with the raw text
        to be processed through a BERT model, and a column
        with the true labels. The column names default to
        
          BertFeederDataset.TEXT_COL_NAME
          BertFeederDataset.LABEL_COL_NAME
          
        These defaults can be changed in the __init__() call
        or in the class variable init.
        
        The label_mapping must be an OrderedDict mapping
        the textual labels in the CSV file to integers 0,1,...
        
        Ex CSV:
        
          id,     message,       page,    leaning

         165,"We are the..." ,http://...,  left        
            ,"Foo is bar..." ,   ...    ,  right
                    ...
        
        In this example the important cols are 'message', and 'leaning';
        the label_mapping might be:
        
            OrderedDict({'right'   : 0,
                         'left'    : 1,
                         'neutral' : 2})
        
        Sequence length is the maximum number of text input 
        tokens into the model in one input sentence. A 
        typical number is 128. If input texts in the CSV are 
        longer than sequence_len, one or more additional input 
        sentences are constructed with the same label as the
        long-text row. Shorter sequences are padded.
        
        @param csv_or_sqlite_path: path to the data source. If the
            path ends with '.csv', the file is imported into an
            Sqlite db in the same dir, with the same name, but 
            extension '.sqlite'. A db that already exists at that
            location is deleted first. Any other path is opened 
            as an existing Sqlite db.
        @type csv_or_sqlite_path: str
        @param label_mapping: mapping from text labels to ints
        @type label_mapping: OrderedDict({str : int})
        @param sequence_len: width of BERT model input sentences 
            in number of tokens. Only used when importing a CSV file.
        @type sequence_len: int
        @param text_col_name: CSV column that holds text to process
        @type text_col_name: str
        @param label_col_name: CSV column that holds labels.
        @type label_col_name: str
        @raise IOError: if csv_or_sqlite_path does not exist
        '''

        self.log = LoggingService()

        if text_col_name is None:
            self.text_col_name = self.TEXT_COL_NAME
        else:
            self.text_col_name = text_col_name

        if label_col_name is None:
            self.label_col_name = self.LABEL_COL_NAME
        else:
            # Caller's label column name must land in
            # label_col_name, not in text_col_name:
            self.label_col_name = label_col_name

        self.label_mapping = label_mapping

        if not os.path.exists(csv_or_sqlite_path):
            raise IOError(f"Data source {csv_or_sqlite_path} does not exist.")

        is_csv_source = csv_or_sqlite_path.endswith('.csv')

        if is_csv_source:
            # Remove any existing sqlite db that goes
            # with this CSV file:
            (file_path, _ext) = os.path.splitext(csv_or_sqlite_path)
            sqlite_path = file_path + '.sqlite'
            if os.path.exists(sqlite_path):
                os.remove(sqlite_path)
            # Fill the sqlite db with records, each
            # containing sample_id, toc_ids, label, attention_mask.
            # NOTE(review): the column names are passed as given
            # by the caller, i.e. possibly None. Verify that
            # process_csv_file() falls back to the defaults.
            self.db = self.process_csv_file(csv_or_sqlite_path, sqlite_path,
                                            sequence_len, text_col_name,
                                            label_col_name)

        else:
            self.db = sqlite3.connect(csv_or_sqlite_path)
            # Allow access to columns by name:
            self.db.row_factory = sqlite3.Row

        num_samples_row = next(
            self.db.execute('''SELECT COUNT(*) AS num_samples from Samples'''))
        num_samples = num_samples_row['num_samples']
        # NOTE(review): Sqlite3 ROWIDs go from 1 to n, while the
        # ids generated here go from 0 to n-1. Confirm that this
        # matches the sample_id values in the Samples table.
        self.sample_ids = list(range(num_samples))

        # Make a preliminary train queue with all the
        # sample ids. If split_dataset() is called later,
        # this queue will be replaced:
        self.train_queue = deque(self.sample_ids)
        self.curr_queue = self.train_queue
        self.saved_queues = {}
        # Again: this saved_queues entry will be
        # replaced upon a split:
        self.saved_queues['train'] = self.train_queue.copy()
        self.num_samples = len(self.train_queue)
Ejemplo n.º 10
0
class SoundProcessor:
    '''
    Facilities to modify audio files and spectrograms.
    All methods are class methods. So no instances
    are made of this class.
    '''

    # Get a Python logger that is
    # common to all modules in this
    # package:

    log = LoggingService()

    #------------------------------------
    # Constructor Stub
    #-------------------

    def __init__(self):
        raise NotImplementedError(
            "Class SoundProcessor is not intended for instantiation")

    # ------------------ Operations on Sound Files --------------

    #------------------------------------
    # add_background
    #-------------------

    @classmethod
    def add_background(cls,
                       file_name,
                       noise_path,
                       out_dir,
                       len_noise_to_add=5.0):
        '''
        Takes an absolute file path, and the path to a
        directory that contains noise to overlay onto the 
        given sound file (wind, rain, etc.).
        
        A randomly chosen noise file is loaded, a stretch of
        at most len_noise_to_add seconds is cut from it, and
        that stretch is mixed into the recording at a random 
        position. The result is written to out_dir as a .wav
        file whose name is composed of the sample's name, the
        noise file's name, and the position (in msec) at which
        the noise was inserted. 
        
        If the recording itself is shorter than len_noise_to_add,
        only 5% of the recording's length worth of noise is added.

        :param file_name: absolute path to sound file
        :type file_name: str
        :param noise_path: absolute path to directory
            with noise files
        :type noise_path: str
        :param out_dir: destination directory of new audio file
        :type out_dir: str
        :param len_noise_to_add: how much of a noise snippet
            to overlay (seconds)
        :type len_noise_to_add: float
        :return: full path of new audio file
        :rtype: str
        :raise ValueError: if noise_path contains no files
        '''

        len_noise_to_add = float(len_noise_to_add)
        backgrounds = os.listdir(noise_path)
        if not backgrounds:
            raise ValueError(f"No noise files found in {noise_path}")

        # Pick a random noise file:
        background_name = backgrounds[random.randint(0, len(backgrounds) - 1)]

        cls.log.info(f"Adding {background_name} to {file_name}.")

        # Load all of both audio files:
        noise, noise_sr = SoundProcessor.load_audio(
            os.path.join(noise_path, background_name))
        orig_recording, orig_sr = SoundProcessor.load_audio(file_name)

        # The recording's sample rate is authoritative. Only 
        # the noise is resampled if the rates differ. (The
        # greatest common divisor of two rates may be far too
        # low to carry audio: gcd(44100, 48000) == 300.)
        new_sr = orig_sr
        if noise_sr != orig_sr:
            cls.log.info(
                f"Resampling: {background_name} to match {file_name}")
            noise = librosa.resample(noise,
                                     orig_sr=noise_sr,
                                     target_sr=new_sr)

        # From here on both signals are at new_sr:
        noise_duration = librosa.get_duration(y=noise, sr=new_sr)
        if noise_duration < len_noise_to_add:
            cls.log.info(
                f"Duration:{noise_duration} < len_noise_to_add:{len_noise_to_add}. Will only add {noise_duration}s of noise"
            )
            samples_per_segment = len(noise)
        else:
            # Randomly choose a noise segment of the requested length:
            samples_per_segment = int(new_sr * len_noise_to_add)
            subsegment_start = random.randint(
                0,
                len(noise) - samples_per_segment)
            noise = noise[subsegment_start:subsegment_start +
                          samples_per_segment]
        cls.log.info(
            f"len(noise) after random segment: {len(noise)}; noise duration: {len(noise)/new_sr}"
        )

        orig_duration = librosa.get_duration(y=orig_recording, sr=new_sr)
        # If orig_recording is shorter than the noise we want to add, just add 5% noise
        if orig_duration < len_noise_to_add:
            cls.log.info(
                f"Recording: {file_name} was shorter than len_noise_to_add. Adding 5% of recording len worth of noise."
            )
            new_noise_len = orig_duration * 0.05
            noise = noise[:int(new_noise_len * new_sr)]
            # The segment to overlay shrank along with the
            # noise; keep the two in sync, else the slicing
            # below no longer matches the noise length:
            samples_per_segment = len(noise)

        noise_start_loc = random.randint(
            0,
            len(orig_recording) - samples_per_segment)
        cls.log.info(
            f"Inserting noise starting at {noise_start_loc/new_sr} seconds.")
        # Split original into three parts: before_noise, during_noise, after_noise
        noise_end_loc = noise_start_loc + samples_per_segment
        before_noise = orig_recording[:noise_start_loc]
        during_noise = orig_recording[noise_start_loc:noise_end_loc]
        after_noise = orig_recording[noise_end_loc:]

        assert len(during_noise) == len(noise)

        segment_with_noise = during_noise + Utils.noise_multiplier(
            orig_recording, noise) * noise
        new_sample = np.concatenate(
            (before_noise, segment_with_noise, after_noise))

        # Overlaying must not change the recording's length.
        # Compare sample counts rather than float durations:
        assert len(new_sample) == len(orig_recording)

        # File name w/o extension:
        sample_file_stem = Path(file_name).stem
        noise_file_stem = Path(background_name).stem
        # Position of the noise within the recording in msec:
        noise_dur = str(int(noise_start_loc / new_sr * 1000))
        file_name = f"{sample_file_stem}-{noise_file_stem}_bgd{noise_dur}ms.wav"

        # Ensure that the fname doesn't exist:
        uniq_fname = Utils.unique_fname(out_dir, file_name)
        out_path = os.path.join(out_dir, uniq_fname)

        soundfile.write(out_path, new_sample, new_sr)
        return out_path

    #------------------------------------
    # change_all_volumes
    #-------------------

    @classmethod
    def change_all_volumes(cls, in_dir, out_dir, species=None):
        """
        Adjusts the volume of all the audio files in the species 
        subdirectories of in_dir. Each file is passed to 
        change_sample_volume(), which writes the modified copy
        to out_dir.
    
        :param in_dir: the path to the directory to fetch samples from;
            expected to hold one subdirectory per species
        :type in_dir: str
        :param out_dir: the path to the directory to save the new files to
        :type out_dir: str
        :param species: the directory name(s) of the species to modify the 
            wav files of: either a single name, or a collection of names. 
            If species=None, all subdirectories will be used.
        :type species: {None | str | [str]}
        """
        # A lone name must match whole directory names,
        # not act as a substring test:
        if isinstance(species, str):
            species = (species, )

        for species_dir in os.listdir(in_dir):
            if species is not None and species_dir not in species:
                continue
            full_species_dir = os.path.join(in_dir, species_dir)
            # Skip stray files (e.g. .DS_Store) next to 
            # the species directories:
            if not os.path.isdir(full_species_dir):
                continue
            for sample_file_nm in os.listdir(full_species_dir):
                # Samples live inside the species dir, 
                # not directly under in_dir:
                sample_path = os.path.join(full_species_dir, sample_file_nm)
                cls.change_sample_volume(sample_path, out_dir)

    #------------------------------------
    # change_sample_volume
    #-------------------

    @classmethod
    def change_sample_volume(cls, sample_path, out_dir):
        '''
        Scales the amplitude of one audio clip by a randomly
        drawn gain, and stores the outcome as a .wav file in 
        out_dir. The gain is drawn as an integer from [-12, 12), 
        and is applied as 10**(gain/20). The drawn number 
        becomes part of the new file's name.

        :param sample_path: full path to sample
        :type sample_path: str
        :param out_dir: destination directory
        :type out_dir: str
        :return full path to the new audio file
        :rtype str
        '''
        audio, sr = SoundProcessor.load_audio(sample_path)

        # Random volume adjustment:
        factor = random.randrange(-12, 12, 1)
        cls.log.info(f"Changing volume of {sample_path} by factor {factor}.")
        gain = 10**(factor / 20)
        adjusted_audio = audio * gain

        # Destination: the 'foofile' part of /home/me/foofile.mp3,
        # extended by the drawn factor:
        out_file = os.path.join(
            out_dir, f"{Path(sample_path).stem}-volume{factor}.wav")
        soundfile.write(out_file, adjusted_audio, sr)
        return out_file

    #------------------------------------
    # time_shift
    #-------------------

    @classmethod
    def time_shift(cls, file_name, out_dir):
        """
        Performs a time shift on one audio file. The shift is 
        'rolling' such that no information is lost: a random-sized 
        snippet of the audio is moved from the start of the clip to
        its end. The result is written to out_dir as a .wav
        file whose name includes the shift amount in msec.
        
        The shift amount is a random multiple of 100 msec. 
        Recordings too short to be shifted by even 100 msec
        are written out unshifted.
    
        :param file_name: full path to audio file
        :type file_name: str
        :param out_dir: the path to the directory to save the new file to
        :type out_dir: str
        :return full path to the new audio file
        :rtype str
        """
        y, sample_rate = SoundProcessor.load_audio(file_name)
        # Length in seconds:
        length = librosa.get_duration(y=y, sr=sample_rate)

        # Number of 100ms steps available for the shift.
        # For recordings of at least one second this is
        # based on the whole seconds; for shorter clips
        # fall back to the number of full 100ms steps so 
        # that randrange() is not handed an empty range:
        max_tenths = int(length) * 10
        if max_tenths < 1:
            max_tenths = int(length * 10)

        if max_tenths > 0:
            # Shift is in seconds:
            amount = random.randrange(0, max_tenths, 1) / 10
        else:
            amount = 0.0

        # Roll the already loaded samples rather than 
        # decoding the file twice more; this also guarantees 
        # that the result is as long as the original:
        shift_samples = int(round(amount * sample_rate))
        y2 = np.concatenate((y[shift_samples:], y[:shift_samples]))

        # Output the new wav data to a file
        # Get just the 'foo' part of '/blue/red/foo.mp3':
        file_stem = Path(file_name).stem
        aug_sample_name = f"{file_stem}-shift{str(int(amount * 1000))}ms.wav"
        out_path = os.path.join(out_dir, aug_sample_name)
        soundfile.write(out_path, y2, sample_rate)
        return out_path

    # --------------- Operations on Spectrograms Files --------------

    #------------------------------------
    # create_spectrogram
    #-------------------

    @classmethod
    def create_spectrogram(cls,
                           audio_sample,
                           sr,
                           outfile,
                           n_mels=256,
                           info=None):
        '''
        Create and save a spectrogram from an audio 
        sample. Bandpass filter is applied, Mel scale is used,
        and power is converted to decibels.
        
        The sampling rate (sr) and time duration in fractional
        seconds is added as metadata under keys "sr" and "duration".
        Additional info may be included in the .png if info is 
        a dict of key/value pairs.
        
        The outfile's parent directories are created, if necessary.
        
        Retrieving the metadata can be done via SoundProcessor.load_spectrogram(),
        or any other PNG reading software that handles the PNG specification.
        To print the information from the command line, use 
        <proj-root>src/data_augmentation/list_png_metadata.py 
         
        :param audio_sample: audio
        :type audio_sample: np.array
        :param sr: sample rate
        :type sr: int
        :param outfile: where to store the result
        :type outfile: str
        :param n_mels: number of mel scale bands 
        :type n_mels: int
        :param info: if provided,  a dict of information to
            store as text-only key/value pairs in the png.
            Retrieve info via SoundProcessor.load_spectrogram()
            or other standard PNG reading software that supports
            PNG metadata
        :type info: {str : str}
        :raise TypeError: if info is provided, but is not a dict
        '''

        if info is not None and not isinstance(info, dict):
            raise TypeError(
                f"If provided, info must be a dict, not {type(info)}")

        # Use bandpass filter for audio before converting to spectrogram
        audio = SoundProcessor.filter_bird(audio_sample, sr)
        # Audio is passed by keyword: recent librosa versions
        # no longer accept it positionally:
        mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
        # create a logarithmic mel spectrogram
        log_mel = librosa.power_to_db(mel, ref=np.max)

        # Create an image of the spectrogram and save it as file
        img = cls.scale_minmax(log_mel, 0, 255).astype(np.uint8)
        img = np.flip(img,
                      axis=0)  # put low frequencies at the bottom in image
        img = 255 - img  # invert. make black==more energy

        # Save as PNG, including sampling rate and
        # (superfluously) duration in seconds:
        duration = round(len(audio) / sr, 1)

        # Create metadata to include in the
        # spectrogram .png file:

        metadata = PngInfo()
        metadata.add_text("sr", str(sr))
        metadata.add_text("duration", str(duration))
        if info is not None:
            for key, val in info.items():
                metadata.add_text(key, str(val))

        # A bare file name has no directory part, and 
        # os.makedirs('') would raise:
        outdir = os.path.dirname(outfile)
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        skimage.io.imsave(outfile, img, pnginfo=metadata)

    #------------------------------------
    # add_time_freq_masks
    #-------------------

    @classmethod
    def add_time_freq_masks(cls, file_name, in_dir, out_dir=None):
        '''
        Performs Frequency and Time Masking for Spectrograms. The
        masks add horizontal (freq), and vertical (time) masks.
        Two masks of each kind are applied via freq_mask() and
        time_mask(). The name of the new file is composed of the 
        original's stem, plus the names those methods suggest.

        :param file_name: name of spectrogram file without parents
        :type file_name: str
        :param in_dir: directory in which the spectrogram resides
        :type in_dir: str
        :param out_dir: optionally: destination directory. If None,
            augmented copy is written to in_dir
        :type out_dir: {None | str}
        :return: full path of augmented spectrogram
        :rtype: pathlib.Path
        '''
        cls.log.info(f"Adding masks to {file_name}")
        # Close the image file as soon as the pixels are copied out:
        with Image.open(os.path.join(in_dir, file_name)) as spectro_img:
            orig_spectrogram = np.asarray(spectro_img)
        freq_masked, freq_name = cls.freq_mask(orig_spectrogram, num_masks=2)
        masked_spectrogram, time_name = cls.time_mask(freq_masked, num_masks=2)
        img = Image.fromarray(masked_spectrogram)
        fpath = Path(file_name)
        new_file_name = f"{fpath.stem}-{freq_name}-{time_name}.png"

        # Default destination is the directory of the original.
        # Wrapping in Path() makes str directories acceptable:
        dest_dir = Path(in_dir) if out_dir is None else Path(out_dir)
        outpath = dest_dir / new_file_name
        img.save(outpath)
        return outpath

    #------------------------------------
    # add_noise
    #-------------------

    @classmethod
    def add_noise(cls, spectrogram, std=1.0):
        '''
        Adds random noise 'jitter' to a copy of the given 
        spectrogram. The work is delegated to random_noise() 
        with that method's default noise type; the spectrogram
        passed in is not modified.

        :param spectrogram: the spectrogram to modify
        :type spectrogram: np.array
        :param std: standard deviation of the noise; default: 1.0
        :type std: {int | float}
        :return: the result of random_noise(): the noised
            spectrogram, and a suggested file name fragment
        :rtype: (np.array, str)
        :raise ValueError: if std is negative
        '''

        if std < 0:
            msg = f"Standard deviation must be non-negative; was {std}"
            raise ValueError(msg)

        cls.log.info(f"Adding uniform noise to spectrogram")
        # Work on a private copy of the caller's array:
        return cls.random_noise(spectrogram.copy(), std=std)

    #------------------------------------
    # freq_mask
    #-------------------

    # Functions below are from https://github.com/zcaceres/spec_augment/blob/master/SpecAugment.ipynb
    # Functions edited to support np arrays
    @classmethod
    def freq_mask(cls,
                  spec,
                  max_height=40,
                  num_masks=1,
                  replace_with_zero=False):
        '''
        Takes a spectrogram array, and returns a new
        spectrogram array with a frequency mask added. 
        Also returns a suggested file name based on
        the randomly chosen mask parameters.
        
        :param spec: the spectrogram
        :type spec: np.array
        :param max_height: max height of horizontal stripes
        :type max_height: int
        :param num_masks: how many masks to add
        :type num_masks: int
        :param replace_with_zero: if True, replaces the existing
            values with zero. Else replaces them with the mean
            of the entire image
        :type replace_with_zero: bool
        :returns a new array, and a suggested file name
        :rtype (np.array, str)
        '''
        cloned = spec.copy()
        num_mel_channels = cloned.shape[0]
        if max_height >= num_mel_channels:
            # Ensure that random choice of
            # mask height is at least 1 below:
            max_height = num_mel_channels - 1

        for _i in range(0, num_masks):
            # Choose random stripe height within given limit,
            # but at least 1:
            mask_height = random.randrange(1, max_height)
            # Random choice of where to place the stripe:
            f_zero = random.randrange(0, num_mel_channels - mask_height)

            # avoids randrange error if values are equal and range is empty
            #if (f_zero == f_zero + mask_height): continue

            mask_end = f_zero + mask_height
            if (replace_with_zero):
                cloned[f_zero:mask_end, :] = 0
            else:
                cloned[f_zero:mask_end, :] = cloned.mean()
        return cloned, f"-fmask{int(f_zero)}_{int(mask_end)}"

    #------------------------------------
    # time_mask
    #-------------------

    @classmethod
    def time_mask(cls,
                  spec,
                  max_width=20,
                  num_masks=1,
                  replace_with_zero=False):
        '''
        Takes a spectrogram array, and returns a new
        spectrogram array with a time mask added. 
        Also returns a suggested file name based on
        the randomly chosen time mask parameters.
        
        :param spec: the spectrogram
        :type spec: np.array
        :param max_width: width of vertical stripes
        :type max_width: int
        :param num_masks: how many masks to add
        :type num_masks: int
        :param replace_with_zero: if True, replaces the existing
            values with zero. Else replaces them with the mean
            of the entire image
        :type replace_with_zero: bool
        :returns a new array, and a suggested file name
        :rtype (np.array, str)
        '''
        cloned = spec.copy()
        min_width = 1  # Time slice
        len_spectro = cloned.shape[1]
        # Is width of stripe ge width of spectro?
        if max_width >= len_spectro:
            max_width = len_spectro - 1

        for _i in range(0, num_masks):
            # Random stripe width:
            mask_width = random.randrange(min_width, max_width)
            # Random stripe placement: up to case
            # when stripe is at the edge of the spectro:
            t_zero = random.randrange(0, len_spectro - mask_width)

            # avoids randrange error if values are equal and range is empty
            #if (t_zero == t_zero + t):
            #    mask_end = 0
            #    continue

            mask_end = t_zero + mask_width
            # cls.log.info(f"Time masked width: [{t_zero} : {mask_end}]")
            if (replace_with_zero):
                cloned[:, t_zero:mask_end] = 0
            else:
                spectro_mean = cloned.mean()
                #cls.log.info(f"Mean inserted is {spectro_mean}")
                cloned[:, t_zero:mask_end] = spectro_mean

        return cloned, f"-tmask{int(t_zero)}_{int(mask_end)}"

    #------------------------------------
    # random_noise
    #-------------------

    @classmethod
    def random_noise(cls, spectrogram, noise_type='uniform', std=1.0):
        '''
        Adds Gaussian or uniform noise to a numpy array, and returns
        the result. The std arg controls with width of the
        (standard) distribution. 
        
           Assumes that std is a positive number
           Assumes that spectrogram is normalized: 0 to 255
        
        :param spectrogram: the spectrogram to modify
        :type spectrogram: np.array
        :param noise_type: whether to add normal (gaussian) or uniform noise;
            must be 'uniform' or 'normal'
        :type noise_type: str
        :param std: standard deviation of the noise
        :type std: float
        :return: spectrogram with random noise added
        :rtype: np.array
        '''

        if noise_type not in ('uniform', 'normal'):
            raise ValueError(f"Noise type must be 'uniform', or 'normal'")

        clone = spectrogram.copy()
        if noise_type == 'uniform':
            noise = np.random.Generator.normal(np.random.default_rng(),
                                               loc=0.0,
                                               scale=std,
                                               size=clone.shape)
        else:
            noise = np.random.Generator.uniform(np.random.default_rng(),
                                                low=0,
                                                high=1.0,
                                                size=clone.shape)
        clone_noised = clone + np.uint8(np.round(noise))

        # We might get out of bounds due to noise addition.
        # Since this method is intended for images, the
        # required value range is 0-255
        clone_clipped = np.clip(clone_noised, 0, 255)
        return clone_clipped, f"-noise{noise.mean()}"

    # ----------------- Utilities --------------

    #------------------------------------
    # set_random_seed
    #-------------------

    @classmethod
    def set_random_seed(cls, seed):
        random.seed(seed)
        np.random.seed(seed)

    #------------------------------------
    # define_bandpass
    #-------------------

    @classmethod
    def define_bandpass(cls, lowcut, highcut, sr, order=2):
        """
        The defintion and implementation of the bandpass filter
    
        :param highcut: the highcut frequency for the bandpass filter
        :type highcut: int
        :param lowcut: the lowcut frequency for the bandpass filter
        :type lowcut: int
        :param sr: the sample rate of the audio
        :type sr: int
        :param order: the order of the filter
        :type order: int
        :returns b, a: Numerator (b) and denominator (a) polynomials of the IIR filter. Only returned if output='ba'.
        """
        nyq = 0.5 * sr
        low = lowcut / nyq
        high = highcut / nyq
        b, a = butter(order, [low, high], btype='band')
        return b, a

    #------------------------------------
    # filter_bird
    #-------------------

    @classmethod
    def filter_bird(cls, audio, sr):
        """
        Filters the given audio using a bandpass filter.
    
        :param audio: recording
        :type audio: np.array
        :param sr: the sample rate of the audio
        :type sr: int
        :returns output: the filtered recording audio time series
        """
        #bandpass
        b, a = cls.define_bandpass(
            500, 8000, sr)  # filters out anything not between 0.5 and 8khz
        output = lfilter(b, a, audio)

        # noise reduction - easier to listen to for a human, harder for the model to classify
        # select section of data that is noise
        #noisy_part = output[0:1000]
        # perform noise reduction
        #output =  nr.reduce_noise(audio_clip=output, noise_clip=noisy_part, verbose=True, n_grad_freq=0, n_std_thresh=2)

        # normalize the volume
        return output / np.max(output)

    #------------------------------------
    # scale_minmax
    #-------------------

    @classmethod
    def scale_minmax(cls, X, min_val=0.0, max_val=1.0):
        X_std = (X - X.min()) / (X.max() - X.min())
        X_scaled = X_std * (max_val - min_val) + min_val
        return X_scaled

    #------------------------------------
    # load_audio
    #-------------------

    @classmethod
    def load_audio(cls, fname, offset=0.0, duration=None):
        '''
        Reads an audio file via librosa.load(), and 
        returns the samples together with the sample 
        rate. Loading may start offset seconds into the
        recording, and may be limited to duration seconds.
        
        The UserWarning "PySoundFile failed. Trying audioread 
        instead." is suppressed via a warnings filter.

        :param fname: audio file to load
        :type fname: str
        :param offset: where to start load: seconds into recording 
        :type offset: float
        :param duration: how many seconds to load
        :type duration: {None | float}
        :returns the recording and the associated sample rate
        :rtype: (np.array, float)
        :raises FileNotFoundError if fname does not exist
        :raises AudioLoadException if the file is empty, or
            if loading fails for any reason
        '''

        if not os.path.exists(fname):
            raise FileNotFoundError(f"File {fname} does not exist.")

        file_is_empty = os.path.getsize(fname) == 0
        if file_is_empty:
            raise AudioLoadException(f"Audio file to load is empty: {fname}",
                                     fname)

        # Silence the fallback notice that occurs
        # when mp3 files are loaded:
        warnings.filterwarnings(
            "ignore",
            message="PySoundFile failed. Trying audioread instead.",
            category=UserWarning)

        try:
            samples, rate = librosa.load(fname,
                                         offset=offset,
                                         duration=duration)
        except Exception as e:
            raise AudioLoadException(f"Could not load {fname}",
                                     fname,
                                     other_msg=repr(e)) from e
        else:
            return samples, rate

    #------------------------------------
    # load_spectrogram
    #-------------------

    @classmethod
    def load_spectrogram(cls, fname, to_nparray=True):
        '''
        Loads a .png spectrogram file, and returns the
        image together with the text metadata stored in 
        the file. 
        
        The file handles opened for reading are closed before
        returning, except when to_nparray is False: the returned
        image object then owns its open file, and the caller is
        responsible for closing it.

        :param fname: file to load
        :type fname: str
        :param to_nparray: if True, convert the image object
            to a numpy array, and return that as result. Else
            the image object as returned by Image.open() is
            returned
        :type to_nparray: bool
        :returns tuple: the image and the .png file's possibly empty 
            metadata dict; None in place of the dict if the metadata
            could not be read
        :rtype: ({np.array|Image}, {None | {str : str}})
        :raises FileNotFoundError
        '''

        if not os.path.exists(fname):
            raise FileNotFoundError(f"File {fname} does not exist.")

        # Fetch the metadata through a handle that 
        # is released right away:
        with PngImageFile(fname) as png_img:
            try:
                info = png_img.text
            except Exception as e:
                cls.log.info(f"No available info in .png file: {repr(e)}")
                info = None

        if to_nparray:
            # The array holds its own copy of the pixel
            # data, so the file may be closed:
            with Image.open(fname) as img_obj:
                res = np.asarray(img_obj)
        else:
            res = Image.open(fname)
        return (res, info)

    #------------------------------------
    # save_image
    #-------------------

    @classmethod
    def save_image(cls, img, outfile, info=None):
        '''
        Given an image and an optional metadata
        dictionary, write the image as .png, include
        the metadata. If no metadata is provided, the 
        image is written with an empty metadata container.
        
        :param img: image to save
        :type img: np_array
        :param outfile: destination path
        :type outfile: str
        :param info: metadata to add
        :type info: {None | {str : str}}
        '''

        # Create metadata to include in the
        # spectrogram .png file. The container 
        # must exist even without info, because 
        # it is passed to imsave() unconditionally:
        metadata = PngInfo()
        if info is not None:
            for key, val in info.items():
                metadata.add_text(key, str(val))

        skimage.io.imsave(outfile, img, pnginfo=metadata)

    #------------------------------------
    # save_img_array
    #-------------------

    @classmethod
    def save_img_array(cls, img_arr, dst_path):
        '''
        Converts an array to an image, and saves it
        to dst_path. Directories along the path are
        created as needed.

        :param img_arr: image data
        :type img_arr: np.array
        :param dst_path: destination path
        :type dst_path: str
        '''

        # A bare file name has no directory part, and 
        # os.makedirs('') would raise. exist_ok avoids
        # failing if the directory is created concurrently:
        dst_dir = os.path.dirname(dst_path)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

        img = Image.fromarray(img_arr)
        img.save(dst_path)
Ejemplo n.º 11
0
class AudioAugmenter:
    '''
    Creates new audio samples from the recordings found
    in one subdirectory per species below an input directory.
    New samples are produced by adding background noise,
    time shifting, or changing the volume, and are written
    to a sibling directory of the input directory.
    '''

    # Within this class these values are only used for a
    # sanity check, and for naming the output directory
    # (presumably they are the intended share of each
    # technique -- TODO confirm):
    ADD_NOISE   = 1/3 # Add background noise, such as wind or water
    TIME_SHIFT  = 1/3 # Cut audio at random point into snippets A & B,
                      #    then create new audio: B-A
    VOLUME      = 1/3 # Change the volume

    # Default location of the background noise recordings:
    NOISE_PATH = os.path.join(os.path.dirname(__file__),'lib')

    #------------------------------------
    # Constructor 
    #-------------------

    def __init__(self, 
                 input_dir_path,
                 plot=False,
                 overwrite_policy=False,
                 aug_goals=AugmentationGoals.MEDIAN,
                 random_augs = False,
                 multiple_augs = False,):

        '''
        Computes how many augmentations each species needs,
        and creates the output directory.

        :param input_dir_path: directory holding .wav files;
            must be an absolute path
        :type input_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: what to do when previously
            created work is found. NOTE: must be a WhenAlreadyDone
            member; the default of False is rejected with a TypeError,
            so callers effectively always have to pass this argument.
        :type overwrite_policy: WhenAlreadyDone 
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :param random_augs: if this is true, will randomly choose augmentation 
            to use for each new sample. In this constructor the
            value only determines the name of the output directory.
        :type random_augs: bool
        :param multiple_augs: if we want to allow multiple augmentations per sample 
            (e.g. time shift and volume)
        :type multiple_augs: bool
        :raises TypeError: if overwrite_policy is not a WhenAlreadyDone member
        :raises ValueError: if input_dir_path is not absolute
        '''

        self.log = LoggingService()

        if not isinstance(overwrite_policy, WhenAlreadyDone):
            raise TypeError(f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}") 

        if not os.path.isabs(input_dir_path):
            raise ValueError(f"Input path must be a full, absolute path; not {input_dir_path}")

        self.input_dir_path   = input_dir_path
        self.multiple_augs    = multiple_augs
        self.plot             = plot
        self.overwrite_policy = overwrite_policy
        
        self.species_names = Utils.find_species_names(self.input_dir_path)

        # If aug_goals is not a dict mapping
        # each species to an aug_goals, but just
        # a single AugmentationGoals, create
        # a dict from all bird species, mapping
        # each to that same value. The isinstance()
        # test also recognizes dict subclasses, which
        # an exact type comparison would wrongly wrap:
        
        if not isinstance(aug_goals, dict):
            aug_goals = {species : aug_goals
                          for species in self.species_names
                          }

        # Get dataframe with row labels being the
        # species, and one col with number of samples
        # in the respective species:
        #       num_samples
        # sp1       10
        # sp2       15
        #      ..

        self.sample_distrib_df = Utils.sample_compositions_by_species(input_dir_path, 
                                                                      augmented=False)
        
        if plot:
            # Plot a distribution:
            self.sample_distrib_df.plot.bar()

        # Build a dict with number of augmentations to do
        # for each species:
        self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals, 
                                                             self.sample_distrib_df)
        
        # Create the descriptive name of an output directory 
        # for the augmented samples; it is placed next to
        # the input directory: 
        out_parent = Path(input_dir_path).parent
        if random_augs:
            self.output_dir_path = os.path.join(out_parent, 
                                                'augmented_samples_random')
        else:
            # Compare with a tolerance: the shares are floats
            assert math.isclose(self.ADD_NOISE + self.TIME_SHIFT + self.VOLUME, 1)
            dir_nm = f"Augmented_samples_-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.VOLUME:.2f}w"
            self.output_dir_path = os.path.join(out_parent, dir_nm)

        if self.multiple_augs:
            self.output_dir_path += "/"
        else:
            # Indicate that augmentations are mutually exclusive
            self.output_dir_path += "-exc/"  

        self.log.info(f"Results will be in {self.output_dir_path}")

        Utils.create_folder(self.output_dir_path, self.overwrite_policy)

        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning, 
                                module='', 
                                lineno=0)


    #------------------------------------
    # generate_all_augmentations
    #-------------------

    def generate_all_augmentations(self):
        '''
        Workhorse:
        Create new samples via augmentation for each species. 
        Augment the audio files to reach the number of audio files
        indicated in self.augs_to_do.
        
        Assumption: self.augs_to_do is a dict mapping 
        species-name : num_required_augmentations
        
        Assumption: self.sample_distrib_df is a dataframe like
        
        	        num_species
        	  sp1       10
        	  sp2       15
        	       ...

        Logs the total number of new files and of failures.
        '''
        num_augmentations = 0
        failures = 0
        
        for species, _rows in self.sample_distrib_df.iterrows():
            # For each species, create as many augmentations
            # as was computed earlier:
            num_needed_augs = self.augs_to_do[species]
            if num_needed_augs == 0:
                continue
            in_dir = os.path.join(self.input_dir_path, species)
            out_dir = os.path.join(self.output_dir_path, species)
            aug_paths, species_failures = self.augment_one_species(in_dir,
                                                                   out_dir,
                                                                   num_needed_augs 
                                                                   )
            num_augmentations += len(aug_paths)
            # Accumulate across species, rather than only
            # remembering the last species' count:
            failures += species_failures

        # Clean up directory clutter. Done without a shell so
        # that paths with spaces or shell metacharacters are safe:
        for clutter in Path(self.output_dir_path).rglob('.DS_Store'):
            if not clutter.is_file():
                continue
            try:
                clutter.unlink()
            except OSError as e:
                # Best effort only; leftover clutter is harmless
                self.log.info(f"Could not remove {clutter}: {repr(e)}")
        
        self.log.info(f"Total of {num_augmentations} new audio files")
        if failures > 0:
            self.log.info(f"Grant total of audio augmentation failures: {failures}")
        
        self.log.info("Done")
        
    #------------------------------------
    # augment_one_species
    #-------------------

    def augment_one_species(self, in_dir, out_dir, num_augs_to_do):
        '''
        Takes one species, and a number of audio
        augmentations to do. Generates the files,
        and returns a list of the newly created 
        files (full paths), plus the number of 
        augmentations that failed.
        
        Augmentations are done round robin across all of
        the species' audio files such that each file is
        augmented roughly the same number of times until
        num_augs_to_do is accomplished. If a file needs more
        augmentations than there are augmentation methods,
        methods are applied repeatedly.

        :param in_dir: directory holding one species' audio files
        :type in_dir: str
        :param out_dir: destination for new audio files
        :type out_dir: str
        :param num_augs_to_do: number of augmentations
        :type num_augs_to_do: int
        :returns: list of newly created file paths, and
            number of failed augmentations. If the species
            is skipped, the result is ([], 0)
        :rtype: ([str], int)
        '''
        
        # By convention, species name is the last part of the directory.
        # Use .name rather than .stem so that a dot in the
        # species name does not truncate it:
        species_name = Path(in_dir).name
        
        # Create subfolder for the given species:
        if not Utils.create_folder(out_dir, self.overwrite_policy):
            self.log.info(f"Skipping augmentations for {species_name}")
            # Same result shape as the regular return below,
            # so that callers can always unpack two values:
            return [], 0

        # Get dict: {full-path-to-an-audio_file : 0}
        # The zeroes will be counts of augmentations
        # needed for that file:    
        in_wav_files     = {full_in_path : 0
                            for full_in_path
                            in Utils.listdir_abs(in_dir)
                            } 
        # Cannot do augmentations for species with 0 samples
        if not in_wav_files:
            self.log.info(f"Skipping for {species_name} since there are no original samples.")
            return [], 0

        # Distribute augmentations across the original
        # input files, one at a time, round robin:
        aug_assigned = 0
        while aug_assigned < num_augs_to_do:
            for fname in in_wav_files:
                in_wav_files[fname] += 1
                aug_assigned += 1
                if aug_assigned >= num_augs_to_do:
                    break
        new_sample_paths = []
        failures = 0
        all_methods = list(AudAugMethod)

        for in_fname, num_augs_this_file in in_wav_files.items():

            # Pick audio aug methods to apply (without replacement)
            # Note that if more augs are to be applied to each file
            # than methods are available, some methods will need
            # to be applied multiple times; no problem, as each
            # method includes randomness:
            max_methods_sample_size = min(len(all_methods), num_augs_this_file)
            methods = random.sample(all_methods, max_methods_sample_size)
            
            # Now have something like:
            #     [volume, time-shift], or all methods: [volume, time-shift, noise]
            
            if num_augs_this_file > len(methods):
                # Repeat the methods as often as
                # needed:
                num_method_set_repeats = int(math.ceil(num_augs_this_file/len(methods)))
                # The slice to num_augs_this_file chops off
                # the possible excess from the array replication: 
                method_seq = (methods * num_method_set_repeats)[:num_augs_this_file]
                
                # Assuming num_augs_this_file is 7, we now have method_seq:
                #    [m1,m2,m3,m1,m2,m3,m1]
            else:
                method_seq = methods
                
            for method in method_seq:
                out_path_or_err = self.create_new_sample(in_fname, out_dir, method)
                if isinstance(out_path_or_err, Exception):
                    failures += 1
                else:
                    new_sample_paths.append(out_path_or_err)

        self.log.info(f"Audio aug report: {len(new_sample_paths)} new files; {failures} failures")
                
        return new_sample_paths, failures

    #------------------------------------
    # create_new_sample 
    #-------------------

    def create_new_sample(self,
                          sample_path,
                          out_dir,
                          method,
                          noise_path=None):
        '''
        Given one audio recording and an audio augmentation
        method, have SoundProcessor compute that augmentation 
        and write the new audio file to out_dir.
        
        Currently available types of audio augmentation technique:
        
            o adding background sounds
            o randomly changing volume
            o random time shifts

        Errors during augmentation are logged, and returned
        rather than raised.
        
        :param sample_path: absolute path to audio sample
        :type sample_path: str
        :param out_dir: destination of resulting new samples
        :type out_dir: str
        :param method: the audio augmentation method to apply
        :type method: AudAugMethod
        :param noise_path: full path to audio files with background
            noises to overlay onto audio (wind, rain, etc.). Ignored
            unless method is AudAugMethod.ADD_NOISE. Defaults
            to NOISE_PATH.
        :type noise_path: {None | str}
        :return: Newly created audio file (full path) or an Exception
            object whose e.args attribute is a tuple with the error
            msg plus a manually added one. None if method is 
            not one of the three handled methods. 
        :rtype: {str | Exception | None}
        '''
        
        # Select the augmentation to run, and the wording
        # for the error message in case it fails:
        if method == AudAugMethod.ADD_NOISE:
            # Honor the caller's noise_path; it used to be
            # ignored in favor of the class default:
            if noise_path is None:
                noise_path = self.NOISE_PATH

            action = "add background sounds to"
            def augment():
                # Add rain, wind, or such at random:
                return SoundProcessor.add_background(
                        sample_path,
                        noise_path,
                        out_dir, 
                        len_noise_to_add=5.0)

        elif method == AudAugMethod.TIME_SHIFT:
            action = "time shift on"
            def augment():
                return SoundProcessor.time_shift(sample_path, out_dir)

        elif method == AudAugMethod.VOLUME:
            action = "modify volume on"
            def augment():
                return SoundProcessor.change_sample_volume(sample_path, out_dir)

        else:
            # Unknown method: nothing is created
            return None

        try:
            return augment()
        except Exception as e:
            sample_fname = Path(sample_path).stem
            msg = f"Failed to {action} {sample_fname} ({repr(e)})"
            self.log.err(msg)
            # Exceptions created without arguments have
            # empty args; don't fail on those:
            first_arg = e.args[0] if e.args else repr(e)
            e.args = (first_arg, msg)
            return e
Ejemplo n.º 12
0
    def __init__(self,
                 input_dir_path,
                 output_dir_path,
                 plot=False,
                 overwrite_policy=False,
                 aug_goals=AugmentationGoals.MEDIAN):
        '''
        Validates the arguments, determines the number of
        samples available per species, computes the number 
        of augmentations needed for each species, and creates 
        the output directory.
        
        :param input_dir_path: directory holding .png files;
            must be an absolute path
        :type input_dir_path: str
        :param output_dir_path: root of destination dir under
            which each species' subdirectories will be placed.
            Augmentations will be placed in those subdirs.
        :type output_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: what to do when previously
            created work is found. NOTE: must be a WhenAlreadyDone
            member; the default of False is rejected with a TypeError.
        :type overwrite_policy: WhenAlreadyDone 
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :raises TypeError: if overwrite_policy is not a WhenAlreadyDone member
        :raises ValueError: if input_dir_path is not absolute
        '''

        self.log = LoggingService()

        # Argument checks, before any state is set:
        policy_is_valid = isinstance(overwrite_policy, WhenAlreadyDone)
        if not policy_is_valid:
            raise TypeError(f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}")

        path_is_absolute = os.path.isabs(input_dir_path)
        if not path_is_absolute:
            raise ValueError(f"Input path must be a full, absolute path; not {input_dir_path}")

        # Remember the configuration:
        self.plot = plot
        self.overwrite_policy = overwrite_policy
        self.output_dir_path = output_dir_path
        self.input_dir_path = input_dir_path

        self.species_names = Utils.find_species_names(input_dir_path)

        # Dataframe whose row labels are the species,
        # with a single column holding the number of 
        # samples of the respective species:
        #       num_species
        # sp1       10
        # sp2       15
        #      ..
        distrib_df = Utils.sample_compositions_by_species(input_dir_path,
                                                          augmented=False)
        self.sample_distrib_df = distrib_df

        if plot:
            # Bar chart of the sample distribution:
            distrib_df.plot.bar()

        # Dict with the number of augmentations 
        # to create for each species:
        self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals,
                                                             distrib_df)

        self.log.info(f"Results will be in {self.output_dir_path}")

        Utils.create_folder(output_dir_path, overwrite_policy)
    def __init__(self, result_file, charts=False):
        '''
        Locate the files that belong to one training run, 
        load the training/testing statistics and the test set
        predictions, then print descriptive results. The Sqlite
        db is only open for the duration of this constructor.

        Exits the process with status 1 if the stats file 
        or the predictions file is missing.

        @param result_file: file from which the paths of the
            stats, predictions, and db files are derived via
            get_result_file_paths()
        @type result_file: str
        @param charts: if True, also plot training/validation
            loss and accuracy by epoch
        @type charts: bool
        '''
        self.log = LoggingService()
        
        res_files_dict = self.get_result_file_paths(result_file)
        
        # Get the train/validate/test stats dict from disk:

        stats_file = res_files_dict['stats_file']
        try:
            with open(stats_file, 'rb') as fd:
                train_test_stats = json.load(fd)
        except FileNotFoundError:
            self.log.err(f"No train/validate/test stats file found: {stats_file}; quitting")
            sys.exit(1)

        # Get the predictions made for the testset, 
        # and the corresponding true labels
        
        # The file is simple csv:
        #     prediction,true_label
        #        0      ,   1
        #        2      ,   2
        #         ...
        # For robustness, don't make assumption about
        # the column order, but read it from the first
        # row. We build a two-key dict: {'prediction' : [...],
        #                                'true_label' : [...]
        #                               }
        preds_file = res_files_dict['preds_file']
        try:
            # newline='' as recommended for the csv module:
            with open(preds_file, 'r', newline='') as fd:
                reader = csv.reader(fd)
                col_names = next(reader)
                pred_vs_true_dict = {col_names[0] : [],
                                     col_names[1] : []
                                     }
                for pred_vs_true in reader:
                    if not pred_vs_true:
                        # Blank line, e.g. at the end of the file
                        continue
                    pred_vs_true_dict[col_names[0]].append(int(pred_vs_true[0]))
                    pred_vs_true_dict[col_names[1]].append(int(pred_vs_true[1]))
        except FileNotFoundError:
            self.log.err(f"No test predictions file found ({res_files_dict['preds_file']})")
            sys.exit(1)

        # Print descriptives:
        db_file = res_files_dict['db_file']
        # Connect outside of the try block: if the connection
        # fails there is nothing to close, and the cleanup must
        # not mask the original error with an AttributeError:
        self.db = sqlite3.connect(db_file)
        try:
            self.db.row_factory = sqlite3.Row
            
            # Get ordered dict mapping int labels to
            # text labels:
            
            self.label_encodings = self.get_label_encodings()

            self.get_descriptives(train_test_stats, pred_vs_true_dict)
     
            # Plot train/val losses by epoch:
            if charts:
                self.plot_train_val_loss_and_accuracy(train_test_stats)
        finally:
            self.db.close()
class BertResultAnalyzer(object):
    '''
    classdocs
    '''


    #------------------------------------
    # Constructor 
    #-------------------

    def __init__(self, result_file, charts=False):
        '''
        Locate the files that belong to one training run, 
        load the training/testing statistics and the test set
        predictions, then print descriptive results. The Sqlite
        db is only open for the duration of this constructor.

        Exits the process with status 1 if the stats file 
        or the predictions file is missing.

        @param result_file: file from which the paths of the
            stats, predictions, and db files are derived via
            get_result_file_paths()
        @type result_file: str
        @param charts: if True, also plot training/validation
            loss and accuracy by epoch
        @type charts: bool
        '''
        self.log = LoggingService()
        
        res_files_dict = self.get_result_file_paths(result_file)
        
        # Get the train/validate/test stats dict from disk:

        stats_file = res_files_dict['stats_file']
        try:
            with open(stats_file, 'rb') as fd:
                train_test_stats = json.load(fd)
        except FileNotFoundError:
            self.log.err(f"No train/validate/test stats file found: {stats_file}; quitting")
            sys.exit(1)

        # Get the predictions made for the testset, 
        # and the corresponding true labels
        
        # The file is simple csv:
        #     prediction,true_label
        #        0      ,   1
        #        2      ,   2
        #         ...
        # For robustness, don't make assumption about
        # the column order, but read it from the first
        # row. We build a two-key dict: {'prediction' : [...],
        #                                'true_label' : [...]
        #                               }
        preds_file = res_files_dict['preds_file']
        try:
            # newline='' as recommended for the csv module:
            with open(preds_file, 'r', newline='') as fd:
                reader = csv.reader(fd)
                col_names = next(reader)
                pred_vs_true_dict = {col_names[0] : [],
                                     col_names[1] : []
                                     }
                for pred_vs_true in reader:
                    if not pred_vs_true:
                        # Blank line, e.g. at the end of the file
                        continue
                    pred_vs_true_dict[col_names[0]].append(int(pred_vs_true[0]))
                    pred_vs_true_dict[col_names[1]].append(int(pred_vs_true[1]))
        except FileNotFoundError:
            self.log.err(f"No test predictions file found ({res_files_dict['preds_file']})")
            sys.exit(1)

        # Print descriptives:
        db_file = res_files_dict['db_file']
        # Connect outside of the try block: if the connection
        # fails there is nothing to close, and the cleanup must
        # not mask the original error with an AttributeError:
        self.db = sqlite3.connect(db_file)
        try:
            self.db.row_factory = sqlite3.Row
            
            # Get ordered dict mapping int labels to
            # text labels:
            
            self.label_encodings = self.get_label_encodings()

            self.get_descriptives(train_test_stats, pred_vs_true_dict)
     
            # Plot train/val losses by epoch:
            if charts:
                self.plot_train_val_loss_and_accuracy(train_test_stats)
        finally:
            self.db.close()

    #------------------------------------
    # get_descriptives
    #-------------------
    
    def get_descriptives(self, train_test_stats, prediction_vs_labels_dict): 

        '''
        Given a dict like the following, which was stored
        by the training process, get more result info from
        the self.db Sqlite db, and print result evaluations:
        the test results, the distribution of labels across
        all samples and across the train/validation/test sets,
        the confusion matrix in absolute and normalized form,
        and a classification report.
        
            training_stats:
              'Training' : [{'epoch': 1,
                             'Training Loss': 0.016758832335472106,
                             'Validation Loss': 0.102080237865448,
                             'Training Accuracy': 0.00046875,
                             'Validation Accuracy.': 0.05,
                             'Training Time': '0:00:25',
                             'Validation Time': '0:00:01'},
                            {'epoch': 2,
                               ...
                            }
                            ]
            
              'Testing'  : {'Test Loss': tensor(1.0733),
                            'Test Accuracy': 0.1,
                            'Matthews corrcoef': 0.0,
                            'Confusion matrix': array([[0, 0, 0],
                                                       [3, 1, 6],
                                                       [0, 0, 0]])
                           }
        
        Requires self.db to be an open connection, and 
        self.label_encodings to map int labels to str labels.
        
        @param train_test_stats: dict of test and training results
        @type train_test_stats: dict
        @param prediction_vs_labels_dict: dict with keys 'prediction',
            and 'true_label'.
        @type prediction_vs_labels_dict: {str:[int]}
        '''

        def label_counts(query):
            # Run a query that yields (int-label, count) rows,
            # and build dict: string-label ===> number of samples
            return {self.label_encodings[int_label] : num_this_label
                    for (int_label, num_this_label)
                    in self.db.execute(query)
                    }

        # Convenience: pull out the Testing sub-dict:
        test_res_dict = train_test_stats['Testing']

        # Get distribution of labels across the entire dataset,
        # and the train, validation, and test sets:

        # Overall label distribution:
        label_count_dict_whole_set = label_counts(
                     '''SELECT label, count(*) AS num_this_label 
                            FROM Samples 
                           GROUP BY label;
                        ''')

        # Train set label distribution:
        label_count_dict_train = label_counts(
                     '''SELECT label, count(*) as label_count
                              FROM TrainQueue LEFT JOIN Samples
                               ON TrainQueue.sample_id = Samples.sample_id
                             GROUP BY label;
                        ''')

        # Validation set label distribution:
        label_count_dict_validate = label_counts(
                     '''SELECT label, count(*) as label_count
                              FROM ValidateQueue LEFT JOIN Samples
                               ON ValidateQueue.sample_id = Samples.sample_id
                             GROUP BY label;
                        ''')

        # Test set label distribution:
        label_count_dict_test = label_counts(
                     '''SELECT label, count(*) as label_count
                              FROM TestQueue LEFT JOIN Samples
                               ON TestQueue.sample_id = Samples.sample_id
                             GROUP BY label;
                        ''')

        # Put all the test results into 
        # a dataframe for easy printing:
        test_res_df = pd.DataFrame(test_res_dict, index=[0])
        # Same for label value distributions:
        samples_label_distrib_df  = pd.DataFrame(label_count_dict_whole_set, index=[0])
        train_label_distrib_df    = pd.DataFrame(label_count_dict_train, index=[0])
        validate_label_distrib_df = pd.DataFrame(label_count_dict_validate, index=[0])
        test_label_distrib_df     = pd.DataFrame(label_count_dict_test, index=[0])
        
        # Turn confusion matrix numpy into a df
        # with string labels to mark rows and columns:
        true_labels = prediction_vs_labels_dict['true_label']
        test_predictions = prediction_vs_labels_dict['prediction']
        int_labels = list(self.label_encodings.keys())
        str_labels = list(self.label_encodings.values())
        conf_mat_df = pd.DataFrame(confusion_matrix(true_labels,
                                                    test_predictions,
                                                    labels=int_labels
                                                    ),
                                   index=str_labels,
                                   columns=str_labels
                                   )
        # We also produce a conf matrix normalized to 
        # the true values. So each cell is percentage 
        # predicted/true:
        conf_mat_norm_df = pd.DataFrame(confusion_matrix(true_labels,
                                                         test_predictions,
                                                         normalize='true',
                                                         labels=int_labels
                                                         ),
                                        index=str_labels,
                                        columns=str_labels
                                        )
        # Change entries to be percentages, rounded as before 
        # to the nearest ten percent. The explicit format spec 
        # suppresses floating point noise: 100*0.3 would otherwise 
        # print as '30.000000000000004%':
        conf_mat_norm_df = conf_mat_norm_df.applymap(
            lambda df_el: f"{100*round(df_el,1):.1f}%")
        
        print(test_res_df.to_string(index=False, justify='center'))
        print()
        # Label distributions in the sample subsets:
        print("Distribution of labels across all samples:")
        print(samples_label_distrib_df.to_string(index=False, justify='center'))
        print("Distribution of labels across training set:")
        print(train_label_distrib_df.to_string(index=False, justify='center'))
        print("Distribution of labels across validation set:")
        print(validate_label_distrib_df.to_string(index=False, justify='center'))
        print("Distribution of labels across test set:")
        print(test_label_distrib_df.to_string(index=False, justify='center'))
        print()
        print("Confusion matrix (rows: true; cols: predicted):")
        print(f"{conf_mat_df}")
        print("")
        print("Confusion matrix normalized: percent of true (rows: true; cols: predicted):")
        print(conf_mat_norm_df)
        print("")
        result_report = classification_report(true_labels,
                                              test_predictions)
        print(result_report)

    #------------------------------------
    # plot_train_val_loss_and_accuracy 
    #-------------------
    
    def plot_train_val_loss_and_accuracy(self, training_stats):
        '''
        Chart the course of a training run. Only the
        per-epoch records stored under key 'Training'
        are used; they are passed unchanged to
        plot_stats_dataframe(), which does the drawing.
        
        @param training_stats: a dict like:
           {
             'Training' : [{'epoch': 1,
                            'Training Loss': 0.016758832335472106,
                            'Validation Loss': 0.102080237865448,
                            'Training Accuracy': 0.00046875,
                            'Validation Accuracy.': 0.05,
                            'Training Time': '0:00:25',
                            'Validation Time': '0:00:01'},
                           {'epoch': 2,
                              ...
                           }
                           ]
           
             'Testing'  : {'Test Loss': tensor(1.0733),
                           'Test Accuracy': 0.1,
                           'Matthews corrcoef': 0.0,
                           'Confusion matrix': array([[0, 0, 0],
                                                      [3, 1, 6],
                                                      [0, 0, 0]])
                          }
           }

        @type training_stats: dict
        '''

        # The 'Testing' entry is not needed for the charts:
        per_epoch_records = training_stats['Training']
        self.plot_stats_dataframe(per_epoch_records)

    #------------------------------------
    # def plot_stats_dataframe 
    #-------------------

    def plot_stats_dataframe(self, epoch_stats_dicts):
        '''
        Draw two side-by-side charts from per-epoch training
        statistics: training vs. validation loss on the left,
        training vs. validation accuracy on the right.
        
        Each dict becomes one row of a DataFrame that is
        indexed by its 'epoch' value. Columns plotted:
        'Training Loss', 'Validation Loss', 'Training Accuracy',
        and 'Validation Accuracy'. The latter is also accepted
        under the name 'Validation Accuracy.' (trailing period).

        How to read the charts: if the training loss goes down
        with each epoch while the validation loss goes up,
        the model is likely trained too long and over-fits
        the training data. Validation loss is a more precise
        measure than accuracy: accuracy only records on which
        side of a threshold an output falls, so a correct
        answer given with less confidence shows in the loss,
        but not in the accuracy.

        The figure is shown in interactive mode, without
        blocking the caller.
        
        @param epoch_stats_dicts: one dict of statistics per epoch
        @type epoch_stats_dicts: [dict]
        @raise KeyError: if 'epoch' or one of the plotted
            columns is missing
        '''
        
        # Display floats with two decimal places. Use the
        # fully qualified option name: the short form
        # 'precision' can be ambiguous in newer pandas versions.
        pd.set_option('display.precision', 2)

        # One row per epoch; use the 'epoch' as the row index.
        df_stats = pd.DataFrame(epoch_stats_dicts).set_index('epoch')

        # Accept the validation accuracy column with or without
        # a trailing period. If neither is present the plain name
        # is kept, so that the lookup below raises the KeyError:
        val_acc_col = 'Validation Accuracy'
        if (val_acc_col not in df_stats.columns
                and 'Validation Accuracy.' in df_stats.columns):
            val_acc_col = 'Validation Accuracy.'

        # Use plot styling from seaborn, and
        # increase the font size.
        sns.set(style='darkgrid', font_scale=1.5)
        
        _fig, (ax1, ax2) = plt.subplots(nrows=1, 
                                        ncols=2, 
                                        figsize=(12,6),
                                        tight_layout=True
                                        )
        
        # Plot the learning curves:
        ax1.plot(df_stats['Training Loss'], 'b-o', label="Training")
        ax1.plot(df_stats['Validation Loss'], 'g-o', label="Validation")
        ax1.set_title("Training & Validation Loss")
        ax1.set_xlabel("Epoch")
        ax1.set_ylabel("Loss")
        # One tick per epoch:
        ax1.set_xticks(df_stats.index.values)

        ax2.plot(df_stats['Training Accuracy'], 'b-o', label="Training")
        ax2.plot(df_stats[val_acc_col], 'g-o', label="Validation")
        ax2.set_title("Training & Validation Accuracy")
        ax2.set_xlabel("Epoch")
        ax2.set_ylabel("Accuracy")
        ax2.set_xticks(df_stats.index.values)

        ax1.legend(frameon=False)
        ax2.legend(frameon=False)
        
        plt.ion()
        plt.show(block=False)

    #------------------------------------
    # get_result_file_paths 
    #-------------------
    
    def get_result_file_paths(self, one_result_file):
        '''
        Given the path to one of the result files that are 
        produced by bert_train_parallel.py, return a dict with
        the full paths of them all. For example, given any of
        
           /foo/bar/facebook_ads_clean_testset_predictions.csv
           /foo/bar/facebook_ads_clean_train_test_stats.json
           /foo/bar/facebook_ads_clean_trained_model.sav
           /foo/bar/facebook_ads_clean.sqlite
           /foo/bar/facebook_ads_clean.csv
           /foo/bar/facebook_ads_clean
           
        returns:
        
           {'preds_file' : '/foo/bar/facebook_ads_clean_testset_predictions.csv',
            'stats_file' : '/foo/bar/facebook_ads_clean_train_test_stats.json',
            'model_file' : '/foo/bar/facebook_ads_clean_trained_model.sav',
            'db_file'    : '/foo/bar/facebook_ads_clean.sqlite'
            }

        The common root is everything before the first 
        occurrence of one of the known endings. If none of the
        endings occurs, the root is the given path with its
        extension removed. No check is made whether any of
        the files exist.

        @param one_result_file: path to one of the result files,
            or just the root common to them
        @type one_result_file: str
        @return: paths of predictions, statistics, model,
            and database files
        @rtype: {str : str}
        '''
        preds_str = '_testset_predictions.csv'
        stats_str = '_train_test_stats.json'
        model_str = '_trained_model.sav'
        db_str    = '.sqlite'
        
        # Look for the endings as literal text, not as regular 
        # expressions: the '.' in the endings must only match
        # a period. Order of the search matters, and is
        # predictions, stats, model, database:
        for ending in (preds_str, stats_str, model_str, db_str):
            ending_pos = one_result_file.find(ending)
            if ending_pos != -1:
                files_root = one_result_file[:ending_pos]
                break
        else:
            # Assume that caller gave the .csv file, or just the
            # path with the root:
            (files_root, _ext) = os.path.splitext(one_result_file) 
        
        return {'preds_file' : files_root + preds_str,
                'stats_file' : files_root + stats_str,
                'model_file' : files_root + model_str,
                'db_file'    : files_root + db_str
                }

    #------------------------------------
    # get_label_encodings 
    #-------------------
    
    def get_label_encodings(self):
        '''
        Read the LabelEncodings table via self.db, and
        return its content as an ordered dict. Keys are the
        integer encodings of the labels (0,1,2,3), converted 
        with int(); values are the human readable labels 
        ('foo', 'bar', 'fum', 'blue'). Entries are in the
        order in which the query delivers the rows.
        
        @return: mapping from integer label to label string;
            empty if the table has no rows
        @rtype: OrderedDict
        '''
        cur = self.db.execute('''SELECT key_col, val_col 
                                        FROM LabelEncodings''')
        label_encodings = OrderedDict()
        for int_label, str_label in cur:
            label_encodings[int(int_label)] = str_label
        return label_encodings
                             


    #------------------------------------
    # print_model_parms 
    #-------------------

    def print_model_parms(self, model):
        '''
        Log the number of named parameters of the model,
        followed by the names and dimensions of the weights 
        in three groups of parameters:
        
        1. The embedding layer (first five parameters).
        2. The first of the twelve transformers (the 16 
           parameters that follow).
        3. The output layer (last four parameters).
        
        @param model: the model to describe. Must provide 
            method named_parameters(), which delivers
            (name, parameter) pairs; each parameter must 
            provide method size().
        '''
        
        # All of the model's parameters as a list of tuples:
        named_params = list(model.named_parameters())
        row_format = "{:<55} {:>12}"
        
        self.log.info('The BERT model has {:} different named parameters.\n'.format(len(named_params)))

        # Section header, and the parameters to list under it:
        sections = (
            ('==== Embedding Layer ====\n', named_params[0:5]),
            ('\n==== First Transformer ====\n', named_params[5:21]),
            ('\n==== Output Layer ====\n', named_params[-4:]),
        )
        for header, section_params in sections:
            self.log.info(header)
            for parm_name, parm in section_params:
                dimensions = str(tuple(parm.size()))
                self.log.info(row_format.format(parm_name, dimensions))
# Example no. 15
# 0
    def __init__(
        self,
        input_dir_path,
        plot=False,
        overwrite_policy=False,
        aug_goals=AugmentationGoals.MEDIAN,
        random_augs=False,
        multiple_augs=False,
    ):
        '''
        Prepare an augmentation run: find how many samples
        are available for each species, compute the number of 
        samples each species is to have, and create the 
        output directory structure. No augmentation is 
        done here.
        
        The output directory is a sibling of the input 
        directory. Its name is derived from the input 
        directory's name, the augmentation settings, and 
        whether augmentations are mutually exclusive.
        
        :param input_dir_path: directory holding .wav files.
            A trailing slash is optional.
        :type input_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: if true, don't ask each time
            previously created work will be replaced
        :type overwrite_policy: bool 
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :param random_augs: if this is true, will randomly choose augmentation 
            to use for each new sample. In this constructor the
            value only determines the output directory name.
        :type random_augs: bool
        :param multiple_augs: if we want to allow multiple augmentations per sample 
            (ie time shift and warp)):
        :type multiple_augs: bool
        '''

        self.log = LoggingService()

        self.input_dir_path = input_dir_path
        self.multiple_augs = multiple_augs
        self.plot = plot
        self.overwrite_freely = overwrite_policy

        # Get dataframe with row lables being the
        # species, and one col with number of samples
        # in the respective species:
        #       num_species
        # sp1       10
        # sp2       15
        #      ..

        self.sample_distrib_df = utils.sample_compositions_by_species(
            input_dir_path, augmented=False)

        if plot:
            # Plot a distribution:
            self.sample_distrib_df.plot.bar()

        # Build a dict with number of augmentations to do
        # for each species:
        self.augs_to_do = self.compute_num_augs_per_species(
            aug_goals, self.sample_distrib_df)

        # The output dir name is built from the input dir
        # name. Only remove trailing slashes; chopping
        # the last character unconditionally would mangle
        # the name of a path given without trailing slash:
        input_root = input_dir_path.rstrip('/')

        if random_augs:
            out_dir = f"{input_root}_augmented_samples_random"
        else:
            # The proportions are floats; allow for
            # rounding error when checking their sum:
            assert abs(self.ADD_NOISE + self.TIME_SHIFT + self.WARP - 1) < 1e-9
            out_dir = f"{input_root}_augmented_samples-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.WARP:.2f}w"

        if self.multiple_augs:
            out_dir += "/"
        else:
            # Indicate that augmentations are mutually exclusive
            out_dir += "-exc/"
        self.output_dir_path = out_dir

        self.log.info(f"Results will be in {self.output_dir_path}")

        # Creates output file structure
        # Self.output_dir_path
        #       |-- AUG_WAV_DIR
        #       |         |----
        #       |         |----
        #       |-- AUG_SPECTROGRAMS_DIR
        #       |         |----
        #       |         |----
        utils.create_folder(self.output_dir_path, self.overwrite_freely)
        for subdir in (self.AUG_WAV_DIR, self.AUG_SPECTROGRAMS_DIR):
            utils.create_folder(
                os.path.join(self.output_dir_path, subdir),
                self.overwrite_freely)
# Example no. 16
# 0
class Augmenter:

    ADD_NOISE = 1 / 3
    TIME_SHIFT = 1 / 3
    WARP = 1 / 3

    P_DIST = [ADD_NOISE, TIME_SHIFT, WARP]
    AUDIO_AUG_NAMES = ["add_noise", "time_shift", "warp"]
    AUG_SPECTROGRAMS_DIR = "spectrograms_augmented/"
    AUG_WAV_DIR = "wav_augmented/"
    NOISE_PATH = "data_augmentation/lib/Noise_Recordings/"

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(
        self,
        input_dir_path,
        plot=False,
        overwrite_policy=False,
        aug_goals=AugmentationGoals.MEDIAN,
        random_augs=False,
        multiple_augs=False,
    ):
        '''
        Prepare an augmentation run: find how many samples
        are available for each species, compute the number of 
        samples each species is to have, and create the 
        output directory structure. No augmentation is 
        done here.
        
        The output directory is a sibling of the input 
        directory. Its name is derived from the input 
        directory's name, the augmentation settings, and 
        whether augmentations are mutually exclusive.
        
        :param input_dir_path: directory holding .wav files.
            A trailing slash is optional.
        :type input_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: if true, don't ask each time
            previously created work will be replaced
        :type overwrite_policy: bool 
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :param random_augs: if this is true, will randomly choose augmentation 
            to use for each new sample. In this constructor the
            value only determines the output directory name.
        :type random_augs: bool
        :param multiple_augs: if we want to allow multiple augmentations per sample 
            (ie time shift and warp)):
        :type multiple_augs: bool
        '''

        self.log = LoggingService()

        self.input_dir_path = input_dir_path
        self.multiple_augs = multiple_augs
        self.plot = plot
        self.overwrite_freely = overwrite_policy

        # Get dataframe with row lables being the
        # species, and one col with number of samples
        # in the respective species:
        #       num_species
        # sp1       10
        # sp2       15
        #      ..

        self.sample_distrib_df = utils.sample_compositions_by_species(
            input_dir_path, augmented=False)

        if plot:
            # Plot a distribution:
            self.sample_distrib_df.plot.bar()

        # Build a dict with number of augmentations to do
        # for each species:
        self.augs_to_do = self.compute_num_augs_per_species(
            aug_goals, self.sample_distrib_df)

        # The output dir name is built from the input dir
        # name. Only remove trailing slashes; chopping
        # the last character unconditionally would mangle
        # the name of a path given without trailing slash:
        input_root = input_dir_path.rstrip('/')

        if random_augs:
            out_dir = f"{input_root}_augmented_samples_random"
        else:
            # The proportions are floats; allow for
            # rounding error when checking their sum:
            assert abs(self.ADD_NOISE + self.TIME_SHIFT + self.WARP - 1) < 1e-9
            out_dir = f"{input_root}_augmented_samples-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.WARP:.2f}w"

        if self.multiple_augs:
            out_dir += "/"
        else:
            # Indicate that augmentations are mutually exclusive
            out_dir += "-exc/"
        self.output_dir_path = out_dir

        self.log.info(f"Results will be in {self.output_dir_path}")

        # Creates output file structure
        # Self.output_dir_path
        #       |-- AUG_WAV_DIR
        #       |         |----
        #       |         |----
        #       |-- AUG_SPECTROGRAMS_DIR
        #       |         |----
        #       |         |----
        utils.create_folder(self.output_dir_path, self.overwrite_freely)
        for subdir in (self.AUG_WAV_DIR, self.AUG_SPECTROGRAMS_DIR):
            utils.create_folder(
                os.path.join(self.output_dir_path, subdir),
                self.overwrite_freely)

    #------------------------------------
    # compute_num_augs_per_species
    #-------------------

    def compute_num_augs_per_species(self, aug_volumes, sample_distrib_df):
        '''
        Return a dict mapping species name to 
        number of samples that should be available after
        augmentation. 
        
        The aug_volumes arg is either a dict mapping species name
        to an AugmentationGoals (TENTH, MEDIAN, MAX), or just
        an individual AugmentationGoals, which is then applied 
        to all species. The goals translate to numbers as follows:
        
            TENTH  : (largest sample count) // 10 + 1
            MEDIAN : median of the sample counts (np.median; 
                     may be fractional)
            MAX    : largest sample count

        The sample_distrib_df is a dataframe whose row labels are 
        species names and the single column's values are numbers
        of available samples for training/validation/test for the
        respective row's species. All statistics are computed
        from this argument.
        
        :param aug_volumes: augmentation goal for all, 
            or for each species
        :type aug_volumes: {AugmentationGoals | {str : AugmentationGoals}}
        :param sample_distrib_df: distribution of initially available
            sample numbers for each species
        :type sample_distrib_df: pandas.DataFrame
        :return: dict mapping each species to the 
            number of samples it is to have after augmentation
        :rtype: {str : number}
        :raise KeyError: if aug_volumes is a dict that lacks 
            an entry for one of the species
        '''

        # Get straight array of number of audio samples
        # for each species. Ex: array([6,4,5]) for
        # three species with 6,4, and 5 audio recordings,
        # respectively

        species_np = sample_distrib_df.values.flatten()
        # Names of species with most and fewest samples
        # (only used for the log message):
        index_max = sample_distrib_df.idxmax().to_numpy()[0]
        index_min = sample_distrib_df.idxmin().to_numpy()[0]

        # Get number of recordings for
        # the species with the most number
        # of recordings:
        max_num_samples = np.max(species_np)
        tenth_max_num_samples = max_num_samples // 10 + 1
        median_num_samples = np.median(species_np)

        volumes = {
            AugmentationGoals.TENTH: tenth_max_num_samples,
            AugmentationGoals.MEDIAN: median_num_samples,
            AugmentationGoals.MAX: max_num_samples
        }

        if isinstance(aug_volumes, AugmentationGoals):
            # Same goal for every species:
            aug_requirements = {
                species: volumes[aug_volumes]
                for species in sample_distrib_df.index
            }
        else:
            # Have dict of species-name : AugmentationGoals:
            aug_requirements = {
                species: volumes[aug_volumes[species]]
                for species in sample_distrib_df.index
            }

        self.log.info(
            f"Median: {median_num_samples},  Min: {np.min(species_np)} ({index_min}) ,  Max: {max_num_samples} ({index_max})"
        )
        self.log.info(f"10% of max is {tenth_max_num_samples}")

        return aug_requirements

    #------------------------------------
    # generate_all_augmentations
    #-------------------

    def generate_all_augmentations(self):
        '''
        Create new samples via augmentation for each species. 
        Augment the audio and/or spectrogram files to reach 
        the number of spectrograms indicated in self.augs_to_do.
        
        Spectrograms are created on the way. Afterwards
        .DS_Store files are removed from the tree of augmented 
        spectrograms, and the resulting sample distribution is
        logged, and optionally plotted.

        Assumption: self.augs_to_do is a dict mapping 
        species-name to the number of samples the species
        is to have, and self.sample_distrib_df has a column
        'num_samples'.
        '''

        for species, row in self.sample_distrib_df.iterrows():
            self.augment_one_species(species, row['num_samples'],
                                     self.augs_to_do[species])

        spectro_dir = os.path.join(self.output_dir_path,
                                   self.AUG_SPECTROGRAMS_DIR)

        # Clean up directory clutter. Done in Python, rather
        # than through a shell 'find' command, so that paths
        # with spaces or shell metacharacters are safe:
        for dirpath, _subdirs, fnames in os.walk(spectro_dir):
            if ".DS_Store" not in fnames:
                continue
            try:
                os.remove(os.path.join(dirpath, ".DS_Store"))
            except OSError as e:
                # Cleanup is best effort only:
                self.log.info(
                    f"Could not remove .DS_Store from {dirpath}: {repr(e)}")

        augmented_df = utils.sample_compositions_by_species(spectro_dir,
                                                            augmented=True)
        augmented_df["total_samples"] = augmented_df.sum(axis=1)

        self.log.debug(f"augmented_df: {augmented_df}")

        if self.plot:
            augmented_df.plot.bar(
                y=["add_bg", "time_shift", "mask",
                   "original"], stacked=True).legend(loc='center left',
                                                     bbox_to_anchor=(1, 0.5))

        self.log.info("Done")

    #------------------------------------
    # augment_one_species
    #-------------------

    def augment_one_species(self, species, num_samples_orig, threshold):
        '''
        Create the spectrograms of original samples for one 
        species, and if the species has fewer samples than 
        threshold, make up the difference with augmented samples.
        
        If the species has at least threshold samples, only 
        threshold originals are turned into spectrograms, and
        no augmentation takes place. 
        
        Nothing is done if one of the species' output folders
        could not be created.
        
        :param species: name of species; also the name of the
            species' subdirectory under the input directory
        :type species: str
        :param num_samples_orig: number of available original samples
        :type num_samples_orig: number
        :param threshold: number of samples the species is to have
        :type threshold: number
        '''

        wav_in_dir = os.path.join(self.input_dir_path, species)
        wav_out_dir = os.path.join(self.output_dir_path,
                                   self.AUG_WAV_DIR, species)
        spectro_out_dir = os.path.join(self.output_dir_path,
                                       self.AUG_SPECTROGRAMS_DIR, species)

        # Make output folders under self.output_dir_path.
        # The spectrogram folder is only attempted if 
        # the wav folder could be created:
        folders_ready = (
            utils.create_folder(wav_out_dir, self.overwrite_freely)
            and utils.create_folder(spectro_out_dir, self.overwrite_freely))
        if not folders_ready:
            self.log.info(f"Skipping augmentations for {species}")
            return

        wav_files = os.listdir(wav_in_dir)

        # Create original spectrograms: all of them if 
        # augmentation is needed, else just as many as wanted:
        needs_augmenting = num_samples_orig < threshold
        if needs_augmenting:
            samples_to_add = int(threshold - num_samples_orig)
            num_originals = num_samples_orig
        else:
            samples_to_add = 0
            num_originals = threshold
        self.create_original_spectrograms(wav_files, num_originals,
                                          wav_in_dir, spectro_out_dir)
        self.log.info(
            f"Num Original Samples for {species}: {len(wav_files)}. Creating {samples_to_add} more samples using augmentations."
        )

        # Cannot do augmentations for species with 0 samples
        if len(wav_files) == 0:
            self.log.info(
                f"Skipping for {species} since there are no original samples.")
            return

        dirs = (wav_in_dir, wav_out_dir, spectro_out_dir)
        for aug_num in range(samples_to_add):
            # If more samples are to be added than there are
            # originals, wrap around, augmenting originals 
            # more than once:
            sample_name = wav_files[aug_num % len(wav_files)]

            # The maximum number of augmentations is equal to
            # using each available augmentation at most once per sample.
            max_num_augs = utils.count_max_augs(self.P_DIST)
            if self.multiple_augs:
                num_augs_per_sample = np.random.randint(1, max_num_augs + 1)
            else:
                num_augs_per_sample = 1

            self.create_new_sample(sample_name, dirs,
                                   num_augs=num_augs_per_sample)

    #------------------------------------
    # create_new_sample
    #-------------------

    def create_new_sample(self, sample_name, paths, num_augs=1):
        '''
        Create one new sample from an original by applying
        num_augs distinct augmentations, which are randomly 
        chosen from AUDIO_AUG_NAMES with probabilities P_DIST.
        
        Audio augmentations (add_noise, time_shift) are applied 
        in sequence, each working on the result of the previous 
        one. A spectrogram is then created from the final audio. 
        Warping works on spectrograms, and is therefore applied 
        last. If warping followed audio augmentations, the
        intermediate, unwarped spectrogram is removed.
        
        :param sample_name: file name of the original .wav sample
        :type sample_name: str
        :param paths: the species' wav input directory, wav output 
            directory, and spectrogram output directory
        :type paths: (str, str, str)
        :param num_augs: number of augmentations to apply
        :type num_augs: int
        '''

        wav_in_dir, wav_out_dir, spectro_out_dir = paths

        chosen = np.random.choice(self.AUDIO_AUG_NAMES,
                                  size=num_augs,
                                  p=self.P_DIST,
                                  replace=False)

        # Warping must be done after all the other augmentations 
        # take place, after spectrogram is created. So separate
        # it from the audio augmentations:
        warp = bool("warp" in chosen)
        audio_augs = [name for name in chosen.tolist() if name != "warp"]

        for position, aug_name in enumerate(audio_augs):
            if position > 0:
                # Not the first augmentation: source wav 
                # is in the output wav directory
                wav_in_dir = wav_out_dir
            if aug_name == "add_noise":
                # Which noise to add will be chosen at random
                updated_name = SoundProcessor.add_background(
                    sample_name,
                    self.NOISE_PATH,
                    wav_in_dir,
                    wav_out_dir,
                    len_noise_to_add=5.0)
            elif aug_name == "time_shift":
                updated_name = SoundProcessor.time_shift(
                    sample_name, wav_in_dir, wav_out_dir)
            sample_name = updated_name

        # Create new spectrogram if audio was augmented
        audio_was_augmented = len(audio_augs) != 0
        if audio_was_augmented:
            sample_name = SoundProcessor.create_spectrogram(
                sample_name,
                wav_out_dir,
                spectro_out_dir,
                n_mels=128)

        if not warp:
            return

        # If sample is unaugmented to this point, sample_name 
        # is *.wav. Since SoundProcessor.warp_spectrogram expects 
        # *.png, replace the extension. If sample_name is 
        # already *.png, there is no change.
        sample_name = sample_name[:-len(".wav")] + ".png"
        warped_name = SoundProcessor.warp_spectrogram(
            sample_name, spectro_out_dir, spectro_out_dir)

        # If warp is not the only augmentation,
        # we do not want the spectrogram from before the warp
        if audio_was_augmented:
            assert (warped_name != sample_name)
            os.remove(os.path.join(spectro_out_dir, sample_name))

    #------------------------------------
    # create_original_spectrograms
    #-------------------

    def create_original_spectrograms(self, samples, n, species_wav_input_dir,
                                     species_spectrogram_output_dir):
        '''
        Create spectrograms for n of the given samples,
        picked at random without replacement.
        
        :param samples: file names of the available samples
        :type samples: [str]
        :param n: number of samples to pick; truncated to int
        :type n: number
        :param species_wav_input_dir: directory holding the samples
        :type species_wav_input_dir: str
        :param species_spectrogram_output_dir: directory 
            to which the spectrograms are written
        :type species_spectrogram_output_dir: str
        '''
        num_to_pick = int(n)
        for picked_sample in random.sample(samples, num_to_pick):
            SoundProcessor.create_spectrogram(
                picked_sample,
                species_wav_input_dir,
                species_spectrogram_output_dir,
                n_mels=128)
# Example no. 17
# 0
    def __init__(
        self,
        infiles,
        actions,
        outdir=None,  # Output to same dir as files are located
        normalize=False,  # We may want to consider using this!
        framerate=None,  # this by default will be found from the .wav file
        min_freq=0,  # Hz 
        max_freq=150,  # Hz
        nfft=4096,
        pad_to=4096,
        hop=800,
        logfile=None,
    ):
        '''
        Process each of the given .wav files as requested 
        by actions; all work is done in this constructor.
        The actions handled here are:
        
            'spectro'   : compute the spectrogram, and save it
                          together with its time labels as
                          numpy files.
            'labelmask' : load the time labels file, and save a
                          label mask created from the Raven 
                          label file.
        
        Files that do not exist, cannot be read, or cannot be
        processed are reported and skipped; processing 
        continues with the next file. If the prerequisites
        check fails, the constructor returns without 
        processing any file.
        
        @param infiles: Files to spectrogram identified by the corresponding .wav.
            A single file is accepted in place of a list.
        @type infiles: {str | [str]}
        @param actions: the tasks to accomplish: 
            {spectro|melspectro|labelmask|copyraven}
            NOTE: copy raven is used simply to copy over the raven gt label .txt
            file if we are moving the spectrogram to a new location
        @type actions: [str] 
        @param outdir: if provided, everything that is created is written
            to this directory. If None, is written to the directory of the file
            on which computed was based. This is a good default!
        @type outdir {None | str}
        @param normalize: whether or not to normalize the signal to be within 16 bits.
            NOTE: not used in this constructor.
        @type normalize: bool
        @param framerate: framerate of the recording. Normally 
            obtained from the wav file itself. Here it is only passed
            to the prerequisites check; self.framerate is always
            set from the .wav file.
        @type framerate: int
        @param min_freq: min frequency in the processed spectrogram
        @type min_freq: int
        @param max_freq: max frequence in the processed spectrogram
        @type max_freq: int
        @param nfft: window width
        @type nfft: int,
        @param pad_to: stored as self.pad_to; presumably a 
            spectrogram parameter used by make_spectrogram() 
            (TODO: confirm)
        @type pad_to: int
        @param hop: stored as self.hop; presumably a 
            spectrogram parameter used by make_spectrogram() 
            (TODO: confirm)
        @type hop: int
        @param logfile: destination for log. Default: display
        @type logfile: {None|str}
        '''

        # Set up class variables related to spectrogram generation
        self.nfft = nfft
        self.pad_to = pad_to
        self.hop = hop
        self.min_freq = min_freq
        self.max_freq = max_freq

        # Output directory
        self.outdir = outdir

        if logfile is None:
            self.log = LoggingService()
        else:
            self.log = LoggingService(logfile, msg_identifier="spectrogrammer")

        if not isinstance(infiles, list):
            infiles = [infiles]

        # Depending on what caller wants us to do,
        # different arguments must be passed. Make
        # all those checks to avoid caller waiting a long
        # time for processing to be done only to fail
        # at the end: - Should update this later as thing
        # come up!

        # Prerequisites:
        if not self._ensure_prerequisites(infiles, actions, framerate, nfft,
                                          outdir):
            return

        # Prepare the desired component of each .wav file
        for infile in infiles:
            # Super basic file checking
            if not os.path.exists(infile):
                print(f"File {infile} does not exist.")
                continue

            # Get a dict with the file_root and
            # names related to the infile in our
            # file naming scheme:
            # Note this is useful for associating
            # .wav and .txt files
            file_family = FileFamily(infile)

            # Output the files to the same path as input
            # Note this allows self.outdir to change for each file
            if outdir is None:
                self.outdir = file_family.path

            # Start by trying to read the .wav file - the backbone of everything!
            try:
                self.log.info(f"Reading wav file {infile}...")
                (self.framerate, samples) = wavfile.read(infile)
                self.log.info(f"Done reading wav file {infile}.")
            except Exception as e:
                self.log.warn(f"Cannot process .wav file: {repr(e)}")
                # We should continue onto the next one!!
                # this we have seen with currupted .wav files
                continue

            # Generate and process the full spectrogram
            if 'spectro' in actions:
                try:
                    spect, times = self.make_spectrogram(samples)
                except Exception as e:
                    print(f"Cannot create spectrogram for {infile}: {repr(e)}")
                    # As with unreadable files: one bad file
                    # must not prevent processing of the rest
                    continue

                # Save the spectrogram
                spectro_outfile = os.path.join(self.outdir,
                                               file_family.spectro)
                np.save(spectro_outfile, spect)
                # Save the time mask
                times_outfile = os.path.join(self.outdir,
                                             file_family.time_labels)
                np.save(times_outfile, times)

            if 'labelmask' in actions:
                # Get label mask with 1s at time periods with an elephant call.
                time_file = file_family.fullpath(AudioType.TIME)
                try:
                    times = np.load(time_file)
                except Exception:
                    print(
                        f"Have not created the necessary time mask file for the spectrogram {infile}"
                    )
                    continue

                raven_file = file_family.fullpath(AudioType.LABEL)
                label_mask = self.create_label_mask_from_raven_table(
                    times, raven_file)
                np.save(os.path.join(self.outdir, file_family.mask),
                        label_mask)
# Example no. 18
# 0
    def __init__(self,
                 input_dir_path,
                 plot=False,
                 overwrite_policy=False,
                 aug_goals=AugmentationGoals.MEDIAN,
                 random_augs=False,
                 multiple_augs=False):
        '''
        Prepare an audio augmentation run: validate the arguments,
        work out how many augmentations each species needs, derive
        the name of the output directory, and create that directory.

        :param input_dir_path: absolute path of the directory 
            holding .wav files
        :type input_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: what to do when previously created
            work would be replaced. Must be a WhenAlreadyDone member;
            the default of False does not satisfy that check, so
            callers need to pass this argument explicitly.
        :type overwrite_policy: WhenAlreadyDone
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :param random_augs: if true, the output directory is named
            'augmented_samples_random'; otherwise its name encodes
            the ADD_NOISE/TIME_SHIFT/VOLUME values of this class
        :type random_augs: bool
        :param multiple_augs: if we want to allow multiple augmentations 
            per sample (e.g. time shift and volume)
        :type multiple_augs: bool
        :raises TypeError: if overwrite_policy is not a WhenAlreadyDone
        :raises ValueError: if input_dir_path is not an absolute path
        '''
        # Function-scope import: only needed for the tolerance
        # check on the augmentation weights below.
        import math

        self.log = LoggingService()

        if not isinstance(overwrite_policy, WhenAlreadyDone):
            raise TypeError(f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}") 

        if not os.path.isabs(input_dir_path):
            raise ValueError(f"Input path must be a full, absolute path; not {input_dir_path}")

        self.input_dir_path   = input_dir_path
        self.multiple_augs    = multiple_augs
        self.plot             = plot
        self.overwrite_policy = overwrite_policy
        
        self.species_names = Utils.find_species_names(self.input_dir_path)

        # If aug_goals is not a dict mapping
        # each species to an aug_goals, but just
        # a single AugmentationGoals, create
        # a dict from all species, mapping
        # each to that same value. isinstance()
        # rather than an exact type comparison, so
        # that dict subclasses (OrderedDict, defaultdict)
        # are recognized as per-species mappings:
        if not isinstance(aug_goals, dict):
            aug_goals = {species : aug_goals
                         for species in self.species_names
                         }

        # Get dataframe with row labels being the
        # species, and one col with number of samples
        # in the respective species:
        #       num_samples
        # sp1       10
        # sp2       15
        #      ..

        self.sample_distrib_df = Utils.sample_compositions_by_species(input_dir_path, 
                                                                      augmented=False)
        
        if plot:
            # Plot a distribution:
            self.sample_distrib_df.plot.bar()

        # Build a dict with number of augmentations to do
        # for each species:
        self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals, 
                                                             self.sample_distrib_df)
        
        # Create the descriptive name of an output directory 
        # for the augmented samples; it is a sibling of the
        # input directory:
        parent_dir = Path(input_dir_path).parent
        if random_augs:
            self.output_dir_path = os.path.join(parent_dir,
                                                'augmented_samples_random')
        else:
            # Sanity check on the weights, which are presumably
            # class-level values defined elsewhere on this class.
            # Compare with a tolerance: a sum of floats such as
            # 0.1 + 0.2 + 0.7 need not be exactly 1.
            assert math.isclose(self.ADD_NOISE + self.TIME_SHIFT + self.VOLUME, 1)
            dir_nm = f"Augmented_samples_-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.VOLUME:.2f}w"
            self.output_dir_path = os.path.join(parent_dir, dir_nm)

        if self.multiple_augs:
            self.output_dir_path += "/"
        else:
            # Indicate that augmentations are mutually exclusive
            self.output_dir_path += "-exc/"  

        self.log.info(f"Results will be in {self.output_dir_path}")

        Utils.create_folder(self.output_dir_path, self.overwrite_policy)

        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning, 
                                module='', 
                                lineno=0)
Ejemplo n.º 19
0
class SpectrogramAugmenter:

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 input_dir_path,
                 output_dir_path,
                 plot=False,
                 overwrite_policy=False,
                 aug_goals=AugmentationGoals.MEDIAN):
        '''
        Set up a spectrogram augmentation run: check the arguments,
        record the sample distribution across species, compute the
        number of augmentations needed per species, and create the
        output root directory.

        :param input_dir_path: absolute path of the directory 
            holding .png files
        :type input_dir_path: str
        :param output_dir_path: root of destination dir under
            which each species' subdirectories will be placed.
            Augmentations will be placed in those subdirs.
        :type output_dir_path: str
        :param plot: whether or not to plot informative charts 
            along the way
        :type plot: bool
        :param overwrite_policy: what to do when previously created
            work would be replaced. Must be a WhenAlreadyDone member;
            the default of False does not satisfy that check, so
            callers need to pass this argument explicitly.
        :type overwrite_policy: WhenAlreadyDone
        :param aug_goals: either an AugmentationGoals member,
               or a dict with a separate AugmentationGoals
               for each species: {species : AugmentationGoals}
               (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
        :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
        :raises TypeError: if overwrite_policy is not a WhenAlreadyDone
        :raises ValueError: if input_dir_path is not an absolute path
        '''

        self.log = LoggingService()

        # Reject unusable arguments before any file system work:
        policy_is_valid = isinstance(overwrite_policy, WhenAlreadyDone)
        if not policy_is_valid:
            policy_msg = f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}"
            raise TypeError(policy_msg)

        path_is_absolute = os.path.isabs(input_dir_path)
        if not path_is_absolute:
            path_msg = f"Input path must be a full, absolute path; not {input_dir_path}"
            raise ValueError(path_msg)

        # Remember the configuration:
        self.input_dir_path, self.output_dir_path = input_dir_path, output_dir_path
        self.plot, self.overwrite_policy = plot, overwrite_policy

        self.species_names = Utils.find_species_names(input_dir_path)

        # Dataframe whose row labels are the species;
        # one row per species holding its number of samples:
        #
        # sp1       10
        # sp2       15
        #      ..
        distrib_df = Utils.sample_compositions_by_species(input_dir_path,
                                                          augmented=False)
        self.sample_distrib_df = distrib_df

        if self.plot:
            # Bar chart of the distribution:
            distrib_df.plot.bar()

        # Dict with the number of augmentations
        # to create for each species:
        self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals,
                                                             distrib_df)

        self.log.info(f"Results will be in {self.output_dir_path}")

        Utils.create_folder(output_dir_path, overwrite_policy)

    #------------------------------------
    # generate_all_augmentations
    #-------------------

    def generate_all_augmentations(self):
        '''
        Workhorse:
        Create new samples via augmentation for each species. 
        For every species in self.sample_distrib_df, create the
        number of augmented spectrogram files listed for that
        species in self.augs_to_do; species needing zero 
        augmentations are skipped. Afterwards, remove .DS_Store 
        clutter from the output tree and log the total number of 
        new files.
        
        Assumption: self.augs_to_do is a dict mapping 
        species-name : num_required_augmentations
        
        Assumption: self.sample_distrib_df is a dataframe with
        one row per species, indexed by species name. Only the
        index is used here:
        
        	  sp1       10
        	  sp2       15
        	       ...

        :raises KeyError: if a species in self.sample_distrib_df
            has no entry in self.augs_to_do
        '''
        num_augmentations = 0

        for species in self.sample_distrib_df.index:
            # For each species, create as many augmentations
            # as was computed earlier:
            num_needed_augs = self.augs_to_do[species]
            if num_needed_augs == 0:
                continue
            in_dir = os.path.join(self.input_dir_path, species)
            out_dir = os.path.join(self.output_dir_path, species)
            aug_paths = self.augment_one_species(in_dir, out_dir,
                                                 num_needed_augs)
            num_augmentations += len(aug_paths)

        # Clean up directory clutter. Done in-process rather than
        # through a shell 'find' command, so that output paths 
        # containing spaces or shell metacharacters are handled 
        # correctly. The matches are collected before deleting so 
        # the tree is not modified while it is being walked:
        clutter_files = list(Path(self.output_dir_path).rglob('.DS_Store'))
        for clutter_file in clutter_files:
            try:
                if clutter_file.is_file():
                    clutter_file.unlink()
            except OSError as e:
                # Cleanup is best effort; report and move on:
                self.log.err(f"Could not remove {clutter_file} ({repr(e)})")

        self.log.info(f"Total of {num_augmentations} new spectrogam files")

        self.log.info("Done")

    #------------------------------------
    # augment_one_species
    #-------------------

    def augment_one_species(self, in_dir, out_dir, num_augs_to_do):
        '''
        Takes one species, and a number of spectrogram
        augmentations to do. Generates the files,
        and returns a list of the newly created 
        files (full paths).

        Augmentations are distributed round robin across all of
        the species' spectro files such that each file is
        augmented roughly the same number of times until
        num_augs_to_do is reached.
        
        For each file the augmentation methods are drawn at 
        random without replacement from ImgAugMethod. If a file 
        needs more augmentations than there are methods, the 
        drawn methods are repeated in the same order; the methods 
        are expected to include randomness, so repeats are not 
        duplicates.

        An augmentation counts as failed when create_new_sample()
        returns None (or an Exception instance). Failed 
        augmentations are counted in the log report, and are 
        not included in the returned list.

        Returns an empty list if Utils.create_folder() declines
        to provide out_dir, or if in_dir holds no files.

        :param in_dir: directory holding one species' spectro files
        :type in_dir: str
        :param out_dir: destination for new spectro files
        :type out_dir: str
        :param num_augs_to_do: number of augmentations
        :type num_augs_to_do: int
        :returns: list of newly created file paths
        :rtype: [str]
        '''

        # By convention, species name is the last part of the 
        # directory. Use .name rather than .stem, so that a dot 
        # in the directory name is not mistaken for the start of 
        # a file extension:
        species_name = Path(in_dir).name

        # Create subfolder for the given species:
        if not Utils.create_folder(out_dir, self.overwrite_policy):
            self.log.info(f"Skipping augmentations for {species_name}")
            return []

        # Get dict: {full-path-to-a-spectro-file : 0}
        # The zeroes will be counts of augmentations
        # needed for that file:
        in_spectro_files = {
            full_in_path: 0
            for full_in_path in Utils.listdir_abs(in_dir)
        }
        # Cannot do augmentations for species with 0 samples.
        # (This guard also keeps the distribution loop below
        # from spinning forever on an empty dict.)
        if not in_spectro_files:
            self.log.info(
                f"Skipping for {species_name} since there are no original samples."
            )
            return []

        # Distribute augmentations across the original
        # input files, one at a time, round robin:
        aug_assigned = 0
        while aug_assigned < num_augs_to_do:
            for fname in in_spectro_files:
                in_spectro_files[fname] += 1
                aug_assigned += 1
                if aug_assigned >= num_augs_to_do:
                    break

        new_sample_paths = []
        failures = 0
        available_methods = list(ImgAugMethod)

        for in_fname, num_augs_this_file in in_spectro_files.items():

            # Pick aug methods to apply (without replacement).
            # Note that if more augs are to be applied to each file
            # than methods are available, some methods will need
            # to be applied multiple times; no problem, as each
            # method includes randomness:
            max_methods_sample_size = min(len(available_methods),
                                          num_augs_this_file)
            methods = random.sample(available_methods,
                                    max_methods_sample_size)

            # Now have something like:
            #     [fmask, noise], or all methods in random order

            if num_augs_this_file > len(methods):
                # Repeat the methods as often as
                # needed:
                num_method_set_repeats = int(
                    math.ceil(num_augs_this_file / len(methods)))
                # The slice to num_augs_this_file chops off
                # the possible excess from the list replication:
                method_seq = (methods *
                              num_method_set_repeats)[:num_augs_this_file]

                # Assuming num_augs_this_file is 7, we now have method_seq:
                #    [m1,m2,m3,m1,m2,m3,m1]
            else:
                method_seq = methods

            for method in method_seq:
                out_path_or_err = self.create_new_sample(
                    in_fname, out_dir, method)
                # create_new_sample() signals failure by returning
                # None; an Exception instance is also treated as
                # a failure. Neither may end up in the list of
                # new file paths:
                if (out_path_or_err is None
                        or isinstance(out_path_or_err, Exception)):
                    failures += 1
                else:
                    new_sample_paths.append(out_path_or_err)

        self.log.info(
            f"Spectrogram aug report: {len(new_sample_paths)} new files; {failures} failures"
        )
        return new_sample_paths

    #------------------------------------
    # create_new_sample
    #-------------------

    def create_new_sample(self, sample_path, out_dir, method):
        '''
        Given one spectrogram file, and an image augmentation
        method, compute that augmentation, create a file name
        that gives insight into the aug applied, and write that
        new spectrogram file to out_dir.
        
        Currently available types of image augmentation technique:
        
            o adding noise (ImgAugMethod.NOISE)
            o frequency masking (ImgAugMethod.FMASK)
            o time masking (ImgAugMethod.TMASK)

        The new file is named after the original: the original
        stem, followed by the name fragment returned by the
        augmentation, followed by the original extension. The
        metadata loaded with the original is saved with the new
        file, with its 'augmentation' entry set to 'noise', 
        'fmask', or 'tmask'.

        If the augmentation raises, or if method is none of the
        above, the problem is logged, nothing is written, and 
        None is returned. Errors from loading the original or 
        from saving the new spectrogram are not caught here.

        :param sample_path: absolute path to spectrogram
        :type sample_path: str
        :param out_dir: destination of resulting new spectros
        :type out_dir: str
        :param method: the (spectrogram) image augmentation method to apply
        :type method: ImgAugMethod
        :return: Newly created spectro file (full path) or None,
            if a failure occurred.
        :rtype: {str | None}
        '''

        spectro, metadata = SoundProcessor.load_spectrogram(sample_path)

        # Select the augmentation to run, the tag to record in 
        # the metadata, and the wording for a failure report. 
        # The calls are wrapped in lambdas so that all of the 
        # work, including attribute lookup, happens inside the 
        # single try block below:
        if method == ImgAugMethod.NOISE:
            # Default is uniform noise:
            augment = lambda: SoundProcessor.random_noise(spectro)
            aug_tag = 'noise'
            failure_desc = 'add noise to'
        elif method == ImgAugMethod.FMASK:
            # Horizontal bands:
            augment = lambda: SoundProcessor.freq_mask(
                spectro,
                max_height=15  # num freq bands
            )
            aug_tag = 'fmask'
            failure_desc = 'apply frequency mask to'
        elif method == ImgAugMethod.TMASK:
            # Vertical bands:
            augment = lambda: SoundProcessor.time_mask(
                spectro,
                max_width=15  # num time ticks
            )
            aug_tag = 'tmask'
            failure_desc = 'apply time mask to'
        else:
            # Not one of the known methods: report rather
            # than failing silently:
            sample_fname = Path(sample_path).stem
            self.log.err(
                f"Unknown augmentation method {repr(method)} for {sample_fname}")
            return None

        try:
            new_spectro, out_fname = augment()
            metadata['augmentation'] = aug_tag
        except Exception as e:
            sample_fname = Path(sample_path).stem
            self.log.err(
                f"Failed to {failure_desc} {sample_fname} ({repr(e)})")
            return None

        # Name the new file after the original, with the
        # augmentation's name fragment spliced in before
        # the extension:
        sample_p = Path(sample_path)
        appended_fname = sample_p.stem + out_fname + sample_p.suffix
        out_path = os.path.join(out_dir, appended_fname)
        SoundProcessor.save_image(new_spectro, out_path, metadata)
        return out_path