Example #1
    def InitData(self):
        """
        initialize the dataflow for this mode

        :return: the initialized dataflow
        """

        config = self._config

        # speech data
        parsed_cfg = configparser.ConfigParser()
        parsed_cfg.read(os.path.join(config.train_data_expdir,
                                     'database.conf'))

        name = parsed_cfg.sections()[0]

        conf = dict(parsed_cfg.items(name))

        proc_cfg = configparser.ConfigParser()
        proc_cfg.read(os.path.join(config.train_data_expdir, 'processor.cfg'))

        self.processor = processor_factory.factory(
            proc_cfg.get('processor', 'processor'))(proc_cfg)
        datafile = conf['datafiles'].split(' ')[0]


        # text data
        text_parsed_cfg = configparser.ConfigParser()
        text_parsed_cfg.read(
            os.path.join(config.train_text_data_expdir, 'database.conf'))

        text_data_name = text_parsed_cfg.sections()[0]

        text_conf = dict(text_parsed_cfg.items(text_data_name))

        text_proc_cfg = configparser.ConfigParser()
        text_proc_cfg.read(
            os.path.join(config.train_text_data_expdir, 'processor.cfg'))

        self.text_processor = processor_factory.factory(
            text_proc_cfg.get('processor', 'processor'))(text_proc_cfg)
        text_datafile = text_conf['datafiles'].split(' ')[0]

        with open(datafile) as f:
            speech_lines = f.readlines()
        with open(text_datafile) as f:
            text_lines = f.readlines()
        self._dataflow = (cycle(speech_lines),
                          cycle(text_lines))
        self._alphabet = dict(
            text_proc_cfg.items('processor'))['alphabet'].split(' ')
        print(self._alphabet)

        self._lookup = {char: i for i, char in enumerate(self._alphabet)}

        print(self._lookup)
        return self._dataflow
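The database.conf read above is a standard configparser file whose exact contents are not shown in the snippet. A minimal sketch of what it might look like and how the keys are recovered; the section name and file paths are illustrative assumptions:

import configparser

# Hypothetical database.conf contents, inferred from the keys the snippet
# reads: a single section with a space-separated 'datafiles' entry.
example_cfg = """
[speech]
datafiles = /data/train/feats1.scp /data/train/feats2.scp
"""

parsed_cfg = configparser.ConfigParser()
parsed_cfg.read_string(example_cfg)

name = parsed_cfg.sections()[0]               # 'speech'
conf = dict(parsed_cfg.items(name))
datafile = conf['datafiles'].split(' ')[0]    # first datafile, as in InitData
print(name, datafile)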
Example #2
    def __init__(self, dataconf, segment_lengths=['full']):
        '''DataReader constructor

        Args:
            dataconf: the database configuration
            segment_lengths: a list containing the desired segment lengths;
                possibly multiple segment lengths
        '''

        if len(segment_lengths) > 1:
            print(
                'Warning: __call__ is not yet implemented correctly for '
                'multiple segments. The returned utt_info does not contain '
                'the _part suffix and only one processed segment is returned')
        self.segment_lengths = segment_lengths

        #read the processor config
        proc_cfg_file = dataconf['processor_config']
        parsed_proc_cfg = configparser.ConfigParser()
        parsed_proc_cfg.read(proc_cfg_file)
        proc_cfg = dict(parsed_proc_cfg.items('processor'))

        #create a processor
        self.processor = processor_factory.factory(proc_cfg['processor'])(
            proc_cfg, self.segment_lengths)

        #get the datafiles lines
        datafile = dataconf[
            'datafiles']  #TODO: for the moment expecting only 1 file; does this make sense?
        if datafile[-3:] == '.gz':
            open_fn = gzip.open
        else:
            open_fn = open
        #use the selected open function (plain open would fail on .gz files)
        with open_fn(datafile) as f:
            self.datafile_lines = f.readlines()
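processor_factory.factory is not defined in these snippets; from its use, factory(name) returns a processor class that is then instantiated with the parsed config (and optionally the segment lengths). A minimal sketch of such a name-to-class factory; the class and registry names are illustrative assumptions, not the real toolkit's:

# hypothetical processor classes, standing in for the real ones
class FbankProcessor:
    def __init__(self, proc_cfg, segment_lengths=None):
        self.conf = proc_cfg
        self.segment_lengths = segment_lengths

class TextProcessor:
    def __init__(self, proc_cfg, segment_lengths=None):
        self.conf = proc_cfg
        self.segment_lengths = segment_lengths

_PROCESSORS = {'fbank': FbankProcessor, 'text': TextProcessor}

def factory(name):
    """Return the processor class registered under name."""
    return _PROCESSORS[name]

# usage mirrors the snippets above
processor = factory('fbank')({'processor': 'fbank'}, ['full'])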
Example #3
def main(expdir):
    '''main function'''

    #read the data conf file
    parsed_cfg = configparser.ConfigParser()
    parsed_cfg.read(os.path.join(expdir, 'database.cfg'))

    #take the first section in the data config
    name = parsed_cfg.sections()[0]

    #read the section
    conf = dict(parsed_cfg.items(name))

    #read the processor config
    proc_cfg = configparser.ConfigParser()
    proc_cfg.read(os.path.join(expdir, 'processor.cfg'))

    #create a processor
    processor = processor_factory.factory(
        proc_cfg.get('processor', 'processor'))(proc_cfg)

    #create a writer
    writer = tfwriter_factory.factory(conf['type'])(conf['dir'])

    #loop over the data files
    for datafile in conf['datafiles'].split(' '):

        if datafile[-3:] == '.gz':
            open_fn = gzip.open
        else:
            open_fn = open
        count = 0
        #loop over the lines in the datafile
        for line in open_fn(datafile):

            #split the name and the data line
            splitline = line.strip().split(' ')
            name = splitline[0]
            dataline = ' '.join(splitline[1:])

            print(dataline)
            try:
                #process the dataline
                processed = processor(dataline)

                #write the processed data to disk
                if processed is not None:
                    writer.write(processed, name)
            except Exception:
                count += 1
                print(count)
        print(count)

    #write the metadata to file
    processor.write_metadata(conf['dir'])
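The loop above assumes each datafile line starts with the utterance name followed by the data payload; splitline[0] and ' '.join(splitline[1:]) recover the two parts. A small sketch with made-up lines:

# hypothetical datafile lines in the '<utt_name> <dataline>' format
lines = [
    'utt1 /data/audio/utt1.wav',
    'utt2 /data/audio/utt2.wav',
]

for line in lines:
    splitline = line.strip().split(' ')
    name = splitline[0]
    dataline = ' '.join(splitline[1:])
    print(name, '->', dataline)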
Example #4
    def __init__(self, dataconfs, segment_lengths=['full']):
        """DataReader constructor

        Args:
            dataconfs: the database configurations
            segment_lengths: a list containing the desired segment lengths;
                possibly multiple segment lengths
        """

        if len(segment_lengths) > 1:
            print(
                'Warning: __call__ is not yet implemented correctly for '
                'multiple segments. The returned utt_info does not contain '
                'the _part suffix and only one processed segment is returned')
        self.segment_lengths = segment_lengths

        self.processors = []
        self.start_index_set = [0]
        self.datafile_lines = []
        for dataconf in dataconfs:
            # read the processor config
            proc_cfg_file = dataconf['processor_config']
            if not os.path.isfile(proc_cfg_file):
                raise FileNotFoundError('%s does not exist' % proc_cfg_file)
            parsed_proc_cfg = configparser.ConfigParser()
            parsed_proc_cfg.read(proc_cfg_file)
            proc_cfg = dict(parsed_proc_cfg.items('processor'))

            # create a processor
            self.processors.append(
                processor_factory.factory(proc_cfg['processor'])(
                    proc_cfg, self.segment_lengths))

            # get the datafiles lines
            datafile = dataconf[
                'datafiles']  # TODO: for the moment expecting only 1 file; does this make sense?
            if datafile[-3:] == '.gz':
                open_fn = gzip.open
            else:
                open_fn = open
            f = open_fn(datafile)
            datalines = f.readlines()
            self.start_index_set.append(self.start_index_set[-1] +
                                        len(datalines))
            self.datafile_lines.extend(datalines)
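start_index_set accumulates where each dataconf's lines begin inside the concatenated datafile_lines, presumably so a global line index can be mapped back to its source processor in __call__ (not shown here). A sketch of that reverse lookup, assuming boundaries like the ones built above:

import bisect

# cumulative start offsets, e.g. three datafiles with 4, 2 and 5 lines
start_index_set = [0, 4, 6, 11]

def locate(global_index):
    """Map a global line index to (dataconf index, local line index)."""
    source = bisect.bisect_right(start_index_set, global_index) - 1
    return source, global_index - start_index_set[source]

print(locate(5))   # (1, 1): second datafile, its second line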
Example #5
def main(expdir):
    '''main function'''

    #read the data conf file
    parsed_cfg = configparser.ConfigParser()
    parsed_cfg.read(os.path.join(expdir, 'database.cfg'))

    #take the first section in the data config
    name = parsed_cfg.sections()[0]

    #read the section
    conf = dict(parsed_cfg.items(name))
    
    #the length of the segments. Possibly multiple segment lengths
    if 'segment_lengths' in conf:
        segment_lengths = conf['segment_lengths'].split(' ')
    else:
        segment_lengths = ['full']

    if not os.path.exists(conf['store_dir']):
        os.makedirs(conf['store_dir'])
    else:
        print('%s already exists, skipping this section' % conf['store_dir'])
        return
    
    #read the processor config
    parsed_proc_cfg = configparser.ConfigParser()
    parsed_proc_cfg.read(os.path.join(expdir, 'processor.cfg'))
    proc_cfg = dict(parsed_proc_cfg.items('processor'))

    #create a processor
    processor = processor_factory.factory(proc_cfg['processor'])(proc_cfg, segment_lengths)

    #create the writers
    writers = dict()
    for seg_length in segment_lengths:
        writer_store_dir = os.path.join(conf['store_dir'], seg_length)
        writers[seg_length] = tfwriter_factory.factory(conf['writer_style'])(writer_store_dir)

    #before looping over the data, allow the processor to access the data (e.g. 
    #for global mean and variance calculation) (or should this be done in init?)
    processor.pre_loop(conf)

    #loop over the data files
    for datafile in conf['datafiles'].split(' '):
        if datafile[-3:] == '.gz':
            open_fn = gzip.open
        else:
            open_fn = open

        #loop over the lines in the datafile
        for line in open_fn(datafile):
            #split the name and the data line
            splitline = line.strip().split(' ')
            utt_name = splitline[0]
            dataline = ' '.join(splitline[1:])

            #process the dataline
            processed, _ = processor(dataline)

            #write the processed data to disk
            for seg_length in segment_lengths:
                for i, proc_seg in enumerate(processed[seg_length]):
                    seg_utt_name = utt_name + '_part %d' % i
                    writers[seg_length].write(proc_seg, seg_utt_name)
    
    #after looping over the data, allow the processor to access the data
    processor.post_loop(conf)

    #write the metadata to file
    processor.write_metadata(conf['store_dir'])
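These main(expdir) entry points are presumably driven by a small launcher that passes in the experiment directory; the script name below is a hypothetical example:

import sys

if __name__ == '__main__':
    # e.g. `python prepare_data.py exp/train_fbank`
    if len(sys.argv) != 2:
        sys.exit('usage: %s <expdir>' % sys.argv[0])
    main(sys.argv[1])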
Example #6
def main(expdir):
    """main function"""
    # read the data conf file
    parsed_cfg = configparser.ConfigParser()
    parsed_cfg.read(os.path.join(expdir, 'database.cfg'))

    # take the first section in the data config
    name = parsed_cfg.sections()[0]

    # read the section
    conf = dict(parsed_cfg.items(name))

    # the length of the segments. Possibly multiple segment lengths
    if 'segment_lengths' in conf:
        segment_lengths = conf['segment_lengths'].split(' ')
    else:
        segment_lengths = ['full']

    if conf['store_dir'] == '/esat/spchtemp/scratch/jzegers/dataforTF/sreMix_segmented_DANet_recs/singlefeatures_hamming_scipy/train_150k':
        start_ind = 106370
        start_ind_per_segment_length = {'500': 239577, 'full': 106370}
        segment_lengths_still_to_process = segment_lengths
    else:
        start_ind = 0
        start_ind_per_segment_length = {
            seg_len: 0
            for seg_len in segment_lengths
        }

        if not os.path.exists(conf['store_dir']):
            os.makedirs(conf['store_dir'])
            segment_lengths_still_to_process = segment_lengths

            # copy config files to store_dir for archive purposes
            shutil.copyfile(os.path.join(expdir, 'database.cfg'),
                            os.path.join(conf['store_dir'], 'database.cfg'))
            shutil.copyfile(os.path.join(expdir, 'processor.cfg'),
                            os.path.join(conf['store_dir'], 'processor.cfg'))
        else:
            tmp = os.listdir(conf['store_dir'])
            if all([seg_len in tmp for seg_len in segment_lengths]):
                print('%s already exists, skipping this section' %
                      conf['store_dir'])
                return
            else:
                segment_lengths_still_to_process = [
                    seg_len for seg_len in segment_lengths
                    if seg_len not in tmp
                ]

    # read the processor config
    parsed_proc_cfg = configparser.ConfigParser()
    parsed_proc_cfg.read(os.path.join(expdir, 'processor.cfg'))
    proc_cfg = dict(parsed_proc_cfg.items('processor'))

    # create a processor
    processor = processor_factory.factory(proc_cfg['processor'])(
        proc_cfg, segment_lengths_still_to_process)

    # create the writers
    writers = dict()
    for seg_length in segment_lengths_still_to_process:
        writer_store_dir = os.path.join(conf['store_dir'], seg_length)
        writers[seg_length] = tfwriter_factory.factory(conf['writer_style'])(
            writer_store_dir,
            start_ind=start_ind_per_segment_length[seg_length])

    # before looping over the data, allow the processor to access the data (e.g.
    # for global mean and variance calculation) (or should this be done in init?)
    processor.pre_loop(conf)

    # loop over the data files
    for datafile in conf['datafiles'].split(' '):
        if datafile[-3:] == '.gz':
            open_fn = gzip.open
        else:
            open_fn = open

        # loop over the lines in the datafile
        ind = 0
        for line in open_fn(datafile):
            print(ind)
            if ind < start_ind:
                ind += 1
                continue
            # split the name and the data line
            splitline = line.strip().split(' ')
            utt_name = splitline[0]
            dataline = ' '.join(splitline[1:])

            # process the dataline
            processed, _ = processor(dataline)

            # write the processed data to disk
            for seg_length in segment_lengths_still_to_process:
                for i, proc_seg in enumerate(processed[seg_length]):
                    seg_utt_name = utt_name + '_part %d' % i
                    writers[seg_length].write(proc_seg, seg_utt_name)
            ind += 1

    # after looping over the data, allow the processor to access the data
    processor.post_loop(conf)

    # write the metadata to file
    processor.write_metadata(conf['store_dir'])
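Example #6 resumes a partially finished run: lines below start_ind are skipped and the writers continue numbering from start_ind_per_segment_length. A sketch of the skip pattern in isolation, with illustrative values; note that enumerate would replace the manual ind bookkeeping used above:

start_ind = 3
for ind, line in enumerate(['a', 'b', 'c', 'd', 'e']):
    # skip lines that were already processed in the interrupted run
    if ind < start_ind:
        continue
    print('processing line %d: %s' % (ind, line))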