Example #1
    def initialize(self, examples):
        '''
        initialize the nmf model and create the dictionaries

        args:
            examples: the training examples as a list of pairs containing the
                inputs and reference tasks

        returns:
            the initial activations
        '''

        _, tasks = zip(*examples)

        #read all the tasks
        tasks = [read_task(task) for task in tasks]

        #encode the tasks
        vs_full = np.array([self.coder.encode(t) for t in tasks])
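        #only keep the labels that actually occur in the training tasks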
        self.knownlabels = np.where(vs_full.sum(0) > 0)[0]
        vs_full = vs_full[:, self.knownlabels]

        #initialize the activations
        h = vs_full
        h = np.concatenate([h] * int(self.conf['numwords_per_label']), axis=1)
        # add noise to the content word activations
        #   If set to 0, the content word activations remain 'pure', i.e.
        #   content words that are not present according to the (weak)
        #   training supervision remain absent.
        h = (h + np.random.uniform(0, float(self.conf['activation_scale']),
                                   h.shape))
        garbage = np.zeros(
            [h.shape[0],
             int(h.shape[1] * float(self.conf['garbage_words']))])
        # add noise to garbage word activations
        if 'garbage_scale' in self.conf.keys():
            gbg_scale = float(self.conf['garbage_scale'])
        else:
            gbg_scale = float(self.conf['activation_scale'])

        garbage = (garbage + np.random.uniform(0, gbg_scale, garbage.shape))
        h = np.concatenate([h, garbage], 1)
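        #h now holds one row per utterance: the content word activations
        #followed by the garbage word activations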

        numlabels = self.coder.numlabels

        #initialize the semantic dictionary
        self.ws = np.identity(numlabels)
        self.ws = self.ws[self.knownlabels, :]
        self.ws = np.concatenate([self.ws] *
                                 int(self.conf['numwords_per_label']))

    #add random noise to the semantic dictionary
        self.ws = (self.ws + np.random.uniform(
            0, float(self.conf['semantic_scale']), self.ws.shape))

        #initialize the acoustic dictionary
        self.wa = np.random.uniform(0, float(self.conf['acoustic_scale']),
                                    [h.shape[1], self.knownobs.size])

        return h
Example #2
def main(expdir):
    '''main function'''

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    #create a task structure file
    structure = Structure(os.path.join(expdir, 'structure.xml'))

    #create a coder
    coder = coder_factory.factory(coderconf.get('coder', 'name'))(structure,
                                                                  coderconf)

    #read the traintasks
    with open(os.path.join(expdir, 'traintasks')) as f:
        lines = f.readlines()
        for line in lines:
            splitline = line.strip().split(' ')
            taskstring = ' '.join(splitline[1:])
            print(splitline[0] + " : " + taskstring)
            task = read_task(taskstring)
            vs = coder.encode(task).astype(int)
            print("%d : [" % len(vs))
            sys.stdout.write(" ".join(str(x) for x in vs))
            print("]\n")
Example #3
def main(expdir, name):
    '''main function'''

    colorlist = ['black']
    linestyles = ['-', '--', ':', '-.']

    #read the alignment
    alignment = np.load(os.path.join(expdir, 'alignment', '%s.npy' % name))

    #read the decoded tasks
    taskstrings = dict()
    for line in open(os.path.join(expdir, 'dectasks')):
        splitline = line.strip().split(' ')
        taskstrings[splitline[0]] = ' '.join(splitline[1:])

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    #create a task structure file
    structure = Structure(os.path.join(expdir, 'structure.xml'))

    #create a coder
    coder = coder_factory.factory(coderconf.get('coder', 'name'))(structure,
                                                                  coderconf)

    #encode the decoded task
    labelvec = coder.encode(read_task(taskstrings[name]))

    #create the legend
    legend = coder.labelids
    alignment = [
        alignment[:, l] for l in range(coder.numlabels) if labelvec[l]
    ]
    legend = [legend[l] for l in range(coder.numlabels) if labelvec[l]]

    for i, ali in enumerate(alignment):
        plt.plot(ali,
                 color=colorlist[i % len(colorlist)],
                 linestyle=linestyles[i % len(linestyles)],
                 label=legend[i])
    plt.legend()
    plt.show()
Example #4
def main(expdir, recipe, computing):
    '''main function'''

    overwrite = False
    if os.path.isdir(expdir):
        text = ''
        while text not in ('o', 'r'):
            text = raw_input('%s already exists, do you want to '
                             'resume experiment (r) or overwrite (o) '
                             '(respond with o or r)' % expdir)
        if text == 'o':
            overwrite = True

    else:
        #create the experiments directory
        os.makedirs(expdir)

    #copy the config files
    if overwrite:
        shutil.copyfile(os.path.join(recipe, 'acquisition.cfg'),
                        os.path.join(expdir, 'acquisition.cfg'))
    else:
        tools.safecopy(os.path.join(recipe, 'acquisition.cfg'),
                       os.path.join(expdir, 'acquisition.cfg'))

    shutil.copyfile(os.path.join(recipe, 'coder.cfg'),
                    os.path.join(expdir, 'coder.cfg'))
    shutil.copyfile(os.path.join(recipe, 'structure.xml'),
                    os.path.join(expdir, 'structure.xml'))

    shutil.copyfile(os.path.join(recipe, 'database.cfg'),
                    os.path.join(expdir, 'database.cfg'))
    shutil.copyfile(os.path.join(recipe, 'cross_validation_ppall.cfg'),
                    os.path.join(expdir, 'cross_validation_ppall.cfg'))

    acquisitionconf = ConfigParser()
    acquisitionconf.read(os.path.join(recipe, 'acquisition.cfg'))
    modelname = acquisitionconf.get('acquisition', 'name')

    shutil.copyfile(
        os.path.join(os.getcwd(), 'assist', 'acquisition', 'defaults',
                     modelname + '.cfg'),
        os.path.join(expdir, modelname + '.cfg'))

    #read the cross_validation config file
    expconf = ConfigParser()
    expconf.read(os.path.join(recipe, 'cross_validation.cfg'))

    #default conf file
    default = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           'defaults', 'cross_validation.cfg')

    #apply the defaults
    if os.path.exists(default):
        tools.default_conf(expconf, default)

    expconf = dict(expconf.items('cross_validation'))

    #read the data config file
    if not os.path.exists(os.path.join(recipe, 'database.cfg')):
        raise Exception('cannot find database.cfg in %s' % recipe)

    dataconf = ConfigParser()
    dataconf.read(os.path.join(recipe, 'database.cfg'))

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    # filter out all speakers with fewer than 100 examples
    # (in the FluentSpeechCommands dataset this is ~20% of the speakers)
    bad_spks = []
    if os.path.exists(os.path.join(recipe, 'FS_linecounts.txt')):
        for l in open(os.path.join(recipe, 'FS_linecounts.txt')):
            splitline = l.strip().split(' ')
            if int(splitline[1]) < 100:
                bad_spks.append(splitline[0])

    print bad_spks

    for speaker in dataconf.sections():
        if speaker in bad_spks:
            continue

        print 'speaker: %s' % (speaker)

        #create the speaker directory
        if os.path.isdir(os.path.join(expdir, speaker)):
            if overwrite:
                shutil.rmtree(os.path.join(expdir, speaker))
                os.makedirs(os.path.join(expdir, speaker))
        else:
            os.makedirs(os.path.join(expdir, speaker))

        #create a task structure file
        structure = Structure(os.path.join(expdir, 'structure.xml'))

        #create a coder
        coder = coder_factory.factory(coderconf.get('coder',
                                                    'name'))(structure,
                                                             coderconf)

        #read and code all the tasks
        labelvecs = []
        names = []
        taskstrings = dict()
        for line in open(dataconf.get(speaker, 'tasks')):
            splitline = line.strip().split(' ')
            name = speaker + '_' + splitline[0]
            names.append(name)
            taskstring = ' '.join(splitline[1:])
            taskstrings[name] = taskstring
            task = read_task(taskstring)
            labelvecs.append(coder.encode(task))

        #divide the data into blocks
        blocksfile = os.path.join(expdir, speaker, 'blocks.pkl')
        if os.path.exists(blocksfile):
            with open(blocksfile, 'rb') as fid:
                blocks = pickle.load(fid)
        else:
            blocks = make_blocks(np.array(labelvecs), expconf,
                                 dataconf.get(speaker, 'features'))
            with open(blocksfile, 'wb') as fid:
                pickle.dump(blocks, fid)

        #create train and test sets for all experiments

        #seed the random number generator
        random.seed(3105)
        trainids = [None] * (len(blocks) - 1)
        testids = [None] * (len(blocks) - 1)
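        #trainids[b][e] holds the utterance indices used for training with b+1
        #blocks in experiment e; testids[b][e] holds all remaining indices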
        for b in range(len(blocks) - 1):
            trainids[b] = [None] * int(expconf['numexp'])
            testids[b] = [None] * int(expconf['numexp'])
            for e in range(int(expconf['numexp'])):
                trainids[b][e] = list(
                    itertools.chain.from_iterable(random.sample(blocks,
                                                                b + 1)))
                testids[b][e] = [
                    x for x in range(len(names)) if x not in trainids[b][e]
                ]

        #read the feature files
        features = dict()
        for l in open(os.path.join(dataconf.get(speaker, 'features'),
                                   'feats')):
            splitline = l.strip().split(' ')
            featname = speaker + '_' + splitline[0]
            features[featname] = ' '.join(splitline[1:])

        #create an expdir for each experiment
        b = int(expconf['startblocks']) - 1

        while True:
            for e in range(int(expconf['numexp'])):

                print '     train blocks: %d, experiment %s' % (b + 1, e)

                #create the directory
                subexpdir = os.path.join(expdir, speaker,
                                         '%dblocks_exp%d' % (b + 1, e))

                if os.path.exists(os.path.join(subexpdir, 'f1')):
                    continue

                if not os.path.isdir(subexpdir):
                    os.makedirs(subexpdir)

                #create pointers to the config files
                tools.symlink(os.path.join(expdir, 'acquisition.cfg'),
                              os.path.join(subexpdir, 'acquisition.cfg'))
                tools.symlink(os.path.join(expdir, 'coder.cfg'),
                              os.path.join(subexpdir, 'coder.cfg'))
                tools.symlink(os.path.join(expdir, 'structure.xml'),
                              os.path.join(subexpdir, 'structure.xml'))
                tools.symlink(os.path.join(expdir, 'database.cfg'),
                              os.path.join(subexpdir, 'database.cfg'))

                if not os.path.exists(os.path.join(subexpdir, 'trainfeats')):
                    trainutts = [names[i] for i in trainids[b][e]]
                    print 'number of examples: %d' % len(trainutts)
                    testutts = [names[i] for i in testids[b][e]]

                    #create the train and test sets
                    tools.writefile(os.path.join(subexpdir, 'trainfeats'),
                                    {utt: features[utt]
                                     for utt in trainutts})
                    tools.writefile(
                        os.path.join(subexpdir, 'traintasks'),
                        {utt: taskstrings[utt]
                         for utt in trainutts})
                    tools.writefile(os.path.join(subexpdir, 'testfeats'),
                                    {utt: features[utt]
                                     for utt in testutts})
                    tools.writefile(
                        os.path.join(subexpdir, 'testtasks'),
                        {utt: taskstrings[utt]
                         for utt in testutts})

                if computing in ('condor', 'condor_gpu'):
                    #create the outputs directory
                    if not os.path.isdir(os.path.join(subexpdir, 'outputs')):
                        os.makedirs(os.path.join(subexpdir, 'outputs'))

                    if computing == 'condor_gpu':
                        jobfile = 'run_script_GPU.job'
                    else:
                        jobfile = 'run_script.job'

                    #only submit the job if it is not running yet
                    in_queue = os.popen(
                        'if condor_q -nobatch -wide | grep -q %s; '
                        'then echo true; else echo false; fi' %
                        subexpdir).read().strip() == 'true'

                    #submit the condor job
                    if not in_queue:
                        os.system('condor_submit expdir=%s script=train_test'
                                  ' assist/condor/%s' % (subexpdir, jobfile))
                else:
                    train_test.main(subexpdir)

            newb = (b + 1) * int(expconf['scale']) + int(
                expconf['increment']) - 1
            newb = min(newb, len(blocks) - 2)
            if b == newb:
                break
            else:
                b = newb
Example #5
    def fit(self, examples, h, parameters='ash'):
        '''fit the model parameters to the data

        Args:
            examples: the training examples as a list of pairs containing the
                inputs and reference tasks
            h: the initial value for the activations
            parameters: a string specifying the parameters to be updated: 'a'
                for the acoustic dictionary, 's' for the semantic dictionary
                and 'h' for the activations

        returns:
            the final activations h
        '''

        features, tasks = zip(*examples)

        #read all the tasks
        tasks = [read_task(task) for task in tasks]

        #encode the tasks
        vs_full = np.array([self.coder.encode(t) for t in tasks])
        self.knownlabels = np.where(vs_full.sum(0) > 0)[0]
        vs_full = vs_full[:, self.knownlabels]

        # apply weighting
        z = np.ones([1, vs_full.shape[0]])
        if 'label_weight_train' in self.conf.keys():
            weightingstrategy = self.conf['label_weight_train']
        else:
            weightingstrategy = "none"
            print(
                'Warning: "acquisition" config key "label_weight_train" set '
                'to "none"')
        if weightingstrategy == "frobNMF":
            # find nonnegative utterance weights z such that z*vs is roughly
            # constant over the labels; we want a maximally flat z, so add
            # ||z||^4 as a cost term
            frobregweight = float(self.conf['frob_nmf_regular'])
            #num = vs_full.dot(labeloccurrence)
            num = vs_full.sum(1, keepdims=True).transpose()
            crit = vs_full.sum(0)
            print 'Prior to weighting: labelratio %f' % (crit.max() /
                                                         (crit.min() + 1e-10))
            #num = np.ones(num.shape)
            for _ in range(10):
                y = z.dot(vs_full)
                den = vs_full.dot(np.transpose(y)).transpose()
                den = den + frobregweight * np.power(z, 3)
                z = z * num / (den + 1e-10)
            vs_full *= z.transpose()
            crit = vs_full.sum(0)
            print 'After weighting:    labelratio %f' % (crit.max() /
                                                         (crit.min() + 1e-10))
        # do nothing on "none"

        #use acoustic model for the features
        events = [self.acoustic(f) for f in features]

        #compute the hacs
        va_full = np.array([
            hac.hac(e, self.delays, int(self.conf['numkeep'])) for e in events
        ])
        va_full *= z.transpose()

        #only keep the acoustics that actually occur
        self.knownobs = np.where(va_full.sum(0) > 0)[0]
        va_full = va_full[:, self.knownobs]

        #make sure the semantics and inputs sum to the same value
        self.ac_scale = vs_full.sum() / va_full.sum()
        va_full = va_full * self.ac_scale

        #convert the data matrices to sparse matrices
        va = sparse.csr_matrix(va_full)
        vs = sparse.csr_matrix(vs_full)

        #only retain the known observations and labels
        ws = self.ws[:, self.knownlabels]
        wa = self.wa[:, self.knownobs]
        sv = np.array(va.sum(1) + vs.sum(1))
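        #sv holds the total acoustic plus semantic mass of every utterance and
        #is used to rescale the initial activations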

        #get the number of content words
        nc = self.ws.shape[0]

        #normalize
        h = h.clip(float(self.conf['floor']))
        h *= sv / (2 * h[:, :nc].sum(1, keepdims=True) +
                   h[:, nc:].sum(1, keepdims=True))
        ws = ws.clip(float(self.conf['floor']))
        wa = wa.clip(float(self.conf['floor']))
        ws /= ws.sum(1, keepdims=True)
        wa /= wa.sum(1, keepdims=True)

        #start iteration
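        #every iteration applies multiplicative update rules for NMF with a
        #generalized Kullback-Leibler cost: each factor is multiplied by the
        #ratio (V / (H.W)) projected onto the other factor and divided by a
        #normalizer, which keeps all entries nonnegative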
        for _ in range(int(self.conf['numiters_train'])):

            xs = h[:, :nc].dot(ws)
            cs = kld(vs, xs)
            xa = h.dot(wa)
            ca = kld(va, xa)

            print 'nmf cost %f = %f + %f' % (ca + cs, ca, cs)

            #update the semantic dictionary
            if 's' in parameters:
                qs = vs.multiply(1 / (h[:, :nc].dot(ws)))
                num = qs.transpose().dot(h[:, :nc]).transpose()
                den = h[:, :nc].sum(0)[:, np.newaxis]
                ws *= num / den
                ws /= ws.sum(1, keepdims=True)
                ws = ws.clip(float(self.conf['floor']))

            #update the acoustic dictionary
            if 'a' in parameters:
                qa = va.multiply(1 / (h.dot(wa)))
                num = qa.transpose().dot(h).transpose()
                den = h.sum(0)[:, np.newaxis]
                wa *= num / den
                wa /= wa.sum(1, keepdims=True)
                wa = wa.clip(float(self.conf['floor']))

            #update the activations
            if 'h' in parameters:
                qs = vs.multiply(1 / (h[:, :nc].dot(ws)))
                qa = va.multiply(1 / (h.dot(wa)))
                h[:, :nc] *= (qs.dot(ws.transpose()) +
                              qa.dot(wa[:nc].transpose()))
                h[:, :nc] /= wa[:nc].sum(1) + ws.sum(1)
                h[:, nc:] *= qa.dot(wa[nc:].transpose())
                h[:, nc:] /= wa[nc:].sum(1)
                h = h.clip(float(self.conf['floor']))

        self.ws = np.zeros(self.ws.shape)
        self.ws[:, self.knownlabels] = ws
        self.wa = np.zeros(self.wa.shape)
        self.wa[:, self.knownobs] = wa

        return h
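The kld helper used for the cost computation above is not included in this excerpt. A minimal sketch of a generalized Kullback-Leibler divergence that matches the way it is called here (sparse targets v, dense reconstruction x) could look like the following; the exact implementation in the repository may differ:

import numpy as np

def kld(v, x):
    '''generalized KL divergence sum(v*log(v/x) - v + x) (sketch, not the original helper)'''
    v = np.asarray(v.todense())
    x = np.asarray(x, dtype=float)
    #only evaluate the log term where v is nonzero to avoid log(0)
    logterm = np.zeros_like(x)
    mask = v > 0
    logterm[mask] = v[mask] * np.log(v[mask] / np.maximum(x[mask], 1e-30))
    return float(np.sum(logterm - v + x))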
Example #6
File: test.py  Project: vrenkens/assist
def main(expdir):
    '''main function'''

    #check if this experiment has been completed
    if os.path.exists(os.path.join(expdir, 'f1')):
        print 'result found %s' % expdir
        return

    #read the acquisition config file
    acquisitionconf = ConfigParser()
    acquisitionconf.read(os.path.join(expdir, 'acquisition.cfg'))

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    #create a task structure file
    structure = Structure(os.path.join(expdir, 'structure.xml'))

    #create a coder
    coder = coder_factory.factory(coderconf.get('coder', 'name'))(structure,
                                                                  coderconf)

    #create an acquisition model
    model = model_factory.factory(acquisitionconf.get('acquisition',
                                                      'name'))(acquisitionconf,
                                                               coder, expdir)

    print 'loading model'
    model.load(os.path.join(expdir, 'model'))

    print 'prepping testing data'

    #load the testing features
    features = dict()
    for line in open(os.path.join(expdir, 'testfeats')):
        splitline = line.strip().split(' ')
        featsfile = ' '.join(splitline[1:])
        features[splitline[0]] = np.load(featsfile)

    #read the testtasks
    references = dict()
    for line in open(os.path.join(expdir, 'testtasks')):
        splitline = line.strip().split(' ')
        references[splitline[0]] = read_task.read_task(' '.join(splitline[1:]))

    print 'testing the model'

    #decode the test utterances
    decoded = model.decode(features)

    #write the decoded tasks to disc
    with open(os.path.join(expdir, 'dectasks'), 'w') as fid:
        for name, task in decoded.items():
            fid.write('%s %s\n' % (name, read_task.to_string(task)))

    (precision, recal, f1, macroprec, macrorecall, macrof1), scores = \
        score.score(decoded, references)

    print 'precision: %f' % precision
    print 'recal: %f' % recal
    print 'f1: %f' % f1
    print 'macro precision: %f' % macroprec
    print 'macro recal: %f' % macrorecall
    print 'macro f1: %f' % macrof1

    with open(os.path.join(expdir, 'precision'), 'w') as fid:
        fid.write(str(precision))
    with open(os.path.join(expdir, 'recal'), 'w') as fid:
        fid.write(str(recal))
    with open(os.path.join(expdir, 'f1'), 'w') as fid:
        fid.write(str(f1))
    with open(os.path.join(expdir, 'macroprecision'), 'w') as fid:
        fid.write(str(macroprec))
    with open(os.path.join(expdir, 'macrorecal'), 'w') as fid:
        fid.write(str(macrorecall))
    with open(os.path.join(expdir, 'macrof1'), 'w') as fid:
        fid.write(str(macrof1))

    score.write_scores(scores, expdir)
Example #7
    def train(self, examples):
        '''train the model

        Args:
            examples: the training examples as a dictionary of pairs containing
                the inputs and reference tasks
        '''

        self.is_training = True

        #create the graph
        graph = tf.Graph()

        features, tasks = zip(*examples.values())

        #read all the tasks
        tasks = [read_task(task) for task in tasks]

        #encode the tasks
        vs = np.array([self.coder.encode(t) for t in tasks])

        if self.conf['batch_size'] == 'None':
            batch_size = len(features)
        else:
            batch_size = min(int(self.conf['batch_size']), len(features))

        with graph.as_default():
            #create placeholders for the features and sequence lengths
            inputs = tf.placeholder(
                dtype=tf.float32,
                shape=[batch_size, None, features[0].shape[-1]],
                name='inputs')
            seq_length = tf.placeholder(dtype=tf.int32,
                                        shape=[batch_size],
                                        name='seq_length')

            #create a placeholder for the targets
            targets = tf.placeholder(dtype=tf.float32,
                                     shape=[batch_size, vs.shape[-1]],
                                     name='targets')

            #apply the model
            probs = self.model(inputs, seq_length)

            loss = self.loss(targets, probs)

            #count the number of parameters
            num_params = 0
            for var in tf.trainable_variables():
                num_params += reduce(mul, var.get_shape().as_list())
            print 'number of parameters: %d' % num_params

            #create an optimizer
            optimizer = tf.train.AdamOptimizer(
                float(self.conf['learning_rate']))

            #compute the gradients
            grads_and_vars = optimizer.compute_gradients(loss=loss)

            with tf.variable_scope('clip'):
                #clip the gradients
                grads_and_vars = [(tf.clip_by_value(grad, -1., 1.), var)
                                  for grad, var in grads_and_vars]

            #operation to apply the gradients
            apply_gradients_op = optimizer.apply_gradients(
                grads_and_vars=grads_and_vars, name='apply_gradients')

            #all remaining operations with the UPDATE_OPS GraphKeys
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            #create an operation to update the model
            update_op = tf.group(*([apply_gradients_op] + update_ops),
                                 name='update')

            #create the init op
            init_op = tf.variables_initializer(tf.global_variables())

            #create a saver
            saver = tf.train.Saver()

            #create a summary
            for var in tf.trainable_variables():
                tf.summary.histogram(var.name, var)
            tf.summary.scalar('loss', loss)

            if self.conf['images'] == 'True':
                images = tf.get_collection('image')
                for image in images:
                    tf.summary.image(image.name, image)

            summary = tf.summary.merge_all()

        #create a session
        session_conf = tf.ConfigProto(inter_op_parallelism_threads=1,
                                      intra_op_parallelism_threads=4)
        sess = tf.Session(graph=graph, config=session_conf)

        #create a summary writer
        writer = tf.summary.FileWriter(os.path.join(self.expdir, 'logdir'),
                                       graph)

        #initialize the model
        sess.run(init_op)

        #create an index queue
        index_queue = []
        for _ in range(int(self.conf['numiters'])):
            i = range(len(tasks))
            shuffle(i)
            index_queue += i

        #iteratively train the model
        i = 0
        while len(index_queue) > batch_size:
            indices = index_queue[:batch_size]
            index_queue = index_queue[batch_size:]
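            #gather the batch and zero-pad every utterance to the length of
            #the longest utterance in the batch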
            batch_inputs = [features[j] for j in indices]
            batch_lengths = np.array([f.shape[0] for f in batch_inputs])
            ml = np.max(batch_lengths)
            batch_inputs = np.array([
                np.pad(f, ((0, ml - f.shape[0]), (0, 0)), 'constant')
                for f in batch_inputs
            ])
            if i == 'a':
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
            else:
                run_options = run_metadata = None
            _, s, l = sess.run(
                (update_op, summary, loss),
                feed_dict={
                    inputs: batch_inputs,
                    seq_length: batch_lengths,
                    targets: vs[indices]
                },
                options=run_options,
                run_metadata=run_metadata)
            print 'step %d: loss = %f' % (i, l)

            #Early stopping
            if i == 0:
                base_l = l
            if l <= base_l * float(self.conf['early_stop']):
                break

            writer.add_summary(s, i)
            if i == 'a':
                writer.add_run_metadata(run_metadata, 'statistics')
            i += 1

        #save the final model
        saver.save(sess, os.path.join(self.expdir, 'logdir', 'model.ckpt'))

        sess.close()

        self.is_training = False
Example #8
def main(expdir, recipe, computing):
    '''main function'''

    overwrite = False
    if os.path.isdir(expdir):
        text = ''
        while text not in ('o', 'r'):
            text = raw_input('%s already exists, do you want to '
                             'resume experiment (r) or overwrite (o) '
                             '(respond with o or r)' % expdir)
        if text == 'o':
            overwrite = True

    else:
        #create the experiments directory
        os.makedirs(expdir)

    #copy the config files
    if overwrite:
        shutil.copyfile(os.path.join(recipe, 'acquisition.cfg'),
                        os.path.join(expdir, 'acquisition.cfg'))
    else:
        tools.safecopy(os.path.join(recipe, 'acquisition.cfg'),
                       os.path.join(expdir, 'acquisition.cfg'))

    shutil.copyfile(os.path.join(recipe, 'coder.cfg'),
                    os.path.join(expdir, 'coder.cfg'))
    shutil.copyfile(os.path.join(recipe, 'structure.xml'),
                    os.path.join(expdir, 'structure.xml'))
    shutil.copyfile(os.path.join(recipe, 'database.cfg'),
                    os.path.join(expdir, 'database.cfg'))
    shutil.copyfile(os.path.join(recipe, 'cross_validation_ppall.cfg'),
                    os.path.join(expdir, 'cross_validation_ppall.cfg'))

    acquisitionconf = ConfigParser()
    acquisitionconf.read(os.path.join(recipe, 'acquisition.cfg'))
    modelname = acquisitionconf.get('acquisition', 'name')
    shutil.copyfile(
        os.path.join(os.getcwd(), 'assist', 'acquisition', 'defaults',
                     modelname + '.cfg'),
        os.path.join(expdir, modelname + '.cfg'))

    #read the cross_validation config file
    expconf = ConfigParser()
    expconf.read(os.path.join(recipe, 'cross_validation_ppall.cfg'))

    #default conf file
    default = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           'defaults', 'cross_validation_ppall.cfg')

    #apply the defaults
    if os.path.exists(default):
        tools.default_conf(expconf, default)

    expconf = dict(expconf.items('cross_validation_ppall'))

    #read the data config file
    if not os.path.exists(os.path.join(recipe, 'database.cfg')):
        raise Exception('cannot find database.cfg in %s' % recipe)

    dataconf = ConfigParser()
    dataconf.read(os.path.join(recipe, 'database.cfg'))

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    # for word specific thresholds (not used anymore)
    #if os.path.isfile(os.path.join(recipe,'word_thresholds.pkl')):
    #    print('File with wordthresholds found in recipe')
    #    shutil.copyfile(os.path.join(recipe, 'word_thresholds.pkl'),
    #        os.path.join(expdir, 'word_thresholds.pkl'))
    #    thresholdsarepresent = True
    #else:
    #    print('No file found with wordthresholds, using a fixed one')
    #    thresholdsarepresent = False

    labelvecs = []
    names = []
    taskstrings = dict()
    features = dict()

    print 'Searching for all speakers...'

    for speaker in dataconf.sections():

        print '     speaker: %s' % (speaker)

        #create a task structure file
        structure = Structure(os.path.join(expdir, 'structure.xml'))
        #create a coder
        coder = coder_factory.factory(coderconf.get('coder',
                                                    'name'))(structure,
                                                             coderconf)
        # see typesplit_coder.py (line 51) for all labels and their corresponding output capsule numbers

        #read and code all the tasks
        for line in open(dataconf.get(speaker, 'tasks')):
            #e.g. 'recording1_Voice_10 <move_rel direction="forward" distance="little" throttle="fast" />'
            splitline = line.strip().split(' ')
            name = speaker + '_' + splitline[0]  #e.g. 'recording1_Voice_10'
            names.append(name)
            taskstring = ' '.join(splitline[1:])
            #e.g. '<move_rel direction="forward" distance="little" throttle="fast" />'
            taskstrings[name] = taskstring
            task = read_task(taskstring)
            labelvecs.append(coder.encode(task))

        #read the feature files
        for l in open(os.path.join(dataconf.get(speaker, 'features'),
                                   'feats')):
            #e.g. ['recording1_Voice_10', '/esat/spchtemp/scratch/r0580562/databases/grabo_features/pp2/recording1_Voice_10.npy']
            splitline = l.strip().split(' ')
            featname = speaker + '_' + splitline[0]
            features[featname] = ' '.join(splitline[1:])

    print 'Divide data into blocks...'
    #divide the data into blocks; look for an existing blocks file in the
    #recipe, because creating the blocks takes a very long time
    blocksfile = os.path.join(recipe, 'blocks.pkl')
    if os.path.exists(blocksfile):
        print 'Loading existing blocks file (check that the number of blocks is still the same)'
        with open(blocksfile, 'rb') as fid:
            blocks = pickle.load(fid)
    else:
        print 'No blocks file found in the recipe, making a new one'
        #blocks is a list of lists of utterance indices (roughly 1 to 350)
        blocks = make_blocks(np.array(labelvecs), expconf, expdir)
        with open(blocksfile, 'wb') as fid:
            pickle.dump(blocks, fid)

    print 'Shuffle speakers...'

    # look for existing train and test sets ('saved_ids' in the recipe) and
    # load them, because creating them takes a very long time

    sets_properties = {}
    if os.path.isdir(os.path.join(recipe, 'saved_ids')):
        saved_ids = ConfigParser()
        saved_ids.read(
            os.path.join(recipe, 'saved_ids', 'cross_validation_ppall.cfg'))
        sets_properties = dict(saved_ids.items('cross_validation_ppall'))
    else:
        sets_properties['numblocks'] = 0
        sets_properties['numexp'] = 0

    if (sets_properties['numblocks']
            == expconf['numblocks']) and (sets_properties['numexp']
                                          == expconf['numexp']):
        print '     Loading saved train and test sets from the recipe'
        trainids_saved = os.path.join(recipe, 'saved_ids', 'trainids.pkl')
        with open(trainids_saved, 'rb') as fid:
            trainids = pickle.load(fid)
        testids_saved = os.path.join(recipe, 'saved_ids', 'testids.pkl')
        with open(testids_saved, 'rb') as fid:
            testids = pickle.load(fid)
    else:
        print '     No saved train and test sets found with the same cross-validation configuration in the recipe'
        # seed the random number generator
        random.seed(3105)
        trainids = [None] * (len(blocks) - 1)  #len(blocks)=15
        testids = [None] * (len(blocks) - 1)
        print '     Number of blocks: %d' % (len(blocks))
        b = 0
        while b < (len(blocks) - 1):
            #for b in range(len(blocks)-1):
            print '         block %d' % b
            trainids[b] = [None] * int(expconf['numexp'])
            testids[b] = [None] * int(expconf['numexp'])
            for e in range(int(expconf['numexp'])):
                trainids[b][e] = list(
                    itertools.chain.from_iterable(random.sample(blocks,
                                                                b + 1)))
                testids[b][e] = [
                    x for x in range(len(names)) if x not in trainids[b][e]
                ]
            #scale factor: use more small blocks and fewer big blocks (the learning curve saturates)
            newb = int(
                np.floor((b + 1) * float(expconf['scale']) +
                         int(expconf['increment']) - 1))
            newb = min(newb, len(blocks) - 2)
            if b == newb:
                break
            else:
                b = newb

        os.makedirs(os.path.join(expdir, 'saved_ids'))
        trainids_saved = os.path.join(expdir, 'saved_ids', 'trainids.pkl')
        testids_saved = os.path.join(expdir, 'saved_ids', 'testids.pkl')
        with open(trainids_saved, 'wb') as fid:
            pickle.dump(trainids, fid)
        with open(testids_saved, 'wb') as fid:
            pickle.dump(testids, fid)
        shutil.copyfile(
            os.path.join(recipe, 'cross_validation_ppall.cfg'),
            os.path.join(expdir, 'saved_ids', 'cross_validation_ppall.cfg'))

    #create an expdir for each experiment
    b = int(expconf['startblocks']) - 1  #0

    print 'Launch the experiments...'
    while True:
        for e in range(int(expconf['numexp'])):

            print '     train blocks: %d, experiment %s' % (b + 1, e)

            #create the directory
            subexpdir = os.path.join(expdir, '%dblocks_exp%d' % (b + 1, e))

            if os.path.exists(os.path.join(subexpdir, 'f1')):
                continue

            if not os.path.isdir(subexpdir):
                os.makedirs(subexpdir)

            #create pointers to the config files
            tools.symlink(os.path.join(expdir, 'acquisition.cfg'),
                          os.path.join(subexpdir, 'acquisition.cfg'))
            tools.symlink(os.path.join(expdir, 'coder.cfg'),
                          os.path.join(subexpdir, 'coder.cfg'))
            tools.symlink(os.path.join(expdir, 'structure.xml'),
                          os.path.join(subexpdir, 'structure.xml'))
            tools.symlink(os.path.join(expdir, 'database.cfg'),
                          os.path.join(subexpdir, 'database.cfg'))
            #if thresholdsarepresent:
            #    tools.symlink(os.path.join(expdir, 'word_thresholds.pkl'),
            #                  os.path.join(subexpdir, 'word_thresholds.pkl'))

            if not os.path.exists(os.path.join(subexpdir, 'trainfeats')):
                trainutts = [names[i] for i in trainids[b][e]]
                print 'number of examples: %d' % len(trainutts)
                testutts = [names[i] for i in testids[b][e]]

                #create the train and test sets
                tools.writefile(os.path.join(subexpdir, 'trainfeats'),
                                {utt: features[utt]
                                 for utt in trainutts})
                tools.writefile(os.path.join(subexpdir, 'traintasks'),
                                {utt: taskstrings[utt]
                                 for utt in trainutts})
                tools.writefile(os.path.join(subexpdir, 'testfeats'),
                                {utt: features[utt]
                                 for utt in testutts})
                tools.writefile(os.path.join(subexpdir, 'testtasks'),
                                {utt: taskstrings[utt]
                                 for utt in testutts})

            if computing in ('condor', 'condor_gpu'):
                #create the outputs directory
                if not os.path.isdir(os.path.join(subexpdir, 'outputs')):
                    os.makedirs(os.path.join(subexpdir, 'outputs'))

                if computing == 'condor_gpu':
                    jobfile = 'run_script_GPU.job'
                else:
                    jobfile = 'run_script.job'

                #only submit the job if it is not running yet
                in_queue = os.popen('if condor_q -nobatch -wide | grep -q %s; '
                                    'then echo true; else echo false; fi' %
                                    subexpdir).read().strip() == 'true'

                #submit the condor job
                if not in_queue:
                    os.system('condor_submit expdir=%s script=train_test'
                              ' assist/condor/%s' % (subexpdir, jobfile))
            else:
                train_test.main(subexpdir)

        newb = int(
            np.floor((b + 1) * float(expconf['scale']) +
                     int(expconf['increment']) - 1))
        newb = min(newb, len(blocks) - 2)
        if b == newb:
            break
        else:
            b = newb
Example #9
def main(expdir):
    '''main function'''

    #check if this experiment has been completed
    if os.path.exists(os.path.join(expdir, 'f1')):
        print 'result found %s' % expdir
        return

    #read the acquisition config file
    acquisitionconf = ConfigParser()
    acquisitionconf.read(os.path.join(expdir, 'acquisition.cfg'))

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    #create a task structure file
    structure = Structure(os.path.join(expdir, 'structure.xml'))

    #create a coder
    coder = coder_factory.factory(coderconf.get('coder', 'name'))(structure,
                                                                  coderconf)

    #create an acquisition model
    model = model_factory.factory(acquisitionconf.get('acquisition',
                                                      'name'))(acquisitionconf,
                                                               coder, expdir)

    print 'loading model'
    model.load(os.path.join(expdir, 'model'))

    print 'prepping testing data'

    #load the testing features
    features = dict()
    for line in open(os.path.join(expdir, 'testfeats')):
        splitline = line.strip().split(' ')
        featsfile = ' '.join(splitline[1:])
        features[splitline[0]] = np.load(featsfile)

    #read the testtasks
    references = dict()
    tasklabels = dict()
    for line in open(os.path.join(expdir, 'testtasks')):
        splitline = line.strip().split(' ')
        taskstring = read_task.read_task(' '.join(splitline[1:]))
        references[splitline[0]] = taskstring
        tasklabels[splitline[0]] = coder.encode(taskstring)

    #find all words said by speakers and save all spoken sentences in testtasks
    wordcount = {}
    testsentences = {}
    dataconf = ConfigParser()
    dataconf.read(os.path.join(expdir, 'database.cfg'))
    for speaker in dataconf.sections():
        taskloc = dataconf.get(speaker, 'tasks')
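        #the transcription file has the same path as the tasks file, with the
        #'tasks' suffix replaced by 'text'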
        textloc = taskloc[:-5] + str('text')
        with open(textloc) as fp:
            line = (fp.readline())[:-1]
            while line:
                sentence = (line.split(" "))[1:]
                voice = str(speaker) + '_' + (line.split(" "))[0]
                for word in sentence:
                    if word in wordcount:
                        wordcount[word] += 1
                    else:
                        wordcount[word] = 1
                if voice in references:
                    testsentences[voice] = " ".join(sentence)
                line = (fp.readline())[:-1]
    all_words = sorted(wordcount.keys())  #ordered alphabetically

    #read the singlebest word for each label
    singlebest = []
    with open(os.path.join(expdir, 'singlebestwords'), 'r') as fp:
        line = (fp.readline())[:-1]
        while line:
            word = (line.split(" "))[1]
            singlebest.append(word)
            line = (fp.readline())[:-1]

    #get the single best predictions for each voice
    singlebest_sentences = {}
    for voice in tasklabels:
        label = tasklabels[voice]
        indices = np.nonzero(label)
        sentence = ''
        print(indices)
        for i in indices[0]:
            word = singlebest[i]
            sentence = sentence + ' ' + str(word)
        singlebest_sentences[voice] = sentence

    with open(os.path.join(expdir, 'singlebestsentences'), 'w') as fid:
        for name, sentence in sorted(singlebest_sentences.items()):
            fid.write('%s %s\n' % (name, sentence))

    print 'testing the model'

    word_thresholds = None
    # optionally load word-specific thresholds if a thresholds file is present
    threshfile = os.path.join(expdir, 'word_thresholds.pkl')
    if os.path.isfile(threshfile):
        with open(threshfile, 'r') as fid:
            word_dict = pickle.load(fid)
        word_thresholds = []
        for word, thresh in sorted(word_dict.items()):
            word_thresholds.append(thresh)

    print 'LAUNCHING MODEL.DECODE'
    decoded_speakers = dict()
    decoded_words = dict()

    # filter out features that are too short
    # (+ corresponding references to not get errors in score.score)
    for k in features.keys():
        if features[k].shape[0] <= 5:
            del features[k]
            del references[k]

    #decode the test utterances
    if acquisitionconf.get('acquisition', 'name') == 'nmf':
        decoded = model.decode(features)
    else:
        #### decoded, decoded_speakers, decoded_words = model.decode(features, all_words, word_thresholds, tasklabels)
        decoded, decoded_speakers, decoded_words = model.decode(
            features, all_words, word_thresholds)
    #decoded = model.decode(features)

    #write the decoded tasks to disc
    with open(os.path.join(expdir, 'dectasks'), 'w') as fid:
        for name, task in decoded.items():
            fid.write('%s %s\n' % (name, read_task.to_string(task)))

    if acquisitionconf.get('acquisition', 'name') != 'nmf':
        with open(os.path.join(expdir, 'decspeakers'), 'w') as fid:
            for name, spk in decoded_speakers.items():
                fid.write('%s %s\n' % (name, spk))

        speakerperformance = score.spk_score(decoded_speakers)
        print 'speakerperformance: %f' % speakerperformance

        with open(os.path.join(expdir, 'speakerperformance'), 'w') as fid:
            fid.write(str(speakerperformance))

        decoded_sentences = {}
        with open(os.path.join(expdir, 'decwords'), 'w') as fid:
            for name, wordslist in sorted(decoded_words.items()):
                sentence = ''
                wordindices = np.nonzero(
                    wordslist)  #returns all indices that are nonzero
                for wordindex in wordindices[0]:
                    word = all_words[wordindex]
                    sentence = sentence + ' ' + str(word)
                fid.write('%s %s\n' % (name, sentence))
                decoded_sentences[name] = sentence

        with open(os.path.join(expdir, 'testwords'), 'w') as fid:
            for name, sentence in sorted(testsentences.items()):
                fid.write('%s %s\n' % (name, sentence))

        word_f1, word_precision, word_recal = score.wordscore(
            decoded_sentences, testsentences)

        singlebest_f1, singlebest_precision, singlebest_recal = score.wordscore(
            singlebest_sentences, testsentences)

        print 'word_f1: %f' % word_f1
        print 'word_precision: %f' % word_precision
        print 'word_recal: %f' % word_recal
        with open(os.path.join(expdir, 'word_f1'), 'w') as fid:
            fid.write(str(word_f1))
        with open(os.path.join(expdir, 'word_precision'), 'w') as fid:
            fid.write(str(word_precision))
        with open(os.path.join(expdir, 'word_recal'), 'w') as fid:
            fid.write(str(word_recal))

        print 'singlebest_f1: %f' % singlebest_f1
        print 'singlebest_precision: %f' % singlebest_precision
        print 'singlebest_recal: %f' % singlebest_recal
        with open(os.path.join(expdir, 'singlebest_f1'), 'w') as fid:
            fid.write(str(singlebest_f1))
        with open(os.path.join(expdir, 'singlebest_precision'), 'w') as fid:
            fid.write(str(singlebest_precision))
        with open(os.path.join(expdir, 'singlebest_recal'), 'w') as fid:
            fid.write(str(singlebest_recal))

    (precision, recal, f1, macroprec, macrorecall, macrof1), scores = \
        score.score(decoded, references)

    fluent_accuracy = score.fluentscore(decoded, references)
    print 'fluent_accuracy: %f' % fluent_accuracy
    with open(os.path.join(expdir, 'fluent_accuracy'), 'w') as fid:
        fid.write(str(fluent_accuracy))

    print 'precision: %f' % precision
    print 'recal: %f' % recal
    print 'f1: %f' % f1
    print 'macro precision: %f' % macroprec
    print 'macro recal: %f' % macrorecall
    print 'macro f1: %f' % macrof1

    with open(os.path.join(expdir, 'precision'), 'w') as fid:
        fid.write(str(precision))
    with open(os.path.join(expdir, 'recal'), 'w') as fid:
        fid.write(str(recal))
    with open(os.path.join(expdir, 'f1'), 'w') as fid:
        fid.write(str(f1))
    with open(os.path.join(expdir, 'macroprecision'), 'w') as fid:
        fid.write(str(macroprec))
    with open(os.path.join(expdir, 'macrorecal'), 'w') as fid:
        fid.write(str(macrorecall))
    with open(os.path.join(expdir, 'macrof1'), 'w') as fid:
        fid.write(str(macrof1))

    score.write_scores(scores, expdir)

    for f in glob.glob('%s*' % os.path.join(expdir, 'logdir')):
        shutil.rmtree(f, ignore_errors=True)

    for f in glob.glob('%s*' % os.path.join(expdir, 'logdir-decode')):
        shutil.rmtree(f, ignore_errors=True)
Example #10
def main(expdir):
    '''main function'''

    #check if this experiment has been completed
    if os.path.isdir(os.path.join(expdir, 'model')):
        return

    #read the acquisition config file
    acquisitionconf = ConfigParser()
    acquisitionconf.read(os.path.join(expdir, 'acquisition.cfg'))

    #read the coder config file
    coderconf = ConfigParser()
    coderconf.read(os.path.join(expdir, 'coder.cfg'))

    #create a task structure file
    structure = Structure(os.path.join(expdir, 'structure.xml'))

    #create a coder
    coder = coder_factory.factory(coderconf.get('coder', 'name'))(
        structure, coderconf)

    #create an acquisition model
    model = model_factory.factory(acquisitionconf.get('acquisition', 'name'))(
        acquisitionconf, coder, expdir)

    print 'prepping training data'

    #load the training features
    features = dict()
    for line in open(os.path.join(expdir, 'trainfeats')):
        splitline = line.strip().split(' ')
        featsfile = ' '.join(splitline[1:])
        features[splitline[0]] = np.load(featsfile)

    #read the traintasks
    taskstrings = dict()
    for line in open(os.path.join(expdir, 'traintasks')):
        splitline = line.strip().split(' ')
        taskstrings[splitline[0]] = ' '.join(splitline[1:])

    #encode one task to determine the number of labels
    task = read_task(taskstrings[splitline[0]])
    label = coder.encode(task)
    num_labels = len(label)
    print('num_labels: ', num_labels)

    #find all words said by speakers and count number of occurrences
    wordcount = {}
    wordcount_in_traintasks = {}
    sentencecount = 0
    sentencecount_in_traintasks = 0
	
    #save file with transcription of traintasks
    trainsentences = {}
    unique_sentencecount = {}
	
    dataconf = ConfigParser()
    dataconf.read(os.path.join(expdir, 'database.cfg'))
    for speaker in dataconf.sections():
        taskloc = dataconf.get(speaker, 'tasks')
        textloc = taskloc[:-5]+str('text')
        with open(textloc) as fp:
            line = (fp.readline())[:-1]
            while line:
                sentencecount += 1
                voice = str(speaker)+'_'+(line.split(" "))[0]
                sentence = (line.split(" "))[1:]
                for word in sentence:
                    if word in wordcount:
                        wordcount[word] += 1
                    else:
                        wordcount[word] = 1
                if voice in taskstrings:
                    trainsentences[voice] = " ".join(sentence)
                    sentencecount_in_traintasks += 1
                    for word in sentence:
                        if word in wordcount_in_traintasks:
                            wordcount_in_traintasks[word] += 1
                        else:
                            wordcount_in_traintasks[word] = 1
                    uniquewords = list(dict.fromkeys(sentence))
                    for word in uniquewords:
                        if word in unique_sentencecount:
                            unique_sentencecount[word] += 1
                        else:
                            unique_sentencecount[word] = 1
                line = (fp.readline())[:-1]
    all_words = sorted(wordcount.keys())  #ordered alphabetically
	
    #balancing factors: scale the loss so that the present and absent
    #occurrences of each word contribute equally on average
    wordfactors = np.zeros((2, len(all_words)), dtype=np.float32)
    for i in range(0, len(all_words)):
        word = all_words[i]
        if word not in wordcount_in_traintasks:
            continue
        count = wordcount_in_traintasks[word]
        if count == sentencecount_in_traintasks:
            count -= 1
        wordfactors[0, i] = float(sentencecount_in_traintasks) / (2 * count)  #factor for present
        wordfactors[1, i] = float(sentencecount_in_traintasks) / (2 * (sentencecount_in_traintasks - count))  #factor for absent
        print('Word: ', word, '     Wordfactors (present vs absent): ',
              wordfactors[0, i], ' and ', wordfactors[1, i])
    
    #create lists of features and training tasks
    examples = {utt: (features[utt], taskstrings[utt]) for utt in taskstrings}
	
    #calculate the TF
    #for every tasklabel, count frequency of words in trainsentences corresponding to each tasklabel
    wordfreq_matrix = np.zeros((num_labels,len(all_words)))
    for voice in taskstrings:
        taskstring = taskstrings[voice]
        task = read_task(taskstring)
        labels = coder.encode(task)
        sentence = trainsentences[voice]
        for word in sentence.split(" "):
            wordind = all_words.index(word)
            wordfreq_matrix[:,wordind] += labels  # term frequency TF
	
    #calculate the IDF
    N = sentencecount_in_traintasks  # total number of sentences in the trainingset
    D = np.zeros(len(all_words))  # vector with for each word in how many sentences in trainingset the word occurs
    for word in unique_sentencecount:
        wordind = all_words.index(word)
        count = unique_sentencecount[word]
        D[wordind] = count
    IDF = np.array([np.log(N/(1+x)) for x in D])  # inverse document frequency
	
    TFIDF = wordfreq_matrix * IDF  # TFIDF matrix
	
    print(coder.argindices.items())  # get all labels and corresponding capsule number

    #save the single best chosen to a file
    with open(os.path.join(expdir, 'singlebestwords'), 'w') as fid:
        for i in range(0, num_labels):
            singlebest_ind = np.argmax(TFIDF[i,:])
            singlebest = all_words[singlebest_ind]
            fid.write('%i %s\n' % (i, singlebest))  #TODO: also include the label name here
			
    #save the trainsentences to a file
    with open(os.path.join(expdir, 'trainwords'),'w') as fid:
        for name, sentence in sorted(trainsentences.items()):
            fid.write('%s %s\n' % (name, sentence))
    
    if acquisitionconf.get('acquisition', 'name') == 'nmf':
        model.train(examples)
    else:
        print 'training acquisition model (LAUNCHING MODEL.TRAIN)'
        model.train(examples, all_words, wordfactors)
    
    #model.train(examples)

    #save the trained model
    model.save(os.path.join(expdir, 'model'))
Example #11
    def train(self, examples, all_words, wordfactors):
        '''train the model

        Args:
            examples: the training examples as a dictionary of pairs containing
                the inputs and reference tasks
            all_words: list of all words said by any speaker, ordered
                alphabetically
            wordfactors: array of shape [2, len(all_words)] with loss-balancing
                factors for present and absent word occurrences
        '''

        self.is_training = True

        #create the graph
        graph = tf.Graph()

        for k in examples.keys():
            if (examples[k][0].shape[0] <= 5) or not np.isfinite(
                    examples[k][0]).all():
                del examples[k]

        features, tasks = zip(*examples.values())
        voices = examples.keys()

        #encode the speakers
        dataconf = ConfigParser()
        dataconf.read(os.path.join(self.expdir, 'database.cfg'))
        nr_spk = len(dataconf.sections())
        print('nr_spk: ', nr_spk)
        speakers_ordered = dataconf.sections()
        print(speakers_ordered)

        encoded_spk = np.zeros(len(tasks))  # index of each utterance's speaker
        cnt = 0
        for voice in voices:
            spk_id = '_'.join(voice.split('_')[:1])
            encoded_spk[cnt] = speakers_ordered.index(spk_id)
            cnt += 1

        #encode all the words
        encoded_words = np.zeros((len(all_words), len(tasks)))
        voicestemp = list(voices)
        for speaker in speakers_ordered:
            result = sorted(
                [x for x in voicestemp if x.startswith(speaker + '_')])
            voicestemp = [x for x in voicestemp if x not in result]

            if result:  #check if not empty
                taskloc = dataconf.get(speaker, 'tasks')
                #the written-out text sits next to the tasks file: replace the
                #'tasks' suffix with 'text'
                textloc = taskloc[:-5] + str('text')

                with open(textloc) as fp:
                    line = (fp.readline())[:-1]

                    while line:
                        voice = speaker + '_' + str((line.split(" "))[0])
                        sentence = (line.split(" "))[1:]

                        if voice in result:
                            corr_index = voices.index(voice)

                            for word in sentence:
                                pos = all_words.index(word)
                                encoded_words[pos, corr_index] = 1

                        line = (fp.readline())[:-1]
        encoded_words = np.transpose(encoded_words)
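        #encoded_words is now a (number of utterances) x (number of words)
        #multi-hot matrix indicating which words occur in each utterance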

        #read all the tasks
        tasks = [read_task(task) for task in tasks]

        #encode the tasks
        vs = np.array([self.coder.encode(t) for t in tasks])

        if self.conf['batch_size'] == 'None':
            batch_size = len(features)
        else:
            batch_size = min(int(self.conf['batch_size']), len(features))

        with graph.as_default():
            #put the features in a constant
            inputs = tf.placeholder(
                dtype=tf.float32,
                shape=[batch_size, None, features[0].shape[-1]],
                name='inputs')
            seq_length = tf.placeholder(dtype=tf.int32,
                                        shape=[batch_size],
                                        name='seq_length')

            #put the targets in a constant
            targets = tf.placeholder(dtype=tf.float32,
                                     shape=[batch_size, vs.shape[-1]],
                                     name='targets')

            #put the labels in a constant
            speakers = tf.placeholder(dtype=tf.int32,
                                      shape=[batch_size],
                                      name='speakers')

            correct_words = tf.placeholder(dtype=tf.float32,
                                           shape=[batch_size,
                                                  len(all_words)],
                                           name='correct_words')

            #repeat the word factors for every example in the batch
            expanded_wordfactors_present = np.tile(
                wordfactors[0, :], (batch_size, 1)).astype(np.float32)
            expanded_wordfactors_absent = np.tile(
                wordfactors[1, :], (batch_size, 1)).astype(np.float32)

            expanded_wordfactors_present = tf.convert_to_tensor(
                expanded_wordfactors_present, dtype=tf.float32)
            expanded_wordfactors_absent = tf.convert_to_tensor(
                expanded_wordfactors_absent, dtype=tf.float32)

            #apply the model
            labelprobs, spklogits, wordlogits = self.model(
                inputs, seq_length, nr_spk, all_words, targets)
            loss, labelloss, spkloss, wordloss = self.loss(
                targets, speakers, correct_words, labelprobs, spklogits,
                wordlogits, expanded_wordfactors_present,
                expanded_wordfactors_absent)

            #count the number of parameters
            num_params = 0
            for var in tf.trainable_variables():
                num_params += reduce(mul, var.get_shape().as_list())
            print('number of parameters: %d' % num_params)

            #create an optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate=float(
                self.conf['learning_rate']),
                                               epsilon=1e-03)

            #compute the gradients
            grads_and_vars = optimizer.compute_gradients(loss=loss)

            with tf.variable_scope('clip'):
                #clip the gradients
                grads_and_vars = [(tf.clip_by_value(grad, -1., 1.), var)
                                  for grad, var in grads_and_vars]

            #operation to apply the gradients
            apply_gradients_op = optimizer.apply_gradients(
                grads_and_vars=grads_and_vars, name='apply_gradients')

            #all remaining operations with the UPDATE_OPS GraphKeys
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            #create an operation to update the model
            update_op = tf.group(*([apply_gradients_op] + update_ops),
                                 name='update')

            #create the init op
            init_op = tf.variables_initializer(tf.global_variables())

            #create a saver
            saver = tf.train.Saver()

            #create a summary
            for var in tf.trainable_variables():
                tf.summary.histogram(var.name, var)
            tf.summary.scalar('loss', loss)

            if self.conf['images'] == 'True':
                images = tf.get_collection('image')
                for image in images:
                    tf.summary.image(image.name, image)

            summary = tf.summary.merge_all()

        #create a session
        session_conf = tf.ConfigProto(inter_op_parallelism_threads=0,
                                      intra_op_parallelism_threads=0)
        sess = tf.Session(graph=graph, config=session_conf)

        #create a summary writer
        writer = tf.summary.FileWriter(os.path.join(self.expdir, 'logdir'),
                                       graph)

        #initialize the model
        sess.run(init_op)

        #create an index queue
        index_queue = []
        for _ in range(int(self.conf['numiters'])):
            i = list(range(len(tasks)))
            shuffle(i)
            index_queue += i
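        # the queue now holds numiters shuffled copies of all example indices
        # (one copy per epoch); batches of batch_size indices are sliced off
        # the front below until fewer than batch_size indices remain.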

        #iteratively train the model
        i = 0
        while len(index_queue) > batch_size:
            indices = index_queue[:batch_size]
            index_queue = index_queue[batch_size:]
            batch_inputs = [features[j] for j in indices]
            batch_lengths = np.array([f.shape[0] for f in batch_inputs])
            ml = np.max(batch_lengths)
            batch_inputs = np.array([
                np.pad(f, ((0, ml - f.shape[0]), (0, 0)), 'constant')
                for f in batch_inputs
            ])
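            # e.g. (hypothetical shapes) two utterances of 3 and 5 frames with
            # 13-dim features become a [2, 5, 13] batch: the 3-frame one is
            # zero-padded via np.pad(f, ((0, 2), (0, 0)), 'constant') and its
            # true length stays available to the model through seq_length.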
            # profiling hook: 'i' is an integer step counter, so this condition
            # never triggers as written; replace 'a' with a step number to
            # collect a full execution trace for that step
            if i == 'a':
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
            else:
                run_options = run_metadata = None
            _, s, l, lbl, spkl, spklgts, wrdl, wrdlgts = sess.run(
                (update_op, summary, loss, labelloss, spkloss, spklogits,
                 wordloss, wordlogits),
                feed_dict={
                    inputs: batch_inputs,
                    seq_length: batch_lengths,
                    targets: vs[indices],
                    speakers: encoded_spk[indices],
                    correct_words: encoded_words[indices]
                },
                options=run_options,
                run_metadata=run_metadata)
            print('step %d: loss = %f' % (i, l))
            print('     labelloss = %f' % lbl)
            print('     spkloss = %f' % spkl)
            print('     wordloss = %f' % wrdl)
            writer.add_summary(s, i)
            if i == 'a':  # never true as written, see the profiling hook above
                writer.add_run_metadata(run_metadata, 'statistics')
            i += 1

        #save the final model
        saver.save(sess, os.path.join(self.expdir, 'logdir', 'model.ckpt'))

        sess.close()

        self.is_training = False
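
The loss itself is implemented elsewhere in the class; purely to illustrate how the present/absent word factors could enter a per-word weighted cross-entropy, here is a sketch under that assumption (not the project's actual loss function):

import numpy as np

def weighted_word_loss(word_targets, word_probs, factors_present, factors_absent):
    '''hypothetical weighted binary cross-entropy over word-presence targets:
    each word is weighted differently depending on whether it was spoken'''
    eps = 1e-8
    present_term = word_targets * np.log(word_probs + eps) * factors_present
    absent_term = (1 - word_targets) * np.log(1 - word_probs + eps) * factors_absent
    return -np.mean(present_term + absent_term)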
예제 #12
0
File: score_pp.py Project: qmeeus/assist
structure = Structure(os.path.join(expdir, 'structure.xml'))

#create a coder
coder = coder_factory.factory(coderconf.get('coder', 'name'))(structure,
                                                              coderconf)

with open(os.path.join(expdir, 'testtasks')) as f:
    ref = dict([l.split(' ', 1) for l in f])
with open(os.path.join(expdir, 'dectasks')) as f:
    hyp = dict([l.split(' ', 1) for l in f])

tasks = [(read_task(ref[k]), read_task(hyp[k])) for k in hyp.keys()]

vs_ref = np.array([coder.encode(t[0], None, 0.0) for t in tasks])
vs_hyp = np.array([coder.encode(t[1], None, 0.0) for t in tasks])

eq = vs_ref == vs_hyp
eqtasks = eq.all(axis=1)
corr = np.sum(eqtasks)
pct = 100.0 * float(corr) / float(len(eqtasks))
print('%10.3g\n' % pct)
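
A toy check of the exact-match scoring above, with hypothetical label encodings:

import numpy as np

vs_ref = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]])  # reference label vectors
vs_hyp = np.array([[1, 0, 1], [0, 0, 0], [1, 1, 0]])  # decoded label vectors

eqtasks = (vs_ref == vs_hyp).all(axis=1)  # a task counts only if every label matches
pct = 100.0 * np.sum(eqtasks) / float(len(eqtasks))
print('%10.3g\n' % pct)  # -> 66.7 (2 of the 3 tasks match exactly)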