Beispiel #1
0
def loadChunkList(chunkfnm, workid, worknum):
    """Load the chunk list and return the part assigned to one worker.

    Arguments:
        chunkfnm: Path of the chunk list file. A companion info file named
            ``chunkfnm + nk.sKeychunklistinfo`` must exist as well.
        workid: Zero-based index of the calling worker.
        worknum: Total number of workers that share the chunk list.

    Returns:
        A tuple ``(flist, retchunklist)``: every line of the info file and
        the chunk lines belonging to worker ``workid``, each passed through
        ``cleanline``.

    Raises:
        Exception: If the chunk list or its info file does not exist.
        AssertionError: If there are not more chunks than workers.
    """
    log.message("Load chunk list from file {} ......".format(chunkfnm))
    finfo = chunkfnm + nk.sKeychunklistinfo
    if not (os.path.exists(finfo) and os.path.exists(chunkfnm)):
        raise Exception("Cant find file {} or {}?".format(chunkfnm, finfo))

    # Load file info
    with open(finfo) as f:
        flist = [cleanline(line) for line in f]

    # Load chunk info
    with open(chunkfnm) as f:
        chunklist = [cleanline(line) for line in f]

    total = len(chunklist)
    assert total > worknum

    # Floor division keeps the per-worker block size an integer on every
    # Python version; together with the assert above it guarantees that each
    # worker receives at least one chunk.
    nchunkblock = total // worknum
    startid = nchunkblock * workid

    # The slice end is exclusive, so no "- 1" correction is needed (the
    # former correction silently dropped the last chunk of every worker).
    # The last worker additionally takes whatever remains after the even
    # split.
    if workid == worknum - 1:
        stopid = total
    else:
        stopid = startid + nchunkblock

    return flist, chunklist[startid:stopid]
Beispiel #2
0
    def createProcessor(self, arguments):
        """Import the IO module selected by ``arguments.name``.

        The module path is ``tensorgou.io.<name>_io`` where ``<name>`` is
        ``arguments.name`` passed through ``to_lowercase``. The path is
        logged and the result of ``loadmodel`` for it is returned.
        """
        module_path = "tensorgou.io." + to_lowercase(arguments.name) + "_io"
        log.message("Import module {}".format(module_path))
        return loadmodel(module_path)
Beispiel #3
0
    def load_word_list(self, isload=True):
        """Count, and optionally load, the word list file ``self.listfnm``.

        ``self.listnum`` is always set to the number of lines in the file.

        Arguments:
            isload: When false, only the line count is taken and ``None``
                is returned.

        Returns:
            A dict mapping each cleaned word to its zero-based line index,
            or ``None`` when ``isload`` is false.

        Raises:
            AssertionError: If the word list file does not exist.
            Exception: If ``self.checkwordlist`` is ``True`` and a word
                occurs more than once, or if the word ``</s>`` is missing.
        """
        # Local import: the surrounding file's import block is not visible
        # from here.
        from collections import Counter

        assert os.path.exists(self.listfnm)

        log.message("Load word list file {}......".format(self.listfnm))
        with open(self.listfnm) as f:
            self.listnum = sum(1 for _ in f)

            if not isload:
                return None

            f.seek(0, 0)
            log.message("\tFound total {} word list".format(self.listnum))
            wordlist = [cleanline(line) for line in f]

        if self.checkwordlist is True:
            # One counting pass instead of calling list.count() per word.
            for item, times in Counter(wordlist).items():
                if times > 1:
                    raise Exception("Found multi same word {} {} in wordlist!"
                                    .format(item, times))

        # For a repeated word the last occurrence wins, as with dict(zip()).
        word2id = {word: index for index, word in enumerate(wordlist)}

        # "in" works on every Python version; dict.has_key() does not.
        if "</s>" not in word2id:
            raise Exception("Expect </s> word in file {}".format(self.listfnm))

        return word2id
Beispiel #4
0
def load_config_file(config_file, ignore_names):
    """Parse a configuration file and build the entries of its main block.

    Arguments:
        config_file: The configuration file, handed to
            ``parsing.parse_file``.
        ignore_names: Keys of the main block that must not be built.

    Returns:
        A dict mapping every non-ignored key of the ``main`` block to the
        result of ``build_object`` for its value.

    Raises:
        Exception: If the parsed configuration has no ``main`` block, or if
            building the value of a key fails.
    """
    parsed = parsing.parse_file(config_file)
    message("Configure file is parsed.")

    if "main" not in parsed:
        raise Exception("Configuration does not contain the main block.")

    # Handed to every build_object call so the calls share one mapping.
    shared_objects = {}

    built = {}
    for key, value in parsed['main'].items():
        if key in ignore_names:
            continue
        try:
            built[key] = build_object(value, parsed, shared_objects, 0)
        except Exception:
            raise Exception("Can't parse key: {}".format(key))

    return built
Beispiel #5
0
    def printparameter(self, args):
        """Emit every attribute in ``args.__dict__`` as a ``name = value`` line.

        The listing is framed by a header line and two separator lines, all
        sent through ``message``.
        """
        separator = "==============="
        message("Parameter Info:")
        message(separator)
        for name, value in args.__dict__.items():
            message("{} = {}".format(name, value))

        message(separator)
Beispiel #6
0
    def evalit(self, loops, results):
        """Accumulate cost, score the latest result and log progress.

        The first element of every entry in ``results`` is added to
        ``self.cost``. Only the last entry is scored: its second element is
        indexed as ``output[k * self.batchsize + j]`` and compared against
        ``G_Vals.g_targets[k, j]`` for every frame that lies inside the
        sequence length and has a positive frame weight. Matching
        target/prediction pairs are appended to a debug output file.

        Arguments:
            loops: Current loop counter; stored in ``self.steps``.
            results: Sequence of result entries with at least two elements
                each.

        Returns:
            Always ``False``.

        Raises:
            Exception: If an entry has fewer than two elements, or if
                ``self.steps * self.batchsize`` is not positive.
        """
        l_cost = 0.0
        l_correctnum = 0
        l_totalnum = 0
        l_step = loops - self.steps
        self.steps = loops

        for item in results:
            if len(item) < 2:
                raise Exception("Bad result type! Expect {}, get {}".format(
                    3, len(item)))
            self.cost += item[0]

        ## cal correct
        nr = len(results)
        # add by xjk: debug dump of correctly predicted frames. The context
        # manager closes the handle on every exit path; it used to be left
        # open on each call.
        with open("/search/odin/tensorflow/lstm_output/out", 'a') as out:
            if nr > 0:
                numsteps = Config().num_steps
                last = results[nr - 1]
                output = last[1]
                l_cost = last[0]
                for j in range(self.batchsize):
                    for k in range(numsteps):
                        # Skip frames beyond the sequence length or with a
                        # non-positive weight.
                        if not (k < G_Vals.g_lengths[j]
                                and G_Vals.g_frameweight[k, j] > 0):
                            continue

                        l_totalnum += 1
                        predicted = output[k * self.batchsize + j]
                        if G_Vals.g_targets[k, j] == predicted:
                            # add by xjk
                            out.write(
                                str(G_Vals.g_targets[k, j]) + " " +
                                str(predicted) + '\n')
                            l_correctnum += 1

        # No scored frame (empty results, or every frame masked out) used to
        # raise ZeroDivisionError here; report zero accuracy instead.
        if l_totalnum > 0:
            l_accuracy = float(l_correctnum) / float(l_totalnum)
        else:
            l_accuracy = 0.0

        count = self.steps * self.batchsize
        if count <= 0:
            # NOTE(review): the message reports l_step although the check is
            # on self.steps * self.batchsize -- confirm which one is meant.
            raise Exception("Bad l_step {}? Expect > 0".format(l_step))

        l_avgcost = self.cost / float(count)

        log.message(
            "No.{} batch: curr_lost[{:.5f}]    curr_prec[{:.6f}]    ave_loss[{:.6f}]"
            .format(loops + 1, l_cost, l_accuracy, l_avgcost))

        return False
Beispiel #7
0
def filterfiles(files):
    """Return the entries of ``files`` that exist and are large enough.

    Every entry is passed through ``cleanline``. Paths that do not exist are
    reported with ``log.error``. Existing files whose size divided by
    1024 * 1024 is below 10 are reported with ``log.warnning`` and left out.

    Arguments:
        files: Non-empty sequence of file names.

    Returns:
        The cleaned names of the files that passed both checks, in input
        order.

    Raises:
        AssertionError: If ``files`` is empty.
        Exception: If no file exists, or if no existing file is large
            enough.
    """
    assert len(files) != 0

    missing = []
    present = []
    for raw in files:
        path = cleanline(raw)
        if os.path.exists(path):
            present.append(path)
        else:
            missing.append(path)

    if missing:
        log.error("Can't find data files:")
        for path in missing:
            log.error("\t'{}'".format(path))

    if not present:
        raise Exception("No available data files!")

    log.message("Data files: ")

    readable = []
    # The logged number is the 1-based position among the existing files.
    for position, path in enumerate(present, 1):
        size_mb = os.path.getsize(path)
        size_mb /= (1024 * 1024)
        if size_mb < 10:
            log.warnning("File: {} {}M, too small?".format(path, size_mb))
            continue
        readable.append(path)
        log.message("\tNo.{} file:{}".format(position, path))

    if not readable:
        raise Exception("Found 0 records?")

    return readable
Beispiel #8
0
    def load_word_dict(self, word2id, isload=True, wedim=None):
        """Read the binary embedding file ``self.dictfnm``.

        File layout as read here: two native ints (number of words and
        embedding dimension), then for every word a native int length, that
        many bytes of word text and ``worddim`` native floats.

        ``self.dictnum`` and ``self.worddim`` are always set from the header.

        Arguments:
            word2id: Mapping from word to row index of the returned matrix.
            isload: When false, only the header is read and ``None`` is
                returned.
            wedim: Expected embedding dimension, or ``None`` to skip the
                check.

        Returns:
            A float32 array of shape ``(len(word2id), self.worddim)``; rows
            of words that are absent from the file stay zero. ``None`` when
            ``isload`` is false.

        Raises:
            AssertionError: If the file does not exist.
            Exception: If ``wedim`` differs from the dimension in the file.
            struct.error: If the file ends prematurely.
        """
        assert os.path.exists(self.dictfnm)

        eb = None
        log.message("Load word dict file {}......".format(self.dictfnm))

        # Binary mode: the content is decoded with struct, so no newline
        # translation or text decoding may be applied to it.
        with open(self.dictfnm, "rb") as f:
            self.dictnum, = struct.unpack("i", f.read(4))
            self.worddim, = struct.unpack("i", f.read(4))
            if not isload:
                return

            if wedim is not None and wedim != self.worddim:
                raise Exception("Embedding size '{}' can't match file '{}' defined size '{}' dictnum='{}'"
                                .format(wedim, self.dictfnm, self.worddim, self.dictnum))

            log.message("\tInfo: word_num = {}\tword_dim = {}"
                        .format(self.dictnum, self.worddim))

            if self.dictnum < len(word2id):
                log.warnning("\tword_dict num '{}' < word_list num '{}' ?"
                             .format(self.dictnum, len(word2id)))

            eb = np.zeros((len(word2id), self.worddim), dtype=np.float32)

            # Read a whole embedding row per word instead of one float at a
            # time.
            rowfmt = str(self.worddim) + "f"
            rowbytes = struct.calcsize(rowfmt)

            pb = click.progressbar(length=self.dictnum, label="Load word dict")
            for _ in range(self.dictnum):
                word_len, = struct.unpack("i", f.read(4))
                word_str, = struct.unpack(str(word_len) + "s", f.read(word_len))
                rowdata = f.read(rowbytes)

                if not isinstance(word_str, str):
                    # Only reached where bytes and str are distinct types.
                    # NOTE(review): assumes the words are UTF-8 encoded and
                    # that word2id keys are text -- confirm.
                    word_str = word_str.decode("utf-8")

                # "in" works on every Python version; dict.has_key() does not.
                if word_str in word2id:
                    eb[word2id[word_str], :] = struct.unpack(rowfmt, rowdata)
                elif len(rowdata) != rowbytes:
                    # Keep reporting truncation for skipped words as well.
                    raise struct.error("embedding row is truncated")

                pb.update(1)

            del pb
            print("")

        return eb
Beispiel #9
0
def readerProc(flist, chunklist, batchsize, queue, maxepoch, quitEvent):
    """Produce batches from the chunk list and push them into ``queue``.

    The function loops until ``quitEvent`` is set or ``maxepoch`` epochs
    have been counted; in the latter case it sets ``quitEvent`` itself.
    ``chunklist`` is shuffled in place at the start and after every epoch.

    Arguments:
        flist: Non-empty file list, handed to ``loadNextChunk``.
        chunklist: Non-empty list of chunks; shuffled in place.
        batchsize: Batch size handed to ``loadNextChunk``/``loadNextBatch``.
        queue: Queue receiving the batches via ``put_nowait``.
        maxepoch: Number of epochs after which the reader stops.
        quitEvent: Event polled for shutdown and set when the epochs are
            done.

    Raises:
        AssertionError: If ``chunklist`` or ``flist`` is empty.
    """
    nchunk = len(chunklist)
    assert nchunk > 0
    assert len(flist) > 0

    random.shuffle(chunklist)

    epochs_done = 0
    pending = []
    cursor = 0
    loaded = 0
    while True:
        if quitEvent.is_set():
            return

        ## load next batch
        if len(pending) < batchsize:
            chunkdata, cursor = loadNextChunk(flist, chunklist, batchsize, cursor)
            pending.extend(chunkdata)
            # NOTE(review): the counter advances by 2 per loadNextChunk call;
            # whether one call consumes two chunks is not visible here.
            loaded += 2
            if loaded < nchunk:
                log.message("Loader: {}/{} chunks loaded ......"
                            .format(loaded, nchunk))
            else:
                epochs_done += 1

                if epochs_done >= maxepoch:
                    quitEvent.set()
                    return
                # Start the next epoch with a fresh order.
                random.shuffle(chunklist)
                cursor = 0
                loaded = 0

        batch, pending = loadNextBatch(pending, batchsize)

        ## push into queue, retrying while it is full
        while True:
            try:
                queue.put_nowait(batch)
            except lQueue.Full:
                if quitEvent.is_set():
                    return
                time.sleep(0.1)
            else:
                break
Beispiel #10
0
    def getmodule(self, config_dict):
        """Import the graph module named by the configuration, once.

        The module path is ``tensorgou.graph.<name>.<type>`` where ``<name>``
        comes from ``config_dict['name']`` and ``<type>`` from
        ``self.defaults['type']``, both passed through ``to_lowercase``. The
        loaded module is stored in ``self.inmodule``.

        Arguments:
            config_dict: Loaded configuration; must contain ``'name'`` and
                must not contain ``'type'``.

        Returns:
            The loaded module, or ``None`` when ``self.inmodule`` was already
            set before the call.

        Raises:
            Exception: If ``'name'`` is missing, if ``'type'`` is given in
                ``config_dict``, or if ``self.defaults`` has no ``'type'``.
        """
        if self.inmodule is not None:
            return

        # "in" works on every Python version; dict.has_key() does not.
        if 'name' not in config_dict:
            raise Exception("No parameter 'name' be defined?")

        ## only support default type here!
        if 'type' in config_dict:
            raise Exception("Only support default 'type' define in current version!")
        if 'type' not in self.defaults:
            raise Exception("No parameter 'type' be defined [default]?")

        name = to_lowercase(config_dict['name'])
        # Named "kind" so the builtin type() is not shadowed.
        kind = to_lowercase(self.defaults['type'])
        mnm = "tensorgou.graph" + "." + name + "." + kind
        message("Import module {}".format(mnm))

        self.inmodule = loadmodel(mnm)
        return self.inmodule
Beispiel #11
0
def doCreateChunkList(arguments):
    """Write the chunk list and its file-info companion for the train files.

    Two files are written below ``arguments.output``: the info file lists
    one usable train file per line, and the chunk list holds one
    ``<file index>\\t<byte offset>`` line per chunk.

    Arguments:
        arguments: Object providing ``output`` (target directory) and
            ``trainfnms`` (comma-separated train file names).

    Raises:
        AssertionError: If the chunk list file already exists.
        Exception: If no train file name is given; ``filterfiles`` raises
            further exceptions for unusable files.
    """
    chunkfnm = os.path.join(arguments.output, nk.sKeychunklist)
    chunkfnminfo = chunkfnm + nk.sKeychunklistinfo
    assert not os.path.exists(chunkfnm)
    log.message("Build train dataset chunklist {} ...".format(chunkfnm))

    # str.split() never returns an empty list (''.split(',') == ['']), so
    # blank entries are dropped first; otherwise the check below can never
    # fire.
    trainfiles = [name for name in arguments.trainfnms.split(',')
                  if name.strip()]
    if not trainfiles:
        raise Exception("No train data file be defined?")
    trainfilelist = filterfiles(trainfiles)

    # modify by xjk: chunk size in MB (an earlier version used 64).
    chunk_mb = 1
    bytes_per_mb = 1024 * 1024

    fileid = []
    filelen = []
    for i, fname in enumerate(trainfilelist):
        log.message("No.{} file {} ......".format(i + 1, fname))
        curflen = os.path.getsize(fname)
        # NOTE(review): with integer division this truncates to whole MB
        # before math.ceil is applied, so a trailing partial chunk gets no
        # entry -- confirm whether that is intended.
        curflen /= 1024 * 1024
        fileid.append(fname)
        nchunk = int(math.ceil(curflen / chunk_mb))
        for num in range(nchunk):
            offset = chunk_mb * num
            filelen.append("{}\t{}".format(i, offset * bytes_per_mb))

    # Write out info
    log.message("Write chunk list file info {} ...".format(chunkfnminfo))
    with open(chunkfnminfo, "w") as f:
        f.writelines("%s\n" % item for item in fileid)

    log.message("Write chunk list file {} ...".format(chunkfnm))
    with open(chunkfnm, "w") as f:
        f.writelines("%s\n" % item for item in filelen)
Beispiel #12
0
    def evalit(self, loops, results):
        """Log average cost and accuracy over the steps since the last call.

        The first element of every entry in ``results`` is summed as cost
        and the second as number of correct predictions; both sums are
        divided by ``(loops - self.steps) * self.batchsize``. ``self.steps``
        is then set to ``loops``.

        Arguments:
            loops: Current loop counter.
            results: Sequence of result entries with at least two elements
                each.

        Returns:
            Always ``False``.

        Raises:
            Exception: If an entry has fewer than two elements, or if the
                divisor is not positive.
        """
        l_cost = 0.0
        l_correctnum = 0
        l_step = loops - self.steps
        self.steps = loops

        for item in results:
            if len(item) < 2:
                raise Exception("Bad result type! Expect {}, get {}".format(
                    3, len(item)))
            l_cost += item[0]
            l_correctnum += item[1]

        count = l_step * self.batchsize
        if count <= 0:
            raise Exception("Bad l_step {}? Expect > 0".format(l_step))

        # Divide by a float: with integer counts, "/" may be integer division
        # and would truncate the accuracy to 0 or 1.
        divisor = float(count)
        l_avgcost = l_cost / divisor
        l_accuracy = l_correctnum / divisor

        log.message("No.{} batch: Cost[{:.5f}]    Accuracy[{:.5f}]".format(
            loops + 1, l_avgcost, l_accuracy))

        return False
Beispiel #13
0
    def load_file(self, path):
        """Load an INI file into an argument namespace.

        The configuration is read with ``load_config_file`` (skipping the
        names in ``self.ignored``). ``self.getmodule``,
        ``self.buildparameter`` and ``self._check_loaded_conf`` are then
        called. Every loaded value that has an entry in ``self.conditions``
        must satisfy that condition. Names present only in ``self.defaults``
        are filled in afterwards.

        Any exception during loading is reported together with its traceback
        and ends the process with exit status 1.

        Arguments:
            path: Path of the INI file.

        Returns:
            A tuple of the populated namespace and ``self.inmodule``.
        """
        message("Loading INI file: '{}'".format(path))

        try:
            arguments = Namespace()

            config_dict = load_config_file(path, self.ignored)
            self.getmodule(config_dict)
            self.buildparameter()

            self._check_loaded_conf(config_dict)

            for name, value in config_dict.items():
                if name in self.conditions:
                    condition = self.conditions[name]
                    if not condition(value):
                        # Point at the place where the failing condition is
                        # defined.
                        where = condition.__code__
                        raise Exception(
                            "Value of field '{}' does not satisfy "
                            "condition defined at {}:{}."
                            .format(name, where.co_filename,
                                    where.co_firstlineno))

                setattr(arguments, name, value)

            # Defaults only fill names the file did not provide.
            loaded = arguments.__dict__
            for name, value in self.defaults.items():
                loaded.setdefault(name, value)
            message("INI file loaded.")

        except Exception as exc:
            message("Failed to load INI file: {}".format(exc))
            traceback.print_exc()
            exit(1)

        self.printparameter(arguments)

        return arguments, self.inmodule