コード例 #1
0
def loadChunkList(chunkfnm, workid, worknum):
    """Load the file-info list and this worker's share of the chunk list.

    Two text files are read line by line, and every line is passed through
    ``cleanline``: the chunk list ``chunkfnm`` and its companion info file
    ``chunkfnm + nk.sKeychunklistinfo``.

    The chunk list is cut into ``worknum`` consecutive blocks. Worker
    ``workid`` receives block number ``workid``; the last worker
    (``workid == worknum - 1``) receives everything from its start offset
    to the end of the list.

    Args:
        chunkfnm: Path of the chunk list file.
        workid: Zero-based index of the calling worker.
        worknum: Total number of workers.

    Returns:
        A tuple ``(flist, retchunklist)``: the cleaned lines of the info
        file and the cleaned chunk entries assigned to this worker.

    Raises:
        Exception: If the chunk list file or its info file does not exist.
        AssertionError: If there are not more chunks than workers.
    """
    log.message("Load chunk list from file {} ......".format(chunkfnm))
    finfo = chunkfnm + nk.sKeychunklistinfo
    if (not os.path.exists(finfo)) or (not os.path.exists(chunkfnm)):
        raise Exception("Cant find file {} or {}?".format(chunkfnm, finfo))

    # Load file info.
    with open(finfo) as f:
        flist = [cleanline(line) for line in f]

    # Load chunk info.
    with open(chunkfnm) as f:
        chunklist = [cleanline(line) for line in f]

    assert len(chunklist) > worknum

    # NOTE: with integer division (Python 2) the ceil below is a no-op and
    # the block size is the floor; with true division it is the ceiling.
    # The slicing below is valid for both.
    nchunkblock = int(math.ceil(len(chunklist) / worknum))
    startid = nchunkblock * workid
    if workid == worknum - 1:
        # The last worker takes whatever is left. A start offset past the
        # end simply yields an empty list instead of a negative count.
        stopid = len(chunklist)
    else:
        stopid = startid + nchunkblock

    # Half-open slice: the end index is exclusive, so no "- 1" is needed.
    # Subtracting one here dropped the last chunk of every block.
    retchunklist = chunklist[startid:stopid]

    return flist, retchunklist
コード例 #2
0
    def load_word_list(self, isload=True):
        """Count the words in ``self.listfnm`` and optionally build word -> id.

        ``self.listnum`` is always set to the number of lines in the file.
        When ``isload`` is true, each line is passed through ``cleanline``
        and mapped to its zero-based line index.

        Args:
            isload: When false, only count the lines and return ``None``.

        Returns:
            A dict mapping each cleaned word to its line index, or ``None``
            when ``isload`` is false.

        Raises:
            AssertionError: If ``self.listfnm`` does not exist.
            Exception: If ``self.checkwordlist`` is ``True`` and a word occurs
                more than once, or if the word ``</s>`` is missing.
        """
        assert (os.path.exists(self.listfnm))

        log.message("Load word list file {}......".format(self.listfnm))
        with open(self.listfnm) as f:
            if not isload:
                # Count only; avoid holding the whole file in memory.
                self.listnum = sum(1 for _ in f)
                return None
            rawlines = f.readlines()

        self.listnum = len(rawlines)
        log.message("\tFound total {} word list".format(self.listnum))

        # Split to word dict.
        wordlist = [cleanline(line) for line in rawlines]

        if self.checkwordlist is True:
            # Single counting pass instead of list.count() per distinct word.
            counts = {}
            for word in wordlist:
                counts[word] = counts.get(word, 0) + 1
            # Walk in file order so the reported duplicate is deterministic.
            for word in wordlist:
                if counts[word] > 1:
                    raise Exception("Found multi same word {} {} in wordlist!"
                                    .format(word, counts[word]))

        # For repeated words the last occurrence wins.
        word2id = dict(zip(wordlist, range(len(wordlist))))

        # "in" works on both Python 2 and 3; dict.has_key is Python 2 only.
        if "</s>" not in word2id:
            raise Exception("Expect </s> word in file {}".format(self.listfnm))

        return word2id
コード例 #3
0
def doProcess(recodes):
    """Split tab-separated records into parallel query and answer lists.

    Each record is passed through ``cleanline`` and split on tab; it must
    yield exactly two fields.

    Args:
        recodes: Sequence of raw record lines.

    Returns:
        A tuple ``(querys, answers)`` of equally long lists.

    Raises:
        Exception: If a record does not split into exactly two parts.
    """
    querys, answers = [], []

    for raw in recodes:
        fields = cleanline(raw).split('\t')
        if len(fields) != 2:
            raise Exception("Bad line, expect 2 parts [query answer], get {} parts!"
                            .format(len(fields)))
        query, answer = fields
        querys.append(query)
        answers.append(answer)

    return querys, answers
コード例 #4
0
def filterfiles(files):
    """Return the data files that exist and pass the size check.

    Every entry is passed through ``cleanline``. Missing files are reported
    through ``log.error``. Existing files whose size divided by
    ``1024 * 1024`` is below 10 are reported through ``log.warnning`` and
    skipped; the rest are logged and returned.

    Args:
        files: Non-empty sequence of raw file names.

    Returns:
        List of cleaned file names that exist and pass the size check.

    Raises:
        AssertionError: If ``files`` is empty.
        Exception: If no file exists, or if no existing file passes the
            size check.
    """
    assert len(files) != 0

    missing = []
    present = []
    for raw in files:
        path = cleanline(raw)
        bucket = present if os.path.exists(path) else missing
        bucket.append(path)

    if missing:
        log.error("Can't find data files:")
        for path in missing:
            log.error("\t'{}'".format(path))

    if not present:
        raise Exception("No available data files!")

    log.message("Data files: ")

    # NOTE: an earlier per-file record count was disabled; only the file
    # size is checked now.
    megabyte = 1024 * 1024
    readablefiles = []
    for number, path in enumerate(present, 1):
        sizemb = os.path.getsize(path) / megabyte
        if sizemb < 10:
            log.warnning("File: {} {}M, too small?".format(path, sizemb))
            continue
        readablefiles.append(path)
        # The number is the position among existing files, not readable ones.
        log.message("\tNo.{} file:{}".format(number, path))

    if not readablefiles:
        raise Exception("Found 0 records?")

    return readablefiles
コード例 #5
0
def doProcess(recodes):
    """Split tab-separated records into sentence and label lists.

    Each record is passed through ``cleanline`` and split on tab; it must
    yield exactly two fields.

    Args:
        recodes: Sequence of raw record lines.

    Returns:
        A tuple ``(sentencelist, labellist)`` of equally long lists.

    Raises:
        Exception: If a record does not split into exactly two parts.
    """
    sentencelist, labellist = [], []

    for raw in recodes:
        fields = cleanline(raw).split('\t')
        nfields = len(fields)
        if nfields != 2:
            raise Exception(
                "Bad line, expect 2 parts [sentence \t label], get {} parts!".
                format(nfields))
        sentence, label = fields
        sentencelist.append(sentence)
        labellist.append(label)

    return sentencelist, labellist
コード例 #6
0
def doProcess(recodes):
    """Split tab-separated records into query and two title lists.

    Each record is passed through ``cleanline`` and split on tab; it must
    yield exactly three fields.

    Args:
        recodes: Sequence of raw record lines.

    Returns:
        A tuple ``(querys, titles_a, titles_b)`` of equally long lists.

    Raises:
        Exception: If a record does not split into exactly three parts.
    """
    querys, titles_a, titles_b = [], [], []

    for raw in recodes:
        fields = cleanline(raw).split('\t')
        if len(fields) != 3:
            raise Exception(
                "Bad line, expect 3 parts [query title1 title2], get {} parts!"
                .format(len(fields)))
        query, first_title, second_title = fields
        querys.append(query)
        titles_a.append(first_title)
        titles_b.append(second_title)

    return querys, titles_a, titles_b