def loadChunkList(chunkfnm, workid, worknum):
    log.message("Load chunk list from file {} ......".format(chunkfnm))
    finfo = chunkfnm + nk.sKeychunklistinfo
    if (not os.path.exists(finfo)) or (not os.path.exists(chunkfnm)):
        raise Exception("Can't find file {} or {}?".format(chunkfnm, finfo))
    """ Load file info """
    flist = None
    chunklist = None
    with open(finfo) as f:
        flist = f.readlines()
    for i in range(len(flist)):
        flist[i] = cleanline(flist[i])
    with open(chunkfnm) as f:
        chunklist = f.readlines()
    """ Load chunk info """
    for i in range(len(chunklist)):
        chunklist[i] = cleanline(chunklist[i])
    assert len(chunklist) > worknum
    # Split the chunk list evenly across workers; the last worker takes
    # whatever remains after the preceding workers have taken full blocks.
    nchunkblock = int(math.ceil(len(chunklist) / float(worknum)))
    startid = nchunkblock * workid
    stopid = nchunkblock
    if workid == worknum - 1:
        stopid = len(chunklist) - nchunkblock * (worknum - 1)
    retchunklist = chunklist[startid : startid + stopid]
    return flist, retchunklist
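# Hedged usage sketch (not part of the original module): illustrates how the
# block arithmetic in loadChunkList divides a chunk list across workers.
# The helper name and the 10-chunk / 3-worker setup are hypothetical; math is
# assumed to be imported at module level.
def _demo_chunk_split():
    chunklist = ["chunk{:02d}".format(i) for i in range(10)]  # stand-in for lines read from chunkfnm
    worknum = 3
    nchunkblock = int(math.ceil(len(chunklist) / float(worknum)))  # 4 chunks per worker
    for workid in range(worknum):
        startid = nchunkblock * workid
        stopid = nchunkblock
        if workid == worknum - 1:
            stopid = len(chunklist) - nchunkblock * (worknum - 1)  # last worker takes the remainder (2)
        print(workid, chunklist[startid : startid + stopid])
    # Expected output:
    #   0 ['chunk00', 'chunk01', 'chunk02', 'chunk03']
    #   1 ['chunk04', 'chunk05', 'chunk06', 'chunk07']
    #   2 ['chunk08', 'chunk09']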
def load_word_list(self, isload=True):
    assert os.path.exists(self.listfnm)
    word2id = None
    log.message("Load word list file {}......".format(self.listfnm))
    with open(self.listfnm) as f:
        self.listnum = sum(1 for x in f)
        if not isload:
            return word2id
        f.seek(0, 0)
        log.message("\tFound total {} words in word list".format(self.listnum))
        wordlist = f.readlines()
    """ split to word dict """
    for i in range(len(wordlist)):
        wordlist[i] = cleanline(wordlist[i])
    if self.checkwordlist is True:
        wordset = set(wordlist)
        for item in wordset:
            if wordlist.count(item) > 1:
                raise Exception("Found multi same word {} {} in wordlist!"
                                .format(item, wordlist.count(item)))
    word2id = dict(zip(wordlist, range(len(wordlist))))
    if "</s>" not in word2id:
        raise Exception("Expect </s> word in file {}".format(self.listfnm))
    return word2id
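# Hedged sketch (not from the original code base): shows the word-to-id mapping
# that load_word_list builds after cleaning the list file. The word list content
# below is invented; the real list comes from self.listfnm, one word per line,
# and must contain "</s>".
def _demo_word2id():
    wordlist = ["</s>", "hello", "world"]  # as if read and cleaned from the list file
    word2id = dict(zip(wordlist, range(len(wordlist))))
    assert word2id["</s>"] == 0 and word2id["world"] == 2
    return word2id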
def doProcess(recodes):
    querys = []
    answers = []
    for i in range(len(recodes)):
        line = recodes[i]
        line = cleanline(line)
        parts = line.split('\t')
        if not len(parts) == 2:
            raise Exception("Bad line, expect 2 parts [query answer], get {} parts!"
                            .format(len(parts)))
        querys.append(parts[0])
        answers.append(parts[1])
    return querys, answers
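# Hedged example (illustrative only): the record format doProcess expects is one
# "query<TAB>answer" pair per line. The sample strings are invented, and the
# asserts assume cleanline() strips the trailing newline.
def _demo_doProcess():
    recodes = ["how tall is everest\t8848 m\n",
               "capital of france\tparis\n"]
    querys, answers = doProcess(recodes)
    assert querys == ["how tall is everest", "capital of france"]
    assert answers == ["8848 m", "paris"]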
def filterfiles(files):
    assert not len(files) == 0
    badfiles = []
    goodfiles = []
    for i in range(len(files)):
        filenm = cleanline(files[i])
        if not os.path.exists(filenm):
            badfiles.append(filenm)
        else:
            goodfiles.append(filenm)
    if not len(badfiles) == 0:
        log.error("Can't find data files:")
        for i in range(len(badfiles)):
            log.error("\t'{}'".format(badfiles[i]))
    if len(goodfiles) == 0:
        raise Exception("No available data files!")
    log.message("Data files: ")
    readablefiles = []
    for i in range(len(goodfiles)):
        curflen = os.path.getsize(goodfiles[i])
        curflen /= (1024 * 1024)
        if curflen < 10:
            log.warnning("File: {} {}M, too small?".format(goodfiles[i], curflen))
        else:
            readablefiles.append(goodfiles[i])
            log.message("\tNo.{} file:{}".format(i + 1, goodfiles[i]))
    """
    with open(goodfiles[i]) as f:
        lines = sum(1 for x in f)
        if lines == 0:
            log.warnning("File: {} is empty?".format(goodfiles[i]))
        else:
            totallines += lines
            log.message("\tNo.{} file:{} with {} recodes"
                        .format(i + 1, goodfiles[i], lines))
            readablefiles.append(goodfiles[i])
    """
    if len(readablefiles) == 0:
        raise Exception("Found 0 records?")
    return readablefiles
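# Hedged usage sketch: filterfiles() keeps only paths that exist and are at
# least 10 MB, logging the rest. The file names below are hypothetical and the
# call assumes log/cleanline from this module are available.
def _demo_filterfiles():
    candidate_files = ["data/train.part-00000\n", "data/train.part-00001\n"]
    readablefiles = filterfiles(candidate_files)  # raises if nothing usable remains
    for fnm in readablefiles:
        print("will read:", fnm)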
def doProcess(recodes):
    sentencelist = []
    labellist = []
    for i in range(len(recodes)):
        line = recodes[i]
        line = cleanline(line)
        parts = line.split('\t')
        if not len(parts) == 2:
            raise Exception(
                "Bad line, expect 2 parts [sentence \t label], get {} parts!"
                .format(len(parts)))
        # modify by xjk
        sentencelist.append(parts[0])
        labellist.append(parts[1])
    return sentencelist, labellist
def doProcess(recodes):
    querys = []
    titles_a = []
    titles_b = []
    for i in range(len(recodes)):
        line = recodes[i]
        line = cleanline(line)
        parts = line.split('\t')
        if not len(parts) == 3:
            raise Exception(
                "Bad line, expect 3 parts [query title1 title2], get {} parts!"
                .format(len(parts)))
        querys.append(parts[0])
        titles_a.append(parts[1])
        titles_b.append(parts[2])
    return querys, titles_a, titles_b