def makeFeatFile(chF, enF, waF, outF, numProc):
    """Extract examples for a parallel corpus and write them to ``outF``.

    The three input files are read as UTF-8 and each line is split on
    whitespace.  They must contain the same number of lines.  Extraction is
    delegated to ``extract``; when ``numProc > 1`` the sentences are cut into
    ``numProc`` contiguous chunks that are processed by a worker pool and
    re-assembled in the original order.

    Args:
        chF: path of the first sentence file (presumably the Chinese side).
        enF: path of the second sentence file (presumably the English side).
        waF: path of the per-sentence alignment file (presumably
            whitespace-separated links).
        outF: destination path; the result is first written to
            ``/dev/shm/tmp`` and then moved here with ``mv``.
        numProc: number of worker processes; values <= 1 run in-process.

    Raises:
        AssertionError: if the three inputs differ in line count.
    """
    with codecs.open(chF, 'r', 'utf-8') as chIn:
        chSentL = [line.split() for line in chIn.readlines()]
    with codecs.open(enF, 'r', 'utf-8') as enIn:
        enSentL = [line.split() for line in enIn.readlines()]
    with codecs.open(waF, 'r', 'utf-8') as waIn:
        waL = [line.split() for line in waIn.readlines()]
    assert len(chSentL) == len(enSentL) == len(waL), \
        "len chSentL == %d, len enSentL == %d, len waL == %d" % (
            len(chSentL), len(enSentL), len(waL))

    # Dictionaries are loaded from fixed paths relative to the working
    # directory.
    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")

    s = time.clock()
    if numProc > 1:
        pool = mp.Pool(processes=numProc)
        try:
            # Divide by numProc (not numProc - 1) so that every worker gets
            # a comparable share; the last chunk absorbs the remainder.
            base = len(chSentL) // numProc
            pending = []
            for i in xrange(1, numProc + 1):
                start = base * (i - 1)
                end = base * i if i < numProc else len(chSentL)
                # `start` is handed to extract as the offset of this chunk
                # within the whole corpus.
                pending.append(pool.apply_async(
                    extract,
                    args=(chSentL[start:end], enSentL[start:end],
                          waL[start:end], start, fwD, wpD)))
            expList = []
            # Collect in submission order so output order matches input.
            for job in pending:
                expList.extend(job.get())
        finally:
            # On success every result has already been fetched, so the
            # workers are idle; on failure this stops any stragglers.
            pool.terminate()
            pool.join()
    else:
        expList = extract(chSentL, enSentL, waL, 0, fwD, wpD)
    print >> sys.stderr, "\nextraction time: %f" % (time.clock() - s)

    s = time.clock()
    # NOTE(review): the scratch path is fixed, so two concurrent runs would
    # write to the same file -- confirm this is never run in parallel.
    with codecs.open("/dev/shm/tmp", 'w', 'utf-8') as outf:
        for exp in expList:
            # __str__ is called directly, as in the original, so whatever it
            # returns is passed unchanged to the UTF-8 writer.
            outf.write(exp.__str__())
    print >> sys.stderr, "outputing time: %f" % (time.clock() - s)

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to mv verbatim.
    subprocess.call(["mv", "/dev/shm/tmp", outF])
def make(chF, enF, gwaF, waF, outF): chSentL = [ line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines() ] enSentL = [ line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines() ] if gwaF == "None": gwaL = [[] for i in xrange(len(chSentL))] else: gwaL = [line.split() for line in open(gwaF).readlines()] waL = [line.split() for line in open(waF).readlines()] print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len( enSentL), len(gwaL), len(waL) fwD = loadFuncWordDict("ch_funcWordL.txt") wpD = loadWordPairDict("cedict_hacept_train.dict") #wpD = loadWordPairDict("hacept_train.dict") expList = [] for k, chSent in enumerate(chSentL): if k % 100 == 0: print k, enSent = enSentL[k] waSent = waL[k] gwaSent = gwaL[k] for wa in waSent: ID = 'ID' + str(k) + '--' + wa label = 'False' if wa in gwaSent: label = 'True' exp = Example(ID, label) i, j = int(wa.split('-')[0]), int(wa.split('-')[1]) exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD) expList.append(exp) outf = codecs.open(outF, 'w', 'utf-8') for exp in expList: outf.write(exp.__str__()) outf.close()
def makeFeatFile(chF, enF, waF, outF, numProc):
    """Extract examples for a parallel corpus and write them to ``outF``.

    The three input files are read as UTF-8 and each line is split on
    whitespace.  They must contain the same number of lines.  Extraction is
    delegated to ``extract``; when ``numProc > 1`` the sentences are cut into
    ``numProc`` contiguous chunks that are processed by a worker pool and
    re-assembled in the original order.

    Args:
        chF: path of the first sentence file (presumably the Chinese side).
        enF: path of the second sentence file (presumably the English side).
        waF: path of the per-sentence alignment file (presumably
            whitespace-separated links).
        outF: destination path; the result is first written to
            ``/dev/shm/tmp`` and then moved here with ``mv``.
        numProc: number of worker processes; values <= 1 run in-process.

    Raises:
        AssertionError: if the three inputs differ in line count.
    """
    with codecs.open(chF, 'r', 'utf-8') as chIn:
        chSentL = [line.split() for line in chIn.readlines()]
    with codecs.open(enF, 'r', 'utf-8') as enIn:
        enSentL = [line.split() for line in enIn.readlines()]
    with codecs.open(waF, 'r', 'utf-8') as waIn:
        waL = [line.split() for line in waIn.readlines()]
    assert len(chSentL) == len(enSentL) == len(waL), \
        "len chSentL == %d, len enSentL == %d, len waL == %d" % (
            len(chSentL), len(enSentL), len(waL))

    # Dictionaries are loaded from fixed paths relative to the working
    # directory.
    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")

    s = time.clock()
    if numProc > 1:
        pool = mp.Pool(processes=numProc)
        try:
            # Divide by numProc (not numProc - 1) so that every worker gets
            # a comparable share; the last chunk absorbs the remainder.
            base = len(chSentL) // numProc
            pending = []
            for i in xrange(1, numProc + 1):
                start = base * (i - 1)
                end = base * i if i < numProc else len(chSentL)
                # `start` is handed to extract as the offset of this chunk
                # within the whole corpus.
                pending.append(pool.apply_async(
                    extract,
                    args=(chSentL[start:end], enSentL[start:end],
                          waL[start:end], start, fwD, wpD)))
            expList = []
            # Collect in submission order so output order matches input.
            for job in pending:
                expList.extend(job.get())
        finally:
            # On success every result has already been fetched, so the
            # workers are idle; on failure this stops any stragglers.
            pool.terminate()
            pool.join()
    else:
        expList = extract(chSentL, enSentL, waL, 0, fwD, wpD)
    print >> sys.stderr, "\nextraction time: %f" % (time.clock() - s)

    s = time.clock()
    # NOTE(review): the scratch path is fixed, so two concurrent runs would
    # write to the same file -- confirm this is never run in parallel.
    with codecs.open("/dev/shm/tmp", 'w', 'utf-8') as outf:
        for exp in expList:
            # __str__ is called directly, as in the original, so whatever it
            # returns is passed unchanged to the UTF-8 writer.
            outf.write(exp.__str__())
    print >> sys.stderr, "outputing time: %f" % (time.clock() - s)

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to mv verbatim.
    subprocess.call(["mv", "/dev/shm/tmp", outF])
def make(chF, enF, gwaF, waF, outF): chSentL = [line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()] enSentL = [line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()] if gwaF == "None": gwaL = [[] for i in xrange(len(chSentL))] else: gwaL = [line.split() for line in open(gwaF).readlines()] waL = [line.split() for line in open(waF).readlines()] print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(enSentL), len(gwaL), len(waL) fwD = loadFuncWordDict("ch_funcWordL.txt") wpD = loadWordPairDict("cedict_hacept_train.dict") #wpD = loadWordPairDict("hacept_train.dict") expList = [] for k, chSent in enumerate(chSentL): if k % 100 == 0: print k, enSent = enSentL[k] waSent = waL[k] gwaSent = gwaL[k] for wa in waSent: ID = 'ID' + str(k) + '--' + wa label = 'False' if wa in gwaSent: label = 'True' exp = Example(ID, label) i, j = int(wa.split('-')[0]), int(wa.split('-')[1]) exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD) expList.append(exp) outf = codecs.open(outF, 'w', 'utf-8') for exp in expList: outf.write(exp.__str__()) outf.close()