def extract(chSentL, enSentL, waL, baseID, fwD, wpD):
    """Create one Example for every (i, j) token-position pair of each sentence pair.

    For sentence index k, every position i in chSentL[k] is paired with
    every position j in enSentL[k].  The example ID has the form
    "ID<baseID + k>--<i>-<j>", and the label is the string "True" when the
    token "<i>-<j>" occurs in waL[k], otherwise "False".

    Args:
        chSentL: list of token sequences (one per sentence).
        enSentL: list of token sequences, indexed in parallel with chSentL.
        waL: per-sentence collections of "i-j" link tokens, indexed in
            parallel with chSentL.  Assumed to be lists of hashable strings
            (e.g. the result of str.split()) -- TODO confirm with callers.
        baseID: number added to the sentence index when building the ID.
        fwD: passed through to extractFeat as its last argument.
        wpD: passed through to extractFeat as its fifth argument.

    Returns:
        A list of Example objects with their featList attribute filled in
        by extractFeat, ordered by sentence, then i, then j.

    Raises:
        IndexError: if enSentL or waL has fewer entries than chSentL.
    """
    expL = []
    for k, chSent in enumerate(chSentL):
        enSent = enSentL[k]
        # Build the lookup once per sentence so that each membership test in
        # the double loop below is O(1) instead of a scan over the link list.
        linkSet = frozenset(waL[k])
        for i, _ch in enumerate(chSent):
            for j, _en in enumerate(enSent):
                pair = str(i) + '-' + str(j)
                ID = "ID" + str(baseID + k) + '--' + pair
                label = "True" if pair in linkSet else "False"
                exp = Example(ID, label)
                # Note the argument order: extractFeat takes wpD before fwD.
                exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD)
                expL.append(exp)
    return expL
def make(chF, enF, gwaF, waF, outF): chSentL = [ line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines() ] enSentL = [ line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines() ] if gwaF == "None": gwaL = [[] for i in xrange(len(chSentL))] else: gwaL = [line.split() for line in open(gwaF).readlines()] waL = [line.split() for line in open(waF).readlines()] print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len( enSentL), len(gwaL), len(waL) fwD = loadFuncWordDict("ch_funcWordL.txt") wpD = loadWordPairDict("cedict_hacept_train.dict") #wpD = loadWordPairDict("hacept_train.dict") expList = [] for k, chSent in enumerate(chSentL): if k % 100 == 0: print k, enSent = enSentL[k] waSent = waL[k] gwaSent = gwaL[k] for wa in waSent: ID = 'ID' + str(k) + '--' + wa label = 'False' if wa in gwaSent: label = 'True' exp = Example(ID, label) i, j = int(wa.split('-')[0]), int(wa.split('-')[1]) exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD) expList.append(exp) outf = codecs.open(outF, 'w', 'utf-8') for exp in expList: outf.write(exp.__str__()) outf.close()
def make(chF, enF, gwaF, waF, outF): chSentL = [line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()] enSentL = [line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()] if gwaF == "None": gwaL = [[] for i in xrange(len(chSentL))] else: gwaL = [line.split() for line in open(gwaF).readlines()] waL = [line.split() for line in open(waF).readlines()] print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(enSentL), len(gwaL), len(waL) fwD = loadFuncWordDict("ch_funcWordL.txt") wpD = loadWordPairDict("cedict_hacept_train.dict") #wpD = loadWordPairDict("hacept_train.dict") expList = [] for k, chSent in enumerate(chSentL): if k % 100 == 0: print k, enSent = enSentL[k] waSent = waL[k] gwaSent = gwaL[k] for wa in waSent: ID = 'ID' + str(k) + '--' + wa label = 'False' if wa in gwaSent: label = 'True' exp = Example(ID, label) i, j = int(wa.split('-')[0]), int(wa.split('-')[1]) exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD) expList.append(exp) outf = codecs.open(outF, 'w', 'utf-8') for exp in expList: outf.write(exp.__str__()) outf.close()