Ejemplo n.º 1
0
def extract(chSentL, enSentL, waL, baseID, fwD, wpD):
    """Create one labelled Example for every position pair of each sentence pair.

    For sentence number k, every index pair (i, j) over chSentL[k] and
    enSentL[k] produces an Example whose ID is "ID<baseID + k>--<i>-<j>".
    The label is the string "True" when "<i>-<j>" is found in waL[k] and
    "False" otherwise. The feature list is whatever
    extractFeat(i, j, chSent, enSent, wpD, fwD) returns.

    Args:
        chSentL: sentences to iterate; drives the outer loop.
        enSentL: indexed with the same sentence number as chSentL.
        waL: per-sentence containers queried with `in` for "i-j" strings
            (presumably word-alignment links -- confirm with the caller).
        baseID: value added to the sentence number when building the ID.
        fwD: forwarded unchanged to extractFeat.
        wpD: forwarded unchanged to extractFeat.

    Returns:
        A list of Example objects ordered by sentence, then i, then j.
    """
    examples = []
    for sentNo, chSent in enumerate(chSentL):
        enSent = enSentL[sentNo]
        waSent = waL[sentNo]
        for chPos, _ in enumerate(chSent):
            for enPos, _ in enumerate(enSent):
                # "i-j" is both the tail of the ID and the membership key.
                link = str(chPos) + '-' + str(enPos)
                exampleID = "ID" + str(baseID + sentNo) + '--' + link
                label = "True" if link in waSent else "False"
                example = Example(exampleID, label)
                example.featList = extractFeat(
                    chPos, enPos, chSent, enSent, wpD, fwD)
                examples.append(example)

    return examples
Ejemplo n.º 2
0
def extract(chSentL, enSentL, waL, baseID, fwD, wpD):
	"""Produce an Example for each (i, j) position pair of every sentence pair.

	Sentence k pairs chSentL[k] with enSentL[k]. Each pair of positions
	(i, j) gets the ID "ID<baseID + k>--<i>-<j>" and the label "True" if
	the string "<i>-<j>" is in waL[k], else "False". Features are taken
	from extractFeat(i, j, chSent, enSent, wpD, fwD).

	Args:
		chSentL: sentences to iterate; drives the outer loop.
		enSentL: indexed with the same sentence number as chSentL.
		waL: per-sentence containers queried with `in` for "i-j" strings
			(presumably word-alignment links -- confirm with the caller).
		baseID: value added to the sentence number when building the ID.
		fwD: forwarded unchanged to extractFeat.
		wpD: forwarded unchanged to extractFeat.

	Returns:
		A list of Example objects ordered by sentence, then i, then j.
	"""
	collected = []
	for sentIdx, chSent in enumerate(chSentL):
		enSent = enSentL[sentIdx]
		waSent = waL[sentIdx]
		for srcIdx, _ in enumerate(chSent):
			for tgtIdx, _ in enumerate(enSent):
				# The same "i-j" text ends the ID and is the lookup key.
				pairKey = str(srcIdx) + '-' + str(tgtIdx)
				pairID = "ID" + str(baseID + sentIdx) + '--' + pairKey
				if pairKey in waSent:
					pairLabel = "True"
				else:
					pairLabel = "False"
				item = Example(pairID, pairLabel)
				item.featList = extractFeat(
					srcIdx, tgtIdx, chSent, enSent, wpD, fwD)
				collected.append(item)

	return collected
Ejemplo n.º 3
0
def make(chF, enF, gwaF, waF, outF):
    chSentL = [
        line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()
    ]
    enSentL = [
        line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()
    ]
    if gwaF == "None":
        gwaL = [[] for i in xrange(len(chSentL))]
    else:
        gwaL = [line.split() for line in open(gwaF).readlines()]
    waL = [line.split() for line in open(waF).readlines()]

    print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(
        enSentL), len(gwaL), len(waL)

    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")
    #wpD = loadWordPairDict("hacept_train.dict")

    expList = []
    for k, chSent in enumerate(chSentL):
        if k % 100 == 0: print k,
        enSent = enSentL[k]
        waSent = waL[k]
        gwaSent = gwaL[k]

        for wa in waSent:
            ID = 'ID' + str(k) + '--' + wa
            label = 'False'
            if wa in gwaSent:
                label = 'True'
            exp = Example(ID, label)
            i, j = int(wa.split('-')[0]), int(wa.split('-')[1])
            exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD)
            expList.append(exp)

    outf = codecs.open(outF, 'w', 'utf-8')
    for exp in expList:
        outf.write(exp.__str__())
    outf.close()
Ejemplo n.º 4
0
def make(chF, enF, gwaF, waF, outF):
	chSentL = [line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()]
	enSentL = [line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()]
	if gwaF == "None":
		gwaL = [[] for i in xrange(len(chSentL))]
	else:
		gwaL = [line.split() for line in open(gwaF).readlines()]
	waL = [line.split() for line in open(waF).readlines()]

	print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(enSentL), len(gwaL), len(waL)

	fwD = loadFuncWordDict("ch_funcWordL.txt")
	wpD = loadWordPairDict("cedict_hacept_train.dict")
	#wpD = loadWordPairDict("hacept_train.dict")

	expList = []
	for k, chSent in enumerate(chSentL):
		if k % 100 == 0: print k,
		enSent = enSentL[k]
		waSent = waL[k]
		gwaSent = gwaL[k]

		for wa in waSent:
			ID = 'ID' + str(k) + '--' + wa
			label = 'False'
			if wa in gwaSent:
				label = 'True'
			exp = Example(ID, label)
			i, j = int(wa.split('-')[0]), int(wa.split('-')[1])
			exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD)
			expList.append(exp)
	
	outf = codecs.open(outF, 'w', 'utf-8')
	for exp in expList:
		outf.write(exp.__str__())
	outf.close()