Example #1
0
def _readUtf8TokenLines(path):
    """Return one whitespace-split token list per line of the UTF-8 file at path."""
    # The with-block closes the handle even if decoding fails part-way.
    with codecs.open(path, 'r', 'utf-8') as inf:
        return [line.split() for line in inf.readlines()]


def makeFeatFile(chF, enF, waF, outF, numProc):
    """Run extract() over three line-parallel files and write the examples to outF.

    Args:
        chF, enF, waF: Paths of UTF-8 text files that must have the same
            number of lines; every line is split on whitespace. (Presumably
            source sentences, target sentences and word-alignment links --
            confirm against extract().)
        outF: Destination path of the serialized examples.
        numProc: Number of worker processes. Values <= 1 run extract() in the
            current process.

    Raises:
        AssertionError: If the three input files differ in line count.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import shutil

    chSentL = _readUtf8TokenLines(chF)
    enSentL = _readUtf8TokenLines(enF)
    waL = _readUtf8TokenLines(waF)

    if not (len(chSentL) == len(enSentL) == len(waL)):
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError(
            "len chSentL == %d, len enSentL == %d, len waL == %d"
            % (len(chSentL), len(enSentL), len(waL)))

    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")

    # Wall-clock time: the parent's CPU clock does not include time spent in
    # the worker processes.
    s = time.time()
    if numProc > 1:
        total = len(chSentL)
        # Ceiling division gives at most numProc near-equal chunks, so no
        # worker is left with only the remainder. max() guards the empty case.
        chunkSize = max(1, -(-total // numProc))
        pool = mp.Pool(processes=numProc)
        try:
            pending = []
            for start in range(0, total, chunkSize):
                end = start + chunkSize  # slicing clamps at the list end
                # extract() gets the chunk's starting line index as its
                # fourth argument.
                pending.append(
                    pool.apply_async(extract,
                                     args=(chSentL[start:end],
                                           enSentL[start:end],
                                           waL[start:end], start, fwD, wpD)))
            pool.close()  # no further tasks will be submitted

            # Collect in submission order so the output keeps input order.
            expList = []
            for result in pending:
                expList.extend(result.get())
        finally:
            # Always reap the workers, including when a task raised.
            pool.terminate()
            pool.join()
    else:
        expList = extract(chSentL, enSentL, waL, 0, fwD, wpD)

    sys.stderr.write("\nextraction time: %f\n" % (time.time() - s))

    s = time.time()
    # Stage the output in /dev/shm as before, but under a per-process name so
    # concurrent runs cannot overwrite each other's staging file.
    tmpPath = "/dev/shm/tmp.%d" % os.getpid()
    try:
        with codecs.open(tmpPath, 'w', 'utf-8') as outf:
            for exp in expList:
                # __str__ is called directly, as in the original: under
                # Python 2, str(exp) would force an ASCII encode if __str__
                # returns unicode.
                outf.write(exp.__str__())
        sys.stderr.write("outputing time: %f\n" % (time.time() - s))
        # shutil.move handles any path safely (no shell parsing of outF) and
        # raises on failure instead of returning an ignored exit status.
        shutil.move(tmpPath, outF)
    finally:
        # Do not leave a stale staging file behind if writing or moving failed.
        if os.path.exists(tmpPath):
            os.remove(tmpPath)
Example #2
0
def make(chF, enF, gwaF, waF, outF):
    chSentL = [
        line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()
    ]
    enSentL = [
        line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()
    ]
    if gwaF == "None":
        gwaL = [[] for i in xrange(len(chSentL))]
    else:
        gwaL = [line.split() for line in open(gwaF).readlines()]
    waL = [line.split() for line in open(waF).readlines()]

    print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(
        enSentL), len(gwaL), len(waL)

    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")
    #wpD = loadWordPairDict("hacept_train.dict")

    expList = []
    for k, chSent in enumerate(chSentL):
        if k % 100 == 0: print k,
        enSent = enSentL[k]
        waSent = waL[k]
        gwaSent = gwaL[k]

        for wa in waSent:
            ID = 'ID' + str(k) + '--' + wa
            label = 'False'
            if wa in gwaSent:
                label = 'True'
            exp = Example(ID, label)
            i, j = int(wa.split('-')[0]), int(wa.split('-')[1])
            exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD)
            expList.append(exp)

    outf = codecs.open(outF, 'w', 'utf-8')
    for exp in expList:
        outf.write(exp.__str__())
    outf.close()
Example #3
0
def _readUtf8TokenLines(path):
    """Return one whitespace-split token list per line of the UTF-8 file at path."""
    # The with-block closes the handle even if decoding fails part-way.
    with codecs.open(path, 'r', 'utf-8') as inf:
        return [line.split() for line in inf.readlines()]


def makeFeatFile(chF, enF, waF, outF, numProc):
    """Run extract() over three line-parallel files and write the examples to outF.

    Args:
        chF, enF, waF: Paths of UTF-8 text files that must have the same
            number of lines; every line is split on whitespace. (Presumably
            source sentences, target sentences and word-alignment links --
            confirm against extract().)
        outF: Destination path of the serialized examples.
        numProc: Number of worker processes. Values <= 1 run extract() in the
            current process.

    Raises:
        AssertionError: If the three input files differ in line count.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import shutil

    chSentL = _readUtf8TokenLines(chF)
    enSentL = _readUtf8TokenLines(enF)
    waL = _readUtf8TokenLines(waF)

    if not (len(chSentL) == len(enSentL) == len(waL)):
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError(
            "len chSentL == %d, len enSentL == %d, len waL == %d"
            % (len(chSentL), len(enSentL), len(waL)))

    fwD = loadFuncWordDict("ch_funcWordL.txt")
    wpD = loadWordPairDict("cedict_hacept_train.dict")

    # Wall-clock time: the parent's CPU clock does not include time spent in
    # the worker processes.
    s = time.time()
    if numProc > 1:
        total = len(chSentL)
        # Ceiling division gives at most numProc near-equal chunks, so no
        # worker is left with only the remainder. max() guards the empty case.
        chunkSize = max(1, -(-total // numProc))
        pool = mp.Pool(processes=numProc)
        try:
            pending = []
            for start in range(0, total, chunkSize):
                end = start + chunkSize  # slicing clamps at the list end
                # extract() gets the chunk's starting line index as its
                # fourth argument.
                pending.append(
                    pool.apply_async(extract,
                                     args=(chSentL[start:end],
                                           enSentL[start:end],
                                           waL[start:end], start, fwD, wpD)))
            pool.close()  # no further tasks will be submitted

            # Collect in submission order so the output keeps input order.
            expList = []
            for result in pending:
                expList.extend(result.get())
        finally:
            # Always reap the workers, including when a task raised.
            pool.terminate()
            pool.join()
    else:
        expList = extract(chSentL, enSentL, waL, 0, fwD, wpD)

    sys.stderr.write("\nextraction time: %f\n" % (time.time() - s))

    s = time.time()
    # Stage the output in /dev/shm as before, but under a per-process name so
    # concurrent runs cannot overwrite each other's staging file.
    tmpPath = "/dev/shm/tmp.%d" % os.getpid()
    try:
        with codecs.open(tmpPath, 'w', 'utf-8') as outf:
            for exp in expList:
                # __str__ is called directly, as in the original: under
                # Python 2, str(exp) would force an ASCII encode if __str__
                # returns unicode.
                outf.write(exp.__str__())
        sys.stderr.write("outputing time: %f\n" % (time.time() - s))
        # shutil.move handles any path safely (no shell parsing of outF) and
        # raises on failure instead of returning an ignored exit status.
        shutil.move(tmpPath, outF)
    finally:
        # Do not leave a stale staging file behind if writing or moving failed.
        if os.path.exists(tmpPath):
            os.remove(tmpPath)
Example #4
0
def make(chF, enF, gwaF, waF, outF):
	chSentL = [line.split() for line in codecs.open(chF, 'r', 'utf-8').readlines()]
	enSentL = [line.split() for line in codecs.open(enF, 'r', 'utf-8').readlines()]
	if gwaF == "None":
		gwaL = [[] for i in xrange(len(chSentL))]
	else:
		gwaL = [line.split() for line in open(gwaF).readlines()]
	waL = [line.split() for line in open(waF).readlines()]

	print "len of chSentL, enSentL, gwaL, waL: ", len(chSentL), len(enSentL), len(gwaL), len(waL)

	fwD = loadFuncWordDict("ch_funcWordL.txt")
	wpD = loadWordPairDict("cedict_hacept_train.dict")
	#wpD = loadWordPairDict("hacept_train.dict")

	expList = []
	for k, chSent in enumerate(chSentL):
		if k % 100 == 0: print k,
		enSent = enSentL[k]
		waSent = waL[k]
		gwaSent = gwaL[k]

		for wa in waSent:
			ID = 'ID' + str(k) + '--' + wa
			label = 'False'
			if wa in gwaSent:
				label = 'True'
			exp = Example(ID, label)
			i, j = int(wa.split('-')[0]), int(wa.split('-')[1])
			exp.featList = extractFeat(i, j, chSent, enSent, wpD, fwD)
			expList.append(exp)
	
	outf = codecs.open(outF, 'w', 'utf-8')
	for exp in expList:
		outf.write(exp.__str__())
	outf.close()