Example #1
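This example (like the others on this page, it assumes the project-local modules NLU, senti_lexis, senti_set, and leastSquares plus re, numpy/np, scipy.sparse, and scikit-learn are already imported at module level) parses the annotated EECS corpus, collects one copy of each sentence per tagged class/instructor entity together with its sentiment label and tag strings, builds a vocabulary, and turns each sentence into distance-weighted sentiment-lexicon features. It then loads the precomputed train/test splits and averages cosine user similarities from the likes matrix over each split's test utterances; as shown here, the listing is cut off inside the split loop before any classifier is trained.
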
def main():
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    coriset = list()
    lastTagset = list()
    index = 0
    # to make cross validation work after sentences are duplicated for entities
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            coriset.append(isclass)
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
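        # Nflag is a small negation window: after senti_lexis.lexNegate fires for a
        # token, the next two tokens get their "negate" feature slot set to 1.0.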
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
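            # sentiz appears to hold (positive, negative, neutral) lexicon counts for
            # this token; each vocabulary id gets a 5-slot block
            # [count, positive, negative, neutral, negate], weighted by
            # 2.0 - mindist / 7.0 so tokens closer to the target entity count more.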
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)

    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        senti_utters = list()
        for j in range(0, len(splits[i][0])):
            senti_utters.append(utterances[splits[i][0][j]])
        likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        csims = np.array([0.0] * 38)
        totz = 0
        #for j in range(0, len(splits[i][0])):
        #	speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0]);
        #	cossim = leastSquares.cosineUserWE(likesMatrix, slist.index(speaker));
        #	np.add(csims, cossim);
        #	totz += 1;
        for j in range(0, len(splits[i][1])):
            speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            cossim = leastSquares.cosineUserWE(likesMatrix,
                                               slist.index(speaker))
            cossim = np.array(cossim)
            csims = np.add(csims, cossim)
            totz += 1
        for j in range(0, len(csims)):
            csims[j] /= totz
        print(csims.tolist())
Example #2
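This is the full training pipeline: it extracts per-entity sentences and sentiment targets from NLU slots (ENT_TYPES and ALL_IDS are module-level constants in the original project), builds the same distance-weighted lexicon features, grid-searches an RBF SVM over gamma and C with a simple 2-fold split of each training fold, evaluates on the held-out utterances, and pickles the best classifier per fold plus the feature dictionary. cprint is presumably a project-local print/logging helper.
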
def main():
    if not os.path.exists('classifiers'):
        os.makedirs('classifiers')

    allines = NLU.getALines()
    allU = NLU.getUtterances(allines)
    textLines = NLU.getTextLines(allU)
    slots = [NLU.getSlots(i) for i in allU]

    sents = list()
    targets = list()
    tagset = list()
    sent_to_xtc = dict()
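    # sent_to_xtc maps each utterance index to the indices of its per-entity
    # feature rows, since every tagged entity gets its own copy of the sentence.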

    index = 0
    for i in range(len(slots)):
        tstx = []
        for etype in ENT_TYPES:
            for j in range(len(slots[i][etype])):
                tstx.append(index)
                index += 1
                targets.append(slots[i][etype][j]['sentiment'])
                ttags = [
                    slots[i][etype][j][k] for k in ALL_IDS
                    if k in slots[i][etype][j]
                ]
                tagset.append(ttags)
                sents.append(textLines[i])
        sent_to_xtc[i] = tstx

    cprint('Number of Utterances: ' + str(index))
    cprint('Length of Lines: ' + str(len(sents)))
    cprint('Length of Targets: ' + str(len(targets)))

    cv = set()
    regex = re.compile(r'[^a-zA-Z0-9_\~\- ]+')
    for sent in range(0, len(sents)):
        parts = sents[sent].split(' ')
        for part in range(0, len(parts)):
            thepart = regex.sub('', parts[part])
            # corner case for hyphens
            hps = thepart.split('-')
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cprint('Vocabulary Size: ' + str(len(cv)))

    xtc = []
    for sent in range(0, len(sents)):
        #print('sentence: ' + str(sent))
        #print('s1: ' + str(sents[sent]))

        #print(sents[sent] + ' - with tagset - ' + str(tagset[sent]))
        #dparse = spwrap.parse(sents[sent])
        #print('DPARSE: ' + dparse)

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          ' ~~t~~ ' + tagset[sent][tag])
        #print(tokenSent)
        parts = regex.sub('', tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(' |-', parts)

        # remove empty parts from the sentence
        while '' in parts:
            parts.remove('')

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if '~~t~~' == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    #print('parts?: ' + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        #print('window features: ' + str(windowFeatures))

        #print('parts: ' + str(parts))
        row = []
        # featureMapG = [[0]*300]*4
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub('', parts[part])
            #thepart = thepart.lower()
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            #print(theid)
            #g_vec = glove_features.getGloveWord(glove_dict, parts[part])
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #	featureMapG[0][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[1][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[2][g_vi] += g_vec[g_vi];# - mindist/10.0
            #	featureMapG[3][g_vi] += g_vec[g_vi];# - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        # add on the glove features
        # for a in range(0, len(featureMapG)):
        # 	temp_vec = []
        # 	for a_a in range(0, len(featureMapG[a])):
        # 		temp_vec.append(featureMapG[a][a_a]*1.0/len(parts))
        # 	row.extend(temp_vec)
        xtc.append(row)

    #instead read the data from splits file
    fsplits = open('splits')
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(':')
        train = list()
        test = list()
        for s in parts[0][1:-1].split(', '):
            train.append(int(s))
        for s in parts[1][1:-1].split(', '):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0])
    #print(splits[0][1])

    #do gridsearch + evaluation
    fscores = open('scores_sentiment', 'w')
    bestsplit = -1
    BSscore = 0
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # add the utterance set generation here for senti_set
        # senti_utters = list()
        # for j in range(0, len(splits[i][0])):
        # 	senti_utters.append(utterances[splits[i][0][j]])
        #likesMatrix, slist = leastSquares.getMatrix(senti_utters)
        # do train-test split
        for j in range(0, len(splits[i][0])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][0][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            #print('\n' + speaker + ': ' + utterances[splits[i][0][j]][0].strip())
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][0][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][0][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #print('speaker: ' + str(speaker) + ' - ' + str(slist.index(speaker)))
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtrain.append(fvector)
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            #speaker = senti_set.getSpeaker(utterances[splits[i][1][j]][0])
            #cossim = leastSquares.consineUser(likesMatrix, slist.index(speaker))
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                #fvector = likesMatrix[slist.index(speaker)]
                #fvector = fvector.tolist()[0]
                fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
                #fvector.append(slist.index(speaker))
                ##############################################################
                #entity = tagset[sent_to_xtc[splits[i][1][j]][LL]]
                #entity = tagset2entity(entity)
                #gscore = leastSquares.getGuess(likesMatrix, entity, slist.index(speaker))
                #gscore = leastSquares.getWeightedGuess(cossim, likesMatrix, entity)
                #fvector.append(gscore)
                ########fvector = [gscore]
                ##############################################################
                xtest.append(fvector)
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = 0

        for gamma in numpy.linspace(0.0001, 0.05, 10):  #10steps
            for C in numpy.linspace(0.1, 10, 10):  #10steps
                #2 fold
                # integer division so the slice indices stay ints in Python 3
                x1 = xtrain[len(xtrain) // 2:]
                x2 = xtrain[:len(xtrain) // 2]
                y1 = ytrain[len(ytrain) // 2:]
                y2 = ytrain[:len(ytrain) // 2]
                x11 = csr_matrix(x1)
                x22 = csr_matrix(x2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x11, y1)
                score = clf.score(x22, y2)
                clf = svm.SVC(gamma=gamma, C=C)
                testout = clf.fit(x22, y2)
                score += clf.score(x11, y1)
                score /= 2
                if score > bestScore:
                    bestC = C
                    bestGamma = gamma
                    bestScore = score
                    cprint('Cross Validation Score: ' + str(score))
                    cprint('Gamma = ' + str(gamma) + ' and C = ' + str(C))

        ################ THIS IS FOR CvI EVALUATION ################
        #Ixtest = list()
        #Iytest = list()
        #Cxtest = list()
        #Cytest = list()
        #for j in range(0, len(splits[i][1])):
        #	for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
        #		fvector = xtc[sent_to_xtc[splits[i][1][j]][LL]]
        #		if coriset[sent_to_xtc[splits[i][1][j]][LL]]:
        #			Cxtest.append(fvector)
        #			Cytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #		else:
        #			Ixtest.append(fvector)
        #			Iytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        #xtrain = csr_matrix(xtrain)
        #Cxtest = csr_matrix(Cxtest)
        #Ixtest = csr_matrix(Ixtest)
        #clf = svm.SVC(gamma=bestGamma, C=bestC)
        #testout = clf.fit(xtrain, ytrain)
        #CBscore = clf.score(Cxtest, Cytest)
        #IBscore = clf.score(Ixtest, Iytest)
        #cprint('Actual Score: ' + str(CBscore) + ':' + str(IBscore))
        #fscores.write(str(CBscore) + ':' + str(IBscore) + '\n')
        #fscores.flush()
        ###############################################################
        ################ THIS IS FOR NORMAL EVALUATION ################
        xtrain = csr_matrix(xtrain)
        xtest = csr_matrix(xtest)
        clf = svm.SVC(gamma=bestGamma, C=bestC)
        testout = clf.fit(xtrain, ytrain)
        bestScore = clf.score(xtest, ytest)
        cprint('Actual Score: ' + str(bestScore))
        fscores.write(str(bestScore) + '\n')
        ###############################################################
        # save best classifier per fold
        # pickle.dumps returns bytes, so open the file in binary mode
        cString = pickle.dumps(clf)
        fsave1 = open('classifiers/sentiment_classifier' + str(i), 'wb')
        fsave1.write(cString)
        fsave1.close()

    fscores.close()
    # save feature dictionary
    cvString = pickle.dumps(cv)
    fsave2 = open('sentiment_dictionary', 'wb')
    fsave2.write(cvString)
    fsave2.close()
Example #3
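This example evaluates one fold of the open-domain targeted-sentiment data: it reads the fold's train/test files and a file of predicted entities, aligns the predictions with the gold B-ORGANIZATION/B-PERSON targets to count guessed/actual/correct entities, builds the same distance-weighted feature vectors, and grid-searches an SVM on the fold's train/test split.
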
def runfold(fold):
	# read files
	found_ents = open("fold" + str(fold) + "_found_entities")
	train_file = open("../../data/Open Domain Targeted Sentiment/en/10-fold/train." + str(fold))
	test_file = open("../../data/Open Domain Targeted Sentiment/en/10-fold/test." + str(fold))
	train_lines = train_file.readlines()
	test_lines = test_file.readlines()
	found_lines = found_ents.readlines()
	train_file.close()
	test_file.close()
	found_ents.close()
	# evaluation metrics
	ent_guessed = 0
	ent_actual = 0
	ent_correct = 0
	# compute
	utterances = list()
	tlist = list()
	targets = list()
	sents = list()
	ents = list()
	for line in train_lines:
		tl = line.strip()
		if tl == "":
			if tlist:
				utterances.append(tlist)
				tlist = list()
		else:
			if not tl.startswith("## Tweet"):
				tlist.append(tl.split("\t"))
	if tlist:
		utterances.append(tlist)
		tlist = list()
	splitpoint = len(utterances)
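	# splitpoint marks where the test utterances begin in the combined utterances list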
	for line in test_lines:
		tl = line.strip()
		if tl == "":
			if tlist:
				utterances.append(tlist)
				tlist = list()
		else:
			if not tl.startswith("## Tweet"):
				tlist.append(tl.split("\t"))
	if tlist:
		utterances.append(tlist)
	#cprint("Utterances: " + str(len(utterances)))
	######## figure out what to do with the predicted entities
	print("LEN OF FLINES: " + str(len(found_lines)))
	print("LEN OF UTTERANCES: " + str(len(utterances) - splitpoint))
	found_entities = list()
	for f_line in found_lines:
		PTZ = f_line.strip()  #.split("::::::")
		if PTZ != "[]":  #PTZ[0]
			partf = PTZ[2:-2].split("', '")  #PTZ[0]
			found_entities.append(partf)
		else:
			found_entities.append([])
	print("LEN OF PARSED FLINES: " + str(len(found_entities)))
	#split tlist into targets and sents
	splitpoint2 = 0
	count = 0
	for utt in utterances:
		sent = ""
		target = list()
		ent = list()
		f_ents = None
		if count >= splitpoint:
			f_ents = found_entities[count - splitpoint]
		_t = None
		for i in utt:
			if sent != "":
				sent += " "
			sent += i[0]
			if i[1] == "B-ORGANIZATION" or i[1] == "B-PERSON":
			#if i[1][0] == "B":
				target.append(i[0])
				ent.append(i[2])
			#print(str(i[0]) + ":" + str(i[1][0]) + ":" + str(i[2]))
		## Figure out if entity was missed or not
		#### debug if block
		#if f_ents != None:
		#	print(ent)
		#	print(target)
		#	print(f_ents)
		#	if count == 2117:
		#		break
		if f_ents != None:
			#ent_guessed += len(f_ents);#overcounts by including _
			#ent_actual += len(target);#overcounts by including _
			for T_T in range(0, len(target)):
				if ent[T_T] != "_" and ent[T_T] != "neutral":
					ent_actual += 1
			TOTEST_targets = list()
			TOTEST_ents = list()
			for T_T in f_ents:
				if T_T in target:
					idx = target.index(T_T)
					TOTEST_targets.append(T_T)
					TOTEST_ents.append(ent[idx])
					if ent[idx] != "_" and ent[idx] != "neutral":
						ent_guessed += 1
				else:
					ent_guessed += 1
			#print(TOTEST_targets)
			#print(TOTEST_ents)
			ents.extend(TOTEST_ents)
			for a in range(0, len(TOTEST_ents)):
				sents.append(sent)
			targets.extend(TOTEST_targets)
		else:
			ents.extend(ent)
			for a in range(0, len(ent)):
				sents.append(sent)
			targets.extend(target)
		######## split point counters
		if count > splitpoint and splitpoint2 == 0:
			splitpoint2 = len(targets)
		count += 1
	#return;########################################################################

	#print("SPLIT POINT 1: " + str(splitpoint))
	#print("SPLIT POINT 2: " + str(splitpoint2))

	print("LEN TARGETS: " + str(len(targets)));#ntities
	print("LEN SENTS: " + str(len(sents)));#sentences
	print("LEN ENTS: " + str(len(ents)));#sentiments
	print("LEN REAL TARGETS: " + str(len(found_entities)))#real entities

	# Generate vocab
	cv = set()
	regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
	for sent in range(0, len(sents)):
		parts = sents[sent].split(" ")
		for part in range(0, len(parts)):
			thepart = regex.sub("", parts[part])
			# corner case for hyphens
			hps = thepart.split("-")
			if len(hps) > 1:
				for hi in range(0, len(hps)):
					cv.add(hps[hi].lower())
			# end corner case for hyphens
			thepart = thepart.lower()
			cv.add(thepart)
	for sent in range(0, len(sents)):
		tokenSent = sents[sent]
		tokenSent = tokenSent.replace(targets[sent], " ~~t~~ " + targets[sent])
		parts = regex.sub("", tokenSent)
		parts = re.split(" |-", parts)
		while "" in parts:
			parts.remove("")
		windowFeatures = []
		done = False
		while not done:
			for part in range(0, len(parts)):
				if "~~t~~" == parts[part]:
					windowFeatures += [part]
					parts.remove(parts[part])
					#print("parts?: " + str(parts))
					break
				if part == len(parts) - 1:
					done = True
		for part in range(0, len(parts)):
			thepart = parts[part].lower()
			if thepart not in cv:
				cv.add(thepart)
	cv = list(cv)
	#cprint("Vocabulary Size: " + str(len(cv)))


	# Generate the feature vectors
	xtc = []
	xtcT = []
	train_ents = []
	test_ents = []
	for sent in range(0, len(sents)):
		# add token boundaries to the sentence
		tokenSent = sents[sent]
		#print(targets[sent])
		tokenSent = tokenSent.replace(targets[sent], " ~~t~~ " + targets[sent])
		#print(tokenSent)
		parts = regex.sub("", tokenSent)
		# this handles split and hyphen corner case
		parts = re.split(" |-", parts)

		# remove empty parts from the sentence
		while "" in parts:
			parts.remove("")

		# locate window feature indices
		windowFeatures = []
		done = False
		while not done:
			for part in range(0, len(parts)):
				if "~~t~~" == parts[part]:
					windowFeatures += [part]
					parts.remove(parts[part])
					#print("parts?: " + str(parts))
					break
				if part == len(parts) - 1:
					done = True
		#print("window features: " + str(windowFeatures))

		#print("parts: " + str(parts))
		row = []
		featureMap = {}
		Nflag = 0
		for part in range(0, len(parts)):
			#thepart = regex.sub("", parts[part])
			#thepart = thepart.lower()
			thepart = parts[part].lower()
			if thepart not in cv:
				cv.append(thepart)
			theid = cv.index(thepart)
			#print(theid)
			mindist = 999
			for wf in range(0, len(windowFeatures)):
				##############################################################
				## This is the distance measure for window linear distance!
				distance = abs(windowFeatures[wf] - part)
				##############################################################
				## This is the distance measure for dependency tree distance!
				## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse)
				##############################################################
				if distance < mindist:
					mindist = distance
			mindist += 1
			sentiz = senti_lexis.lexCounts(thepart)
			if theid in featureMap:
				# featureMap[theid] += 1.0 / mindist
				featureMap[theid][0] += 2.0 - mindist / 7.0
				featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
				featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
				featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
				if Nflag > 0:
					featureMap[theid][4] = 1.0
			else:
				# featureMap[theid] = 1.0 / mindist
				# count, positive, negative, neutral, negate
				featureMap[theid] = [0, 0, 0, 0, 0]
				featureMap[theid][0] = 2.0 - mindist / 7.0
				featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
				featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
				featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
				if Nflag > 0:
					featureMap[theid][4] = 1.0
			if Nflag > 0:
				Nflag -= 1
			if senti_lexis.lexNegate(thepart):
				Nflag = 2
		for i in range(0, len(cv)):
			if i in featureMap:
				row.extend(featureMap[i])
			else:
				row.extend([0, 0, 0, 0, 0])
		if sent < splitpoint2:
			#print("ROW: " + str(len(row)))
			#print("LABEL: " + str(ents[sent]))
			xtc.append(row)
			train_ents.append(ents[sent])
		else:
			xtcT.append(row)
			test_ents.append(ents[sent])

	dist = numpy.array(ents)
	#print((dist=="neutral").sum())
	#print((dist=="negative").sum())
	#print((dist=="positive").sum())
	#print((dist=="_").sum())

	#print("LENTR: " + str(len(xtc)))
	#print("LENTE: " + str(len(xtcT)))
	print("LEN TRAIN ENTS: " + str(len(train_ents)))
	print("LEN TEST ENTS: " + str(len(test_ents)))

	#do gridsearch + evaluation
	bestC = 0
	bestGamma = 0
	bestScore = 0
	xtest = list()
	xtrain = list()
	ytest = list()
	ytrain = list()
	# do train-test split
	for j in range(0, len(xtc)):
		LB = train_ents[j]
		if LB != "_" and LB != "neutral":
			xtrain.append(xtc[j])
			ytrain.append(LB)
	for j in range(0, len(xtcT)):
		LB = test_ents[j]
		if LB != "_" and LB != "neutral":
			xtest.append(xtcT[j])
			ytest.append(LB)
	score = 0

	print("LEN TRAIN: " + str(len(ytrain)))
	print("LEN TEST: " + str(len(ytest)))

	#print(xtrain)
	#print(len(xtrain))
	#print(len(xtrain[0]))
	#print(len(xtrain[1]))
	#print(len(xtrain[2]))
	#print(len(xtrain[3]))
	#print(len(xtrain[4]))
	#for __ in xtrain:
		#if len(__) != 56410:
		#	print len(__)
	for gamma in numpy.linspace(0.0001, 0.05, 10):#10steps
		for C in numpy.linspace(0.1, 10, 10):#10steps
			#gamma = 0.005644444444444444
			#C = 6.0
			xtrain = csr_matrix(xtrain)
			xtest = csr_matrix(xtest)
			clf = svm.SVC(gamma=gamma, C=C)
			testout = clf.fit(xtrain, ytrain)
			score = clf.score(xtest, ytest)
			if score > bestScore:
				bestC = C
				bestGamma = gamma
				bestScore = score
				cprint("Cross Validation Score: " + str(score))
				cprint("Gamma = " + str(gamma) + " and C = " + str(C))

	################ THIS IS FOR NORMAL EVALUATION ################
	xtrain = csr_matrix(xtrain)
	xtest = csr_matrix(xtest)
	clf = svm.SVC(gamma=bestGamma, C=bestC)
	testout = clf.fit(xtrain, ytrain)
	bestScore = clf.score(xtest, ytest)
	#print(clf.predict(xtest))
	ent_correct = (clf.predict(xtest) == ytest).sum()
	cprint("Actual Score: " + str(bestScore))
	###############################################################
	print(str(ent_guessed) + "\t" + str(ent_actual) + "\t" + str(ent_correct))
Example #4
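getClassLabel featurizes a single utterance at prediction time: it wraps each non-empty entity value with the ~~t~~ boundary marker, computes the same distance-weighted lexicon features over the supplied tokens using the saved vocabulary cv, and returns the prediction of an already-trained classifier clf (the BIO argument is accepted but not used in the body shown).
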
def getClassLabel(utterance, entity, BIO, tokens, cv, clf):
    regex = re.compile(r"[^a-zA-Z0-9_\~ ]+")

    # add token boundaries to the sentence
    tokenSent = utterance
    # use the ID, not the department...
    for tag in entity:
        if "" != entity[tag]:
            print("eT:" + str(entity[tag]))
            tokenSent = tokenSent.replace(entity[tag], " ~~t~~ " + entity[tag])
    #print(tokenSent)
    parts = regex.sub("", tokenSent).split(" ")

    # remove empty parts from the sentence
    while "" in parts:
        parts.remove("")

    # locate window feature indices
    windowFeatures = []
    done = False
    while not done:
        for part in range(0, len(parts)):
            if "~~t~~" == parts[part]:
                windowFeatures += [part]
                parts.remove(parts[part])
                #print("parts?: " + str(parts))
                break
            if part == len(parts) - 1:
                done = True
    print("window features: " + str(windowFeatures))

    for i in range(0, len(tokens)):
        tokens[i] = regex.sub("", tokens[i])
    row = []
    featureMapG = [[0] * 300] * 4
    featureMap = {}
    Nflag = 0
    for part in range(0, len(tokens)):
        thepart = tokens[part].lower()
        if thepart in cv:
            theid = cv.index(thepart)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                distance = abs(windowFeatures[wf] - part)
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            #for g_vi in range(0, len(g_vec)):
            #	featureMapG[0][g_vi] += g_vec[g_vi]# - mindist/10.0
            #	featureMapG[1][g_vi] += g_vec[g_vi]# - mindist/10.0
            #	featureMapG[2][g_vi] += g_vec[g_vi]# - mindist/10.0
            #	featureMapG[3][g_vi] += g_vec[g_vi]# - mindist/10.0
            if theid in featureMap:
                # 1.0 - mindist / 10.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist
                featureMap[theid][0] += 1.0 - mindist / 10.0
                featureMap[theid][1] += (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] += (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] += (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 1.0 - mindist / 10.0
                featureMap[theid][1] = (1.0 - mindist / 10.0) * sentiz[0]
                featureMap[theid][2] = (1.0 - mindist / 10.0) * sentiz[1]
                featureMap[theid][3] = (1.0 - mindist / 10.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
    for i in range(0, len(cv)):
        if i in featureMap:
            row.extend(featureMap[i])
        else:
            row.extend([0, 0, 0, 0, 0])
    # make prediction
    return clf.predict([row])
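
A minimal usage sketch, not part of the original listing: it assumes getClassLabel is importable alongside the artifacts pickled by Example #2, and the entity/tokens/BIO values below are guesses inferred from how getClassLabel reads its arguments.

import pickle

# load the vocabulary and the fold-0 classifier saved by Example #2 (assumed paths)
with open("sentiment_dictionary", "rb") as f:
    cv = pickle.load(f)
with open("classifiers/sentiment_classifier0", "rb") as f:
    clf = pickle.load(f)

utterance = "I really enjoyed EECS 280 last fall"
entity = {"name": "EECS 280", "id": "", "department": ""}  # assumed key names
tokens = utterance.split(" ")
BIO = ["O"] * len(tokens)  # not read by the body of getClassLabel shown above

print(getClassLabel(utterance, entity, BIO, tokens, cv, clf))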
Example #5
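This is a baseline variant of Example #1: it builds the same sentences, targets, and feature rows, then scores each predefined split as the fraction of "neutral" labels in that split's training targets (i.e., the training accuracy of always predicting neutral) and writes the scores to baseline_scores.
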
def main():
    fo = open("EECS_annotated_samples_anonymized", "r")
    lines = fo.readlines()
    utterances = NLU.getUtterances(lines)
    mode = False
    sents = list()
    targets = list()
    lastTaken = ""
    lastSent = ""
    isclass = False
    tagset = list()
    lastTagset = list()
    index = 0
    # to make cross validation work after sentences are duplicated for entities
    sent_to_xtc = dict()
    sent_to_xtc[0] = list()
    for i in range(len(lines)):
        data = lines[i].strip()
        if "" == data:
            index += 1
            sent_to_xtc[index] = list()
        if data.startswith("<class") or data.startswith("<instructor"):
            mode = True
            lastTaken = ""
            lastTagset = list()
        if data.startswith("<class"):
            isclass = True
        if mode and data.startswith("sentiment="):
            lastTaken = data[10:]
            if lastTaken.endswith(">"):
                lastTaken = lastTaken[:-1]
        if mode and data.startswith("name="):
            temp = data[5:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("id="):
            temp = data[3:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if mode and data.startswith("department="):
            temp = data[11:]
            if temp.endswith(">"):
                temp = temp[:-1]
            lastTagset.append(temp)
        if not mode and "" != data:
            lastSent = data
        if data.endswith(">"):
            mode = False
            isclass = False
            sents.append(lastSent)
            tagset.append(lastTagset)
            sent_to_xtc[index].append(len(sents) - 1)
            if lastTaken == "":
                targets.append("neutral")
            else:
                targets.append(lastTaken)

    # This will print out mapping from sentences to entity vectors (XTC)
    #foutest = open("outtestJ", "w");
    #for key in sent_to_xtc:
    #	foutest.write(str(key) + " : " + str(sent_to_xtc[key]) + "\n");
    #foutest.flush();
    #foutest.close();

    #randomly sample utterances
    #testdata = random.sample(range(0, index), index/5);

    print("number of utterances: " + str(index))
    print("length of lines: " + str(len(sents)))
    print("length of targets: " + str(len(targets)))
    print("sent 2: " + str(sents[2]))
    print("tagset 2: " + str(tagset[2]))

    cv = set()
    regex = re.compile(r"[^a-zA-Z0-9_\~\- ]+")
    for sent in range(0, len(sents)):
        parts = sents[sent].split(" ")
        for part in range(0, len(parts)):
            thepart = regex.sub("", parts[part])
            # corner case for hyphens
            hps = thepart.split("-")
            if len(hps) > 1:
                for hi in range(0, len(hps)):
                    cv.add(hps[hi].lower())
            # end corner case for hyphens
            thepart = thepart.lower()
            cv.add(thepart)
    cv = list(cv)
    cv.append("452")
    #bug?
    print("vocabulary size: " + str(len(cv)))
    print("index of I: " + str(cv.index("i")))
    xtc = []
    for sent in range(0, len(sents)):
        print("sentence: " + str(sent))
        print("s1: " + str(sents[sent]))

        #print(sents[sent] + " - with tagset - " + str(tagset[sent]));
        #dparse = spwrap.parse(sents[sent]);
        #print("DPARSE: " + dparse);

        # add token boundaries to the sentence
        tokenSent = sents[sent]
        for tag in range(0, len(tagset[sent])):
            tokenSent = tokenSent.replace(tagset[sent][tag],
                                          " ~~t~~ " + tagset[sent][tag])
        print(tokenSent)
        parts = regex.sub("", tokenSent)
        # this handles split and hyphen corner case
        parts = re.split(" |-", parts)

        # remove empty parts from the sentence
        while "" in parts:
            parts.remove("")

        # locate window feature indices
        windowFeatures = []
        done = False
        while not done:
            for part in range(0, len(parts)):
                if "~~t~~" == parts[part]:
                    windowFeatures += [part]
                    parts.remove(parts[part])
                    print("parts?: " + str(parts))
                    break
                if part == len(parts) - 1:
                    done = True
        print("window features: " + str(windowFeatures))

        print("parts: " + str(parts))
        row = []
        featureMap = {}
        Nflag = 0
        for part in range(0, len(parts)):
            #thepart = regex.sub("", parts[part]);
            #thepart = thepart.lower();
            thepart = parts[part].lower()
            theid = cv.index(thepart)
            print(theid)
            mindist = 999
            for wf in range(0, len(windowFeatures)):
                ##############################################################
                ## This is the distance measure for window linear distance!
                distance = abs(windowFeatures[wf] - part)
                ##############################################################
                ## This is the distance measure for dependency tree distance!
                ## distance = spwrap.treeDistance(parts[windowFeatures[wf]], parts[part], dparse);
                ##############################################################
                if distance < mindist:
                    mindist = distance
            mindist += 1
            sentiz = senti_lexis.lexCounts(thepart)
            if theid in featureMap:
                # 2.0 - mindist / 7.0 worked well for the first distance measure...
                # featureMap[theid] += 1.0 / mindist;
                featureMap[theid][0] += 2.0 - mindist / 7.0
                featureMap[theid][1] += (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] += (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] += (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            else:
                # featureMap[theid] = 1.0 / mindist;
                # count, positive, negative, neutral, negate
                featureMap[theid] = [0, 0, 0, 0, 0]
                featureMap[theid][0] = 2.0 - mindist / 7.0
                featureMap[theid][1] = (2.0 - mindist / 7.0) * sentiz[0]
                featureMap[theid][2] = (2.0 - mindist / 7.0) * sentiz[1]
                featureMap[theid][3] = (2.0 - mindist / 7.0) * sentiz[2]
                if Nflag > 0:
                    featureMap[theid][4] = 1.0
            if Nflag > 0:
                Nflag -= 1
            if senti_lexis.lexNegate(thepart):
                Nflag = 2
        for i in range(0, len(cv)):
            if i in featureMap:
                row.extend(featureMap[i])
            else:
                row.extend([0, 0, 0, 0, 0])
        xtc.append(row)

    #cv = CountVectorizer();
    #xtc = cv.fit_transform(sents);

    #examining data structures here
    #parts = sents[0].split(" ");
    #for part in range(0, len(parts)):
    #	print("PART: " + parts[part]);
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'i')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'took')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u'don')));
    #print("WORD DONT: " + str(cv.vocabulary_.get(u't')));
    #print("WORD TAKE: " + str(cv.vocabulary_.get(u'183')));
    #print(str(xtc.shape));
    #print("ROW0");
    #print(xtc[0]);
    #print("ROW1");
    #print(xtc[1]);
    print("ROW2")
    print(xtc[2])
    print(len(xtc[2]))
    #print(type(xtc[0]));
    #print(type(xtc));
    #print(str(len(sents)));
    #endtest

    #xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(xtc, targets, test_size=0.2, random_state=0);

    #use this section of code to do cross validation.
    #shuffle and split into Nfolds parts.
    #testdata = range(0, index);
    #random.shuffle(testdata);
    #folds = list();
    #Nfolds = 10;
    #fsavef = open("folds", "w");
    #for i in range(0, Nfolds):
    #	print("i = " + str(i));
    #	nthfold = testdata[i*index/Nfolds:(i+1)*index/Nfolds];
    #	folds.append(nthfold);
    #	fsavef.write(str(nthfold) + "\n");
    #	print("fold(" + str(i) + "): " + str(nthfold));
    #fsavef.flush();
    #fsavef.close();

    #instead read the data from splits file
    fsplits = open("splits")
    lines = fsplits.readlines()
    splits = list()
    for i in range(0, len(lines)):
        parts = lines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    #test print the first split
    #print(splits[0][0]);
    #print(splits[0][1]);

    #do gridsearch + evaluation
    fscores = open("baseline_scores", "w")
    for i in range(0, len(splits)):
        bestC = 0
        bestGamma = 0
        bestScore = 0
        xtest = list()
        xtrain = list()
        ytest = list()
        ytrain = list()
        # do train-test split
        for j in range(0, len(splits[i][0])):
            # VECTOR is 38 x 141 -> 264 total
            for LL in range(0, len(sent_to_xtc[splits[i][0][j]])):
                ytrain.append(targets[sent_to_xtc[splits[i][0][j]][LL]])
        for j in range(0, len(splits[i][1])):
            for LL in range(0, len(sent_to_xtc[splits[i][1][j]])):
                ytest.append(targets[sent_to_xtc[splits[i][1][j]][LL]])
        score = ytrain.count("neutral") * 1.0 / len(ytrain)

        print("Actual Score: " + str(score))
        fscores.write(str(score) + "\n")
        fscores.flush()
    fscores.close()