Esempio n. 1
0
def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged tf-idf rows and score it.

    The word/document structure returned by ``buildwd.buildWD`` supplies the
    document names (``wd[1]``), the vocabulary (``wd[2]``) and the subjects
    (``wd[3]``).  Each line of ``train_file`` whose first token is a known
    document name becomes one feature row: for every token from the third
    onward that (after ``buildwd.processWord``) is in the vocabulary, the
    matching ``idf[0]`` entry is added, and the sum is divided by the number
    of matched words.

    The first 70% of the rows are used for fitting; the remaining rows are
    passed to ``knn.score``.

    Args:
        train_file: path of the training file; read line by line.

    Returns:
        Whatever ``knn.score`` reports for the held-out 30% of rows
        (mean accuracy per the scikit-learn documentation).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Position of the first occurrence of every vocabulary entry, so each
    # word costs one dict lookup instead of an `in` scan plus `.index()`.
    # Assumes vocabulary entries and processed words are hashable (strings).
    rowIndex = {}
    for pos, name in enumerate(rownames):
        rowIndex.setdefault(name, pos)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    # `with` guarantees the file is closed even if a line fails to process.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token; skip them instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowIndex.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # A line with no known words stays all-zero rather than turning
            # into NaN through a division by zero.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Slice bounds must be integers: a float bound is a TypeError for lists
    # and for current NumPy arrays.
    cutoff = int(trainMat.shape[0] * 0.7)

    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Esempio n. 2
0
def tfidf_knn(train_file):
    """Train and evaluate a 10-NN classifier on per-line averaged tf-idf rows.

    ``buildwd.buildWD(train_file)`` provides the document names (index 1),
    the vocabulary (index 2) and the subjects (index 3).  Every line of
    ``train_file`` whose first token is one of the document names yields a
    row: the ``idf[0]`` entries of all vocabulary words found from the third
    token onward are summed and divided by how many were found.

    Rows are split 70/30 in file order: the leading part fits the
    classifier, the trailing part is handed to ``knn.score``.

    Args:
        train_file: path of the training file; read line by line.

    Returns:
        The result of ``knn.score`` on the trailing 30% of rows (mean
        accuracy per the scikit-learn documentation).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    width = wd[0].shape[1]

    # Map each vocabulary entry to its first position once, replacing the
    # per-word linear `in` + `.index()` scans.  Assumes entries and
    # processed words are hashable (strings).
    firstPos = {}
    for i, entry in enumerate(rownames):
        if entry not in firstPos:
            firstPos[entry] = i

    trainMat = np.zeros((len(colnames), width))
    matCol = 0
    # Context manager closes the file even when processing raises.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: there is no first token to compare.
                continue
            if words[0] not in colnames:
                continue
            trainRow = np.zeros(width)
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in firstPos:
                    numWords += 1
                    trainRow = trainRow + idf[0][firstPos[pword]]
            # Only average when something matched; otherwise the division
            # by zero would fill the row with NaN.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Integer split point; float slice bounds are rejected by lists and by
    # current NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)

    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[0:cutoff, :], trainVals[0:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Esempio n. 3
0
def get_glove_logreg(train_file, trainMat=None):
    """Fit a logistic-regression model on the first 70% of the GloVe rows.

    Args:
        train_file: path handed to ``buildGloveTrainMat`` (when no matrix is
            supplied) and to ``buildwd.buildWD``.
        trainMat: optional precomputed feature matrix; when omitted it is
            built with ``buildGloveTrainMat(train_file)``.

    Returns:
        A tuple ``(logreg, trainMat, trainVals)``: the fitted model, the full
        feature matrix and the full label sequence (not just the 70% used
        for fitting).
    """
    # Identity test: `== None` on an ndarray compares elementwise and its
    # truth value is ambiguous.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    wd = buildwd.buildWD(train_file, randomize=True)
    labels = wd[3]
    trainVals = buildwd.trainValsFromSubjects(labels)

    # Slice bounds must be integers; a float bound raises TypeError on lists
    # and on current NumPy arrays.
    cutoff = int(trainMat.shape[0] * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
Esempio n. 4
0
def glove_knn(train_file, trainMat=None):
    """Fit a 5-nearest-neighbour classifier on GloVe rows and score it.

    The first 70% of the rows fit the classifier; the remaining rows are
    passed to ``knn.score``.

    Args:
        train_file: path handed to ``buildGloveTrainMat`` (when no matrix is
            supplied) and to ``buildwd.buildWD``.
        trainMat: optional precomputed feature matrix; when omitted it is
            built with ``buildGloveTrainMat(train_file)``.

    Returns:
        The result of ``knn.score`` on the trailing 30% of rows (mean
        accuracy per the scikit-learn documentation).
    """
    # `is None` rather than `== None`: comparing an ndarray with `==` is
    # elementwise and cannot be used as a single condition.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    wd = buildwd.buildWD(train_file, randomize=True)
    labels = wd[3]
    trainVals = buildwd.trainValsFromSubjects(labels)

    # Integer split point; float slice bounds are rejected by lists and by
    # current NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)

    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
Esempio n. 5
0
def get_bag_logreg(train_file):
    """Fit logistic regression on row-normalised bag-of-words features.

    The matrix at index 0 of ``buildwd.buildWD(...)`` is transposed and each
    row is divided by its own sum.  The first 70% of the rows are used for
    fitting.

    Args:
        train_file: path handed to ``buildwd.buildWD`` (called with
            ``randomize=True, sentiment=True``).

    Returns:
        A tuple ``(logreg, trainMat, trainVals)``: the fitted model, the full
        normalised matrix and the full label sequence.
    """
    wd = buildwd.buildWD(train_file, randomize=True, sentiment=True)
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # Rows that sum to zero would divide to NaN; dividing them by 1 instead
    # leaves them all-zero.  `row_sums` is a fresh array, so editing it in
    # place does not touch `wd[0]`.
    row_sums[row_sums == 0] = 1
    trainMat = trainMat / row_sums[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Slice bounds must be integers; a float bound raises TypeError on lists
    # and on current NumPy arrays.
    cutoff = int(trainMat.shape[0] * 0.7)

    # Parenthesised form prints the same text under Python 2 and Python 3.
    print('Training bag_logreg...')
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
Esempio n. 6
0
def get_bag_knn(train_file):
    """Fit a 5-nearest-neighbour classifier on row-normalised bag-of-words rows.

    The matrix at index 0 of ``buildwd.buildWD(train_file)`` is transposed
    and each row is divided by its own sum.  The first 70% of the rows are
    used for fitting.

    Args:
        train_file: path handed to ``buildwd.buildWD``.

    Returns:
        A tuple ``(knn, trainMat, trainVals)``: the fitted classifier, the
        full normalised matrix and the full label sequence.
    """
    wd = buildwd.buildWD(train_file)
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # Avoid NaN rows: a row whose entries sum to zero is divided by 1 and so
    # stays all-zero.  `row_sums` is newly allocated by `.sum()`, so the
    # in-place edit is local.
    row_sums[row_sums == 0] = 1
    trainMat = trainMat / row_sums[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Integer split point; float slice bounds are rejected by lists and by
    # current NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)

    # Parenthesised form prints the same text under Python 2 and Python 3.
    print('Training bag_knn...')
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn, trainMat, trainVals
Esempio n. 7
0
def get_tfidf_logreg(train_file):
    """Fit logistic regression on shuffled, per-line averaged tf-idf rows.

    ``buildwd.buildWD(train_file)`` provides the document names (index 1),
    the vocabulary (index 2) and the subjects (index 3).  Every line of
    ``train_file`` whose first token is a known document name yields a row:
    the ``idf[0]`` entries of the vocabulary words found from the third token
    onward are summed and, when at least one matched, divided by the match
    count.  Rows and labels are then shuffled together with a fixed seed and
    the first 70% fit the model.

    Args:
        train_file: path of the training file; read line by line.

    Returns:
        A tuple ``(logreg, train, labels, cutoff)``: the fitted model, the
        shuffled list of feature rows, the shuffled list of labels, and the
        index separating the fitted part from the held-out part.

    Note:
        Reseeds the module-level ``random`` generator with 17.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # First-occurrence position of every vocabulary entry, so each word is a
    # single dict lookup instead of an `in` scan followed by `.index()`.
    # Assumes vocabulary entries and processed words are hashable (strings).
    rowIndex = {}
    for pos, name in enumerate(rownames):
        rowIndex.setdefault(name, pos)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    # `with` closes the file even if processing a line raises.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token; skip them instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowIndex.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Fixed seed keeps the split reproducible.  `list(...)` because
    # random.shuffle needs a mutable sequence, which a Python 3 range is not.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]
    cutoff = int(len(order) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
Esempio n. 8
0
def get_tfidf_logreg(train_file):
    """Train logistic regression on a seeded shuffle of averaged tf-idf rows.

    Document names, vocabulary and subjects come from indices 1, 2 and 3 of
    ``buildwd.buildWD(train_file)``.  For each line of ``train_file`` that
    starts with a known document name, the ``idf[0]`` entries of every
    vocabulary word among the tokens from the third onward are summed; the
    sum is divided by the number of matches when that number is positive.
    The rows and their labels are permuted with ``random`` seeded to 17 and
    the leading 70% are used for fitting.

    Args:
        train_file: path of the training file; read line by line.

    Returns:
        A tuple ``(logreg, train, labels, cutoff)``: the fitted model, the
        permuted list of feature rows, the permuted list of labels, and the
        split index between fitted and held-out entries.

    Note:
        Reseeds the module-level ``random`` generator with 17.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    width = wd[0].shape[1]

    # Build the word -> first position table once; this replaces a linear
    # `in` test plus a linear `.index()` call for every word of every line.
    # Assumes vocabulary entries and processed words are hashable (strings).
    firstPos = {}
    for i, entry in enumerate(rownames):
        if entry not in firstPos:
            firstPos[entry] = i

    trainMat = np.zeros((len(colnames), width))
    matCol = 0
    # Context manager closes the file even when processing raises.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: there is no first token to compare.
                continue
            if words[0] not in colnames:
                continue
            trainRow = np.zeros(width)
            numWords = 0
            for word in words[2:]:
                pword = buildwd.processWord(word)
                if pword in firstPos:
                    numWords += 1
                    trainRow = trainRow + idf[0][firstPos[pword]]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Seeded so repeated runs produce the same permutation.  The indices are
    # materialised as a list because random.shuffle mutates its argument.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = []
    labels = []
    for i in order:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(order) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff