import random

import numpy as np
from sklearn import linear_model

# Project-local modules referenced by these snippets.
import sentiment_buildwd
import tweetprocess


def tfidf_logreg(train_file):
    # Build the word-document count matrix plus its row/column labels.
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]

    # Transpose to (documents x words) and normalize each row to sum to 1,
    # so every example is a term-frequency vector over the training vocabulary.
    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    trainMat = trainMat / row_sums[:, np.newaxis]

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE: shuffle example order with a fixed seed so the split is reproducible.
    random.seed(17)
    shuffle = list(range(len(subjects)))  # list() so random.shuffle works on Python 3
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(shuffle) * 0.7)  # 70/30 train/validation split

    # Fit logistic regression on the first 70% of the shuffled examples and hand
    # back the model together with the vocabulary used to build its features.
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, rownames
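

# --- Usage sketch (not part of the original snippet) ---
# A minimal, hedged example of how the (model, vocabulary) pair returned above
# might be applied to a new tweet, assuming the same tokenize/processWord
# pipeline used during training and that trainMat's columns follow rownames
# order. predict_tweet_sentiment and new_tweet are hypothetical names
# introduced only for illustration.
def predict_tweet_sentiment(logreg, rownames, new_tweet):
    # Build a term-frequency vector over the training vocabulary, in the same
    # word order as rownames, then normalize it to sum to 1 as above.
    counts = np.zeros(len(rownames))
    for word in tweetprocess.tokenize(new_tweet):
        pword = sentiment_buildwd.processWord(word)
        if pword in rownames:
            counts[rownames.index(pword)] += 1
    total = counts.sum()
    if total == 0:
        return None  # the tweet contains no known vocabulary words
    return logreg.predict([counts / total])[0]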


def tfidf_logreg(train_file):
    # Build the word-document count matrix plus its row/column labels, and the
    # per-word tf-idf values. tfidf() is defined elsewhere in this module; its
    # first element is indexed by word (same order as rownames) below.
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)

    # One feature row per document: the average tf-idf row of the words it contains.
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    matCol = 0  # next row of trainMat to fill
    with open(train_file) as f:
        for line in f:
            words = line.strip('\"').split(',')
            if words[1] in colnames:
                trainRow = np.zeros(wd[0].shape[1])
                numWords = 0
                tweet = sentiment_buildwd.buildTweet(words[5:])
                for word in tweetprocess.tokenize(tweet):
                    pword = sentiment_buildwd.processWord(word)
                    if pword in rownames:
                        numWords += 1
                        trainRow = trainRow + idf[0][rownames.index(pword)]
                if numWords > 0:  # guard against division by zero for empty tweets
                    trainRow = trainRow / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE: shuffle example order with a fixed seed so the split is reproducible.
    random.seed(17)
    shuffle = list(range(len(subjects)))  # list() so random.shuffle works on Python 3
    random.shuffle(shuffle)
    train = []
    labels = []
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
    cutoff = int(len(shuffle) * 0.7)  # 70/30 train/validation split

    # Fit on the first 70% of the shuffled examples and report accuracy on the
    # held-out 30%.
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg.score(train[cutoff:], labels[cutoff:])
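

# --- Usage sketch (not part of the original snippet) ---
# The tf-idf variant above returns held-out accuracy directly. The CSV path is
# hypothetical; the expected file format is whatever buildWD and the parsing
# loop above consume.
if __name__ == "__main__":
    accuracy = tfidf_logreg("tweets_train.csv")
    print("held-out accuracy: %.3f" % accuracy)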