Code Example #1
def sanity_check():
    """
    Run python softmaxreg.py.
    """
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    _, wordVectors0, _ = load_saved_params()
    # Combine the input and output word vectors by summing them
    wordVectors = wordVectors0[:nWords, :] + wordVectors0[nWords:, :]
    dimVectors = wordVectors.shape[1]

    # Build a small random classifier and featurize 10 random training sentences
    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    for i in range(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
    print("==== Gradient check for softmax regression ====")
    gradcheck_naive(lambda weights: softmaxRegression(dummy_features,
        dummy_labels, weights, 1.0, nopredictions = True), dummy_weights)

    print("\n===Results ===")
    print(softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0))
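
The example above calls a getSentenceFeature helper that is not shown here (Examples #2 and #4 rely on it as well). A minimal sketch, assuming the feature is simply the average of the word vectors of the sentence's in-vocabulary tokens; the actual implementations in these projects may differ.

import numpy as np

def getSentenceFeature(tokens, wordVectors, sentence):
    # tokens: dict mapping word -> row index in wordVectors (assumption)
    sentVector = np.zeros((wordVectors.shape[1],))
    count = 0
    for word in sentence:
        if word in tokens:
            sentVector += wordVectors[tokens[word], :]
            count += 1
    if count > 0:
        sentVector /= count
    return sentVector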
Code Example #2
File: sentiment.py  Project: yucdong/nlp_toolkit
def __test_getSentenceFeatures():
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    numWords = len(tokens)

    _, wordVectors, _ = load_saved_params("../saved_params")

    # Concatenate the input/output word embedding vectors to form the final word representation
    wordVectors = np.concatenate(
        (wordVectors[:numWords, :], wordVectors[numWords:, :]), axis=1)
    sentence = "make a splash".split()
    sent_feature = getSentenceFeatures(tokens, wordVectors, sentence)
    print(sent_feature)
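
Every example in this section restores previously trained vectors through load_saved_params. A minimal sketch of a common three-value variant is given below; the file-name pattern, the directory argument, and the return signature are assumptions (note that some of the projects above unpack four values instead of three).

import glob
import os
import pickle

def load_saved_params(params_dir="."):
    # Find the checkpoint with the highest iteration number
    # (assumed file-name pattern "saved_params_<iter>.npy").
    latest = 0
    for f in glob.glob(os.path.join(params_dir, "saved_params_*.npy")):
        it = int(os.path.splitext(os.path.basename(f))[0].split("_")[-1])
        latest = max(latest, it)
    if latest == 0:
        return 0, None, None
    with open(os.path.join(params_dir, "saved_params_%d.npy" % latest), "rb") as f:
        params = pickle.load(f)
        state = pickle.load(f)
    return latest, params, state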
Code Example #3
File: query_demo.py  Project: yysherlock/embeddings
causeprior = loadObj(config.get(section, "cause_prior"))
N = len(causeprior)
cause_offset, effect_offset = 0, N

while True:
    dim = input(
        'Please enter the dimension of word vectors you want to try (e.g. 200),\n'
        'or enter `q` to quit: ')
    if dim == 'q' or dim == 'quit':
        sys.exit(0)
    params_path = datasets_dir + '/dim=' + dim
    if not os.path.exists(params_path):
        print('Dimension does not exist!')
    else:
        # load params
        _, wordVectors, _, _ = load_saved_params(params_path)
        causeVectors, effectVectors = wordVectors[:N, :], wordVectors[N:, :]

        break

while True:
    word = input(
        'Please enter a word you want to query, end with "_c" or "_e" : ')
    if word == 'q' or word == 'quit': sys.exit(0)
    if word not in wordlist:
        print('NOT FOUND!')
        continue

    word_type = word.split('_')[1]

    if word_type == 'c':
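
Example #3 is cut off before the actual lookup. A hedged sketch of one way the query could proceed once the word's role is known: rank all vectors of the opposite role by cosine similarity against the query vector. The helper below, its name, and the top-k ranking are illustrative assumptions, not the project's code.

import numpy as np

def top_matches(queryVec, candidateVecs, candidateWords, k=10):
    # Cosine similarity between the query and every candidate row
    sims = candidateVecs @ queryVec
    sims = sims / (np.linalg.norm(candidateVecs, axis=1) *
                   np.linalg.norm(queryVec) + 1e-12)
    order = np.argsort(-sims)[:k]
    return [(candidateWords[i], float(sims[i])) for i in order]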
Code Example #4
from softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
REGULARIZATION = None   # Assign a list of floats in the block below
### YOUR CODE HERE
REGULARIZATION = [0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
wordVectors = wordVectors0[:nWords, :] + wordVectors0[nWords:, :]
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
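
Example #4 is truncated here, but it imports an accuracy helper that Example #6 calls as well. A plausible sketch, assuming accuracy returns the percentage of correctly predicted labels:

import numpy as np

def accuracy(y, yhat):
    # Percentage of positions where the prediction equals the label
    y, yhat = np.asarray(y), np.asarray(yhat)
    return 100.0 * np.sum(y == yhat) / len(y)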
Code Example #5
File: Clustering.py  Project: yysherlock/embeddings
    return obj


config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
configPath = 'bi-config.ini'
config.read(configPath)
section = "COPA"
datasets_dir = config.get(section, "datasets_dir")
wordlist = loadObj(config.get(section, "id2word_list"))
dim = '300'
params_path = datasets_dir + '/GloveInit/dim=' + dim
#params_path = '../oldRndWrongCausalSkip/GloveInit/dim=300'
#params_path = '../oldRndWrongCausalSkip/dim=300'

_, X, _, _ = load_saved_params(params_path)  # X: wordVectors
k_means = sklearn.cluster.KMeans(n_clusters=500, max_iter=100000)
k_means.fit(X)

print(len(k_means.labels_))

d = {}
for i, label in enumerate(k_means.labels_):
    d.setdefault(label, []).append(wordlist[i])

with open('kmeans_gloveinit_cluster500.txt', 'w') as outf:
    for k, v in d.items():
        outf.write(str(k) + ': ' + str(v) + '\n')
        outf.flush()
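
A small usage sketch for reading back the cluster file written above, assuming each line has the form <label>: ['word1', 'word2', ...] exactly as produced by str() in the loop.

import ast

clusters = {}
with open('kmeans_gloveinit_cluster500.txt') as f:
    for line in f:
        label, words = line.split(': ', 1)
        clusters[int(label)] = ast.literal_eval(words)
print(len(clusters), 'clusters read back')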
Code Example #6
File: sentiment.py  Project: yucdong/nlp_toolkit
def __main__():
    dataset = StanfordSentiment(root_dir="../datasets")
    tokens = dataset.tokens()
    numWords = len(tokens)

    _, wordVectors, _ = load_saved_params("../saved_params")

    # Concatenate the input/output word embedding vectors to form the final word representation
    inputVectors = wordVectors[:numWords, :]
    outputVectors = wordVectors[numWords:, :]
    print(inputVectors.shape)
    print(outputVectors.shape)
    wordVectors = np.concatenate((inputVectors, outputVectors), axis=1)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest, ), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    # and see if the regularization value performs well
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to the regularization so C stays finite when reg == 0
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" %
              (result["reg"], result["train"], result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])
Code Example #7
config.read(configPath)

for section in config.sections():
    if section != "COPA": continue
    datasets_dir = config.get(section, "datasets_dir")
    tokens = loadObj(config.get(section, "tokens"))
    wordlist = loadObj(config.get(section, "id2word_list"))
    causeprior = loadObj(config.get(section, "cause_prior"))

    N = len(causeprior)

    for f in glob.glob(datasets_dir+"/GloveInit/dim=*"):
        # Load the causal vectors we trained earlier
        if not os.listdir(f): continue
        print(f)
        _, wordVectors, _, _ = load_saved_params(f)
        causeVectors, effectVectors = wordVectors[:N,:], wordVectors[N:,:]

        visualizeWords = [
            'kill_c', 'guilty_e', 'happy_e', 'gift_c', 'fire_c',
            'property_e', 'flood_c', 'damage_e', 'war_c', 'hide_c', 'surprise_e',
            'coupon_c', 'discount_e', 'click_e', 'death_e', 'child_e', 'add_e',
            'information_e', 'email_e', 'contact_e', 'information_c', 'like_c', 'find_c',
            'like_c', 'look_c',
            'intention_c', 'interrupt_c', 'intersection_c']

        #visualizeWords = wordlist
        #visualizeWords = wordlist[0:N]
        """
        indices = list(np.random.choice(np.arange(0,len(wordlist)),size=20,replace=False))
        visualizeWords = [ wordlist[idx] for idx in indices ]
        """