def doc_vect(alldocs):
    print('Doc2Vec: each tag is the doc ID')
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, doc.tags)
        documents.append(sentence)
    print(len(documents))
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents, dm=1, dm_concat=1, size=400, window=5, negative=5,
                hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=400, window=5, negative=5,
                hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
        # PV-DM w/average
        Doc2Vec(documents, dm=1, dm_mean=1, size=400, window=5, negative=5,
                hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
    ]

    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

    for name, model in models_by_name.items():
        print(name)
        train_targets, train_regressors = zip(*[(doc.sentiment, model.docvecs[doc.tags[0]]) for doc in train_docs])
        test_targets, test_regressors = zip(*[(doc.sentiment, model.infer_vector(doc.words)) for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors, test_targets)
        util.svm(train_regressors, train_targets, test_regressors, test_targets)
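Here util.logit and util.svm are project-specific train-and-evaluate helpers rather than the mathematical logit. A minimal sketch of what such a logit helper might look like, assuming scikit-learn (the helper body and the accuracy-only reporting are assumptions, not the project's actual code):

# Hypothetical sketch of a util.logit-style helper, assuming scikit-learn.
from sklearn.linear_model import LogisticRegression

def logit(train_X, train_y, test_X, test_y):
    # fit a logistic-regression classifier on the document vectors
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_X, train_y)
    # report held-out accuracy
    print('logit accuracy: %.4f' % clf.score(test_X, test_y))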
Example #2
    def get_mu_sigma(self, X_noisy, t):
        """
        Generate mu and sigma for one step in the reverse trajectory,
        starting from a minibatch of images X_noisy, and at timestep t.
        """
        Z = self.mlp.apply(X_noisy)
        mu_coeff, beta_coeff = self.temporal_readout(Z, t)
        # reverse variance is perturbation around forward variance
        beta_forward = self.get_beta_forward(t)
        # make impact of beta_coeff scaled appropriately with mu_coeff
        beta_coeff_scaled = beta_coeff / np.sqrt(self.trajectory_length).astype(theano.config.floatX)
        beta_reverse = T.nnet.sigmoid(beta_coeff_scaled + util.logit(beta_forward))
        # # reverse mean is decay towards mu_coeff
        # mu = (X_noisy - mu_coeff)*T.sqrt(1. - beta_reverse) + mu_coeff
        # reverse mean is a perturbation around the mean under the forward process

        # # DEBUG -- use these lines to test objective is 0 for isotropic Gaussian model
        # beta_reverse = beta_forward
        # mu_coeff = mu_coeff*0

        mu = X_noisy*T.sqrt(1. - beta_forward) + mu_coeff*T.sqrt(beta_forward)
        sigma = T.sqrt(beta_reverse)
        mu.name = 'mu p'
        sigma.name = 'sigma p'
        return mu, sigma
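Here util.logit is the mathematical logit, the inverse of the sigmoid, so beta_reverse falls back to beta_forward whenever the network outputs beta_coeff_scaled = 0. A quick numeric check of that identity (a sketch in plain NumPy, outside Theano):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def logit(p):
    return np.log(p / (1.0 - p))

beta_forward = 0.1
# a zero perturbation leaves the forward variance unchanged
assert np.isclose(sigmoid(0.0 + logit(beta_forward)), beta_forward)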
Example #3
    def classify(self, doc):
        model_file = '%ssvm_model' % self.path
        if not self.feats:
            util.die('Incomplete model')
        if not os.path.isfile(model_file):
            util.die('no model [%s]' % model_file)

        ## testing data file
        sys.stderr.write('SVM classifying... ')
        lines = []
        frag = doc.frag
        while frag:
            if frag.label is None: svm_label = '0'
            elif frag.label: svm_label = '+1'
            else: svm_label = '-1'
            line = '%s ' % svm_label
            feats = [f + '_' + v for f, v in frag.features.items()]
            svm_feats = sorted(self.feats[f] for f in feats if f in self.feats)
            line += ' '.join(['%d:1' % x for x in svm_feats])
            lines.append(line)
            frag = frag.next

        fd, test_file = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as fh:
            fh.write('\n'.join(lines) + '\n')

        ## classify test data
        pred_fd, pred_file = tempfile.mkstemp()
        os.close(pred_fd)
        options = '-v 0'

        cmd = '"%s" %s "%s" "%s" "%s"' % (SVM_CLASSIFY, options, test_file, model_file, pred_file)
        print(cmd)
        os.system(cmd)
        ## get predictions
        total = 0
        preds = [float(x) for x in open(pred_file).read().splitlines()]
        frag = doc.frag
        while frag:
            frag.pred = util.logit(preds[total])
            frag = frag.next
            total += 1

        ## clean up
        #os.remove(test_file)
        #os.remove(pred_file)
        sys.stderr.write('done!\n')
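Each line written to the test file uses the SVM-light sparse format, label index:value with ascending indices; a positive fragment whose features map to indices 3 and 17 becomes "+1 3:1 17:1". The final util.logit(preds[total]) appears to squash the raw SVM margin into (0, 1); a plausible definition, assuming the name is this codebase's misnomer for the logistic function:

import math

# Assumed definition: despite the name, this is the logistic sigmoid,
# mapping a raw SVM margin to a pseudo-probability in (0, 1).
def logit(x):
    return 1.0 / (1.0 + math.exp(-x))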
Example #4
    def spec_loss(self, y_hat, y, mask, priority_bin=None, priority_w=0):
        masked_l1 = MaskedL1Loss()
        l1 = nn.L1Loss()

        w = self.hparams.masked_loss_weight

        # L1 loss
        if w > 0:
            assert mask is not None
            l1_loss = w * masked_l1(y_hat, y, mask=mask) + (1 - w) * l1(
                y_hat, y)
        else:
            assert mask is None
            l1_loss = l1(y_hat, y)

        # Priority L1 loss
        if priority_bin is not None and priority_w > 0:
            if w > 0:
                priority_loss = w * masked_l1(
                    y_hat[:, :, :priority_bin], y[:, :, :priority_bin], mask=mask) \
                                + (1 - w) * l1(y_hat[:, :, :priority_bin], y[:, :, :priority_bin])
            else:
                priority_loss = l1(y_hat[:, :, :priority_bin],
                                   y[:, :, :priority_bin])
            l1_loss = (1 - priority_w) * l1_loss + priority_w * priority_loss

        # Binary divergence loss (self.w gates the binary-divergence term and is
        # separate from the local masked-loss weight w above)
        if self.w <= 0:
            binary_div = y.data.new(1).zero_()
        else:
            y_hat_logits = logit(y_hat)
            z = -y * y_hat_logits + torch.log1p(torch.exp(y_hat_logits))
            if w > 0:
                binary_div = w * masked_mean(z, mask) + (1 - w) * z.mean()
            else:
                binary_div = z.mean()

        return l1_loss, binary_div
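The binary divergence term is binary cross-entropy written directly in logits: with l = logit(y_hat), -y*l + log(1 + exp(l)) equals -y*log(y_hat) - (1 - y)*log(1 - y_hat), and the log1p form computes the second term stably. A quick sanity check of that equivalence (a sketch in plain PyTorch):

import torch
import torch.nn.functional as F

def logit(p):
    return torch.log(p / (1 - p))

y_hat = torch.tensor([0.2, 0.7, 0.9])
y = torch.tensor([0.0, 1.0, 1.0])
l = logit(y_hat)
z = -y * l + torch.log1p(torch.exp(l))
# matches the standard binary cross-entropy on the same values
assert torch.allclose(z, F.binary_cross_entropy(y_hat, y, reduction='none'))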
Example #5
def cross_validation(X_train, y_train, params, X_test=None, verbose_eval=False):
    NUM_BOOST_ROUND = 1000
    best_iterations = []
    train_scores = []
    valid_scores = []
    y_preds = []

    kf = KFold(n_splits=5, shuffle=True, random_state=12345)

    for train_index, valid_index in kf.split(X_train):
        _X_train, _X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        _y_train, _y_valid = y_train[train_index], y_train[valid_index]

        dtrain = xgb.DMatrix(_X_train, _y_train)
        dvalid = xgb.DMatrix(_X_valid, _y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(params, dtrain, NUM_BOOST_ROUND, evals=watchlist,
                        early_stopping_rounds=200, verbose_eval=verbose_eval)

        # best iterations and valid score
        best_iterations.append(bst.best_iteration + 1)
        valid_scores.append(bst.best_score)

        if X_test is not None:
            dtest = xgb.DMatrix(X_test)
            y_pred = bst.predict(dtest, ntree_limit=bst.best_iteration + 1)
            y_preds.append(y_pred)

    y_pred = None
    if y_preds:
        # average the fold predictions in logit space, then map back through the sigmoid
        y_pred = util.sigmoid(np.mean(util.logit(np.array(y_preds)), axis=0))

    result = {"best-iterations": best_iterations,
              "best-iteration": np.mean(best_iterations),
              "valid-score": np.mean(valid_scores),
              "valid-scores": valid_scores,
              "y_pred": y_pred,
              "y_preds": y_preds}
    return result
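Averaging fold predictions in logit space and mapping back through the sigmoid is a geometric mean of odds, which is less dominated by a single overconfident fold than a plain arithmetic mean. A self-contained sketch of the two util helpers this assumes:

import numpy as np

# Assumed util helpers: elementwise logit and its inverse.
def logit(p):
    return np.log(p / (1.0 - p))

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

fold_preds = np.array([[0.6, 0.9], [0.7, 0.99]])
print(sigmoid(np.mean(logit(fold_preds), axis=0)))  # logit-space average
print(fold_preds.mean(axis=0))                      # plain average, for comparison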
Example #6
def label_vect(alldocs):
    print('Label2Vec with pre-classification')
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    non_docs = [doc for doc in alldocs if doc.split == 'extra']
    print('%d docs: %d train-sentiment, %d test-sentiment' %
          (len(alldocs), len(train_docs), len(test_docs)))
    ylin = pre_class(train_docs, test_docs, non_docs)
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, [str(doc.sentiment)])
        documents.append(sentence)
    for i, doc in enumerate(test_docs + non_docs):
        documents.append(TaggedDocument(doc.words, [str(ylin[i])]))
    print(len(documents))
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=10 (both sides)
        Doc2Vec(documents, dm=1, dm_concat=1, size=100, window=10, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=100, window=10, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
        # PV-DM w/average
        Doc2Vec(documents, dm=1, dm_mean=1, size=100, window=10, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
    ]

    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])

    for name, model in models_by_name.items():
        print(name)
        train_targets, train_regressors = zip(*[(doc.sentiment,
                                                 model.infer_vector(doc.words))
                                                for doc in train_docs])
        test_targets, test_regressors = zip(*[(doc.sentiment,
                                               model.infer_vector(doc.words))
                                              for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors,
                   test_targets)
Example #7
    def _logit_transform(x):
        """
        Transforms pixel values with logit to be unconstrained.
        """
        return util.logit(CIFAR10.alpha + (1 - 2 * CIFAR10.alpha) * x)
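This is the standard dequantization trick from flow-style density models: pixels in [0, 1] are first squeezed into [alpha, 1 - alpha] so the logit never sees exactly 0 or 1, then mapped to the whole real line. A round-trip sketch, assuming CIFAR10.alpha is a small constant such as 0.05:

import numpy as np

alpha = 0.05  # assumed value of CIFAR10.alpha

def logit_transform(x):
    s = alpha + (1 - 2 * alpha) * x       # squeeze [0, 1] into [alpha, 1 - alpha]
    return np.log(s / (1 - s))            # logit: map to the real line

def inverse_logit_transform(z):
    s = 1.0 / (1.0 + np.exp(-z))          # sigmoid undoes the logit
    return (s - alpha) / (1 - 2 * alpha)  # undo the squeeze

x = np.array([0.0, 0.5, 1.0])
assert np.allclose(inverse_logit_transform(logit_transform(x)), x)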
Example #8
def inv(i, max_val):
    # squash i/max_val into [0.0005, 0.9995] so the logit stays finite at the endpoints
    sc = (i / max_val) * 0.999 + 0.0005
    return logit(sc)
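For example, inv(0, 10) gives logit(0.0005) ≈ -7.6 and inv(10, 10) gives logit(0.9995) ≈ +7.6, so indices at the ends of the range map to finite values instead of diverging to ±∞.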