Ejemplo n.º 1
0
def main(transductive: bool = False):
    """Train a classifier on FILIMDB, classify every scored part, save predictions.

    If ``classifier.pretrain`` can be imported, the classifier is first
    pretrained on raw texts and then trained on the labelled train set.
    Otherwise only the supervised training step is run.

    Args:
        transductive: When true, pretraining receives the texts of every
            loaded part; when false, only the texts of ``train`` and
            ``train_unlabeled``.
    """
    # Probe for the optional pretraining hook; only the import itself is
    # guarded, so an ImportError raised later still propagates.
    try:
        from classifier import pretrain
        has_pretrain = True
    except ImportError:
        has_pretrain = False

    if has_pretrain:
        part2xy = load_dataset_fast(
            'FILIMDB', parts=SCORED_PARTS + ('train_unlabeled', ))
        _, train_texts, train_labels = part2xy['train']
        _, train_unlabeled_texts, _ = part2xy['train_unlabeled']

        started = time()
        # pretrain() gets one text collection per part (a list of collections).
        all_texts = ([texts for _, texts, _ in part2xy.values()]
                     if transductive
                     else [train_texts, train_unlabeled_texts])
        print('\nPretraining classifier on %d examples'
              % sum(map(len, all_texts)))
        params = pretrain(all_texts)
        print('Classifier pretrained in %.2fs' % (time() - started))

        print('\nTraining classifier on %d examples from train set ...'
              % len(train_texts))
        started = time()
        params = train(train_texts, train_labels, params)
        print('Classifier trained in %.2fs' % (time() - started))

        # The unlabeled part is only pretraining material; remove it so the
        # classification loop below does not visit it.
        del part2xy["train_unlabeled"]
    else:
        part2xy = load_dataset_fast('FILIMDB', parts=SCORED_PARTS)
        _, train_texts, train_labels = part2xy['train']
        print('\nTraining classifier on %d examples from train set ...'
              % len(train_texts))
        started = time()
        params = train(train_texts, train_labels)
        print('Classifier trained in %.2fs' % (time() - started))

    allpreds = []
    for part, (ids, texts, labels) in part2xy.items():
        print('\nClassifying %s set with %d examples ...' % (part, len(texts)))
        started = time()
        preds = classify(texts, params)
        print('%s set classified in %.2fs' % (part, time() - started))
        allpreds += zip(ids, preds)

        if labels is not None:
            score(preds, labels)
        else:
            print('no labels for %s set' % part)

    save_preds(allpreds, preds_fname=PREDS_FNAME)
    print('\nChecking saved predictions ...')
    score_preds(preds_fname=PREDS_FNAME, data_dir='FILIMDB')
Ejemplo n.º 2
0
def main():
    """Train on the FILIMDB train set, classify every loaded part, save predictions.

    Each part that carries labels is scored right after it is classified;
    the saved prediction file is then re-checked with ``score_preds``.
    """
    part2xy = load_dataset_fast('FILIMDB')
    _, train_texts, train_labels = part2xy['train']

    print('\nTraining classifier on %d examples from train set ...'
          % len(train_texts))
    started = time()
    params = train(train_texts, train_labels)
    print('Classifier trained in %.2fs' % (time() - started))

    allpreds = []
    for part, (ids, texts, labels) in part2xy.items():
        print('\nClassifying %s set with %d examples ...' % (part, len(texts)))
        started = time()
        preds = classify(texts, params)
        print('%s set classified in %.2fs' % (part, time() - started))
        allpreds += zip(ids, preds)

        if labels is not None:
            score(preds, labels)
        else:
            print('no labels for %s set' % part)

    save_preds(allpreds, preds_fname=PREDS_FNAME)
    print('\nChecking saved predictions ...')
    score_preds(preds_fname=PREDS_FNAME)
Ejemplo n.º 3
0
def main():
    """Train a classifier on FILIMDB, classify every scored part, save predictions.

    If ``classifier.pretrain`` can be imported, the classifier is pretrained
    on the concatenation of the train and train_unlabeled texts before the
    supervised training step. Otherwise only supervised training is run.
    """
    # Probe for the optional pretraining hook; only the import itself is
    # guarded, so an ImportError raised later still propagates.
    try:
        from classifier import pretrain
        has_pretrain = True
    except ImportError:
        has_pretrain = False

    if has_pretrain:
        part2xy = load_dataset_fast(
            'FILIMDB', parts=('train', 'dev', 'test', 'train_unlabeled'))
        _, train_texts, train_labels = part2xy['train']
        _, train_unlabeled_texts, _ = part2xy['train_unlabeled']
        all_texts = train_texts + train_unlabeled_texts

        print('\nPretraining classifier on %d examples' % len(all_texts))
        started = time()
        params = pretrain(all_texts)
        print('Classifier pretrained in %.2fs' % (time() - started))

        print('\nTraining classifier on %d examples from train set ...'
              % len(train_texts))
        started = time()
        params = train(train_texts, train_labels, params)
        print('Classifier trained in %.2fs' % (time() - started))

        # The unlabeled part is only pretraining material; remove it so the
        # classification loop below does not visit it.
        del part2xy["train_unlabeled"]
    else:
        part2xy = load_dataset_fast('FILIMDB')
        _, train_texts, train_labels = part2xy['train']
        print('\nTraining classifier on %d examples from train set ...'
              % len(train_texts))
        started = time()
        params = train(train_texts, train_labels)
        print('Classifier trained in %.2fs' % (time() - started))

    allpreds = []
    for part, (ids, texts, labels) in part2xy.items():
        print('\nClassifying %s set with %d examples ...' % (part, len(texts)))
        started = time()
        preds = classify(texts, params)
        print('%s set classified in %.2fs' % (part, time() - started))
        allpreds += zip(ids, preds)

        if labels is not None:
            score(preds, labels)
        else:
            print('no labels for %s set' % part)

    save_preds(allpreds, preds_fname=PREDS_FNAME)
    print('\nChecking saved predictions ...')
    score_preds(preds_fname=PREDS_FNAME, data_dir='FILIMDB')
Ejemplo n.º 4
0
def pretrain(ds_name, module, part2xy, transductive):
    """Pretrain ``module`` on raw texts and return the resulting parameters.

    Args:
        ds_name: Dataset name forwarded to ``score.load_dataset_fast`` to
            fetch the ``train_unlabeled`` part.
        module: Object exposing ``pretrain(all_texts)``.
        part2xy: Mapping from part name to an ``(ids, texts, labels)``
            triple; must contain ``'train'``.
        transductive: When true, the texts of every part in ``part2xy`` are
            used (plus the unlabeled texts); when false, only the train
            texts and the unlabeled texts.

    Returns:
        Whatever ``module.pretrain`` returns.
    """
    _, labelled_texts, _ = part2xy['train']

    unlabeled_parts = score.load_dataset_fast(ds_name, parts=('train_unlabeled',))
    _, unlabeled_texts, _ = unlabeled_parts['train_unlabeled']

    # module.pretrain() receives one text collection per part.
    if not transductive:
        corpora = [labelled_texts, unlabeled_texts]
    else:
        corpora = [texts for _, texts, _ in part2xy.values()]
        corpora.append(unlabeled_texts)

    print('\nPretraining classifier on %d examples' % sum(map(len, corpora)))
    started = time()
    params = module.pretrain(corpora)
    print('Classifier pretrained in %.2fs' % (time() - started))
    return params
Ejemplo n.º 5
0
        for word in doc_set:
            k = doc.count(word)
            if word not in vocab:
                p_pos = alpha / (alpha * (vocab_size + 1) + params.all_pos)
                p_neg = alpha / (alpha * (vocab_size + 1) + params.all_neg)
            else:
                p_pos = params.pos_prob[word]
                p_neg = params.neg_prob[word]

            k_log_fact = math.log(math.factorial(k))
            pos_bayes += k * math.log(p_pos) - k_log_fact
            neg_bayes += k * math.log(p_neg) - k_log_fact
        pos_bayes += math.log(params.prob_of_pos_class)
        neg_bayes += math.log(params.prob_of_neg_class)
        if pos_bayes > neg_bayes:
            labels.append('pos')
        else:
            labels.append('neg')

    return labels


# Load the dataset; only the train split is used for fitting below.
text = score.load_dataset_fast()
train_ids, train_texts, train_labels = text['train']
test_ids, test_texts, test_labels = text['dev']

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed intervals. The clock is started right
# before training so the reported figure is a duration covering train() only,
# not an absolute clock reading.
t0 = time.perf_counter()
w = train(train_texts, train_labels)
training_seconds = time.perf_counter() - t0
print(w)
print("Training time: %f" % training_seconds)
Ejemplo n.º 6
0
def main(train_timeout=5 * 60, eval_timeout=5 * 60):
    """Train and evaluate the ``classifier`` module on FILIMDB_hidden under time limits.

    The ``classifier`` module is (re)imported on every call and is always
    removed from ``sys.modules`` before this function exits, whether it
    returns normally, returns early with an error, or lets an exception
    propagate.

    Args:
        train_timeout: Limit handed to ``time_limit`` around
            ``classifier.train`` (presumably seconds -- confirm against
            ``time_limit``).
        eval_timeout: Limit handed to ``time_limit`` around each
            ``classifier.classify`` call.

    Returns:
        dict: Contains whichever of these keys apply:
        ``"train_time"``, ``"eval_on_<part>_set_time"``,
        ``"eval_on_<part>_set_acc"`` and, on failure, ``"exception"`` with
        the stringified error.
    """
    results = {}
    try:
        try:
            import classifier
            importlib.reload(classifier)
        except Exception as e:
            print(e)
            results["exception"] = str(e)
            return results

        part2xy = load_dataset_fast('FILIMDB_hidden', SCORED_PARTS)
        _, train_texts, train_labels = part2xy['train']

        print('\nTraining classifier on %d examples from train set ...' % len(train_texts))
        st = time()

        try:
            with time_limit(train_timeout):
                params = classifier.train(train_texts, train_labels)
        # TimeoutException stays listed explicitly because its base class is
        # not visible here; ValueError was redundant next to Exception.
        except (TimeoutException, Exception) as e:
            print(e)
            if isinstance(e, TimeoutException):
                results["train_time"] = train_timeout
            results["exception"] = str(e)
            return results

        train_time = time() - st
        results["train_time"] = train_time

        print('Classifier trained in %.2fs' % train_time)

        for part, (ids, x, y) in part2xy.items():
            print('\nClassifying %s set with %d examples ...' % (part, len(x)))
            st = time()
            try:
                with time_limit(eval_timeout):
                    preds = classifier.classify(x, params)
            # Only timeouts and ValueErrors are turned into a result entry;
            # any other error still propagates to the caller as before.
            except (TimeoutException, ValueError) as e:
                if isinstance(e, TimeoutException):
                    print("Timeout on evaluating %s set!" % part)
                    results["eval_on_%s_set_time" % part] = eval_timeout
                else:
                    print(e)
                results["exception"] = str(e)
                return results

            eval_time = time() - st
            results["eval_on_%s_set_time" % part] = eval_time
            print('%s set classified in %.2fs' % (part, eval_time))

            if y is None:
                print('no labels for %s set' % part)
            else:
                acc = score(preds, y)
                results["eval_on_%s_set_acc" % part] = acc
        return results
    finally:
        # Single cleanup point: guarantees the next call re-imports a fresh
        # classifier even when an unexpected exception escapes, and never
        # raises KeyError if the module is already gone.
        sys.modules.pop('classifier', None)
Ejemplo n.º 7
0
def load_ds(ds_name: str):
    """Load the parts listed in ``score.SCORED_PARTS`` for dataset *ds_name*.

    Thin wrapper around ``score.load_dataset_fast``; returns its result
    unchanged.
    """
    scored_parts = score.SCORED_PARTS
    return score.load_dataset_fast(ds_name, parts=scored_parts)