Example #1
from collections import defaultdict
# project-local helpers assumed in scope: Indexer, average_precision,
# is_error, one_error, margin

def run(training, validation, k, config):
    texts_by_motifs = defaultdict(list)
    motifs_in_docs = defaultdict(list)

    # construct the bigdocuments
    for i, (source, motifs, text) in enumerate(training):
        for motif in motifs:
            if motif != 'DUMMY':
                motifs_in_docs[motif].append(i)
                texts_by_motifs[motif].extend(text)

    labels, texts = zip(*texts_by_motifs.items())
    indexer = Indexer()
    for label, text in zip(labels, texts):
        indexer.add(text, label)

    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    for j, (source, motifs, text) in enumerate(validation):
        nDocs += 1
        scores = list(indexer.predict_proba(
            text, config.getfloat('bm25', 'k1'), config.getfloat('bm25', 'b')))
        preds = sorted(scores, key=lambda i: i[1], reverse=True)
        preds = [label for label, score in preds]
        refs = set(motifs)
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
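
The `Indexer` above is a project-local helper: judging from the calls, `add` registers one "big document" per motif and `predict_proba(text, k1, b)` yields (label, score) pairs under a BM25 weighting with parameters k1 and b. Below is a minimal sketch of such an indexer, assuming standard Okapi BM25; the class body is a hypothetical reconstruction from the call sites, not the project's actual implementation.

import math
from collections import Counter, defaultdict

class Indexer:
    # hypothetical stand-in: one bag-of-words 'big document' per label
    def __init__(self):
        self.docs, self.labels, self.df = [], [], defaultdict(int)

    def add(self, tokens, label):
        tf = Counter(tokens)
        self.docs.append(tf)
        self.labels.append(label)
        for term in tf:
            self.df[term] += 1

    def predict_proba(self, query, k1, b):
        # yield (label, BM25 score) for every indexed big document
        N = len(self.docs)
        avgdl = sum(sum(d.values()) for d in self.docs) / float(N)
        for label, tf in zip(self.labels, self.docs):
            dl = sum(tf.values())
            score = 0.0
            for term in query:
                if term not in tf:
                    continue
                idf = math.log(1 + (N - self.df[term] + 0.5) / (self.df[term] + 0.5))
                score += idf * tf[term] * (k1 + 1) / (tf[term] + k1 * (1 - b + b * dl / avgdl))
            yield label, score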
Example #2
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
# project-local helpers assumed in scope: Index, load_data,
# average_precision, is_error, one_error, margin

def run(training, validation, k, config=None):
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []

    class_index = Index()
    traindocs, train_X, train_y = zip(*load_data(training, class_index))
    testdocs, test_X, test_y = zip(*load_data(validation, class_index))

    # roughly 10^6 / corpus-size epochs; note this value is currently
    # unused, since the classifier below hard-codes n_iter=50
    n_iter = int(np.ceil(10**6 / len(traindocs)))

    # 'log' loss gives a logistic-regression model, so predict_proba is
    # available below (n_iter is the pre-0.19 scikit-learn name for max_iter)
    clf = SGDClassifier(alpha=.000001, loss='log', n_iter=50, penalty='elasticnet')
    #clf = MultinomialNB(alpha=0.000001)

    classifier = Pipeline([
                ('vectorizer', CountVectorizer(min_df=1, max_df=1.0, analyzer=lambda t: t)),
                ('tfidf', TfidfTransformer(norm='l2')),
                ('clf', OneVsRestClassifier(clf, n_jobs=-1))])

    classifier.fit(train_X, train_y)
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # rank class indices by predicted probability, highest first
        preds = sorted(range(len(prediction)), key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
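
All seven examples score rankings with the same four project-local metrics, which are not shown on this page. Their names match the standard multi-label ranking measures (average precision, is-error, one-error, margin), so plausible implementations look roughly as follows; this is a sketch under that assumption, not the project's code.

def average_precision(preds, refs):
    # mean, over the gold labels, of precision at each gold label's rank
    hits, total = 0, 0.0
    for rank, label in enumerate(preds, 1):
        if label in refs:
            hits += 1
            total += hits / float(rank)
    return total / len(refs) if refs else 0.0

def is_error(ap):
    # 1 if the ranking is not perfect, 0 otherwise
    return 1 if ap < 1.0 else 0

def one_error(preds, refs):
    # 1 if the top-ranked prediction is not a gold label
    return 0 if preds and preds[0] in refs else 1

def margin(preds, refs):
    # rank distance between the worst-ranked gold label and the
    # best-ranked non-gold label; assumes the ranking contains both
    # gold and non-gold labels
    ranks = dict((label, r) for r, label in enumerate(preds, 1))
    worst_ref = max(ranks[label] for label in refs if label in ranks)
    best_nonref = min(r for label, r in ranks.items() if label not in refs)
    return worst_ref - best_nonref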
Example #3
from collections import defaultdict
# project-local helpers assumed in scope: Indexer, average_precision,
# is_error, one_error, margin

def run(training, validation, k, config):
    texts_by_motifs = defaultdict(list)
    motifs_in_docs = defaultdict(list)

    # construct the bigdocuments
    for i, (source, motifs, text) in enumerate(training):
        for motif in motifs:
            if motif != 'DUMMY':
                motifs_in_docs[motif].append(i)
                texts_by_motifs[motif].extend(text)

    labels, texts = zip(*texts_by_motifs.items())
    indexer = Indexer()
    for label, text in zip(labels, texts):
        indexer.add(text, label)

    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    for j, (source, motifs, text) in enumerate(validation):
        nDocs += 1
        scores = list(
            indexer.predict_proba(text, config.getfloat('bm25', 'k1'),
                                  config.getfloat('bm25', 'b')))
        preds = sorted(scores, key=lambda i: i[1], reverse=True)
        preds = [label for label, score in preds]
        refs = set(motifs)
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
Example #4
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
# project-local helpers assumed in scope: Index, load_data,
# construct_bigdocuments, average_precision, is_error, one_error, margin

def run(training, validation, k, config):

    norm = config.get('tfidf', 'norm')
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')

    bigdoc = config.getboolean('NB', 'bigdoc')
    clf = config.get('system', 'system')
    if clf == 'NB':
        clf = MultinomialNB(alpha=config.getfloat('NB', 'alpha'))
        if not bigdoc:
            clf = OneVsRestClassifier(clf, n_jobs=-1)
    elif clf == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=10, weights='distance')
        if not bigdoc:
            clf = OneVsRestClassifier(clf)
    elif clf == 'SVC':
        # 'l2' is the old scikit-learn alias for loss='squared_hinge'; note
        # that LinearSVC has no predict_proba, so the scoring loop below
        # would need decision_function for this setting
        clf = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
        if not bigdoc:
            clf = OneVsRestClassifier(clf)
    elif clf == 'dtree':
        clf = DecisionTreeClassifier()
    else:
        clf = OneVsRestClassifier(
            SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                          loss=config.get('sgd', 'loss'),
                          n_iter=config.getint('sgd', 'iterations'),
                          penalty=config.get('sgd', 'penalty')), n_jobs=-1)

    classifier = Pipeline([
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0, analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm=norm, smooth_idf=smooth_idf)),
        ('clf', clf)])

    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        test_y = [set(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))

    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        preds = sorted(range(len(prediction)), key=lambda i: prediction[i], reverse=True)
        refs = test_y[j]
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
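
Examples #4, #5, and #7 switch between two data layouts via `construct_bigdocuments` and `load_data`, both project-local. From the way its return value is unpacked, `construct_bigdocuments` must collapse the training texts into one concatenated document per motif (as Examples #1 and #3 do inline) and also return a label-to-integer index. A hypothetical reconstruction from that usage:

from collections import defaultdict

def construct_bigdocuments(data):
    # concatenate all texts sharing a motif into one 'big document' per label
    texts_by_motifs = defaultdict(list)
    for source, motifs, text in data:
        for motif in motifs:
            if motif != 'DUMMY':
                texts_by_motifs[motif].extend(text)
    class_index = dict((label, i) for i, label in enumerate(texts_by_motifs))
    labels = [class_index[label] for label in texts_by_motifs]
    texts = list(texts_by_motifs.values())
    return (labels, texts), class_index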
Example #5
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
# project-local helpers assumed in scope: Index, load_data,
# construct_bigdocuments, average_precision, is_error, one_error, margin

def run(training, validation, k, config):

    norm = config.get('tfidf', 'norm')
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')

    bigdoc = False
    clf = config.get('system', 'system')
    if clf == 'NB':
        alpha = config.getfloat('NB', 'alpha')
        if config.getboolean('NB', 'bigdoc'):
            bigdoc = True
            clf = MultinomialNB(alpha=alpha)
        else:
            clf = OneVsRestClassifier(BernoulliNB(alpha=alpha))
    else:
        clf = SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                            loss=config.get('sgd', 'loss'),
                            n_iter=config.getint('sgd', 'iterations'),
                            penalty=config.get('sgd', 'penalty'))

    classifier = Pipeline([('vectorizer',
                            CountVectorizer(min_df=1,
                                            max_df=1.0,
                                            analyzer=lambda t: t)),
                           ('tfidf',
                            TfidfTransformer(norm=norm,
                                             smooth_idf=smooth_idf)),
                           ('clf', clf)])

    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        test_y = [tuple(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))

    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i],
                       reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
Example #6
import csv
import os
import subprocess
# project-local helpers assumed in scope: average_precision, is_error,
# one_error, margin

def run(training, validation, k, config):
    ground_truth = {}
    ROOTDIR = config.get('filepaths', 'corpus')
    alpha, beta = config.get('llda', 'alpha'), config.get('llda', 'beta')
    iterations = config.get('llda', 'iterations')

    with open(ROOTDIR + 'training-%s.tmp' % k, 'w') as training_out:
        writer = csv.writer(training_out, quoting=csv.QUOTE_MINIMAL)
        for (source, motifs, text) in training:
            motifs = ' '.join(motifs) + ' DUMMY'
            writer.writerow([source, motifs, ' '.join(text)])

    with open(ROOTDIR + 'testing-%s.tmp' % k, 'w') as testing_out:
        writer = csv.writer(testing_out, quoting=csv.QUOTE_MINIMAL)
        for (source, motifs, text) in validation:
            ground_truth[source] = motifs
            writer.writerow([source, ' '.join(motifs), ' '.join(text)])
    
    # train LLDA (-Xmx2000m: the JVM rejects the 'mb' suffix)
    with open(os.devnull, 'w') as null:
        subprocess.call('java -Xmx2000m -jar tmt-0.4.0.jar llda-train.scala %s %s %s %s' %
            (ROOTDIR + 'training-%s.tmp' % k, alpha, beta, iterations),
            stdout=null, stderr=null, shell=True)
    # retrieve the model path
    modelpath = open(ROOTDIR + 'training-%s.tmp.config' % k).read().strip()
    # perform inference on the held-out dataset using the trained model
    with open(os.devnull, 'w') as null:
        subprocess.call('java -Xmx2000m -jar tmt-0.4.0.jar llda-test.scala %s %s' %
            (modelpath, ROOTDIR + 'testing-%s.tmp' % k),
            stdout=null, stderr=null, shell=True)

    # evaluation starts here!
    isError, oneError, nDocs = 0, 0, 0
    AP, margins = [], []
    label_file = '/%05d/label-index.txt' % config.getint('llda', 'iterations')
    topicIndex = [topic.strip() for topic in open(modelpath + label_file)]
    reader = csv.reader(open(modelpath + '/testing-%s.tmp-document-topic-distributions.csv' % k))
    for row in reader:
        nDocs += 1
        idnumber, topics = row[0], [float(score) for score in row[1:]]
        topics = sorted([(topicIndex[i], score) for i, score in enumerate(topics)],
                        key=lambda i: i[1], reverse=True)
        preds = [topic for topic, _ in topics if topic != 'DUMMY']
        refs = ground_truth[idnumber]
        ap = average_precision(preds, refs)
        isError += is_error(ap)
        oneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
        AP.append(ap)
    return isError, oneError, nDocs, margins, AP
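
Taken together, the examples read their settings from one ConfigParser-style file with sections [system], [tfidf], [NB], [sgd], [bm25], [llda], and [filepaths]. The sketch below covers every key the examples access; all values are illustrative placeholders, not the project's actual settings.

import configparser

config = configparser.ConfigParser()
config.read_string("""
[system]
system = NB

[tfidf]
norm = l2
smooth_idf = true

[NB]
alpha = 0.01
bigdoc = true

[sgd]
alpha = 0.000001
loss = log
iterations = 50
penalty = elasticnet

[bm25]
k1 = 1.2
b = 0.75

[llda]
alpha = 0.01
beta = 0.01
iterations = 1000

[filepaths]
corpus = /path/to/corpus/
""")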
Example #7
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
# project-local helpers assumed in scope: Index, load_data,
# construct_bigdocuments, average_precision, is_error, one_error, margin

def run(training, validation, k, config):

    norm = config.get('tfidf', 'norm')
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')

    bigdoc = False
    clf = config.get('system', 'system')
    if clf == 'NB':
        alpha = config.getfloat('NB', 'alpha')
        if config.getboolean('NB', 'bigdoc'):
            bigdoc = True
            clf = MultinomialNB(alpha=alpha)
        else:
            clf = OneVsRestClassifier(BernoulliNB(alpha=alpha))
    else:
        clf = SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                            loss=config.get('sgd', 'loss'),
                            n_iter=config.getint('sgd', 'iterations'),
                            penalty=config.get('sgd', 'penalty'))

    classifier = Pipeline([
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0, analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm=norm, smooth_idf=smooth_idf)),
        ('clf', clf)])

    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        test_y = [tuple(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))

    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        preds = sorted(range(len(prediction)), key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
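
Every variant shares the same signature and return value, so a cross-validation driver only has to sum the error counts and pool the per-document scores. A minimal sketch of such a driver (the fold-generation step is assumed, not shown on this page):

def evaluate(folds, config):
    # folds: iterable of (training, validation) pairs
    isError, oneError, nDocs = 0, 0, 0
    margins, AP = [], []
    for k, (training, validation) in enumerate(folds):
        e, o, n, m, ap = run(training, validation, k, config)
        isError += e
        oneError += o
        nDocs += n
        margins.extend(m)
        AP.extend(ap)
    print('is-error:  %.3f' % (isError / float(nDocs)))
    print('one-error: %.3f' % (oneError / float(nDocs)))
    print('margin:    %.3f' % (sum(margins) / float(len(margins))))
    print('MAP:       %.3f' % (sum(AP) / float(len(AP))))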