def get_divergence(sampleA, sampleB):
    '''
    Apply each sample's model to the other's output, and vice versa,
    and return a couple of measures of divergence: notably lost
    accuracy and z-transformed Spearman correlation.
    '''

    # Paths to each sample's pickled model criteria (.pkl) and its
    # model output (.csv) on the examples originally used to train it.
    # We cross-apply each model's criteria to the other's output.
    folder = '../models/'
    pkl_a = folder + sampleA + '.pkl'
    csv_a = folder + sampleA + '.csv'
    pkl_b = folder + sampleB + '.pkl'
    csv_b = folder + sampleB + '.csv'

    # A's criteria applied to B's output, and the reverse.
    a_on_b = versatiletrainer2.apply_pickled_model(pkl_a, '../data/', '.tsv', csv_b)
    b_on_a = versatiletrainer2.apply_pickled_model(pkl_b, '../data/', '.tsv', csv_a)

    # Fisher z-transform (arctanh) each Spearman rho before averaging.
    spearman1on2 = np.arctanh(stats.spearmanr(a_on_b.probability, a_on_b.alien_model)[0])
    spearman2on1 = np.arctanh(stats.spearmanr(b_on_a.probability, b_on_a.alien_model)[0])
    spearman = (spearman1on2 + spearman2on1) / 2

    # Accuracy lost by cross-application, averaged across both directions.
    loss1on2 = accuracy_loss(a_on_b)
    loss2on1 = accuracy_loss(b_on_a)
    loss = (loss1on2 + loss2on1) / 2

    # Accuracy of the "alien" (cross-applied) model and of the native one.
    alienacc2 = accuracy(a_on_b, 'alien_model')
    alienacc1 = accuracy(b_on_a, 'alien_model')

    acc2 = accuracy(a_on_b, 'probability')
    acc1 = accuracy(b_on_a, 'probability')

    # Mean standardized dates of the two cross-applied outputs.
    meandate2 = np.mean(a_on_b.std_date)
    meandate1 = np.mean(b_on_a.std_date)

    return spearman, loss, spearman1on2, spearman2on1, loss1on2, loss2on1, acc1, acc2, alienacc1, alienacc2, meandate1, meandate2
def get_divergence(sampleA, sampleB):
    '''
    Apply each sample's model to the other's output, and vice versa,
    and return a couple of measures of divergence: notably lost
    accuracy and z-transformed Spearman correlation.
    '''

    # Build the four paths: each sample has pickled model criteria
    # (.pkl) and model output (.csv) on its own training examples.
    model1, meta1 = ('../models/' + sampleA + suffix for suffix in ('.pkl', '.csv'))
    model2, meta2 = ('../models/' + sampleB + suffix for suffix in ('.pkl', '.csv'))

    # Cross-apply: sample A's criteria to sample B's output, and
    # sample B's criteria to sample A's output.
    model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2)
    model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1)

    def _z_spearman(frame):
        # Fisher z-transform (arctanh) of the Spearman rho between the
        # native probabilities and the cross-applied model's scores.
        rho = stats.spearmanr(frame.probability, frame.alien_model)[0]
        return np.arctanh(rho)

    spearman1on2 = _z_spearman(model1on2)
    spearman2on1 = _z_spearman(model2on1)
    spearman = (spearman1on2 + spearman2on1) / 2

    # Accuracy lost in each direction, and the two-way average.
    loss1on2 = accuracy_loss(model1on2)
    loss2on1 = accuracy_loss(model2on1)
    loss = (loss1on2 + loss2on1) / 2

    # Accuracies of the cross-applied ("alien") and native predictions.
    alienacc2 = accuracy(model1on2, 'alien_model')
    alienacc1 = accuracy(model2on1, 'alien_model')

    acc2 = accuracy(model1on2, 'probability')
    acc1 = accuracy(model2on1, 'probability')

    # Mean standardized date for each cross-applied output frame.
    meandate2 = np.mean(model1on2.std_date)
    meandate1 = np.mean(model2on1.std_date)

    return spearman, loss, spearman1on2, spearman2on1, loss1on2, loss2on1, acc1, acc2, alienacc1, alienacc2, meandate1, meandate2
def get_divergences(gold, testname, itera, size, pct):
    '''
    Get several possible measures of divergence between two models:
    averaged Pearson and Spearman correlations, lost accuracy, and
    KL divergence.  (itera, size, and pct are accepted for interface
    compatibility; this function does not read them.)
    '''

    # Paths to the gold-standard model criteria (.pkl) and its output
    # (.csv) on the examples originally used to train it, plus the
    # same pair for the test model.  We cross-apply each model's
    # criteria to the other's output.
    root = '../measuredivergence/modeloutput/'
    model1 = root + gold + '.pkl'
    meta1 = root + gold + '.csv'
    model2 = root + testname + '.pkl'
    meta2 = root + testname + '.csv'

    model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2)
    model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1)

    # Pearson correlation in each direction, then averaged.
    pearson1on2 = stats.pearsonr(model1on2.probability, model1on2.alien_model)[0]
    pearson2on1 = stats.pearsonr(model2on1.probability, model2on1.alien_model)[0]
    pearson = averagecorr(pearson1on2, pearson2on1)

    # Spearman correlation in each direction, then averaged.
    spearman1on2 = stats.spearmanr(model1on2.probability, model1on2.alien_model)[0]
    spearman2on1 = stats.spearmanr(model2on1.probability, model2on1.alien_model)[0]
    spearman = averagecorr(spearman1on2, spearman2on1)

    # Accuracy lost by cross-application, averaged over both directions.
    loss1on2 = accuracy_loss(model1on2)
    loss2on1 = accuracy_loss(model2on1)
    loss = (loss1on2 + loss2on1) / 2

    # KL divergence between native and cross-applied scores, averaged.
    kl1on2 = kldivergence(model1on2.probability, model1on2.alien_model)
    kl2on1 = kldivergence(model2on1.probability, model2on1.alien_model)
    kl = (kl1on2 + kl2on1) / 2

    return pearson, spearman, loss, kl, spearman1on2, spearman2on1, loss1on2, loss2on1
def get_divergences(gold, testname, itera, size, pct):
    '''
    Get several possible measures of divergence between two models:
    averaged Pearson and Spearman correlations, lost accuracy, and
    KL divergence.  (itera, size, and pct are accepted for interface
    compatibility; this function does not read them.)
    '''

    # The gold-standard model's criteria live in a .pkl file and its
    # output on its own training examples in a .csv; likewise for the
    # test model.  We apply each model's criteria to the other's output.
    goldpath = '../measuredivergence/modeloutput/' + gold
    testpath = '../measuredivergence/modeloutput/' + testname
    model1 = goldpath + '.pkl'
    meta1 = goldpath + '.csv'
    model2 = testpath + '.pkl'
    meta2 = testpath + '.csv'

    model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2)
    model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1)

    def _corrs(corr_func):
        # Correlate native probabilities with cross-applied scores in
        # both directions using the supplied scipy correlation function.
        one_on_two = corr_func(model1on2.probability, model1on2.alien_model)[0]
        two_on_one = corr_func(model2on1.probability, model2on1.alien_model)[0]
        return one_on_two, two_on_one

    pearson1on2, pearson2on1 = _corrs(stats.pearsonr)
    pearson = averagecorr(pearson1on2, pearson2on1)

    spearman1on2, spearman2on1 = _corrs(stats.spearmanr)
    spearman = averagecorr(spearman1on2, spearman2on1)

    # Accuracy lost by cross-application, averaged over both directions.
    loss1on2 = accuracy_loss(model1on2)
    loss2on1 = accuracy_loss(model2on1)
    loss = (loss1on2 + loss2on1) / 2

    # KL divergence between native and cross-applied scores, averaged.
    kl1on2 = kldivergence(model1on2.probability, model1on2.alien_model)
    kl2on1 = kldivergence(model2on1.probability, model2on1.alien_model)
    kl = (kl1on2 + kl2on1) / 2

    return pearson, spearman, loss, kl, spearman1on2, spearman2on1, loss1on2, loss2on1
# Example no. 5
# 0
              encoding='utf-8') as f:
        for line in f:
            fields = line.split(',')
            if fields[0] == 'docid':
                header2 = line
            elif fields[0] in all_ids:
                line = add_metafeatures(fields[0], line, df)
                rows.append(line)
                found.add(fields[0])

    header = header1.strip('\n') + ",#noveltitle,#juvaudience,#notfiction\n"
    with open('holding_data.csv', mode='w', encoding='utf-8') as f:
        f.write(header)
        for r in rows:
            f.write(r)

    data = pd.read_csv('holding_data.csv', index_col='docid')

    newmeta = vt2.apply_pickled_model('output/juvmodel.pkl', df, data,
                                      'juvenileprob')
    newmeta = vt2.apply_pickled_model('output/nonmodel.pkl', df, data,
                                      'nonficprob')

    frames.append(newmeta)

enrichedrecord = pd.concat(frames, sort=False)

enrichedrecord.to_csv('../../enrichedrecordmeta.tsv',
                      index_label='docid',
                      sep='\t')
# Example no. 6
# 0
def measure_sf_divergences():
    '''
    Measure divergence between each gold-standard model and a grid of
    test models (varying iteration and ratio), appending one row per
    comparison to ../measuredivergence/divergences.tsv.

    For each pair, cross-applies the two pickled models to each other's
    output and records: averaged Pearson and Spearman correlations,
    accuracy loss, and KL divergence, along with each model's own
    accuracy read from modeldata.tsv.
    '''

    # Create the output file with a header row only if it doesn't
    # already exist; subsequent rows are appended.
    if not os.path.isfile('../measuredivergence/divergences.tsv'):
        with open('../measuredivergence/divergences.tsv',
                  mode='a',
                  encoding='utf-8') as f:
            outline = 'name1\tname2\tsize1\tsize2\tacc1\tacc2\tratiodiff\tpearson\tspearman\tspearman2on1\tloss\tkl\n'
            f.write(outline)

    goldstandards = [
        'iter5_size80_ratio0', 'iter6_size80_ratio0', 'iter7_size80_ratio0'
    ]
    size = 80

    # Per-model accuracies, indexed by model name.
    modeldata = pd.read_csv('../measuredivergence/modeldata.tsv',
                            sep='\t',
                            index_col='name')

    for gold in goldstandards:
        for itera in [5, 6]:
            for pct in range(0, 105, 5):
                ratio = pct / 100

                model1 = '../measuredivergence/modeloutput/' + gold + '.pkl'
                meta1 = '../measuredivergence/modeloutput/' + gold + '.csv'

                testname = 'iter' + str(itera) + '_size' + str(
                    size) + '_ratio' + str(pct)
                testpath = '../measuredivergence/modeloutput/' + testname

                if testname == gold:
                    continue
                    # we don't test a model against itself.
                # NOTE(review): testname is built with itera in [5, 6], so it
                # can never equal 'iter7_size80_ratio0'; this condition
                # therefore reduces to `ratio != 0`.  Possibly `gold` was
                # intended here — confirm before changing.
                if testname != 'iter7_size80_ratio0' and ratio != 0:
                    continue
                    # we're extending previous work

                model2 = testpath + '.pkl'
                meta2 = testpath + '.csv'

                acc1 = modeldata.loc[gold, 'accuracy']
                acc2 = modeldata.loc[testname, 'accuracy']

                # Cross-apply each model's criteria to the other's output.
                model1on2 = versatiletrainer2.apply_pickled_model(
                    model1, '../data/', '.tsv', meta2)
                model2on1 = versatiletrainer2.apply_pickled_model(
                    model2, '../data/', '.tsv', meta1)

                pearson1on2 = stats.pearsonr(model1on2.probability,
                                             model1on2.alien_model)[0]
                pearson2on1 = stats.pearsonr(model2on1.probability,
                                             model2on1.alien_model)[0]
                pearson = averagecorr(pearson1on2, pearson2on1)

                spearman1on2 = stats.spearmanr(model1on2.probability,
                                               model1on2.alien_model)[0]
                spearman2on1 = stats.spearmanr(model2on1.probability,
                                               model2on1.alien_model)[0]
                # BUGFIX: previously averaged the *Pearson* values here,
                # so the spearman column silently duplicated pearson.
                spearman = averagecorr(spearman1on2, spearman2on1)

                loss1on2 = accuracy_loss(model1on2)
                loss2on1 = accuracy_loss(model2on1)
                loss = (loss1on2 + loss2on1) / 2

                kl1on2 = kldivergence(model1on2.probability,
                                      model1on2.alien_model)
                kl2on1 = kldivergence(model2on1.probability,
                                      model2on1.alien_model)
                kl = (kl1on2 + kl2on1) / 2

                # Append one tab-separated result row matching the header.
                with open('../measuredivergence/divergences.tsv',
                          mode='a',
                          encoding='utf-8') as f:
                    outline = gold + '\t' + testname + '\t' + str(
                        size) + '\t' + str(size) + '\t' + str(
                            acc1) + '\t' + str(acc2) + '\t' + str(
                                ratio) + '\t' + str(pearson) + '\t' + str(
                                    spearman) + '\t' + str(
                                        spearman2on1) + '\t' + str(
                                            loss) + '\t' + str(kl) + '\n'
                    f.write(outline)