def get_divergence(sampleA, sampleB):
    '''
    Applies the model trained on sampleA to sampleB's output, and vice
    versa, and returns several measures of divergence between the two
    models: notably lost accuracy and z-transformed (arctanh) spearman
    correlation.
    '''

    # Build paths to each sample's pickled model criteria (.pkl) and
    # its model output (.csv) on the examples originally used to train
    # it. Each model gets applied to the *other* sample's output.
    root = '../models/'
    pkl_a = root + sampleA + '.pkl'
    csv_a = root + sampleA + '.csv'
    pkl_b = root + sampleB + '.pkl'
    csv_b = root + sampleB + '.csv'

    model1on2 = versatiletrainer2.apply_pickled_model(pkl_a, '../data/', '.tsv', csv_b)
    model2on1 = versatiletrainer2.apply_pickled_model(pkl_b, '../data/', '.tsv', csv_a)

    # Fisher z-transform (arctanh) each spearman rho before averaging,
    # since correlations are not directly averageable.
    spearman1on2 = np.arctanh(stats.spearmanr(model1on2.probability, model1on2.alien_model)[0])
    spearman2on1 = np.arctanh(stats.spearmanr(model2on1.probability, model2on1.alien_model)[0])
    spearman = (spearman1on2 + spearman2on1) / 2

    loss1on2 = accuracy_loss(model1on2)
    loss2on1 = accuracy_loss(model2on1)
    loss = (loss1on2 + loss2on1) / 2

    # Accuracy of each model's own predictions, and of the "alien"
    # model's predictions, on the same examples.
    alienacc2 = accuracy(model1on2, 'alien_model')
    alienacc1 = accuracy(model2on1, 'alien_model')
    acc2 = accuracy(model1on2, 'probability')
    acc1 = accuracy(model2on1, 'probability')

    meandate2 = np.mean(model1on2.std_date)
    meandate1 = np.mean(model2on1.std_date)

    return (spearman, loss, spearman1on2, spearman2on1, loss1on2, loss2on1,
            acc1, acc2, alienacc1, alienacc2, meandate1, meandate2)
def get_divergence(sampleA, sampleB): ''' This function applies model a to b, and vice versa, and returns a couple of measures of divergence: notably lost accuracy and z-tranformed spearman correlation. ''' # We start by constructing the paths to the sampleA # standard model criteria (.pkl) and # model output (.csv) on the examples # originally used to train it. # We're going to try applying the sampleA standard # criteria to another model's output, and vice- # versa. model1 = '../models/' + sampleA + '.pkl' meta1 = '../models/' + sampleA + '.csv' # Now we construct paths to the test model # criteria (.pkl) and output (.csv). model2 = '../models/' + sampleB + '.pkl' meta2 = '../models/' + sampleB + '.csv' model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2) model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1) spearman1on2 = np.arctanh(stats.spearmanr(model1on2.probability, model1on2.alien_model)[0]) spearman2on1 = np.arctanh(stats.spearmanr(model2on1.probability, model2on1.alien_model)[0]) spearman = (spearman1on2 + spearman2on1) / 2 loss1on2 = accuracy_loss(model1on2) loss2on1 = accuracy_loss(model2on1) loss = (loss1on2 + loss2on1) / 2 alienacc2 = accuracy(model1on2, 'alien_model') alienacc1 = accuracy(model2on1, 'alien_model') acc2 = accuracy(model1on2, 'probability') acc1 = accuracy(model2on1, 'probability') meandate2 = np.mean(model1on2.std_date) meandate1 = np.mean(model2on1.std_date) return spearman, loss, spearman1on2, spearman2on1, loss1on2, loss2on1, acc1, acc2, alienacc1, alienacc2, meandate1, meandate2
def get_divergences(gold, testname, itera, size, pct):
    '''
    This function gets several possible measures of divergence
    between two models: pearson and spearman correlation (averaged
    across both directions with averagecorr), lost accuracy, and
    KL divergence.
    '''

    # Paths to the gold standard model criteria (.pkl) and its output
    # (.csv) on the examples originally used to train it, plus the
    # same pair of paths for the test model. We apply the gold
    # criteria to the test model's output, and vice versa.
    outroot = '../measuredivergence/modeloutput/'
    model1 = outroot + gold + '.pkl'
    meta1 = outroot + gold + '.csv'
    testpath = outroot + testname
    model2 = testpath + '.pkl'
    meta2 = testpath + '.csv'

    model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2)
    model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1)

    # Correlate each model's own probabilities against the other
    # model's predictions on the same examples.
    def _corr(corrfunc, frame):
        return corrfunc(frame.probability, frame.alien_model)[0]

    pearson1on2 = _corr(stats.pearsonr, model1on2)
    pearson2on1 = _corr(stats.pearsonr, model2on1)
    pearson = averagecorr(pearson1on2, pearson2on1)

    spearman1on2 = _corr(stats.spearmanr, model1on2)
    spearman2on1 = _corr(stats.spearmanr, model2on1)
    spearman = averagecorr(spearman1on2, spearman2on1)

    loss1on2 = accuracy_loss(model1on2)
    loss2on1 = accuracy_loss(model2on1)
    loss = (loss1on2 + loss2on1) / 2

    kl1on2 = kldivergence(model1on2.probability, model1on2.alien_model)
    kl2on1 = kldivergence(model2on1.probability, model2on1.alien_model)
    kl = (kl1on2 + kl2on1) / 2

    return pearson, spearman, loss, kl, spearman1on2, spearman2on1, loss1on2, loss2on1
encoding='utf-8') as f: for line in f: fields = line.split(',') if fields[0] == 'docid': header2 = line elif fields[0] in all_ids: line = add_metafeatures(fields[0], line, df) rows.append(line) found.add(fields[0]) header = header1.strip('\n') + ",#noveltitle,#juvaudience,#notfiction\n" with open('holding_data.csv', mode='w', encoding='utf-8') as f: f.write(header) for r in rows: f.write(r) data = pd.read_csv('holding_data.csv', index_col='docid') newmeta = vt2.apply_pickled_model('output/juvmodel.pkl', df, data, 'juvenileprob') newmeta = vt2.apply_pickled_model('output/nonmodel.pkl', df, data, 'nonficprob') frames.append(newmeta) enrichedrecord = pd.concat(frames, sort=False) enrichedrecord.to_csv('../../enrichedrecordmeta.tsv', index_label='docid', sep='\t')
def measure_sf_divergences():
    '''
    Compares gold-standard models (size 80, ratio 0) to test models
    produced at varying ratios, and appends one row of divergence
    statistics per comparison to ../measuredivergence/divergences.tsv.

    Measures written per pair: pearson and spearman correlations
    between each model's probabilities and the other model's
    predictions (averaged across both directions with averagecorr),
    lost accuracy, and KL divergence.

    Side effects: creates divergences.tsv (with header) if absent, and
    appends rows to it; reads models and metadata from
    ../measuredivergence/modeloutput/ and modeldata.tsv.
    '''

    # Create the output file with a header row only if it doesn't
    # already exist; rows are appended, so reruns extend the file.
    if not os.path.isfile('../measuredivergence/divergences.tsv'):
        with open('../measuredivergence/divergences.tsv', mode='a', encoding='utf-8') as f:
            outline = 'name1\tname2\tsize1\tsize2\tacc1\tacc2\tratiodiff\tpearson\tspearman\tspearman2on1\tloss\tkl\n'
            f.write(outline)

    goldstandards = ['iter5_size80_ratio0', 'iter6_size80_ratio0', 'iter7_size80_ratio0']
    size = 80

    # Previously-recorded in-sample accuracy for each model, keyed by name.
    modeldata = pd.read_csv('../measuredivergence/modeldata.tsv', sep='\t', index_col='name')

    for gold in goldstandards:
        for itera in [5, 6]:
            for pct in range(0, 105, 5):
                ratio = pct / 100

                model1 = '../measuredivergence/modeloutput/' + gold + '.pkl'
                meta1 = '../measuredivergence/modeloutput/' + gold + '.csv'

                testname = 'iter' + str(itera) + '_size' + str(size) + '_ratio' + str(pct)
                testpath = '../measuredivergence/modeloutput/' + testname

                if testname == gold:
                    continue  # we don't test a model against itself.

                if testname != 'iter7_size80_ratio0' and ratio != 0:
                    continue  # we're extending previous work

                model2 = testpath + '.pkl'
                meta2 = testpath + '.csv'

                acc1 = modeldata.loc[gold, 'accuracy']
                acc2 = modeldata.loc[testname, 'accuracy']

                # Apply each model to the other model's metadata.
                model1on2 = versatiletrainer2.apply_pickled_model(model1, '../data/', '.tsv', meta2)
                model2on1 = versatiletrainer2.apply_pickled_model(model2, '../data/', '.tsv', meta1)

                pearson1on2 = stats.pearsonr(model1on2.probability, model1on2.alien_model)[0]
                pearson2on1 = stats.pearsonr(model2on1.probability, model2on1.alien_model)[0]
                pearson = averagecorr(pearson1on2, pearson2on1)

                spearman1on2 = stats.spearmanr(model1on2.probability, model1on2.alien_model)[0]
                spearman2on1 = stats.spearmanr(model2on1.probability, model2on1.alien_model)[0]
                # BUG FIX: this previously averaged pearson1on2/pearson2on1,
                # so the reported spearman column silently duplicated the
                # pearson column. Average the spearman values instead
                # (matching get_divergences).
                spearman = averagecorr(spearman1on2, spearman2on1)

                loss1on2 = accuracy_loss(model1on2)
                loss2on1 = accuracy_loss(model2on1)
                loss = (loss1on2 + loss2on1) / 2

                kl1on2 = kldivergence(model1on2.probability, model1on2.alien_model)
                kl2on1 = kldivergence(model2on1.probability, model2on1.alien_model)
                kl = (kl1on2 + kl2on1) / 2

                with open('../measuredivergence/divergences.tsv', mode='a', encoding='utf-8') as f:
                    outline = gold + '\t' + testname + '\t' + str(size) + '\t' + str(size) + '\t' + \
                        str(acc1) + '\t' + str(acc2) + '\t' + str(ratio) + '\t' + str(pearson) + '\t' + \
                        str(spearman) + '\t' + str(spearman2on1) + '\t' + str(loss) + '\t' + str(kl) + '\n'
                    f.write(outline)