Beispiel #1
0
    def get_metamatrix(self):
        """Assemble the meta-level feature matrix from classifier and LM outputs.

        Steps performed:

        1. Concatenate the two probability tables returned by
           ``self.get_probas()`` with the language-model probabilities
           returned by ``get_lm_probas()`` into ``self.metafeats``.
        2. For every row, look up the LM probability of the article named in
           ``self.feats['Target']`` (``init_prob``) and in
           ``self.feats['Predicted']`` (``corr_prob``), and store them together
           with their ratio and difference as extra columns.
        3. Store the LM's most probable option (the column label without the
           ``lm_`` prefix) in ``self.feats['LM']``.
        4. Keep only the rows where ``Target`` differs from ``Predicted`` or
           from ``LM``.
        5. Stack the remaining numeric features with the vectorized
           ``Target``/``LM``/``Predicted`` columns into
           ``self.metafeats_sparse`` and print its shape.

        Returns nothing; the results are stored on ``self``.
        """
        # Local import keeps this block self-contained.
        from scipy.sparse import csr_matrix

        lm_cols = ['lm_a', 'lm_an', 'lm_the', 'lm_zero']

        preds, preds_type = self.get_probas()
        str_sents = self.get_sentences_for_lm()
        probs = get_lm_probas(str_sents)
        # temp for local windows - file is processed by lm on server separately
        #self.write_sentences_for_lm(str_sents)
        #with open('lm_preds_test_articles.json','r',encoding='utf-8') as f:
        #    probs = json.loads(f.read())
        # end temp
        self.metafeats = pd.concat(
            [pd.DataFrame(preds, columns=['present', 'zero']),
             pd.DataFrame(preds_type, columns=['a', 'an', 'the']),
             pd.DataFrame(probs, columns=lm_cols)],
            axis=1)

        probs_ratio = []
        probs_delta = []
        init_probs = []
        corr_probs = []
        lm_choice = []
        # Rows of self.metafeats and self.feats are paired by position.
        for i in range(self.metafeats.shape[0]):
            row = self.metafeats.iloc[i]
            feat_row = self.feats.iloc[i]
            init_prob = row['lm_' + feat_row['Target']]
            corr_prob = row['lm_' + feat_row['Predicted']]
            init_probs.append(init_prob)
            corr_probs.append(corr_prob)
            probs_ratio.append(init_prob / corr_prob)
            probs_delta.append(init_prob - corr_prob)
            # idxmax() returns the column *label* on every pandas version;
            # np.argmax() on a Series returns an integer position in current
            # pandas, which would break the .split() below.
            lm_choice.append(row[lm_cols].idxmax().split('_')[1])
        self.metafeats['init_prob'] = init_probs
        self.metafeats['corr_prob'] = corr_probs
        self.metafeats['probs_ratio'] = probs_ratio
        self.metafeats['probs_delta'] = probs_delta
        self.feats['LM'] = lm_choice

        # Drop rows where both the classifier and the LM agree with Target.
        differs = ((self.feats['Target'] != self.feats['Predicted']) |
                   (self.feats['Target'] != self.feats['LM']))
        self.metafeats = self.metafeats.loc[differs, :]

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load vectorizer files from a trusted location.
        with open('../models/article_choice_vectorizer.pickle', 'rb') as f:
            art_vect = pickle.load(f)

        kept = self.metafeats.index
        # DataFrame.to_sparse() no longer exists in pandas >= 1.0, so the
        # numeric block is converted to a scipy sparse matrix explicitly.
        self.metafeats_sparse = hstack((
            csr_matrix(self.metafeats.values),
            art_vect.transform(self.feats.loc[kept, 'Target']),
            art_vect.transform(self.feats.loc[kept, 'LM']),
            art_vect.transform(self.feats.loc[kept, 'Predicted'])))
        print(self.metafeats_sparse.shape)
Beispiel #2
0
def lm_decision(sent, initial, suggestions, idx):
    """Return the index of the suggestion the language model scores highest.

    Every entry of ``suggestions`` is substituted for the ``len(initial)``
    characters of ``sent`` that start at ``idx``. The resulting variants are
    joined with newlines (plus a trailing newline) and passed to
    ``get_lm_probas`` in ``'text'`` mode; the ``np.argmax`` of its result is
    returned.
    """
    prefix = sent[:idx]
    suffix = sent[idx + len(initial):]
    candidates = [prefix + replacement + suffix for replacement in suggestions]
    lm_input = '\n'.join(candidates) + '\n'
    scores = get_lm_probas(lm_input, 'text')
    return np.argmax(scores)
Beispiel #3
0
        #if i != len(error_spans):
        #    print(text)
        #    print(i)
        #    print(error_spans)
        #    print(article_corrector.feats[['raw_NP','Start_idx','Sent_start_idx']])
        #    print('=================')
        corrector.feats = []
        tn += 1
    #with open('init_sents_for_'+err.lower()+'.txt','w',encoding='utf-8') as f:
    #    f.write('\n==========\n\n'.join(init_sents))

    #with open('tagged_sents_for_'+err.lower()+'.pickle','wb') as f:
    #    pickle.dump(tagged_sents,f)

    lm_preds = get_lm_probas('\n\n'.join(['\n'.join(x)
                                          for x in all_sents]) + '\n',
                             inp_type='text')
    with open(err.lower() + '_meta.csv', 'w', encoding='utf-8-sig',
              newline='') as f:
        csvw = csv.writer(f,
                          delimiter=';',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
        csvw.writerow(corrector.logit_bin.classes_.tolist() +
                      corrector.logit_type.classes_.tolist() + [
                          'raw_NP', 'Start_idx', 'Sent_start_idx', 'Initial',
                          'ML_L1', 'Ann'
                      ] + ['lm_' + x for x in options])
        for pred, predt, corr, lm_pred in zip(predsp, predst, correct,
                                              lm_preds):
            csvw.writerow(list(pred) + list(predt) + corr + lm_pred)