from nltk.tokenize import RegexpTokenizer
# NOTE: `Client` is this project's language-model server client
# (e.g. a query client for a running SRILM/KenLM server); its import
# path is project-specific and assumed to be available here.


class LMProbExtractor(object):
    def __init__(self, domain_port, domain_order,
                 gigaword_port, gigaword_order):
        self.tok_ = RegexpTokenizer(r'\w+')
        self.domain_lm_ = Client(domain_port, domain_order, True)
        self.gigaword_lm_ = Client(gigaword_port, gigaword_order, True)
        self.features = [u"LM_FEATS: domain avg lp",
                         u"LM_FEATS: gigaword avg lp"]

    def process_corenlp_strings(self, strings):
        return [self.process_corenlp_string(string) for string in strings]

    def process_corenlp_string(self, string):
        # Score the string under both language models; each client call
        # returns (total log prob, per-token average log prob).
        dmn_lp, dmn_avg_lp = self.domain_lm_.sentence_log_prob(string)
        gw_lp, gw_avg_lp = self.gigaword_lm_.sentence_log_prob(string)
        return {u"LM_FEATS: domain avg lp": dmn_avg_lp,
                u"LM_FEATS: gigaword avg lp": gw_avg_lp}

    def process_article(self, si):
        if u'article-clf' not in si.body.sentences:
            return list()
        lm_scores = []
        for sentence in si.body.sentences[u'article-clf']:
            # Rebuild the sentence text, lowercase it, strip punctuation
            # with the word tokenizer, and re-encode it as utf-8 bytes
            # before querying the language model servers.
            bytes_string = ' '.join(token.token for token in sentence.tokens)
            uni_string = bytes_string.decode(u'utf-8')
            uni_string = uni_string.lower()
            uni_tokens = self.tok_.tokenize(uni_string)
            uni_string = u' '.join(uni_tokens)
            bytes_string = uni_string.encode(u'utf-8')
            dmn_lp, dmn_avg_lp = self.domain_lm_.sentence_log_prob(
                bytes_string)
            gw_lp, gw_avg_lp = self.gigaword_lm_.sentence_log_prob(
                bytes_string)
            lm_scores.append({u"domain avg lp": dmn_avg_lp,
                              u"gigaword avg lp": gw_avg_lp})
        return lm_scores
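

# --- Usage sketch (illustrative only) ---
# A minimal sketch of how the extractor might be driven. The ports and
# n-gram orders below are placeholder values, the example sentence is
# made up, and both LM servers are assumed to already be listening.
if __name__ == '__main__':
    extractor = LMProbExtractor(9090, 3, 9091, 3)
    feats = extractor.process_corenlp_string('police evacuated the area')
    print feats[u"LM_FEATS: domain avg lp"]
    print feats[u"LM_FEATS: gigaword avg lp"]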