Example #1
0
 def __init__(self, domain_port, domain_order,
              gigaword_port, gigaword_order):
     """Open clients for the in-domain and Gigaword language models.

     domain_port / domain_order -- server port and n-gram order of the
         in-domain LM.
     gigaword_port / gigaword_order -- server port and n-gram order of
         the Gigaword LM.
     """
     # Word tokenizer used to normalize text before scoring.
     self.tok_ = RegexpTokenizer(r'\w+')
     # One client per LM server; the third argument is presumably a
     # flag on Client -- confirm against Client's signature.
     self.domain_lm_ = Client(domain_port, domain_order, True)
     self.gigaword_lm_ = Client(gigaword_port, gigaword_order, True)
     # Names of the features this extractor reports.
     self.features = [
         u"LM_FEATS: domain avg lp",
         u"LM_FEATS: gigaword avg lp",
     ]
Example #2
0
 def __init__(self, domain_port, domain_order, gigaword_port,
              gigaword_order):
     """Set up the tokenizer, LM clients, and feature-name list.

     The two (port, order) pairs identify the in-domain and Gigaword
     language-model servers respectively.
     """
     # Feature labels reported per scored sentence.
     self.features = [u"LM_FEATS: domain avg lp",
                      u"LM_FEATS: gigaword avg lp"]
     # Regex word tokenizer for pre-scoring normalization.
     self.tok_ = RegexpTokenizer(r'\w+')
     # LM server clients; trailing True flag's meaning is defined by
     # Client -- confirm against its signature.
     self.domain_lm_ = Client(domain_port, domain_order, True)
     self.gigaword_lm_ = Client(gigaword_port, gigaword_order, True)
Example #3
0
class LMProbExtractor(object):
    """Score sentences with two language-model servers.

    Holds a client for an in-domain LM and one for a Gigaword LM and
    exposes per-sentence average log-probability features.

    NOTE(review): the decode/encode round trip in ``process_article``
    assumes Python 2 byte strings -- confirm before porting to Python 3.
    """

    def __init__(self, domain_port, domain_order, gigaword_port,
                 gigaword_order):
        """Connect to both LM servers and build the tokenizer.

        Each (port, order) pair names a server and its n-gram order.
        """
        # Regex word tokenizer used to normalize article text.
        self.tok_ = RegexpTokenizer(r'\w+')
        # The trailing True flag's semantics belong to Client --
        # confirm against its signature.
        self.domain_lm_ = Client(domain_port, domain_order, True)
        self.gigaword_lm_ = Client(gigaword_port, gigaword_order, True)
        # Labels for the features produced per sentence.
        self.features = [
            u"LM_FEATS: domain avg lp", u"LM_FEATS: gigaword avg lp"
        ]

    def process_corenlp_strings(self, strings):
        """Score every string, returning one feature dict per input."""
        return list(map(self.process_corenlp_string, strings))

    def process_corenlp_string(self, string):
        """Return the two average log-prob features for one string."""
        _total_lp, domain_avg = self.domain_lm_.sentence_log_prob(string)
        _total_lp, gigaword_avg = self.gigaword_lm_.sentence_log_prob(string)
        return {u"LM_FEATS: domain avg lp": domain_avg,
                u"LM_FEATS: gigaword avg lp": gigaword_avg}

    def process_article(self, si):
        """Score each 'article-clf' sentence of a stream item *si*.

        Returns an empty list when the item has no 'article-clf'
        sentences; otherwise one avg-log-prob dict per sentence.
        """
        if u'article-clf' not in si.body.sentences:
            return list()

        scores = []
        for sentence in si.body.sentences[u'article-clf']:
            surface = ' '.join(tok.token for tok in sentence.tokens)
            # Normalize: bytes -> unicode, lowercase, regex-tokenize,
            # rejoin, and re-encode for the LM servers (Python 2 style).
            normalized = u' '.join(
                self.tok_.tokenize(surface.decode(u'utf-8').lower()))
            payload = normalized.encode(u'utf-8')
            _lp, domain_avg = self.domain_lm_.sentence_log_prob(payload)
            _lp, gigaword_avg = self.gigaword_lm_.sentence_log_prob(payload)
            scores.append({
                u"domain avg lp": domain_avg,
                u"gigaword avg lp": gigaword_avg
            })
        return scores
Example #4
0
class LMProbExtractor(object):
    """Average log-probability feature extractor backed by two LM servers.

    One client talks to an in-domain language model, the other to a
    Gigaword language model; both report per-sentence average log-probs.
    """

    def __init__(self, domain_port, domain_order,
                 gigaword_port, gigaword_order):
        """Build the tokenizer and connect both LM clients.

        The (port, order) pairs give each server's port and n-gram order.
        """
        # Word tokenizer for normalizing article sentences.
        self.tok_ = RegexpTokenizer(r'\w+')
        # Final True flag is interpreted by Client -- confirm its meaning
        # against Client's signature.
        self.domain_lm_ = Client(domain_port, domain_order, True)
        self.gigaword_lm_ = Client(gigaword_port, gigaword_order, True)
        # Feature names emitted by process_corenlp_string.
        self.features = [u"LM_FEATS: domain avg lp",
                         u"LM_FEATS: gigaword avg lp"]

    def process_corenlp_strings(self, strings):
        """Return a feature dict for each input string, in order."""
        return [self.process_corenlp_string(s) for s in strings]

    def process_corenlp_string(self, string):
        """Return LM_FEATS-keyed average log-probs for one string."""
        d_total, d_avg = self.domain_lm_.sentence_log_prob(string)
        g_total, g_avg = self.gigaword_lm_.sentence_log_prob(string)
        return {u"LM_FEATS: domain avg lp": d_avg,
                u"LM_FEATS: gigaword avg lp": g_avg}

    def process_article(self, si):
        """Score the 'article-clf' sentences of stream item *si*.

        Returns [] when no 'article-clf' sentences exist; otherwise a
        list with one average-log-prob dict per sentence.
        """
        if u'article-clf' not in si.body.sentences:
            return list()

        results = []
        for sent in si.body.sentences[u'article-clf']:
            raw = ' '.join(t.token for t in sent.tokens)
            # NOTE(review): Python 2-style round trip -- decode the byte
            # string, lowercase and regex-tokenize it, then re-encode
            # before handing it to the LM servers.
            text = raw.decode(u'utf-8').lower()
            text = u' '.join(self.tok_.tokenize(text))
            payload = text.encode(u'utf-8')
            d_total, d_avg = self.domain_lm_.sentence_log_prob(payload)
            g_total, g_avg = self.gigaword_lm_.sentence_log_prob(payload)
            results.append(
                {u"domain avg lp": d_avg,
                 u"gigaword avg lp": g_avg})
        return results