Ejemplo n.º 1
0
 def transform(self, X):
     """Compute the Jaccard overlap between article and claim headlines.

     For every row of ``X`` (walked with ``iterrows``) the lemmas of
     ``articleHeadline`` and ``claimHeadline`` are collected into sets and
     the ratio ``|intersection| / |union|`` is stored.

     Returns:
         A ``(len(X), 1)`` array of overlap ratios. A row whose two
         headlines both yield no tokens keeps the value 0 instead of
         raising ``ZeroDivisionError``.
     """
     mat = np.zeros((len(X), 1))
     for i, (_, s) in enumerate(X.iterrows()):
         article_lemmas = set(get_tokenized_lemmas(s.articleHeadline))
         claim_lemmas = set(get_tokenized_lemmas(s.claimHeadline))
         union = article_lemmas | claim_lemmas
         if not union:
             # Nothing to compare: the ratio is undefined, leave the cell at 0.
             continue
         mat[i, 0] = len(article_lemmas & claim_lemmas) / float(len(union))
     return mat
Ejemplo n.º 2
0
 def transform(self, X):
     """Compute the Jaccard overlap between article and claim headlines.

     For every row of ``X`` (walked with ``iterrows``) the lemmas of
     ``articleHeadline`` and ``claimHeadline`` are collected into sets and
     the ratio ``|intersection| / |union|`` is stored.

     Returns:
         A ``(len(X), 1)`` array of overlap ratios. A row whose two
         headlines both yield no tokens keeps the value 0 instead of
         raising ``ZeroDivisionError``.
     """
     mat = np.zeros((len(X), 1))
     for i, (_, s) in enumerate(X.iterrows()):
         article_lemmas = set(get_tokenized_lemmas(s.articleHeadline))
         claim_lemmas = set(get_tokenized_lemmas(s.claimHeadline))
         union = article_lemmas | claim_lemmas
         if not union:
             # Nothing to compare: the ratio is undefined, leave the cell at 0.
             continue
         mat[i, 0] = len(article_lemmas & claim_lemmas) / float(len(union))
     return mat
Ejemplo n.º 3
0
 def transform(self, X):
     """Score each claim/article pair with ``self._sts`` over its alignment.

     For every row of ``X`` the entry for ``(claimId, articleId)`` is looked
     up in the mapping returned by ``get_aligned_data()``. When a non-empty
     entry exists, both headlines are lemmatised and the value returned by
     ``self._sts(claim_tok, article_tok, idx)`` is stored; otherwise the row
     keeps 0.

     Returns:
         A ``(len(X), 1)`` array of scores.
     """
     mat = np.zeros((len(X), 1))
     for i, (_, s) in enumerate(X.iterrows()):
         idx = get_aligned_data().get((s.claimId, s.articleId))
         if not idx:
             continue
         try:
             claim_tok = get_tokenized_lemmas(s.claimHeadline)
             article_tok = get_tokenized_lemmas(s.articleHeadline)
             mat[i, 0] = self._sts(claim_tok, article_tok, idx)
         except Exception:
             # Deliberate best effort: a pair that cannot be scored keeps 0.
             # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
             # and SystemExit propagate.
             pass
     return mat
Ejemplo n.º 4
0
 def transform(self, X):
     """Score each claim/article pair with ``self._sts`` over its alignment.

     For every row of ``X`` the entry for ``(claimId, articleId)`` is looked
     up in the mapping returned by ``get_aligned_data()``. When a non-empty
     entry exists, both headlines are lemmatised and the value returned by
     ``self._sts(claim_tok, article_tok, idx)`` is stored; otherwise the row
     keeps 0.

     Returns:
         A ``(len(X), 1)`` array of scores.
     """
     mat = np.zeros((len(X), 1))
     for i, (_, s) in enumerate(X.iterrows()):
         idx = get_aligned_data().get((s.claimId, s.articleId))
         if not idx:
             continue
         try:
             claim_tok = get_tokenized_lemmas(s.claimHeadline)
             article_tok = get_tokenized_lemmas(s.articleHeadline)
             mat[i, 0] = self._sts(claim_tok, article_tok, idx)
         except Exception:
             # Deliberate best effort: a pair that cannot be scored keeps 0.
             # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
             # and SystemExit propagate.
             pass
     return mat
Ejemplo n.º 5
0
 def transform(self, X):
     """Build a 0/1 indicator matrix of refuting words per article headline.

     Cell ``(i, j)`` is 1 when ``_refuting_words[j]`` occurs among the
     lemmas of the i-th row's ``articleHeadline`` and 0 otherwise. Words
     are compared as-is; no stemming is applied.

     Returns:
         A ``(len(X), len(_refuting_words))`` array.
     """
     mat = np.zeros((len(X), len(_refuting_words)))
     for row_no, (_, record) in enumerate(X.iterrows()):
         lemmas = get_tokenized_lemmas(record.articleHeadline)
         flags = [int(word in lemmas) for word in _refuting_words]
         mat[row_no, :] = np.array(flags)
     return mat
Ejemplo n.º 6
0
    def transform(self, X):
        """Mark which cluster pairs occur between article and claim headlines.

        ``get_brown_cluster_data(self.cluster_size)`` supplies a lemma ->
        cluster lookup and a (cluster, cluster) -> column-index lookup. For
        every (article lemma, claim lemma) combination of a row whose two
        lemmas both have a cluster, the matching column of that row is set
        to 1.

        Returns:
            A ``dok_matrix`` (float32) with one row per row of ``X`` and one
            column per entry of the pair-index lookup.
        """
        cluster_of, pair_column = get_brown_cluster_data(self.cluster_size)
        mat = dok_matrix((len(X), len(pair_column.values())), dtype=np.float32)
        for row_no, (_, record) in enumerate(X.iterrows()):
            claim_lemmas = get_tokenized_lemmas(record.claimHeadline)
            article_lemmas = get_tokenized_lemmas(record.articleHeadline)

            for article_word, claim_word in it.product(article_lemmas,
                                                       claim_lemmas):
                article_cluster = cluster_of.get(article_word)
                claim_cluster = cluster_of.get(claim_word)
                # Only pairs where both lemmas are clustered contribute.
                if article_cluster is not None and claim_cluster is not None:
                    column = pair_column[(article_cluster, claim_cluster)]
                    mat[row_no, column] = 1
        return mat
Ejemplo n.º 7
0
    def transform(self, X):
        """Mark which cluster pairs occur between article and claim headlines.

        ``get_brown_cluster_data(self.cluster_size)`` supplies a lemma ->
        cluster lookup and a (cluster, cluster) -> column-index lookup. For
        every (article lemma, claim lemma) combination of a row whose two
        lemmas both have a cluster, the matching column of that row is set
        to 1.

        Returns:
            A ``dok_matrix`` (float32) with one row per row of ``X`` and one
            column per entry of the pair-index lookup.
        """
        cluster_of, pair_column = get_brown_cluster_data(self.cluster_size)
        mat = dok_matrix((len(X), len(pair_column.values())), dtype=np.float32)
        for row_no, (_, record) in enumerate(X.iterrows()):
            claim_lemmas = get_tokenized_lemmas(record.claimHeadline)
            article_lemmas = get_tokenized_lemmas(record.articleHeadline)

            for article_word, claim_word in it.product(article_lemmas,
                                                       claim_lemmas):
                article_cluster = cluster_of.get(article_word)
                claim_cluster = cluster_of.get(claim_word)
                # Only pairs where both lemmas are clustered contribute.
                if article_cluster is not None and claim_cluster is not None:
                    column = pair_column[(article_cluster, claim_cluster)]
                    mat[row_no, column] = 1
        return mat
Ejemplo n.º 8
0
    def transform(self, X):
        """Apply ``self._match`` to tokens selected by each row's alignment.

        For every row of ``X`` the entry for ``(claimId, articleId)`` is
        looked up in the mapping returned by ``get_aligned_data()``. Each
        ``(x, y)`` pair of a non-empty entry may overwrite the row's value
        with the result of ``self._match``, so the last pair that hits one
        of the branches decides the stored value. Rows without an entry
        keep 0.

        Returns:
            A ``(len(X), 1)`` array.
        """
        mat = np.zeros((len(X), 1))
        for row_no, (_, record) in enumerate(X.iterrows()):
            alignment = get_aligned_data().get(
                (record.claimId, record.articleId))
            if not alignment:
                # The cell is already 0 from np.zeros.
                continue

            claim_lemmas = get_tokenized_lemmas(record.claimHeadline)
            article_lemmas = get_tokenized_lemmas(record.articleHeadline)
            flag = 0
            for claim_pos, article_pos in alignment:
                if claim_pos > 0 and article_pos == 0:
                    flag = self._match(claim_lemmas[claim_pos - 1])
                elif claim_pos == 0 and article_pos > 0:
                    flag = self._match(article_lemmas[article_pos - 1])
                # NOTE(review): the probe is a list; if the alignment holds
                # tuples this membership test can never match - confirm.
                elif [claim_pos - 1, article_pos - 1] not in alignment:
                    claim_hit = self._match(claim_lemmas[claim_pos - 1])
                    flag = claim_hit or self._match(
                        article_lemmas[article_pos - 1])
            mat[row_no, 0] = flag

        return mat
Ejemplo n.º 9
0
 def transform(self, X):
     """Build a 0/1 indicator matrix of refuting words per article headline.

     Cell ``(i, j)`` is 1 when ``_refuting_words[j]`` occurs among the
     lemmas of the i-th row's ``articleHeadline`` and 0 otherwise. Words
     are compared as-is; no stemming is applied.

     Returns:
         A ``(len(X), len(_refuting_words))`` array.
     """
     mat = np.zeros((len(X), len(_refuting_words)))
     for row_no, (_, record) in enumerate(X.iterrows()):
         lemmas = get_tokenized_lemmas(record.articleHeadline)
         flags = [int(word in lemmas) for word in _refuting_words]
         mat[row_no, :] = np.array(flags)
     return mat
Ejemplo n.º 10
0
    def transform(self, X):
        """Apply ``self._match`` to tokens selected by each row's alignment.

        For every row of ``X`` the entry for ``(claimId, articleId)`` is
        looked up in the mapping returned by ``get_aligned_data()``. Each
        ``(x, y)`` pair of a non-empty entry may overwrite the row's value
        with the result of ``self._match``, so the last pair that hits one
        of the branches decides the stored value. Rows without an entry
        keep 0.

        Returns:
            A ``(len(X), 1)`` array.
        """
        mat = np.zeros((len(X), 1))
        for row_no, (_, record) in enumerate(X.iterrows()):
            alignment = get_aligned_data().get(
                (record.claimId, record.articleId))
            if not alignment:
                # The cell is already 0 from np.zeros.
                continue

            claim_lemmas = get_tokenized_lemmas(record.claimHeadline)
            article_lemmas = get_tokenized_lemmas(record.articleHeadline)
            flag = 0
            for claim_pos, article_pos in alignment:
                if claim_pos > 0 and article_pos == 0:
                    flag = self._match(claim_lemmas[claim_pos - 1])
                elif claim_pos == 0 and article_pos > 0:
                    flag = self._match(article_lemmas[article_pos - 1])
                # NOTE(review): the probe is a list; if the alignment holds
                # tuples this membership test can never match - confirm.
                elif [claim_pos - 1, article_pos - 1] not in alignment:
                    claim_hit = self._match(claim_lemmas[claim_pos - 1])
                    flag = claim_hit or self._match(
                        article_lemmas[article_pos - 1])
            mat[row_no, 0] = flag

        return mat
def calc_hungarian_alignment_score(s, t):
    """Calculate the alignment score between the two texts s and t
    using the implementation of the Hungarian alignment algorithm
    provided in https://pypi.python.org/pypi/munkres/.

    Returns:
        A pair ``(indexes, score)``. ``indexes`` is the list of
        ``(row, column)`` assignments produced by ``_munk.compute``; rows
        index the lemmas of ``s`` and columns the lemmas of ``t``. ``score``
        is the sum of the paraphrase scores of the assigned pairs divided by
        the length of the shorter lemma list. When either text yields no
        lemmas the result is ``([], 0.0)``.
    """
    s_toks = get_tokenized_lemmas(s)
    t_toks = get_tokenized_lemmas(t)

    if len(s_toks) == 0 or len(t_toks) == 0:
        # No assignment is possible and the normaliser below would be 0.
        return [], 0.0

    # Fill the score matrix positionally instead of via label-based
    # ``DataFrame.ix``, which has been removed from pandas.
    matrix = np.array(
        [[compute_paraphrase_score(c, a) for a in t_toks] for c in s_toks],
        dtype=float)
    # munkres minimises cost, so turn scores into costs.
    cost_matrix = make_cost_matrix(matrix, lambda cost: _max_ppdb_score - cost)

    indexes = _munk.compute(cost_matrix)
    total = sum((matrix[row][column] for row, column in indexes), 0.0)
    return indexes, total / float(min(matrix.shape))
def calc_hungarian_alignment_score(s, t):
    """Calculate the alignment score between the two texts s and t
    using the implementation of the Hungarian alignment algorithm
    provided in https://pypi.python.org/pypi/munkres/.

    Returns:
        A pair ``(indexes, score)``. ``indexes`` is the list of
        ``(row, column)`` assignments produced by ``_munk.compute``; rows
        index the lemmas of ``s`` and columns the lemmas of ``t``. ``score``
        is the sum of the paraphrase scores of the assigned pairs divided by
        the length of the shorter lemma list. When either text yields no
        lemmas the result is ``([], 0.0)``.
    """
    s_toks = get_tokenized_lemmas(s)
    t_toks = get_tokenized_lemmas(t)

    if len(s_toks) == 0 or len(t_toks) == 0:
        # No assignment is possible and the normaliser below would be 0.
        return [], 0.0

    # Fill the score matrix positionally instead of via label-based
    # ``DataFrame.ix``, which has been removed from pandas.
    matrix = np.array(
        [[compute_paraphrase_score(c, a) for a in t_toks] for c in s_toks],
        dtype=float)
    # munkres minimises cost, so turn scores into costs.
    cost_matrix = make_cost_matrix(matrix, lambda cost: _max_ppdb_score - cost)

    indexes = _munk.compute(cost_matrix)
    total = sum((matrix[row][column] for row, column in indexes), 0.0)
    return indexes, total / float(min(matrix.shape))
Ejemplo n.º 13
0
 def _compute_overlap(row):
     """Return the Jaccard overlap of the row's claim and article headlines.

     The lemmas of ``row.claimHeadline`` and ``row.articleHeadline`` are
     compared as sets; the result is ``|intersection| / |union|`` as a
     float. When neither headline yields a lemma the result is ``0.0``
     rather than a ``ZeroDivisionError``.
     """
     claim_lemmas = set(get_tokenized_lemmas(row.claimHeadline))
     article_lemmas = set(get_tokenized_lemmas(row.articleHeadline))
     union = claim_lemmas | article_lemmas
     if not union:
         return 0.0
     return float(len(claim_lemmas & article_lemmas)) / len(union)
Ejemplo n.º 14
0
 def _calc_polarity(s):
     """Return the parity of the number of refuting tokens in ``s``.

     Every lemma of ``s`` that is contained in ``_refuting_words`` is
     counted (repeats included); the result is that count modulo 2.
     """
     refuting_count = 0
     for token in get_tokenized_lemmas(s):
         if token in _refuting_words:
             refuting_count += 1
     return refuting_count % 2
Ejemplo n.º 15
0
def _get_bigram_clusters(s, bc_data):
    """Return the bigrams over the clusters of the lemmas of ``s``.

    Each lemma is looked up in ``bc_data``; lemmas with a missing or falsy
    entry are dropped before the remaining sequence is passed to
    ``ngrams(..., 2)``.
    """
    looked_up = []
    for lemma in get_tokenized_lemmas(s):
        looked_up.append(bc_data.get(lemma))
    known_clusters = filter(None, looked_up)
    return ngrams(known_clusters, 2)
Ejemplo n.º 16
0
 def transform(self, X):
     """Flag rows whose article headline contains a question mark token.

     Returns:
         A ``(len(X), 1)`` array holding 1 where ``'?'`` is among the
         lemmas of the row's ``articleHeadline`` and 0 elsewhere.
     """
     mat = np.zeros((len(X), 1))
     for row_no, (_, record) in enumerate(X.iterrows()):
         lemmas = get_tokenized_lemmas(record.articleHeadline)
         mat[row_no, 0] = 1 if '?' in lemmas else 0
     return mat
Ejemplo n.º 17
0
def _get_bigram_clusters(s, bc_data):
    """Return the bigrams over the clusters of the lemmas of ``s``.

    Each lemma is looked up in ``bc_data``; lemmas with a missing or falsy
    entry are dropped before the remaining sequence is passed to
    ``ngrams(..., 2)``.
    """
    looked_up = []
    for lemma in get_tokenized_lemmas(s):
        looked_up.append(bc_data.get(lemma))
    known_clusters = filter(None, looked_up)
    return ngrams(known_clusters, 2)
Ejemplo n.º 18
0
 def transform(self, X):
     """Flag rows whose article headline contains a question mark token.

     Returns:
         A ``(len(X), 1)`` array holding 1 where ``'?'`` is among the
         lemmas of the row's ``articleHeadline`` and 0 elsewhere.
     """
     mat = np.zeros((len(X), 1))
     for row_no, (_, record) in enumerate(X.iterrows()):
         lemmas = get_tokenized_lemmas(record.articleHeadline)
         mat[row_no, 0] = 1 if '?' in lemmas else 0
     return mat
Ejemplo n.º 19
0
 def _calc_polarity(s):
     """Return the parity of the number of refuting tokens in ``s``.

     Every lemma of ``s`` that is contained in ``_refuting_words`` is
     counted (repeats included); the result is that count modulo 2.
     """
     refuting_count = 0
     for token in get_tokenized_lemmas(s):
         if token in _refuting_words:
             refuting_count += 1
     return refuting_count % 2
    import pickle

from aligner import align

from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    aligned = [a-1 for (a, _) in alignment]
    unaligned = [i for i in range(len(tokens)) if i not in aligned]
    return [tokens[i] for i in unaligned]


if __name__ == "__main__":
    # Align every claim/article headline pair of the dataset and pickle the
    # resulting {(claimId, articleId): [(s - 1, t - 1), ...]} mapping.

    # NOTE(review): `os` is not part of the import block visible above;
    # importing it here keeps os.path.join below from raising NameError.
    import os

    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            data[(row.claimId, row.articleId)] = [(s-1, t-1) for (s, t) in alignment[0]]
        except Exception:
            # Best effort: report the pair and carry on with the next row.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            # The %-formatted print emits the same text under Python 2 and 3.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

from aligner import align

from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    aligned = [a - 1 for (a, _) in alignment]
    unaligned = [i for i in range(len(tokens)) if i not in aligned]
    return [tokens[i] for i in unaligned]


if __name__ == "__main__":
    # Align every claim/article headline pair of the dataset and pickle the
    # resulting {(claimId, articleId): [(s - 1, t - 1), ...]} mapping.

    # NOTE(review): neither `os` nor `pickle` is part of the import block
    # visible directly above; importing them here keeps the final `with`
    # statement from raising NameError.
    import os
    import pickle

    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1)
                                                  for (s, t) in alignment[0]]
        except Exception:
            # Best effort: report the pair and carry on with the next row.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            # The %-formatted print emits the same text under Python 2 and 3.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
Ejemplo n.º 22
0
 def _compute_overlap(row):
     """Return the Jaccard overlap of the row's claim and article headlines.

     The lemmas of ``row.claimHeadline`` and ``row.articleHeadline`` are
     compared as sets; the result is ``|intersection| / |union|`` as a
     float. When neither headline yields a lemma the result is ``0.0``
     rather than a ``ZeroDivisionError``.
     """
     claim_lemmas = set(get_tokenized_lemmas(row.claimHeadline))
     article_lemmas = set(get_tokenized_lemmas(row.articleHeadline))
     union = claim_lemmas | article_lemmas
     if not union:
         return 0.0
     return float(len(claim_lemmas & article_lemmas)) / len(union)