def transform(self, X):
    """Compute the Jaccard overlap of article and claim headline tokens.

    For every row of ``X`` both headlines are tokenized with
    ``get_tokenized_lemmas`` and the ratio ``|intersection| / |union|`` of
    the two token sets is stored.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``articleHeadline`` and ``claimHeadline``.
    :return: array of shape ``(len(X), 1)`` holding one overlap ratio per row.
    """
    mat = np.zeros((len(X), 1))
    for i, (_, s) in enumerate(X.iterrows()):
        article_tokens = set(get_tokenized_lemmas(s.articleHeadline))
        claim_tokens = set(get_tokenized_lemmas(s.claimHeadline))
        union = article_tokens | claim_tokens
        if not union:
            # Neither headline produced a token: keep the pre-filled 0
            # instead of dividing by zero.
            continue
        shared = article_tokens & claim_tokens
        mat[i, 0] = len(shared) / float(len(union))
    return mat
def transform(self, X):
    """Score each claim/article headline pair with ``self._sts``.

    Rows whose ``(claimId, articleId)`` key has no (or an empty) entry in the
    data returned by ``get_aligned_data`` keep a score of 0, as do rows for
    which scoring raises an exception.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``claimId``, ``articleId``, ``claimHeadline`` and
        ``articleHeadline``.
    :return: array of shape ``(len(X), 1)`` with one score per row.
    """
    mat = np.zeros((len(X), 1))
    # The alignment lookup does not depend on the row, so fetch it once.
    aligned_data = get_aligned_data()
    for i, (_, s) in enumerate(X.iterrows()):
        idx = aligned_data.get((s.claimId, s.articleId))
        if not idx:
            continue
        try:
            claim_tok = get_tokenized_lemmas(s.claimHeadline)
            article_tok = get_tokenized_lemmas(s.articleHeadline)
            mat[i, 0] = self._sts(claim_tok, article_tok, idx)
        except Exception:
            # Best-effort feature: a row that cannot be scored stays at 0.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
    return mat
def transform(self, X):
    """Build one 0/1 indicator column per entry of ``_refuting_words``.

    A cell is 1 when the corresponding refuting word appears among the
    tokens of the row's ``articleHeadline`` and 0 otherwise.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``articleHeadline``.
    :return: array of shape ``(len(X), len(_refuting_words))``.
    """
    features = np.zeros((len(X), len(_refuting_words)))
    for row_no, (_, record) in enumerate(X.iterrows()):
        # Tokens are matched as-is; no stemming is applied.
        tokens = get_tokenized_lemmas(record.articleHeadline)
        for col_no, word in enumerate(_refuting_words):
            if word in tokens:
                features[row_no, col_no] = 1
    return features
def transform(self, X):
    """Mark which cluster pairs occur between article and claim tokens.

    ``get_brown_cluster_data(self.cluster_size)`` supplies a token-to-cluster
    lookup and a lookup from ``(article_cluster, claim_cluster)`` pairs to
    column numbers. For each row, every (article token, claim token) pair
    whose tokens both have a cluster sets the matching cell to 1.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``claimHeadline`` and ``articleHeadline``.
    :return: ``dok_matrix`` of dtype ``float32`` with one row per row of
        ``X`` and one column per known cluster pair.
    """
    token_cluster, pair_column = get_brown_cluster_data(self.cluster_size)
    features = dok_matrix((len(X), len(pair_column)), dtype=np.float32)
    for row_no, (_, record) in enumerate(X.iterrows()):
        claim_tokens = get_tokenized_lemmas(record.claimHeadline)
        article_tokens = get_tokenized_lemmas(record.articleHeadline)
        for article_token in article_tokens:
            article_cluster = token_cluster.get(article_token)
            if article_cluster is None:
                # No pair involving this article token can be looked up.
                continue
            for claim_token in claim_tokens:
                claim_cluster = token_cluster.get(claim_token)
                if claim_cluster is None:
                    continue
                column = pair_column[(article_cluster, claim_cluster)]
                features[row_no, column] = 1
    return features
def transform(self, X):
    """Derive one ``self._match`` based feature per row from its alignment.

    The alignment for a row is looked up by ``(claimId, articleId)`` in the
    data returned by ``get_aligned_data``. Each ``(x, y)`` pair of that
    alignment may overwrite the feature with the result of ``self._match``
    on the claim token at ``x - 1`` and/or the article token at ``y - 1``.
    Rows without an alignment keep the value 0.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``claimId``, ``articleId``, ``claimHeadline`` and
        ``articleHeadline``.
    :return: array of shape ``(len(X), 1)``.
    """
    mat = np.zeros((len(X), 1))
    # The alignment lookup does not depend on the row, so fetch it once
    # instead of once per row.
    aligned_data = get_aligned_data()
    for i, (_, s) in enumerate(X.iterrows()):
        idx = aligned_data.get((s.claimId, s.articleId))
        f = 0
        if idx:
            claim_tok = get_tokenized_lemmas(s.claimHeadline)
            article_tok = get_tokenized_lemmas(s.articleHeadline)
            # NOTE(review): ``f`` is overwritten by every pair that hits a
            # branch, so only the last such pair decides the feature --
            # confirm this is intended rather than an accumulating "or".
            for x, y in idx:
                if x > 0 and y == 0:
                    f = self._match(claim_tok[x - 1])
                elif x == 0 and y > 0:
                    f = self._match(article_tok[y - 1])
                # NOTE(review): the probe is a list; if ``idx`` holds
                # tuples this test is always true -- verify against the
                # stored alignment format.
                elif [x - 1, y - 1] not in idx:
                    f = (self._match(claim_tok[x - 1])
                         or self._match(article_tok[y - 1]))
        mat[i, 0] = f
    return mat
def transform(self, X):
    """Encode the presence of each refuting word in the article headline.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``articleHeadline``.
    :return: array of shape ``(len(X), len(_refuting_words))`` whose cells
        are 1 where the refuting word occurs among the headline tokens and
        0 elsewhere.
    """
    def indicators(headline):
        # One 0/1 entry per refuting word, in ``_refuting_words`` order.
        # Tokens are compared unstemmed.
        present = get_tokenized_lemmas(headline)
        return np.array(
            [1 if word in present else 0 for word in _refuting_words])

    mat = np.zeros((len(X), len(_refuting_words)))
    for row_pos, (_, row) in enumerate(X.iterrows()):
        mat[row_pos, :] = indicators(row.articleHeadline)
    return mat
def transform(self, X):
    """Compute an alignment-driven ``self._match`` feature for every row.

    Rows with no alignment entry for ``(claimId, articleId)`` stay at 0.
    Otherwise each aligned ``(claim position, article position)`` pair may
    replace the feature value; the value left after the last pair is stored.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``claimId``, ``articleId``, ``claimHeadline`` and
        ``articleHeadline``.
    :return: array of shape ``(len(X), 1)``.
    """
    features = np.zeros((len(X), 1))
    for row_no, (_, record) in enumerate(X.iterrows()):
        alignment = get_aligned_data().get((record.claimId, record.articleId))
        if not alignment:
            # Nothing to inspect; the cell keeps its initial 0.
            continue

        claim_tokens = get_tokenized_lemmas(record.claimHeadline)
        article_tokens = get_tokenized_lemmas(record.articleHeadline)
        flag = 0
        for claim_pos, article_pos in alignment:
            if claim_pos > 0 and article_pos == 0:
                flag = self._match(claim_tokens[claim_pos - 1])
            elif claim_pos == 0 and article_pos > 0:
                flag = self._match(article_tokens[article_pos - 1])
            elif [claim_pos - 1, article_pos - 1] not in alignment:
                # The article token is only consulted when the claim token
                # does not match.
                flag = self._match(claim_tokens[claim_pos - 1])
                if not flag:
                    flag = self._match(article_tokens[article_pos - 1])
        features[row_no, 0] = flag
    return features
def calc_hungarian_alignment_score(s, t):
    """Calculate the alignment score between the two texts s and t
    using the implementation of the Hungarian alignment algorithm
    provided in https://pypi.python.org/pypi/munkres/.

    Both texts are tokenized with ``get_tokenized_lemmas``; every token pair
    is scored with ``compute_paraphrase_score`` and the score matrix is
    turned into a cost matrix via ``_max_ppdb_score - score``.

    :param s: first text.
    :param t: second text.
    :return: tuple ``(indexes, score)`` where ``indexes`` is the list of
        ``(row, column)`` pairs chosen by the solver and ``score`` is the
        summed pair score divided by the smaller token count. When either
        text yields no tokens the result is ``([], 0.0)``.
    """
    s_toks = get_tokenized_lemmas(s)
    t_toks = get_tokenized_lemmas(t)
    if not s_toks or not t_toks:
        # Nothing to align; also avoids dividing by a zero-sized dimension.
        return [], 0.0

    # Fill the score matrix by position. This replaces the label-indexed
    # DataFrame and ``DataFrame.ix``, which recent pandas no longer provides.
    matrix = np.zeros((len(s_toks), len(t_toks)))
    for row, c in enumerate(s_toks):
        for column, a in enumerate(t_toks):
            matrix[row, column] = compute_paraphrase_score(c, a)

    cost_matrix = make_cost_matrix(matrix,
                                   lambda cost: _max_ppdb_score - cost)
    indexes = _munk.compute(cost_matrix)

    total = 0.0
    for row, column in indexes:
        total += matrix[row][column]
    return indexes, total / float(np.min(matrix.shape))
def _compute_overlap(row):
    """Return the Jaccard overlap of a row's claim and article headlines.

    :param row: object exposing ``claimHeadline`` and ``articleHeadline``.
    :return: ``|intersection| / |union|`` of the two token sets as a float,
        or ``0.0`` when neither headline yields a token.
    """
    claim_lemmas = set(get_tokenized_lemmas(row.claimHeadline))
    article_lemmas = set(get_tokenized_lemmas(row.articleHeadline))
    union = claim_lemmas | article_lemmas
    if not union:
        # Both headlines are empty after tokenization; avoid dividing by 0.
        return 0.0
    intersect = claim_lemmas & article_lemmas
    return float(len(intersect)) / len(union)
def _calc_polarity(s):
    """Return the parity of the number of refuting tokens in ``s``.

    :param s: text to tokenize with ``get_tokenized_lemmas``.
    :return: 1 when an odd number of tokens are in ``_refuting_words``,
        otherwise 0.
    """
    refuting_hits = 0
    for token in get_tokenized_lemmas(s):
        if token in _refuting_words:
            refuting_hits += 1
    return refuting_hits % 2
def _get_bigram_clusters(s, bc_data):
    """Return the bigrams over the cluster values of the tokens of ``s``.

    :param s: text to tokenize with ``get_tokenized_lemmas``.
    :param bc_data: lookup (supporting ``get``) from token to cluster value.
    :return: result of ``ngrams(..., 2)`` over the cluster values, in token
        order, with missing or otherwise falsy lookups dropped.
    """
    cluster_ids = []
    for lemma in get_tokenized_lemmas(s):
        cluster = bc_data.get(lemma)
        if cluster:
            cluster_ids.append(cluster)
    return ngrams(cluster_ids, 2)
def transform(self, X):
    """Flag rows whose article headline tokens include a question mark.

    :param X: table-like object supporting ``len`` and ``iterrows`` whose
        rows expose ``articleHeadline``.
    :return: float array of shape ``(len(X), 1)`` with 1 where ``'?'`` is one
        of the headline tokens and 0 elsewhere.
    """
    features = np.zeros((len(X), 1))
    row_no = 0
    for _, record in X.iterrows():
        has_question_mark = '?' in get_tokenized_lemmas(record.articleHeadline)
        if has_question_mark:
            features[row_no, 0] = 1
        row_no += 1
    return features
import os
import pickle

from aligner import align
from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    """Return the tokens whose position is not covered by ``alignment``.

    :param tokens: sequence of tokens.
    :param alignment: iterable of pairs; the first element of each pair,
        minus one, is treated as a covered position in ``tokens``.
    :return: list of the tokens at uncovered positions, in original order.
    """
    # A set gives constant-time membership tests.
    aligned = {a - 1 for (a, _) in alignment}
    return [tok for i, tok in enumerate(tokens) if i not in aligned]


if __name__ == "__main__":
    # Align every claim/article headline pair of the dataset and pickle the
    # result keyed by (claimId, articleId).
    df = get_dataset()
    data = {}
    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1)
                                                  for (s, t) in alignment[0]]
        except Exception:
            # Best effort: report the pair and continue with the next row.
            # ``Exception`` keeps Ctrl-C able to stop the run.
            # The single-argument print form works on Python 2 and 3 alike.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
import os
import pickle

from aligner import align
from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    """Return the tokens left uncovered by ``alignment``.

    :param tokens: sequence of tokens.
    :param alignment: iterable of pairs; each pair's first element, minus
        one, is treated as a covered position in ``tokens``.
    :return: list of tokens at positions no pair covers, in original order.
    """
    covered = set(a - 1 for (a, _) in alignment)
    return [token for position, token in enumerate(tokens)
            if position not in covered]


def _align_headlines(df):
    """Align the claim and article headline of every row of ``df``.

    :param df: table-like object supporting ``iterrows`` whose rows expose
        ``claimId``, ``articleId``, ``claimHeadline`` and ``articleHeadline``.
    :return: dict mapping ``(claimId, articleId)`` to a list of
        ``(s - 1, t - 1)`` pairs taken from the first element of the
        ``align`` result. Rows that fail to align are reported on stdout
        and left out.
    """
    aligned = {}
    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            aligned[(row.claimId, row.articleId)] = [
                (s - 1, t - 1) for (s, t) in alignment[0]]
        except Exception:
            # Best effort: one failing pair must not abort the whole run,
            # while KeyboardInterrupt/SystemExit still propagate.
            # The single-argument print form works on Python 2 and 3 alike.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))
    return aligned


if __name__ == "__main__":
    df = get_dataset()
    data = _align_headlines(df)
    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)