def transform(self, X):
    """Encode the two headlines of every row of ``X`` as sparse indicators.

    The result has one row per row of ``X`` and twice as many columns as
    there are entries in the cluster index returned by
    ``get_brown_cluster_data``. The first half of the columns is filled
    from ``claimHeadline``, the second half from ``articleHeadline``.

    Args:
        X: object supporting ``len()`` and ``iterrows()`` whose rows expose
            ``claimHeadline`` and ``articleHeadline`` attributes
            (presumably a pandas DataFrame -- confirm against callers).

    Returns:
        A ``dok_matrix`` of dtype ``float32`` holding 1 at every column
        selected for a row; all other cells stay unset.

    Raises:
        KeyError: if ``_get_bigram_clusters`` yields a key that is absent
            from the cluster index.
    """
    bc_data, bc_data_idx = get_brown_cluster_data(self.cluster_size)
    n_features = len(bc_data_idx.values())
    # Two side-by-side column blocks: claim headline first, article second.
    features = dok_matrix((len(X), n_features * 2), dtype=np.float32)

    for row_no, (_, record) in enumerate(X.iterrows()):
        for block_no, field in enumerate(("claimHeadline", "articleHeadline")):
            # Shift the article headline into the second column block.
            col_base = n_features * block_no
            headline = getattr(record, field)
            for cluster_key in _get_bigram_clusters(headline, bc_data):
                features[row_no, bc_data_idx[cluster_key] + col_base] = 1
    return features
def transform(self, X):
    """Flag the cluster pairs formed across the two headlines of each row.

    For every row of ``X`` both headlines are tokenized with
    ``get_tokenized_lemmas``. Each (article token, claim token) combination
    is mapped to a pair of clusters through ``bc_data``, and the column that
    the cluster index assigns to that pair is set to 1. Combinations where
    either token has no cluster are ignored.

    Args:
        X: object supporting ``len()`` and ``iterrows()`` whose rows expose
            ``claimHeadline`` and ``articleHeadline`` attributes
            (presumably a pandas DataFrame -- confirm against callers).

    Returns:
        A ``dok_matrix`` of dtype ``float32`` with one row per row of ``X``
        and one column per entry of the cluster index.

    Raises:
        KeyError: if a pair of known clusters is absent from the cluster
            index.
    """
    bc_data, bc_data_idx = get_brown_cluster_data(self.cluster_size)
    n_features = len(bc_data_idx.values())
    features = dok_matrix((len(X), n_features), dtype=np.float32)

    for row_no, (_, record) in enumerate(X.iterrows()):
        claim_tokens = get_tokenized_lemmas(record.claimHeadline)
        article_tokens = get_tokenized_lemmas(record.articleHeadline)
        # Article token comes first in each pair, matching the index key order.
        for article_tok, claim_tok in it.product(article_tokens, claim_tokens):
            article_cluster = bc_data.get(article_tok)
            claim_cluster = bc_data.get(claim_tok)
            if article_cluster is not None and claim_cluster is not None:
                column = bc_data_idx[(article_cluster, claim_cluster)]
                features[row_no, column] = 1
    return features