def transform(
    self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True
) -> Corpus:
    """
    Write pairing metadata (label, pair id, pair orientation) onto corpus objects.

    Objects that end up in a pair receive their label ("pos" or "neg"), the id of the
    pair, and the orientation assigned to that pair. Any other object of type
    `self.obj_type` that carries no label entry afterwards gets None for all three
    attributes.

    :param corpus: target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns a bool
        (True = consider the object for pairing)
    :return: the same Corpus, with its objects annotated
    """
    positives, negatives = self._get_pos_neg_objects(corpus, selector)
    pairs = self._pair_objs(positives, negatives)
    orientations = self._assign_pair_orientations(pairs)

    for pair_id, (pos_obj, neg_obj) in pairs.items():
        orientation = orientations[pair_id]
        for label, member in (("pos", pos_obj), ("neg", neg_obj)):
            member.add_meta(self.label_attribute_name, label)
            member.add_meta(self.pair_id_attribute_name, pair_id)
            member.add_meta(self.pair_orientation_attribute_name, orientation)

    blank_attributes = (
        self.label_attribute_name,
        self.pair_id_attribute_name,
        self.pair_orientation_attribute_name,
    )
    for obj in corpus.iter_objs(self.obj_type):
        # Objects with a label entry are left alone; the rest (rejected by the
        # selector, or not chosen during pairing) get explicit None values.
        if self.label_attribute_name in obj.meta:
            continue
        for attribute in blank_attributes:
            obj.add_meta(attribute, None)

    return corpus
def transform(
    self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True
) -> Corpus:
    """
    Computes the vector matrix for the Corpus component objects and then stores it in a
    ConvoKitMatrix object, which is saved in the Corpus as `vector_name`.

    Column names are taken from the vectorizer when it can report them
    (`get_feature_names_out()` on current scikit-learn, `get_feature_names()` on older
    releases); otherwise the columns are numbered 0..n-1.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus component object and returns
        True or False (i.e. include / exclude). By default, the selector includes all
        objects of the specified type in the Corpus.
    :return: the target Corpus annotated
    """
    objs = list(corpus.iter_objs(self.obj_type, selector))
    ids = [obj.id for obj in objs]
    docs = [self.text_func(obj) for obj in objs]

    matrix = self.vectorizer.transform(docs)

    # scikit-learn removed get_feature_names() in favour of get_feature_names_out().
    # Try the current accessor first so real feature names are not silently replaced
    # by integer indices on newer releases; keep the legacy accessor as a fallback.
    for accessor_name in ("get_feature_names_out", "get_feature_names"):
        try:
            column_names = getattr(self.vectorizer, accessor_name)()
            break
        except AttributeError:
            continue
    else:
        # The vectorizer exposes no feature names at all: fall back to column indices.
        column_names = np.arange(matrix.shape[1])

    corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids, columns=column_names)

    for obj in objs:
        obj.add_vector(self.vector_name)

    return corpus
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
    """
    Vectorize the text of the selected corpus objects and register the resulting matrix
    on the corpus under `self.vector_name`.

    Only objects accepted by the selector are vectorized and tagged with the vector name;
    objects that are not selected are left untouched.

    Column names are taken from the vectorizer when it can report them
    (`get_feature_names_out()` on current scikit-learn, `get_feature_names()` on older
    releases); otherwise the columns are numbered 0..n-1.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: the target Corpus annotated
    """
    objs = list(corpus.iter_objs(self.obj_type, selector))
    ids = [obj.id for obj in objs]
    docs = [self.text_func(obj) for obj in objs]

    matrix = self.vectorizer.transform(docs)

    # scikit-learn removed get_feature_names() in favour of get_feature_names_out().
    # Try the current accessor first so real feature names are not silently replaced
    # by integer indices on newer releases; keep the legacy accessor as a fallback.
    for accessor_name in ("get_feature_names_out", "get_feature_names"):
        try:
            column_names = getattr(self.vectorizer, accessor_name)()
            break
        except AttributeError:
            continue
    else:
        # The vectorizer exposes no feature names at all: fall back to column indices.
        column_names = np.arange(matrix.shape[1])

    corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids, columns=column_names)

    for obj in objs:
        obj.add_vector(self.vector_name)

    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Write pairing metadata (label, pair id, pair orientation) onto corpus objects.

    Objects that end up in a pair receive their label ("pos" or "neg"), the id of the
    pair, and the orientation assigned to that pair. Any other object of type
    `self.obj_type` that carries no label entry afterwards gets None for all three
    features.

    :param corpus: target Corpus
    :return: the same Corpus, with its objects annotated
    """
    positives, negatives = self._get_pos_neg_objects(corpus)
    pairs = self._pair_objs(positives, negatives)
    orientations = self._assign_pair_orientations(pairs)

    for pair_id, (pos_obj, neg_obj) in pairs.items():
        orientation = orientations[pair_id]
        for label, member in (("pos", pos_obj), ("neg", neg_obj)):
            member.add_meta(self.label_feat_name, label)
            member.add_meta(self.pair_id_feat_name, pair_id)
            member.add_meta(self.pair_orientation_feat_name, orientation)

    blank_features = (
        self.label_feat_name,
        self.pair_id_feat_name,
        self.pair_orientation_feat_name,
    )
    for obj in corpus.iter_objs(self.obj_type):
        # Objects with a label entry are left alone; every remaining object
        # (i.e. one that was not placed in a pair) gets explicit None values.
        if self.label_feat_name in obj.meta:
            continue
        for feature in blank_features:
            obj.add_meta(feature, None)

    return corpus
def fit(self, corpus: Corpus,
        selector: Callable[[CorpusComponent], bool] = lambda x: True, y=None):
    """
    Train the internal classifier on the stored vectors of the selected corpus objects.

    Labels are produced by `self.labeller`; the feature rows are fetched from the corpus
    vector matrix named `self.vector_name`, restricted to `self.columns`.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :param y: ignored; labels always come from `self.labeller`
    :return: the fitted VectorClassifier
    """
    # One pass over the corpus: remember each object's id together with its label.
    id_label_pairs = [
        (obj.id, self.labeller(obj)) for obj in corpus.iter_objs(self.obj_type, selector)
    ]
    obj_ids = [obj_id for obj_id, _ in id_label_pairs]

    X = corpus.get_vectors(self.vector_name, ids=obj_ids, columns=self.columns)
    labels = np.array([label for _, label in id_label_pairs])

    self.clf.fit(X, labels)
    return self
def fit(self, corpus: Corpus, y=None):
    """
    Fit the internal vectorizer on the texts of the corpus objects accepted by
    `self.selector`.

    :param corpus: the target Corpus
    :param y: ignored
    :return: this object, fitted (returned so that calls can be chained, matching the
        other `fit` methods)
    """
    # collect texts for vectorization
    docs = [self.text_func(obj) for obj in corpus.iter_objs(self.obj_type, self.selector)]
    self.vectorizer.fit(docs)
    return self
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusObject], bool] = lambda x: True) -> Corpus:
    """
    Annotate the corpus objects with the classifier prediction and prediction score, with
    an optional selector that filters for objects to be classified. Objects that are not
    selected will get a metadata value of 'None' instead of the classifier prediction.

    If the selector accepts no objects, nothing is classified and the corpus is returned
    with only the 'None' annotations applied.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: the target Corpus annotated
    """
    objs = []
    vectors = []
    for obj in corpus.iter_objs(self.obj_type):
        if selector(obj):
            objs.append(obj)
            vectors.append(obj.meta[self.vector_name])
        else:
            obj.add_meta(self.clf_feat_name, None)
            obj.add_meta(self.clf_prob_feat_name, None)

    # vstack() raises on an empty list, so there is nothing to classify in that case.
    if not objs:
        return corpus

    X = vstack(vectors)
    predictions = self.clf.predict(X)
    # Column 1 of predict_proba is stored as the prediction score.
    scores = self.clf.predict_proba(X)[:, 1]

    for obj, prediction, score in zip(objs, predictions, scores):
        obj.add_meta(self.clf_feat_name, prediction)
        obj.add_meta(self.clf_prob_feat_name, score)

    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Store a vectorized representation of each object's text in its metadata.

    Objects accepted by `self.selector` get the vectorizer output for their text under
    `self.vector_name`; all other objects of type `self.obj_type` get None there.

    :param corpus: the target Corpus
    :return: the target Corpus annotated
    """
    for obj in corpus.iter_objs(self.obj_type):
        # The vectorizer is called on a one-element list holding this object's text.
        obj.meta[self.vector_name] = (
            self.vectorizer.transform([self.text_func(obj)])
            if self.selector(obj)
            else None
        )
    return corpus
def fit(self, corpus: Corpus, y=None):
    """
    Train the internal classifier on the vectors stored in the metadata of the corpus
    objects accepted by `self.selector`.

    :param corpus: the target Corpus
    :param y: ignored; labels always come from `self.labeller`
    :return: this object, fitted
    """
    # One pass over the corpus: pair each object's stored vector with its label.
    samples = [
        (obj.meta[self.vector_name], self.labeller(obj))
        for obj in corpus.iter_objs(self.obj_type, self.selector)
    ]
    X = vstack([vector for vector, _ in samples])
    labels = [label for _, label in samples]

    self.clf.fit(X, labels)
    return self
def summarize(self, corpus: Corpus, use_selector=True):
    """
    Build a DataFrame of classifier predictions and scores, indexed by object id and
    sorted by score in descending order.

    :param corpus: the annotated Corpus
    :param use_selector: if True, only objects accepted by `self.selector` are listed;
        otherwise every object of type `self.obj_type` is listed
    :return: a pandas DataFrame
    """
    def _accept_all(_):
        return True

    chosen = self.selector if use_selector else _accept_all

    rows = [
        (obj.id, obj.meta[self.clf_feat_name], obj.meta[self.clf_prob_feat_name])
        for obj in corpus.iter_objs(self.obj_type, chosen)
    ]
    table = pd.DataFrame(rows, columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
    table = table.set_index('id')
    return table.sort_values(self.clf_prob_feat_name, ascending=False)
def fit(self, corpus: Corpus, y=None,
        selector: Callable[[CorpusComponent], bool] = lambda x: True):
    """
    Train the internal vectorizer on the texts of the selected corpus objects.

    :param corpus: the target Corpus
    :param y: ignored
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: the fitted BoWTransformer
    """
    selected = corpus.iter_objs(self.obj_type, selector)
    # Extract one text per selected object; these are the training documents.
    texts = [self.text_func(item) for item in selected]
    self.vectorizer.fit(texts)
    return self
def _get_pos_neg_objects(self, corpus: Corpus, selector):
    """
    Split the selected corpus objects into positively and negatively labelled lists.

    An object is tested with `self.pos_label_func` first; `self.neg_label_func` is only
    consulted when the positive test fails. Objects matching neither are dropped.

    :param corpus: target Corpus
    :param selector: function deciding which objects of type `self.obj_type` are considered
    :return: list of positive objects, list of negative objects
    """
    positives, negatives = [], []
    for candidate in corpus.iter_objs(self.obj_type, selector):
        if self.pos_label_func(candidate):
            bucket = positives
        elif self.neg_label_func(candidate):
            bucket = negatives
        else:
            continue
        bucket.append(candidate)
    return positives, negatives
def summarize(self, corpus: Corpus,
              selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Build a DataFrame of classifier predictions and scores, indexed by object id and
    sorted by score in descending order.

    :param corpus: the annotated Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: a pandas DataFrame
    """
    rows = [
        (obj.id, obj.meta[self.clf_feat_name], obj.meta[self.clf_prob_feat_name])
        for obj in corpus.iter_objs(self.obj_type, selector)
    ]
    table = pd.DataFrame(rows, columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
    table = table.set_index('id')
    return table.sort_values(self.clf_prob_feat_name, ascending=False)
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotate the corpus objects with the classifier prediction and prediction score.

    Objects accepted by `self.selector` are classified from the vector stored in their
    metadata under `self.vector_name`. Objects that are not selected get None for both
    the prediction and the score; their stored vector is left untouched.

    If the selector accepts no objects, nothing is classified.

    :param corpus: the target Corpus
    :return: the target Corpus annotated
    """
    objs = []
    vectors = []
    for obj in corpus.iter_objs(self.obj_type):
        if self.selector(obj):
            objs.append(obj)
            vectors.append(obj.meta[self.vector_name])
        else:
            # Blank the classifier outputs, not the input vector: overwriting
            # meta[self.vector_name] would destroy the object's features and leave
            # the prediction fields missing altogether.
            obj.add_meta(self.clf_feat_name, None)
            obj.add_meta(self.clf_prob_feat_name, None)

    # vstack() raises on an empty list, so there is nothing to classify in that case.
    if not objs:
        return corpus

    X = vstack(vectors)
    predictions = self.clf.predict(X)
    # Column 1 of predict_proba is stored as the prediction score.
    scores = self.clf.predict_proba(X)[:, 1]

    for obj, prediction, score in zip(objs, predictions, scores):
        obj.add_meta(self.clf_feat_name, prediction)
        obj.add_meta(self.clf_prob_feat_name, score)

    return corpus
def fit(self, corpus: Corpus, y=None,
        selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Train the internal classifier on the vectors stored in the metadata of the selected
    corpus objects.

    :param corpus: the target Corpus
    :param y: ignored; labels always come from `self.labeller`
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: the fitted BoWClassifier
    """
    # One pass over the corpus: pair each object's stored vector with its label.
    samples = [
        (obj.meta[self.vector_name], self.labeller(obj))
        for obj in corpus.iter_objs(self.obj_type, selector)
    ]
    X = vstack([vector for vector, _ in samples])
    labels = [label for _, label in samples]

    self.clf.fit(X, labels)
    return self
def transform(
    self, corpus: Corpus, selector: Callable[[CorpusObject], bool] = lambda x: True
) -> Corpus:
    """
    Store a vectorized representation of each object's text in its metadata.

    Objects accepted by the selector get the vectorizer output for their text under
    `self.vector_name`; all other objects of type `self.obj_type` get None there.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or
        False (i.e. include / exclude). By default, the selector includes all objects of
        the specified type in the Corpus.
    :return: the target Corpus annotated
    """
    for obj in corpus.iter_objs(self.obj_type):
        # The vectorizer is called on a one-element list holding this object's text.
        obj.meta[self.vector_name] = (
            self.vectorizer.transform([self.text_func(obj)]) if selector(obj) else None
        )
    return corpus
def transform(self, corpus: Corpus, y=None,
              selector: Callable[[CorpusComponent], bool] = lambda obj: True) -> Corpus:
    """
    Annotate corpus objects with scores and rankings.

    Selected objects are scored with `self.score_func` and ranked by descending score,
    with rank 1 going to the highest score. Objects of type `self.obj_type` that were not
    selected get None for both the score and the rank.

    :param corpus: target corpus
    :param y: ignored
    :param selector: (lambda) function taking in a Corpus object and returning True / False;
        selects for Corpus objects to annotate.
    :return: annotated corpus
    """
    obj_iters = {"conversation": corpus.iter_conversations,
                 "speaker": corpus.iter_speakers,
                 "utterance": corpus.iter_utterances}
    obj_scores = [(obj.id, self.score_func(obj)) for obj in obj_iters[self.obj_type](selector)]

    df = pd.DataFrame(obj_scores, columns=["id", self.score_attribute_name]) \
        .set_index('id').sort_values(self.score_attribute_name, ascending=False)
    # Rows are already in descending score order, so the rank is the 1-based row position.
    df[self.rank_attribute_name] = list(range(1, len(df) + 1))

    for obj in corpus.iter_objs(obj_type=self.obj_type):
        if obj.id in df.index:
            # Read single cells with .at rather than through a row Series: a row taken
            # across a float score column and the integer rank column is upcast to
            # float, which would store ranks as 1.0, 2.0, ...
            obj.add_meta(self.score_attribute_name, df.at[obj.id, self.score_attribute_name])
            obj.add_meta(self.rank_attribute_name, int(df.at[obj.id, self.rank_attribute_name]))
        else:
            obj.add_meta(self.score_attribute_name, None)
            obj.add_meta(self.rank_attribute_name, None)

    return corpus