def get_coefs_helper(clf, feature_names: List[str] = None, coef_func=None):
    """
    Get dataframe of classifier coefficients. By default, assumes it is a pipeline with a logistic regression component

    :param clf: classifier model
    :param feature_names: list of feature names to get coefficients for; needs one name per coefficient
    :param coef_func: function for accessing the list of coefficients from the classifier model
    :return: DataFrame with a single 'coef' column, indexed by feature name ('feat_name') and sorted by
        coefficient in descending order (equal coefficients keep their input order). Returns None, after
        emitting a warning, when no coef_func is given and the default getter does not work on clf.
    :raises TypeError: if feature_names is None
    :raises AssertionError: if the number of feature names differs from the number of coefficients
    """
    if coef_func is None:
        try:
            coefs = clf.named_steps['logreg'].coef_[0].tolist()
        except (AttributeError, KeyError):
            # AttributeError: clf has no named_steps, or the step has no usable coef_.
            # KeyError: named_steps exists but has no step called 'logreg'.
            warn(
                "Classifier is not a pipeline with a logistic regression component, so default coefficient getter "
                "function did not work. Choose a valid coef_func argument.")
            return None
    else:
        coefs = coef_func(clf)

    if feature_names is None:
        raise TypeError("feature_names must be provided: one feature name per coefficient is required.")
    if len(feature_names) != len(coefs):
        # Raised explicitly (rather than via `assert`) so the check survives `python -O`; without it zip()
        # below would silently drop the unmatched entries. The exception type is kept for existing callers.
        raise AssertionError(
            "Got {} feature names for {} coefficients.".format(len(feature_names), len(coefs)))

    feats_coefs = pd.DataFrame(list(zip(feature_names, coefs)), columns=['feat_name', 'coef'])
    # A single stable sort is enough; mergesort keeps ties in their input order.
    return feats_coefs.set_index('feat_name').sort_values('coef', ascending=False, kind='mergesort')
Esempio n. 2
0
    def __setitem__(self, key, value):
        """
        Store `value` under `key`, recording the value's type in the index when type checking is enabled.

        Non-string keys are cast to strings (with a warning). When `self.index.type_check` is on and the
        value is not None, the index entry for `key` under `self.obj_type` is created if it is missing,
        or extended/overwritten if the value's type is not already listed there.

        :param key: metadata attribute name
        :param value: metadata value to store
        """
        if not isinstance(key, str):
            warn(
                "Metadata attribute keys must be strings. Input key has been casted to a string."
            )
            key = str(key)

        # A None value carries no type information, so it never touches the index.
        if self.index.type_check and value is not None:
            if key not in self.index.indices[self.obj_type]:
                # First time this key is seen for this object type.
                initial_type = _optimized_type_check(value)
                self.index.update_index(self.obj_type,
                                        key=key,
                                        class_type=initial_type)
            elif (self.index.get_index(self.obj_type)[key] != ["bin"]
                  and str(type(value)) not in self.index.get_index(self.obj_type)[key]):
                # Entry exists, is not already "bin" (which needs no further checks),
                # and does not yet list this value's type.
                observed_type = _optimized_type_check(value)
                if observed_type == "bin":
                    self.index.set_index(self.obj_type, key, "bin")
                else:
                    self.index.update_index(self.obj_type, key, observed_type)

        dict.__setitem__(self, key, value)
Esempio n. 3
0
 def set_id(self, value):
     """
     Set the object's id, casting it to a string when it is neither a string nor None.

     :param value: new id; strings and None are stored as given, anything else is stored as str(value)
         and a warning is emitted
     """
     needs_cast = value is not None and not isinstance(value, str)
     # The id is stored first; the warning (if any) follows the assignment.
     self._id = str(value) if needs_cast else value
     if needs_cast:
         warn(
             "{} id must be a string. ID input has been casted to a string."
             .format(self.obj_type))
Esempio n. 4
0
    def __setitem__(self, key, value):
        """
        Store `value` under `key`, updating the index first when type checking is enabled.

        Non-string keys are cast to strings (with a warning). Index maintenance is delegated to
        ConvoKitMeta._check_type_and_update_index.

        :param key: metadata attribute name
        :param value: metadata value to store
        """
        key_is_string = isinstance(key, str)
        if not key_is_string:
            warn(
                "Metadata attribute keys must be strings. Input key has been casted to a string."
            )
            key = str(key)

        index = self.index
        if index.type_check:
            ConvoKitMeta._check_type_and_update_index(index, self.obj_type, key, value)

        dict.__setitem__(self, key, value)
def initialize_speakers_and_utterances_objects(corpus, utt_dict, utterances,
                                               speakers_dict, speakers_data):
    """
    Initialize Speaker and Utterance objects

    One Utterance is built per entry of `utterances` and stored in `utt_dict` under its id. A Speaker is
    created in `speakers_dict` the first time its ID is encountered; if `speakers_data` has no entry for
    that ID, a warning is emitted and an empty entry is added to `speakers_data`.

    :param corpus: owner passed to every Speaker and Utterance created here
    :param utt_dict: dict filled in place, mapping utterance id to Utterance
    :param utterances: sequence of utterance dicts; fields that are absent are read as None
    :param speakers_dict: dict filled in place, mapping speaker ID to Speaker
    :param speakers_data: dict mapping speaker ID to that speaker's data; may gain empty entries
    """
    # utterances might be empty for invalid corpus start/end indices
    if len(utterances) == 0:
        return

    # Field names are detected from the first utterance; "user" and "root" are the fallback names.
    first_utt = utterances[0]
    speaker_field = "speaker" if "speaker" in first_utt else "user"
    convo_field = "conversation_id" if "conversation_id" in first_utt else "root"

    for raw_utt in utterances:
        u = defaultdict(lambda: None, raw_utt)
        speaker_id = u[speaker_field]

        if speaker_id not in speakers_dict:
            if speaker_id not in speakers_data:
                warn(
                    "CorpusLoadWarning: Missing speaker metadata for speaker ID: {}. "
                    "Initializing default empty metadata instead.".format(
                        speaker_id))
                speakers_data[speaker_id] = {}

            # The speaker's entry either wraps its metadata under KeyMeta or is the metadata itself.
            speaker_entry = speakers_data[speaker_id]
            speaker_meta = speaker_entry[KeyMeta] if KeyMeta in speaker_entry else speaker_entry
            speakers_dict[speaker_id] = Speaker(owner=corpus,
                                                id=speaker_id,
                                                meta=speaker_meta)

        speaker = speakers_dict[speaker_id]
        speaker.vectors = speakers_data[speaker_id].get(KeyVectors, [])

        # temp fix for reddit reply_to
        reply_to_data = u["reply_to"] if "reply_to" in u else u[KeyReplyTo]

        utt = Utterance(owner=corpus,
                        id=u[KeyId],
                        speaker=speaker,
                        conversation_id=u[convo_field],
                        reply_to=reply_to_data,
                        timestamp=u[KeyTimestamp],
                        text=u[KeyText],
                        meta=u[KeyMeta])
        utt.vectors = u.get(KeyVectors, [])
        utt_dict[utt.id] = utt
Esempio n. 6
0
 def __delitem__(self, key):
     """
     Delete the metadata attribute `key`.

     For obj_type 'corpus' the entry is removed and then dropped from the index. For any other object
     type the entry is removed only if metadata deletion is not locked for that type in the index;
     otherwise nothing is deleted and a warning pointing to corpus.delete_metadata() is emitted.

     :param key: metadata attribute name to delete
     """
     if self.obj_type == 'corpus':
         dict.__delitem__(self, key)
         self.index.del_from_index(self.obj_type, key)
         return

     deletion_locked = self.index.lock_metadata_deletion[self.obj_type]
     if not deletion_locked:
         dict.__delitem__(self, key)
         return

     warn(
         "For consistency in metadata attributes in Corpus component objects, deleting metadata attributes "
         "from component objects individually is not allowed. "
         "To delete this metadata attribute from all Corpus components of this type, "
         "use corpus.delete_metadata(obj_type='{}', attribute='{}') instead."
         .format(self.obj_type, key))
    def from_dir(dirpath, matrix_name):
        """
        Initialize a ConvoKitMatrix of the specified `matrix_name` from a specified directory `dirpath`.

        The matrix is unpickled from the file 'vectors.<matrix_name>.p' inside `dirpath`. If the loaded
        object is flagged as not sparse, its matrix is converted with `.toarray()` before returning.

        :param dirpath: path to Corpus directory
        :param matrix_name: name of vector matrix
        :return: the initialized ConvoKitMatrix, or None (after a warning) if the file does not exist
        """
        matrix_path = os.path.join(dirpath, 'vectors.{}.p'.format(matrix_name))
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file; only load trusted corpora.
            with open(matrix_path, 'rb') as f:
                loaded: ConvoKitMatrix = pickle.load(f)
            if not loaded._sparse:
                loaded.matrix = loaded.matrix.toarray()
            return loaded
        except FileNotFoundError:
            warn("Could not find vector with name: {} at {}.".format(matrix_name, dirpath))
            return None
Esempio n. 8
0
 def __setitem__(self, key, value):
     """
     Store `value` under `key`, registering the key in the index the first time it is seen.

     Non-string keys are cast to strings (with a warning). For a key not yet present in the index of
     `self.obj_type`, the recorded class type is str(type(value)) when the value can be serialized with
     json.dumps, and "bin" otherwise.

     :param key: metadata key
     :param value: metadata value to store
     """
     if not isinstance(key, str):
         warn(
             "Metadata keys must be strings. Input key has been casted to a string."
         )
     key = str(key)
     if key not in self.index.indices[self.obj_type]:
         # update Corpus index
         # Only the serialization probe is guarded, so an error raised by update_index itself is
         # not mistaken for an unserializable value and retried as "bin".
         try:
             json.dumps(value)
         except (TypeError, OverflowError, ValueError):
             # unserializable; ValueError is what json.dumps raises on circular references
             class_type = "bin"
         else:
             class_type = str(type(value))
         self.index.update_index(self.obj_type,
                                 key=key,
                                 class_type=class_type)
     dict.__setitem__(self, key, value)
Esempio n. 9
0
    def check_integrity(self, verbose: bool = True) -> bool:
        """
        Check the integrity of this Conversation; i.e. do the constituent utterances form a complete reply-to chain?

        The check fails when a reply target other than None is not an utterance of this Conversation,
        when the set of such outside targets is not exactly {None}, when more than one utterance replies
        to None, or when an utterance replies to itself.

        :param verbose: whether to print errors indicating the problems with the Conversation
        :return: True if the conversation structure is complete else False
        """
        if verbose:
            print("Checking reply-to chain of Conversation", self.id)

        reply_map = {utt.id: utt.reply_to for utt in self.iter_utterances()}

        # Reply targets that are not utterances of this Conversation. There should be exactly one: None.
        outside_targets = set(reply_map.values()) - set(reply_map.keys())
        only_none_outside = len(outside_targets) == 1 and next(iter(outside_targets)) is None
        if not only_none_outside:
            if verbose:
                for missing_id in outside_targets:
                    if missing_id is not None:
                        warn("ERROR: Missing utterance {}".format(missing_id))
            return False

        # sanity check
        none_reply_count = sum(1 for utt in self.iter_utterances() if utt.reply_to is None)
        if none_reply_count > 1:
            if verbose:
                warn("ERROR: Found more than one Utterance replying to None.")
            return False

        self_replies = [utt_id for utt_id, target in reply_map.items() if utt_id == target]
        if self_replies:
            if verbose:
                warn(
                    "ERROR: Found utterances with .reply_to pointing to themselves: {}"
                    .format(self_replies))
            return False

        if verbose:
            print("No issues found.\n")
        return True
    def check_integrity(self, verbose=True):
        """
        Check the integrity of this Conversation; i.e. do the constituent utterances form a complete reply-to chain?

        The check fails when a reply target other than None is not an utterance of this Conversation,
        when the set of such outside targets is not exactly {None}, when more than one utterance replies
        to None, or when an utterance replies to itself.

        :param verbose: whether to print errors indicating the problems with the Conversation
        :return: True if the conversation structure is complete else False
        """
        if verbose: print("Checking reply-to chain of Conversation", self.id)
        utt_reply_tos = {
            utt.id: utt.reply_to
            for utt in self.iter_utterances()
        }
        target_utt_ids = set(utt_reply_tos.values())
        speaker_utt_ids = set(utt_reply_tos.keys())
        root_utt_id = target_utt_ids - speaker_utt_ids  # There should only be 1 root_utt_id: None

        if len(root_utt_id) != 1:
            if verbose:
                for utt_id in root_utt_id:
                    if utt_id is not None:
                        warn("ERROR: Missing utterance {}".format(utt_id))
            return False
        else:
            root_id = list(root_utt_id)[0]
            if root_id is not None:
                if verbose: warn("ERROR: Missing utterance {}".format(root_id))
                return False

        # sanity check
        utts_replying_to_none = 0
        for utt in self.iter_utterances():
            if utt.reply_to is None:
                utts_replying_to_none += 1

        if utts_replying_to_none > 1:
            if verbose:
                warn("ERROR: Found more than one Utterance replying to None.")
            return False

        # An utterance replying to itself is its own target, so it is not caught by the
        # missing-target check above; it still breaks the reply-to chain.
        circular = [
            utt_id for utt_id, utt_reply_to in utt_reply_tos.items()
            if utt_id == utt_reply_to
        ]
        if circular:
            if verbose:
                warn(
                    "ERROR: Found utterances with .reply_to pointing to themselves: {}"
                    .format(circular))
            return False

        if verbose: print("No issues found.\n")
        return True
Esempio n. 11
0
 def __init__(self, owner=None, id: Optional[str] = None, speaker: Optional[Speaker] = None,
              user: Optional[Speaker] = None, conversation_id: Optional[str] = None,
              root: Optional[str] = None, reply_to: Optional[str] = None,
              timestamp: Optional[int] = None, text: str = '',
              meta: Optional[Dict] = None):
     """
     Initialize an Utterance.

     :param owner: passed through to the parent initializer
     :param id: utterance id, passed through to the parent initializer
     :param speaker: the Speaker of this utterance
     :param user: legacy name for `speaker`; only used when `speaker` is None
     :param conversation_id: id of the conversation this utterance belongs to; cast to a string
         (with a warning) if it is neither a string nor None
     :param root: legacy name for `conversation_id`; only used when `conversation_id` is None
     :param reply_to: id of the utterance this one replies to, stored as given
     :param timestamp: timestamp of the utterance, stored as given
     :param text: text of the utterance; a non-string is cast to a string with a warning (None becomes '')
     :param meta: metadata dict, passed through to the parent initializer
     :raises ValueError: if neither `speaker` nor `user` is provided
     """
     super().__init__(obj_type="utterance", owner=owner, id=id, meta=meta)
     speaker_ = speaker if speaker is not None else user
     self.speaker = speaker_
     if self.speaker is None:
         raise ValueError("No Speaker found: Utterance must be initialized with a Speaker.")
     # For backwards compatibility. Mirrors the resolved speaker so that `user` is also populated
     # when the caller passed only the legacy `user` argument.
     self.user = speaker_
     self.conversation_id = conversation_id if conversation_id is not None else root
     if self.conversation_id is not None and not isinstance(self.conversation_id, str):
         warn("Utterance conversation_id must be a string: conversation_id of utterance with ID: {} "
              "has been casted to a string.".format(self.id))
         self.conversation_id = str(self.conversation_id)
     self._root = self.conversation_id
     self.reply_to = reply_to
     self.timestamp = timestamp  # stored as given; no int coercion is applied
     if not isinstance(text, str):
         warn("Utterance text must be a string: text of utterance with ID: {} "
              "has been casted to a string.".format(self.id))
         text = '' if text is None else str(text)
     self.text = text
Esempio n. 12
0
 def matrix(self):
     """
     Emit a warning that the internal matrix cannot be deleted; nothing else is done.

     NOTE(review): presumably registered as the deleter of the `matrix` property -- the decorator is
     not visible here, confirm against the class definition.
     """
     message = "ConvoKitMatrix's internal matrix cannot be deleted. Use Corpus.delete_vector_matrix() instead."
     warn(message)