Ejemplo n.º 1
0
    def fit(self,
            corpus: Corpus,
            selector: Callable[[CorpusComponent], bool] = lambda x: True,
            y=None):
        """
        Train the wrapped classifier on the stored vectors of the selected Corpus components.

        Each object of type `self.obj_type` accepted by `selector` contributes one training row:
        its vector from the `self.vector_name` matrix (restricted to `self.columns`) paired with
        the label returned by `self.labeller`.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus component object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type.
        :param y: not used; the labels always come from `self.labeller`
        :return: self, after fitting `self.clf`
        """
        selected = list(corpus.iter_objs(self.obj_type, selector))
        selected_ids = [component.id for component in selected]
        labels = np.array([self.labeller(component) for component in selected])

        X = corpus.get_vectors(self.vector_name,
                               ids=selected_ids,
                               columns=self.columns)
        self.clf.fit(X, labels)
        return self
Ejemplo n.º 2
0
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    """
    Build (prompt, reply) text pairs from a downloaded corpus and split them for evaluation.

    Every pair of consecutive utterances on each root-to-leaf path of each conversation is a
    candidate sample. A candidate is dropped when the prompt, the reply, or the utterance that
    precedes the prompt (if there is one) consists solely of ``[deleted]`` or ``[removed]``.

    :param corpus_name: name of the corpus, passed to `download`
    :param max_samples: collection stops as soon as this many pairs have been gathered
    :param eval_percent: fraction of the pairs, taken from the end, that is held out
    :return: tuple ``(train_inputs, train_outputs, eval_inputs, eval_outputs)``
    """
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        # Everything before eval_index is training data, the rest is evaluation data.
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index], inputs[eval_index:],
                outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))

    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')

    inputs, outputs = [], []
    for conversation in corpus.iter_conversations():
        for path in conversation.get_root_to_leaf_paths():
            for i in range(len(path) - 1):
                # Inspect the previous turn (when it exists), the prompt and the reply.
                # The window starts at max(i - 1, 0): a plain path[i - 1] with i == 0 would
                # wrap around to the last utterance of the path and wrongly discard the
                # first pair whenever the leaf happens to be deleted.
                window = (path[j] for j in range(max(i - 1, 0), i + 2))
                if any(deleted_filter.match(utt.text) for utt in window):
                    continue

                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)

                if len(inputs) >= max_samples:
                    # Log completion on the early exit too, not only when the corpus runs out.
                    logging.info('Done!')
                    return split_data(inputs, outputs, eval_percent)

    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
Ejemplo n.º 3
0
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Store the classifier's prediction and prediction score as metadata on the Corpus components.

        Components accepted by `selector` receive the output of `self.clf.predict` under
        `self.clf_attribute_name` and the second column of `self.clf.predict_proba` under
        `self.clf_prob_attribute_name`. Components rejected by `selector` receive None for both.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus component object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type.

        :return: the target Corpus annotated
        """
        selected = []
        for component in corpus.iter_objs(self.obj_type):
            if not selector(component):
                # Excluded components are still annotated, with explicit "no prediction" values.
                component.add_meta(self.clf_attribute_name, None)
                component.add_meta(self.clf_prob_attribute_name, None)
                continue
            selected.append(component)

        selected_ids = [component.id for component in selected]
        vector_matrix = corpus.get_vector_matrix(self.vector_name)
        X = vector_matrix.get_vectors(selected_ids, self.columns)

        predictions = self.clf.predict(X)
        scores = self.clf.predict_proba(X)[:, 1]

        # Rows of X follow the order of `selected`, so predictions pair up positionally.
        for component, prediction, score in zip(selected, predictions, scores):
            component.add_meta(self.clf_attribute_name, prediction)
            component.add_meta(self.clf_prob_attribute_name, score)
        return corpus
def process_corpus(corpus_name,
                   to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE,
                   max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET,
                   max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER,
                   target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS,
                   data_dir=DATA_DIR):
    """
    Load a corpus, run the text-preparation pipeline on it and write its training subset to disk.

    Two tab-separated files are written into `data_dir`: ``<name>.source.tsv`` and
    ``<name>.target.tsv``, where ``<name>`` is the name stored in the corpus metadata.

    :param corpus_name: corpus to download, or directory name under `data_dir` when not downloading
    :param to_download: if truthy, fetch the corpus with `download`; otherwise load it from `data_dir`
    :param min_wc_source: forwarded to `get_train_subset`
    :param max_wc_source: forwarded to `get_train_subset`
    :param min_wc_target: forwarded to `get_train_subset`
    :param max_wc_target: forwarded to `get_train_subset`
    :param source_filter: forwarded to `get_train_subset`
    :param target_filter: forwarded to `get_train_subset`
    :param text_cols: forwarded to `get_train_subset`
    :param data_dir: directory used for the download / local corpus and for the output files
    """
    corpus_path = (download(corpus_name, data_dir=data_dir)
                   if to_download else os.path.join(data_dir, corpus_name))
    corpus = Corpus(corpus_path)

    # From here on the name recorded in the corpus metadata replaces the argument.
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()
    print('processing', corpus.get_meta()['name'])
    corpus.load_info('utterance', ['parsed'])

    corpus = text_prep_pipe().transform(corpus)

    source_df, target_df = get_train_subset(corpus, min_wc_source,
                                            max_wc_source, min_wc_target,
                                            max_wc_target, source_filter,
                                            target_filter, text_cols)
    outputs = (('.source.tsv', source_df), ('.target.tsv', target_df))
    for suffix, frame in outputs:
        frame.to_csv(os.path.join(data_dir, corpus_name + suffix), sep='\t')
    def transform(self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
        """
        Vectorize the text of the selected Corpus objects and store the result as a vector matrix.

        The matrix is registered on the Corpus under `self.vector_name` and every selected object is
        linked to it through `add_vector`. Objects rejected by `selector` are not modified.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)

        # Try the legacy accessor first so existing vectorizers behave exactly as before, then the
        # newer `get_feature_names_out` (the only one offered by e.g. recent scikit-learn
        # vectorizers). Without the second attempt such vectorizers would silently lose their
        # feature names and get positional column indices instead.
        column_names = None
        for getter_name in ('get_feature_names', 'get_feature_names_out'):
            try:
                column_names = getattr(self.vectorizer, getter_name)()
            except AttributeError:
                continue
            break
        if column_names is None:
            # The vectorizer exposes no feature names at all: fall back to column positions.
            column_names = np.arange(matrix.shape[1])
        corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids, columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus
Ejemplo n.º 6
0
    def _read(self, corpus_split):
        """
        Yield one instance per usable conversation of the requested corpus.

        `corpus_split` is either ``"<corpus name>"`` or ``"<corpus name>_<split>"``. The split part
        (or None when absent) is stored in `self.split` as a side effect.

        For each conversation the `parsed` metadata of its utterances is collected, skipping
        utterances flagged `is_section_header`; the last `self.max_turns` of them are kept (minus
        the final one when `self.forecast` is set) and passed to `self.text_to_instance` together
        with the conversation's `self.label_field` metadata as a string. Conversations whose turn
        list is empty or contains a falsy entry are skipped.

        :param corpus_split: corpus name, optionally followed by ``_`` and a split name
        :return: generator over the truthy results of `self.text_to_instance`
        """
        corpus_name, *remainder = corpus_split.split('_')
        self.split = remainder[0] if remainder else None

        conversations = Corpus(filename=download(corpus_name)).iter_conversations()
        if self.sample:
            conversations = itertools.islice(conversations, self.sample)

        for conv in conversations:
            meta = conv.meta

            # NOTE(review): a conversation is skipped only when BOTH tests fail, so any
            # conversation without an 'annotation_year' (default 2018) passes regardless of its
            # split. This may have been meant as `or` -- confirm before changing.
            wrong_split = meta.get('split') != self.split
            if wrong_split and meta.get('annotation_year', 2018) != 2018:
                continue

            label = str(meta[self.label_field])
            turns = []
            for utt in conv.iter_utterances():
                if not utt.meta.get('is_section_header'):
                    turns.append(utt.meta.parsed)

            # When forecasting, the final turn is withheld from the model input.
            stop = len(turns) - 1 if self.forecast else None
            turns = turns[-self.max_turns:stop]

            if not turns or not all(turns):
                continue
            inst = self.text_to_instance(turns, label)
            if inst:
                yield inst
Ejemplo n.º 7
0
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Computes the vector matrix for the selected Corpus component objects and stores it in the Corpus
        under `vector_name`; every selected object is then linked to that matrix through `add_vector`.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus component object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)

        # Try the legacy accessor first so existing vectorizers behave exactly as before, then the
        # newer `get_feature_names_out` (the only one offered by e.g. recent scikit-learn
        # vectorizers). Without the second attempt such vectorizers would silently lose their
        # feature names and get positional column indices instead.
        column_names = None
        for getter_name in ('get_feature_names', 'get_feature_names_out'):
            try:
                column_names = getattr(self.vectorizer, getter_name)()
            except AttributeError:
                continue
            break
        if column_names is None:
            # The vectorizer exposes no feature names at all: fall back to column positions.
            column_names = np.arange(matrix.shape[1])
        corpus.set_vector_matrix(self.vector_name,
                                 matrix=matrix,
                                 ids=ids,
                                 columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus
Ejemplo n.º 8
0
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    First pass: for every block, split its reply chain into contiguous segments and record which
    segments form complete utterances. Second pass: turn each complete utterance (a
    space-separated string of block hashes) into an Utterance whose id is its first block hash.
    The mapping from every block hash to its utterance id is stored in the Corpus metadata under
    ``"reverse_block_index"``.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}

    for block_hash, block in accum.blocks.items():
        if block.user not in users:
            users[block.user] = User(id=block.user)

        segments = accum.segment_contiguous_blocks(block.reply_chain)
        # Every segment except the last is recorded as complete.
        complete_utterances.update(
            helpers.string_of_seg(seg) for seg in segments[:-1])

        last_segment = segments[-1]
        assert (block_hash == last_segment[-1])
        # The last segment only counts as complete when its final block is not followed.
        if not accum.blocks[last_segment[-1]].is_followed:
            complete_utterances.add(helpers.string_of_seg(last_segment))
        block_hashes_to_segments[block_hash] = segments

    utterances = []
    block_hashes_to_utt_ids = {}
    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        head_hash = block_hashes[0]
        head_segments = block_hashes_to_segments[head_hash]
        first_block = accum.blocks[head_hash]

        this_utterance = Utterance(
            id=head_hash,
            user=users[first_block.user],
            root=head_segments[0][0],
            reply_to=_find_reply_to_from_segment(head_segments),
            timestamp=first_block.timestamp,
            text="\n".join([accum.blocks[h].text for h in block_hashes]),
            meta={"constituent_blocks": block_hashes})

        # Every constituent block points back at the utterance it became part of.
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = head_hash

        utterances.append(this_utterance)

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids

    return corpus
Ejemplo n.º 9
0
def add_title_to_root(corpus: Corpus):
    """
    Prepend each conversation's title to the text of the utterance that shares the conversation's id.

    A missing title (None) is treated as an empty string. When the utterance has no text, its text
    becomes the title alone; otherwise the title and the text are joined by a single space.

    :param corpus: the Corpus whose utterances are modified in place
    """
    for conversation in corpus.iter_conversations():
        root = corpus.get_utterance(conversation.id)
        title = conversation.retrieve_meta('title')
        if title is None:
            title = ''
        root.text = title if root.text is None else title + ' ' + root.text
Ejemplo n.º 10
0
    def test_partial_load_end_idx_specified_only(self):
        """Loading with only `utterance_end_index=0` yields exactly one utterance, equal to utterance "0"."""
        alice = User(name="alice",
                     meta={'user_binary_data': bytearray([120, 3, 255, 0, 100])})
        bob = User(name="bob",
                   meta={'user_binary_data': bytearray([110, 3, 255, 90])})
        charlie = User(name="charlie")

        utterances = [
            Utterance(id="0",
                      text="hello world",
                      user=alice,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1",
                      text="my name is bob",
                      user=bob,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2", text="this is a test", user=charlie),
        ]
        corpus1 = Corpus(utterances=utterances)

        # Round-trip through disk, reading back with an end index only.
        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus", utterance_end_index=0)

        loaded = list(corpus2.iter_utterances())
        self.assertEqual(len(loaded), 1)
        self.assertEqual(corpus1.get_utterance("0"),
                         corpus2.get_utterance("0"))
Ejemplo n.º 11
0
 def transform(self, corpus: Corpus) -> Corpus:
     """
     Return a deep copy of the corpus in which every conversation has a 'rank' metadata entry.

     The rank is the summed length of the texts of the conversation's utterances. The input
     corpus itself is not modified.

     :param corpus: the source Corpus
     :return: the annotated copy
     :raises Exception: if a conversation already has a 'rank' key in its metadata
     """
     annotated = copy.deepcopy(corpus)
     for convo in annotated.iter_conversations():
         if 'rank' in convo.meta.keys():
             raise Exception(
                 'rank is already a key in this conversations meta! aborting'
             )
         convo.meta['rank'] = sum(
             len(annotated.get_utterance(utt_id).text)
             for utt_id in convo._utterance_ids)
     return annotated
Ejemplo n.º 12
0
 def rank2(self, corpus: Corpus, score=None):
     """
     Group the conversations of a corpus by their score.

     :param corpus: the Corpus whose conversations are scored
     :param score: callable invoked as ``score(corpus, conversation)``; defaults to
         `self.convo_length` when None
     :return: a defaultdict mapping each score to the list of conversations with that score
     """
     # Identity test: `== None` would invoke a custom __eq__ on the supplied callable.
     if score is None:
         score = self.convo_length
     conversations_by_score = defaultdict(list)
     for convo in corpus.iter_conversations():
         conversations_by_score[score(corpus, convo)].append(convo)
     return conversations_by_score
Ejemplo n.º 13
0
def print_corpus(c: Corpus) -> None:
    """
    Print every reply chain of the corpus, from its first utterance down to each leaf.

    For each leaf id returned by `get_corpus_leaf_ids`, the chain is rebuilt by following
    `reply_to` links upwards, then printed top-down with one more ``"--> "`` prefix per level and
    newlines inside utterance texts replaced by spaces.

    :param c: the Corpus to print
    """
    for leaf_id in get_corpus_leaf_ids(c):
        current = c.get_utterance(leaf_id)
        chain = [current]
        # Walk upwards until an utterance without a (truthy) reply_to is reached.
        while current.reply_to:
            current = c.get_utterance(current.reply_to)
            chain.append(current)

        print("this conversation is", len(chain), "utterances long.")
        for level, utterance in enumerate(reversed(chain)):
            print("--> " * level + utterance.text.replace("\n", " "))
        print("\n")
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """ Build a Corpus with one stand-alone utterance per DataFrame row.

    Arguments:
        df {DataFrame} -- the data, one utterance per row
        id_col {str} -- column holding the utterance ids (converted to str)
        text_col {str} -- column holding the utterance texts
        meta_cols {List[str]} -- columns copied into each utterance's metadata

    Returns:
        Corpus -- the converted corpus
    """

    # Speaker, reply structure and timestamps are not available for this data: all utterances
    # share one placeholder speaker and timestamp, have no reply_to, and each forms its own
    # conversation (conversation_id equals the utterance id).
    placeholder_speaker = Speaker(id="speaker")
    placeholder_time = "NOT_RECORDED"

    utterances = []
    for _, row in tqdm(df.iterrows()):
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}
        utt_id = str(row[id_col])
        utterances.append(
            Utterance(id=utt_id,
                      speaker=placeholder_speaker,
                      conversation_id=utt_id,
                      reply_to=None,
                      timestamp=placeholder_time,
                      text=row[text_col],
                      meta=metadata))

    return Corpus(utterances=utterances)
Ejemplo n.º 15
0
    def fit(self, corpus: Corpus, y=None):
        """
        Fit the internal vectorizer on the texts of the Corpus objects accepted by `self.selector`.

        :param corpus: the target Corpus
        :param y: not used
        :return: self, so that the call can be chained like the other `fit` methods
        """
        # collect texts for vectorization
        docs = [self.text_func(obj)
                for obj in corpus.iter_objs(self.obj_type, self.selector)]

        self.vectorizer.fit(docs)
        # Previously nothing was returned, which made `x.fit(corpus).transform(corpus)` fail.
        return self
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Write pairing metadata (label, pair id, pair orientation) onto the corpus objects.

        Only objects accepted by `selector` are considered for pairing. Paired objects get the label
        "pos" or "neg", the id of their pair and the pair's orientation. Every other object of
        `self.obj_type` that has no label yet gets None for all three attributes.

        :param corpus: target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns a bool (True = include)
        :return: annotated Corpus
        """
        pos_objs, neg_objs = self._get_pos_neg_objects(corpus, selector)
        obj_pairs = self._pair_objs(pos_objs, neg_objs)
        pair_orientations = self._assign_pair_orientations(obj_pairs)

        for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
            orientation = pair_orientations[pair_id]
            for member, label in ((pos_obj, "pos"), (neg_obj, "neg")):
                member.add_meta(self.label_attribute_name, label)
                member.add_meta(self.pair_id_attribute_name, pair_id)
                member.add_meta(self.pair_orientation_attribute_name,
                                orientation)

        # Objects still lacking a label either failed the selector or were left out by the
        # pairing step; give them explicit None values.
        for obj in corpus.iter_objs(self.obj_type):
            if self.label_attribute_name in obj.meta:
                continue
            for attribute_name in (self.label_attribute_name,
                                   self.pair_id_attribute_name,
                                   self.pair_orientation_attribute_name):
                obj.add_meta(attribute_name, None)

        return corpus
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Write pairing metadata (label, pair id, pair orientation) onto the corpus objects.

        Paired objects get the label "pos" or "neg", the id of their pair and the pair's
        orientation. Every other object of `self.obj_type` that has no label yet gets None for
        all three attributes.

        :param corpus: target Corpus
        :return: annotated Corpus
        """
        pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
        obj_pairs = self._pair_objs(pos_objs, neg_objs)
        pair_orientations = self._assign_pair_orientations(obj_pairs)

        for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
            orientation = pair_orientations[pair_id]
            for member, label in ((pos_obj, "pos"), (neg_obj, "neg")):
                member.add_meta(self.label_feat_name, label)
                member.add_meta(self.pair_id_feat_name, pair_id)
                member.add_meta(self.pair_orientation_feat_name, orientation)

        # Objects still lacking a label were not picked up by the pairing; give them
        # explicit None values.
        for obj in corpus.iter_objs(self.obj_type):
            if self.label_feat_name in obj.meta:
                continue
            for feat_name in (self.label_feat_name,
                              self.pair_id_feat_name,
                              self.pair_orientation_feat_name):
                obj.add_meta(feat_name, None)

        return corpus
Ejemplo n.º 18
0
 def transform(self, corpus: Corpus) -> Corpus:
     """
     Store a perplexity value on every utterance under `self.perplexity_feat_name`.

     Utterances accepted by `self.utt_selector` get `self.model.str_perplexity` of their text
     (as extracted by `self.utt_text_func`); all others get None.

     :param corpus: the target Corpus
     :return: the same Corpus, annotated
     """
     for utt in corpus.iter_utterances():
         perplexity = None
         if self.utt_selector(utt):
             perplexity = self.model.str_perplexity(self.utt_text_func(utt))
         utt.add_meta(self.perplexity_feat_name, perplexity)
     return corpus
    def test_dump_and_load_with_binary(self):
        """
        Round-trip a corpus whose speakers and utterances carry binary (bytearray) metadata.

        After dumping and reloading, the speaker metadata and the utterance metadata of
        utterances "0" and "1" must be equal to the originals.
        """
        alice_meta = {
            'speaker_binary_data': bytearray([120, 3, 255, 0, 100]),
            'index': 99
        }
        bob_meta = {'speaker_binary_data': bytearray([110, 3, 255, 90])}

        corpus1 = Corpus(utterances=[
            Utterance(id="0",
                      text="hello world",
                      speaker=Speaker(id="alice", meta=alice_meta),
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1",
                      text="my name is bob",
                      speaker=Speaker(id="bob", meta=bob_meta),
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(
                id="2", text="this is a test", speaker=Speaker(id="charlie")),
        ])

        checked_ids = ("0", "1")
        # Grab the speakers before dumping, as the comparison baseline.
        speakers_before = {
            utt_id: corpus1.utterances[utt_id].speaker
            for utt_id in checked_ids
        }

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        for utt_id in checked_ids:
            reloaded_speaker = corpus2.utterances[utt_id].speaker
            self.assertEqual(speakers_before[utt_id].meta,
                             reloaded_speaker.meta)
            self.assertEqual(corpus1.utterances[utt_id].meta,
                             corpus2.utterances[utt_id].meta)
Ejemplo n.º 20
0
 def rank(self, corpus: Corpus, score=None):
     """
     Yield the conversations of a corpus in ascending score order.

     :param corpus: the Corpus whose conversations are scored
     :param score: callable invoked as ``score(corpus, conversation)``; defaults to
         `self.convo_length` when None
     :return: generator of ``(score, insertion_index, conversation)`` tuples, smallest score first
     """
     # Identity test: `== None` would invoke a custom __eq__ on the supplied callable.
     if score is None:
         score = self.convo_length
     heap = []
     for convo in corpus.iter_conversations():
         # len(heap) grows by one per push, so it is a unique tie-breaker: tuples with equal
         # scores are ordered by insertion and the conversation objects are never compared.
         heappush(heap, (score(corpus, convo), len(heap), convo))
     while heap:
         yield heappop(heap)
    def test_partial_load_invalid_start_index(self):
        """Loading with a start index (99) past the three dumped utterances yields no utterances."""
        alice = Speaker(
            id="alice",
            meta={'speaker_binary_data': bytearray([120, 3, 255, 0, 100])})
        bob = Speaker(
            id="bob",
            meta={'speaker_binary_data': bytearray([110, 3, 255, 90])})
        charlie = Speaker(id="charlie")

        utterances = [
            Utterance(id="0",
                      text="hello world",
                      speaker=alice,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1",
                      text="my name is bob",
                      speaker=bob,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2", text="this is a test", speaker=charlie),
        ]
        corpus1 = Corpus(utterances=utterances)

        # Round-trip through disk, reading back from an out-of-range start index.
        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus", utterance_start_index=99)

        loaded = list(corpus2.iter_utterances())
        self.assertEqual(len(loaded), 0)
Ejemplo n.º 22
0
def get_manual_corpus() -> Corpus:
    """
    Return the manual corpus, loading it from ``build/manual`` or rebuilding it if loading fails.

    A rebuilt corpus is dumped under ``build`` with the name ``manual`` before it is returned.

    :return: the manual Corpus
    """
    try:
        return Corpus(filename='build/manual')
    except Exception:
        # Best-effort cache: any ordinary load failure triggers a rebuild. `Exception` rather
        # than a bare `except` so that Ctrl-C (KeyboardInterrupt) and SystemExit propagate
        # instead of silently starting a rebuild.
        manual_corpus = build_manual_corpus()
        manual_corpus.dump(name='manual',
                           increment_version=False,
                           base_path='build')
        return manual_corpus
Ejemplo n.º 23
0
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Store a vector for every object of `self.obj_type` in its metadata under `self.vector_name`.

        Objects accepted by `self.selector` get the vectorizer's output for their text (vectorized
        as a one-document batch); all other objects get None.

        :param corpus: the target Corpus
        :return: the same Corpus, annotated
        """
        for obj in corpus.iter_objs(self.obj_type):
            vector = None
            if self.selector(obj):
                vector = self.vectorizer.transform([self.text_func(obj)])
            obj.meta[self.vector_name] = vector

        return corpus
Ejemplo n.º 24
0
def get_imessage_corpus() -> Corpus:
    """
    Return the iMessage corpus, loading it from ``build/imessages`` or rebuilding it if loading fails.

    A rebuilt corpus is dumped under ``build`` with the name ``imessages`` before it is returned.

    :return: the iMessage Corpus
    """
    try:
        return Corpus(filename='build/imessages')
    except Exception:
        # Best-effort cache: any ordinary load failure triggers a rebuild. `Exception` rather
        # than a bare `except` so that Ctrl-C (KeyboardInterrupt) and SystemExit propagate
        # instead of silently starting a rebuild.
        imessage_corpus = build_imessage_corpus()
        imessage_corpus.dump(name='imessages',
                             increment_version=False,
                             base_path='build')
        return imessage_corpus
Ejemplo n.º 25
0
 def fit(self, corpus: Corpus, y=None):
     """
     Fit `self.clf` on the vectors stored in the metadata of the selected objects.

     Each object of `self.obj_type` accepted by `self.selector` contributes the vector found in
     its metadata under `self.vector_name` and the label returned by `self.labeller`.

     :param corpus: the target Corpus
     :param y: not used; the labels always come from `self.labeller`
     :return: self
     """
     selected = list(corpus.iter_objs(self.obj_type, self.selector))
     features = vstack([obj.meta[self.vector_name] for obj in selected])
     labels = [self.labeller(obj) for obj in selected]
     self.clf.fit(features, labels)
     return self
Ejemplo n.º 26
0
 def __init__(self, subReddit=""):
     """
     Set the default utterance index range and date range, and remember the target subreddit.

     :param subReddit: name passed to `download` when non-empty; an empty string skips loading
     """
     self._startIndex, self._endIndex = 0, 5 * (10**5)
     self._startDate, self._endDate = 2007, 2018
     self._target = subReddit
     if self._target == "":
         return
     # NOTE(review): the Corpus built here is not stored anywhere, so the only lasting effect is
     # whatever `download` / `Corpus` do internally -- confirm whether it should be kept on self.
     Corpus(filename=download(self._target),
            utterance_start_index=self._startIndex,
            utterance_end_index=self._endIndex)
Ejemplo n.º 27
0
def get_corpus_leaf_ids(c: Corpus) -> set:
    """
    Return the ids of the utterances that no other utterance replies to.

    Works in a single pass regardless of whether a reply is seen before or after its parent:
    an id is only accepted as a candidate if nothing seen so far replied to it, and it is
    withdrawn again as soon as a reply to it shows up.

    :param c: the Corpus to inspect
    :return: set of leaf utterance ids
    """
    candidates = set()
    replied_to = set()
    for utt in c.iter_utterances():
        parent_id = utt.reply_to
        if utt.id not in replied_to:
            candidates.add(utt.id)
        # discard() is a no-op when the parent was never a candidate.
        candidates.discard(parent_id)
        replied_to.add(parent_id)
    return candidates
Ejemplo n.º 28
0
    def summarize(self, corpus: Corpus, use_selector=True):
        """
        Tabulate the stored classifier prediction and score of the corpus objects.

        :param corpus: the annotated Corpus
        :param use_selector: if True, only objects accepted by `self.selector` are included;
            otherwise all objects of `self.obj_type` are
        :return: a DataFrame indexed by object id with the columns `self.clf_feat_name` and
            `self.clf_prob_feat_name`, sorted by the latter in descending order
        """
        selector = self.selector if use_selector else lambda _: True
        rows = [(obj.id, obj.meta[self.clf_feat_name],
                 obj.meta[self.clf_prob_feat_name])
                for obj in corpus.iter_objs(self.obj_type, selector)]

        summary = pd.DataFrame(
            rows,
            columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
        summary = summary.set_index('id')
        return summary.sort_values(self.clf_prob_feat_name, ascending=False)
    def fit(self, corpus: Corpus, y=None, selector: Callable[[CorpusComponent], bool] = lambda x: True):
        """
        Fit the internal vectorizer on the texts of the selected Corpus objects.

        :param corpus: the target Corpus
        :param y: not used
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.
        :return: self, with the vectorizer fitted
        """
        selected = corpus.iter_objs(self.obj_type, selector)
        # One document per selected object, in iteration order.
        docs = list(map(self.text_func, selected))
        self.vectorizer.fit(docs)
        return self
    def _get_pos_neg_objects(self, corpus: Corpus, selector):
        """
        Sort the selected objects into positively and negatively labelled lists.

        An object is positive if `self.pos_label_func` accepts it; otherwise it is negative if
        `self.neg_label_func` accepts it. Objects accepted by neither are left out.

        :param corpus: target Corpus
        :param selector: function deciding which objects of `self.obj_type` are considered
        :return: list of positive objects, list of negative objects
        """
        positives, negatives = [], []
        for obj in corpus.iter_objs(self.obj_type, selector):
            if self.pos_label_func(obj):
                positives.append(obj)
                continue
            if self.neg_label_func(obj):
                negatives.append(obj)
        return positives, negatives
#
# The plots answer these questions: 
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# Load the "wiki-corpus" dataset (fetched via `download`).
# Users are meant to be split by whether they are an admin: a user who has
#   spoken in the corpus both as an admin and as a non-admin is treated as
#   two users, one for each of these roles.
# NOTE(review): `split` is only defined here; the statements visible in this
#   fragment never pass it on -- confirm where it is consumed.
corpus = Corpus(filename=download("wiki-corpus"))
split = ["is_admin"]

# Create the coordination object and fit it on the whole corpus.
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    """
    Fetch the coordination score reports for two sets of scores.

    Only the report-fetching statements of this helper are visible here. The description and
    color parameters are not used by them (presumably they feed plotting code that is missing
    from this fragment -- TODO confirm).

    :param a_scores: first set of scores, passed to `coord.score_report`
    :param b_scores: second set of scores, passed to `coord.score_report`
    :param a_description: not used in the visible statements
    :param b_description: not used in the visible statements
    :param a_color: not used in the visible statements
    :param b_color: not used in the visible statements
    """
    # get scores by marker and on aggregate
    # (each report unpacks into five values; the first one is discarded)
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
#   one of the steps in the Conversations Gone Awry paper (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
#   For code reproducing the full results of the paper, see the example notebook in the
#   `conversations-gone-awry` example subdirectory.

import pandas as pd
from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# Extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed;
# `transform` is called directly on the corpus.
ps = PolitenessStrategies(verbose=100)
print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

# Collect each utterance's "politeness_strategies" metadata (one row per utterance)
# together with the utterance id, which becomes the DataFrame index.
values = []
idx = []
for utterance in corpus.iter_utterances():
    values.append(utterance.meta["politeness_strategies"])
    idx.append(utterance.id)
# Write the table to a CSV file in the current working directory.
pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
# The plots answer these questions:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#     or vice versa?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# Load the "supreme-corpus" dataset (fetched via `download`).
# Users are meant to be split by case id, and the justices by whether they are
#     favorable to the current presenting side.
# This treats the same person across two different cases as two different users.
# NOTE(review): `split` is only defined here; the statements visible in this
#     fragment never pass it on -- confirm where it is consumed.
corpus = Corpus(filename=download("supreme-corpus"))
split = ["case", "justice-is-favorable"]

# Create the coordination object and fit it on the whole corpus.
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    """
    Fetch the coordination score reports for two sets of scores.

    Only the report-fetching statements of this helper are visible here. The description and
    color parameters are not used by them (presumably they feed plotting code that is missing
    from this fragment -- TODO confirm).

    :param a_scores: first set of scores, passed to `coord.score_report`
    :param b_scores: second set of scores, passed to `coord.score_report`
    :param a_description: not used in the visible statements
    :param b_description: not used in the visible statements
    :param a_color: not used in the visible statements
    :param b_color: not used in the visible statements
    """
    # get scores by marker and on aggregate
    # (each report unpacks into five values; the first one is discarded)
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)