Ejemplo n.º 1
0
def arg_path_diversity_by_pattern(instances, outgoing_only=False):
    """Measure how varied connective-to-argument dependency paths are.

    Each instance's connective is stringified, lowercased and passed through
    V2_REMAPPINGS. For the cause and the effect separately, whenever the
    argument is present and has a head, the dependency path from the closest
    connective token to that head is extracted and tallied under the
    connective string.

    Args:
        instances: iterable of objects exposing `sentence`, `cause`,
            `effect` and `connective` attributes.
        outgoing_only: if True, only the first step of each path is tallied.
            Path lengths are always recorded from the full path.

    Returns:
        A list of four dicts keyed by connective string, in this order:
        cause path entropy, effect path entropy, cause path-length standard
        deviation, effect path-length standard deviation.
    """
    cause_paths = defaultdict(Counter)
    effect_paths = defaultdict(Counter)

    cause_path_lens = defaultdict(list)
    effect_path_lens = defaultdict(list)

    for i in instances:
        sentence = i.sentence
        connective_str = stringify_connective(i).lower()
        connective_str = V2_REMAPPINGS.get(connective_str, connective_str)

        per_argument = zip([i.cause, i.effect],
                           [cause_paths, effect_paths],
                           [cause_path_lens, effect_path_lens])
        for arg, paths, path_lens in per_argument:
            if not arg:
                continue
            arg_head = sentence.get_head(arg)
            if not arg_head:
                # Else skip invalid head
                continue

            conn_head = sentence.get_closest_of_tokens(
                arg_head, i.connective)[0]
            path = sentence.extract_dependency_path(conn_head, arg_head)
            # Record the length before the path is possibly truncated below.
            path_lens[connective_str].append(len(path))
            if outgoing_only:
                path = DependencyPath(path.start, [path[0]])
            paths[connective_str][str(path)] += 1

    # .items() rather than the Python-2-only .iteritems(), so this runs on
    # both Python 2 and Python 3.
    cause_effect_path_entropies = [
        {k: entropy(v) for k, v in paths.items()}
        for paths in [cause_paths, effect_paths]]
    cause_effect_path_len_stds = [
        {k: np.std(v) for k, v in lens.items()}
        for lens in [cause_path_lens, effect_path_lens]]
    return cause_effect_path_entropies + cause_effect_path_len_stds
Ejemplo n.º 2
0
    def _train_structured(self, instances, parts_by_instance):
        for instance_pcs in parts_by_instance:
            for possible_causation in instance_pcs:
                label = bool(possible_causation.true_causation_instance)
                connective = stringify_connective(possible_causation)
                self.frequencies_dict[connective][label] += 1

        for key in self.frequencies_dict:
            negative, positive = self.frequencies_dict[key]
            self.frequencies_dict[key] = float(positive) / (positive + negative)
Ejemplo n.º 3
0
def pattern_saturation(
        documents, num_folds=20, num_increments=20,
        output_path=('/home/jesse/Documents/Work/Research/My Publications/'
                     'Thesis/repr_annot/saturation.pdf')):
    """Plot how the number of distinct connective patterns grows with data.

    The sentences of all documents are shuffled `num_folds` times. For each
    shuffle they are split with `partition` into `num_increments` chunks, and
    the cumulative number of distinct stringified connectives is recorded
    after each chunk. The per-increment counts are averaged over folds, a
    curve a*ln(x)^2 + b*ln(x) + c is least-squares fitted to the averages,
    the fitted coefficients are printed, and the plot is saved as a figure.

    Args:
        documents: iterable of objects with a `sentences` attribute; each
            sentence must expose `causation_instances`.
        num_folds: number of random shuffles to average over.
        num_increments: number of chunks each shuffle is split into.
        output_path: where the figure is written. Defaults to the location
            that used to be hard-coded in this function.
    """
    sentences = list(chain.from_iterable(d.sentences for d in documents))
    xs = np.linspace(0, 1, num_increments + 1)
    ys = np.empty((num_folds, num_increments + 1))
    ys[:, 0] = 0  # At 0% of sentences, we always have 0% of the patterns.

    for fold in range(num_folds):
        np.random.shuffle(sentences)
        patterns_seen = set()
        increments = partition(sentences, num_increments)
        for i, increment in enumerate(increments):
            for sentence in increment:
                for causation in sentence.causation_instances:
                    patterns_seen.add(stringify_connective(causation))
            ys[fold, i + 1] = len(patterns_seen)
    averages = np.average(ys, 0)

    # Least-squares fit via the normal equations. x = 0 is skipped because
    # ln(0) is undefined. Plain ndarrays and np.linalg.solve replace the
    # discouraged np.matrix class and the explicit matrix inverse.
    log_xs = np.log(xs[1:])
    A = np.column_stack([log_xs ** 2, log_xs, np.ones_like(log_xs)])
    b = averages[1:].reshape(-1, 1)
    fit = np.linalg.solve(A.T.dot(A), A.T.dot(b))
    print(fit)
    quadratic, linear, constant = fit[:, 0]

    # Drop the sample at x = 0: ln(0) is -inf, so that point was never
    # finite and only produced a runtime warning.
    fit_x = np.linspace(0, 4, 2000)[1:]
    log_fit_x = np.log(fit_x)
    fit_y = quadratic * log_fit_x ** 2 + linear * log_fit_x + constant

    with utopia_context():
        plt.tick_params(axis='both', labelsize=11)

        # Shade x in [1, 4], i.e. the region beyond 100% of the corpus where
        # the curve is extrapolated. NOTE(review): the upper bound 380 looks
        # corpus-specific -- confirm before reusing on other data.
        plt.fill_between([1, 4], 0, 380, color='gray', alpha=0.1, lw=0)
        plt.plot(xs, averages, color='black')
        plt.plot(fit_x, fit_y, color='orange', dashes=[3,4], alpha=0.8)
        plt.xlabel(r'\% of sentences in corpus', fontsize=13, labelpad=12)
        plt.ylabel(r'\# of patterns', fontsize=13, labelpad=12)

        # Widen the figure by 25% while keeping its height.
        fig = plt.gcf()
        size = fig.get_size_inches()
        fig.set_size_inches(size[0]*1.25, size[1])

        ax = plt.gca()
        ax.xaxis.set_major_formatter(
            FuncFormatter(lambda x, _: '{:.0%}'.format(x)))

        plt.tight_layout()
        plt.savefig(output_path)
Ejemplo n.º 4
0
def count_connectives_remapped(docs):
    """Count causal connectives across `docs`, merging known variants.

    Every causation instance of every sentence is stringified and
    lowercased; strings listed in the remapping table are replaced by their
    canonical form before counting.

    Args:
        docs: iterable of objects with a `sentences` attribute; each
            sentence must expose `causation_instances`.

    Returns:
        A Counter mapping each (remapped) connective string to its count.

    Raises:
        AssertionError: if any connective stringifies to 'without ' (with a
            trailing space).
    """
    # NOTE: lookups use lowercased strings, so the 'HELPS' key can never
    # match; it is kept only to leave the table unchanged.
    to_remap = {'for too to': 'too for to', 'for too': 'too for',
                'that now': 'now that', 'to for': 'for to', 'give': 'given',
                'citizen-sparked': 'spark', 'encouraging': 'encourage',
                'have to for to': 'for to have to', 'thank to': 'thanks to',
                'on grounds of': 'on ground of', 'precipitating': 'precipitate',
                'to need': 'need to', 'to need to': 'need to to',
                'to take': 'take to', 'HELPS': 'help', 'helps': 'help',
                'on grounds that': 'on ground that'
    }
    # Materialize the instances: they are traversed twice below, and a bare
    # chain iterator would already be exhausted when the sanity check runs,
    # silently turning that check into a no-op.
    instances = list(chain.from_iterable(
        s.causation_instances for doc in docs for s in doc.sentences))
    stringified = [stringify_connective(causation).lower()
                   for causation in instances]
    for s, inst in zip(stringified, instances):
        assert s != 'without ', (
            'Malformed connective string %r for instance %r' % (s, inst))
    return Counter(to_remap.get(s, s) for s in stringified)
Ejemplo n.º 5
0
    def get_next(self):
        """Read the next document and attach its annotations to it.

        The document comes from `self.sentence_reader`; the annotation lines
        are read from `self._file_stream`. If there are no annotation lines,
        a warning is logged and the document is returned as-is. Otherwise the
        lines are processed, every instance recorded as also overlapping is
        registered on its sentence as an overlapping instance, and a warning
        is logged for each overlapping instance that has no relation type.

        Returns:
            The document, or None if the sentence reader returned a falsy
            value.
        """
        document = self.sentence_reader.get_next()
        if not document:
            return None

        lines = self._file_stream.readlines()
        if not lines:
            # logging.warn is a deprecated alias (removed in Python 3.13);
            # logging.warning behaves the same on all versions.
            logging.warning("No annotations found in file %s",
                            self._file_stream.name)
            # Don't close the reader: we still want to return the sentences,
            # even if they have no causality annotations.
        else:
            ids_to_annotations = {}
            ids_to_instances = {}
            # Map of causal instances to their overlapping relations
            instances_also_overlapping = defaultdict(set)
            unused_arg_ids = set()
            self.__process_lines(lines, ids_to_annotations, ids_to_instances,
                                 instances_also_overlapping, unused_arg_ids,
                                 document)

            for to_duplicate, types in instances_also_overlapping.items():
                to_duplicate.sentence.add_overlapping_instance(
                    types, to_duplicate.connective, to_duplicate.arg0,
                    to_duplicate.arg1, to_duplicate.id, to_duplicate)

            for sentence in document:
                for ovl_instance in sentence.overlapping_rel_instances:
                    if ovl_instance.type is None:
                        # Imported lazily, only when a warning is actually
                        # needed. NOTE(review): presumably this avoids a
                        # circular import -- confirm before hoisting.
                        from causeway.because_data.iaa import (
                            stringify_connective)
                        logging.warning(
                            "No relation type for non-causal instance %s (%s)",
                            ovl_instance.id,
                            stringify_connective(ovl_instance))

        return document
Ejemplo n.º 6
0
 def _score_parts(self, sentence, possible_causations):
     return [(self.frequencies_dict[stringify_connective(pc)]
              if pc.cause and pc.effect else 0.0)
             for pc in possible_causations]