from collections import Counter, defaultdict
from itertools import chain
import logging

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter
from scipy.stats import entropy

# stringify_connective, V2_REMAPPINGS, DependencyPath, partition, and
# utopia_context are project-internal helpers assumed to be importable from
# the surrounding codebase.


def arg_path_diversity_by_pattern(instances, outgoing_only=False):
    """Measure how variable each connective pattern's argument positions are:
    the entropy of its connective-to-argument dependency paths, plus the
    standard deviation of those paths' lengths."""
    cause_paths = defaultdict(Counter)
    effect_paths = defaultdict(Counter)
    cause_path_lens = defaultdict(list)
    effect_path_lens = defaultdict(list)

    for i in instances:
        sentence = i.sentence
        cause, effect = i.cause, i.effect
        connective_str = stringify_connective(i).lower()
        connective_str = V2_REMAPPINGS.get(connective_str, connective_str)
        for arg, paths, path_lens in zip(
                [cause, effect], [cause_paths, effect_paths],
                [cause_path_lens, effect_path_lens]):
            if arg:
                arg_head = sentence.get_head(arg)
                if arg_head:
                    conn_head = sentence.get_closest_of_tokens(
                        arg_head, i.connective)[0]
                    path = sentence.extract_dependency_path(conn_head,
                                                            arg_head)
                    path_lens[connective_str].append(len(path))
                    if outgoing_only:
                        # Keep only the first (outgoing) edge of the path.
                        path = DependencyPath(path.start, [path[0]])
                    paths[connective_str][str(path)] += 1
                # Else: no valid argument head, so skip this argument.

    cause_effect_path_entropies = [
        {k: entropy(list(v.values())) for k, v in paths.items()}
        for paths in [cause_paths, effect_paths]]
    cause_effect_path_len_stds = [
        {k: np.std(v) for k, v in lens.items()}
        for lens in [cause_path_lens, effect_path_lens]]
    return cause_effect_path_entropies + cause_effect_path_len_stds

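# A minimal sketch (not part of the original module) of what the entropy
# diversity metric captures, assuming scipy.stats.entropy over raw counts:
# a skewed path distribution means the pattern's arguments sit in predictable
# positions (low diversity), a flat one means they vary freely.
def _demo_path_entropy():
    skewed = [8, 1, 1]    # one dominant connective-to-argument path
    uniform = [1, 1, 1]   # argument position varies freely
    return entropy(skewed), entropy(uniform)  # ~0.64 vs. ~1.10 nats
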
def _train_structured(self, instances, parts_by_instance):
    # `instances` is unused here; parts_by_instance carries the candidates.
    # Count, per connective pattern, how many candidates were labeled causal
    # vs. not (the bool label indexes a [negative, positive] pair).
    for instance_pcs in parts_by_instance:
        for possible_causation in instance_pcs:
            label = bool(possible_causation.true_causation_instance)
            connective = stringify_connective(possible_causation)
            self.frequencies_dict[connective][label] += 1
    # Collapse each count pair into the fraction of candidates with this
    # connective that were truly causal.
    for key in self.frequencies_dict:
        negative, positive = self.frequencies_dict[key]
        self.frequencies_dict[key] = float(positive) / (positive + negative)

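# A toy illustration (hypothetical data, standalone names) of the statistic
# _train_structured computes: each connective ends up mapped to the fraction
# of its training candidates that were truly causal.
def _demo_connective_reliability():
    counts = defaultdict(lambda: [0, 0])  # connective -> [negative, positive]
    observations = [('because', True), ('because', True), ('because', False),
                    ('so', True), ('so', False)]
    for connective, label in observations:
        counts[connective][label] += 1  # bool label indexes the pair
    return {conn: pos / float(pos + neg)
            for conn, (neg, pos) in counts.items()}
    # -> {'because': 0.667, 'so': 0.5}
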
def pattern_saturation(documents, num_folds=20, num_increments=20):
    """Plot how the number of distinct connective patterns grows as more of
    the corpus is seen, averaged over random orderings of the sentences."""
    sentences = list(chain.from_iterable(d.sentences for d in documents))
    xs = np.linspace(0, 1, num_increments + 1)
    ys = np.empty((num_folds, num_increments + 1))
    ys[:, 0] = 0  # At 0% of sentences, we always have 0% of the patterns.

    for fold in range(num_folds):
        np.random.shuffle(sentences)
        patterns_seen = set()
        increments = partition(sentences, num_increments)
        for i, increment in enumerate(increments):
            for sentence in increment:
                for causation in sentence.causation_instances:
                    patterns_seen.add(stringify_connective(causation))
            ys[fold, i + 1] = len(patterns_seen)
    averages = np.average(ys, 0)

    # Least-squares fit of a log-quadratic curve to the averaged counts:
    # y ~ c2 * ln(x)^2 + c1 * ln(x) + c0, skipping x = 0, where ln diverges.
    tmp_A = []
    for i in range(1, len(xs)):
        tmp_A.append([np.log(xs[i]) ** 2, np.log(xs[i]), 1])
    b = np.matrix(averages[1:]).T
    A = np.matrix(tmp_A)
    fit = (A.T * A).I * A.T * b  # solve the normal equations directly
    print(fit)
    # errors = b - A * fit
    # residual = np.linalg.norm(errors)

    # Extrapolate the fitted curve out to 4x the corpus size.
    fit_x = np.linspace(0, 4, 2000)
    fit_y = [float(fit[0]) * np.log(x) ** 2 + float(fit[1]) * np.log(x)
             + float(fit[2]) for x in fit_x]

    with utopia_context():
        plt.tick_params(axis='both', labelsize=11)
        # Shade the extrapolated region beyond the full corpus (hard-coded
        # height covering the plotted y-range).
        plt.fill_between([1, 4], 0, 380, color='gray', alpha=0.1, lw=0)
        plt.plot(xs, averages, color='black')
        plt.plot(fit_x, fit_y, color='orange', dashes=[3, 4], alpha=0.8)
        plt.xlabel(r'\% of sentences in corpus', fontsize=13, labelpad=12)
        plt.ylabel(r'\# of patterns', fontsize=13, labelpad=12)
        fig = plt.gcf()
        size = fig.get_size_inches()
        fig.set_size_inches(size[0] * 1.25, size[1])
        ax = plt.gca()
        ax.xaxis.set_major_formatter(
            FuncFormatter(lambda x, _: '{:.0%}'.format(x)))
        plt.tight_layout()
        # plt.show(False)
        plt.savefig('/home/jesse/Documents/Work/Research/My Publications/'
                    'Thesis/repr_annot/saturation.pdf')

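# The fit above inverts the normal equations by hand ((A^T A)^-1 A^T b).
# A numerically safer equivalent, sketched with plain numpy arrays (this
# helper is illustrative, not part of the original module):
def _fit_log_quadratic(xs, ys):
    logs = np.log(xs[1:])  # skip x = 0, where ln diverges
    A = np.column_stack([logs ** 2, logs, np.ones_like(logs)])
    coeffs, _, _, _ = np.linalg.lstsq(A, ys[1:], rcond=None)
    return coeffs  # [c2, c1, c0]
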
def count_connectives_remapped(docs):
    # Normalize known connective variants and annotation typos to canonical
    # forms before counting.
    to_remap = {'for too to': 'too for to',
                'for too': 'too for',
                'that now': 'now that',
                'to for': 'for to',
                'give': 'given',
                'citizen-sparked': 'spark',
                'encouraging': 'encourage',
                'have to for to': 'for to have to',
                'thank to': 'thanks to',
                'on grounds of': 'on ground of',
                'precipitating': 'precipitate',
                'to need': 'need to',
                'to need to': 'need to to',
                'to take': 'take to',
                'HELPS': 'help',
                'helps': 'help',
                'on grounds that': 'on ground that'}
    instances = chain.from_iterable(chain.from_iterable(
        [s.causation_instances for s in doc.sentences] for doc in docs))
    stringified = [stringify_connective(causation).lower()
                   for causation in instances]
    # Sanity check for a known stringification artifact. (The original zipped
    # stringified against the already-exhausted instances generator, so the
    # check never ran; only the strings are needed anyway.)
    for s in stringified:
        assert s != 'without '
    return Counter([to_remap.get(s, s) for s in stringified])

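# A tiny standalone illustration (hypothetical inputs) of the
# normalize-then-count step above:
def _demo_remap_count():
    to_remap = {'thank to': 'thanks to'}
    stringified = ['thank to', 'thanks to', 'because']
    return Counter([to_remap.get(s, s) for s in stringified])
    # -> Counter({'thanks to': 2, 'because': 1})
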
def get_next(self):
    document = self.sentence_reader.get_next()
    if not document:
        return None

    lines = self._file_stream.readlines()
    if not lines:
        logging.warning("No annotations found in file %s",
                        self._file_stream.name)
        # Don't close the reader: we still want to return the sentences,
        # even if they have no causality annotations.
    else:
        ids_to_annotations = {}
        ids_to_instances = {}
        # Map of causal instances to their overlapping relations
        instances_also_overlapping = defaultdict(set)
        unused_arg_ids = set()
        self.__process_lines(lines, ids_to_annotations, ids_to_instances,
                             instances_also_overlapping, unused_arg_ids,
                             document)
        # Re-register each causal instance that also participates in
        # overlapping (non-causal) relations on its sentence.
        for to_duplicate, types in instances_also_overlapping.items():
            to_duplicate.sentence.add_overlapping_instance(
                types, to_duplicate.connective, to_duplicate.arg0,
                to_duplicate.arg1, to_duplicate.id, to_duplicate)

        for sentence in document:
            for ovl_instance in sentence.overlapping_rel_instances:
                if ovl_instance.type is None:
                    # Local import, presumably to dodge a circular dependency.
                    from causeway.because_data.iaa import (
                        stringify_connective)
                    logging.warning(
                        "No relation type for non-causal instance %s (%s)",
                        ovl_instance.id, stringify_connective(ovl_instance))

    return document

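# Hypothetical usage sketch (the enclosing reader class and its open() method
# are assumed from context, not shown here):
#
#     reader = CausalityStandoffReader()
#     reader.open('path/to/annotations.ann')
#     document = reader.get_next()
#     while document is not None:
#         ...  # consume document sentences and their causation_instances
#         document = reader.get_next()
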
def _score_parts(self, sentence, possible_causations):
    return [(self.frequencies_dict[stringify_connective(pc)]
             if pc.cause and pc.effect else 0.0)
            for pc in possible_causations]