from __future__ import absolute_import, print_function

from collections import defaultdict, deque
from copy import copy
from gflags import FLAGS, DEFINE_bool, DuplicateFlagError
import logging
from nltk.util import flatten
from os.path import splitext
import sys

from causeway.because_data import CausalityStandoffReader, CausationInstance
from nlpypline.data.io import DirectoryReader, InstancesDocumentWriter

# Register this module's command-line flags. If a definition raises
# DuplicateFlagError, the error is logged and otherwise ignored.
try:
    DEFINE_bool(
        'separate_new_conn', True,
        'Whether a separate "NEW-CONN" transition should be generated'
        ' at the start of each new relation')
    DEFINE_bool(
        'separate_shift', False,
        'Whether a separate "SHIFT" transition should be generated when'
        ' a relation is completed')
except DuplicateFlagError as e:
    logging.warn(e)


class CausalityOracleTransitionWriter(InstancesDocumentWriter):
    '''
    InstancesDocumentWriter subclass that additionally keeps a
    `_byte_offset_in_doc` attribute.

    NOTE(review): judging by the name and the flags above, this presumably
    writes oracle transition sequences for causality instances -- confirm
    against the rest of the class.
    '''

    def __init__(self, filepath=None):
        '''
        Forward `filepath` to the base writer and start with no byte offset
        recorded.
        '''
        super(CausalityOracleTransitionWriter, self).__init__(filepath)
        self._byte_offset_in_doc = None

    def write_all_instances(self, document, instances_getter=None):
from collections import defaultdict
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging
import re
import time

from causeway import PossibleCausation, PairwiseAndNonIAAEvaluator
from nlpypline.pipeline import Stage
from nlpypline.pipeline.models import Model
from nlpypline.util import Enum

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool('regex_include_pos', True,
                'Whether to include POS tags in the strings matched by regex')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class RegexConnectiveModel(Model):
    '''
    Model whose trained state is a list of
    (compiled regex, matching_groups) pairs, stored in `self.regexes`.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Pass all arguments through to Model and start with no regexes.
        '''
        super(RegexConnectiveModel, self).__init__(*args, **kwargs)
        self.regexes = []

    def _train_model(self, sentences):
        '''
        Rebuild `self.regexes` from `sentences`.

        Each (pattern, matching_groups) pair produced by
        `self._extract_patterns(sentences)` (not shown here) is stored with
        its pattern compiled, so the compilation happens once at train time.
        '''
        self.regexes = [
            (re.compile(pattern), matching_groups)
            for pattern, matching_groups in self._extract_patterns(sentences)
        ]

    def test(self, sentences):
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
from itertools import chain  # , izip_longest
import logging
import numpy as np
import pycrfsuite
import time
from types import MethodType

from nlpypline.pipeline.models import Model, MultiplyFeaturizedModel
from nlpypline.pipeline.featurization import DictOnlyFeaturizer, Featurizer

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool('pycrfsuite_verbose', False,
                'Verbose logging output from python-crfsuite trainer')
except DuplicateFlagError as e:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ignoring flag redefinitions; assuming module reload')


class StructuredModel(Model):
    '''
    In a structured model, every instance is divided up into "parts." Those
    parts are treated as the thing to be scored by the model. Thus, this class
    overrides the default train and test methods to extract parts first, and
    then call the normal test/train on the parts rather than the instances.
    (Thus, it's often a good idea for the parts to store pointers to the
    original instances for use in featurization, as the feature extractors
    won't get a copy of the original instance on the side.)

    A StructuredModel also has a StructuredDecoder, which is used to decode the
    scored parts into a coherent labeling for the instance.
    '''
from gflags import FLAGS, DEFINE_bool, DEFINE_string, DuplicateFlagError
import io
import logging
import os
import re

from nlpypline.data import StanfordParsedSentence, SentencesDocument
from nlpypline.util import recursively_list_files
from nlpypline.util.streams import (read_stream_until, peek_and_revert_unless,
                                    CharacterTrackingStreamWrapper)

# Register this module's command-line flags. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flags are kept.
try:
    DEFINE_string('reader_codec', 'utf-8',
                  'The encoding to assume for data files')
    DEFINE_bool(
        'reader_gold_parses', False,
        'Whether to read .parse.gold files instead of .parse files for'
        ' sentence parses')
    DEFINE_bool(
        'gold_parses_fallback', False,
        'If reader_gold_parses is True, falls back to automated parse'
        ' files instead of failing if gold parses are not found')
except DuplicateFlagError as e:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ignoring flag redefinitions; assuming module reload')


class DocumentStream(object):
    '''
    Base object holding a file stream in `self._file_stream`.
    '''

    def __init__(self, filepath=None):
        '''
        Start with no stream; if a truthy `filepath` is given, immediately
        call `self.open(filepath)` (`open` is not shown here).
        '''
        self._file_stream = None
        if filepath:
            self.open(filepath)
'in_parse_tree', 'pattern', 'pattern+conn_parse_path', 'conn_rel_pos', 'is_alnum' ], 'Features for the argument-labeling CRF') DEFINE_integer( 'arg_label_max_dep_path_len', 4, "Maximum number of dependency path steps to allow before" " just making the value 'LONG-RANGE'") DEFINE_enum('arg_label_training_alg', 'lbfgs', ['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'], 'Algorithm for training argument labeling CRF') DEFINE_bool( 'arg_label_save_crf_info', False, 'Whether to read in and save an accessible version of the CRF' ' model parameters in the model (useful for debugging)') except DuplicateFlagError as e: logging.warn('Ignoring flag redefinitions; assuming module reload') class ArgumentLabelerModel(CRFModel): CAUSE_LABEL = 'Cause' EFFECT_LABEL = 'Effect' NONE_LABEL = 'None' def __init__(self, training_algorithm, training_params, *args, **kwargs): super(ArgumentLabelerModel, self).__init__(selected_features=FLAGS.arg_label_features, training_algorithm=training_algorithm, training_params=training_params,
import sys

from causeway.because_data import CausalityStandoffReader
from causeway.because_data.iaa import CausalityMetrics, print_indented
from nlpypline.data.io import DirectoryReader

# Register the IAA command-line flags. A DuplicateFlagError is treated as a
# module reload: a warning is logged and the existing flags are kept.
try:
    DEFINE_list(
        'iaa_file_regexes', r".*\.ann$",
        "Regexes to match filenames against for IAA (non-matching files will"
        " not be compared).")
    DEFINE_integer(
        'iaa_max_sentence', sys.maxint,
        'Maximum number of sentences to analyze when computing IAA.')
    DEFINE_bool(
        'iaa_include_partial', False,
        'Include a comparison that counts partial overlap of spans as a'
        ' match.')
    DEFINE_bool('iaa_recurse', False,
                'Whether to recurse into the data directories')
except DuplicateFlagError as e:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ignoring flag redefinitions; assuming module reload')


def compare_instance_lists(gold, predicted, indent=0):
    '''
    Set up a comparison between `gold` and `predicted`.

    The statements shown here only read flags: they record whether any
    metric logging is enabled (FLAGS.iaa_log_confusion or
    FLAGS.iaa_log_stats, both defined outside this snippet) and choose which
    partial-overlap settings to evaluate.

    NOTE(review): `gold`, `predicted` and `indent` are not used by the
    statements shown -- presumably they are lists of instances and a
    print indentation level; confirm against the rest of the function.
    '''
    printing_some_metrics = FLAGS.iaa_log_confusion or FLAGS.iaa_log_stats
    # With --iaa_include_partial, both partial and exact matching are
    # evaluated; otherwise only exact matching.
    if FLAGS.iaa_include_partial:
        partial_possibilities = [True, False]
    else:
        partial_possibilities = [False]
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging

from causeway import PairwiseAndNonIAAEvaluator
from nlpypline.pipeline import Stage
from nlpypline.pipeline.models.structured import (StructuredModel,
                                                  StructuredDecoder)

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool('combiner_print_test_instances', False,
                'Whether to print differing IAA results during evaluation')
except DuplicateFlagError as e:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ignoring flag redefinitions; assuming module reload')


class BaselineCombinerModel(StructuredModel):
    '''
    Structured model that neither trains nor scores anything itself. At test
    time the parts for a sentence are whatever is stored on that sentence
    under the attribute named by `baseline_causations_attr_name`. The model
    is constructed with a BaselineDecoder instance (defined outside this
    snippet).
    '''

    def __init__(self, baseline_causations_attr_name):
        '''
        baseline_causations_attr_name: name of the sentence attribute from
        which `_make_parts` reads the parts at test time.
        '''
        super(BaselineCombinerModel, self).__init__(BaselineDecoder())
        self.baseline_causations_attr_name = baseline_causations_attr_name

    def _train_structured(self, instances, parts_by_instance):
        '''Intentionally a no-op: there is nothing to train.'''
        pass

    def _make_parts(self, sentence, is_train):
        '''
        Return no parts when training; otherwise return the value of the
        configured attribute on `sentence`.
        '''
        if is_train:
            return []
        else:
            return getattr(sentence, self.baseline_causations_attr_name)

    def _score_parts(self, instance, instance_parts):
        '''Intentionally a no-op: parts are not scored.'''
        pass
import cPickle
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import itertools
import logging
import numpy as np
from scipy.sparse import lil_matrix, vstack
from sklearn.base import BaseEstimator

from nlpypline.pipeline.featurization import (FeatureExtractor, Featurizer,
                                              FeaturizationError)
from nlpypline.util import NameDictionary, listify
# from nlpypline.util.metrics import diff_binary_vectors

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool(
        'rebalance_stochastically', False,
        'Rebalance classes by stochastically choosing samples to replicate')
except DuplicateFlagError as e:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias.
    logging.warning('Ignoring flag redefinitions; assuming module reload')


class Model(object):
    '''
    Base class for models. `train` drives the `reset`, `_train_model` and
    `_post_model_train` hooks, none of which are shown here.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Accept and ignore arbitrary arguments; any that are passed are
        reported at debug level.
        '''
        if args or kwargs:
            logging.debug("Extra model arguments: args=%s, kwargs=%s",
                          args, kwargs)

    def train(self, instances):
        '''
        Train on `instances`: reset, run `_train_model`, then run the
        `_post_model_train` hook, in that order.
        '''
        self.reset()  # Reset state in case we've been previously trained.
        self._train_model(instances)
        self._post_model_train()
import re
from scipy.sparse import lil_matrix, csr_matrix, csgraph

from nlpypline.util import Enum, merge_dicts, listify, nwise
from nlpypline.util.nltk import (collins_find_heads, nltk_tree_to_graph,
                                 is_parent_of_leaf)
from nlpypline.util.scipy import bfs_shortest_path_costs
from nlpypline.util.streams import (
    CharacterTrackingStreamWrapper, eat_whitespace, is_at_eof,
    peek_and_revert_unless, read_stream_until)

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool('use_constituency_parse', False,
                'Whether to build constituency parse trees from the provided'
                ' constituency parse string when constructing'
                ' StanfordParsedSentences. Setting to false makes reading in'
                ' data more efficient.')
except DuplicateFlagError:
    # Report the skipped redefinition instead of swallowing it silently.
    # Imported here because no logging import is visible in this part of
    # the file.
    import logging
    logging.warning('Ignoring flag redefinitions; assuming module reload')


class Document(object):
    '''
    Minimal document record that stores only the document's filename.
    '''
    # TODO: there are probably a lot of other things we should offer here.
    # Starting with the ability to recover the text of the document...

    def __init__(self, filename):
        '''Remember `filename` as the document's identity.'''
        self.filename = filename

    def __repr__(self):
        '''Return '<ClassName: filename>', using the runtime class name.'''
        return '<%s: %s>' % (self.__class__.__name__, self.filename)
from __future__ import absolute_import

import copy
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging
import numpy as np
from nltk.metrics import confusionmatrix

from nlpypline.util.scipy import add_rows_and_cols_to_matrix
from nlpypline.util import floats_same_or_nearly_equal

# Register this module's command-line flag. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flag is kept.
try:
    DEFINE_bool(
        'metrics_log_raw_counts', False,
        "Log raw counts (TP, FP, etc.) for evaluation or IAA metrics.")
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


def safe_divide(dividend, divisor):
    '''
    Divide without raising on a zero divisor.

    Returns the true (float) quotient when `divisor` is non-zero, 0.0 when
    both arguments are zero, and NaN when a non-zero dividend is divided by
    zero.
    '''
    if divisor != 0:
        # float() forces true division even for two ints.
        return float(dividend) / divisor
    elif dividend == 0:
        return 0.0
    else:
        return np.nan


def f1(precision, recall):
    '''
    Return 2 * precision * recall / (precision + recall), computed with
    safe_divide so that precision == recall == 0 yields 0.0.
    '''
    return safe_divide(2 * precision * recall, precision + recall)


class ClassificationMetrics(object):
from nlpypline.data.io import DirectoryReader from nlpypline.pipeline import Pipeline, SimpleStage from nlpypline.pipeline.models import ClassBalancingClassifierWrapper from nlpypline.util import print_indented try: DEFINE_enum('classifier_model', 'logistic', ['tree', 'knn', 'logistic', 'svm', 'forest', 'nb'], 'What type of machine learning model to use as the underlying' ' causality filter classifier') DEFINE_float( 'rebalance_ratio', 1.0, 'The maximum ratio by which to rebalance classes for training') DEFINE_bool('eval_with_cv', False, 'Evaluate with cross-validation. Overrides --evaluate flag, and' ' causes both train and test to be combined.') DEFINE_bool('debug', False, 'Whether to print debug-level logging.') DEFINE_integer('seed', None, 'Seed for the numpy RNG.') DEFINE_enum('pipeline_type', 'tregex', ['tregex', 'regex', 'baseline', 'tregex+baseline', 'regex+baseline', 'tregex_mostfreq', 'regex_mostfreq', 'tregex_cache'], 'Which causality pipeline to run') DEFINE_bool('filter_overlapping', True, 'Whether to filter smaller connectives that overlap with larger' ' ones') DEFINE_bool('save_models', False, "Whether to save pipeline models post-train (if not doing CV).") DEFINE_string('models_dir', None,
import operator import os from os import path import subprocess from subprocess import PIPE import tempfile from causeway.because_data import CausationInstance from causeway.because_data.iaa import CausalityMetrics from nlpypline.data import StanfordParsedSentence from nlpypline.pipeline import Stage, Evaluator from nlpypline.pipeline.models import Model from nlpypline.util import listify, print_indented, Enum, make_getter, make_setter try: DEFINE_bool("iaa_calculate_partial", False, "Whether to compute metrics for partial overlap") DEFINE_string('stanford_ner_path', '/home/jesse/Documents/Work/Research/stanford-corenlp-full-2015-04-20', 'Path to Stanford NER directory') DEFINE_string('stanford_ner_jar', 'stanford-corenlp-3.5.2.jar', 'Name of JAR file containing Stanford NER') DEFINE_string( 'stanford_ner_model_name', 'english.all.3class.distsim.crf.ser.gz', 'Name of model file for Stanford NER') DEFINE_bool('print_patterns', False, 'Whether to print all connective patterns') DEFINE_bool('patterns_print_test_instances', False, 'Whether to print differing IAA results during evaluation of' ' pattern matching stage') DEFINE_bool('args_print_test_instances', False, 'Whether to print differing IAA results during evaluation of'
from copy import copy, deepcopy
from gflags import FLAGS, DuplicateFlagError, DEFINE_bool
import logging
from nltk.tree import ImmutableParentedTree
import numpy as np
import os
from scipy.sparse.lil import lil_matrix

from nlpypline.data import Annotation, Token, StanfordParsedSentence
from nlpypline.data.io import (DocumentReader, StanfordParsedSentenceReader,
                               InstancesDocumentWriter)
from nlpypline.util import listify, Enum, make_getter, make_setter, Object
from textwrap import TextWrapper

# Register this module's command-line flags. A DuplicateFlagError is treated
# as a module reload: a warning is logged and the existing flags are kept.
try:
    DEFINE_bool('reader_binarize_degrees', True,
                'Whether to turn all degrees into "Facilitate" and "Inhibit"')
    DEFINE_bool(
        'reader_ignore_overlapping', False,
        'Whether, when reading causality data, instances with an'
        ' accompanying overlapping relation should be ignored')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class CausewaySentence(StanfordParsedSentence):
    '''
    StanfordParsedSentence that additionally carries two lists:
    `causation_instances` and `overlapping_rel_instances`.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Pass all arguments through to StanfordParsedSentence and start with
        both instance lists empty.
        '''
        super(CausewaySentence, self).__init__(*args, **kwargs)
        self.causation_instances = []
        self.overlapping_rel_instances = []

    def add_causation_instance(self, *args, **kwargs):