Code example #1
import numpy as np


def eval_per_sent(pred, gold, ignore_punct=True):
    """Evaluate UAS for each sentence."""
    uas = []
    for tpred, tgold in zip(read_conllu(pred), read_conllu(gold)):
        assert len(tpred) == len(tgold)
        n = 0
        n_correct = 0
        for i in range(len(tgold)):
            if ignore_punct:
                if tgold[i]['upostag'] == "PUNCT":
                    continue
            n += 1
            if tpred[i]['head'] == tgold[i]['head']:
                n_correct += 1
        uas.append(float(n_correct) / n if n != 0 else 1.0)
    return np.asarray(uas)
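All of these snippets rely on a read_conllu helper that is defined elsewhere in the module and yields one sentence at a time as a list of token dicts. A minimal sketch of such a helper is given below, assuming the standard ten-column CoNLL-U layout and skipping comments, multiword-token ranges, and empty nodes; the file names in the usage lines are placeholders.

def read_conllu(filename):
    """Yield each sentence as a list of token dicts with the keys used
    above ('id', 'form', 'upostag', 'head', 'deprel')."""
    with open(filename, encoding='utf-8') as f:
        sent = []
        for line in f:
            line = line.strip()
            if not line:                   # blank line ends a sentence
                if sent:
                    yield sent
                    sent = []
                continue
            if line.startswith('#'):       # skip comment lines
                continue
            cols = line.split('\t')
            if not cols[0].isdigit():      # skip "3-4" ranges and "3.1" nodes
                continue
            sent.append({'id': int(cols[0]), 'form': cols[1],
                         'upostag': cols[3], 'head': int(cols[6]),
                         'deprel': cols[7]})
        if sent:
            yield sent


# Illustrative call; the paths are placeholders.
uas = eval_per_sent('predicted.conllu', 'gold.conllu')
print('Macro-averaged UAS: %.4f' % uas.mean())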
Code example #2
def compute_features(filename):
    """ Compute Liu 2010 features from conllu file."""
    deprel_stats = {}
    for tree in read_conllu(filename):
        for tok in tree:
            deprel = tok['deprel']
            if deprel not in deprel_stats:
                deprel_stats[deprel] = {'r': 0, 'l': 0}
            # 'r': the head is to the right of the dependent; 'l': the head
            # is to the left (ROOT arcs, with head index 0, also land in 'l').
            if tok['head'] > tok['id']:
                deprel_stats[deprel]['r'] += 1
            else:
                deprel_stats[deprel]['l'] += 1

    # features = {} # for named vector
    features = []
    for deprel in DEPRELS:
        if deprel in deprel_stats:
            fval = (float(deprel_stats[deprel]['r']) /
                    (deprel_stats[deprel]['r'] + deprel_stats[deprel]['l']))
            features.append(fval)
        else:
            features.append(.5)

    return features
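A hedged usage sketch follows. DEPRELS is assumed to be a fixed, ordered list of Universal Dependencies relation labels defined elsewhere in the module; the subset shown here is illustrative only, as is the input path.

# Hypothetical setup: an illustrative subset of UD relation labels.
DEPRELS = ['nsubj', 'obj', 'obl', 'amod', 'case', 'nmod']

features = compute_features('train.conllu')
assert len(features) == len(DEPRELS)  # one head-direction fraction per relation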
Code example #3
import collections


def compute_features(filename, threshold=3.0, one_hot=True):
    """Compute WALS features from conllu file.

    Args:
      filename: Path to conllu file.
      threshold: A scalar threshold for determining dominancy.
        - If a feature has a "No Dominant Order" option, the frequency f(d) of
          order d must be >= threshold x f(d') to be considered the dominant
          order. Otherwise, the feature is marked with "No Dominant Order."
        - If a feature does not have a "No Dominant Order" option, order d is
          considered dominant simply if f(d) > f(d').
      one_hot: Return WALS features as a one-hot representation.

    Returns:
      WALS feature vector.
    """
    # Collect WALS statistics (directionalities).
    wals_stats = collections.defaultdict(dict)
    for w in WALS_CONDITIONS:
        wals_stats[w]['r'] = 0
        wals_stats[w]['l'] = 0

    # Iterate over parsed trees.
    for tree in read_conllu(filename):
        for tok in tree:
            for w, cond in WALS_CONDITIONS.items():
                # Check that the dependency type matches the condition's
                # relevant dependency type, if any. Skip if not relevant.
                deprel = tok['deprel']
                if 'rel' in cond:
                    if deprel not in cond['rel']:
                        continue

                # Skip tokens whose head is ROOT.
                if tok['head'] == 0:
                    continue

                # Check if the modifier and head tags match the pattern.
                pos_m = tok['upostag']
                pos_h = tree[tok['head'] - 1]['upostag']
                if "%s/%s" % (pos_h, pos_m) in cond['pos'] or \
                   (pos_m == "DET" and "*/DET" in cond['pos']):
                    # If the head is to the left, this is right-leaning: h-->m.
                    if tok['head'] < tok['id']:
                        wals_stats[w]['r'] += 1
                    # If the head is to the right, it is left-leaning: m<--h.
                    elif tok['head'] > tok['id']:
                        wals_stats[w]['l'] += 1
                    else:
                        raise ValueError('Should not have head as self!')

    # Determine WALS values based on these statistics.
    # Use Laplace smoothing.
    wals_features = collections.defaultdict(dict)
    for w, stats in wals_stats.items():
        if float(stats['r'] + 1) / (stats['l'] + 1) >= threshold:
            direction = 'r'
        elif float(stats['l'] + 1) / (stats['r'] + 1) >= threshold:
            direction = 'l'
        else:
            direction = 'o'

        # Pick the weakly dominant order if "No Dominant Order" is not usable.
        if direction == 'o' and ('o' not in STATS_TO_WALS[w]):
            direction = 'r' if stats['r'] > stats['l'] else 'l'

        wals_features[w] = STATS_TO_WALS[w][direction]

    # Convert to one-hot if specified.
    if one_hot:
        wals_features = flatten(wals_features)

    return wals_features
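The WALS_CONDITIONS and STATS_TO_WALS tables, and the flatten helper that one-hot encodes the resulting feature-to-value mapping, are defined elsewhere in the module. The entries below are an assumed illustration of the structure the code expects, not the original tables; the keys, POS patterns, and value strings are placeholders.

# Illustrative entries only. Each condition gives "head/modifier" UPOS
# patterns (and optionally the deprels) instantiating one WALS word-order
# feature; 'r' means the head precedes the modifier, 'l' the reverse.
WALS_CONDITIONS = {
    '85A': {'pos': ['NOUN/ADP', 'PROPN/ADP'], 'rel': ['case']},  # adposition vs. noun
    '87A': {'pos': ['NOUN/ADJ', 'PROPN/ADJ'], 'rel': ['amod']},  # adjective vs. noun
}
STATS_TO_WALS = {
    '85A': {'r': 'Postpositions', 'l': 'Prepositions', 'o': 'No dominant order'},
    '87A': {'r': 'Noun-Adjective', 'l': 'Adjective-Noun', 'o': 'No dominant order'},
}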
Code example #4
import collections
import statistics


def compute_features(filename):
    """Compute Wang and Eisner 2018a features from conllu file."""
    g_uni = {}
    g_bi = {}
    for w in CONTEXT_WINDOW:
        g_uni[w] = collections.defaultdict(list)  # {w: \pi_t^w}
        g_bi[w] = collections.defaultdict(list)  # {w: \pi_{t,s}^w}

    # Gather statistics.
    for ex in read_conllu(filename):
        # Get part-of-speech tags.
        ex_pos = [t['upostag'] for t in ex]

        # Only sentences of <= 40 tokens are considered.
        if len(ex_pos) > 40:
            continue

        for w in CONTEXT_WINDOW:
            for j, s in enumerate(ex_pos):
                # Right context.
                if w > 0:
                    context = ex_pos[j + 1:j + w + 1]

                # Left context.
                else:
                    context = ex_pos[max(j + w, 0):j]

                # Count the frequencies of the tags in the context.
                tag_counts = collections.Counter(context)

                # For each tag type, store its context frequency
                # per window type (unigram + bigram).
                for t in POSSIBLE_TAGS:
                    cnt = tag_counts[t]  # Counter returns 0 for missing keys.
                    tag_frac = float(cnt) / abs(w)

                    # Map window --> context tag --> per-token fractions.
                    g_uni[w][t].append(tag_frac)

                    # Map window --> (token tag, context tag) --> fractions.
                    g_bi[w][(s, t)].append(tag_frac)

    # Convert to final mean fractions and ratios.
    pi_uni = {}
    pi_bi = {}
    features_uni = []
    features_bi = []

    for w in CONTEXT_WINDOW:
        pi_uni[w] = {}
        pi_bi[w] = {}

        for t in POSSIBLE_TAGS:
            pi_uni[w][t] = statistics.mean(g_uni[w][t])

            # Unigram features are only taken for POSITIVE w.
            if w > 0:
                features_uni.append(pi_uni[w][t])

            # Bigram features are used for bigram:unigram ratios only.
            for s in POSSIBLE_TAGS:
                if (s, t) not in g_bi[w]:
                    pi_bi[w][(s, t)] = 0.0
                else:
                    pi_bi[w][(s, t)] = statistics.mean(g_bi[w][(s, t)])

                if pi_uni[w][t] == 0.0:
                    # If context tag t never occurs in this window at all,
                    # default the bigram:unigram ratio to 1.
                    assert pi_bi[w][(s, t)] == 0.0, ('Unigram frequency is 0'
                                                     ' but bigram is not!')
                    features_bi.append(1.0)
                else:
                    # Bound ratio by 1.
                    features_bi.append(min(pi_bi[w][(s, t)] / pi_uni[w][t], 1))

    # Concat features.
    full_features = features_uni + features_bi

    return full_features
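The CONTEXT_WINDOW and POSSIBLE_TAGS constants are defined elsewhere; negative window sizes select left contexts and positive ones right contexts. The values below are assumptions for illustration (the original window sizes and tag set may differ), together with the feature count implied by the code above; the input path is a placeholder.

# Illustrative values only; the original constants may differ.
CONTEXT_WINDOW = [-3, -1, 1, 3]   # negative = left window, positive = right
POSSIBLE_TAGS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
                 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
                 'VERB', 'X']     # the 17 UD UPOS tags

features = compute_features('train.conllu')
# Unigram features come from positive windows only; bigram:unigram ratios
# are produced for every window and every (token tag, context tag) pair.
n_pos = sum(1 for w in CONTEXT_WINDOW if w > 0)
assert len(features) == (n_pos * len(POSSIBLE_TAGS) +
                         len(CONTEXT_WINDOW) * len(POSSIBLE_TAGS) ** 2)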
Code example #5
    def load_from_conllu(
            self,
            filename,
            min_tree_len=1,
            max_tree_len=None,
            subsample=None,
            subsampling_key='arcs',
            selective_sharing_feature_loader=None):
        """Initialize treebank from conllu file.

        Args:
          filename: Path to treebank in conllu format.
          min_tree_len: Skip trees with fewer than min_tree_len tokens.
          max_tree_len: Skip trees with more than max_tree_len tokens.
          subsample: If set, randomly subsample the treebank to approximately
            this many arcs or trees (see subsampling_key).
          subsampling_key: Subsample by either "trees" or "arcs".
          selective_sharing_feature_loader: Callback to compute selective
            sharing features. First two arguments must be language and POS tags.
        """
        # Parse conllu file.
        self.examples = []
        arcs = 0
        trees = 0
        for i, ex in enumerate(read_conllu(filename)):
            # Throw out trees that don't meet the length requirements.
            if min_tree_len and len(ex) < min_tree_len:
                continue
            if max_tree_len and len(ex) > max_tree_len:
                continue

            # Store the index of the tree as "tid" so that we can
            # recover the original order. (This is necessary to
            # compare to the gold annotations file.)
            ex = dict(tid=i,
                      lang=self.lang,
                      words=[t['form'] for t in ex],
                      pos=[t['upostag'].lower() for t in ex],
                      deprels=[t['deprel'].lower() for t in ex],
                      heads=[t['head'] for t in ex])
            self.examples.append(ex)
            arcs += len(ex['words'])
            trees += 1

        # If subsample is set, subsample up to N arcs or trees.
        if subsampling_key not in ['arcs', 'trees']:
            raise ValueError('Unknown subsampling key %s' % subsampling_key)
        total = arcs if subsampling_key == 'arcs' else trees
        if subsample and total > subsample:
            kept = []
            arcs = 0
            trees = 0
            np.random.shuffle(self.examples)
            for ex in self.examples:
                total = arcs if subsampling_key == 'arcs' else trees
                if total >= subsample:
                    break
                kept.append(ex)
                arcs += len(ex['words'])
                trees += 1
            self.examples = kept

        self.num_examples = len(self.examples)
        logger.info('Loaded %d arcs and %d trees from %s' %
                    (arcs, self.num_examples, filename))

        # Compute selective sharing features, if function provided.
        # This is done using multiprocessing for speed.
        if selective_sharing_feature_loader is not None:
            logger.info('Computing selective sharing features...')
            selective_sharing_feature_loader = functools.partial(
                selective_sharing_feature_loader, self.lang)
            # Use (and properly close) a pool of worker processes.
            with multiprocessing.Pool(
                    max(1, multiprocessing.cpu_count() - 1)) as workers:
                features = workers.map(selective_sharing_feature_loader,
                                       (ex['pos'] for ex in self.examples))
            for f, ex in zip(features, self.examples):
                ex['selective'] = torch.from_numpy(f)

        self.num_examples = len(self.examples)
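A hedged usage sketch: the class that owns load_from_conllu is not shown, so the name Treebank, its constructor signature, and the file path below are assumptions for illustration only.

# Hypothetical usage; assumes a Treebank class whose constructor sets self.lang.
treebank = Treebank(lang='en')
treebank.load_from_conllu(
    'en_ewt-ud-train.conllu',   # illustrative UD treebank path
    max_tree_len=40,
    subsample=100000,           # keep roughly 100k arcs
    subsampling_key='arcs')
print(treebank.num_examples)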