def eval_per_sent(pred, gold, ignore_punct=True):
    """Evaluate UAS for each sentence."""
    uas = []
    for tpred, tgold in zip(read_conllu(pred), read_conllu(gold)):
        assert len(tpred) == len(tgold)
        n = 0
        n_correct = 0
        for i in range(len(tgold)):
            # Skip punctuation tokens if requested.
            if ignore_punct:
                if tgold[i]['upostag'] == "PUNCT":
                    continue
            n += 1
            if tpred[i]['head'] == tgold[i]['head']:
                n_correct += 1
        # Sentences with no scoring tokens count as perfect.
        uas.append(float(n_correct) / n if n != 0 else 1.0)
    return np.asarray(uas)
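

# Usage sketch (hypothetical helper, not part of the original module). It
# assumes `read_conllu` yields one list of token dicts per sentence, each
# exposing 'upostag' and 'head' fields as used above, and that `np` is numpy.
def example_report_uas(pred_path, gold_path):
    """Print per-sentence (macro-averaged) UAS for a predicted conllu file."""
    per_sent_uas = eval_per_sent(pred_path, gold_path)
    print('Sentences evaluated: %d' % len(per_sent_uas))
    print('Macro-averaged UAS:  %.4f' % per_sent_uas.mean())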
def compute_features(filename):
    """Compute Liu 2010 features from conllu file."""
    deprel_stats = {}
    for tree in read_conllu(filename):
        for tok in tree:
            deprel = tok['deprel']
            if deprel not in deprel_stats:
                deprel_stats[deprel] = {'r': 0, 'l': 0}
            if tok['head'] > tok['id']:
                deprel_stats[deprel]['r'] += 1
            else:
                deprel_stats[deprel]['l'] += 1

    # features = {}  # for named vector
    features = []
    for deprel in DEPRELS:
        if deprel in deprel_stats:
            fval = (float(deprel_stats[deprel]['r']) /
                    (deprel_stats[deprel]['r'] + deprel_stats[deprel]['l']))
            features.append(fval)
        else:
            features.append(.5)
    return features
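

# Usage sketch (hypothetical helper). `DEPRELS` is assumed to be a fixed,
# ordered, module-level list of dependency relation labels, so vectors from
# different treebanks align index-by-index. This simply pairs each ratio
# (fraction of arcs whose head follows the dependent, per the counts above)
# back with its relation name for inspection.
def example_named_liu_features(treebank_path):
    """Return {deprel: directionality ratio} for a conllu treebank."""
    return dict(zip(DEPRELS, compute_features(treebank_path)))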
def compute_features(filename, threshold=3.0, one_hot=True):
    """Compute WALS features from conllu file.

    Args:
      filename: Path to conllu file.
      threshold: A scalar threshold for determining dominance.
        - If a feature has a "No Dominant Order" option, the frequency f(d)
          of order d must be >= threshold x f(d') to be considered the
          dominant order. Otherwise, the feature is marked with
          "No Dominant Order."
        - If a feature does not have a "No Dominant Order" option, order d
          is considered dominant simply if f(d) > f(d').
      one_hot: Return WALS features as a one-hot representation.

    Returns:
      WALS feature vector.
    """
    # Collect WALS statistics (directionalities).
    wals_stats = collections.defaultdict(dict)
    for w, _ in WALS_CONDITIONS.items():
        wals_stats[w]['r'] = 0
        wals_stats[w]['l'] = 0

    # Iterate over parsed trees.
    for tree in read_conllu(filename):
        for tok in tree:
            for w, cond in WALS_CONDITIONS.items():
                # Check that the dependency type matches the condition's
                # relevant dependency type, if any. Skip if not relevant.
                deprel = tok['deprel']
                if 'rel' in cond:
                    if deprel not in cond['rel']:
                        continue

                # Skip tokens whose head is ROOT.
                if tok['head'] == 0:
                    continue

                # Check if the modifier and head tags match the pattern.
                pos_m = tok['upostag']
                pos_h = tree[tok['head'] - 1]['upostag']
                if ("%s/%s" % (pos_h, pos_m) in cond['pos'] or
                        (pos_m == "DET" and "*/DET" in cond['pos'])):
                    # If the head is to the left, this is right-leaning: h-->m.
                    if tok['head'] < tok['id']:
                        wals_stats[w]['r'] += 1
                    # If the head is to the right, it is left-leaning: m<--h.
                    elif tok['head'] > tok['id']:
                        wals_stats[w]['l'] += 1
                    else:
                        raise ValueError('Should not have head as self!')

    # Determine WALS values based on these statistics.
    # Use Laplace smoothing.
    wals_features = collections.defaultdict(dict)
    for w, stats in wals_stats.items():
        if float(stats['r'] + 1) / (stats['l'] + 1) >= threshold:
            direction = 'r'
        elif float(stats['l'] + 1) / (stats['r'] + 1) >= threshold:
            direction = 'l'
        else:
            direction = 'o'

        # Pick the weakly dominant order if "No Dominant Order" is not usable.
        if direction == 'o' and ('o' not in STATS_TO_WALS[w]):
            direction = 'r' if stats['r'] > stats['l'] else 'l'

        wals_features[w] = STATS_TO_WALS[w][direction]

    # Convert to one-hot if specified.
    if one_hot:
        wals_features = flatten(wals_features)

    return wals_features
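

# Usage sketch (hypothetical helper). `WALS_CONDITIONS` and `STATS_TO_WALS`
# are assumed to be module-level tables mapping each WALS feature to its
# head/modifier POS patterns and categorical values, and `flatten` is assumed
# to produce the one-hot encoding. This contrasts the categorical and one-hot
# outputs for a single treebank at the default dominance threshold.
def example_wals_features(treebank_path):
    """Return (categorical WALS values, one-hot WALS vector) for a treebank."""
    categorical = compute_features(treebank_path, one_hot=False)
    one_hot = compute_features(treebank_path, one_hot=True)
    return categorical, one_hot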
def compute_features(filename):
    """Compute Wang and Eisner 2018a features from conllu file."""
    g_uni = {}
    g_bi = {}
    for w in CONTEXT_WINDOW:
        g_uni[w] = collections.defaultdict(list)  # {w: \pi_t^w}
        g_bi[w] = collections.defaultdict(list)   # {w: \pi_{t,s}^w}

    # Gather statistics.
    for ex in read_conllu(filename):
        # Get part-of-speech tags.
        ex_pos = [t['upostag'] for t in ex]

        # Only sentences of <= 40 tokens are considered.
        if len(ex_pos) > 40:
            continue

        for w in CONTEXT_WINDOW:
            for j, s in enumerate(ex_pos):
                # Right context.
                if w > 0:
                    context = ex_pos[j + 1:j + w + 1]
                # Left context.
                else:
                    context = ex_pos[max(j + w, 0):j]

                # Count the frequencies of the tags in the context.
                tag_counts = collections.Counter(context)

                # For each tag type, store its context frequency
                # per window type (unigram + bigram).
                for t in POSSIBLE_TAGS:
                    cnt = tag_counts[t] if t in tag_counts else 0
                    tag_frac = float(cnt) / abs(w)

                    # Map window --> token --> frequencies.
                    g_uni[w][t].append(tag_frac)

                    # Map window --> bigram --> frequencies.
                    g_bi[w][(s, t)].append(tag_frac)

    # Convert to final mean fractions and ratios.
    pi_uni = {}
    pi_bi = {}
    features_uni = []
    features_bi = []
    for w in CONTEXT_WINDOW:
        pi_uni[w] = {}
        pi_bi[w] = {}
        for t in POSSIBLE_TAGS:
            pi_uni[w][t] = statistics.mean(g_uni[w][t])

            # Unigram features are only taken for POSITIVE w.
            if w > 0:
                features_uni.append(pi_uni[w][t])

            # Bigram features are used for bigram:unigram ratios only.
            for s in POSSIBLE_TAGS:
                if (s, t) not in g_bi[w]:
                    pi_bi[w][(s, t)] = 0.0
                else:
                    pi_bi[w][(s, t)] = statistics.mean(g_bi[w][(s, t)])

                if pi_uni[w][t] == 0.0:
                    # If token doesn't exist, default to 1.
                    assert pi_bi[w][(s, t)] == 0.0, ('Unigram frequency is 0'
                                                     ' but bigram is not!')
                    features_bi.append(1.0)
                else:
                    # Bound ratio by 1.
                    features_bi.append(min(pi_bi[w][(s, t)] / pi_uni[w][t], 1))

    # Concat features.
    full_features = features_uni + features_bi
    return full_features
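

# Sanity-check sketch (hypothetical helper). `CONTEXT_WINDOW` is assumed to be
# a list of signed window sizes (negative = left context, positive = right)
# and `POSSIBLE_TAGS` the fixed POS inventory, so by the loops above the
# returned vector has one unigram entry per (positive window, tag) pair
# followed by one bigram-ratio entry per (window, tag, tag) triple.
def example_feature_dimensions(treebank_path):
    """Return the feature vector after checking its expected length."""
    features = compute_features(treebank_path)
    n_uni = sum(1 for w in CONTEXT_WINDOW if w > 0) * len(POSSIBLE_TAGS)
    n_bi = len(CONTEXT_WINDOW) * len(POSSIBLE_TAGS) ** 2
    assert len(features) == n_uni + n_bi
    return features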
def load_from_conllu(
        self,
        filename,
        min_tree_len=1,
        max_tree_len=None,
        subsample=None,
        subsampling_key='arcs',
        selective_sharing_feature_loader=None):
    """Initialize treebank from conllu file.

    Args:
      filename: Path to treebank in conllu format.
      min_tree_len: Skip trees with fewer than min_tree_len tokens.
      max_tree_len: Skip trees with more than max_tree_len tokens.
      subsample: Take a random sample of N examples.
      subsampling_key: Subsample by either "trees" or "arcs".
      selective_sharing_feature_loader: Callback to compute selective
        sharing features. First two arguments must be language and POS tags.
    """
    # Parse conllu file.
    self.examples = []
    arcs = 0
    trees = 0
    for i, ex in enumerate(read_conllu(filename)):
        # Throw out trees that don't meet length requirements.
        if min_tree_len and len(ex) < min_tree_len:
            continue
        if max_tree_len and len(ex) > max_tree_len:
            continue

        # Store the index of the tree as "tid" so that we can
        # recover the original order. (This is necessary to
        # compare to the gold annotations file.)
        ex = dict(tid=i,
                  lang=self.lang,
                  words=[t['form'] for t in ex],
                  pos=[t['upostag'].lower() for t in ex],
                  deprels=[t['deprel'].lower() for t in ex],
                  heads=[t['head'] for t in ex])
        self.examples.append(ex)
        arcs += len(ex['words'])
        trees += 1

    # If subsample is set, subsample up to N arcs or trees.
    if subsampling_key not in ['arcs', 'trees']:
        raise ValueError('Unknown subsampling key %s' % subsampling_key)
    total = arcs if subsampling_key == 'arcs' else trees
    if subsample and total > subsample:
        kept = []
        arcs = 0
        trees = 0
        np.random.shuffle(self.examples)
        for ex in self.examples:
            total = arcs if subsampling_key == 'arcs' else trees
            if total >= subsample:
                break
            kept.append(ex)
            arcs += len(ex['words'])
            trees += 1
        self.examples = kept

    self.num_examples = len(self.examples)
    logger.info('Loaded %d arcs and %d trees from %s' %
                (arcs, self.num_examples, filename))

    # Compute selective sharing features, if function provided.
    # This is done using multiprocessing for speed.
    if selective_sharing_feature_loader is not None:
        logger.info('Computing selective sharing features...')
        workers = multiprocessing.Pool(
            max(1, multiprocessing.cpu_count() - 1))
        selective_sharing_feature_loader = functools.partial(
            selective_sharing_feature_loader, self.lang)
        features = workers.map(selective_sharing_feature_loader,
                               (ex['pos'] for ex in self.examples))
        for f, ex in zip(features, self.examples):
            ex['selective'] = torch.from_numpy(f)
        self.num_examples = len(self.examples)
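

# Usage sketch (hypothetical wrapper). The host class is not shown here, so
# this minimal stand-in only supplies the `lang` attribute that
# `load_from_conllu` reads; everything else behaves as defined above. One of
# the `compute_features` functions above could serve as the
# `selective_sharing_feature_loader` once adapted to take (language, POS tags)
# as its first two arguments.
class ExampleTreebank:
    """Minimal container exposing `lang` plus the loader defined above."""

    # Attach the module-level function defined above as a method.
    load_from_conllu = load_from_conllu

    def __init__(self, lang):
        self.lang = lang

# Hypothetical call (paths and sizes are placeholders):
# treebank = ExampleTreebank('en')
# treebank.load_from_conllu('train.conllu', max_tree_len=40,
#                           subsample=100000, subsampling_key='arcs')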