def features_for(eg):
    """Build prefixed token features for one example.

    Tokens obtained from ``tokenise_parse.tokens_for(eg, 1)`` are prefixed
    with ``"s1_"`` and those from ``tokenise_parse.tokens_for(eg, 2)`` with
    ``"s2_"``.

    Returns:
        A single list holding all ``s1_`` features followed by all ``s2_``
        features, each group in the order the tokens were produced.
    """
    # NOTE(review): other call sites visible in this file pass a third
    # (parse mode) argument to tokens_for; this one relies on whatever
    # tokens_for does when it is omitted -- confirm that is intended.
    first_features = ["s1_" + tok for tok in tokenise_parse.tokens_for(eg, 1)]
    second_features = ["s2_" + tok for tok in tokenise_parse.tokens_for(eg, 2)]
    return first_features + second_features
#!/usr/bin/env python
# Print summary statistics for a stream of JSON examples.
#
# Reads one JSON document per line from stdin, tokenises items 1 and 2 of
# each example with tokenise_parse.tokens_for (presumably the two sentences
# of a pair -- confirm against tokenise_parse), then prints:
#   * the 0/25/50/75/100th percentiles of the token counts for each item
#   * the 30 most frequent tokens across both items
#
# The first command-line argument is the parse mode handed to tokens_for.
from collections import Counter
import json
import sys

import numpy as np

import tokenise_parse

# Forwarded unchanged as the third argument of every tokens_for call.
PARSE_MODE = sys.argv[1]


def quantiles(v):
    """Return the 0th, 25th, 50th, 75th and 100th percentiles of ``v``."""
    return np.percentile(v, np.linspace(0, 100, 5))


token_freq = Counter()
s1_lengths = []
s2_lengths = []

for line in sys.stdin:
    # Skip blank lines (e.g. a trailing newline at end of input) rather than
    # letting json.loads raise on them and abort the whole run.
    if not line.strip():
        continue
    eg = json.loads(line)
    s1_tokens = tokenise_parse.tokens_for(eg, 1, PARSE_MODE)
    s1_lengths.append(len(s1_tokens))
    s2_tokens = tokenise_parse.tokens_for(eg, 2, PARSE_MODE)
    s2_lengths.append(len(s2_tokens))
    token_freq.update(s1_tokens)
    token_freq.update(s2_tokens)

# Each print call takes exactly one argument so the output is identical
# whether ``print`` is the Python 2 statement or the Python 3 function.
print("s1_lengths quantiles %s" % (quantiles(s1_lengths),))
print("s2_lengths quantiles %s" % (quantiles(s2_lengths),))
print(token_freq.most_common(30))
def tokens_in_sentences(eg, parse_mode):
    """Return the tokens for items 1 and 2 of ``eg`` as a 2-tuple.

    Each element is whatever ``tokenise_parse.tokens_for`` returns for the
    corresponding index, with ``parse_mode`` passed through unchanged.
    """
    first_tokens = tokenise_parse.tokens_for(eg, 1, parse_mode)
    second_tokens = tokenise_parse.tokens_for(eg, 2, parse_mode)
    return (first_tokens, second_tokens)