def transcripts_to_vocabulary(target, source, env):
    """Build a token frequency list from transcript files and write it out.

    Every node in ``source`` is opened via ``meta_open(node.rstr())`` and read
    line by line.  A line that starts with ``[`` and has a closing ``]`` later
    on the same line is dropped in its entirety; every other line is split on
    whitespace and each resulting token is counted.  The accumulated
    ``FrequencyList`` is serialized with its ``format()`` method and written
    to ``target[0].rstr()``.

    ``env`` is accepted but not used.  Always returns ``None``.
    """
    # Anchored at line start; matches when the line opens with a [...] span.
    bracketed_line = re.compile(r"^\[.*\]\s*")

    counts = FrequencyList()
    for transcript in source:
        with meta_open(transcript.rstr()) as handle:
            for row in handle:
                if bracketed_line.match(row):
                    # Whole line is skipped, including any text after the "]".
                    continue
                for token in row.split():
                    counts[token] = counts.get(token, 0) + 1

    with meta_open(target[0].rstr(), "w") as out:
        out.write(counts.format())
    return None
def plot_reduction(target, source, env):
    """Plot cumulative type-based OOV reduction for each expansion file.

    Presumably an SCons builder action (the ``target, source, env`` signature
    and the ``rstr()`` / ``read()`` calls suggest so -- confirm with caller).

    Layout of ``source``:
      * ``source[0:-3]`` -- expansion files, one candidate word per line.
        Each line must split into exactly two whitespace-separated fields
        (the word and a second value that is not used here); any other
        field count raises ``ValueError``.  The file's base name without
        extension must be one of ``morph``, ``lm``, ``lm_avg`` or
        ``lm_morph`` (otherwise ``KeyError``); the name of its parent
        directory is used as the method label in the legend.
      * ``source[-3]``   -- in-vocabulary frequency list.
      * ``source[-2]``   -- the other frequency list; it is joined with the
        in-vocabulary list to form the full vocabulary.
      * ``source[-1]``   -- node whose ``read()`` returns a mapping with an
        integer ``"bins"`` entry.

    For each expansion file the candidates are cut into ``bins`` equal-size
    consecutive slices and the running number of candidates found in the
    full vocabulary is recorded after each slice.  That curve, expressed as
    a percentage of the OOV type count, is plotted; the figure is saved to
    ``target[0].rstr()``.

    At least one expansion file is required: the axis ticks are derived from
    ``bin_size``, which is only bound inside the per-file loop.

    ``env`` is accepted but not used.  Always returns ``None``.
    """
    args = source[-1].read()
    bins = args["bins"]

    with meta_open(source[-3].rstr()) as in_voc_fd, \
            meta_open(source[-2].rstr()) as all_voc_fd:
        in_vocabulary = FrequencyList(in_voc_fd).make_conservative()
        other_vocabulary = FrequencyList(all_voc_fd).make_conservative()
        all_vocabulary = other_vocabulary.join(in_vocabulary)
        num_iv_types = len(in_vocabulary)
        num_iv_tokens = sum([all_vocabulary.get(x, 0) for x in in_vocabulary])
        num_types = len(all_vocabulary)
        num_tokens = sum(all_vocabulary.values())
        num_oov_types = num_types - num_iv_types
        logging.info("%d/%d in-vocabulary types", num_iv_types, num_types)
        logging.info("%d/%d in-vocabulary tokens", num_iv_tokens, num_tokens)

    # Legend text for each recognized expansion-file base name.
    labels = {
        "morph": "just Morfessor",
        "lm": "reranking by ngrams",
        "lm_avg": "reranking by ngram average",
        "lm_morph": "reranking by boundary-ngrams",
    }

    pyplot.figure(figsize=(8, 7))
    for expansion_fname in source[0:-3]:
        expansion_path = expansion_fname.rstr()
        stem = os.path.splitext(os.path.basename(expansion_path))[0]
        method = os.path.dirname(expansion_path).split("/")[-1]
        name = "%s - %s" % (method, labels[stem])

        with meta_open(expansion_path) as expansion_fd:
            # The two-name unpacking enforces exactly two fields per line.
            expansions = [(w, p) for w, p in [x.strip().split() for x in expansion_fd]]

        # Floor division: bin_size is used as a slice bound and must stay an
        # int even under true division.  When len(expansions) is not a
        # multiple of bins, the trailing remainder is never examined.
        bin_size = len(expansions) // bins

        good_tokens = 0
        good_types = 0
        type_based = numpy.empty(shape=(bins + 1))
        type_based[0] = 0.0  # the curve starts at zero recovered types
        for i in range(bins):
            # NOTE(review): membership is tested against the full vocabulary,
            # so a candidate that is already in-vocabulary would also count
            # while the curve is normalized by OOV types -- confirm that
            # expansion files never contain in-vocabulary words.
            correct = [x for x in expansions[i * bin_size:(i + 1) * bin_size]
                       if x[0] in all_vocabulary]
            good_types += len(correct)
            good_tokens += sum([all_vocabulary.get(x[0], 0) for x in correct])
            type_based[i + 1] = good_types
        logging.info("%d recovered types", good_types)
        logging.info("%d recovered tokens", good_tokens)

        # Report the value at the middle of the curve; the index must be an
        # int, hence floor division.
        midpoint = type_based.shape[0] // 2
        logging.info("%s at %d, %d/%d recovered types", name,
                     midpoint * bin_size, type_based[midpoint], num_oov_types)
        pyplot.plot(100 * type_based / float(num_oov_types), label=name)

    pyplot.ylabel("% OOV reduction")
    pyplot.legend(loc="lower right", fontsize=10)
    # NOTE(review): ticks use bin_size from the last expansion file processed,
    # so this presumably assumes all expansion files have the same length.
    pyplot.xticks([x * bin_size for x in range(11)],
                  [(x * bin_size) // 10 for x in range(11)])
    # Fixed y axis: ten ticks evenly spaced from 0 to 35 (percent).
    yinc = 35.0 / 9
    pyplot.yticks([x * yinc for x in range(10)],
                  ["%d" % (x * yinc) for x in range(10)])
    pyplot.grid()
    pyplot.xlabel("1000s of words")
    pyplot.savefig(target[0].rstr())
    # Reset pyplot's current axes and figure after saving.
    pyplot.cla()
    pyplot.clf()
    return None