def main():
    """Collect n-grams of the words adjacent to known names; save the top ones.

    The corpus comes from ``get_corpus()``; when a command-line argument is
    present, ``sys.argv[1]`` is passed through to it.  The text is cut into
    phrases at blank lines and at ``.``, ``;``, ``?`` and ``!``; every phrase
    is cut into words, and the words are walked as consecutive triples.
    Whenever the lower-cased middle word of a triple is in ``get_names()``,
    the 3-, 4- and 5-grams of the lower-cased neighbouring words are counted
    (a neighbour is only counted when it is longer than three characters),
    with separate counters for the preceding and the following word.

    The result is written to ``ngrams.dat``: the first item of every entry
    returned by ``top()`` for the "previous" counters, a ``--`` line, then
    the same for the "next" counters.  Each item is written UTF-8 encoded on
    its own line.

    NOTE(review): this is Python 2 code (``print >> f`` statement form).
    """
    sizes = (3, 4, 5)
    triplewise = ngrams_gen(3)
    # Build the per-size n-gram generators once instead of on every match.
    grams_of = [ngrams_gen(size) for size in sizes]

    if len(sys.argv) > 1:
        content = get_corpus(sys.argv[1])
    else:
        content = get_corpus()
    names = get_names()

    # One counter per n-gram size, for each side of the name.
    prevs = [arity_dict() for _ in sizes]
    nexts = [arity_dict() for _ in sizes]

    for phrase in re.split(r"\n\n|[.;?!]", content):
        # '-' is placed last in the class so that it is a literal hyphen.
        # Written between ',' and '^' it forms a range that also matches
        # digits and every uppercase ASCII letter, which would chop
        # capitalised words apart before they can be compared with `names`.
        words = no_empties(re.split(r"[ ,^-]", phrase))
        for prev_word, middle, next_word in triplewise(words):
            if middle.lower() not in names:
                continue
            for prev_counts, next_counts, grams in zip(prevs, nexts, grams_of):
                if len(prev_word) > 3:
                    prev_counts.count(grams(prev_word.lower()))
                if len(next_word) > 3:
                    next_counts.count(grams(next_word.lower()))

    with open("ngrams.dat", "w") as f:
        for counts in prevs:
            for line in top(counts):
                print >> f, line[0].encode("utf-8")
        # Separator between the "previous word" and "next word" sections.
        print >> f, "--"
        for counts in nexts:
            for line in top(counts):
                print >> f, line[0].encode("utf-8")
def check_ngrams(text, ngrams_set=None):
    """Return the positions of the words in ``text`` that hit ``ngrams_set``.

    ``text`` is iterated as a sequence of words.  ``ngrams_set`` is tested
    for membership of ``(ngram, marker)`` pairs, where the marker is one of
    ``START``, ``END`` or ``MIDDLE``; when it is ``None`` the set is obtained
    from ``retrieve_ngrams()``.

    For every word and every size in 3, 4, 5 (sizes longer than the word are
    skipped):

    * the word's position is recorded once if its first ``size`` characters
      are listed with ``START`` or its last ``size`` characters are listed
      with ``END``;
    * the position is recorded once more for each n-gram produced by
      ``ngrams_gen(size)`` over the word without its first and last
      character that is listed with ``MIDDLE``.

    The same position can therefore appear several times in the returned
    list; entries are in the order the matches were found.
    """
    if ngrams_set is None:
        ngrams_set = retrieve_ngrams()

    hits = []
    for position, word in enumerate(text):
        for size in (3, 4, 5):
            windows = ngrams_gen(size)
            if len(word) < size:
                continue

            prefix = word[:size]
            suffix = word[-size:]
            if (prefix, START) in ngrams_set or (suffix, END) in ngrams_set:
                hits.append(position)

            # Interior n-grams: the outer characters are stripped first.
            hits.extend(
                position
                for gram in windows(word[1:-1])
                if (gram, MIDDLE) in ngrams_set
            )
    return hits