def repl(model, lm, emissions, cfd):
    """Interactive read-tag-print loop.

    Reads a line of text from stdin, tokenizes and POS-tags it, lemmatizes,
    then tags it with the selected HMM model and prints the result.

    Args:
        model: one of "unigram", "bigram", "trigram" — selects the tagger.
        lm: language model passed to the bigram/trigram decoders.
        emissions: emission probabilities for the decoders.
        cfd: conditional frequency distribution used by all three models.

    Loops until EOF (Ctrl-D) or keyboard interrupt.
    """
    # Loop-invariant: the Stanford tagger does not depend on the input line,
    # so fetch it once instead of on every iteration.
    tagger = stanford.get_tagger()
    while True:
        try:
            line = input('>')
        # Was a bare `except:`, which silently swallowed *every* exception
        # (including SystemExit). Catch only the two expected ways to leave.
        except (EOFError, KeyboardInterrupt):
            break
        line = line.strip()
        sentences = nltk.sent_tokenize(line)
        s_tokenized = [nltk.word_tokenize(sent) for sent in sentences]
        # Flatten the per-sentence token lists into one token stream.
        tokenized = []
        for sent in s_tokenized:
            tokenized.extend(sent)
        postags = [t.lower() for (w, t) in tagger.tag(tokenized)]
        # maybe_lemmatize works on a list of sentences; we pass one and
        # take the single result back out.
        sss = learn.maybe_lemmatize([tokenized], 'en', tt_home)
        lemmas = sss[0]
        ss = list(map(nltk.tag.tuple2str, zip(lemmas, postags)))
        print(" ".join(ss))
        # Dispatch on the model name. The original used `if`/`if`/`elif`,
        # which left `tagged` unbound (NameError) for an unknown model name.
        if model == "unigram":
            tagged = skinnyhmm.mfs(cfd, ss)
        elif model == "bigram":
            tagged = skinnyhmm.viterbi(lm, emissions, cfd, ss)
        elif model == "trigram":
            tagged = searches.beam(lm, emissions, cfd, ss, beamwidth=BEAMWIDTH)
        else:
            print("unknown model:", model)
            continue
        print(tagged)
def main():
    """Entry point: POS-tag every sentence of --sourcetext and write the
    tagged sentences to <sourcetext>.pretagged, one sentence per line.
    """
    argparser = argparse.ArgumentParser(description="clwsd")
    argparser.add_argument("--sourcetext", type=str, required=True)
    argparser.add_argument("--taggerhome", type=str, required=True)
    args = argparser.parse_args()

    # The stanford module reads its tagger location from a module attribute.
    stanford.taggerhome = args.taggerhome
    sourcefn = args.sourcetext
    tagger = stanford.get_tagger()

    # One whitespace-tokenized sentence per input line.
    with open(sourcefn) as infile:
        sents = [line.strip().split() for line in infile]

    # NOTE(review): batch_tag was renamed tag_sents in newer NLTK releases —
    # confirm against the pinned nltk version before upgrading.
    tagged_sents = tagger.batch_tag(sents)
    print("tagged.")

    with open(sourcefn + ".pretagged", "w") as outfile:
        for tagged_sent in tagged_sents:
            joined = " ".join(list(map(nltk.tag.tuple2str, tagged_sent)))
            print(joined, file=outfile)
def extract_wsd_problems(fn):
    """Parse the XML file at *fn* and return a list of WSDProblem objects,
    each with its sentence POS-tagged (stored on problem.tagged).

    Args:
        fn: path to the evaluation XML file.

    Returns:
        list[WSDProblem] in document order, each with `.tagged` set.

    Raises:
        ValueError: if the tagger returns a different number of sentences
            than were extracted (was an `assert`, which disappears under -O).
    """
    handler = SentenceExtractor()
    parser = make_parser()
    parser.setContentHandler(handler)
    parser.parse(fn)

    # head_count is produced by the extractor but unused here.
    out = [
        WSDProblem(lexelt, context, instance_id=inst, testset=True)
        for (lexelt, _head_count, context, inst) in handler.sentences
    ]

    # Tag every extracted sentence in one batch call.
    sents = [problem.tokenized for problem in out]
    tagger = stanford.get_tagger()
    tagged_sents = tagger.batch_tag(sents)
    if len(tagged_sents) != len(out):
        raise ValueError(
            "tagger returned {} sentences for {} problems".format(
                len(tagged_sents), len(out)))
    for tagged_sent, problem in zip(tagged_sents, out):
        problem.tagged = tagged_sent
    print("tagged.")
    return out