# Python 2 snippets (the redshift parser is a Python 2 library). The
# functions below assume imports along these lines:
import os
import time
import codecs
import cProfile
import pstats

import redshift.parser
import redshift.tagger
from redshift.sentence import Input


def main(parser_dir, text_loc, out_dir, codec="utf8", profile=False,
         debug=False):
    # Benchmark loop: re-read and re-parse the same text 1000 times,
    # reporting wall-clock time and memory use on each pass.
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
    for i in range(1000):
        with codecs.open(text_loc, 'r', 'utf8') as file_:
            input_text = file_.read()
        sentences = [Input.from_pos(p.strip().encode(codec))
                     for p in input_text.split('\n') if p.strip()]
        t1 = time.time()
        parse(parser, sentences)
        t2 = time.time()
        print '%d sents took %0.3f ms. %s mem' % (len(sentences),
                                                  (t2 - t1) * 1000.0, mem())
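# The benchmark above calls two helpers that are not defined in these
# snippets. Minimal sketches under assumed behaviour: parse() just runs
# the parser over each sentence, and mem() reports resident memory via
# the standard `resource` module (both bodies are assumptions, not the
# original code):
import resource


def parse(parser, sentences):
    for sent in sentences:
        parser.parse(sent)


def mem():
    # ru_maxrss is reported in kilobytes on Linux
    return '%d kB' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss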
def main(parser_dir, text_loc, out_dir, codec="utf8", profile=False,
         debug=False):
    # Parse a file of POS-tagged sentences and write CoNLL output, with
    # an optional cProfile run in place of simple wall-clock timing.
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
    with codecs.open(text_loc, 'r', 'utf8') as file_:
        input_text = file_.read()
    sentences = [Input.from_pos(p.strip().encode(codec))
                 for p in input_text.split('\n') if p.strip()]
    if profile:
        cProfile.runctx("parse(parser, sentences)", globals(), locals(),
                        "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        t1 = time.time()
        parse(parser, sentences)
        t2 = time.time()
        print '%d sents took %0.3f ms' % (len(sentences), (t2 - t1) * 1000.0)
    with open(os.path.join(out_dir, 'parses'), 'w') as out_file:
        for sentence in sentences:
            out_file.write(sentence.to_conll())
            out_file.write('\n\n')
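# A sketch of how the function above might be invoked, assuming one
# word/POS sentence per line in the input file (paths are illustrative):
#
#   main('models/parser', 'tagged_sents.txt', 'out')
#
# which writes one blank-line-separated CoNLL block per sentence to
# out/parses.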
def main(model_dir, train_loc, dev_loc, iters=5, n_sents=0, feat_thresh=5,
         beam_width=4):
    # Train the redshift tagger, then report tagging accuracy on dev data.
    # Tokens are tagged word|POS in the files; normalise to the word/POS
    # format redshift expects.
    sent_strs = open(train_loc).read().strip().replace('|', '/').split('\n')
    # Apply sentence limit, if any
    if n_sents != 0:
        sent_strs = sent_strs[:n_sents]
    tagger = redshift.tagger.train('\n'.join(sent_strs), model_dir,
                                   beam_width=beam_width, nr_iter=iters,
                                   feat_thresh=feat_thresh)
    dev_input = [Input.from_pos(s.replace('|', '/'))
                 for s in open(dev_loc).read().strip().split('\n')]
    # Accuracy: t is seeded with a tiny float to avoid division by zero
    t = 1e-100
    c = 0
    for sent in dev_input:
        gold_tags = [tok.tag for tok in sent.tokens]
        tagger.tag(sent)
        for i, token in enumerate(sent.tokens):
            c += gold_tags[i] == token.tag
            t += 1
    print c / t
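# The training and dev files are assumed to hold one sentence per line,
# with tokens tagged as word|POS (normalised to word/POS above), e.g.:
#
#   Pierre|NNP Vinken|NNP will|MD join|VB the|DT board|NN .|.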
from textblob import TextBlob


def postag(rvdata):
    # Demo: tag one review with TextBlob, then dependency-parse it with
    # redshift. `rvdata` is assumed to be a sequence of review records
    # whose fourth column holds the review text.
    piece = rvdata[0][3]
    blob = TextBlob(piece)
    print blob
    # Join the last sentence's (word, tag) pairs into the word/POS format
    # that Input.from_pos() expects.
    tagstc = ' '.join(str(word) + '/' + str(tag)
                      for word, tag in blob.sentences[-1].pos_tags)
    parser = redshift.parser.Parser('models')
    print blob.sentences[0]
    testsent = Input.from_pos(tagstc)
    parser.parse(testsent)
    print testsent.to_conll()
def sentence():
    # Test helper: a five-token sentence with placeholder '??' POS tags
    # (presumably used as a pytest fixture in the original test suite).
    from redshift.sentence import Input
    return Input.from_pos('This/?? is/?? a/?? test/?? ./.')
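# A minimal sketch of a test built on the helper above (assumes pytest
# conventions and that Input.tokens yields only the five word tokens):
def test_sentence_words():
    sent = sentence()
    assert [tok.word for tok in sent.tokens] == ['This', 'is', 'a', 'test', '.']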
def chunk(self, text, postagged=False, sent_tokenized=False,
          output_tags=False, split_words=False):
    # Extract noun-phrase chunks from text. This is a method of a chunker
    # class that holds a redshift parser as self.parser; `nltk`,
    # `postagger`, `reverse_map_paren`, LEFT_PAREN and RIGHT_PAREN come
    # from the surrounding module.
    if sent_tokenized:
        sentences = text
    else:
        sentences = nltk.sent_tokenize(text)
    if not postagged:
        sentences = [' '.join('/'.join(word_pos)
                              for word_pos in postagger.tag(sent))
                     for sent in sentences]
    else:
        # Sentences are already POS-tagged. They may arrive either as
        # 'sent/NN 1/CD ./.' strings (no change required) or as
        # [('sent', 'NN'), ('1', 'CD'), ('.', '.')] tuple lists, which
        # must be converted to the string format.
        if len(sentences) > 0 and not isinstance(sentences[0], basestring):
            sentences = [' '.join('/'.join(word_pos) for word_pos in sent)
                         for sent in sentences]
    # Convert into redshift sentence objects
    sentences = [Input.from_pos(sent) for sent in sentences]
    for sentence in sentences:
        # This stores the dependency parse on each sentence object
        self.parser.parse(sentence)
    result = []
    for sentence in sentences:
        chunks = []
        if split_words:
            noun_phrase = []
        else:
            noun_phrase = ''
        noun_head_idx = None
        # Walk the tokens right-to-left, growing a phrase while each token
        # attaches directly to the current noun head.
        for token in reversed(list(sentence.tokens)):
            idx = token.id
            word = reverse_map_paren(token.word)
            pos = token.tag
            parent = token.head
            if word in RIGHT_PAREN:
                continue
            if parent == noun_head_idx and word not in LEFT_PAREN:
                # Token modifies the current head: prepend it to the phrase
                if output_tags:
                    if split_words:
                        noun_phrase[0:0] = [(str(word), str(pos))]
                    else:
                        noun_phrase = str(word) + '/' + str(pos) + ' ' + noun_phrase
                else:
                    if split_words:
                        noun_phrase[0:0] = [str(word)]
                    else:
                        noun_phrase = str(word) + ' ' + noun_phrase
            else:
                # Close off any phrase in progress
                if noun_phrase:
                    chunks[0:0] = [noun_phrase]
                    noun_phrase = None
                    noun_head_idx = None
                # A noun starts a new phrase with itself as the head
                if pos.startswith('NN'):
                    if output_tags:
                        if split_words:
                            noun_phrase = [(str(word), str(pos))]
                        else:
                            noun_phrase = str(word) + '/' + str(pos)
                    else:
                        if split_words:
                            noun_phrase = [str(word)]
                        else:
                            noun_phrase = word
                    noun_head_idx = idx
        if noun_phrase:
            chunks[0:0] = [noun_phrase]
        if sent_tokenized:
            # Input was a list of sentences: output chunks grouped by sentence
            result.append(chunks)
        else:
            # Input was plain text: output the chunks as one flat list
            result.extend(chunks)
    return result
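# A usage sketch for the method above. `Chunker` is a hypothetical class
# name for whatever wires up self.parser and the module-level helpers,
# and the output assumes the parser attaches determiners and adjectives
# directly to their noun heads:
#
#   chunker = Chunker()
#   chunker.chunk('The quick brown fox jumped over the lazy dog.')
#   # -> ['The quick brown fox', 'the lazy dog']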