Example 1
def main(parser_dir,
         text_loc,
         out_dir,
         codec="utf8",
         profile=False,
         debug=False):
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
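    # Re-read and re-parse the same file 1000 times, reporting per-pass time and memory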
    for i in range(1000):
        with codecs.open(text_loc, 'r', 'utf8') as file_:
            input_text = file_.read()
            sentences = [
                Input.from_pos(p.strip().encode(codec))
                for i, p in enumerate(input_text.split('\n')) if p.strip()
            ]
            t1 = time.time()
            parse(parser, sentences)
            t2 = time.time()
            print '%d sents took %0.3f ms. %s mem' % (len(sentences),
                                                      (t2 - t1) * 1000.0,
                                                      mem())
Example 2
def main(model_dir,
         train_loc,
         dev_loc,
         iters=5,
         n_sents=0,
         feat_thresh=5,
         beam_width=4):
    sent_strs = open(train_loc).read().strip().replace('|', '/').split('\n')
    # Apply limit
    if n_sents != 0:
        sent_strs = sent_strs[:n_sents]
    tagger = redshift.tagger.train('\n'.join(sent_strs),
                                   model_dir,
                                   beam_width=beam_width,
                                   nr_iter=iters,
                                   feat_thresh=feat_thresh)
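    # Measure per-token tagging accuracy on the dev set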
    dev_input = [
        Input.from_pos(s.replace('|', '/'))
        for s in open(dev_loc).read().strip().split('\n')
    ]
    t = 1e-100
    c = 0
    for sent in dev_input:
        gold_tags = [tok.tag for tok in sent.tokens]
        tagger.tag(sent)
        for i, token in enumerate(sent.tokens):
            c += gold_tags[i] == token.tag
            t += 1
    print c / t
Example 3
def main(parser_dir, text_loc, out_dir, codec="utf8", profile=False, debug=False):
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
    with codecs.open(text_loc, 'r', 'utf8') as file_:
        input_text = file_.read()
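    # One Input per non-empty line of pre-tagged text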
    sentences = [Input.from_pos(p.strip().encode(codec)) for i, p in
                 enumerate(input_text.split('\n'))
                 if p.strip()]
    if profile:
        cProfile.runctx("parse(parser, sentences)",
                        globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        t1 = time.time()
        parse(parser, sentences)
        t2 = time.time()
        print '%d sents took %0.3f ms' % (len(sentences), (t2-t1)*1000.0)

    with open(os.path.join(out_dir, 'parses'), 'w') as out_file:
        for sentence in sentences:
            out_file.write(sentence.to_conll())
            out_file.write('\n\n')
Example 4
def main(parser_dir,
         text_loc,
         out_dir,
         codec="utf8",
         profile=False,
         debug=False):
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
    with codecs.open(text_loc, 'r', 'utf8') as file_:
        input_text = file_.read()
    sentences = [
        Input.from_pos(p.strip().encode(codec))
        for i, p in enumerate(input_text.split('\n')) if p.strip()
    ]
    if profile:
        cProfile.runctx("parse(parser, sentences)", globals(), locals(),
                        "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        t1 = time.time()
        parse(parser, sentences)
        t2 = time.time()
        print '%d sents took %0.3f ms' % (len(sentences), (t2 - t1) * 1000.0)

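    # Write each parse in CoNLL format, separated by blank lines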
    with open(os.path.join(out_dir, 'parses'), 'w') as out_file:
        for sentence in sentences:
            out_file.write(sentence.to_conll())
            out_file.write('\n\n')
Example 5
def main(parser_dir, text_loc, out_dir, codec="utf8", profile=False, debug=False):
    if debug:
        redshift.parser.set_debug(debug)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    print "Loading parser"
    parser = redshift.parser.Parser(parser_dir)
    for i in range(1000):
        with codecs.open(text_loc, 'r', 'utf8') as file_:
            input_text = file_.read()
            sentences = [Input.from_pos(p.strip().encode(codec)) for i, p in
                     enumerate(input_text.split('\n'))
                     if p.strip()]
            t1 = time.time()
            parse(parser, sentences)
            t2 = time.time()
            print '%d sents took %0.3f ms. %s mem' % (len(sentences), (t2-t1)*1000.0,
                                                      mem())
Example 6
def postag(rvdata):
    piece = rvdata[0][3]
    test = TextBlob(piece)
    print test
    # print [str(x[0] + '/' + x[1]) for x in test.sentences[0].pos_tags]
    tagstc = ' '.join(
        [str(x[0]) + '/' + str(x[1]) for x in test.sentences[-1].pos_tags])
    parser = redshift.parser.Parser('models')
    # print tagstc
    print test.sentences[0] + ''
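    # Build a Redshift Input from the word/TAG string, parse it in place, and print the CoNLL output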
    testsent = Input.from_pos(
        tagstc)  #['A', 'list', 'of', 'tokens', 'is', 'required', '.'])
    # testsent = Input.from_untagged(test.sentences[0].words)
    parser.parse(testsent)
    print testsent.to_conll()  #.split('\n')
    # sentence = Input.from_untagged(
    # parser.parse(test)
Example 7
def main(model_dir, train_loc, dev_loc, iters=5, n_sents=0, feat_thresh=5, beam_width=4):
    sent_strs = open(train_loc).read().strip().replace('|', '/').split('\n')
    # Apply limit
    if n_sents != 0:
        sent_strs = sent_strs[:n_sents]
    tagger = redshift.tagger.train('\n'.join(sent_strs), model_dir,
        beam_width=beam_width, nr_iter=iters, feat_thresh=feat_thresh)
    dev_input = [Input.from_pos(s.replace('|', '/'))
                 for s in open(dev_loc).read().strip().split('\n')]
    t = 1e-100
    c = 0
    for sent in dev_input:
        gold_tags = [tok.tag for tok in sent.tokens]
        tagger.tag(sent)
        for i, token in enumerate(sent.tokens):
            c += gold_tags[i] == token.tag
            t += 1
    print c / t
Example 8
 def __init__(self, loc):
     self.sents = []
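     # One token list per blank-line-separated CoNLL sentence block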
     for i, sent_str in enumerate(open(loc).read().strip().split('\n\n')):
         if not sent_str.strip():
             continue
         self.sents.append(list(Input.from_conll(sent_str).tokens))
Example 9
def sentence():
    from redshift.sentence import Input
    return Input.from_pos('This/?? is/?? a/?? test/?? ./.')
Example 10
def sentence():
    from redshift.sentence import Input
    return Input.from_untagged('This is a test .')
Example 11
 def chunk(self,
           text,
           postagged=False,
           sent_tokenized=False,
           output_tags=False,
           split_words=False):
     if sent_tokenized:
         sentences = text
     else:
         sentences = nltk.sent_tokenize(text)
     if not postagged:
         sentences = [
             ' '.join('/'.join(word_pos)
                      for word_pos in postagger.tag(sent))
             for sent in sentences
         ]
     else:
         # Sentences are postagged. It can be ['sent/NN 1/CD ./.'] format (no change required) or
         # [('sent','NN'), ('1','CD'), ('.','.')] (change to the earlier format is required)
         if len(sentences) > 0 and not (isinstance(sentences[0], str) or
                                        isinstance(sentences[0], unicode)):
             sentences = [
                 ' '.join('/'.join(word_pos) for word_pos in sent)
                 for sent in sentences
             ]
     # Convert into Redshift sentence object
     sentences = [Input.from_pos(sent) for sent in sentences]
     for sentence in sentences:
         # This will store the depparse result in each sentence object
         self.parser.parse(sentence)
     result = []
     for sentence in sentences:
         chunks = []
         if split_words:
             noun_phrase = []
         else:
             noun_phrase = ''
         noun_head_idx = None
         #length = sentence.length
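         # Walk tokens right-to-left, prepending each token that attaches to the current NN head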
         for token in reversed(list(sentence.tokens)):
             idx = token.id
             word = token.word
             pos = token.tag
             parent = token.head
             #rel = token.label
             word = reverse_map_paren(word)
             if word in RIGHT_PAREN:
                 continue
             if parent == noun_head_idx and word not in LEFT_PAREN:
                 if output_tags:
                     if split_words:
                         noun_phrase[0:0] = [(str(word), str(pos))]  # prepend as a (word, tag) pair
                     else:
                         noun_phrase = str(word) + '/' + str(
                             pos) + ' ' + noun_phrase
                 else:
                     if split_words:
                         noun_phrase[0:0] = [str(word)]  # prepend the word to the phrase
                     else:
                         noun_phrase = str(word) + ' ' + noun_phrase
             else:
                 if noun_phrase:
                     chunks[0:0] = [noun_phrase]
                     noun_phrase = None
                     noun_head_idx = None
                 if pos.startswith('NN'):
                     if output_tags:
                         if split_words:
                             noun_phrase = [(str(word), str(pos))]
                         else:
                             noun_phrase = str(word) + '/' + str(pos)
                     else:
                         if split_words:
                             noun_phrase = [str(word)]
                         else:
                             noun_phrase = word
                     noun_head_idx = idx
         if noun_phrase:
             chunks[0:0] = [noun_phrase]
         if sent_tokenized:
             # Input is a list of sentences, output the chunks grouped by sentences
             result.append(chunks)
         else:
             # Input is a plain text, output the chunks as one long list
             result.extend(chunks)
     return result