Example #1
def parse_parser_results(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """

    data = Data()

    state = STATE_START
    #for line in re.split("\r\n(?![^\[]*\])",text):
    for line in re.split("\r\n", text):
        line = line.strip()

        if line == 'NLP>':
            break
        if line.startswith("Sentence #"):
            state = STATE_TEXT

        elif state == STATE_TEXT:
            Data.newSen()
            data.addText(line)
            state = STATE_WORDS

        elif state == STATE_WORDS:
            if len(line) == 0:
                continue
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' %
                                line)
            for s in WORD_PATTERN.findall(line):
                t = parse_bracketed(s)
                if t[0] == '': continue
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
            state = STATE_TREE
            parsed = []

        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                parsed = " ".join(parsed)
                #data.addTree(Tree.parse(parsed))
            else:
                parsed.append(line)

        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                pass
                '''
                # don't need here
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    rel, l_lemma, r_lemma = split_entry
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
                    l_lemma, l_index = m.group('lemma'), m.group('index')
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', r_lemma)
                    r_lemma, r_index = m.group('lemma'), m.group('index')

                    data.addDependency( rel, l_lemma, r_lemma, l_index, r_index)
                '''
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                ##                if 'coref' not in results:
                ##                    results['coref'] = []
                coref_set = []
                data.addCoref(coref_set)
            else:
                for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(
                        line):
                    src_i, src_pos, src_l, src_r = int(src_i), int(
                        src_pos), int(src_l), int(src_r)
                    sink_i, sink_pos, sink_l, sink_r = int(sink_i), int(
                        sink_pos), int(sink_l), int(sink_r)
                    coref_set.append(
                        ((src_word, src_i, src_pos, src_l, src_r),
                         (sink_word, sink_i, sink_pos, sink_l, sink_r)))

    return data
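
Every example on this page leans on module-level names that the excerpts never define: the STATE_* constants, SENTENCE_NO_PATTERN, WORD_PATTERN, WORD_ERROR_PATTERN, CR_PATTERN, parse_bracketed, and the Data accumulator. The sketch below gives minimal, plausible stand-ins so the functions can be run in isolation; the exact regexes and the Data stub are assumptions made for illustration, not the wrapper's real definitions.

import re

# Hypothetical stand-ins for the wrapper's module-level names.
# Any distinct values work for the state constants; the last two are the
# error-recovery states used by parse_parser_results_new below.
(STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY,
 STATE_COREFERENCE, STATE_SENT_ERROR, STATE_TEXT_ERROR) = range(8)

# Assumed header pattern for lines like "Sentence #1 (2 tokens):".
SENTENCE_NO_PATTERN = re.compile(r'Sentence #\d+ \(\d+ tokens\):')
# Assumed pattern for one bracketed token entry on a line.
WORD_PATTERN = re.compile(r'\[([^\]]+)\]')
# Assumed pattern for a token entry whose closing "]" wrapped to the next line.
WORD_ERROR_PATTERN = re.compile(r'\[Text=[^\]]*$')
# Assumed pattern for coreference mentions such as:
#   (2,3,[1,4]) -> (1,1,[1,2]), that is: "his dog" -> "John"
CR_PATTERN = re.compile(
    r'\((\d+),(\d+),\[(\d+),(\d+)\]\) -> '
    r'\((\d+),(\d+),\[(\d+),(\d+)\]\), that is: "(.*)" -> "(.*)"')


def parse_bracketed(s):
    """Split one 'Text=... Lemma=...' entry into (word, attribute dict)."""
    word, attrs = '', {}
    for attr, val in re.findall(r'([^=\s]+)=(\S+)', s):
        if attr == 'Text':
            word = val
        else:
            attrs[attr] = val
    return word, attrs


class Data(object):
    """Stub accumulator that mirrors only the calls the parsers make."""

    def __init__(self):
        self.sentences = []

    @classmethod
    def newSen(cls):
        # The real wrapper presumably opens a new sentence record here;
        # this stub lets addText() do that instead.
        pass

    def addText(self, text):
        self.sentences.append({'text': text, 'words': [], 'coref': []})

    def addToken(self, word, begin, end, lemma, pos, ner):
        self.sentences[-1]['words'].append((word, begin, end, lemma, pos, ner))

    def addCoref(self, coref_set):
        self.sentences[-1]['coref'].append(coref_set)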
Example #2
def parse_parser_results_new(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.

    updated for newer version of stanford corenlp -- 2015
    """
    data_list = []
    data = None
    lastline = None
    following_line = None
    state = STATE_START
    #for line in re.split("\r\n(?![^\[]*\])",text):
    seqs = re.split("\r\n", text)
    i = 0

    #for line in re.split("\r\n", text):
    while i < len(seqs):
        line = seqs[i]
        line = line.strip()

        if line.startswith('NLP>'):  # end
            if data: data_list.append(data)  # add last one
            break
        if line.startswith("Sentence #"):
            if data: data_list.append(data)
            data = Data()
            if SENTENCE_NO_PATTERN.match(line):
                state = STATE_TEXT
            else:
                lastline = line
                state = STATE_SENT_ERROR
            i += 1

        elif state == STATE_SENT_ERROR:
            line = lastline + line
            assert SENTENCE_NO_PATTERN.match(line) is not None
            state = STATE_TEXT
            i += 1

        elif state == STATE_TEXT_ERROR:
            line = line + following_line
            data.addText(line)
            state = STATE_WORDS
            i += 2

        elif state == STATE_TEXT:
            Data.newSen()
            data.addText(line)
            state = STATE_WORDS
            i += 1

        elif state == STATE_WORDS:
            if len(line) == 0:
                i += 1  # advance past the blank line; a bare continue would loop forever
                continue
            if not line.startswith("[Text="):
                #raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
                print >> sys.stderr, 'Parse error. Could not find "[Text=" in: %s' % line
                print >> sys.stderr, 'Attempting to fix the error.'
                following_line = line
                state = STATE_TEXT_ERROR
                i -= 1
                continue

            #for s in WORD_PATTERN.findall(line):
            wline = line
            while WORD_PATTERN.match(wline):
                t = parse_bracketed(wline[1:-1])
                if t[0] == '':
                    i += 1
                    wline = seqs[i]
                    continue
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
                i += 1
                wline = seqs[i]

            if WORD_ERROR_PATTERN.match(wline):  # handle format error
                wline = wline + seqs[i + 1]
                wline = wline.strip()
                t = parse_bracketed(wline[1:-1])
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
                i += 2
                state = STATE_WORDS
                continue
            state = STATE_TREE
            parsed = []

        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                parsed = " ".join(parsed)
                i += 1
                #data.addTree(Tree.parse(parsed))
            else:
                parsed.append(line)
                i += 1

        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                pass
                '''
                # don't need here
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    rel, l_lemma, r_lemma = split_entry
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
                    l_lemma, l_index = m.group('lemma'), m.group('index')
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', r_lemma)
                    r_lemma, r_index = m.group('lemma'), m.group('index')

                    data.addDependency( rel, l_lemma, r_lemma, l_index, r_index)
                '''

            i += 1
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                #if 'coref' not in results:
                #    results['coref'] = []
                coref_set = []
                data.addCoref(coref_set)
            else:
                for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(
                        line):
                    src_i, src_pos, src_l, src_r = int(src_i), int(
                        src_pos), int(src_l), int(src_r)
                    sink_i, sink_pos, sink_l, sink_r = int(sink_i), int(
                        sink_pos), int(sink_l), int(sink_r)
                    coref_set.append(
                        ((src_word, src_i, src_pos, src_l, src_r),
                         (sink_word, sink_i, sink_pos, sink_l, sink_r)))

            i += 1
        else:
            i += 1

    return data_list
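
With scaffolding like the sketch after Example #1 in scope, the newer parser can be exercised end to end. The sample below is hand-written to match the shape the state machine expects (sentence header, raw text, one "[Text=...]" line per token, parse tree, dependencies, then the NLP> prompt); it is illustrative, not captured CoreNLP output.

# A hand-made sample shaped like the interactive-shell output the state
# machine walks through; the token attributes are assumptions.
sample = "\r\n".join([
    "Sentence #1 (2 tokens):",
    "Hello world",
    "[Text=Hello CharacterOffsetBegin=0 CharacterOffsetEnd=5"
    " Lemma=hello PartOfSpeech=UH NamedEntityTag=O]",
    "[Text=world CharacterOffsetBegin=6 CharacterOffsetEnd=11"
    " Lemma=world PartOfSpeech=NN NamedEntityTag=O]",
    "(ROOT (NP (UH Hello) (NN world)))",
    "",                       # blank line closes the tree block
    "root(ROOT-0, world-2)",  # dependency lines are currently ignored
    "",                       # blank line closes the dependency block
    "NLP> ",                  # the prompt flushes the final Data object
])

for d in parse_parser_results_new(sample):
    print d.sentences
# With the stub Data class, this prints one sentence record:
# [{'text': 'Hello world', 'words': [('Hello', '0', '5', 'hello', 'UH', 'O'),
#   ('world', '6', '11', 'world', 'NN', 'O')], 'coref': []}]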
Example #3
def parse_parser_results(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """

    data = Data()
    
    state = STATE_START
    for line in re.split("\r\n(?!=)", text):
        line = line.strip()
        if line == 'NLP>':
            break
        if line.startswith("Sentence #"):
            state = STATE_TEXT
        
        elif state == STATE_TEXT:
            Data.newSen()
            data.addText(line)
            state = STATE_WORDS
        
        elif state == STATE_WORDS:
            if len(line) == 0:
                continue
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
                t = parse_bracketed(s)
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'], t[1][u'CharacterOffsetEnd'],
                              t[1][u'Lemma'], t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
            state = STATE_TREE
            parsed = []
        
        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                parsed = " ".join(parsed)
                #data.addTree(Tree.parse(parsed))
            else:
                parsed.append(line)
        
        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                pass
                '''
                # don't need here
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    rel, l_lemma, r_lemma = split_entry
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
                    l_lemma, l_index = m.group('lemma'), m.group('index')
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', r_lemma)
                    r_lemma, r_index = m.group('lemma'), m.group('index')

                    data.addDependency( rel, l_lemma, r_lemma, l_index, r_index)
                '''
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
##                if 'coref' not in results:
##                    results['coref'] = []
                coref_set = []
                data.addCoref(coref_set)
            else:
                for (src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l,
                     sink_r, src_word, sink_word) in CR_PATTERN.findall(line):
                    src_i, src_pos, src_l, src_r = \
                        int(src_i), int(src_pos), int(src_l), int(src_r)
                    sink_i, sink_pos, sink_l, sink_r = \
                        int(sink_i), int(sink_pos), int(sink_l), int(sink_r)
                    coref_set.append(
                        ((src_word, src_i, src_pos, src_l, src_r),
                         (sink_word, sink_i, sink_pos, sink_l, sink_r)))
    
    return data
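
This variant matches Example #1 except for two details: it keeps no guard against empty tokens, and it splits on "\r\n(?!=)" rather than "\r\n". The negative lookahead refuses to split before a line that begins with "=", presumably so a wrapped attribute value stays attached to its token line. A quick illustration of the difference:

import re

chunk = "[Text=Hello PartOfSpeech\r\n=UH]\r\n(ROOT (INTJ (UH Hello)))"

print re.split("\r\n(?!=)", chunk)
# ['[Text=Hello PartOfSpeech\r\n=UH]', '(ROOT (INTJ (UH Hello)))']
print re.split("\r\n", chunk)
# ['[Text=Hello PartOfSpeech', '=UH]', '(ROOT (INTJ (UH Hello)))']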