Example #1
 def test_find_head_long_phrases(self):
     token_ids = list(range(1, 6))
     doc = OrderedDict({
         'dependencies': [{
             'style': 'universal',
             'arcs': {
                 1: [{'governor': 2}],
                 2: [{'governor': 3}],
                 3: [{'governor': 4}],
                 4: [{'governor': 0}],
                 5: [{'governor': 4}],
             }
         }]
     })
     actual = pyjsonnlp.find_head(doc, token_ids, 'universal')
     assert 4 == actual, actual
Example #2
 def test_find_head(self):
     token_ids = [1]
     doc = OrderedDict({
         'dependencies': [{
             'style': 'universal',
             'arcs': {
                 1: [{'governor': 2}],
                 2: [{'governor': 3}],
                 3: [{'governor': 4}],
                 4: [{'governor': 0}],
                 5: [{'governor': 4}],
             }
         }]
     })
     actual = pyjsonnlp.find_head(doc, token_ids, 'universal')
     assert 1 == actual, actual
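The two tests above pin down how pyjsonnlp.find_head behaves on the arc structure 1 → 2 → 3 → 4 → 0 (with 5 → 4): for the five-token phrase the head is token 4, whose governor lies outside the phrase, and for a single-token phrase the head is that token itself. The sketch below restates that idea; it is illustrative only, not pyjsonnlp's actual implementation, and naive_find_head is a made-up name.

    from collections import OrderedDict

    def naive_find_head(doc: OrderedDict, token_ids: list, style: str = 'universal') -> int:
        """Illustrative: the head is the token whose governors are all either
        the root (0) or outside the phrase."""
        deps = next(d for d in doc['dependencies'] if d['style'] == style)
        for token_id in token_ids:
            governors = [arc['governor'] for arc in deps['arcs'][token_id]]
            if all(g == 0 or g not in token_ids for g in governors):
                return token_id
        return token_ids[0]

Applied to the doc from Example #1 this returns 4, and for the single-token phrase from Example #2 it returns 1, matching the assertions above.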
Example #3
 def test_find_head_no_deps(self):
     no_deps = OrderedDict(j['documents'][0])
     no_deps['dependencies'] = []
     with pytest.raises(ValueError):
         pyjsonnlp.find_head(no_deps, [], 'universal')
Example #4
 def test_find_head_no_enhanced(self):
     with pytest.raises(ValueError):
         pyjsonnlp.find_head(OrderedDict(), [], 'Enhanced++')
Example #5
 def test_find_head_style_not_found(self):
     no_deps = OrderedDict(j['documents'][0])
     no_deps['dependencies'] = []
     with pytest.raises(ValueError):
         pyjsonnlp.find_head(no_deps, [], 'no such style')
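Examples #3 to #5 cover the error paths: find_head raises ValueError when the document carries no dependency parses and when the requested style ('Enhanced++', 'no such style') is not available. For contrast, here is a minimal well-formed document, extrapolated from the structure used in Examples #1 and #2; treat the expected result as an assumption rather than a documented guarantee.

    from collections import OrderedDict
    import pyjsonnlp

    minimal_doc = OrderedDict({
        'dependencies': [{
            'style': 'universal',
            'arcs': {
                1: [{'governor': 0}],  # a single token governed directly by the root
            }
        }]
    })
    head = pyjsonnlp.find_head(minimal_doc, [1], 'universal')  # expected: 1, as in Example #2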
Example #6
    def process(text: str = '',
                spacy_model='en_core_web_sm',
                coreferences=False,
                constituents=False,
                dependencies=True,
                expressions=True) -> OrderedDict:
        """Process provided text"""
        nlp = get_model(spacy_model, coreferences, constituents)
        nlp.tokenizer = SyntokTokenizer(nlp.vocab)
        doc = nlp(text)
        j: OrderedDict = get_base()
        d: OrderedDict = get_base_document(1)
        j['documents'].append(d)

        d['meta']['DC.source'] = 'SpaCy {}'.format(spacy.__version__)
        d['text'] = text

        model_lang = spacy_model[0:2]
        lang = Counter()  # track the frequency of each language
        sent_lookup: Dict[int, int] = {}  # map sentence end_char to our index
        token_lookup: Dict[Tuple[int, int], int] = {}  # map (sent_id, spacy token index) to our token index

        # tokens and sentences
        token_id = 1
        sent_num = 1
        for sent in doc.sents:

            current_sent = {
                'id': sent_num,
                'tokenFrom': token_id,
                'tokenTo': token_id + len(sent),  # begin inclusive, end exclusive
                'tokens': []
            }
            if constituents:
                try:
                    d['constituents'].append(
                        build_constituents(sent_num, sent._.parse_string))
                except Exception:
                    pass

            sent_lookup[sent.end_char] = sent_num
            d['sentences'][current_sent['id']] = current_sent
            #d['sentences'].append(current_sent)
            last_char_index = 0
            for token in sent:
                t = {
                    'id': token_id,
                    'sentence_id': sent_num,
                    'text': token.text,
                    'lemma': token.lemma_,
                    'xpos': token.tag_,
                    'upos': token.pos_,
                    'entity_iob': token.ent_iob_,
                    'characterOffsetBegin': token.idx,
                    'characterOffsetEnd': token.idx + len(token),
                    'lang': token.lang_,
                    'features': {
                        'Overt': True,
                        'Stop': True if token.is_stop else False,
                        'Alpha': True if token.is_alpha else False,
                    },
                    'misc': {
                        'SpaceAfter': False
                    }
                }

                # shape
                if WORD_REGEX.findall(token.text):
                    t['shape'] = token.shape_

                # space after?
                if token.idx != 0 and token.idx != last_char_index:
                    # we don't know there was a space after the previous token until we see where this one
                    # starts in relation to where the last one finished
                    d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True
                last_char_index = t['characterOffsetEnd']

                # morphology
                tag_map = nlp.vocab.morphology.tag_map.get(token.tag_, {})
                for i, (k, v) in enumerate(tag_map.items()):
                    if i > 0:  # skip the numeric k/v pair at the beginning
                        t['features'][k] = str(v).title()

                # entities
                if token.ent_type_:
                    t['entity'] = token.ent_type_

                # maybe check if a non-model language
                if model_lang != 'xx':
                    t['features']['Foreign'] = model_lang != token.lang_

                # bookkeeping
                lang[token.lang_] += 1
                token_lookup[(sent_num, token.i)] = token_id
                current_sent['tokens'].append(token_id)
                d['tokenList'].append(t)
                token_id += 1

            # sentence-final tokens have a space after them
            d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True
            sent_num += 1

        # the document-final token has no trailing space
        d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = False

        # noun phrases
        if expressions:
            chunk_id = 1
            for chunk in doc.noun_chunks:
                if len(chunk) > 1:
                    sent_id = sent_lookup[chunk.sent.sent.end_char]
                    d['expressions'].append({
                        'id': chunk_id,
                        'type': 'NP',
                        'head': token_lookup[(sent_id, chunk.root.i)],
                        'dependency': chunk.root.dep_.lower(),
                        'tokens': [token_lookup[(sent_id, token.i)] for token in chunk]
                    })
                    chunk_id += 1

        # dependencies
        if dependencies:
            d['dependencies'] = []
            for sent_num, sent in enumerate(doc.sents):
                deps = {'style': "universal", 'trees': []}
                for token in sent:
                    dependent = token_lookup[(sent_num + 1, token.i)]
                    deps['trees'].append({
                        #'sentenceId': sent_num+1,
                        'lab': token.dep_ if token.dep_ != 'ROOT' else 'root',
                        'gov': token_lookup[(sent_num + 1, token.head.i)] if token.dep_ != 'ROOT' else 0,
                        'dep': dependent
                    })
                d['dependencies'].append(deps)

        # coref
        # noinspection PyProtectedMember
        if coreferences and doc._.coref_clusters is not None:
            # noinspection PyProtectedMember
            for cluster in doc._.coref_clusters:
                r = build_coreference(cluster.i)
                r['representative']['tokens'] = [t.i + 1 for t in cluster.main]
                r['representative']['head'] = find_head(
                    d, r['representative']['tokens'],
                    # token ids are 1-based, tokenList is 0-based
                    d['tokenList'][max(r['representative']['tokens']) - 1]['sentence_id'],
                    'universal')
                for m in cluster.mentions:
                    if m[0].i + 1 in r['representative']['tokens']:
                        continue  # don't include the representative in the mention list
                    ref = {'tokens': [t.i + 1 for t in m]}
                    ref['head'] = find_head(d, ref['tokens'], sent_num + 1,
                                            'universal')
                    r['referents'].append(ref)
                d['coreferences'].append(r)

        d['meta']['DC.language'] = max(lang, key=lang.get)  # most frequent token language

        return remove_empty_fields(j)
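A hedged usage sketch for the process() method above: the keyword arguments and the shape of the returned OrderedDict come straight from the code, but the module path and class name in the import are placeholders, not the package's documented API, and running it requires spaCy plus the en_core_web_sm model.

    from spacy_pyjsonnlp import SpacyPipeline  # placeholder import path -- an assumption

    j = SpacyPipeline.process(
        text='The quick brown fox jumps over the lazy dog.',
        spacy_model='en_core_web_sm',
        dependencies=True,
        expressions=True)
    d = j['documents'][0]
    print(d['meta']['DC.source'])         # 'SpaCy <version>'
    print(len(d['tokenList']), 'tokens')  # one entry per token, ids starting at 1
    print(d['expressions'])               # multi-token noun phrases, type 'NP'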
Example #7
    def process_conll(conll='',
                      lang='en',
                      coreferences=False,
                      constituents=False,
                      dependencies=False,
                      expressions=False,
                      **kwargs) -> OrderedDict:
        if conll == '':
            raise ValueError('You must pass something in the conll parameter!')

        x = load_xrenner()
        x.load(XrennerPipeline.iso2xrenner(lang))
        x.set_doc_name('not-used')  # must be set, otherwise xrenner raises an error

        sgml_result = x.analyze(conll, 'sgml')
        j = parse_conllu(conll)
        #d = list(j['documents'].values())[0]
        d = j['documents'][0]
        d['meta']['DC.source'] = 'Xrenner 2.0'

        if coreferences:
            # wrap tokens with their token id so that xml parsing works
            token_num = 1
            tokenized = []
            for line in sgml_result.split('\n'):
                if line[0:9] != '<referent' and line[0:10] != '</referent':
                    line = f'<token id="{token_num}">{line}</token>'
                    token_num += 1
                tokenized.append(line)

            representatives = {}
            coref_id = 0
            soup = BeautifulSoup('\n'.join(tokenized), 'html.parser')
            for tag in soup.find_all('referent'):
                # new representative
                if 'antecedent' not in tag.attrs or tag['type'] == 'none':
                    r = build_coreference(coref_id)
                    coref_id += 1
                    r['representative'] = {
                        'entity': tag['entity'],
                        'tokens': [int(t['id']) for t in tag.find_all('token')]
                    }
                    r['representative']['head'] = find_head(
                        d, r['representative']['tokens'])
                    representatives[(tag['id'], tag['group'])] = r
                    d['coreferences'].append(r)

                    # might be a multi-word expression too!
                    if expressions and tag['entity'] != 'event' and len(r['representative']['tokens']) > 1:
                        d['expressions'].append({
                            # deduce the phrase type from the upos tag of the head token
                            'type': 'VP' if 'V' in d['tokenList'][r['representative']['head']]['upos'] else 'NP',
                            'head': r['representative']['head'],
                            'tokens': r['representative']['tokens']
                        })
                # new referent
                else:
                    r = representatives[(tag['antecedent'], tag['group'])]
                    ids = [int(t['id']) for t in tag.find_all('token')]
                    r['referents'].append({
                        'type': tag['type'],
                        'tokens': ids,
                        'head': find_head(d, ids)
                    })

        return remove_empty_fields(j)
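Similarly for process_conll(): XrennerPipeline is the class referenced inside the method body, but the import path below is an assumption, the two-line CoNLL-U fragment is only a stand-in input, and running it requires xrenner with an English model.

    from xrenner_pyjsonnlp import XrennerPipeline  # placeholder import path -- an assumption

    conll = (
        '1\tJohn\tJohn\tPROPN\tNNP\t_\t2\tnsubj\t_\t_\n'
        '2\tsleeps\tsleep\tVERB\tVBZ\t_\t0\troot\t_\t_\n'
        '\n')
    j = XrennerPipeline.process_conll(conll=conll, lang='en',
                                      coreferences=True, expressions=True)
    d = j['documents'][0]
    print(d['meta']['DC.source'])  # 'Xrenner 2.0'
    print(d['coreferences'])       # coreference chains detected by xrenner, if any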