Example #1
def is_valid(nlpjson: OrderedDict) -> bool:
    """
    Validates a json-nlp ordered dictionary.
    :param nlpjson: The json-nlp to be validated
    :return: True if the json-nlp validates, False otherwise
    """
    valid = True
    v = load_validator()
    for error in sorted(v.iter_errors(remove_empty_fields(nlpjson)), key=str):
        print(format_error(error))
        valid = False
    return valid
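
A minimal usage sketch for this boolean variant, assuming the function is importable from a validation module of pyjsonnlp (the module path below is an assumption) and that get_base/get_base_document build the document skeleton as in the pipeline examples further down:

import pyjsonnlp
from pyjsonnlp.validation import is_valid  # module path assumed

j = pyjsonnlp.get_base()                       # empty JSON-NLP container
j['documents'].append(pyjsonnlp.get_base_document(1))
print('valid' if is_valid(j) else 'invalid')   # schema errors, if any, are printed by is_valid itself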
Example #2
def is_valid(nlpjson: OrderedDict) -> Tuple[bool, List[str]]:
    """
    Validates a json-nlp ordered dictionary.
    :param nlpjson: The json-nlp to be validated
    :return: A (valid, errors) tuple: True and an empty list if the json-nlp validates,
             otherwise False and a list of formatted error messages
    """
    valid = True
    errors = []
    v = __load_validator()
    for error in sorted(v.iter_errors(remove_empty_fields(nlpjson)), key=str):
        errors.append(format_error(error))
        valid = False
    return valid, errors
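
The tuple-returning variant lends itself to programmatic error handling; a short sketch under the same import assumption as above (j is a placeholder for any JSON-NLP OrderedDict):

from pyjsonnlp.validation import is_valid  # module path assumed

valid, errors = is_valid(j)  # j: a JSON-NLP OrderedDict, e.g. produced by one of the pipelines below
if not valid:
    for message in errors:
        print(message)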
Example #3
    def test_remove_empty_fields(self):
        d = OrderedDict([('a', 1), ('b', ''), ('c', []), ('d', [1, 2])])
        d['meta'] = OrderedDict([('a', 1), ('b', ''), ('c', {}), ('d', [1, 2])])
        d['documents'] = {
            1: OrderedDict([('a', 1), ('b', ''), ('c', []), ('d', [1, 2])]),
            2: OrderedDict([('a', 1), ('b', ''), ('c', {}), ('d', [1, 2])])
        }
        actual = pyjsonnlp.remove_empty_fields(d)
        expected = OrderedDict([
            ('a', 1),
            ('d', [1, 2]),
            ('meta', OrderedDict([('a', 1), ('d', [1, 2])])),
            ('documents', {
                1: OrderedDict([('a', 1), ('d', [1, 2])]),
                2: OrderedDict([('a', 1), ('d', [1, 2])])
            })
        ])
        assert expected == actual, actual
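
The behaviour exercised by this test can also be seen directly; a short sketch using remove_empty_fields exactly as the test does:

from collections import OrderedDict
import pyjsonnlp

d = OrderedDict([('a', 1), ('b', ''), ('c', []), ('d', [1, 2])])
print(pyjsonnlp.remove_empty_fields(d))
# OrderedDict([('a', 1), ('d', [1, 2])]) -- empty strings, lists and dicts are dropped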
Example #4
    def get_nlp_json(text, neighbors) -> OrderedDict:
        """Process the Polyglot output into JSON"""

        j: OrderedDict = pyjsonnlp.get_base()
        j['DC.source'] = 'polyglot {}'.format(polyglot.__version__)
        d: OrderedDict = pyjsonnlp.get_base_document(1)
        j['documents'].append(d)
        d['meta']['DC.source'] = 'polyglot {}'.format(polyglot.__version__)

        doc = Text(text)
        d['meta']['DC.language'] = doc.language.code

        PolyglotPipeline.get_polyglot_sentences(text, neighbors, d, doc)
        return pyjsonnlp.remove_empty_fields(j)
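
A usage sketch, assuming get_nlp_json is a static method of the PolyglotPipeline class referenced in its body and that neighbors is a boolean toggle (both assumptions; the import of PolyglotPipeline is omitted because its module path is not shown here):

import json

j = PolyglotPipeline.get_nlp_json('The quick brown fox jumps over the lazy dog.', neighbors=False)
print(json.dumps(j, indent=2))  # the returned OrderedDict serialises directly to JSON-NLP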
Example #5
    def process(text: str = '',
                spacy_model='en_core_web_sm',
                coreferences=False,
                constituents=False,
                dependencies=True,
                expressions=True) -> OrderedDict:
        """Process provided text"""
        nlp = get_model(spacy_model, coreferences, constituents)
        nlp.tokenizer = SyntokTokenizer(nlp.vocab)
        doc = nlp(text)
        j: OrderedDict = get_base()
        d: OrderedDict = get_base_document(1)
        j['documents'].append(d)

        d['meta']['DC.source'] = 'SpaCy {}'.format(spacy.__version__)
        d['text'] = text

        model_lang = spacy_model[0:2]
        lang = Counter()  # track the frequency of each language
        sent_lookup: Dict[int, int] = {}  # map sentence end_char to our index
        token_lookup: Dict[Tuple[int, int], int] = {}  # map (sent_id, spacy token index) to our token index

        # tokens and sentences
        token_id = 1
        sent_num = 1
        for sent in doc.sents:

            current_sent = {
                'id': sent_num,
                'tokenFrom': token_id,
                'tokenTo': token_id + len(sent),  # begin inclusive, end exclusive
                'tokens': []
            }
            if constituents:
                try:
                    d['constituents'].append(build_constituents(sent_num, sent._.parse_string))
                except Exception:
                    pass  # ignore sentences whose constituency parse fails

            sent_lookup[sent.end_char] = sent_num
            d['sentences'][current_sent['id']] = current_sent
            last_char_index = 0
            for token in sent:
                t = {
                    'id': token_id,
                    'sentence_id': sent_num,
                    'text': token.text,
                    'lemma': token.lemma_,
                    'xpos': token.tag_,
                    'upos': token.pos_,
                    'entity_iob': token.ent_iob_,
                    'characterOffsetBegin': token.idx,
                    'characterOffsetEnd': token.idx + len(token),
                    'lang': token.lang_,
                    'features': {
                        'Overt': True,
                        'Stop': bool(token.is_stop),
                        'Alpha': bool(token.is_alpha),
                    },
                    'misc': {
                        'SpaceAfter': False
                    }
                }

                # shape
                if WORD_REGEX.findall(token.text):
                    t['shape'] = token.shape_

                # space after?
                if token.idx != 0 and token.idx != last_char_index:
                    # we don't know there was a space after the previous token until we see where this one
                    # starts in relation to where the last one finished
                    d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True
                last_char_index = t['characterOffsetEnd']

                # morphology
                for i, kv in enumerate(nlp.vocab.morphology.tag_map.get(token.tag_, {}).items()):
                    if i > 0:  # numeric k/v pair at the beginning
                        t['features'][kv[0]] = str(kv[1]).title()

                # entities
                if token.ent_type_:
                    t['entity'] = token.ent_type_

                # flag tokens whose language differs from the model's (skip for multilingual 'xx' models)
                if model_lang != 'xx':
                    t['features']['Foreign'] = model_lang != token.lang_

                # bookkeeping
                lang[token.lang_] += 1
                token_lookup[(sent_num, token.i)] = token_id
                current_sent['tokens'].append(token_id)
                d['tokenList'].append(t)
                token_id += 1

            d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = True  # EOS tokens have spaces after them
            sent_num += 1

        d['tokenList'][token_id - 2]['misc']['SpaceAfter'] = False  # EOD tokens do not

        # noun phrases
        if expressions:
            chunk_id = 1
            for chunk in doc.noun_chunks:
                if len(chunk) > 1:
                    sent_id = sent_lookup[chunk.sent.sent.end_char]
                    d['expressions'].append({
                        'id': chunk_id,
                        'type': 'NP',
                        'head': token_lookup[(sent_id, chunk.root.i)],
                        'dependency': chunk.root.dep_.lower(),
                        'tokens': [token_lookup[(sent_id, token.i)] for token in chunk]
                    })
                    chunk_id += 1

        # dependencies
        if dependencies:
            d['dependencies'] = []
            for sent_num, sent in enumerate(doc.sents):
                deps = {'style': "universal", 'trees': []}
                for token in sent:
                    dependent = token_lookup[(sent_num + 1, token.i)]
                    deps['trees'].append({
                        'lab': token.dep_ if token.dep_ != 'ROOT' else 'root',
                        'gov': token_lookup[(sent_num + 1, token.head.i)] if token.dep_ != 'ROOT' else 0,
                        'dep': dependent
                    })
                d['dependencies'].append(deps)

        # coref
        # noinspection PyProtectedMember
        if coreferences and doc._.coref_clusters is not None:
            # noinspection PyProtectedMember
            for cluster in doc._.coref_clusters:
                r = build_coreference(cluster.i)
                r['representative']['tokens'] = [t.i + 1 for t in cluster.main]
                r['representative']['head'] = find_head(
                    d, r['representative']['tokens'],
                    d['tokenList'][max(r['representative']['tokens'])]['sentence_id'],
                    'universal')
                for m in cluster.mentions:
                    if m[0].i + 1 in r['representative']['tokens']:
                        continue  # don't include the representative in the mention list
                    ref = {'tokens': [t.i + 1 for t in m]}
                    ref['head'] = find_head(d, ref['tokens'], sent_num + 1,
                                            'universal')
                    r['referents'].append(ref)
                d['coreferences'].append(r)

        d['meta']['DC.language'] = max(lang, key=lang.get)  # most frequent token language

        return remove_empty_fields(j)
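
A sketch of consuming the result, assuming the method above is exposed as a static process method on a pipeline class (called SpacyPipeline here; the class name and import are assumptions). The field names follow the structure built above:

j = SpacyPipeline.process('Autonomous cars shift insurance liability toward manufacturers.',
                          spacy_model='en_core_web_sm', dependencies=True)
d = j['documents'][0]
for t in d['tokenList']:
    print(t['id'], t['text'], t.get('upos'), t.get('xpos'))
for sentence_deps in d.get('dependencies', []):
    for arc in sentence_deps['trees']:
        print(arc.get('gov', 0), '->', arc['dep'], arc['lab'])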
Example #6
    def get_nlp_json(sentences: List[Sentence], text: str,
                     embed_type: str) -> OrderedDict:
        j: OrderedDict = pyjsonnlp.get_base()
        d: OrderedDict = pyjsonnlp.get_base_document(1)
        j['documents'].append(d)

        d['meta']['DC.source'] = 'Flair {}'.format(flair_version)
        d['text'] = text

        # sentences
        token_id = 1
        for i, s in enumerate(sentences):
            sent = {
                'id': i,
                'tokenFrom': token_id,
                'tokenTo': token_id + len(s),
                'tokens': []
            }
            d['sentences'][sent['id']] = sent

            # sentiment and any other classifiers
            for label in s.labels:
                if 'labels' not in sent:
                    sent['labels'] = []
                sent['labels'].append({
                    'type': 'sentiment' if label.value in ('POSITIVE', 'NEGATIVE') else 'offensive language',
                    'label': label.value,
                    'scores': {'label': label.score}
                })

            # syntactic chunking (expressions); accumulate across sentences rather than overwriting
            d['expressions'] += [{
                'type': span.tag,
                'scores': {'type': span.score},
                'tokens': [t.idx + token_id - 1 for t in span.tokens]
            } for span in s.get_spans('np') if len(span.tokens) > 1]

            # features for each token
            for token in s:
                t = {
                    'id': token_id,
                    'text': token.text,
                    'characterOffsetBegin': token.start_pos,
                    'characterOffsetEnd': token.end_pos,
                    'features': {
                        'Overt': True
                    },
                    'scores': {},
                    'misc': {
                        'SpaceAfter': bool(token.whitespace_after)
                    }
                }

                # pos
                pos = token.get_tag('upos')  # 'multi' models give universal pos tags
                if pos.value:
                    t['upos'] = pos.value
                    t['scores']['upos'] = pos.score
                pos = token.get_tag('pos')
                if pos.value:
                    t['xpos'] = pos.value
                    t['scores']['xpos'] = pos.score

                # named entities
                entity = token.get_tag('ner')
                if entity.value != 'O':
                    t['entity'] = entity.value
                    # look at the previous token's entity to decide IOB; sentence-initial tokens are always 'B'
                    e = d['tokenList'][token_id - 2].get('entity') if token.idx != 1 else None
                    t['entity_iob'] = 'B' if e != entity.value else 'I'
                    t['scores']['entity'] = entity.score
                else:
                    t['entity_iob'] = 'O'

                # semantic frames (wordnet)
                frame = token.get_tag('frame')
                if frame.value and frame.value != '_':
                    f = frame.value.split('.')
                    w_id = '.'.join([f[0], t['upos'][0].lower(), f[1]])
                    t['synsets'] = {
                        w_id: {
                            'wordnetId': w_id,
                            'scores': {
                                'wordnetId': frame.score
                            }
                        }
                    }

                # word embeddings
                if embed_type != 'Flair ':
                    t['embeddings'] = [{
                        'model': embed_type,
                        'vector': token.embedding.tolist()
                    }]

                d['tokenList'].append(t)
                sent['tokens'].append(token_id)
                token_id += 1

        return pyjsonnlp.remove_empty_fields(j)
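
A sketch of reading the Flair annotations, assuming the method above is a static method of a pipeline class (FlairPipeline is an assumed name) and that sentences, text and embed_type are prepared by the surrounding pipeline (all three are placeholders here):

j = FlairPipeline.get_nlp_json(sentences, text, embed_type)  # all three arguments are placeholders
d = j['documents'][0]
for t in d['tokenList']:
    if t.get('entity'):
        print(t['text'], t['entity'], t['entity_iob'], t.get('scores', {}).get('entity'))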
Example #7
    def process(text='',
                lang='en',
                coreferences=False,
                constituents=False,
                dependencies=False,
                expressions=False,
                **kwargs) -> OrderedDict:
        # build nlp-json
        j: OrderedDict = get_base()
        j['meta']['DC.language'] = lang
        d: OrderedDict = get_base_document(1)
        j['documents'].append(d)
        d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
        d['text'] = text

        # collect parsers
        lemmatizer = get_lemmatizer()
        stemmer = get_stemmer()

        # tokenization and pos
        words = []
        for sent in segment(text):
            for token in sent:
                words.append(token.value)

        # create the token list
        t_id = 1
        for word, xpos in pos_tag(words):
            wordnet_pos = get_wordnet_pos(xpos)
            lemma = lemmatizer(word, pos=wordnet_pos)

            # start the token
            t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
            d['tokenList'].append(t)
            t_id += 1

            # wordnet
            try:
                synsets = wordnet.synsets(lemma, pos=wordnet_pos)
                senses = {}
                for s in synsets:
                    hyponyms = [
                        y for x in s.hyponyms() for y in x.lemma_names()
                    ]
                    hypernyms = [
                        y for x in s.hypernyms() for y in x.lemma_names()
                    ]
                    synonyms = s.lemma_names()[1:]
                    examples = s.examples()
                    sense = {
                        'wordnetId': s.name(),
                        'definition': s.definition()
                    }
                    if synonyms:
                        sense['synonyms'] = synonyms
                    if hypernyms:
                        sense['hypernyms'] = hypernyms
                    if hyponyms:
                        sense['hyponyms'] = hyponyms
                    if examples:
                        sense['examples'] = examples

                    antonyms = []
                    for l in s.lemmas():
                        if l.antonyms():
                            for a in l.antonyms():
                                antonyms.append(a.name())
                    if antonyms:
                        sense['antonyms'] = antonyms

                    senses[sense['wordnetId']] = sense

                if senses:
                    t['synsets'] = senses
            except Exception:
                pass  # WordNet lookup is best-effort; skip this token on failure

            # verbnet
            try:
                verbs = {class_id: {'classId': class_id, 'frames': vn.frames(class_id)}
                         for class_id in vn.classids(word)}

                if verbs:
                    t['verbFrames'] = verbs
            except Exception:
                pass  # VerbNet lookup is best-effort

            # framenet
            try:
                frame_net = {}
                frames = invoke_frame(word)
                if frames is not None:
                    for fr in frames:
                        lu_temp = []
                        for lu in fn.lus(r'(?i)' + word.lower()):
                            fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                            if len(fr_):
                                if fr_[0] == fr:
                                    lu_temp.append({
                                        'name': lu.name,
                                        'definition': lu.definition,
                                        'pos': lu.name.split('.')[1]
                                    })
                        frame_net[fr.ID] = {
                            'name': fr.name,
                            'frameId': fr.ID,
                            'definition': fr.definition,
                            # 'relations':fr.frameRelations,
                            'lu': lu_temp
                        }
                if frame_net:
                    t['frames'] = frame_net
            except Exception:
                pass  # FrameNet lookup is best-effort

        return remove_empty_fields(j)
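
A sketch of reading the lexical annotations this pipeline attaches to each token, assuming the method above is exposed as a static process method on an NLTK pipeline class (NltkPipeline is an assumed name):

j = NltkPipeline.process('The cat sat on the mat.')
d = j['documents'][0]
for t in d['tokenList']:
    for wn_id, sense in t.get('synsets', {}).items():
        print(t['text'], wn_id, sense.get('definition', ''))
    if 'verbFrames' in t:
        print(t['text'], 'VerbNet classes:', sorted(t['verbFrames']))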
Example #8
    def process_conll(conll='',
                      lang='en',
                      coreferences=False,
                      constituents=False,
                      dependencies=False,
                      expressions=False,
                      **kwargs) -> OrderedDict:
        if conll == '':
            raise ValueError('You must pass something in the conll parameter!')

        x = load_xrenner()
        x.load(XrennerPipeline.iso2xrenner(lang))
        x.set_doc_name('not-used')  # needs to be set or error

        sgml_result = x.analyze(conll, 'sgml')
        j = parse_conllu(conll)
        d = j['documents'][0]
        d['meta']['DC.source'] = 'Xrenner 2.0'

        if coreferences:
            # wrap tokens with their token id so that xml parsing works
            token_num = 1
            tokenized = []
            for line in sgml_result.split('\n'):
                if line[0:9] != '<referent' and line[0:10] != '</referent':
                    line = f'<token id="{token_num}">{line}</token>'
                    token_num += 1
                tokenized.append(line)

            representatives = {}
            coref_id = 0
            soup = BeautifulSoup('\n'.join(tokenized), 'html.parser')
            for tag in soup.find_all('referent'):
                # new representative
                if 'antecedent' not in tag.attrs or tag['type'] == 'none':
                    r = build_coreference(coref_id)
                    coref_id += 1
                    r['representative'] = {
                        'entity': tag['entity'],
                        'tokens': [int(t['id']) for t in tag.find_all('token')]
                    }
                    r['representative']['head'] = find_head(d, r['representative']['tokens'])
                    representatives[(tag['id'], tag['group'])] = r
                    d['coreferences'].append(r)

                    # might be a multi-word expression too!
                    if expressions and tag['entity'] != 'event' and len(r['representative']['tokens']) > 1:
                        d['expressions'].append({
                            # deduce the phrase type by the pos tag of the head token
                            'type': 'VP' if 'V' in d['tokenList'][r['representative']['head']]['upos'] else 'NP',
                            'head': r['representative']['head'],
                            'tokens': r['representative']['tokens']
                        })
                # new referent
                else:
                    r = representatives[(tag['antecedent'], tag['group'])]
                    ids = [int(t['id']) for t in tag.find_all('token')]
                    r['referents'].append({
                        'type': tag['type'],
                        'tokens': ids,
                        'head': find_head(d, ids)
                    })

        return remove_empty_fields(j)
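
A sketch of walking the coreference output, assuming the method above is exposed as XrennerPipeline.process_conll (the class name appears in its body) and that conll_text holds the CoNLL-U parse of a document (placeholder):

j = XrennerPipeline.process_conll(conll=conll_text, lang='en', coreferences=True)
d = j['documents'][0]
for coref in d.get('coreferences', []):
    rep = coref['representative']
    print('entity:', rep.get('entity'), 'tokens:', rep['tokens'], 'head:', rep.get('head'))
    for ref in coref.get('referents', []):
        print('  referent:', ref['tokens'], 'type:', ref.get('type'))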