def process_spacy_and_convert_to_naf(self,
                                         nlp,
                                         dct, # in a next iteration, we can make this a class attribute
                                         layers,
                                         output_path=None):
        """
        Process the document with spaCy and convert the result to NAF.

        The text, title, URI and language of the document are read from
        ``self.content``, ``self.name``, ``self.uri`` and ``self.language``.

        :param nlp: spacy language model
        :param datetime.datetime dct: document creation time
        :param set layers: layers to convert to NAF, e.g., {'raw', 'text', 'terms'}
        :param output_path: if provided, NAF is saved to that file (UTF-8 encoded)

        :return: the root of the NAF XML object
        """
        root = spacy_to_naf.text_to_NAF(text=self.content,
                                        nlp=nlp,
                                        dct=dct,
                                        layers=layers,
                                        title=self.name,
                                        uri=self.uri,
                                        language=self.language)

        if output_path is not None:
            # Be explicit about the encoding: relying on the platform default
            # can fail on (or mis-encode) non-ASCII document text.
            with open(output_path, 'w', encoding='utf-8') as outfile:
                outfile.write(spacy_to_naf.NAF_to_string(NAF=root))

        return root
def text_to_naf(wiki_title,
                target_languages,
                text,
                wiki_uri,
                annotations,
                prefix,
                language,
                nlp,
                dct,
                output_folder=None,
                wiki_langlinks=None,
                verbose=0):
    """
    Convert a Wikipedia text to NAF with spaCy and add its hyperlinks as entities.

    :param str wiki_title: article title; used as NAF title and as output file name
    :param target_languages: collection of supported language codes
    :param str text: raw text of the article
    :param str wiki_uri: URI of the article
    :param annotations: hyperlink annotations, passed on to add_hyperlinks
    :param str prefix: passed on to add_hyperlinks
    :param str language: language code, has to be part of target_languages
    :param nlp: loaded spaCy model
    :param dct: document creation time
    :param output_folder: if provided, the NAF file will be written to
    output_folder/LANGUAGE/WIKI_TITLE.naf
    :param dict wiki_langlinks: passed on to add_hyperlinks
    (an empty dict is used when not provided)
    :param int verbose: 2 or higher reports skipped documents,
    3 or higher also reports where the NAF file was saved

    :return: the NAF object, or None if the text could not be converted
    :raises AssertionError: if language is not part of target_languages
    """
    assert language in target_languages, f'{language} not part of supported languages: {" ".join(target_languages)}'

    # a fresh dict per call instead of a shared mutable default argument
    if wiki_langlinks is None:
        wiki_langlinks = {}

    # parse with spaCy (add_mws is only switched on for English and Dutch)
    add_mw = language in {'en', 'nl'}

    try:
        naf = spacy_to_naf.text_to_NAF(text=text,
                                       nlp=nlp,
                                       dct=dct,
                                       layers={'raw', 'text', 'terms', 'deps'},
                                       naf_version='v3.1',
                                       title=wiki_title,
                                       uri=wiki_uri,
                                       language=language,
                                       add_mws=add_mw)

        # explicit check (not an assert) so it also runs under python -O
        if naf.find('raw').text != text:
            raise ValueError('mismatch between raw text JSON and NAF file')
    except Exception as err:
        # best effort: a document that cannot be converted is skipped,
        # but KeyboardInterrupt / SystemExit are no longer swallowed
        if verbose >= 2:
            print(f'skipping {wiki_title} ({language}): {err!r}')
        return None

    # add hyperlinks as entity elements
    add_hyperlinks(naf,
                   annotations,
                   prefix,
                   language,
                   dct,
                   wiki_langlinks=wiki_langlinks)

    # if wanted, write output to disk
    if output_folder is not None:
        lang_dir = os.path.join(output_folder, language)
        os.makedirs(lang_dir, exist_ok=True)
        output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
        spacy_to_naf.NAF_to_file(naf, output_path)

        # output_path only exists when something was written
        if verbose >= 3:
            print(f'saved to {output_path}')

    return naf
Esempio n. 3
0
def fileread():
    """
    Convert every ``.txt`` file in the ``rawTest`` folder to a NAF file.

    The folder is ``path + "rawTest"``; ``path`` and ``nlp`` are expected to
    be defined at module level. For each text file, the NAF output is
    written to the current working directory under the same file name with
    ``.txt`` replaced by ``.naf``.
    """
    input_dir = path + "rawTest"
    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue

        # read the whole document at once; the file is closed right away
        with open(os.path.join(input_dir, filename), "r", encoding="utf8") as orig:
            text = orig.read()

        # datetime.datetime.now() may just be datetime.now(), depending on
        # whether the module or the class was imported from datetime
        NAF = spacy_to_naf.text_to_NAF(
            text,
            nlp,
            dct=datetime.datetime.now(),
            layers={'raw', 'text', 'terms', 'entities', 'deps', 'chunks'})

        # only create the output file once the conversion has succeeded,
        # so a failure does not leave an empty .naf file behind
        with open(filename.replace(".txt", ".naf"), "w", encoding="utf8") as f:
            f.write(spacy_to_naf.NAF_to_string(NAF))
Esempio n. 4
0
def process_first_x_files(path_signalmedia_json,
                          path_newsreader_nafs='',
                          start=None,
                          end=None):
    """
    create generator of news items (signalmedia articles and their preprocessing)

    Each article is processed with spaCy (module-level ``nlp``). If
    path_newsreader_nafs is provided and contains ``IDENTIFIER.in.naf`` for
    the article, that parsed NAF file is added to the preprocessing as well.

    :param str path_signalmedia_json: path to all signalmedia article in jsonl
    (originally called signalmedia-1m.jsonl)
    :param str path_newsreader_nafs: path to where signalmedia processed
    with pipeline is stored in NAF
    :param int start: start line (1-based, inclusive), only used in
    combination with end; defaults to the first line
    :param int end: end line (1-based, inclusive); if not provided,
    all lines are processed

    :rtype: generator
    :return: generator of news_item namedtuples with the attributes
    signalmedia_json (the json object) and preprocessing
    (set of (name, NAF) tuples)
    """
    # end without start used to crash in range(None, ...)
    if end and start is None:
        start = 1

    news_item = namedtuple('news_item', ['signalmedia_json', 'preprocessing'])
    path_template = '{path_newsreader_nafs}/{identifier}.in.naf'

    with open(path_signalmedia_json, encoding='utf-8') as infile:
        for counter, line in enumerate(infile, 1):

            if end:
                # stop reading as soon as the requested range has passed;
                # this check has to come before the skip of earlier lines
                if counter > end:
                    break
                if counter < start:
                    continue

            article = json.loads(line)
            identifier = article['id']
            spacy_naf = spacy_to_naf.text_to_NAF(article['content'], nlp)
            the_preprocessing = {('spacy', spacy_naf)}

            if path_newsreader_nafs:
                path_newsreader_naf = path_template.format(
                    path_newsreader_nafs=path_newsreader_nafs,
                    identifier=identifier)
                if os.path.exists(path_newsreader_naf):
                    newsreader_naf = etree.parse(path_newsreader_naf)
                    the_preprocessing.add(('newsreader', newsreader_naf))

            yield news_item(signalmedia_json=article,
                            preprocessing=the_preprocessing)
Esempio n. 5
0
def text_to_naf(wiki_title,
                text,
                wiki_uri,
                annotations,
                prefix,
                language,
                nlp,
                dct,
                output_folder=None):
    """
    Convert a Wikipedia text to NAF with spaCy and add its hyperlinks as entities.

    :param str wiki_title: article title; used as NAF title and as output file name
    :param str text: raw text of the article
    :param str wiki_uri: URI of the article
    :param annotations: hyperlink annotations, passed on to add_hyperlinks
    :param str prefix: passed on to add_hyperlinks
    :param str language: supported: 'nl' | 'en' | 'it'
    :param nlp: loaded spaCy model
    :param dct: document creation time
    :param output_folder: if provided, the NAF file will be written
    (UTF-8 encoded) to output_folder/LANGUAGE/WIKI_TITLE.naf

    :return: the NAF object
    :raises AssertionError: if the language is not supported, or if the raw
    layer of the NAF object does not equal the input text
    """
    assert language in {
        'nl', 'en', 'it'
    }, f'{language} not part of supported languages: nl it en'

    # parse with spaCy
    naf = spacy_to_naf.text_to_NAF(text=text,
                                   nlp=nlp,
                                   dct=dct,
                                   layers={'raw', 'text', 'terms'},
                                   title=wiki_title,
                                   uri=wiki_uri,
                                   language=language)

    assert naf.find(
        'raw').text == text, f'mismatch between raw text JSON and NAF file'

    # add hyperlinks as entity elements
    add_hyperlinks(naf, annotations, prefix)

    # if wanted, write output to disk
    if output_folder is not None:
        # creates output_folder and the language subfolder in one go and
        # does not fail when they already exist
        lang_dir = os.path.join(output_folder, language)
        os.makedirs(lang_dir, exist_ok=True)

        output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
        # explicit encoding: do not depend on the platform default
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(spacy_to_naf.NAF_to_string(naf))

    return naf
Esempio n. 6
0
import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

# Load the small Dutch spaCy model.
nlp = spacy.load('nl_core_news_sm')

# Keyword settings of the conversion, collected in one place.
naf_options = {
    'dct': datetime.now(),
    'layers': {'raw', 'text', 'terms', 'deps'},
    'naf_version': 'v3.1',
    'language': 'nl',
    'layer_to_attributes_to_ignore': {'terms': {'morphofeat', 'type'}},
    'replace_hidden_characters': True,
    'dtd_validation': True,
}

# Convert one Dutch sentence and print the resulting NAF.
naf = text_to_NAF("Hij nam de kat aan.", nlp, **naf_options)
naf_as_string = NAF_to_string(naf)
print(naf_as_string)
Esempio n. 7
0
import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

# Load the small English spaCy model.
nlp = spacy.load('en_core_web_sm')

# Sentence to convert and the attributes to leave out of the terms layer.
sentence = 'He gave up.'
ignored_attributes = {'terms' : {'morphofeat', 'type'}}

naf = text_to_NAF(
    sentence,
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms'},
    naf_version='v3.1',
    layer_to_attributes_to_ignore=ignored_attributes,
    dtd_validation=True,
)

# Show the NAF document on standard output.
print(NAF_to_string(naf))
Esempio n. 8
0
import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

# Load the small English spaCy model.
nlp = spacy.load('en_core_web_sm')

# Two sentences separated by several newlines.
sample_text = 'Tom Cruise is an actor.\n\n\nHe likes to act.'

conversion_settings = {
    'dct': datetime.now(),
    'layers': {'raw', 'text', 'terms'},
    'naf_version': 'v4',
    'layer_to_attributes_to_ignore': {'terms': {'morphofeat', 'type'}},
    'replace_hidden_characters': True,
    # map UD pos to NAF pos
    'map_udpos2naf_pos': True,
}

naf = text_to_NAF(sample_text, nlp, **conversion_settings)

print(NAF_to_string(naf))
Esempio n. 9
0
import sys
sys.path.append('..')

import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from datetime import datetime

# Load the small English spaCy model.
nlp = spacy.load('en_core_web_sm')

sample_text = 'Tom Cruise is an actor.\n\n\nHe likes to act.'
tree = text_to_NAF(
    sample_text,
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms'},
    replace_hidden_characters=True,
    # map UD pos to NAF pos
    map_udpos2naf_pos=True,
)

root = tree.getroot()

# Use the existing entities layer, or create it when the document has none.
entities_layer = root.find('entities')
if entities_layer is None:
    entities_layer = etree.SubElement(root, "entities")

entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{
                                'reference':
Esempio n. 10
0
import sys
sys.path.append('..')

import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from datetime import datetime

# Load the small English spaCy model.
nlp = spacy.load('en_core_web_sm')

# Two short sentences; the deps layer is requested in addition to the basics.
sample_text = 'The man saw the bird. The woman gave the gift to the person.'
requested_layers = {'raw', 'text', 'terms', 'deps'}

NAF = text_to_NAF(sample_text,
                  nlp,
                  dct=datetime.now(),
                  layers=requested_layers,
                  replace_hidden_characters=True,
                  # map UD pos to NAF pos
                  map_udpos2naf_pos=True)

naf_as_string = NAF_to_string(NAF)
print(naf_as_string)
def run_spacy_on_wiki_text_and_add_hyperlinks(wiki_title,
                                              prefix,
                                              language,
                                              nlp,
                                              wiki_folder,
                                              wiki_uri2relative_path,
                                              dct,
                                              output_folder=None,
                                              verbose=0):
    """
    Load an extracted Wikipedia page, convert it to NAF with spaCy and
    add its hyperlinks as entity elements.

    :param str wiki_title: Wikipedia article title, e.g., "President van Frankrijk"
    :param str prefix: prefix of the Wikipedia URIs; the article URI is the
    prefix followed by the title with spaces replaced by underscores
    :param str language: supported: 'nl' | 'en' | 'it'
    :param nlp: loaded spaCy model, i.e., results of calling spacy.load('MODELNAME')
    :param str wiki_folder: path to where extracted Wikipedia output is stored, e.g, the folder "wiki",
    with subfolders for the output per language
    :param dict wiki_uri2relative_path: mapping from the encoded Wikipedia URI
    to a tuple (path relative to wiki_folder of a bz2 file, line number of the page in that file)
    :param datetime.datetime dct: document creation time, date of crawling for Wikipedia
    :param output_folder: if provided, the NAF file will be written
    (UTF-8 encoded) to output_folder/LANGUAGE/WIKI_TITLE.naf
    :param int verbose: 2 or higher prints the URI, written files and failures,
    3 or higher prints the outcome for every page

    :rtype: tuple
    :return: (succes, reason, naf), naf is None if the page was not extracted
    :raises AssertionError: if the language is not supported, the page is not
    found at the indexed line, or the raw layer does not equal the page text
    """
    succes = True
    reason = 'succes'
    naf = None

    assert language in {'nl', 'en', 'it'}, f'{language} not part of supported languages: nl it en'

    # try to retrieve JSON of Wikipedia article
    wiki_uri = f'{prefix}{wiki_title.replace(" ", "_")}'
    wiki_uri_encoded = urlencode_wikititle(wiki_title, prefix=prefix)

    if verbose >= 2:
        print(wiki_uri)

    if wiki_uri_encoded not in wiki_uri2relative_path:
        reason = 'page not extracted'
        succes = False
    else:
        relative_path, line_number = wiki_uri2relative_path[wiki_uri_encoded]
        path = os.path.join(wiki_folder, relative_path)

        # load wiki_page: one JSON object per line, stop at the indexed line
        wiki_page = {}
        with bz2.BZ2File(path, "r") as infile:
            for index, line in enumerate(infile):
                if index == line_number:
                    wiki_page = json.loads(line)
                    break

        assert wiki_page, f'index is wrong for {language} {wiki_title}'

        # parse with spaCy
        naf = spacy_to_naf.text_to_NAF(text=wiki_page['text'],
                                       nlp=nlp,
                                       dct=dct,
                                       layers={'raw', 'text', 'terms'},
                                       title=wiki_title,
                                       uri=wiki_uri,
                                       language=language)

        assert naf.find('raw').text == wiki_page['text'], f'mismatch between raw text JSON and NAF file'

        # add hyperlinks as entity elements
        add_hyperlinks(naf,
                       wiki_page['annotations'],
                       prefix,
                       verbose=verbose)

        # if wanted, write output to disk
        if output_folder is not None:
            # creates output_folder and the language subfolder in one go and
            # does not fail when they already exist
            lang_dir = os.path.join(output_folder, language)
            os.makedirs(lang_dir, exist_ok=True)

            output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
            # explicit encoding: do not depend on the platform default
            with open(output_path, 'w', encoding='utf-8') as outfile:
                outfile.write(spacy_to_naf.NAF_to_string(naf))
            if verbose >= 2:
                print(f'written {wiki_title} ({language}) to {output_path}')

    # report the outcome: always at verbose >= 3, at verbose 2 only failures
    message = f'succes:{succes} with reason: {reason} for {wiki_title} ({language})'
    if verbose >= 3 or (verbose == 2 and not succes):
        print(message)

    # return message whether it was succesful
    return succes, reason, naf
Esempio n. 12
0
import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from spacy_to_naf import time_in_correct_format
from spacy_to_naf import add_linguisticProcessors_el
from datetime import datetime

# Settings shared by the conversion and the header update below.
naf_version = 'v3.1'
modelname = 'Wikipedia hyperlinks'

# Load the small English spaCy model.
nlp = spacy.load('en_core_web_sm')
now = datetime.now()

sample_text = 'Tom Cruise is an actor.\n\n\nHe likes to act.'
tree = text_to_NAF(
    sample_text,
    nlp,
    dct=now,
    naf_version=naf_version,
    layers={'raw', 'text', 'terms'},
    replace_hidden_characters=True,
    # map UD pos to NAF pos
    map_udpos2naf_pos=False,
    dtd_validation=True,
)

root = tree.getroot()
naf_header = root.find('nafHeader')

# The same timestamp serves as start and end time of the processor entry.
time_as_string = time_in_correct_format(now)

add_linguisticProcessors_el(
    naf_header,
    layer='entities',
    start_time=time_as_string,
    end_time=time_as_string,
    modelname=modelname,
)
Esempio n. 13
0
import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

# Load the small Italian spaCy model.
# NOTE(review): the sample text below is English - confirm this is intended.
nlp = spacy.load('it_core_news_sm')

sample_text = 'Tom Cruise is an actor.\n\n\nHe likes to act.'
ignored_attributes = {'terms': {'morphofeat', 'type'}}

naf = text_to_NAF(sample_text,
                  nlp,
                  dct=datetime.now(),
                  layers={'raw', 'text', 'terms'},
                  naf_version='v3.1',
                  layer_to_attributes_to_ignore=ignored_attributes,
                  dtd_validation=True)