def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 **kwargs):
        """Initialize the reader with a spaCy pipeline and I/O options.

        @param nlp: spaCy Language model (required; validated below)
        @param support_overlap: whether need to support overlapped annotations
        @param log_level: logging level configuration
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param kwargs: other parameters, attached verbatim as instance attributes
        """
        # Attach extra configuration first so subclasses can rely on these
        # attributes inside overridden helpers (e.g. set_logger).
        for param_name, value in kwargs.items():
            setattr(self, param_name, value)
        # The pipeline is the only mandatory argument; fail fast without it.
        # (NameError kept for backward compatibility with existing callers.)
        if nlp is None:
            raise NameError('parameter "nlp" need to be defined')
        self.nlp = nlp
        self.encoding = encoding
        self.doc_name_depth = doc_name_depth
        self.support_overlap = support_overlap
        self.set_logger(log_level)
        # Register the extension once; has_extension guards repeat construction.
        if not Doc.has_extension('doc_name'):
            Doc.set_extension('doc_name', default='')
Exemple #2
0
    def handle(self, *args, **options):
        """Run a blocking spaCy NER socket server.

        Loads the custom model, registers the Doc/Span extensions it needs,
        then loops forever: accepts clients, reads text payloads, runs NER and
        sends the extracted entities back as JSON terminated by '--end--'.
        """
        spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER, disable=['parser', 'tagger', 'textcat'])
        Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
        Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
        Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
        Doc.set_extension('_lines', default=list())

        logger.debug("Loaded spacy server")
        main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
        while True:
            readable, writeable, exceptions = select(read_socks, write_socks, [])
            for sockobj in readable:
                if sockobj in main_socks:
                    # A listening socket is readable: a new client is connecting.
                    new_sock, address = sockobj.accept()
                    logger.debug('Connect: %s - %s', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        entities = []
                        data = recv_end(sockobj)
                        if not data:
                            # Empty payload: peer closed the connection.
                            sockobj.close()
                            read_socks.remove(sockobj)
                        else:
                            for doc in spacy_model.pipe([data]):
                                # Newline offsets back the line_number getter.
                                doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                                for ent in doc.ents:
                                    current_entity = self.get_ent(ent)
                                    if current_entity:
                                        entities.append(current_entity)

                            sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                    except Exception:
                        # Keep the server loop alive on per-client failures, but
                        # log them; the original bare `except: pass` silently
                        # discarded every error (including KeyboardInterrupt).
                        logger.exception('error while handling client request')
    def __init__(self):
        """Idempotently register the Doc/Token extensions this component fills."""
        super().__init__()

        # Per-token lexicality flag consumed downstream.
        if not Token.has_extension('is_lexical'):
            Token.set_extension('is_lexical', default=False)

        # Per-document result list, keyed by the component's name.
        if not Doc.has_extension(self.name):
            Doc.set_extension(self.name, default=[])
Exemple #4
0
def add_span_extensions():
    """Register the Doc and Span custom attributes used by this corpus loader."""
    for doc_attr in ("relations", "entities"):
        Doc.set_extension(doc_attr, default=None)
    span_attrs = (
        'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
        'hansardParent', 'snomedct', 'synonyms',
    )
    for span_attr in span_attrs:
        Span.set_extension(span_attr, default=None)
Exemple #5
0
    def __init__(self, clf, extension='score'):
        """Store a classifier and expose its output through a Doc extension.

        :type clf: Classifier, needs to have a predict(X) function
        :param extension: name of the Doc attribute that will hold the score
        """
        self.clf = clf
        self.extension = extension
        # -1 is the "not scored yet" sentinel; register only on first use.
        if Doc.has_extension(extension):
            return
        Doc.set_extension(extension, default=-1)
Exemple #6
0
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
        """Resolve extension names from class constants and register them once."""
        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        # Token-level annotation label; defaults to "no annotation".
        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)
        # Span-level computed property: whether the span is a full name.
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name, getter=self.is_full_name_getter)
        # Doc-level accumulator for matches.
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Exemple #7
0
    def __init__(self, links, **kwargs):
        """Spider initializer: seed the start URLs, load the spaCy model,
        register Doc/Span extensions and connect to the spaCy TCP service.

        @param links: a single start URL appended to ``start_urls``
            (despite the plural name, one value is appended — TODO confirm)
        @param kwargs: forwarded unchanged to the parent spider class
        """
        self.start_urls.append(links)

        # Imported lazily so the module can load without spaCy installed.
        import spacy
        from spacy.tokens.doc import Doc
        from spacy.tokens.span import Span

        self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
        Span.set_extension('line_number',
                           getter=TagLinkSpider.line_number_getter,
                           force=True)
        Doc.set_extension('lines',
                          getter=TagLinkSpider.get_lines,
                          setter=TagLinkSpider.set_lines)
        # Backing storage for the 'lines' property registered above.
        Doc.set_extension('_lines', default=list())

        # Persistent connection to the spaCy tagging server.
        self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        connect(self.soc_spacy, '', settings.SPACY_PORT)

        super().__init__(**kwargs)
Exemple #8
0
    def __init__(self):
        """Register Doc extensions for TAALED lexical analysis and load the
        TAALED word lists shipped under ``Corpora/``.
        """
        if not Doc.has_extension('taaled_lemmas'):
            Doc.set_extension('taaled_lemmas', default=[])

        if not Doc.has_extension('context_tokens'):
            Doc.set_extension('context_tokens', default=[])

        if not Doc.has_extension('function_tokens'):
            Doc.set_extension('function_tokens', default=[])

        # Load TAALED word list files
        # source: https://github.com/kristopherkyle/TAALED/tree/master/TAALED_1_3_1_Py3/dep_files
        module_path = os.path.abspath(os.path.dirname(__file__))
        adj_lem_list_path = os.path.join(module_path,
                                         "Corpora/adj_lem_list.txt")
        real_words_path = os.path.join(module_path, "Corpora/real_words.txt")

        # Use context managers so the handles are closed deterministically —
        # the original called read() on anonymous handles and leaked them.
        # Files are newline-separated; [:-1] drops the empty trailing entry
        # produced by the final newline.
        with open(adj_lem_list_path, "r", errors='ignore') as word_file:
            self.adj_word_list = word_file.read().split("\n")[:-1]
        with open(real_words_path, "r", errors='ignore') as word_file:
            self.real_word_list = word_file.read().split("\n")[:-1]
Exemple #9
0
 def init_component(self):
     """Idempotently register the Doc extensions this component provides."""
     register = Doc.set_extension
     if not Doc.has_extension("extract_keywords"):
         # Exposed as a method: doc._.extract_keywords(...)
         register("extract_keywords", method=self.extract_keywords)
     if not Doc.has_extension("kw_candidates"):
         # Cache slot for candidate keywords, filled on demand.
         register("kw_candidates", default=None)
Exemple #10
0
    """mapping of char offset to token index; token whitespace included; faster than approch in documentation"""
    d = {tok.idx: i for i, tok in enumerate(spacy_doc)}
    i = 0
    for idx in range(spacy_doc[-1].idx + len(spacy_doc[-1].text) + 1):
        if idx in d:
            i = d[idx]
        else:
            d[idx] = i
    return d


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Doc extensions to set mapping of chr offsets ("idx") to token index ("ti")
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Backing attribute; populated lazily by get_idx_to_ti_map() below.
Doc.set_extension('_idx_to_ti_map', default=None)


def set_idx_to_ti_map(doc):
    """Compute the char-offset -> token-index map and cache it on *doc*."""
    mapping = _chr2tok(doc)
    doc._._idx_to_ti_map = mapping


def get_idx_to_ti_map(doc):
    """Return the char-offset -> token-index map, building it on first access."""
    cached = doc._._idx_to_ti_map
    if cached is not None:
        return cached
    set_idx_to_ti_map(doc)
    return doc._._idx_to_ti_map


# Public read-only view; the getter builds the map lazily via the cache above.
Doc.set_extension('idx_to_ti_map', getter=get_idx_to_ti_map)

    def __init__(self):
        """Register this component's Doc extensions (current and legacy names)."""
        for attr in (self.name, self.name + '_legacy'):
            if not Doc.has_extension(attr):
                Doc.set_extension(attr, default=[])
Exemple #12
0
 def __init__(self):
     """Ensure the Doc-level 'features' mapping extension exists."""
     if Doc.has_extension("features"):
         return
     # NOTE(review): a default=OrderedDict() instance is shared by every Doc
     # that never assigns its own value — confirm callers always overwrite it.
     Doc.set_extension("features", default=OrderedDict())
Exemple #13
0
from typing import Dict, List
from allennlp.data import Batch
from dygie.models.dygie import DyGIE
from dygie.data.dataset_readers.dygie import DyGIEReader
from allennlp.models.archival import load_archive
from allennlp.nn import util
from spacy.language import Language
from spacy.tokens import Span
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span

# Relation/event extensions written by prepare_spacy_doc(); force=True lets the
# module be re-imported without raising on re-registration.
Doc.set_extension("rels", default=[], force=True)
Span.set_extension("rels", default=[], force=True)
Doc.set_extension("span_ents", default=[], force=True)
# NOTE(review): 'label_' mirrors spaCy's built-in Span.label_ name — confirm
# this shadowing is intentional.
Span.set_extension("label_", default=[], force=True)
Doc.set_extension("events", default=[], force=True)
Span.set_extension("events", default=[], force=True)


def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    doc_rels = []
    doc_evs = []
    # store events as relations. include confidence scores in the relation tuple (TODO: add relation property)
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                #example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg1s = [r for r in ev if r[2] == "ARG1"]
    def __init__(self, paths=None):
        """Register per-corpus Doc extensions and locate the corpus files.

        paths: optional list of strings, each a path to one corpus, in the
        order NGSL, NAWL, BSL, TSL, COCA Academic, COCA Technical,
        COCA General. When omitted, the bundled files under Corpora/ are used.
        """

        super().__init__()
        # self.name acts as the sentinel: if it is registered, all of the
        # per-corpus extensions below were registered with it.
        if not Doc.has_extension(self.name):
            Doc.set_extension(self.name, default=[])
            for ext in ('ngsl_words', 'nawl_words', 'tsl_words', 'fpc_words',
                        'cocaacad_words', 'cocatech_words',
                        'cocagenband1_words', 'cocagenband2_words',
                        'cocagenband3_words'):
                Doc.set_extension(ext, default=[])

        if paths is None:
            # Default corpora shipped alongside this module.
            here = os.path.dirname(__file__)
            self.fnameNGSL = os.path.join(
                here, 'Corpora/NGSL+1.01+by+band - Frequency.csv')
            self.fnameNAWL = os.path.join(here, 'Corpora/NAWL_SFI.csv')
            self.fnameBSL = os.path.join(
                here, 'Corpora/BSL_1.01_SFI_freq_bands.csv')
            self.fnameTSL = os.path.join(
                here, 'Corpora/TSL+1.1+Ranked+by+Frequency - TSL.csv')
            self.fnameCOCAAcad = os.path.join(here, 'Corpora/COCA Academic.csv')
            self.fnameCOCATech = os.path.join(here, 'Corpora/COCA Technical.csv')
            self.fnameCOCAGen = os.path.join(here, 'Corpora/COCA General.csv')
        else:
            # Caller-supplied corpus locations, positional.
            self.fnameNGSL = paths[0]
            self.fnameNAWL = paths[1]
            self.fnameBSL = paths[2]
            self.fnameTSL = paths[3]
            self.fnameCOCAAcad = paths[4]
            self.fnameCOCATech = paths[5]
            self.fnameCOCAGen = paths[6]

        ## Taken by Vishal's code.
        # Corpus token totals used elsewhere for frequency normalization
        # — TODO confirm against the corpus documentation.
        self.NGSLTotal = 273613534
        self.NAWLTotal = 288176225
        self.TSLTotal = 1560194
        self.BSLTotal = 64651722
        self.COCAAcadTotal = 120032441

        # Parse the corpus files, then load the pipeline used for analysis.
        self.read_corpora()
        self.nlp = spacy.load("en_core_web_sm")
Exemple #15
0
from typing import Tuple, List, Iterable, Optional, Dict, Callable, Any

from spacy.scorer import PRFScore
from thinc.types import Floats2d
import numpy
from spacy.training.example import Example
from thinc.api import Model, Optimizer
from spacy.tokens.doc import Doc
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.vocab import Vocab
from spacy import Language
from thinc.model import set_dropout_rate
from wasabi import Printer


# Relation store written by the relation extractor; force=True allows re-import.
Doc.set_extension("rel", default={}, force=True)
# Console printer for status messages.
msg = Printer()


@Language.factory(
    "relation_extractor",
    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
    assigns=["doc._.rel"],
    default_score_weights={
        "rel_micro_p": None,
        "rel_micro_r": None,
        "rel_micro_f": None,
    },
)
def make_relation_extractor(
    nlp: Language, name: str, model: Model, *, threshold: float
Exemple #16
0
        )

        if type == 'method':
            target.set_extension(name, method=func, force=force)
        if type == 'property':
            if create_attribute:
                logger.trace(f"Creating attribute '_{name}'")
                target.set_extension("_" + name, default=default, force=force)
            target.set_extension(name, getter=func, force=force, setter=setter)
        return func

    return inner


# ATTRIBUTES
# Document identifier; `force` is a module-level flag defined outside this view.
Doc.set_extension('id', default=None, force=force)


# PROPERTIES
@extend(Doc, 'property', create_attribute=True)
def token_map(self: Doc):
    """Map character offsets to token indices, cached on the Doc.

    Entry k of the returned list is the index of the token covering
    character k; a token's trailing whitespace (if any) is attributed
    to that token.
    """
    # TODO: another candidate for porting to faster code
    if not self._._token_map:
        mapping = []
        for tok_idx, tok in enumerate(self):
            # Each token contributes one entry per character, plus one for
            # its trailing whitespace when present.
            width = len(tok) + (1 if tok.whitespace_ else 0)
            mapping.extend([tok_idx] * width)
        self._._token_map = mapping
    return self._._token_map