from spacy.tokens import Doc
from spacy.vocab import Vocab

def get_spans(word_seqs):
    vocab = Vocab()
    docs = [Doc(vocab, words=words) for words in word_seqs]
    return [doc[:] for doc in docs]
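
# A minimal usage sketch for get_spans (hypothetical input): each returned item
# is a Span covering its whole Doc.
spans = get_spans([["hello", "world"], ["spaCy", "is", "great"]])
for span in spans:
    print(len(span), span.text)
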
with open(data_dir + '/word_count.p', 'rb') as f:
    d = pickle.load(f)
sort_list = sorted(d.items(), key=lambda item: item[1], reverse=True)
# nlp = spacy.load('en_core_web_md')
# vector_data = {u"dog": np.random.uniform(-1, 1, (300,)),
#                u"cat": np.random.uniform(-1, 1, (300,)),
#                u"orange": np.random.uniform(-1, 1, (300,)),
#                u"it's": np.random.uniform(-1, 1, (300,))}
nlp_lg = spacy.load('en_core_web_md')
print(nlp_lg("i'm").vector)
emb_list = nlp_lg.tokenizer.pipe([s[0] for s in sort_list])
vec_list = [a.vector for a in emb_list]
nlp = spacy.blank('en')
# vocab = Vocab(strings=[u"hello", u"world"])
# nlp.vocab = Vocab([s[0] for s in sort_list])
nlp.vocab = Vocab(strings=[s[0] for s in sort_list])
# vector_data = sort_list.keys()
#
# nlp.vocab = Vocab()
# nlp.vocab.vectors.resize((int(4), int(300)))
# nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
# spacy.vocab.link_vectors_to_models(nlp.vocab)

# for word, vector in vector_data.items():
for word, vec in zip(sort_list, vec_list):
    # nlp.vocab.set_vector(word[0], np.random.uniform(-1, 1, (300,)))
    nlp.vocab.set_vector(word[0], vec)

# spacy.vocab.link_vectors_to_models(nlp.vocab)
# nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
nlp.to_disk(data_dir + '/spacy_tok_combined_30080')
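
# Hedged verification sketch: reload the pipeline saved above (path assumed from
# data_dir) and check that the most frequent word received a vector from en_core_web_md.
nlp_check = spacy.load(data_dir + '/spacy_tok_combined_30080')
probe = sort_list[0][0]
print(probe, nlp_check.vocab.has_vector(probe))
print(nlp_check.vocab.get_vector(probe)[:5])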
Example #3
def vocab():
    return Vocab()
def wp(name):
    return PyTT_WordPiecer.from_pretrained(Vocab(), pytt_name=name)
Example #5
path_to_db = "/media/norpheo/mySQL/db/ssorc"
# nlp_model = "en_core_web_sm_nertrained_v3"
nlp_model = "en_wa_v2"

path_to_mlgenome = os.path.join(path_to_db, "mlgenome", nlp_model)
if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

with open(os.path.join(path_to_mlgenome, "ml_acronyms.pickle"), "rb") as handle:
    acronyms = pickle.load(handle)

path_to_annotations = os.path.join(path_to_db, "annotations_version", nlp_model)

vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

window_size = 3

mentions = list()
unique_mentions = list()
um_set = dict()
ml_acronyms = dict()

lt = LoopTimer(update_after=100, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
Example #6
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab.strings) == len(strings)
Example #7
def test_oracle_dev_sentence(vocab, arc_eager):
    words_deps_heads = """
        Rolls-Royce nn Inc.
        Motor nn Inc.
        Cars nn Inc.
        Inc. nsubj said
        said ROOT said
        it nsubj expects
        expects ccomp said
        its poss sales
        U.S. nn sales
        sales nsubj steady
        to aux steady
        remain cop steady
        steady xcomp expects
        at prep steady
        about quantmod 1,200
        1,200 num cars
        cars pobj at
        in prep steady
        1990 pobj in
        . punct said
    """
    expected_transitions = [
        "S",  # Shift "Rolls-Royce"
        "S",  # Shift 'Motor'
        "S",  # Shift 'Cars'
        "L-nn",  # Attach 'Cars' to 'Inc.'
        "L-nn",  # Attach 'Motor' to 'Inc.'
        "L-nn",  # Attach 'Rolls-Royce' to 'Inc.'
        "S",  # Shift "Inc."
        "L-nsubj",  # Attach 'Inc.' to 'said'
        "S",  # Shift 'said'
        "S",  # Shift 'it'
        "L-nsubj",  # Attach 'it.' to 'expects'
        "R-ccomp",  # Attach 'expects' to 'said'
        "S",  # Shift 'its'
        "S",  # Shift 'U.S.'
        "L-nn",  # Attach 'U.S.' to 'sales'
        "L-poss",  # Attach 'its' to 'sales'
        "S",  # Shift 'sales'
        "S",  # Shift 'to'
        "S",  # Shift 'remain'
        "L-cop",  # Attach 'remain' to 'steady'
        "L-aux",  # Attach 'to' to 'steady'
        "L-nsubj",  # Attach 'sales' to 'steady'
        "R-xcomp",  # Attach 'steady' to 'expects'
        "R-prep",  # Attach 'at' to 'steady'
        "S",  # Shift 'about'
        "L-quantmod",  # Attach "about" to "1,200"
        "S",  # Shift "1,200"
        "L-num",  # Attach "1,200" to "cars"
        "R-pobj",  # Attach "cars" to "at"
        "D",  # Reduce "cars"
        "D",  # Reduce "at"
        "R-prep",  # Attach "in" to "steady"
        "R-pobj",  # Attach "1990" to "in"
        "D",  # Reduce "1990"
        "D",  # Reduce "in"
        "D",  # Reduce "steady"
        "D",  # Reduce "expects"
        "R-punct",  # Attach "." to "said"
        "D",  # Reduce "."
        "D",  # Reduce "said"
    ]

    gold_words = []
    gold_deps = []
    gold_heads = []
    for line in words_deps_heads.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        word, dep, head = line.split()
        gold_words.append(word)
        gold_deps.append(dep)
        gold_heads.append(head)
    gold_heads = [gold_words.index(head) for head in gold_heads]
    for dep in gold_deps:
        arc_eager.add_action(2, dep)  # Left
        arc_eager.add_action(3, dep)  # Right
    doc = Doc(Vocab(), words=gold_words)
    example = Example.from_dict(doc, {"heads": gold_heads, "deps": gold_deps})
    ae_oracle_actions = arc_eager.get_oracle_sequence(example, _debug=False)
    ae_oracle_actions = [arc_eager.get_class_name(i) for i in ae_oracle_actions]
    assert ae_oracle_actions == expected_transitions
def main(json_loc: Path,
         train_file: Path,
         dev_file: Path,
         test_file: Path,
         test_split=0.189,
         train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  # relations dropped because they are not a valid token combination (too far apart / not verb-based)
    error_count_rel = 0  # how often the relation label is something other than ARG0, ARG1 or ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  #one recipe
            span_starts = set()

            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]
                             ]  #list containing all words
                    spaces = [
                        t["ws"] for t in example["tokens"]
                    ]  # whether a whitespace follows each word (True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)

                    # Parse the entities
                    spans = example[
                        "spans"]  #list of dicts containing entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  #every detected span
                        entity = doc.char_span(
                            span["start"], span["end"], label=span["label"]
                        )  # "start"/"end" are character offsets of the span within the doc
                        span_end_to_start[span["token_end"]] = span[
                            "token_start"]  # map the span's end-token index to its start-token index
                        entities.append(entity)  #appended to list
                        span_starts.add(span["token_start"])  #added to set
                        ents_dict[span["token_start"]] = (span["label"],
                                                          span["token_start"])
                    doc.ents = entities  # entity list assigned as doc entities

                    # Parse the relations
                    rels = {}

                    # create token combinations
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:

                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}

                                            else:
                                                pass
                                        #DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                rels[(x1, x2)] = {
                                                }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        #VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:

                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}

                                    else:
                                        pass
                                #DIFF_FRONT_BACK 2b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        rels[(x1, x2)] = {
                                        }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...

                    relations = example[
                        "relations"]  #relations is list of dict
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span
                        # but we want the first token
                        start = span_end_to_start[relation[
                            "head"]]  # start-token index of the head span
                        end = span_end_to_start[relation[
                            "child"]]  # start-token index of the child span
                        label = relation["label"]

                        #DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][
                                        0]]  #assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1

                        #DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[
                                label]  #MAP_LABELS = dict containing label as key

                        # Positive relations are being added
                        try:
                            if label not in rels[(
                                    start, end
                            )]:  #check if label already exists for token combination
                                rels[(
                                    start, end
                                )][label] = 1.0  #initialize label as new key with value 1.0
                                pos += 1  #positive case
                        except KeyError:
                            # only raised if the relation exists in the annotation but is not
                            # a valid token combination (too far apart / not starting from a verb)
                            long_rel_count += 1

                    # The annotation is complete, so fill in zeros where the data is missing
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                #DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                            #DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0

                                        #DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                #DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                                #DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0  #span combination with label as key gets 0 as value
                        #VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        #DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                    #DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                                #DIFF_FRONT_BACK 3b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        #DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        #DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                    #print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}

                    # only keeping documents with at least 1 positive case (if doc isn't annotated relations = empty list)
                    if pos > 0:

                        recipe_id = example["_input_hash"]

                        if len(docs["train"]) < round(
                                train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(
                                test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg

                except KeyError as e:
                    msg.fail(
                        f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                    )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
    def __init__(
        self,
        vocab_path="",
        model_name="bert-base-cased",
        max_edit_dist=10,
        debug=False,
        performance=False,
    ):
        """To create an object for this class. It does not require any special

        Args:
            vocab_path (str, optional): Vocabulary file path to be used by the
                                         model. Defaults to "".
            model_name (str, optional): Pretrained BERT model name. Defaults to
                                        "bert-base-cased".
            max_edit_dist (int, optional): Maximum edit distance between two
                                           words. Defaults to 10.
            debug (bool, optional): This helps print logs as the data flows
                                     through the class. Defaults to False.
            performance (bool, optional): This is used to print the time taken
                                          by individual steps in spell check.
                                          Defaults to False.
        """
        if (
            not isinstance(vocab_path, str)
            or not isinstance(debug, type(True))
            or not isinstance(performance, type(True))
        ):
            raise TypeError(
                "Please check datatype provided. vocab_path should be str,"
                " debug and performance should be bool"
            )
        try:
            int(float(max_edit_dist))
        except ValueError:
            raise ValueError(
                f"cannot convert {max_edit_dist} to int. Please provide a "
                f"valid integer "
            )

        if vocab_path != "":
            try:
                # First open() for user specified word addition to vocab
                with open(vocab_path, encoding="utf8") as f:
                    # if want to remove '[unusedXX]' from vocab
                    # words = [
                    #     line.rstrip()
                    #     for line in f
                    #     if not line.startswith("[unused")
                    # ]
                    words = [line.strip() for line in f]

                # The code below adds the necessary words like numbers,
                # punctuations, and tokenizer-specific words like
                # [PAD]/[unused0]/##M
                current_path = os.path.dirname(__file__)
                vocab_path = os.path.join(current_path, "data", "vocab.txt")
                extra_token = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
                words.extend(extra_token)

                with open(vocab_path, encoding="utf8") as f:
                    # if want to remove '[unusedXX]' from vocab
                    # words = [
                    #     line.rstrip()
                    #     for line in f
                    #     if not line.startswith("[unused")
                    # ]
                    for line in f:
                        extra_token = line.strip()
                        if extra_token.startswith("[unused"):
                            words.append(extra_token)
                        elif extra_token.startswith("##"):
                            words.append(extra_token)
                        elif len(extra_token) == 1:
                            words.append(extra_token)
                if debug:
                    debug_file_path = os.path.join(
                        current_path, "tests", "debugFile.txt"
                    )
                    with open(debug_file_path, "w+") as new_file:
                        new_file.write("\n".join(words))
                    print("Final vocab at " + debug_file_path)

            except Exception as e:
                print(e)
                warnings.warn("Using default vocab")
                vocab_path = ""
                words = []

        self.max_edit_dist = int(float(max_edit_dist))
        self.model_name = model_name
        self.BertTokenizer = AutoTokenizer.from_pretrained(self.model_name)

        if vocab_path == "":
            words = list(self.BertTokenizer.get_vocab().keys())
        self.vocab = Vocab(strings=words)
        logging.getLogger("transformers").setLevel(logging.ERROR)
        self.BertModel = AutoModelForMaskedLM.from_pretrained(self.model_name)
        self.mask = self.BertTokenizer.mask_token
        self.debug = debug
        self.performance = performance
        if not Doc.has_extension("contextual_spellCheck"):
            Doc.set_extension("contextual_spellCheck", default=True)
            Doc.set_extension("performed_spellCheck", default=False)

            Doc.set_extension("suggestions_spellCheck", default={})
            Doc.set_extension("outcome_spellCheck", default="")
            Doc.set_extension("score_spellCheck", default=None)

            Span.set_extension(
                "get_has_spellCheck", getter=self.span_require_spell_check
            )
            Span.set_extension(
                "score_spellCheck", getter=self.span_score_spell_check
            )

            Token.set_extension(
                "get_require_spellCheck", getter=self.token_require_spell_check
            )
            Token.set_extension(
                "get_suggestion_spellCheck",
                getter=self.token_suggestion_spell_check,
            )
            Token.set_extension(
                "score_spellCheck", getter=self.token_score_spell_check
            )
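
# A hedged construction sketch, assuming the __init__ above belongs to
# contextualSpellCheck's ContextualSpellCheck class (the class name is not shown
# in this snippet); it only touches attributes the constructor actually sets.
from contextualSpellCheck import ContextualSpellCheck  # assumed import path

checker = ContextualSpellCheck(model_name="bert-base-cased", max_edit_dist=3)
print(checker.max_edit_dist)      # 3
print(checker.mask)               # the BERT mask token, e.g. "[MASK]"
print("coffee" in checker.vocab)  # membership check against the spaCy Vocab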
Example #10
def test_issue600():
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    doc = Doc(vocab, words=["hello"])
    doc[0].tag_ = "NN"
Example #11
def test_issue743():
    doc = Doc(Vocab(), ["hello", "world"])
    token = doc[0]
    s = set([token])
    items = list(s)
    assert items[0] is token
Example #12
def test_issue589():
    vocab = Vocab()
    vocab.strings.set_frozen(True)
    doc = Doc(vocab, words=["whata"])
    assert doc
    def __init__(self, vocab_path="", debug=False, performance=False):
        """To create an object for this class. It does not require any special 

        Args:
            vocab_path (str, optional): Vocabulary file path to be used by the model . Defaults to "".
            debug (bool, optional): This helps print logs as the data flows through the class. Defaults to False.
            performance (bool, optional): This is used to print the time taken by individual steps in spell check. Defaults to False.
        """
        if ((type(vocab_path) != type("")) or (type(debug) != type(True))
                or (type(performance) != type(True))):
            raise TypeError(
                "Please check datatype provided. vocab_path should be str, debug and performance should be bool"
            )

        if vocab_path != "":
            try:
                # First open() for user specified word addition to vocab
                with open(vocab_path, encoding="utf8") as f:
                    # if want to remove '[unusedXX]' from vocab
                    # words = [line.rstrip() for line in f if not line.startswith('[unused')]
                    words = [line.strip() for line in f]

                # The code below adds the necessary words like numbers/punctuations/tokenizer-specific words like [PAD]/[unused0]/##M
                currentPath = os.path.dirname(__file__)
                vocab_path = os.path.join(currentPath, "data", "vocab.txt")
                extraToken = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
                words.extend(extraToken)

                with open(vocab_path, encoding="utf8") as f:
                    # if want to remove '[unusedXX]' from vocab
                    # words = [line.rstrip() for line in f if not line.startswith('[unused')]
                    for line in f:
                        extraToken = line.strip()
                        if extraToken.startswith("[unused"):
                            words.append(extraToken)
                        elif extraToken.startswith("##"):
                            words.append(extraToken)
                        elif len(extraToken) == 1:
                            words.append(extraToken)
                if debug:
                    debugFilePath = os.path.join(currentPath, "tests",
                                                 "debugFile.txt")
                    with open(debugFilePath, "w+") as newFile:
                        newFile.write("\n".join(words))
                    print("Final vocab at " + debugFilePath)

            except Exception as e:
                print(e)
                warnings.warn("Using default vocab")
                vocab_path = ""
                words = []

        if vocab_path == "":
            currentPath = os.path.dirname(__file__)
            vocab_path = os.path.join(currentPath, "data/vocab.txt")
            with open(vocab_path, encoding="utf8") as f:
                # if want to remove '[unusedXX]' from vocab
                # words = [line.rstrip() for line in f if not line.startswith('[unused')]
                words = [line.strip() for line in f]

        self.vocab = Vocab(strings=words)
        self.BertTokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        self.BertModel = AutoModelWithLMHead.from_pretrained("bert-base-cased")
        self.mask = self.BertTokenizer.mask_token
        self.debug = debug
        self.performance = performance
        if not Doc.has_extension("contextual_spellCheck"):
            Doc.set_extension("contextual_spellCheck", default=True)
            Doc.set_extension("performed_spellCheck", default=False)

            # {originalToken-1:[suggestedToken-1,suggestedToken-2,..],
            #  originalToken-2:[...]}
            Doc.set_extension("suggestions_spellCheck", default={})
            Doc.set_extension("outcome_spellCheck", default="")
            Doc.set_extension("score_spellCheck", default=None)

            Span.set_extension("get_has_spellCheck",
                               getter=self.span_require_spellCheck)
            Span.set_extension("score_spellCheck",
                               getter=self.span_score_spellCheck)

            Token.set_extension("get_require_spellCheck",
                                getter=self.token_require_spellCheck)
            Token.set_extension("get_suggestion_spellCheck",
                                getter=self.token_suggestion_spellCheck)
            Token.set_extension("score_spellCheck",
                                getter=self.token_score_spellCheck)
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == len(words) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # Test that IOB stays consistent with provided IOB
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    with doc.retokenize() as retokenizer:
        attrs = {"ent_type": "ent-abc", "ent_iob": 1}
        retokenizer.merge(doc[0:3], attrs=attrs)
        retokenizer.merge(doc[3:5], attrs=attrs)
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # if no parse/heads, the first word in the span is the root and provides
    # default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-fg"
    ents[6] = "I-ent-fg"
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
    ents = ["O"] * len(words)
    ents[3] = "B-ent-de"
    ents[4] = "I-ent-de"
    ents[5] = "B-ent-de"
    ents[6] = "I-ent-de"
    deps = ["dep"] * len(words)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab(vectors_name="test_issue1807")
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
    assert "hello" in vocab
Example #16
from spacy.language import Language

from whatlies.language import SpacyLanguage
from whatlies.transformers import (
    Transformer,
    Umap,
    Pca,
    Noise,
    AddRandom,
    Tsne,
    OpenTsne,
    Ivis,
    Normalizer,
)

vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = list(vocab.strings)
lang = SpacyLanguage(nlp=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]

transformers = [
    Umap(2),
    Umap(3),
    Pca(2),
    Pca(3),
    Noise(0.1),
    Noise(0.01),
    AddRandom(n=4),
    AddRandom(n=1),
    lambda d: d | (d["man"] - d["woman"]),
    Tsne(2, n_iter=250),
def test_issue1967(label):
    ner = EntityRecognizer(Vocab())
    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)
Example #18
# Similarity of two documents
print(doc1, '<->', doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, '<->', burgers, french_fries.similarity(burgers))
print()
doc = nlp('I love coffee')
print(doc.vocab.strings['coffee'])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'
print()
doc = nlp('I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_,
          lexeme.suffix_, lexeme.is_alpha, lexeme.is_digit, lexeme.is_title,
          lexeme.lang_)
print()
doc = nlp("I love coffee")  # Original Doc
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

empty_doc = Doc(Vocab())  # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(

empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
# displacy.serve(doc, style='ent')
Example #19
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
    vocab_bytes = en_vocab.to_bytes(exclude=["lookups"])
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings[text_hash] == text
    assert new_vocab.to_bytes(exclude=["lookups"]) == vocab_bytes
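
# A hedged companion sketch: the disk-based counterpart of the byte round trip
# above (Vocab.to_disk / Vocab.from_disk, also used elsewhere in this collection).
# The temporary directory is purely illustrative.
import tempfile
from pathlib import Path

from spacy.vocab import Vocab

with tempfile.TemporaryDirectory() as tmp_dir:
    vocab = Vocab(strings=["coffee", "kitten"])
    vocab_path = Path(tmp_dir) / "vocab"
    vocab.to_disk(vocab_path)
    reloaded = Vocab().from_disk(vocab_path)
    assert "coffee" in reloaded.strings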
Example #20
def test_Example_from_dict_with_morphology(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for i, token in enumerate(example.reference):
        assert str(token.morph) == annots["morphs"][i]
Example #21
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
Example #22
def test_Example_from_dict_basic():
    example = Example.from_dict(Doc(Vocab(), words=["hello", "world"]),
                                {"words": ["hello", "world"]})
    assert isinstance(example.x, Doc)
    assert isinstance(example.y, Doc)
Example #23
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages.")

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids,
                    freq_list=freqs,
                    vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7
                       ],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
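
# A hedged follow-up sketch: query the reloaded knowledge base for the alias
# registered above (the spaCy v2 KnowledgeBase.get_candidates API is assumed).
candidates = kb2.get_candidates("Russ Cochran")
for cand in candidates:
    # each candidate pairs an entity ID with its prior probability for this alias
    print(cand.entity_, cand.prior_prob)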
Example #24
def test_Example_from_dict_with_entities_invalid(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.warns(UserWarning):
        example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.ents)) == 0
def component(vocab):
    return Transformer(Vocab(), DummyTransformer())
Example #26
def test_Example_from_dict_with_entities_overlapping(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
Example #27
def vocab():
    return Vocab(lex_attr_getters={NORM: lambda s: s})
Example #28
def test_Example_from_dict_with_links_invalid(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "entity_linker" not in nlp.pipe_names:
        entity_linker = nlp.create_pipe("entity_linker")
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
    else:
        entity_linker = nlp.get_pipe("entity_linker")
        kb = entity_linker.kb

    # make sure the annotated examples correspond to known identifiers in the knowledge base
    kb_ids = kb.get_entity_strings()
    for text, annotation in TRAIN_DATA:
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id,
                          "from training because it is not in the KB.")
            annotation["links"][offset] = new_dict

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
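
# A hedged invocation sketch for the entity-linker main() above, pointing at the
# KB and vocab directories written by the KB-creation script earlier in this
# collection (paths are hypothetical).
main(
    kb_path="output/kb",
    vocab_path="output/vocab",
    output_dir="output/nel_model",
    n_iter=50,
)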