Example no. 1
import csv

from flair.data import Sentence


def samples_generator_sorted(path, max_text_length=10000):
    data = []
    with open(path, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row)

    MAX = max_text_length
    datas = sorted(data, key=lambda x: len(x[3]), reverse=True)
    print('Longest text', len(datas[0][3]))
    for row in datas:

        id = row[0]
        print(id)
        text_id = row[1]
        sequence = row[2]
        text = row[3]

        if len(text) > MAX:
            for fragment in split_long_text(text, MAX):
                s = Sentence(fragment, use_tokenizer='toki')
                s.id = id
                s.text_id = text_id
                s.sequence = sequence
                s.ner = []
                s.length = len(fragment)

                yield s

        else:
            s = Sentence(text, use_tokenizer='toki')
            s.id = id
            s.text_id = text_id
            s.sequence = sequence
            s.ner = []
            s.length = len(text)

            yield s
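The generator above calls a split_long_text helper that is not part of this snippet. A minimal sketch, assuming simple fixed-size character chunking up to max_length characters, might look like this:

# Hypothetical helper assumed by samples_generator_sorted; the real
# implementation may split on sentence boundaries instead of raw characters.
def split_long_text(text, max_length):
    for start in range(0, len(text), max_length):
        yield text[start:start + max_length]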
Example no. 2
from flair.data import Sentence
from flair.models import SequenceTagger

# load the model you trained
model = SequenceTagger.load(
    r'C:\Projects\SAKI_NLP\models/flair_best-model_33.pt')

sent = "Afreen Jamadar\nActive member of IIIT Committee in Third year\n\nSangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6\n\nI wish to use my knowledge, skills and conceptual understanding to create excellent team\nenvironments and work consistently achieving organization objectives believes in taking initiative\nand work to excellence in my work.\n\nWORK EXPERIENCE\n\nActive member of IIIT Committee in Third year\n\nCisco Networking -  Kanpur, Uttar Pradesh\n\norganized by Techkriti IIT Kanpur and Azure Skynet.\nPERSONALLITY TRAITS:\n\u2022 Quick learning ability\n\u2022 hard working\n\nEDUCATION\n\nPG-DAC\n\nCDAC ACTS\n\n2017\n\nBachelor of Engg in Information Technology\n\nShivaji University Kolhapur -  Kolhapur, Maharashtra\n\n2016\n\nSKILLS\n\nDatabase (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n\u2022 Programming Languages: C, C++, Java, .net, php.\n\u2022 Web Designing: HTML, XML\n\u2022 Operating Systems: Windows [\u2026] Windows Server 2003, Linux.\n\u2022 Database: MS Access, MS SQL Server 2008, Oracle 10g, MySql.\n\n"

# create example sentence
sentence = Sentence(sent)

# predict tags and print
model.predict(sentence)

print(sentence.to_tagged_string())
from flair.datasets import WIKINER_ENGLISH
x = WIKINER_ENGLISH()
Example no. 3
    def similarities(self, mask_entity=False):
        tp = 0
        fp = 0
        tps = list()
        fps = list()
        prediction = list()
        num_candidates = list()
        for sentence in tqdm.tqdm(self.test):
            is_mentioned = [token for token in sentence if token[2] != "-"]
            if not is_mentioned:
                continue
            if is_mentioned:
                persons = self.get_persons(sentence)
                mention_vectors = list(
                    self._vectorize(
                        sentence,
                        persons,
                        return_id=True,
                        return_type=True,
                        return_str=True,
                        mask_entity=mask_entity,
                    )
                )
                for identifier, type_, mention, mention_vector in mention_vectors:
                    TOP3 = dict()
                    max_score = 0.0
                    best_candidate = None
                    best_context = None
                    best_sent = None
                    if type_ == "ORG":
                        is_org = True
                    else:
                        is_org = False
                    candidates = self._get_candidates(mention, is_org)
                    num_candidates.append(len(candidates))
                    for candidate in candidates:
                        for context in self.kb[candidate]["MENTIONS"]:
                            if self.kb[candidate].get("DESCRIPTION"):
                                t = list(utils.tokenize(context))
                                t.extend(
                                    list(
                                        utils.tokenize(
                                            self.kb[candidate].get("DESCRIPTION")
                                        )
                                    )
                                )
                                text = " ".join(t)
                            else:
                                t = list(utils.tokenize(context))
                                text = " ".join(t)

                            indices = list(range(len(list(utils.tokenize(context)))))
                            sentence_ = Sentence(text, use_tokenizer=False)
                            if isinstance(EMBEDDING, EntityEmbeddings):
                                EMBEDDING.embed(sentence_, [indices])
                                candidate_vector = (
                                    sentence_.embedding.detach().numpy().reshape(1, -1)
                                )
                            else:
                                EMBEDDING.embed(sentence_)
                                vector = sentence_[indices[0]].get_embedding().numpy()
                                for i in indices[1:]:
                                    vector = (
                                        vector + sentence_[i].get_embedding().numpy()
                                    )
                                candidate_vector = (vector / len(indices)).reshape(
                                    1, -1
                                )

                            score = cosine_similarity(mention_vector, candidate_vector)[
                                0
                            ][0]
                            TOP3[
                                f"pred: {context} ({candidate}) vs. gold: {mention} ({identifier})"
                            ] = float(score)
                            if score > max_score:
                                max_score = score
                                best_candidate = candidate
                                best_context = context
                                best_sent = text

                    prediction.append(
                        {
                            "pred": best_candidate,
                            "gold": identifier,
                            "top3": [
                                {key: value}
                                for key, value in Counter(TOP3).most_common(5)
                            ],
                        }
                    )
                    if best_candidate == identifier:
                        tp += 1
                        tps.append(
                            {
                                "true": mention,
                                "pred": best_context,
                                "true_id": identifier,
                                "pred_id": best_candidate,
                                "score": float(max_score),
                                "sentence": " ".join([token[0] for token in sentence]),
                                "context": " ".join([token[0] for token in best_sent]),
                            }
                        )
                    else:
                        fp += 1
                        if best_sent:
                            fps.append(
                                {
                                    "true": mention,
                                    "pred": best_context,
                                    "true_id": identifier,
                                    "pred_id": best_candidate,
                                    "score": float(max_score),
                                    "sentence": " ".join(
                                        [token[0] for token in sentence]
                                    ),
                                    "context": " ".join(
                                        [token[0] for token in best_sent]
                                    ),
                                }
                            )
        with open("fps-tps.json", "w", encoding="utf-8") as f:
            json.dump({"tps": tps, "fps": fps}, f, ensure_ascii=False, indent=4)
        with open("scores.json", "w", encoding="utf-8") as f:
            json.dump(
                {
                    "accuracy": self.accuracy(tp, fp),
                    "precision": self.precision(tp, fp),
                    "num_candidates": statistics.mean(num_candidates),
                    "embedding": "language-models/presse/multi",
                },
                f,
                indent=4,
                ensure_ascii=False,
            )
        with open("prediction.json", "w", encoding="utf-8") as f:
            json.dump(prediction, f)
        return {
            "accuracy": self.accuracy(tp, fp),
            "precision": self.precision(tp, fp),
            "num_candidates": statistics.mean(num_candidates),
            "embedding": "language-models/presse/multi",
        }
Example no. 4
def get_tag_sentence(model,query):
    sentence = Sentence(query)
    model.predict(sentence)
    return sentence
    df["Sentence_num"] = new_sentence_nums


fill_sentence_numbers(df)

sentences = df.groupby('Sentence_num').apply(lambda row: " ".join(row["Word"]))

from flair.models import SequenceTagger
from flair.data import Sentence

model = SequenceTagger.load('final-model.pt')

tagged_sentences = []
# create example sentence
for sentence_string in sentences:
    sentence = Sentence(text=sentence_string, use_tokenizer=False)

    # predict
    model.predict(sentence)
    tagged_sentences.append(sentence.to_tagged_string())

import re
ner_regex = re.compile('^<[BI]-.+>')

# set all predictions to O
df["Predicted"] = "O"

row_index = 0
for tagged_sentence in tagged_sentences:
    sentence_tokens = tagged_sentence.split(" ")
    for token in sentence_tokens:
    def __init__(self,
                 path_to_conll_file: Union[str, Path],
                 in_memory: bool = True):
        """
        Instantiates a column dataset in CoNLL-U format.

        :param path_to_conll_file: Path to the CoNLL-U formatted file
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        """
        if type(path_to_conll_file) is str:
            path_to_conll_file = Path(path_to_conll_file)
        assert path_to_conll_file.exists()

        self.in_memory = in_memory
        self.path_to_conll_file = path_to_conll_file
        self.total_sentence_count: int = 0

        if self.in_memory:
            self.sentences: List[Sentence] = []
        else:
            self.indices: List[int] = []

        with open(str(self.path_to_conll_file), encoding="utf-8") as file:

            line = file.readline()
            position = 0
            sentence: Sentence = Sentence()
            while line:

                line = line.strip()
                fields: List[str] = re.split("\t+", line)
                if line == "":
                    if len(sentence) > 0:
                        self.total_sentence_count += 1
                        if self.in_memory:
                            self.sentences.append(sentence)
                        else:
                            self.indices.append(position)
                            position = file.tell()
                    sentence: Sentence = Sentence()

                elif line.startswith("#"):
                    line = file.readline()
                    continue
                elif "." in fields[0]:
                    line = file.readline()
                    continue
                elif "-" in fields[0]:
                    line = file.readline()
                    continue
                else:
                    token = Token(fields[1], head_id=int(fields[6]))
                    token.add_label("lemma", str(fields[2]))
                    token.add_label("upos", str(fields[3]))
                    token.add_label("pos", str(fields[4]))
                    token.add_label("dependency", str(fields[7]))

                    if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                        token.whitespace_after = False

                    for morph in str(fields[5]).split("|"):
                        if "=" not in morph:
                            continue
                        token.add_label(
                            morph.split("=")[0].lower(),
                            morph.split("=")[1])

                    if len(fields) > 10 and str(fields[10]) == "Y":
                        token.add_label("frame", str(fields[11]))

                    sentence.add_token(token)

                line = file.readline()
            if len(sentence.tokens) > 0:
                self.total_sentence_count += 1
                if self.in_memory:
                    self.sentences.append(sentence)
                else:
                    self.indices.append(position)
Example no. 7
def ebm_nlp_processing(data, context_embedding, sentence,
                       label_representations):
    domain_label_count = {}
    pain_mortality_domain = 'PAIN_MORT'
    for i in data:
        if i != '\n':
            i = i.split()
            #sentence += ' '+i[0]
            sentence.append((i[0], i[1]))
        elif i == '\n':
            # join words making up a sentence, the list of tags in each sentence and obtain the context vectors for the sentence words
            sent_unpacked = ' '.join(i[0] for i in sentence)
            tag_unpacked = [i[1] for i in sentence]
            sent = Sentence(sent_unpacked.strip())
            context_embedding.embed(sent)
            v = ''
            print('+++++++', len(sent), len(sentence))
            print(sent_unpacked)
            print(tag_unpacked)
            d = k = 0
            #process each word in a sentence, looking for those that form outcome phrases and obtain a vector representation for entire outcome phrase,
            for i in range(len(sent)):
                if i == d:
                    if tag_unpacked[i].startswith('B-'):
                        b = sent[i].embedding
                        b = b.reshape(1, len(b))
                        out_domain = tag_unpacked[i][2:].strip()

                        for j in range(i + 1, len(sent)):
                            if tag_unpacked[j].startswith('I-'):
                                inner_b = sent[j].embedding
                                inner_b = inner_b.reshape(1, len(inner_b))
                                b = torch.cat((b, inner_b), dim=0)
                                d = j
                            else:
                                break
                        b_mean = torch.mean(b, 0) if len(
                            b.shape
                        ) == 2 else b  #extract the centroid for word vectors of an outcome phrase
                        b_mean = b_mean.reshape(1, len(b_mean))
                        if out_domain not in label_representations:
                            label_representations[out_domain] = b_mean
                            domain_label_count[out_domain] = 1
                        elif out_domain in label_representations:
                            label_representations[out_domain] = torch.cat(
                                (label_representations[out_domain], b_mean),
                                dim=0)
                            domain_label_count[out_domain] += 1

                        #combine pain and mortality outcomes
                        if out_domain.lower() in ['pain', 'mortality']:
                            if pain_mortality_domain not in label_representations:
                                label_representations[
                                    pain_mortality_domain] = b_mean
                            else:
                                label_representations[
                                    pain_mortality_domain] = torch.cat(
                                        (label_representations[out_domain],
                                         b_mean),
                                        dim=0)
                    else:
                        pass
                    d += 1
            sentence.clear()
    return label_representations, domain_label_count
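A possible way to drive ebm_nlp_processing, assuming a whitespace-separated "token tag" file with blank lines between sentences and a Flair contextual embedding; the file path and embedding choice below are only placeholders:

# Hypothetical driver for ebm_nlp_processing(); the file name and the
# embedding model are assumptions. The expected input is one "token tag"
# pair per line, with a blank line ('\n') separating sentences.
from flair.embeddings import TransformerWordEmbeddings

with open('ebm_nlp_train.txt', encoding='utf-8') as fh:
    lines = fh.readlines()

context_embedding = TransformerWordEmbeddings('bert-base-uncased')
label_reps, domain_counts = ebm_nlp_processing(lines, context_embedding,
                                               sentence=[],
                                               label_representations={})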
Example no. 8
 def __create_models(self):
     models = []
     models_fit = []
     #for _params in self.model_params:
     _params = {}
     for k, v in self.params.items():
         if k.startswith('_'):
             continue
         _params[k] = v
     self.textModels = dict(
         mtc=TextModel(_params).fit(self.train),
         #charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
         #charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings(self.lang)]),
         ##charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings('multi')]),
         langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
         charLangMultiEmb=DocumentPoolEmbeddings([
             CharacterEmbeddings(),
             BytePairEmbeddings(self.lang),
             BytePairEmbeddings('multi')
         ]),
         langMultiEmb=DocumentPoolEmbeddings(
             [BytePairEmbeddings(self.lang),
              BytePairEmbeddings('multi')]),
         bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
         #flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
         #flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
         #bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
     )
     for km, tmodel in self.textModels.items():
         models.append({'name': km})
         models_fit.append({'name': km})
         if km == 'mtc':
             xt = tmodel.transform(self.train)
             xv = tmodel.transform(self.validation)
             X = tmodel.transform(self.data)
         else:
             sentences_train = [Sentence(txt) for txt in self.train]
             tmodel.embed(sentences_train)
             xt = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_train
             ])
             sentences_val = [Sentence(txt) for txt in self.validation]
             tmodel.embed(sentences_val)
             xv = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_val
             ])
             sentences = [Sentence(txt) for txt in self.data]
             tmodel.embed(sentences)
             X = np.array([
                 e.get_embedding().cpu().detach().numpy() for e in sentences
             ])
         models[-1]['xv'] = xv
         models[-1]['xt'] = xt
         models_fit[-1]['xt'] = X
         #max_iter=5000
         #if km=='mtc': max_iter=1000
         #if km=='langMulti': max_iter=5000
         #self.models[-1]['clf']=LinearSVC(max_iter=max_iter).fit(xt,self.yt)
         #yp=self.models[-1]['clf'].decision_function(xv)
         #scaler=Normalizer().fit(yp)
         #self.models[-1]['macroF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['weightedF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['score']=f1_score(self.yv,np.argmax(yp,axis=1),average='weighted')
         #self.models[-1]['probas']=scaler.transform(yp)
         ### Fit model with all available data
         #self.models_fit[-1]['clf']=LinearSVC(max_iter=max_iter).fit(X,self.y)
     print('Fitting Ensemble')
     #self.models  =  Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
     #self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
     self.models, self.models_fit = [], []
     for md, mdf in zip(models, models_fit):
         self.models.append(self._train_model(md))
         self.models_fit.append(self._train_model(mdf))
Example no. 9
def edu_extract_2(model, d):
    print(d)
    l = []
    exp = {
        'start': '',
        'end': '',
        'school': '',
        'major': '',
        'description': ''
    }
    d = d[1:]
    spans = []
    for block in d:
        for index in range(len(block['lines'])):
            block['lines'][index] = combine_text_in_line(block['lines'][index])
        block = combine_text_in_block(block)

    Edu = d

    if len(Edu) == 0:
        return [exp]
    for block in d:
        for line in block['lines']:
            for span in line['spans']:
                spans.append(span)
    first_flags = 0
    first_size = 0
    first_type = ''
    first_color = ''
    for span in spans:
        sentence = Sentence(span['text'])
        model.predict(sentence)
        result = sentence.to_dict(tag_type='ner')
        if len(result['entities']) > 0:
            if first_size == 0 and first_flags == 0:
                first_size = span['size']
                first_flags = span['flags']
                first_type = result['entities'][0]['type']
                first_color = span['color']
            else:
                break
    for span in spans:
        print(span['text'])
        sentence = Sentence(span['text'])
        model.predict(sentence)
        result = sentence.to_dict(tag_type='ner')
        if len(result['entities']) > 0:
            if result['entities'][0]['type'] == first_type and span[
                    'color'] == first_color:
                l.append(exp)
                exp = {
                    'start': '',
                    'end': '',
                    'school': '',
                    'major': '',
                    'description': ''
                }
            for entiti in result['entities']:
                exp = complete_entiti(exp, entiti['type'], entiti['text'])
            exp['description'] += ' ' + span['text'][
                result['entities'][-1]['end_pos']:]

        else:
            exp['description'] += ' ' + span['text']
    l.append(exp)
    return l[1:]
Example no. 10
# Run run_process_vec.py to add commas to feature vectors

trainingResultFile = fasttext(trainingFile)
trainFeatures = np.genfromtxt(trainingResultFile)
trainFeatures = trainFeatures.tolist()
trainLabels = np.genfromtxt(trainLabelsFile)

trainFeatures_withsentiment = np.genfromtxt(trainingResultFile)
trainFeatures_withsentiment = trainFeatures_withsentiment.tolist()

train_file = open(trainingFile, 'r')
trainTweetList = train_file.readlines()

i = 0
for tweet in trainTweetList:
    sentence = Sentence(tweet)
    classifier.predict(sentence)
    if (sentence.labels[0].value == 'POSITIVE'):
        score = sentence.labels[0].score
    else:
        score = 0 - sentence.labels[0].score
    trainFeatures_withsentiment[i].append(score)
    i += 1

validationResultFile = fasttext(validationFile)
validationFeatures = np.genfromtxt(validationResultFile)
validationFeatures = validationFeatures.tolist()
validationLabels = np.genfromtxt(validationLabelsFile)

validationFeatures_withsentiment = np.genfromtxt(validationResultFile)
validationFeatures_withsentiment = validationFeatures_withsentiment.tolist()
    sentences, labels = get_word_sentences(word, dataset)
    km = KMeans(n_clusters=2, n_jobs=-1)

    tok_vecs = []
    word_obj_list = []

    except_counter = 0
    token_length_exceed_counter = 0

    print("Getting embeddings..")
    for sentence_ind, sent in enumerate(sentences):
        if sentence_ind % 1000 == 0:
            print("Finished sentences: " + str(sentence_ind) + " out of " +
                  str(len(sentences)))
        sentence = Sentence(sent)
        if len(sentence.tokens) > 200:
            token_length_exceed_counter += 1
            print("Token length exceeded for : " + str(sentence_ind) +
                  " Token exceed counter: " + str(token_length_exceed_counter))
            continue
        try:
            embedding.embed(sentence)
        except Exception as e:
            except_counter += 1
            print("Exception Counter: ", except_counter, sentence_ind, e)
            continue
        for token_ind, token in enumerate(sentence):
            if token.text != word:
                continue
            vec = token.embedding.cpu().numpy()
Example no. 12
def recognize(
    text: str,
    class_model: Optional[TextClassifier] = None,
    ner_models: Dict[str, SequenceTagger] = {},
    intent_name: Optional[str] = None,
    intent_to_slots: Dict[str, Dict[str, fst.Fst]] = {},
) -> Dict[str, Any]:
    intent = empty_intent()
    intent["text"] = text

    start_time = time.time()
    sentence = Sentence(text)

    if class_model is not None:
        class_model.predict(sentence)
        assert len(sentence.labels) > 0, "No intent predicted"

        label = sentence.labels[0]
        intent_id = label.value
        intent["intent"]["confidence"] = label.score
    elif len(ner_models) > 0:
        # Assume first intent
        intent_id = intent_name or next(iter(ner_models.keys()))
        intent["intent"]["confidence"] = 1
    else:
        return intent  # empty

    intent["intent"]["name"] = intent_id

    if intent_id in ner_models:
        slot_fsts = intent_to_slots.get(intent_id, {})

        # Predict entities
        ner_models[intent_id].predict(sentence)
        ner_dict = sentence.to_dict(tag_type="ner")
        for named_entity in ner_dict["entities"]:
            slot_name = named_entity["type"]
            slot_value = named_entity["text"]

            # Check for FST to transform
            slot_fst = slot_fsts.get(slot_name)
            if slot_fst is not None:
                try:
                    # Transform with FST
                    logger.debug(
                        f'Transforming "{slot_value}" for slot "{slot_name}" with FST'
                    )
                    slot_value = fstaccept(slot_fst, slot_value)[0]["text"]
                except:
                    logger.exception(slot_name)

            intent["entities"].append({
                "entity": slot_name,
                "value": slot_value,
                "raw_value": named_entity["text"],
                "start": named_entity["start_pos"],
                "end": named_entity["end_pos"],
                "confidence": named_entity["confidence"],
            })

    # Add slots
    intent["slots"] = {}
    for ev in intent["entities"]:
        intent["slots"][ev["entity"]] = ev["value"]

    # Record recognition time
    intent["recognize_seconds"] = time.time() - start_time

    return intent
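recognize() relies on an empty_intent() helper that is not shown here. A minimal sketch, inferred only from the keys the function reads and writes (the exact shape of the real helper is an assumption):

# Hypothetical empty_intent(); fields are inferred from how recognize()
# uses the returned dict, the real helper may contain more keys.
def empty_intent() -> Dict[str, Any]:
    return {
        "text": "",
        "intent": {"name": "", "confidence": 0.0},
        "entities": [],
        "slots": {},
        "recognize_seconds": 0.0,
    }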
Example no. 13
    def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
        sentence: Sentence = Sentence()

        # Build the sentence tokens and add the annotations.
        for conllu_token in token_list:
            token = Token(conllu_token["form"])

            for field in self.token_annotation_fields:
                field_value: Any = conllu_token[field]
                if isinstance(field_value, dict):
                    # For fields that contain key-value annotations,
                    # we add the key as label type-name and the value as the label value.
                    for key, value in field_value.items():
                        token.add_label(typename=key, value=str(value))
                else:
                    token.add_label(typename=field, value=str(field_value))

            if conllu_token.get("misc") is not None:
                space_after: Optional[str] = conllu_token["misc"].get(
                    "SpaceAfter")
                if space_after == "No":
                    token.whitespace_after = False

            sentence.add_token(token)

        if "sentence_id" in token_list.metadata:
            sentence.add_label("sentence_id",
                               token_list.metadata["sentence_id"])

        if "relations" in token_list.metadata:
            for (
                    head_start,
                    head_end,
                    tail_start,
                    tail_end,
                    label,
            ) in token_list.metadata["relations"]:
                # head and tail span indices are 1-indexed and end index is inclusive
                head = Span(sentence.tokens[head_start - 1:head_end])
                tail = Span(sentence.tokens[tail_start - 1:tail_end])

                sentence.add_complex_label(
                    "relation", RelationLabel(value=label,
                                              head=head,
                                              tail=tail))

        # determine all NER label types in sentence and add all NER spans as sentence-level labels
        ner_label_types = []
        for token in sentence.tokens:
            for annotation in token.annotation_layers.keys():
                if annotation.startswith(
                        "ner") and annotation not in ner_label_types:
                    ner_label_types.append(annotation)

        for label_type in ner_label_types:
            spans = sentence.get_spans(label_type)
            for span in spans:
                sentence.add_complex_label(
                    "entity",
                    label=SpanLabel(span=span,
                                    value=span.tag,
                                    score=span.score),
                )

        return sentence
    # share parameters
    if len(otaggers) > 1:
        first_tagger = otaggers[0]
        tagger.embedding2nn = first_tagger.embedding2nn
        tagger.rnn = first_tagger.rnn
        if tagger.train_initial_hidden_state:
            tagger.hs_initializer = first_tagger.hs_initializer
            tagger.lstm_init_h = first_tagger.lstm_init_h
            tagger.lstm_init_c = first_tagger.lstm_init_c

taggers += otaggers

sentences = [
    Sentence(
        "Zellecken der Lamina deutlich verdickt, die Zellwände getüpfelt."
    ),
    Sentence(
        "Die Spitze der Parichätialblätter entwickelt aus zahlreichen Zellen braune Rhizoiden."
    ),
    Sentence(
        "Blattränder überall bis zum nächsten Blatte herablaufend."
    ),
    Sentence(
        "Gefunden am Rande des Hammersees "
    ),
    Sentence(
        "Die Blumen sind von vollendeter Form mit elegant gewellten und nach Art der Petunien und chinesischen Primeln gefranzten Petalen, oft zur Füllung neigend und von edler, meist aufrechter Faltung ."
    ),
    Sentence(
        "Obwohl fast 70 Jahre alt, als er sich der mühevollen Aufgabe unterzog, diese schwierige Pilzgruppe systematisch zu beschreiben, widmete er dem Werke mit dem Eifer und der Schaffenskraft eines Jugendlichen seine letzten Lebensjahre fast ausschliefslich."
Example no. 15
tag_dictionary = corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("zeroshot-moviecomplex-synonyms-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)
result, eval_loss = tagger.evaluate(corpus.test)
print(result.main_score)
print(result.log_header)
print(result.log_line)
print(result.detailed_results)
print(eval_loss)

# evaluation

sentences = [
    Sentence(
        "The Parlament of the United Kingdom is discussing a variety of topics."
    ),
    Sentence(
        "A man fell in love with a woman. This takes place in the last century. The film received the Golden Love Film Award."
    ),
    Sentence("The Company of Coca Cola was invented in 1901."),
    Sentence("This is very frustrating! I was smiling since I saw you."),
    Sentence("The Green Party received only a small percentage of the vote."),
    Sentence(
        "Bayern Munich won the german soccer series the sixth time in a row.")
]

tags = [[
    "O", "B-Institution", "I-Institution", "B-Place", "I-Place", "B-Diverse",
    "I-Diverse"
], ["O", "B-Story", "I-Story", "B-Price", "I-Price", "B-Time", "I-Time"],
Example no. 16
def bert_doc_embed(path):

    f = open(path, 'r')
    f1 = f.readlines()
    f.close()

    #number of sentences in text file
    l = len(f1)
    print('number of sentences in text file: ' + str(l))

    diff = l % 3
    quo = int((l - diff) / 3)

    #number of tokens
    token_count = 0

    if quo == 0:
        f2 = f1[0]
        for k in range(diff - 1):
            f2 = f2 + f1[k + 1]

        #create sentence
        sentence = Sentence(f2)

        size = len(sentence)
        if size < MAX:
            token_count = size

            #embed words in sentence
            embedding.embed(sentence)

            A = sentence[0].embedding
            for j in range(size - 1):
                A = A + sentence[j + 1].embedding
        else:
            sentence = Sentence(f1[0])
            size = len(sentence)
            if size > MAX:
                print('bad sentences')
                return torch.zeros(3072)
            token_count = token_count + size

            #embed words in sentence
            embedding.embed(sentence)

            A = sentence[0].embedding
            for j in range(size - 1):
                A = A + sentence[j + 1].embedding

        A = A / token_count

        print('embed success1')
        return A

    else:

        #create a sentence
        sentence = Sentence(f1[0] + f1[1] + f1[2])

        size = len(sentence)
        if size < MAX:
            token_count = token_count + size

            #embed words in sentence
            embedding.embed(sentence)

            A = sentence[0].embedding
            for j in range(size - 1):
                A = A + sentence[j + 1].embedding
        else:
            sentence = Sentence(f1[0])
            size = len(sentence)
            if size > MAX:
                print('bad sentences')
                return torch.zeros(3072)
            token_count = token_count + size

            #embed words in sentence
            embedding.embed(sentence)

            A = sentence[0].embedding
            for j in range(size - 1):
                A = A + sentence[j + 1].embedding

            sentence = Sentence(f1[1])
            size = len(sentence)
            if size > MAX:
                print('bad sentences')
                return torch.zeros(3072)

            token_count = token_count + size

            #embed words in sentence
            embedding.embed(sentence)

            for j in range(size):
                A = A + sentence[j].embedding

            sentence = Sentence(f1[2])
            size = len(sentence)
            if size > MAX:
                print('bad sentences')
                return torch.zeros(3072)

            token_count = token_count + size

            #embed words in sentence
            embedding.embed(sentence)

            for j in range(size):
                A = A + sentence[j].embedding

        for i in range(quo - 1):

            #create a sentence
            sentence = Sentence(f1[3 * (i + 1)] + f1[3 * (i + 1) + 1] +
                                f1[3 * (i + 1) + 2])

            size = len(sentence)
            if size < MAX:
                token_count = token_count + size

                #embed words in sentence
                embedding.embed(sentence)

                for j in range(size):
                    A = A + sentence[j].embedding

            else:
                sentence = Sentence(f1[3 * (i + 1)])
                size = len(sentence)
                if size > MAX:
                    print('bad sentences')
                    return torch.zeros(3072)

                token_count = token_count + size

                #embed words in sentence
                embedding.embed(sentence)

                for j in range(size):
                    A = A + sentence[j].embedding

                sentence = Sentence(f1[3 * (i + 1) + 1])
                size = len(sentence)
                if size > MAX:
                    print('bad sentences')
                    return torch.zeros(3072)

                token_count = token_count + size

                #embed words in sentence
                embedding.embed(sentence)

                for j in range(size):
                    A = A + sentence[j].embedding

                sentence = Sentence(f1[3 * (i + 1) + 2])
                size = len(sentence)
                if size > MAX:
                    print('bad sentences')
                    return torch.zeros(3072)

                token_count = token_count + size

                #embed words in sentence
                embedding.embed(sentence)

                for j in range(size):
                    A = A + sentence[j].embedding

        if diff != 0:
            f2 = f1[quo * 3]
            for i in range(diff - 1):
                f2 = f2 + f1[3 * quo + i + 1]

            #create sentence
            sentence = Sentence(f2)

            size = len(sentence)
            if size < MAX:

                token_count = token_count + size

                #embed words in sentence
                embedding.embed(sentence)

                for j in range(size):
                    A = A + sentence[j].embedding

        A = A / token_count
        print('embed success2')
        return A
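bert_doc_embed() depends on two module-level names, MAX and embedding, that are not included in this snippet. The torch.zeros(3072) fallback suggests an embedding that concatenates four 768-dimensional BERT layers, so the values below are only an assumed sketch:

# Assumed globals for bert_doc_embed(); the names come from the function body,
# the concrete choices are guesses. Flair's legacy BertEmbeddings concatenates
# the last four layers by default, giving 4 * 768 = 3072 dimensions, which
# matches the zero-vector fallback above.
from flair.data import Sentence
from flair.embeddings import BertEmbeddings

MAX = 512                                    # token limit per Sentence
embedding = BertEmbeddings('bert-base-uncased')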
Example no. 17
def embed(text, embedder):
    sentence = Sentence(text)
    embedder.embed(sentence)
    return sentence.get_embedding().detach().numpy()
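A possible way to call this helper, here with a pooled GloVe word embedding (the choice of embedder is arbitrary; any Flair document embedding that populates Sentence.get_embedding() will work):

# Example usage of embed(); the embedder choice is an assumption.
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

embedder = DocumentPoolEmbeddings([WordEmbeddings('glove')])
vector = embed('Berlin is a nice city .', embedder)
print(vector.shape)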
Example no. 18
    def contextualize(df, cluster_dump_dir):
        def get_cluster(tok_vec, cc):
            max_sim = -10
            max_sim_id = -1
            for i, cluster_center in enumerate(cc):
                sim = cosine_similarity(tok_vec, cluster_center)
                if sim > max_sim:
                    max_sim = sim
                    max_sim_id = i
            return max_sim_id

        print("Contextualizing the corpus..")
        embedding = BertEmbeddings('bert-base-uncased')
        stop_words = set(stopwords.words('english'))
        stop_words.add('would')
        except_counter = 0
        word_cluster = {}

        #this tokenizer is used to check for length > 512
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        for index, row in df.iterrows():
            if index % 100 == 0:
                print("Finished rows: " + str(index) + " out of " +
                      str(len(df)))
            line = row["sentence"]
            sentences = sent_tokenize(line)
            for sentence_ind, sent in enumerate(sentences):
                tokenized_text = tokenizer.tokenize(sent)
                if len(tokenized_text) > 512:
                    print('sentence too long for Bert: truncating')
                    # truncate to the first 512 whitespace-separated tokens
                    # rather than the first 512 characters
                    sentence = Sentence(' '.join(sent.split()[:512]),
                                        use_tokenizer=True)
                else:
                    sentence = Sentence(sent, use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                except:
                    print(index)
                    print(sentence)
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    if word in stop_words:
                        continue
                    word_clean = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if len(
                            word_clean
                    ) == 0 or word_clean in stop_words or "/" in word_clean:
                        continue
                    try:
                        cc = word_cluster[word_clean]
                    except:
                        try:
                            cc = word_cluster[word]
                        except:
                            word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                            word_path = cluster_dump_dir + word + "/cc.pkl"
                            try:
                                with open(word_clean_path, "rb") as handler:
                                    cc = pickle.load(handler)
                                word_cluster[word_clean] = cc
                            except:
                                try:
                                    with open(word_path, "rb") as handler:
                                        cc = pickle.load(handler)
                                    word_cluster[word] = cc
                                except Exception as e:
                                    except_counter += 1
                                    print(
                                        "Exception Counter while getting clusters: ",
                                        except_counter, index, e)
                                    continue

                    if len(cc) > 1:
                        tok_vec = token.embedding.cpu().numpy()
                        cluster = get_cluster(tok_vec, cc)
                        sentence.tokens[token_ind].text = word + "$" + str(
                            cluster)
                sentences[sentence_ind] = to_tokenized_string(sentence)
            df["sentence"][index] = " . ".join(sentences)
        return df, word_cluster
Example no. 19
def ebm_comet_preprocessing(data, context_embedding, sentence,
                            label_representations, file):
    for i in data:
        if not i.__contains__('docx'):
            if i != '\n':
                if i.startswith("[['P") or i.startswith(
                        "[['E") or i.startswith("[['S") or re.search(
                            '\[\]', i):
                    multi_labels = i
                else:
                    i = i.split()
                    sentence.append((i[0], i[1]))
            elif i == '\n':
                if sentence:
                    sent_unpacked = ' '.join(i[0] for i in sentence)
                    tag_unpacked = [i[1] for i in sentence]
                    sent = Sentence(sent_unpacked.strip())
                    context_embedding.embed(sent)
                    v = ''
                    print('\n+++++++')
                    print(sent_unpacked)
                    print(tag_unpacked)
                    print(multi_labels, type(multi_labels))
                    multi_labels = ast.literal_eval(multi_labels)

                    d = k = ann = 0
                    for i in range(len(sent)):
                        if i == d:
                            if tag_unpacked[i].startswith('B-'):
                                b = sent[i].embedding
                                b = b.reshape(1, len(b))
                                z = sent[i].text
                                file.write('{} {}\n'.format(
                                    sent[i].text, tag_unpacked[i]))
                                out_domain = multi_labels[ann]

                                if out_domain[0][0] not in ['E', 'S']:
                                    for j in range(i + 1, len(sent)):
                                        if tag_unpacked[j].startswith('I-'):
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            inner_b = sent[j].embedding
                                            inner_b = inner_b.reshape(
                                                1, len(inner_b))
                                            b = torch.cat((b, inner_b), dim=0)
                                            d = j
                                        else:
                                            break
                                # print('---------',b.shape)
                                    b_mean = torch.mean(b, 0) if len(
                                        b.shape) == 2 else b
                                    b_mean = b_mean.reshape(1, len(b_mean))

                                    for dom in out_domain:
                                        if dom not in label_representations:
                                            label_representations[dom] = b_mean
                                        elif dom in label_representations:
                                            label_representations[
                                                dom] = torch.cat((
                                                    label_representations[dom],
                                                    b_mean),
                                                                 dim=0)

                                else:
                                    e_s_features = []
                                    x_indices = []
                                    x_indices.append(
                                        (z, re_shape(sent[i].embedding)))
                                    for j in range(i + 1, len(sent)):
                                        inner_b = sent[j].embedding
                                        inner_b = inner_b.reshape(
                                            1, len(inner_b))
                                        if re.search(
                                                r'E\d',
                                                tag_unpacked[j]) or re.search(
                                                    r'S\d', tag_unpacked[j]):
                                            b = torch.cat((b, inner_b), dim=0)
                                            z += ' ' + sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            e_s_features.append(
                                                (z, tag_unpacked[j], b))
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))
                                            d = j
                                            break
                                        elif re.search(
                                                'B', tag_unpacked[j]
                                        ) or ('Seperator' == tag_unpacked[j]
                                              and out_domain[0][0] == 'S'):
                                            e_s_features.append(
                                                (z, tag_unpacked[j], b))
                                            z = '' if out_domain[0][
                                                0] == 'S' else sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            b = sent[j].embedding
                                            b = b.reshape(1, len(b))
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))
                                        else:
                                            z += ' ' + sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            b = torch.cat((b, inner_b), dim=0)
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))

                                    x = int(out_domain[0][-1])
                                    print([
                                        (i[0], i[1].shape) for i in x_indices
                                    ], '+++++++++++++++++#####################+++++++++',
                                          x, [(g[0], g[1], g[2].shape)
                                              for g in e_s_features])

                                    y_indices = []
                                    if re.search(r'E\d', out_domain[0]):
                                        for m in range(len(e_s_features)):
                                            if m < (len(e_s_features) - 1):
                                                _m_ = e_s_features[m][2]
                                                for t in range(x):
                                                    #print('Ennnnnnnnnnnnd',e_s_features[m][0])
                                                    _m_ = torch.cat(
                                                        (_m_,
                                                         x_indices[-(t + 1)][1]
                                                         ),
                                                        dim=0)
                                                y_indices.append(_m_)
                                        y_indices.append(e_s_features[-1][2])
                                    elif re.search(r'S\d', out_domain[0]):
                                        for m in range(len(e_s_features)):
                                            if m > 0:
                                                _m_ = e_s_features[m][2]
                                                for t in range(x):
                                                    _m_ = torch.cat(
                                                        (x_indices[t][1], _m_))
                                                    #print('Staaaaaaaaaaaaaart',e_s_features[m][0])
                                                y_indices.append(_m_)
                                        y_indices.insert(0, e_s_features[0][2])

                                    b_mean = []
                                    for d_ in y_indices:
                                        d_ = torch.mean(
                                            d_, 0) if len(d_.shape) > 1 else d_
                                        b_mean.append(d_.reshape(1, len(d_)))
                                    for b_, dom in zip(b_mean, out_domain[1:]):
                                        if dom not in label_representations:
                                            label_representations[dom] = b_
                                        elif dom in label_representations:
                                            label_representations[
                                                dom] = torch.cat((
                                                    label_representations[dom],
                                                    b_),
                                                                 dim=0)
                                ann += 1

                            else:
                                file.write('{} {}\n'.format(sent[i].text, 'O'))
                                pass
                            d += 1
                    file.write('\n')
                sentence.clear()
    return label_representations
def interpret_sentence(flair_model_wrapper,
                       lig,
                       sentence,
                       target_label,
                       visualization_list,
                       n_steps=100,
                       estimation_method="gausslegendre",
                       internal_batch_size=None):
    """
    We can visualise the attributions made by making use of Pytorch Captum.
    Inputs:
    flair_model_wrapper: class containing a customized forward function of Flair model.
    lig: the layer integrated gradient object.
    sentence: the Flair sentence-object we want to interpret.
    target_label: the ground truth class-label of the sentence.
    visualization_list: a list to store the visualization records in.
    """

    # Return the target index from the label dictionary.
    target_index = flair_model_wrapper.label_dictionary.get_idx_for_item(
        target_label)

    # In order to maintain consistency with Flair, we apply the same
    # tokenization steps.
    flair_sentence = Sentence(sentence)

    tokenized_sentence = flair_sentence.to_tokenized_string()

    # This calculates the token input IDs tensor for the model.
    input_ids = flair_model_wrapper.tokenizer.encode(
        tokenized_sentence,
        add_special_tokens=False,
        max_length=flair_model_wrapper.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt")

    # Create a baseline by creating a tensor of equal length
    # containing the padding token tensor id.
    ref_base_line = torch.ones_like(input_ids)

    # Convert back to tokens as the model requires.
    # As some words might get split up. e.g. Caroll to Carol l.
    all_tokens = flair_model_wrapper.tokenizer.convert_ids_to_tokens(
        input_ids[0])

    # The tokenizer in the model adds a special character
    # in front of every sentence.
    readable_tokens = [token.replace("▁", "") for token in all_tokens]

    # The input IDs are passed to the embedding layer of the model.
    # It is better to return the logits for Captum.
    # https://github.com/pytorch/captum/issues/355#issuecomment-619610044
    # Thus we calculate the softmax afterwards.
    # For now, I take the first dimension and run this sentence, per sentence.
    model_outputs = flair_model_wrapper(input_ids)

    softmax = torch.nn.functional.softmax(model_outputs[0], dim=0)

    # Return the confidence and the class ID of the top predicted class.
    conf, idx = torch.max(softmax, 0)

    #conf, idx = torch.max(model_outputs[0], 0)

    # Returns the probability.
    prediction_confidence = conf.item()

    # Returns the label name from the top prediction class.
    pred_label = flair_model_wrapper.label_dictionary.get_item_for_index(
        idx.item())

    # Calculate the attributions according to the LayerIntegratedGradients method.
    attributions_ig, delta = lig.attribute(
        input_ids,
        baselines=ref_base_line,
        n_steps=n_steps,
        return_convergence_delta=True,
        target=target_index,
        method=estimation_method,
        internal_batch_size=internal_batch_size)

    convergence_delta = abs(delta)
    print('pred: ', idx.item(), '(', '%.2f' % conf.item(), ')', ', delta: ',
          convergence_delta)

    word_attributions, attribution_score = summarize_attributions(
        attributions_ig)

    visualization_list.append(
        viz.VisualizationDataRecord(word_attributions=word_attributions,
                                    pred_prob=prediction_confidence,
                                    pred_class=pred_label,
                                    true_class=target_label,
                                    attr_class=target_label,
                                    attr_score=attribution_score,
                                    raw_input=readable_tokens,
                                    convergence_score=delta))

    # Return these for the sanity checks.
    return readable_tokens, word_attributions, convergence_delta
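interpret_sentence() calls a summarize_attributions() helper that is not part of this snippet. A sketch following the usual Captum recipe, summing attributions over the embedding dimension and normalizing, is assumed below; the original implementation may differ:

# Hypothetical summarize_attributions(), modeled on the common Captum recipe.
def summarize_attributions(attributions):
    # Collapse the embedding dimension to one attribution score per token.
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    # Return per-token scores plus an overall score for the sentence.
    return attributions, attributions.sum()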
Example no. 21
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)
print(sentence)

print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)
Example no. 22
def generate_embeddings(docs,
                        batch_size,
                        model_name='bert-base-cased',
                        pooling='mean',
                        offset=0):
    """
    Generator function for generating embeddings from strings using a Flair model. Takes a list of strings and
    yields tuples. The first element represents failure (0) or success (1), and the second element contains a
    list of embeddings as numpy arrays if successful, or the index range of the failed batch if unsuccessful.
    :param docs: a list of strings for which embeddings should be created
    :param batch_size: integer representing how many embeddings should be created at once
    :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base
    :param pooling: the pooling strategy to generate Document Embeddings
    :param offset: the offset of the integers, for printing out the correct index
    :return: a tuple (success/failure, embeddings/failed_indices)
    """
    rest = len(docs) % batch_size
    model = False
    if pooling == 'mean':
        embedding = TransformerWordEmbeddings(model_name,
                                              layers='-1',
                                              allow_long_sentences=True)
        model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none')
    elif pooling == 'CLS':
        model = TransformerDocumentEmbeddings(model_name)
    if model:
        for i in range(0, len(docs) - rest, batch_size):
            sentences = [
                Sentence(sentence) for sentence in docs[i:i + batch_size]
            ]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            sentences = [Sentence(sentence) for sentence in docs[-rest:]]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                yield 0, (len(docs) + offset - rest, len(docs) + offset - 1)
    elif pooling == 'SentenceBert':
        model = SentenceTransformer(model_name)
        for i in range(0, len(docs) - rest, batch_size):
            try:
                embeddings = model.encode(docs[i:i + batch_size])
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, embeddings
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            try:
                embeddings = model.encode(docs[-rest:])
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, embeddings
            except RuntimeError:
                yield 0, (len(docs) + offset - rest, len(docs) + offset - 1)
    else:
        raise Exception("No valid model: pooling must be 'mean', 'CLS' or 'SentenceBert'")
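# A hypothetical usage sketch of generate_embeddings (the document texts below are
# made up): collect embeddings batch by batch and keep the index ranges of any
# failed batches for a retry pass.
docs = ["First document text.", "Second document text.", "A third, longer document."]
all_embeddings, failed_ranges = [], []
for status, payload in generate_embeddings(docs, batch_size=2, pooling='mean'):
    if status:  # 1 -> payload is a list of numpy arrays
        all_embeddings.extend(payload)
    else:       # 0 -> payload is the (start, end) index range that failed
        failed_ranges.append(payload)
print(len(all_embeddings), failed_ranges)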
Esempio n. 23
0
    def get_indexed_data(self, data, data_type="train"):
        '''
        Index the data.
        If data_type == "test", matrix_spots is not included.
        '''
        indexed_sample_list = []
        max_word_num = self.max_word_num
        max_subword_num = self.max_subword_num
        max_char_num_in_tok = self.max_char_num_in_tok

        max_word_num4flair = max(
            [len(sample["text"].split(" ")) for sample in data]
        )  # The main goal here is to decouple flair from max_word_num; if this causes bugs, revert to max_word_num. From now on, always pass max_word_num when using flair.
        for sample in tqdm(data, desc="Generate indexed data"):
            text = sample["text"]
            indexed_sample = {}
            indexed_sample["sample"] = sample
            if self.subword_tokenizer is not None:
                # codes for bert input
                bert_codes = self.subword_tokenizer.encode_plus(
                    text,
                    return_offsets_mapping=True,
                    add_special_tokens=False,
                    max_length=max_subword_num,
                    truncation=True,
                    pad_to_max_length=True)

                # get bert codes
                subword_input_ids = torch.tensor(
                    bert_codes["input_ids"]).long()
                attention_mask = torch.tensor(
                    bert_codes["attention_mask"]).long()
                token_type_ids = torch.tensor(
                    bert_codes["token_type_ids"]).long()
                subword2char_span = bert_codes["offset_mapping"]

                indexed_sample["subword_input_ids"] = subword_input_ids
                indexed_sample["attention_mask"] = attention_mask
                indexed_sample["token_type_ids"] = token_type_ids
                indexed_sample[
                    "tok2char_span"] = subword2char_span  # token is subword level

            # word level tokenizer
            if self.word_tokenizer is not None:  # use word enc
                if self.subword_tokenizer is not None:  # also use bert
                    indexed_sample[
                        "word_input_ids"] = self.word_tokenizer.text2word_indices(
                            text, max_word_num)

                    # subword2word_idx_map: map subword to corresponding word
                    words = self.word_tokenizer.tokenize(text)
                    subword2word_idx_map = []
                    for wd_idx, wd in enumerate(words):
                        for subwd in self.subword_tokenizer.tokenize(wd):
                            if subwd != "[PAD]":
                                subword2word_idx_map.append(wd_idx)
                    if len(subword2word_idx_map) < max_subword_num:
                        subword2word_idx_map.extend(
                            [len(words) - 1] *
                            (max_subword_num - len(subword2word_idx_map)))
                    subword2word_idx_map = torch.tensor(
                        subword2word_idx_map).long()
                    indexed_sample[
                        "subword2word_idx_map"] = subword2word_idx_map

                else:  # do not use bert, but use word enc
                    word_codes = self.word_tokenizer.encode_plus(
                        text, max_word_num)
                    word2char_span = word_codes["offset_mapping"]
                    indexed_sample[
                        "tok2char_span"] = word2char_span  # token is word level
                    indexed_sample["word_input_ids"] = word_codes["input_ids"]

            if self.text2char_indices_func is not None:  # use char enc
                char_input_ids = self.text2char_indices_func(text)
                char_input_ids_padded = []
                for span in indexed_sample["tok2char_span"]:
                    char_ids = char_input_ids[span[0]:span[1]]

                    if len(char_ids) < max_char_num_in_tok:
                        char_ids.extend([0] *
                                        (max_char_num_in_tok - len(char_ids)))
                    else:
                        char_ids = char_ids[:max_char_num_in_tok]
                    char_input_ids_padded.extend(char_ids)
                char_input_ids_padded = torch.tensor(
                    char_input_ids_padded).long()
                indexed_sample["char_input_ids_padded"] = char_input_ids_padded

            # prepare padded sentences for flair embeddings
            words = text.split(" ")
            words.extend(["[PAD]"] * (max_word_num4flair - len(words)))
            indexed_sample["padded_sent"] = Sentence(" ".join(words))

            # get spots
            if data_type != "test":
                matrix_spots = self.handshaking_tagger.get_spots(sample)
                indexed_sample["matrix_spots"] = matrix_spots

            indexed_sample_list.append(indexed_sample)
        return indexed_sample_list
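# A self-contained illustration of the subword -> word index mapping built above,
# using hypothetical tokenizer output instead of a real BERT tokenizer: every
# subword piece receives the index of the word it was split from.
words = ["playing", "football"]
subword_pieces = [["play", "##ing"], ["foot", "##ball"]]
subword2word_idx_map = []
for wd_idx, pieces in enumerate(subword_pieces):
    for _ in pieces:
        subword2word_idx_map.append(wd_idx)
print(subword2word_idx_map)  # [0, 0, 1, 1]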
Esempio n. 24
0
"""
@author: God
"""

#import commands for flair ner
from flair.data import Sentence
from flair.models import SequenceTagger

#Load NER Model
tagger = SequenceTagger.load('ner')

#Sample text to run NER
text = 'Jackson is placed in Microsoft located in Redmond'

#passing text to sentence
sentence = Sentence(text)

# Run NER on sentence to identify Entities
tagger.predict(sentence)

# print the entities with below command
for entity in sentence.get_spans('ner'):
    print(entity)

print(sentence.to_tagged_string())

#Sample text
text1 = 'Redmond is coming to New York city'

#passing text to sentence
sentence = Sentence(text1)
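# The original snippet stops here; a hedged completion would be to run the same
# tagger on this second sentence as well and print the detected entities.
tagger.predict(sentence)
for entity in sentence.get_spans('ner'):
    print(entity)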
Esempio n. 25
0
    print(
        f"#####\n{s}\n morfeusz={is_valid1} stanza={is_valid2} krnnt={is_valid3}"
    )

#%% pos with flair
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("pos-multi")

#%%

sentence = sentences[0]
print(f"\n>>>{sentence}")
sent = Sentence(sentence)
tagger.predict(sent)
print(f"\n{sent.to_tagged_string()}")
for t in sent.tokens:
    print(f"{t}- {t.get_tag('upos').value} {t.get_tag('upos').score}")

conv_flair_get_pos = lambda x: x.get_tag("upos").value
flair_ud_pos = list(map(conv_flair_get_pos, sent.tokens))
stats_flair_pos = Counter(flair_ud_pos)

print(stats_flair_pos)

# %% sentence taggers

# docker run -p 9003:9003 -it djstrong/krnnt:1.0.0
Esempio n. 26
0
    def _embed_document(self, document_text: str,
                        doc_embeddings: DocumentPoolEmbeddings):
        sentence = Sentence(document_text)
        doc_embeddings.embed(sentence)
        return sentence.get_embedding().data.cpu().numpy()
Esempio n. 27
0
def test_sentence_to_plain_string():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert ('I love Berlin .' == sentence.to_tokenized_string())
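# A hedged companion check (not part of the original snippet): with the tokenizer
# enabled, to_plain_string() should reproduce the untokenized surface form.
from flair.data import Sentence

def test_sentence_to_plain_string_roundtrip():
    sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert sentence.to_tokenized_string() == 'I love Berlin .'
    assert sentence.to_plain_string() == 'I love Berlin.'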
Esempio n. 28
0
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import BertEmbeddings
import nltk

filename = "/home/harsh/Downloads/data/abc_datafiles/01.txt"
file = open(filename, "r")
text = file.read()
text = text.replace('\"', '\\"')  # str.replace returns a new string; assign it back

sent_text = nltk.sent_tokenize(text)
final_text = ""
for sentence in sent_text:
    final_text += sentence

sentence = Sentence(final_text, use_tokenizer=True)

# load the NER tagger
# Part-of-Speech Tagging
tagger = SequenceTagger.load('pos')

# 4-class Named Entity Recognition
# tagger = SequenceTagger.load('ner')

# Semantic Frame Detection (Experimental)
# tagger = SequenceTagger.load('frame')

# Syntactic Chunking
# tagger = SequenceTagger.load('chunk')

# 12-class Named Entity Recognition
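# The snippet stops before using the loaded tagger; a minimal, hypothetical
# continuation would be to run prediction and print the tagged sentence.
tagger.predict(sentence)
print(sentence.to_tagged_string())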
Esempio n. 29
0
def main(args):
    args = parser.parse_args()

    # Loading classifier model:
    print("Loading classifier model")
    classifier = TextClassifier.load_from_file(join(args.model_dir, 'best-model.pt'))

    txt_files = glob.glob(join(args.data_dir, '*.txt'))
    
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    sentence_lookahead = 0

    for txt_fn in txt_files:
        print("Processing %s" % (txt_fn))
        ann_input_fn = join(args.data_dir, basename(txt_fn)[:-3]+'ann')
        ents, _ = read_brat_file(ann_input_fn)

        ann_output_fn = join(args.output_dir, basename(txt_fn)[:-3]+'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()

        ann_out = open(ann_output_fn, 'w')
        
        # Write entities right away:
        for ent_id in ents.keys():
            ent = ents[ent_id]
            ent_text = text[ent.start:ent.end].replace('\n', ' ')
            ann_out.write('%s\t%s %d %d\t%s\n' % (ent_id, ent.cat, ent.start, ent.end, ent_text))

        sent_spans = list(sent_splitter.span_tokenize(text))

        rel_ind = 0
        rel_attempts = 0
        for sent_ind in range(len(sent_spans)):
            primary_sent_span = sent_spans[sent_ind]
            end_window_ind = min(sent_ind+sentence_lookahead, len(sent_spans)-1)
            end_sent_span = sent_spans[end_window_ind]

            sent = text[primary_sent_span[0]:end_sent_span[1]].replace('\n', ' ')
            drug_ents, att_ents = get_span_ents(primary_sent_span, end_sent_span, ents)

            for att_ent in att_ents:
                for drug_ent in drug_ents:
                    ## Get index of ents into sent:
                    a1_start = att_ent.start - primary_sent_span[0]
                    a1_end = att_ent.end - primary_sent_span[0]
                    a1_text = sent[a1_start:a1_end]

                    a2_start = drug_ent.start - primary_sent_span[0]
                    a2_end = drug_ent.end - primary_sent_span[0]
                    a2_text = sent[a2_start:a2_end]

                    if a1_start < a2_start:
                        # arg1 occurs before arg2
                        rel_text = (sent[:a1_start] + 
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:a2_start] +
                                    " DrugStart %s DrugEnd" % (a2_text) +
                                    sent[a2_end:])
                    else:
                        rel_text = (sent[:a2_start] +
                                    " DrugStart %s DrugEnd " % (a2_text) +
                                    sent[a2_end:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:])

                    # if att_ent.cat == 'Dosage':
                        # print("working with Dosage ent")
                    sentence = Sentence(rel_text, use_tokenizer=True)
                    labels = classifier.predict(sentence)[0].labels
                    if len(labels) > 1:
                        print('  This relation has more than one output label')
                    label = labels[0].value
                    # print("Comparing ent %s and ent %s and got %s" % (att_ent.id, drug_ent.id, label))
                    rel_attempts += 1
                    if not label == 'None':
                        # Make sure label corresponds to entity type:
                        if label.find(att_ent.cat) < 0:
                            # print("  Skipping found relation where label %s doesn't match arg type %s" % (label, att_ent.cat))
                            continue
                        ann_out.write('R%d\t%s Arg1:%s Arg2:%s\n' % (rel_ind, label, att_ent.id, drug_ent.id))
                        rel_ind += 1

        # print("Finished: Found %d relations while making %d classification attempts" % (rel_ind, rel_attempts))
        ann_out.close()
Esempio n. 30
0
    def read_column_data(path_to_column_file: Path,
                         column_name_map: Dict[int, str],
                         infer_whitespace_after: bool = True):
        """
        Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the
        column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
        specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
        the chunk and the fourth the NER tag.
        :param path_to_column_file: the path to the column file
        :param column_name_map: a map of column number to token annotation name
        :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        try:
            lines: List[str] = open(
                str(path_to_column_file),
                encoding='utf-8').read().strip().split('\n')
        except UnicodeDecodeError:
            log.info(
                'Cannot read {} as UTF-8 ... using "latin-1" instead.'.format(
                    path_to_column_file))
            lines: List[str] = open(
                str(path_to_column_file),
                encoding='latin1').read().strip().split('\n')

        # most data sets have the token text in the first column, if not, pass 'text' as column
        text_column: int = 0
        for column in column_name_map:
            if column_name_map[column] == 'text':
                text_column = column

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith('#'):
                continue

            if line.strip().replace('\ufeff', '') == '':  # also strip stray BOM characters
                if len(sentence) > 0:
                    sentence.infer_space_after()
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != text_column:
                            token.add_tag(column_name_map[column],
                                          fields[column])

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            sentences.append(sentence)

        return sentences
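# A hypothetical usage sketch (the file path and column layout are assumptions, and
# the method is assumed to be reachable as a static function of its data-fetcher
# class): read a CoNLL-style file where column 0 holds the token text, column 1 the
# PoS tag, column 2 the chunk tag and column 3 the NER tag.
from pathlib import Path

sentences = read_column_data(
    Path('data/conll_03/eng.train'),
    column_name_map={0: 'text', 1: 'pos', 2: 'np', 3: 'ner'})
print(len(sentences))
print(sentences[0].to_tagged_string())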