def predict_entities(self, collection):
    # Assumes `re` is imported at module level.
    next_id = 0
    for instance_keyphrase, label in self.keyphrases.items():
        for sentence in collection.sentences:
            text = sentence.text.lower()
            # Escape the keyphrase so it is matched literally, as whole words.
            pattern = r"\b" + re.escape(instance_keyphrase) + r"\b"
            for match in re.finditer(pattern, text):
                keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                keyphrase.split()
                next_id += 1
                sentence.keyphrases.append(keyphrase)
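# Hedged usage sketch for predict_entities (the `baseline` name below is an
# assumption, not part of the module): `self.keyphrases` is read as a dict
# mapping lowercase keyphrase text to its entity label.
#
#     baseline.keyphrases = {"asma": "Concept", "diagnosticar": "Action"}
#     baseline.predict_entities(collection)
#
# Every dictionary hit in every sentence becomes a new Keyphrase.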
def _test_biluov_task():
    import es_core_news_md
    from scripts.utils import Sentence

    def forward(tokensxsentence, entitiesxsentence):
        labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
        return [
            from_biluov(biluov, sentence, spans=True)
            for biluov, sentence in zip(labelsxsentence, tokensxsentence)
        ]

    training = Collection().load(Path("data/training/scenario.txt"))
    nlp = es_core_news_md.load()

    def per_label(label):
        tokensxsentence = [nlp(s.text) for s in training.sentences]
        entitiesxsentence = [
            [k.spans for k in s.keyphrases if k.label == label]
            for s in training.sentences
        ]
        return forward(tokensxsentence, entitiesxsentence)

    collection = Collection([Sentence(s.text) for s in training.sentences])
    for label in ENTITIES:
        decoded = per_label(label)
        for entities, sentence in zip(decoded, collection.sentences):
            for spans in entities:
                keyphrase = Keyphrase(sentence, label, -1, spans)
                sentence.keyphrases.append(keyphrase)
    collection.fix_ids()

    output = Path("data/submissions/forward-biluov/train/run1/scenario2-taskA/")
    output.mkdir(parents=True, exist_ok=True)
    collection.dump(output / "scenario.txt", skip_empty_sentences=False)
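# Note: _test_biluov_task is a round-trip check rather than a model. It encodes
# the gold entities with to_biluov, immediately decodes them with from_biluov,
# and dumps the result in submission format, which estimates how much of the
# gold annotation the BILUOV tagging scheme can represent at all.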
from typing import List


def decode_bilou(sentence: Sentence, tags, tokens, spans) -> List[Keyphrase]:
    """tags: B-Concept, B-Action, ..."""
    next_id = 0  # unique id
    tokens = [
        {"token": i, "span": j, "label": k}
        for i, j, k in zip(tokens, spans, tags)
        if j != (0, 0)
    ]

    entity_spans = []
    entity_label = None
    prev_state = None
    prev_label = 'O'

    # If wordpiece subtokens are the atomic elements, merge them back into
    # whole words (e.g. 'as' + '##ma' becomes 'asma').
    words = []
    for token in tokens:
        if token['token'].startswith('##'):
            word = words.pop()
            s0 = word['span'][0]
            s1 = token['span'][1]
            words.append({
                "token": word['token'] + token['token'][2:],
                "span": (s0, s1),
                "label": word['label'],
            })
        else:
            words.append(token)

    list_of_keyphrases = []
    for w in words:
        # If the new tag is B, U or O, or the new label differs from the
        # previous one, or the new tag is I or L but the previous one is
        # neither B nor I, then save the previous entity and reset.
        bool_1 = w['label'][:1] in ['B', 'U', 'O']
        bool_2 = w['label'][2:] != prev_label
        bool_3 = (w['label'][:1] in ['I', 'L']) and (prev_state not in ['B', 'I'])
        if bool_1 or bool_2 or bool_3:
            if entity_spans:
                keyphrase = Keyphrase(
                    sentence=sentence,
                    label=entity_label,
                    id=next_id,
                    spans=entity_spans,
                )
                list_of_keyphrases.append(keyphrase)
                next_id += 1
                entity_spans = []
        if w['label'] == 'O':
            entity_label = 'O'
        else:
            entity_spans.append(w['span'])
            entity_label = w['label'][2:]
        prev_state = w['label'][:1]
        prev_label = entity_label

    # Flush the entity that may still be open at the end of the sentence;
    # without this, an entity ending on the last token would be dropped.
    if entity_spans:
        list_of_keyphrases.append(
            Keyphrase(
                sentence=sentence,
                label=entity_label,
                id=next_id,
                spans=entity_spans,
            )
        )
    return list_of_keyphrases
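# Minimal sketch of decode_bilou on a toy tagging; the helper name and the
# assumption that Sentence accepts raw text (as in the snippets above) are
# illustrative, not part of the original module.
def _demo_decode_bilou():
    sent = Sentence("el asma cronica")
    tags = ['O', 'B-Concept', 'L-Concept']
    tokens = ['el', 'asma', 'cronica']
    spans = [(0, 2), (3, 7), (8, 15)]
    # Expected: one Keyphrase labeled 'Concept' with spans [(3, 7), (8, 15)].
    return decode_bilou(sent, tags, tokens, spans)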
def make_sentence(doc, bilouv, labels) -> Sentence:
    sentence = Sentence(doc.text)
    logger.debug(f"[make_sentence]: doc.text={doc.text}")
    logger.debug(f"[make_sentence]: bilouv={bilouv}")
    labels = set(l[2:] for l in labels if l != 'O')
    for label in labels:
        # Project the tag sequence onto this label only: keep the BILOUV
        # prefix for matching tags and collapse everything else to 'O'.
        # Note: `endswith` assumes no label is a suffix of another label.
        specific_bilouv = [
            tag[0] if tag.endswith(label) else 'O' for tag in bilouv
        ]
        logger.debug(
            f"[make_sentence]: label={label} specific_bilouv={specific_bilouv}"
        )
        spans = from_biluov(specific_bilouv, doc, spans=True)
        sentence.keyphrases.extend(
            Keyphrase(sentence, label, i, sp) for i, sp in enumerate(spans)
        )
    return sentence
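# Hedged sketch of make_sentence's inputs, assuming the Spanish spaCy pipeline
# used elsewhere in this project:
#
#     doc = nlp("el asma cronica")
#     bilouv = ['O', 'B-Concept', 'L-Concept']
#     sentence = make_sentence(doc, bilouv, labels=bilouv)
#
# Passing the tag sequence as `labels` works because only the set of distinct
# label suffixes is extracted from it.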
@classmethod
def load_keyphrases(cls, collection: Collection, finput: Path):
    # Assumes `bisect` is imported at module level.
    cls.load_input(collection, finput)

    input_a_file = finput.parent / ("output_a_" + finput.name.split("_")[1])

    # Cumulative end offset of each sentence within the full document,
    # counting the separator between consecutive sentences.
    sentences_length = [len(s.text) for s in collection.sentences]
    for i in range(1, len(sentences_length)):
        sentences_length[i] += sentences_length[i - 1] + 1

    sentence_by_id = {}

    for line in input_a_file.open(encoding="utf8").readlines():
        lid, spans, label, _ = line.strip().split("\t")
        lid = int(lid)
        spans = [s.split() for s in spans.split(";")]
        spans = [(int(start), int(end)) for start, end in spans]

        # Find the sentence that contains this annotation.
        i = bisect.bisect(sentences_length, spans[0][0])

        # Shift document-level offsets to sentence-level offsets.
        if i > 0:
            spans = [
                (
                    start - sentences_length[i - 1] - 1,
                    end - sentences_length[i - 1] - 1,
                )
                for start, end in spans
            ]
        spans.sort(key=lambda t: t[0])

        # Store the annotation in the corresponding sentence.
        the_sentence = collection.sentences[i]
        keyphrase = Keyphrase(the_sentence, label, lid, spans)
        the_sentence.keyphrases.append(keyphrase)
        if len(keyphrase.spans) == 1:
            keyphrase.split()

        sentence_by_id[lid] = the_sentence

    return sentence_by_id
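# Line format of the output_a_* file, inferred from the parsing above
# (tab-separated; the last field is ignored and is presumably the surface
# text of the annotation):
#
#     1 \t 3 7;8 15 \t Concept \t asma cronica
#
# i.e. id, ';'-separated "start end" pairs in document-level offsets, label,
# and text. The spans are then shifted to sentence-level offsets via bisect.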
def run(self, collection, taskA, taskB):
    gold_keyphrases, gold_relations = self.model

    if taskA:
        next_id = 0
        for gold_keyphrase, label in gold_keyphrases.items():
            for sentence in collection.sentences:
                text = sentence.text.lower()
                # Escape the keyphrase so it is matched literally, as whole words.
                pattern = r"\b" + re.escape(gold_keyphrase) + r"\b"
                for match in re.finditer(pattern, text):
                    keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                    keyphrase.split()
                    next_id += 1
                    sentence.keyphrases.append(keyphrase)

    if taskB:
        for sentence in collection.sentences:
            for origin in sentence.keyphrases:
                origin_text = origin.text.lower()
                for destination in sentence.keyphrases:
                    destination_text = destination.text.lower()
                    try:
                        label = gold_relations[
                            origin_text,
                            origin.label,
                            destination_text,
                            destination.label,
                        ]
                    except KeyError:
                        continue
                    relation = Relation(sentence, origin.id, destination.id, label)
                    sentence.relations.append(relation)
            sentence.remove_dup_relations()

    return collection
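# Shape of self.model, as read off the lookups above (a hedged reading, not a
# documented contract):
#
#     gold_keyphrases: Dict[str, str]
#         lowercase keyphrase text -> entity label
#     gold_relations: Dict[Tuple[str, str, str, str], str]
#         (origin text, origin label, destination text, destination label)
#         -> relation label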
def run_taskA_for_label(
    self, collection: Collection, entity_label: str, *args, **kwargs
):
    model = self.taskA_models[entity_label]

    print(f"Building dataset for {entity_label} ...")
    dataset = BILUOVSentencesDS(
        [s.text for s in collection.sentences], language=self.nlp
    )
    print("Done!")

    with torch.no_grad():
        for sid, (*s_features, _) in tqdm(
            enumerate(dataset.shallow_dataloader()),
            total=len(dataset),
            desc=entity_label,
        ):
            tokensxsentence = dataset.tokensxsentence[sid]
            output = model(s_features)
            output = model.decode(output)
            labels = [dataset.labels[x] for x in output]
            decoded = from_biluov(labels, tokensxsentence, spans=True)

            sentence = collection.sentences[sid]
            for spans in decoded:
                # Placeholder id (-1); ids are presumably fixed up later.
                keyphrase = Keyphrase(sentence, entity_label, -1, spans)
                sentence.keyphrases.append(keyphrase)
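# Hedged usage sketch: run_taskA_for_label is presumably invoked once per
# entity label, with the placeholder ids fixed afterwards, mirroring the
# collection.fix_ids() pattern in _test_biluov_task above:
#
#     for label in ENTITIES:
#         self.run_taskA_for_label(collection, label)
#     collection.fix_ids()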