Esempio n. 1
0
    def test_allocate(self):
        """Check that Span.allocate turns a character range into token indices.

        Every case provides a context string, the character range of an
        answer inside it, the first and last token index the resulting span
        is expected to report, and the tokens expected at those indices
        (the span end is treated as inclusive).
        """
        small_context = "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."
        real_context = "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."

        # (context, start_char, end_char, first token, last token, tokens)
        cases = (
            # Small data
            (small_context, 19, 32, 4, 6, ['Houston', ',', 'Texas']),
            # Real data
            (real_context, 505, 523, 108, 110, ['Dangerously', 'in', 'Love']),
        )

        for text, char_from, char_to, tok_from, tok_to, expected in cases:
            tokens = tokenize(text)
            span = Span.allocate(align(text, tokens), char_from, char_to)
            self.assertEqual(span.start, tok_from)
            self.assertEqual(span.end, tok_to)
            # Walk the covered token indices alongside their offset into the
            # expected token list.
            covered = range(span.start, span.end + 1)
            for offset, tok_idx in enumerate(covered):
                self.assertEqual(tokens[tok_idx], expected[offset])
Esempio n. 2
0
def remove_entites(train_insts: List[Instance], config: Config) -> Set:
    """
    Relabel a random subset of entity spans with the O label.

    Entity spans are collected from each instance's ``output`` tags: a
    ``B-`` tag opens a span that the next ``E-`` tag closes, and an ``S-``
    tag is a single-token span. The spans are shuffled with the module-level
    ``random`` generator and the first
    ``round(n_spans * (1 - config.entity_keep_ratio))`` of them are
    overwritten in place with ``config.O``.

    An ``E-`` tag with no open ``B-`` is ignored; previously it produced a
    span starting at -1 (or at a stale earlier start), which later
    overwrote unrelated tags.

    :param train_insts: instances whose ``output`` tag sequences are
        modified in place
    :param config: supplies ``entity_keep_ratio`` and the ``O`` label
    :return: set of strings of the form ``"<type> <word> <word> ..."``, one
        per removed span
    """
    # Keep each span together with the instance it came from so the removal
    # step does not depend on ``inst.id`` being a position in ``train_insts``.
    candidates = []
    for inst in train_insts:
        start = -1
        for i, label in enumerate(inst.output):
            if label.startswith("B-"):
                start = i
            elif label.startswith("E-"):
                if start < 0:
                    # Closing tag without an opening one: nothing to record.
                    continue
                candidates.append(
                    (inst, Span(start, i, label[2:], inst_id=inst.id)))
                # The span is closed; a later stray E- must not reuse it.
                start = -1
            elif label.startswith("S-"):
                candidates.append(
                    (inst, Span(i, i, label[2:], inst_id=inst.id)))
    random.shuffle(candidates)

    span_set = set()
    num_entity_removed = round(
        len(candidates) * (1 - config.entity_keep_ratio))
    # Clamp at zero: a negative slice bound would select from the wrong end.
    for inst, span in candidates[:max(num_entity_removed, 0)]:
        output = inst.output
        for j in range(span.left, span.right + 1):
            output[j] = config.O
        words = inst.input.words[span.left:span.right + 1]
        span_set.add(span.type + " " + ' '.join(words))
    return span_set
Esempio n. 3
0
def redirects(source: str, language: str) -> Iterator[CaptureResult[Redirect]]:
    """Yield each redirect matched in ``source`` for ``language``.

    Every result pairs a ``Redirect`` (target page and, if present, the
    section after ``#``) with a ``Span`` built from the match's start and
    end offsets.
    """

    assert (language in redirect_magicwords), \
           'Language {} not in allowed choices.'.format(language)

    pattern = redirect_res[language]

    for found in peekable(pattern.finditer(source, concurrent=True)):
        raw_link = (found.group('link') or '').strip()

        # The anchor falls back to the full link text (section included);
        # newlines in it are rendered as spaces.
        label = (found.group('anchor') or raw_link)
        label = label.replace('\n', ' ').strip()

        # Everything after the first '#' names the section being linked to.
        page, _, section = raw_link.partition('#')

        # When the link part ends up empty (e.g. no pipe, so the regex put
        # everything in the anchor group), point the target at the anchor.
        if label and not page:
            page = label

        yield CaptureResult(
            Redirect(target=page, tosection=section),
            Span(found.start(), found.end()),
        )