def test_allocate(self):
    # Small data
    context = "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    query = ['Houston', ',', 'Texas']
    start_char = 19
    end_char = 32
    span = Span.allocate(anchors, start_char, end_char)
    self.assertEqual(span.start, 4)
    self.assertEqual(span.end, 6)
    for k in range(span.start, span.end + 1):
        self.assertEqual(context_toks[k], query[k - span.start])

    # Real data
    context = "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
    context_toks = tokenize(context)
    anchors = align(context, context_toks)
    query = ['Dangerously', 'in', 'Love']
    start_char = 505
    end_char = 523
    span = Span.allocate(anchors, start_char, end_char)
    self.assertEqual(span.start, 108)
    self.assertEqual(span.end, 110)
    for k in range(span.start, span.end + 1):
        self.assertEqual(context_toks[k], query[k - span.start])
def remove_entites(train_insts: List[Instance], config: Config) -> Set:
    """
    Remove a certain number of entities and relabel their tokens as O.
    :param train_insts: training instances; their output label sequences are modified in place
    :param config: provides entity_keep_ratio and the O label
    :return: the set of removed entity spans as "TYPE surface form" strings
    """
    # Collect all gold entity spans (BIOES-style labels) across the training set.
    all_spans = []
    for inst in train_insts:
        output = inst.output
        start = -1
        for i in range(len(output)):
            if output[i].startswith("B-"):
                start = i
            if output[i].startswith("E-"):
                end = i
                all_spans.append(Span(start, end, output[i][2:], inst_id=inst.id))
            if output[i].startswith("S-"):
                all_spans.append(Span(i, i, output[i][2:], inst_id=inst.id))
    random.shuffle(all_spans)
    span_set = set()
    # Keep entity_keep_ratio of the spans; relabel the rest as O.
    num_entity_removed = round(len(all_spans) * (1 - config.entity_keep_ratio))
    for i in range(num_entity_removed):
        span = all_spans[i]
        inst_id = span.inst_id
        output = train_insts[inst_id].output
        for j in range(span.left, span.right + 1):
            output[j] = config.O
        span_str = ' '.join(train_insts[inst_id].input.words[span.left:(span.right + 1)])
        span_str = span.type + " " + span_str
        span_set.add(span_str)
    return span_set
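# Usage sketch (illustrative only): the SimpleNamespace objects below are ad-hoc
# stand-ins for the project's Instance and Config classes, exposing just the
# attributes remove_entites touches (.id, .output, .input.words,
# .entity_keep_ratio, .O); the real classes live elsewhere in the repo.
if __name__ == '__main__':
    from types import SimpleNamespace
    import random

    random.seed(0)  # make the shuffle inside remove_entites repeatable
    _inst = SimpleNamespace(
        id=0,
        input=SimpleNamespace(words=["John", "lives", "in", "New", "York"]),
        output=["S-PER", "O", "O", "B-LOC", "E-LOC"],
    )
    _config = SimpleNamespace(entity_keep_ratio=0.5, O="O")

    removed = remove_entites([_inst], _config)
    # With a keep ratio of 0.5, one of the two gold spans is relabelled as O;
    # `removed` holds a string such as "LOC New York" for each dropped span.
    print(removed)
    print(_inst.output)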
def redirects(source: str, language: str) -> Iterator[CaptureResult[Redirect]]:
    """Yield the redirects found in the document."""
    assert (language in redirect_magicwords), \
        'Language {} not in allowed choices.'.format(language)

    redirect_re = redirect_res[language]
    redirect_matches = peekable(redirect_re.finditer(source, concurrent=True))

    for match in redirect_matches:
        target = match.group('link') or ''
        target = target.strip()

        anchor = match.group('anchor') or target
        # Newlines in the anchor are rendered as spaces.
        anchor = anchor.replace('\n', ' ').strip()

        # Split on '#' (link to a section of the target page).
        tosection = ''
        if '#' in target:
            splittarget = target.split('#', 1)
            target = splittarget[0]
            tosection = splittarget[1]

        # For some reason, if the wikilink has no pipe, e.g. [[apple]], the
        # regex above captures everything in the anchor group, so set the
        # link to the same page.
        if anchor and not target:
            target = anchor

        redirect = Redirect(target=target, tosection=tosection)

        yield CaptureResult(redirect, Span(match.start(), match.end()))
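# Usage sketch (illustrative only): assumes 'en' is a key of the module's
# redirect_magicwords/redirect_res tables and that CaptureResult unpacks into
# a (data, span) pair, as suggested by the construction above; the wikitext
# line is made up for the example.
if __name__ == '__main__':
    sample = "#REDIRECT [[Main Page#History]]"
    for result in redirects(sample, language='en'):
        redirect, span = result
        # Expect the target page, the section after '#', and the character
        # offsets of the match within `sample`.
        print(redirect.target, redirect.tosection, span)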