def test_melting_point_heading_salt(self):
    """Test extraction of melting point from a heading and paragraphs.

    Example taken from patent US06840965B2.
    """
    d = Document(
        Heading('D. Synthesis of 4-Amino-2-(3-thienyl)phenol Hydrochloride'),
        Paragraph(
            '3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in '
            '40 mL of ethanol and hydrogenated at 25° C. in the presence of '
            '600 mg of a palladium—active carbon catalyst (10%). After the '
            'theoretically required amount of hydrogen had been absorbed, the '
            'catalyst was filtered off. Following concentration in a rotary '
            'evaporator, the reaction mixture was poured onto 20 mL of cold '
            'diethyl ether. The precipitated product was filtered off and dried.'
        ),
        Paragraph(
            'This gave 1.95 g (75% of the theoretical) of '
            '4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of '
            '130-132° C.'
        ))
    expected = [
        {'names': ['4-nitro-2-(3-thienyl)phenol']},
        {'names': ['ethanol']},
        {'names': ['palladium']},
        {'names': ['carbon']},
        {'names': ['hydrogen']},
        {'names': ['diethyl ether']},
        {
            'melting_points': [{'units': '°C', 'value': '130-132'}],
            'names': [
                '4-Amino-2-(3-thienyl)phenol Hydrochloride',
                '4-amino-2-(3-thienyl)phenol hydrochloride'
            ],
            'roles': ['product']
        },
    ]
    self.assertEqual(expected, d.records.serialize())
def tokenize(text, cems=False):
    if cems:
        # Get the initial annotation: start offsets of chemical entity mentions.
        cde_cem_starts = [cem.start for cem in Document(text).cems]
    else:
        cde_cem_starts = []

    # Get all tokens.
    cde_p = Paragraph(text)
    all_tokens = cde_p.tokens
    pos_tokens = cde_p.pos_tagged_tokens  # part-of-speech tagger

    # Build the array for annotation.
    tokens = []
    for row_idx, sentence in enumerate(all_tokens):
        tokens.append([])
        for idx, elem in enumerate(sentence):
            tokens[row_idx].append({
                "id": "token-" + str(elem.start) + "-" + str(elem.end),
                "annotation": ('CHM' if elem.start in cde_cem_starts else None),
                "pos": pos_tokens[row_idx][idx][1],
                "text": elem.text,
                "start": elem.start,
                "end": elem.end
            })
    return tokens
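# A minimal usage sketch for the annotator above, assuming chemdataextractor
# is installed; the example sentence is hypothetical. Each token comes back as
# a dict, with "annotation" set to 'CHM' only for chemical entity mentions.
from chemdataextractor import Document
from chemdataextractor.doc import Paragraph

annotated = tokenize('LiCoO2 was heated to 700 °C.', cems=True)
for sentence in annotated:
    for tok in sentence:
        print(tok["text"], tok["pos"], tok["annotation"])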
def custom_tokenize(text, lowercase=False, deacc=False, encoding='utf8',
                    errors="strict", to_lower=False, lower=False, cde=True):
    text = to_unicode(text, encoding, errors=errors)
    lowercase = lowercase or to_lower or lower
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    if cde:
        # Normalize whitespace, then tokenize with the ChemDataExtractor
        # tokenizer, yielding non-punctuation tokens sentence by sentence.
        text = " ".join(text.split())
        cde_p = Paragraph(text)
        for sentence in cde_p.tokens:
            for tok in sentence:
                if tok.text not in string.punctuation:
                    yield tok.text
    else:
        for match in PAT_ALPHABETIC.finditer(text):
            yield match.group()
def test_parse_control_character(self):
    """Test that a control character in text is handled correctly."""
    # The parser doesn't like control characters because it uses the LXML
    # model, so the input must be XML-compatible.
    d = Document(
        Paragraph('Yielding 2,4,6-trinitrotoluene,\n m.p. 20 \x0eC.'))
    expected = [{'names': ['2,4,6-trinitrotoluene']}]
    self.assertEqual(expected, d.records.serialize())
def build_abbreviations_dict(self, materials_list, paragraphs):
    """
    :param materials_list: list of found materials entities
    :param paragraphs: list of paragraphs in which to look for abbreviations
    :return: dictionary mapping abbreviation -> corresponding entity
    """
    abbreviations_dict = {
        t: '' for t in materials_list
        if self.__is_abbreviation(t.replace(' ', ''))
    }
    not_abbreviations = list(set(materials_list) - set(abbreviations_dict.keys()))

    # Run through the materials list to resolve abbreviations among its
    # entities: an abbreviation matches a name when both carry the same
    # capital letters (the pattern [A-NP-Z] skips 'O').
    for abbr in abbreviations_dict.keys():
        for material_name in not_abbreviations:
            if sorted(re.findall('[A-NP-Z]', abbr)) == sorted(
                    re.findall('[A-NP-Z]', material_name)):
                abbreviations_dict[abbr] = material_name

    # For all remaining abbreviations, go through the paper text.
    for abbr, name in abbreviations_dict.items():
        if name == '':
            sents = ' '.join([
                s.text for p in paragraphs for s in Paragraph(p).sentences
                if abbr in s.text
            ]).split(abbr)
            i = 0
            while abbreviations_dict[abbr] == '' and i < len(sents):
                sent = sents[i]
                for tok in sent.split(' '):
                    if sorted(re.findall('[A-NP-Z]', tok)) == sorted(
                            re.findall('[A-NP-Z]', abbr)):
                        abbreviations_dict[abbr] = tok
                i = i + 1

    # Resolve hyphenated abbreviations from their already-resolved parts.
    for abbr in abbreviations_dict.keys():
        parts = re.split('-', abbr)
        if all(p in abbreviations_dict for p in parts) \
                and abbreviations_dict[abbr] == '' and len(parts) > 1:
            name = ''.join('(' + abbreviations_dict[p] + ')' + '-'
                           for p in parts).rstrip('-')
            abbreviations_dict[abbr] = name

    # Drop abbreviations that could not be resolved.
    empty_list = [abbr for abbr, name in abbreviations_dict.items() if name == '']
    for abbr in empty_list:
        del abbreviations_dict[abbr]

    return abbreviations_dict
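# A small illustration of the capital-letter fingerprint used above; a sketch
# with hypothetical example strings. '[A-NP-Z]' collects every capital letter
# except 'O', so an abbreviation matches a candidate name when both carry the
# same multiset of capitals:
import re

def fingerprint(s):
    return sorted(re.findall('[A-NP-Z]', s))

print(fingerprint('NMP') == fingerprint('N-Methyl-2-Pyrrolidone'))  # True
print(fingerprint('NMP') == fingerprint('dimethylformamide'))       # False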
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """Converts a string to a list of tokens (words) using a modified
    chemdataextractor tokenizer.

    Adds a few fixes for inorganic materials science, such as splitting
    common units from numbers and splitting the valence state.

    Args:
        text: Input text as a string.
        split_oxidation: If True, will split the oxidation state from the
            element, e.g. iron(II) will become iron (II), same with Fe(II), etc.
        keep_sentences: If False, will disregard the sentence structure and
            return tokens as a single list of strings. Otherwise returns a
            list of lists, each sentence separately.

    Returns:
        A list of strings if keep_sentences is False, otherwise a list of
        lists of strings, with each inner list corresponding to a single
        sentence.
    """

    def split_token(token, so=split_oxidation):
        """Processes a single token, in case it needs to be split up.

        There are 2 cases when the token is split: a number with a common
        unit, or an element with a valence state.

        Args:
            token: The string to be processed.
            so: If True, split the oxidation (valence) string. Units are
                always split.

        Returns:
            A list of strings.
        """
        elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(token) if so else None
        nr_unit = self.NR_AND_UNIT.match(token)
        if nr_unit is not None and nr_unit.group(2) in self.SPLIT_UNITS:
            # Splitting the unit from the number, e.g. "5V" -> ["5", "V"].
            return [nr_unit.group(1), nr_unit.group(2)]
        elif elem_with_valence is not None:
            # Splitting the element from its valence state, e.g. "Fe(II)" -> ["Fe", "(II)"].
            return [elem_with_valence.group(1), elem_with_valence.group(2)]
        else:
            return [token]

    cde_p = Paragraph(text)
    tokens = cde_p.tokens
    toks = []
    for sentence in tokens:
        if keep_sentences:
            toks.append([])
            for tok in sentence:
                toks[-1] += split_token(tok.text, so=split_oxidation)
        else:
            for tok in sentence:
                toks += split_token(tok.text, so=split_oxidation)
    return toks
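# A hedged usage sketch for the tokenizer above. It assumes the host class is
# mat2vec's MaterialsTextProcessor (which defines ELEMENT_VALENCE_IN_PAR,
# NR_AND_UNIT and SPLIT_UNITS); the import path and the instance name `mtp`
# are assumptions here:
from mat2vec.processing import MaterialsTextProcessor

mtp = MaterialsTextProcessor()
print(mtp.tokenize('The Fe(II) cell was cycled at 5V.', keep_sentences=False))
# Expected shape, roughly:
# ['The', 'Fe', '(II)', 'cell', 'was', 'cycled', 'at', '5', 'V', '.']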
def _find_variables(self, var, raw_text, mp):
    sents = Paragraph(raw_text).sentences
    i = 0
    values = []
    while len(values) == 0 and i < len(sents):
        sent = sents[i]
        try:
            values, mode = mp.get_stoichiometric_values(var, sent.text)
        except ValueError:
            pass
        i += 1
    return values
def get_ents(paragraphs):
    # Get the extractor (lazily initialised module-level singleton).
    global extractor
    config_path = os.path.join(os.path.realpath('.'), '.env')
    load_dotenv(dotenv_path=config_path)
    models_dir = environ.get('MODELS_DIR')
    model_name = environ.get('ACTIVE_MODEL')
    model_dir = os.path.join(models_dir, model_name)
    if extractor is None:
        extractor = RxnExtractor(model_dir=model_dir)

    # Split each paragraph into sentences.
    paragraphs = [Paragraph(p).sentences for p in paragraphs]
    sentences = []
    for par in paragraphs:
        for sent in par:
            sentences.append(str(sent))
    reactions = extractor.get_reactions(sentences)

    # Re-combine sentences into paragraphs, shifting token indices from
    # sentence-local to paragraph-local positions.
    extractions = []
    off = 0
    for par in paragraphs:
        tokens = []
        recs = []
        for j in range(off, off + len(par)):
            sent_react = reactions[j]
            for r in sent_react["reactions"]:
                r_offset = {}
                for k in r:
                    r_offset[k] = []
                    for e in r[k]:
                        if isinstance(e, (list, tuple)):
                            r_offset[k].append([idx + len(tokens) for idx in e[1:]])
                        elif isinstance(e, int):
                            r_offset[k].append(e + len(tokens))
                recs.append(r_offset)
            tokens.extend(sent_react['tokens'])
        extractions.append({'tokens': tokens, 'reactions': recs})
        off += len(par)
    return extractions
def tokenize(text):
    """
    Returns a list of lists of tokens using the chemdataextractor tokenizer,
    keeping the structure of sentences (one inner list per sentence).
    """
    cde_p = Paragraph(text)
    tokens = cde_p.tokens
    toks = []
    for sentence in tokens:
        toks.append([])
        for tok in sentence:
            toks[-1].append(tok.text)
    return toks
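# A minimal usage sketch, assuming chemdataextractor is installed; the exact
# token boundaries depend on the CDE tokenizer, but the output should look
# roughly like this:
print(tokenize('TiO2 was annealed. It was then cooled.'))
# -> [['TiO2', 'was', 'annealed', '.'], ['It', 'was', 'then', 'cooled', '.']]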
def append_cde_mols(text, mol_list, ptable):
    """
    This function uses ChemDataExtractor to find all molecules in a chunk of
    text and appends them to mol_list.

    Parameters:
        text (str, required): The text to find molecules in
        mol_list (list, required): Running list of molecule names, modified
            in place
        ptable: Unused here; kept for interface compatibility
    """
    para = Paragraph(text)
    new_mols = para.cems  # find all molecules in the text
    for mol in new_mols:
        mol_list.append(mol.text)
        print('appended ', mol)
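# A short usage sketch with hypothetical inputs. Note that the function
# mutates mol_list in place rather than returning it:
mols = []
append_cde_mols('The sample contained ethanol and TiO2.', mols, ptable=None)
print(mols)  # e.g. ['ethanol', 'TiO2']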
def test_syn_order():
    '''Tests if function syn_order works'''
    paragraph = Paragraph(
        'After drying, the HTM was deposited by spin-coating a solution of spiro-MeOTAD, 4-tert-butylpyridine, '
        'lithium bis(trifluoromethylsulphonyl)imide and tris(2-(1H-pyrazol-1-yl)-4-tert-butylpyridine)cobalt(iii) '
        'bis(trifluoromethylsulphonyl)imide in chlorobenzene. '
        'Annealing the as-deposited films at 100\u2009°C for 45\u2009min in the N2-filled glove box '
        'before spin-coating the hole transporter enabled full crystallization of the perovskite, darkening '
        'the colour and resulting in an apparent growth of the crystal features visible in the SEM image, '
        'as shown in Extended Data Fig. 1.')
    vb_order, vb_dict = order.syn_order(paragraph)
    assert vb_order[0][0] == 'dry', 'First action identified is incorrect'
    assert vb_order[0][1] == 0, \
        'Sentence number where first action is identified is incorrect'
    assert vb_order[2][0] == 'anneal', 'Fails to identify capitalized word'
    assert ['anneal', 'spin-coat'] in vb_dict.values(), \
        'Fails to store all steps found in vb_dict output'
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """
    Converts a string to a list of tokens (words) using the chemdataextractor
    tokenizer, with a couple of fixes for inorganic materials science. Keeps
    the structure of sentences.

    :param text: input text as a string
    :param split_oxidation: if True, will split the oxidation state from the
        element, e.g. iron(II) will become iron (II), same with Fe(II), etc.
    :param keep_sentences: if False, will disregard the sentence structure
        and return tokens as a single list of strings. Otherwise returns a
        list of lists, each sentence separately.
    """

    def split_token(token, so=split_oxidation):
        """
        Process a single token, in case it needs to be split up. There are
        2 cases: it's a number with a unit, or an element with a valence state.
        """
        elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(token) if so else None
        nr_unit = self.NR_AND_UNIT.match(token)
        if nr_unit is not None and nr_unit.group(2) in self.SPLIT_UNITS:
            # Splitting the unit from the number, e.g. "5V" -> ["5", "V"].
            return [nr_unit.group(1), nr_unit.group(2)]
        elif elem_with_valence is not None:
            # Splitting the element from its valence state, e.g. "Fe(II)" -> ["Fe", "(II)"].
            return [elem_with_valence.group(1), elem_with_valence.group(2)]
        else:
            return [token]

    cde_p = Paragraph(text)
    tokens = cde_p.tokens
    toks = []
    for sentence in tokens:
        if keep_sentences:
            toks.append([])
            for tok in sentence:
                toks[-1] += split_token(tok.text, so=split_oxidation)
        else:
            for tok in sentence:
                toks += split_token(tok.text, so=split_oxidation)
    return toks
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file", type=str, required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file", type=str, required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    with open(args.output_file, "w") as fw:
        with open(args.annotation_file, "r") as fr:
            reader = csv.DictReader(fr, delimiter=',')
            for row in tqdm(reader):
                text = row["description"]
                # Iterating a Paragraph yields its sentences.
                sents = [s.text for s in Paragraph(text)]
                for sent in sents:
                    fw.write(f"{sent}\n")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file", type=str, required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file", type=str, required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    data = []
    with open(args.annotation_file, "r") as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            text = row["description"]
            tokens = text.split(' ')
            product = row["Products"]
            product_tags = row["Products-tag"]
            if product_tags == "":
                # Skip rows whose products field is empty.
                assert (product == "")
                continue
            product_spans = make_spans(product_tags)

            # Verify the correspondence between ${product} and tags.
            val = " ".join([" ".join(tokens[start:end])
                            for (start, end) in product_spans])
            assert (val == product)

            sents = [s.text.split(' ') for s in Paragraph(text)]
            # Make sure the indexes don't change after sentence segmentation.
            if text != " ".join([" ".join(sent) for sent in sents]):
                print("text not matched after tokenization, skip.")
                continue

            # Get sentence boundaries.
            sent_boundaries = [0, ]
            for sent in sents:
                offset = sent_boundaries[-1]
                sent_boundaries.append(offset + len(sent))

            def get_segment(interval, boundaries, window=3):
                """Return the token span of the sentence containing `interval`,
                widened by `window` sentences of context."""
                cxt = int((window - 1) / 2)
                start, end = interval
                for i, b in enumerate(boundaries):
                    if start >= b and (end - 1 < boundaries[i + 1]):
                        sent_id = i
                        break
                segment_start = boundaries[max(0, sent_id - cxt)]
                segment_end = boundaries[min(len(boundaries) - 1, sent_id + cxt + 1)]
                return (segment_start, segment_end)

            for span in product_spans:
                # For each product mention, create an individual instance.
                seg_start, seg_end = get_segment(span, sent_boundaries, window=1)
                tagged_text = []
                for p, token in enumerate(tokens):
                    tagged_text.append([token, 'O'])
                # Assign B/I- tags to each token.
                for field in FOI:
                    fval = row[field]
                    fval_tags = row[field + "-tag"]
                    if fval_tags == "":
                        assert (fval == "")
                        continue
                    fval_spans = make_spans(fval_tags)
                    for fval_span in fval_spans:
                        start, end = fval_span
                        tagged_text[start][1] = f'B-{field}'
                        if end == start + 1:
                            continue
                        for i in range(start + 1, end):
                            tagged_text[i][1] = f'I-{field}'
                # Mark the product mention with [P1]/[P2] boundary tokens;
                # seg_end + 2 accounts for the two inserted markers.
                prod_span_start, prod_span_end = span
                tagged_text.insert(prod_span_start, ["[P1]", "O"])
                tagged_text.insert(prod_span_end + 1, ["[P2]", "O"])
                tagged_segment = tagged_text[seg_start:(seg_end + 2)]
                data.append(tagged_segment)

    with open(args.output_file, "w") as f:
        for tt in data:
            for token, tag in tt:
                f.write(f"{token}\t{tag}\n")
            f.write("\n")
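# `make_spans` is not shown in this file. A minimal sketch consistent with the
# inline span-building logic used elsewhere in this collection (the tag string
# holds pairs of comma-separated token offsets, e.g. "3,5,10,12" -> [(3, 5),
# (10, 12)]):
def make_spans(tag_string):
    offsets = list(map(int, tag_string.split(',')))
    return [(offsets[i * 2], offsets[i * 2 + 1])
            for i in range(len(offsets) // 2)]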
# coding: utf-8

# # Extracting Solubility

# In[ ]:

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

# In[5]:

d = Document(
    Paragraph(
        u'The procedure was followed to yield a pale yellow solid Hippeastrine Hydrobromide. ( melting point of Amodiaquine is 137 °C)'
    ))

# In[6]:

d.records.serialize()

# In[37]:

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType


class Solubility(BaseModel):
    value = StringType()
    units = StringType()
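# The Solubility model above is declared but never wired up. Below is a sketch
# of the remaining steps, modelled on the boiling-point example in the
# chemdataextractor documentation; the solubility grammar itself (prefix, unit
# and value patterns) is an illustrative assumption, not a tested parser.
import re
from chemdataextractor.parse import R, I, Optional
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

Compound.solubilities = ListType(ModelType(Solubility))

prefix = I(u'solubility').hide()
units = (R(u'^m?g/mL$') | R(u'^g/L$'))(u'units')
value = R(u'^\d+(\.\d+)?$')(u'value')
sol = (prefix + Optional(I(u'of')).hide() + value + units)(u'sol')


class SolubilityParser(BaseParser):
    root = sol

    def interpret(self, result, start, end):
        compound = Compound(
            solubilities=[
                Solubility(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound


Paragraph.parsers = [SolubilityParser()]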
from pprint import pprint

from chemdataextractor.doc import Paragraph
from operations_extractor import OperationsExtractor
from text_cleanup import TextCleanUp  # assumed module path for TextCleanUp

oe = OperationsExtractor()
tp = TextCleanUp()

text_sents = [
    "LiNixMn2−xO4 (x=0.05,0.1,0.3,0.5) samples were prepared in either an air or an O2 atmosphere by solid-state reactions.",
    "Mixtures of Li2CO3,MnCO3, and NiO were heated at 700°C for 24 to 48 h with intermittent grinding.",
    "All these samples were cooled to room temperature at a controlled rate of 1°C/min."
]

paragraph_data = []
for sent in text_sents:
    text = tp.cleanup_text(sent)
    sent_toks = [tok for sent in Paragraph(text).raw_tokens for tok in sent]
    operations, spacy_tokens = oe.get_operations(sent_toks)
    updated_operations = oe.operations_correction(spacy_tokens,
                                                  operations,
                                                  parsed_tokens=True)
    updated_operations = oe.find_aqueous_mixing(spacy_tokens,
                                                updated_operations,
                                                parsed_tokens=True)
    paragraph_data.append((spacy_tokens, updated_operations))

paragraph_data_upd = oe.operations_refinement(paragraph_data,
                                              parsed_tokens=True)
pprint(paragraph_data_upd)
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 17 09:07:39 2021

@author: Kristian
"""
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

# A u prefix in front of a string indicates that a unicode string is to be
# created; this matters for symbols like the degree sign, which may not be
# recognized as ASCII.
d = Document(
    Heading(u'Synthesis of HKUST-1-AC'),
    Paragraph(
        u'The BET surface area and CO2 uptake capacity values for the HKUST-1–AC composite were 1381 m2 g−1 and 8.1 mmol g−1 (at 273 K and 1 bar), respectively, representing increases of 70% and 39%, respectively, over the reported values for HKUST-1'
    ))

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
import re
from chemdataextractor.parse import R, I, W, Optional, merge


class Capacity(BaseModel):
    value = StringType()
    units = StringType()


Compound.capacity = ListType(ModelType(Capacity))

prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide()
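# The grammar above stops at the prefix. A hedged sketch of the remaining
# steps, modelled on the boiling-point example in the chemdataextractor docs;
# the unit pattern for "mmol g−1" is an untested assumption about how the CDE
# tokenizer splits that unit:
units = (R(u'^m?mol$') + R(u'^g−1$'))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
capacity = (prefix + value + units)(u'capacity')

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first


class CapacityParser(BaseParser):
    root = capacity

    def interpret(self, result, start, end):
        compound = Compound(
            capacity=[
                Capacity(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound


Paragraph.parsers = [CapacityParser()]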
def get_CDE_mols(corpus_path, years, ppy, output_path, mode='fulltext'):
    """
    This function grabs molecules from randomly sampled papers in the corpus
    using ChemDataExtractor.

    Parameters:
        corpus_path (str, required): Path to the corpus
        years (list, required): List of years to find mols for
        ppy (int, required): Papers per year. How many papers to get mols from per year
        output_path (str, required): Path to place output data to be further analyzed
        mode (str, optional): Either 'fulltext' or 'abstract' or 'both'
    """
    paper_count = 0

    # Make sure we have consistent endings.
    if not corpus_path.endswith('/'):
        corpus_path += '/'

    # Get a list of all the journal directories and remove the README.
    journals = os.listdir(corpus_path)
    journals.remove('README.txt')
    random.seed(42)
    random.shuffle(journals)

    # Iterate through every journal in the corpus.
    for journal_name in journals:
        journal_path = corpus_path + journal_name + '/'
        journal_json = journal_path + journal_name + '.json'
        print('On journal ', journal_name)

        # Open the entire dictionary corresponding to a single journal.
        with open(journal_json) as json_file:
            journal_dict = json.load(json_file)

        # Iterate through the years specified in the parameter.
        for year in years:
            year_dict = journal_dict[year]
            print(year)
            try:
                # There may not be enough papers in this year for this publication.
                paper_idxs = random.sample(range(len(year_dict)), ppy)
            except ValueError:
                continue
            for num in paper_idxs:
                paper_count += 1
                print('On paper ', paper_count, ' of ',
                      len(journals) * len(years) * ppy)

                # Grab the paper from this year corresponding to the 'num'th paper.
                paper_dict = year_dict[str(num)]

                # Get the fulltext out.
                try:
                    text = paper_dict['fulltext']
                except KeyError:
                    continue
                if type(text) != str:
                    continue

                # Remove nonsense information.
                text = clean_paper(text)
                para = Paragraph(text)
                mols = para.cems  # find all molecules in the text
                mols = ['<<NEW_PAPER>>'] + [mol.text for mol in mols]
                with open(output_path, 'a') as file:
                    for entry in mols:
                        file.write(entry + '\n')
                    file.write('\n')
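# A hedged usage sketch for the corpus walker above; the paths and years are
# hypothetical placeholders, and the corpus layout (one JSON per journal
# directory, keyed by year) must match what the function expects:
get_CDE_mols(corpus_path='corpus/', years=['2018', '2019'], ppy=5,
             output_path='cde_mols.txt')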
def tokenize(self, texts='default', entities='default', use_entities=True,
             keep_sentences=True, exclude_punct=False, save=False):
    """
    Takes the set of normalized texts and tokenizes them.

    Parameters:
        texts (list): List of texts to tokenize. If `default` then
            self.normalized_texts will be used
        entities (dict): Dictionary of entity names and index positions. If
            `default` then self.entities_per_text will be used
        use_entities (bool): If true then entity dict will be used to tokenize
            multi-word phrases and chemical entities. Otherwise all words in
            text list will be tokenized with the same algorithm and some
            entities may be split
        keep_sentences (bool): If true then abstract will be split into list
            of lists where each nested list is a single sentence. Otherwise
            abstract will be split into a single list of tokens
        exclude_punct (bool): If true then common punctuation marks will be
            left out of the token list. Otherwise all punctuation will remain
    """
    if texts == 'default':
        texts = self.normalized_texts
    if entities == 'default':
        entities = self.entities_per_text
    if use_entities:
        assert len(texts) == len(entities), (
            "ERROR: SIZE OF ENTITY AND TEXT LISTS DO NOT MATCH. YOU CAN "
            "EITHER RUN A NORMALIZATION FUNCTION ON UNPROCESSED TEXT OR "
            "LOAD FILES OF MATCHING SIZE")

    ### Instantiate Mat2Vec MaterialsTextProcessor
    MTP = MaterialsTextProcessor()

    ### Iterate through all abstracts and corresponding entities if applicable
    for i in trange(len(texts)):
        text = texts[i]
        entity_spans = []
        if use_entities:
            entry = entities[i]
            for entity in entry:
                name = entity[0]
                start = entity[1]
                stop = entity[2]
                entity_spans.append((start, stop))
                new_name = name.replace(' ', '_')
                text = text[:start] + new_name + text[stop:]
        if not keep_sentences:
            ### Split text into entities vs. non-entities
            token_list = self.extract_entity_tokens(text, entity_spans)
            ### Tokenize non-entities and combine with entities
            tokens, self.entity_idxs[i] = self.process_token_list(token_list)
            ### Use Mat2Vec MaterialsTextProcessor for casing, number
            ### normalization, punctuation, etc.
            tokens, _ = MTP.process(tokens,
                                    exclude_punct=exclude_punct,
                                    normalize_materials=False,
                                    split_oxidation=False)
        else:
            ### Split text into sentences
            tokens = []
            self.entity_idxs[i] = []
            para = Paragraph(text)
            prior_split = 0
            for j, sentence in enumerate(para.sentences):
                split = sentence.end
                sentence = sentence.text
                sentence_entities = []
                for span in entity_spans:
                    if span[1] < split and span[0] >= prior_split:
                        new_span = (span[0] - split, span[1] - split)
                        sentence_entities.append(new_span)
                prior_split = split
                ### Make a token_list for each sentence
                token_list = self.extract_entity_tokens(sentence, sentence_entities)
                ### Tokenize non-entities and combine with entities
                sentence_tokens, sentence_entity_idxs = self.process_token_list(token_list)
                self.entity_idxs[i].append(sentence_entity_idxs)
                ### Mat2Vec Processing
                sentence_tokens, _ = MTP.process(sentence_tokens,
                                                 exclude_punct=exclude_punct,
                                                 normalize_materials=False,
                                                 split_oxidation=False)
                tokens.append(sentence_tokens)
        self.tokenized_texts[i] = tokens

    if save:
        os.makedirs('preprocessor_files', exist_ok=True)
        with io.open('preprocessor_files/tokenized_texts.json', 'w',
                     encoding='utf8') as f:
            out_ = json.dumps(self.tokenized_texts, indent=4, sort_keys=False,
                              separators=(',', ': '), ensure_ascii=False)
            f.write(str(out_))
        with io.open('preprocessor_files/tokenized_entity_idxs.json', 'w',
                     encoding='utf8') as f:
            out_ = json.dumps(self.entity_idxs, indent=4, sort_keys=False,
                              separators=(',', ': '), ensure_ascii=False)
            f.write(str(out_))
""" Created on Tue Feb 16 08:42:34 2021 @author: Kristian """ from chemdataextractor import Document from chemdataextractor.model import Compound from chemdataextractor.doc import Paragraph, Heading #u in front of the string indicates that a unicode string is to be created #We think the unicode is for symbols like the degree since it may not be recognized ASCII d = Document( Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'), Paragraph( u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C) and a white solid (b.p. 60 °C)' )) from chemdataextractor.model import BaseModel, StringType, ListType, ModelType class BoilingPoint(BaseModel): value = StringType() units = StringType() Compound.boiling_points = ListType(ModelType(BoilingPoint)) import re from chemdataextractor.parse import R, I, W, Optional, merge '''
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file", type=str, required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file", type=str, required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    data = defaultdict(list)
    with open(args.annotation_file, "r") as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            text = row["description"]
            tokens = text.split(' ')
            product = row["Products"]
            product_tags = row["Products-tag"]
            if product_tags == "":
                assert (product == "")
                continue
            product_tags = list(map(int, product_tags.split(',')))

            # Verify the correspondence between ${product} and tags.
            product_spans = []
            for i in range(int(len(product_tags) / 2)):
                product_spans.append((product_tags[i * 2], product_tags[i * 2 + 1]))
            val = " ".join([" ".join(tokens[start:end])
                            for (start, end) in product_spans])
            assert (val == product)

            for span in product_spans:
                if span not in data[text]:
                    data[text].append(span)

    n_sents = 0
    with open(args.output_file, "w") as f:
        # Segment each paragraph into sentences, and map indexes correspondingly.
        for text, prod_spans in data.items():
            print(prod_spans)
            sents = [s.text.split(' ') for s in Paragraph(text)]
            print("{} sentences detected".format(len(sents)))
            n_sents += len(sents)

            # Make sure the indexes don't change after sentence segmentation.
            merged_text = " ".join([" ".join(sent) for sent in sents])
            if text != merged_text:
                print("text: %s (len: %d)" % (text, len(text)))
                print("merged text: %s (len: %d)" % (merged_text, len(merged_text)))

            # Get sentence boundaries.
            sent_boundaries = [0, ]
            for sent in sents:
                offset = sent_boundaries[-1]
                sent_boundaries.append(offset + len(sent))
            print(sent_boundaries)

            # Initialize all tokens with tag "O".
            tagged_text = []
            for p, token in enumerate(text.split(' ')):
                tagged_text.append([token, 'O'])

            # Check if a span (interval) crosses any sentence boundary.
            def cross_boundary(interval, refs):
                # e.g., interval = [10, 12]
                for i, b in enumerate(refs):
                    if b > interval[0] and b <= interval[1] - 1:
                        return i
                return -1

            # Tag Product tokens.
            for span in prod_spans:
                # If the span crosses sentence boundaries, skip (or merge the two sents).
                if cross_boundary(span, sent_boundaries) >= 0:
                    print("cross_boundary!")
                    continue
                start, end = span
                tagged_text[start][1] = 'B-Prod'
                if end == start + 1:
                    continue
                for i in range(start + 1, end):
                    tagged_text[i][1] = 'I-Prod'

            # Split the paragraph into sentences.
            tagged_sents = []
            for i in range(len(sents)):
                bos = sent_boundaries[i]
                eos = sent_boundaries[i + 1]
                tagged_sents.append(tagged_text[bos:eos])

            for ts in tagged_sents:
                for token, tag in ts:
                    f.write(f"{token}\t{tag}\n")
                f.write("\n")