def __call__(self, doc: Doc):
    """Adjust sentence boundaries on *doc* using the component's matchers.

    Three passes:
      1. ``split_matcher``: force a sentence start at the last token of
         each match, clearing a boundary immediately before it so no
         empty one-token sentence is created.
      2. ``join_matcher``: clear any sentence starts falling inside a
         match, fusing the sentences it spans.
      3. If the doc is sentenced, shift each sentence start forward past
         leading whitespace tokens.

    Returns the same *doc*, mutated in place.
    """
    # Temporarily clear is_parsed so is_sent_start assignments are allowed.
    save_parsed = doc.is_parsed
    doc.is_parsed = False
    if self.split_matcher:
        matches = self.split_matcher(doc)
        for match_id, start, end in matches:
            # New sentence begins at the final token of the match.
            token = doc[end - 1]
            token.is_sent_start = True
            # Remove an immediately-preceding boundary that would yield
            # a degenerate single-token sentence.
            if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                doc[end - 2].is_sent_start = False
    if self.join_matcher:
        matches = self.join_matcher(doc)
        for match_id, start, end in matches:
            # If there is a sent start in the match, just remove it
            for token in doc[start:end]:
                if token.is_sent_start:
                    token.is_sent_start = False
    if doc.is_sentenced:
        # Trim starting spaces: move each boundary forward past leading
        # whitespace tokens (but never past the whole sentence).
        for sent in doc.sents:
            sentlen = len(sent)
            first_non_space = 0
            while first_non_space < sentlen and sent[first_non_space].is_space:
                first_non_space += 1
            if 0 < first_non_space < sentlen:
                sent[0].is_sent_start = False
                sent[first_non_space].is_sent_start = True
    # Restore the saved flag; if boundary edits left the doc unsentenced,
    # mark it parsed so downstream components treat boundaries as final.
    doc.is_parsed = save_parsed if doc.is_sentenced else True
    return doc
def __call__(self, doc : Doc):
    """Apply split/join matcher rules to the sentence boundaries of *doc*.

    Split matches force a sentence start at their last token (removing a
    directly preceding boundary); join matches erase every sentence start
    inside their span. The doc is mutated in place and returned.
    """
    # Remember the parsed flag; it must be off while we edit boundaries.
    previously_parsed = doc.is_parsed
    doc.is_parsed = False
    if self.split_matcher:
        for match_id, start, end in self.split_matcher(doc):
            last_token = doc[end - 1]
            last_token.is_sent_start = True
            before = end - 2
            # Avoid a one-token sentence right before the new boundary.
            if before >= 0 and doc[before].is_sent_start is True:
                doc[before].is_sent_start = False
    if self.join_matcher:
        for match_id, start, end in self.join_matcher(doc):
            # If there is a sent start in the match, just remove it
            for tok in doc[start:end]:
                if tok.is_sent_start:
                    tok.is_sent_start = False
    doc.is_parsed = previously_parsed if doc.is_sentenced else True
    return doc
def load_and_transform(batch_id, in_loc, out_dir):
    """Deserialize one batch of spaCy Docs from *in_loc* and write the
    transformed text of each to ``<out_dir>/<batch_id>.txt``.

    Returns None immediately when the output file already exists, so the
    batch job can be resumed without redoing work.
    """
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        # Output from a previous run — skip this batch.
        return None
    print('Batch', batch_id)
    # Pipeline components are disabled: only the vocab is needed to
    # reconstruct Docs from their serialized bytes.
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False,
                           entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file, \
            io.open(in_loc, 'rb') as in_file:
        for byte_string in Doc.read_bytes(in_file):
            doc = Doc(nlp.vocab).from_bytes(byte_string)
            doc.is_parsed = True
            out_file.write(transform_doc(doc))