def lemmatization_stream() -> Iterator[str]:
    # Coerce the "lower" query-string flag to a boolean
    lower = request.args.get("lower", False)
    if lower:
        lower = True

    if request.method == "GET":
        data = request.args.get("data")
    else:
        data = request.form.get("data")

    if not data:
        yield ""
    else:
        header = False
        for chunk in chunks(data_iterator(data, lower=lower), size=BATCH):
            sents, lengths = zip(*chunk)
            tagged, tasks = tagger.tag(sents=sents, lengths=lengths)
            formatter = formatter_class(tasks)
            sep = "\t"
            for sent in tagged:
                # Emit the header row once, before the first sentence
                if not header:
                    yield sep.join(formatter.format_headers()) + '\r\n'
                    header = True
                for token, tags in sent:
                    yield sep.join(formatter.format_line(token, tags)) + '\r\n'
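A minimal sketch of how a line-by-line generator like lemmatization_stream() could be served as a streaming response, assuming Flask (which the use of request suggests). The route name and MIME type are illustrative; stream_with_context is needed because the generator reads request.args lazily, after the view has already returned.

# Sketch only: assumes lemmatization_stream() and its globals (tagger, formatter_class,
# BATCH, data_iterator) are defined as in the snippet above. Route and MIME type are
# placeholders.
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

@app.route("/lemmatize", methods=["GET", "POST"])
def lemmatize():
    # stream_with_context keeps the request context alive while the generator is
    # consumed, since lemmatization_stream() only touches request.args when iterated.
    return Response(stream_with_context(lemmatization_stream()),
                    mimetype="text/plain")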
def iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype,
                   no_tokenizer: bool = False) -> Generator[Dict[str, str], None, None]:
    # Reset at each document
    processor.reset()
    iterator.tokenizer.reset()
    # Iterate!
    for chunk in utils.chunks(
            iterator(data, lower=self.lower, no_tokenizer=no_tokenizer),
            size=self.batch_size):
        # Unzip the batch into the sentences, their sizes and the dictionaries of tokens
        # that need to be reinserted
        sents, lengths, needs_reinsertion = zip(*chunk)
        is_empty = [not bool(sent) for sent in sents]
        tagged, tasks = self.tag(
            sents=[sent for sent in sents if sent],
            lengths=[l for l in lengths if l != 0]
        )
        if not processor.task_init:
            processor.set_tasks(tasks)
        # We keep a real sentence index
        for sents_index, sent_is_empty in enumerate(is_empty):
            if sent_is_empty:
                sent = []
            else:
                sent = tagged.pop(0)
            # Get the tokens that need to be reinserted for this sentence
            sent_reinsertion = needs_reinsertion[sents_index]
            # If we have a disambiguator, we run the results through it
            if self.disambiguation and sent:
                sent = self.disambiguation(sent, tasks)

            reinsertion_index = 0
            for index, (token, tags) in enumerate(sent):
                # Reinsert tokens whose original position precedes the current index
                while reinsertion_index + index in sent_reinsertion:
                    yield processor.reinsert(sent_reinsertion[reinsertion_index + index])
                    del sent_reinsertion[reinsertion_index + index]
                    reinsertion_index += 1
                yield from processor.get_dict(token, tags)

            # Reinsert whatever is left after the last tagged token
            for reinsertion in sorted(list(sent_reinsertion.keys())):
                yield processor.reinsert(sent_reinsertion[reinsertion])
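The trickiest part of iter_tag_token is the reinsertion bookkeeping: tokens the iterator stripped before tagging are stored in a dict keyed by their original position and spliced back in around the tagged tokens. Below is a self-contained sketch of that index arithmetic with plain strings instead of pie-extended's processor objects; merge_with_reinsertions is a hypothetical helper, not part of the library.

# Hypothetical helper illustrating the offset/index arithmetic used above.
from typing import Dict, Iterator, List

def merge_with_reinsertions(tagged: List[str], reinsertions: Dict[int, str]) -> Iterator[str]:
    reinsertions = dict(reinsertions)  # local copy, mirrors the `del` in the original
    offset = 0
    for index, token in enumerate(tagged):
        # Emit every removed token whose original position precedes the current one
        while offset + index in reinsertions:
            yield reinsertions.pop(offset + index)
            offset += 1
        yield token
    # Whatever is left belonged after the last tagged token
    for position in sorted(reinsertions):
        yield reinsertions[position]

# Example: the iterator stripped the punctuation of "ego , sum ." before tagging.
# merge_with_reinsertions(["ego", "sum"], {1: ",", 3: "."})
# yields "ego", ",", "sum", "." in their original order.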
def prepare_buffer(self, buf, **kwargs):
    "Transform buffer into batch generator"
    def key(data):
        inp, tasks = data
        return len(inp)

    if self.minimize_pad:
        buf = sorted(buf, key=key, reverse=True)

    batches = list(utils.chunks(buf, self.batch_size))

    if self.shuffle:
        random.shuffle(batches)

    for batch in batches:
        yield self.pack_batch(batch, **kwargs)
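To make the two flags concrete, here is a standalone illustration (with a local chunks helper rather than pie's utils) of why sorting by length before chunking minimizes padding, and why shuffling the batches rather than the individual items preserves that property while still varying batch order.

# Standalone sketch; the chunks helper and the toy sequences are assumptions.
import random

def chunks(iterable, size):
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf

sequences = [["a"] * n for n in (7, 2, 9, 3, 5, 1)]
sequences.sort(key=len, reverse=True)     # minimize_pad: similar lengths end up together
batches = list(chunks(sequences, 2))
random.shuffle(batches)                   # shuffle batch order, not batch contents
for batch in batches:
    print([len(seq) for seq in batch])    # lengths inside each batch stay close together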
def lemmatise(path, model_spec):
    """
    Lemmatises raw text input, with the given model(s), using Pie.

    :param path: path to the folder containing the texts
    :param model_spec: specification of the model(s), in Pie syntax
    :return: a dictionary with a list for each witness, containing a list for each sentence
    """
    tagger = Tagger()

    for model, tasks in utils.model_spec(model_spec):
        tagger.add_model(model, *tasks)
        print(" - model: {}".format(model))
        tasks = tasks or tagger.models[-1][0].label_encoder.tasks
        print(" - tasks: {}".format(", ".join(tasks)))

    # Get files content
    files = glob.glob(path + '/*.txt')
    content = {}
    for f in files:
        wit = os.path.splitext(os.path.split(f)[-1])[0]
        content[wit] = []
        tokenId = 1
        for chunk in utils.chunks(lines_from_file(f), 200):
            sents, lengths = zip(*chunk)
            tagged, tasks = tagger.tag(sents, lengths)
            for sent in tagged:
                new_sent = []
                for t in sent:
                    token_dict = {
                        "form": t[0],
                        "id": "w_" + str(tokenId),
                        "order_id": str(tokenId)
                    }
                    # and now add the different annotations from the lemmatiser
                    for i, task in enumerate(tasks):
                        token_dict[task] = t[1][i]
                    new_sent.append(token_dict)
                    tokenId += 1
                content[wit].append(new_sent)
    return content
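A sketch of how the returned structure could be consumed or serialized. The folder name, the model spec string and the task names below are placeholders: each token dict carries "form", "id" and "order_id" plus one key per task actually produced by the model.

# Sketch only: assumes lemmatise() above is in scope; paths, spec and task names are
# placeholders.
import json

annotations = lemmatise("texts", "<model.tar,lemma,pos>")
for witness, sentences in annotations.items():
    for sentence in sentences:
        for token in sentence:
            # "lemma" is only present if the model was loaded with that task
            print(witness, token["id"], token["form"], token.get("lemma"))

with open("annotations.json", "w") as out:
    json.dump(annotations, out, ensure_ascii=False, indent=2)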
def tag_file(self, fpath, sep='\t'):
    _, ext = os.path.splitext(fpath)
    header = False

    with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f:
        for chunk in utils.chunks(lines_from_file(fpath, self.lower), self.batch_size):
            sents, lengths = zip(*chunk)
            tagged, tasks = self.tag(sents, lengths)

            for sent in tagged:
                if not header:
                    f.write(sep.join(['token'] + tasks) + '\n')
                    header = True
                for token, tags in sent:
                    f.write(sep.join([token] + list(tags)) + '\n')
                f.write('\n')
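For reference, a small sketch of reading that tab-separated output back in. The path is a placeholder (the actual name is whatever utils.ensure_ext produces); the format follows the writer above: one header line starting with "token", one token per line, and a blank line between sentences.

# Hypothetical reader for the TSV layout written by tag_file().
def read_tagged(path, sep="\t"):
    sentences, current = [], []
    with open(path) as f:
        header = f.readline().rstrip("\n").split(sep)
        for line in f:
            line = line.rstrip("\n")
            if not line:                    # blank line = sentence boundary
                if current:
                    sentences.append(current)
                    current = []
                continue
            current.append(dict(zip(header, line.split(sep))))
    if current:
        sentences.append(current)
    return sentences

sentences = read_tagged("tagged-output.tsv")  # placeholder path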
def iter_tag_token(
        self, data: str, iterator: DataIterator, processor: ProcessorPrototype,
        no_tokenizer: bool = False, empty_token_on_sent_break: bool = False
) -> Generator[Optional[Dict[str, str]], None, None]:
    """ Reads the string in [DATA] with [ITERATOR] and [PROCESSOR], then returns each token as a dict

    :param data: Textual data
    :param iterator: Iterator used to read data
    :param processor: Processor used to post-process data
    :param no_tokenizer: Disable the tokenizer inside the iterator
    :param empty_token_on_sent_break: Return a None token when moving to a new sequence
    :yield: Token in the form of a dict or, if [empty_token_on_sent_break] is True, a None value
        when changing "sentences"
    """
    # Reset at each document
    processor.reset()
    iterator.tokenizer.reset()

    # Iterate!
    for chunk in utils.chunks(
            iterator(data, lower=self.lower, no_tokenizer=no_tokenizer),
            size=self.batch_size):
        # Unzip the batch into the sentences, their sizes and the dictionaries of tokens
        # that need to be reinserted
        sents, lengths, needs_reinsertion = zip(*chunk)
        is_empty = [not bool(sent) for sent in sents]

        tagged, tasks = self.tag(
            sents=[sent for sent in sents if sent],
            lengths=[l for l in lengths if l != 0]
        )

        if not processor.task_init:
            processor.set_tasks(tasks)

        # We keep a real sentence index
        for sents_index, sent_is_empty in enumerate(is_empty):
            if sent_is_empty:
                sent = []
            else:
                sent = tagged.pop(0)

            # Get the tokens that need to be reinserted for this sentence
            sent_reinsertion = needs_reinsertion[sents_index]

            # If we have a disambiguator, we run the results through it
            if self.disambiguation and sent:
                sent = self.disambiguation(sent, tasks)

            reinsertion_index = 0
            for index, (token, tags) in enumerate(sent):
                # Reinsert tokens whose original position precedes the current index
                while reinsertion_index + index in sent_reinsertion:
                    yield processor.reinsert(sent_reinsertion[reinsertion_index + index])
                    del sent_reinsertion[reinsertion_index + index]
                    reinsertion_index += 1
                yield from processor.get_dict(token, tags)

            # Reinsert whatever is left after the last tagged token
            for reinsertion in sorted(list(sent_reinsertion.keys())):
                yield processor.reinsert(sent_reinsertion[reinsertion])

            if empty_token_on_sent_break:
                yield None
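When empty_token_on_sent_break is True, the stream interleaves None values at sentence boundaries; a generic consumer can use that sentinel to rebuild sentences. The helper below is an illustration, not part of pie-extended.

# Hypothetical consumer grouping a None-separated token stream into sentences.
from typing import Dict, Iterable, Iterator, List, Optional

def group_sentences(tokens: Iterable[Optional[Dict[str, str]]]) -> Iterator[List[Dict[str, str]]]:
    sentence: List[Dict[str, str]] = []
    for token in tokens:
        if token is None:                  # sentinel emitted at sentence breaks
            if sentence:
                yield sentence
                sentence = []
        else:
            sentence.append(token)
    if sentence:
        yield sentence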