def run(self, request, response):
    text = request['text']
    lang = request['lang']
    debug = request.get('debug', False)
    if lang not in self.langs:
        raise MissingLanguage(lang)
    if 'classification-model' in request:
        model_name = request['classification-model']
    else:
        raise MissingParameter(self.task, self.name, 'classification-model')
    if model_name in self.models:
        model = self.models[model_name]
        patterns = self.patterns[model_name]
        extra_patterns = self.extra_patterns[model_name]
    else:
        raise MissingModel(self.task, self.name, model_name,
                           list(self.models.keys()))
    # Build a single-row feature matrix and classify it.
    features = np.array([get_features(text.lower(), patterns, extra_patterns)])
    category = model.predict(features)[0]
    probs = model.predict_proba(features)[0]
    category_prob = probs.max()
    result = {'category': category, 'category_probability': category_prob}
    if debug:
        # Expose the full class/probability distribution in debug mode.
        result['distribution'] = dict(zip(model.classes_, probs))
    return result
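# A hypothetical request/response round-trip for the classifier above.
# The request keys ('text', 'lang', 'classification-model', 'debug') come
# from the method itself; the component instance and model name are
# placeholders, not from the source:
#
#   request = {'text': 'Quarterly revenue rose sharply.',
#              'lang': 'en',
#              'classification-model': 'news-topics',  # must be preloaded
#              'debug': True}
#   result = component.run(request, response={})
#   # -> {'category': ..., 'category_probability': ...,
#   #     'distribution': {class_label: probability, ...}}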
def run(self, request, response):
    lang = request['lang']
    if lang in self.langs:
        text = request['text']
        debug = request.get('debug', False)
        end = 0
        result = []
        # Call search_dates once and reuse the matches; calling it both in
        # the None check and in the loop doubles the parsing work.
        matches = search_dates(text, languages=self.langs)
        if matches is not None:
            for chunk, date in matches:
                # Locate each chunk after the previous match so repeated
                # substrings resolve to the correct offsets.
                start = text.index(chunk, end)
                end = start + len(chunk)
                item = {'start': start, 'end': end,
                        'date': date.strftime(_format)}
                if debug:
                    item['text'] = chunk
                result.append(item)
        return result
    else:
        raise MissingLanguage(lang)
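# A standalone sketch of the dateparser API the component above relies on.
# search_dates returns None when nothing matches, otherwise a list of
# (matched substring, datetime) pairs -- which is why the code checks for
# None and then recovers character offsets with text.index().
from dateparser.search import search_dates

matches = search_dates('The meeting moved from 12 March 2020 to next Friday.',
                       languages=['en'])
if matches is not None:
    for chunk, date in matches:
        print(chunk, '->', date.isoformat())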
def run(self, request, response):
    text = request['text']
    lang = request['lang']
    debug = request.get('debug', False)
    if lang not in self.langs:
        raise MissingLanguage(lang)
    if 'nmf-model' in request:
        model_name = request['nmf-model']
    else:
        raise MissingParameter(self.task, self.name, 'nmf-model')
    if model_name in self.models:
        model = self.models[model_name]
        vectorizer = self.vectorizers[model_name]
    else:
        raise MissingModel(self.task, self.name, model_name,
                           list(self.models.keys()))
    # Project the document into topic space; each column of `probs` is the
    # document's weight for one topic.
    vectors = vectorizer.transform([text])
    probs = model.transform(vectors)
    category = int(probs.argmax())
    category_prob = probs.max()
    result = {
        'distribution': probs.flatten().tolist(),
        'best-topic': category,
        'best-score': category_prob
    }
    if debug:
        # Report the ten highest-weighted vocabulary terms per topic.
        # Note: get_feature_names() was renamed get_feature_names_out()
        # in newer scikit-learn releases.
        H1 = model.components_
        vocab = vectorizer.get_feature_names()
        result['topics'] = [[vocab[i] for i in np.argsort(x)[:-11:-1]]
                            for x in H1]
    return result
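# A self-contained sketch of the pipeline assumed above: a fitted
# vectorizer plus a fitted scikit-learn NMF model. transform() returns one
# row per document with one non-negative weight per topic, matching the
# 'distribution' field the component builds.
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the cat sat on the mat', 'dogs and cats play',
        'stocks fell on monday', 'the market rallied today']
vectorizer = TfidfVectorizer()
model = NMF(n_components=2, init='nndsvda', random_state=0)
model.fit(vectorizer.fit_transform(docs))

probs = model.transform(vectorizer.transform(['cats on the market']))
print(int(probs.argmax()), probs.flatten().tolist())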
def run(self, request, response):
    lang = request['lang']
    if lang in self.stopwords:
        # Extract request data
        text = request['text']
        parsing = response['parse']
        num_keywords = request.get('num-keywords', 3)

        # Cleaning and normalization: keep alphanumeric tokens longer than
        # two characters that are not stopwords.
        stopwords = self.stopwords[lang]
        keep = lambda x: (any(char.isalpha() or char.isdigit() for char in x)
                          and len(x) > 2 and x not in stopwords)
        normalize = lambda x: x.lower()

        # Create the word graph and compute PageRank.
        sentences = _parsing_to_tokens(parsing, text, keep, normalize)
        graph = _word_graph(sentences)
        weights = networkx.pagerank(graph)

        # Assemble result: the num_keywords highest-ranked words.
        topk = nlargest(num_keywords, weights.items(), key=lambda x: x[1])
        result = [{'text': t} for t, _ in topk]
        return result
    else:
        raise MissingLanguage(lang)
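# _word_graph is not shown here; a plausible TextRank-style construction
# (an assumption, not necessarily the author's implementation) links words
# that co-occur within a small sliding window, so PageRank rewards words
# with many well-connected neighbours.
import networkx


def word_graph_sketch(sentences, window=3):
    # sentences: list of lists of normalized, filtered tokens.
    graph = networkx.Graph()
    for tokens in sentences:
        for i, token in enumerate(tokens):
            for other in tokens[i + 1:i + window]:
                graph.add_edge(token, other)
    return graph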
def run(self, request, response):
    lang = request['lang']
    if lang in self.models:
        text = request['text']
        debug = request.get('debug', False)
        model = self.models[lang]
        doc = model(text)
        result = []
        # One token list per sentence; spaCy's token.idx is the character
        # offset of the token in the original text.
        for sentence in doc.sents:
            tokens = []
            for token in sentence:
                start = token.idx
                end = start + len(token)
                item = {'start': start, 'end': end}
                if debug:
                    item['text'] = token.text
                tokens.append(item)
            result.append(tokens)
        return result
    else:
        raise MissingLanguage(lang)
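# A quick standalone check of the spaCy offsets used above: token.idx is
# the token's character start in the original text, so start + len(token)
# is its end offset. The model name is an example; any installed spaCy
# pipeline works.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Dr. Smith arrived. He left early.')
for sentence in doc.sents:
    for token in sentence:
        print(token.idx, token.idx + len(token), repr(token.text))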
def run(self, request, response):
    lang = request['lang']
    if lang == 'it':
        text = request['text']
        result = []
        # Fiscal codes found upstream help disambiguate name vs. surname.
        fiscal_codes = []
        if 'fiscal_code' in response:
            fiscal_codes = [x['text'] for x in response['fiscal_code']]
        for entity in response['ner']:
            if entity['label'] == 'PER':
                if 'text' in entity:
                    s = entity['text']
                else:
                    s = text[entity['start']:entity['end']]
                name, surname = self.split_name(s, fiscal_codes)
                result.append({
                    'start': entity['start'],
                    'end': entity['end'],
                    'name': name,
                    'surname': surname
                })
        return result
    else:
        raise MissingLanguage(lang)
def run(self, request, response):
    lang = request['lang']
    if lang in self.models:
        text = request['text']
        debug = request.get('debug', False)
        model = self.models[lang]
        doc = model(text)
        result = []
        # spaCy entity spans carry character offsets and a label.
        for ent in doc.ents:
            item = {'start': ent.start_char, 'end': ent.end_char,
                    'label': ent.label_}
            if debug:
                item['text'] = ent.text
            result.append(item)
        return result
    else:
        raise MissingLanguage(lang)
def run(self, request, response):
    if request['lang'] == 'en':
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        result = []
        for sentence in parsing:
            tokens = [_annotation2token(t, text) for t in sentence]
            tags = _conll(tokens)
            # Merge BIO tags into spans: B- opens a new chunk, I- extends
            # the most recent one.
            for ann, (token, tag) in zip(sentence, tags):
                if tag.startswith('B-'):
                    item = {
                        'start': ann['start'],
                        'end': ann['end'],
                        'label': tag.partition('-')[2]
                    }
                    if debug:
                        item['text'] = ann['text']
                    result.append(item)
                elif tag.startswith('I-'):
                    item = result[-1]
                    item['end'] = ann['end']
                    if debug:
                        item['text'] = text[item['start']:item['end']]
        return result
    else:
        raise MissingLanguage(request['lang'])
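# A tiny standalone illustration of the B-/I- merging logic above: tags
# aligned with token annotations collapse into labelled character spans.
anns = [{'start': 0, 'end': 3, 'text': 'New'},
        {'start': 4, 'end': 8, 'text': 'York'},
        {'start': 9, 'end': 13, 'text': 'City'}]
tags = ['B-LOC', 'I-LOC', 'I-LOC']

spans = []
for ann, tag in zip(anns, tags):
    if tag.startswith('B-'):
        spans.append({'start': ann['start'], 'end': ann['end'],
                      'label': tag.partition('-')[2]})
    elif tag.startswith('I-'):
        spans[-1]['end'] = ann['end']
print(spans)  # [{'start': 0, 'end': 13, 'label': 'LOC'}]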
def run(self, request, response):
    lang = request['lang']
    if lang in self.models:
        model = self.models[lang]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        # Rebuild the tokenized text, wrapping each sentence in the
        # <t> ... </t> markers the summarization model expects.
        sentences = []
        for sentence in parsing:
            tokens = []
            for item in sentence:
                if 'text' in item:
                    token = item['text'].strip()
                else:
                    token = text[item['start']:item['end']].strip()
                if token != '':
                    tokens.append(token)
            sentences.append('<t> ' + ' '.join(tokens) + ' </t>')
        summary_response = self._summarize_text(model, ' '.join(sentences),
                                                debug)
        item = {'summary': summary_response['text']}
        if debug:
            item['summarization_ratio'] = (len(summary_response['text'])
                                           / len(text))
            item['prediction_score'] = summary_response['score']
        return item
    else:
        raise MissingLanguage(lang)
def run(self, request, response):
    if request['lang'] == 'en':
        text = request['text']
        debug = request.get('debug', False)
        prediction = self.model.predict(sentence=text)
        return _to_annotations(prediction, text, debug=debug)
    else:
        raise MissingLanguage(request['lang'])
def run(self, request, response):
    lang = request['lang']
    if lang == 'it':
        result = []
        # Validate each candidate fiscal code and enrich it with the birth
        # date and gender encoded in the code itself.
        for code in response['codes']:
            if code['type'] == 'FISCAL_CODE':
                code_ = dict(code)
                code_['correct'] = self.check_fiscal_code(code['text'])
                code_.update(self.birth_date_and_gender(code['text']))
                result.append(code_)
        return result
    else:
        raise MissingLanguage(lang)
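# birth_date_and_gender is not shown; this sketch decodes the fields the
# Italian fiscal code is documented to carry (an assumption about the
# author's implementation, not a copy of it): characters 7-8 hold the
# birth year, character 9 encodes the month, characters 10-11 the day,
# with 40 added to the day for women.
_MONTHS = 'ABCDEHLMPRST'  # A=January ... T=December


def birth_date_and_gender_sketch(code):
    year = int(code[6:8])
    month = _MONTHS.index(code[8].upper()) + 1
    day = int(code[9:11])
    gender = 'F' if day > 40 else 'M'
    if day > 40:
        day -= 40
    return {'birth_date': '%02d-%02d-%02d' % (year, month, day),
            'gender': gender}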
def run(self, request, response):
    lang_pair = request['lang'] + '-' + request['target-lang']
    if lang_pair in self.models:
        model = self.models[lang_pair]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        # Translate sentence by sentence: feeding the whole document to the
        # model in one call tends to translate and summarize it instead.
        sentencewise = True
        trl_sentences = []
        scores = []
        sentences = []
        for sentence in parsing:
            tokens = []
            for item in sentence:
                if 'text' in item:
                    token = item['text'].strip()
                else:
                    token = text[item['start']:item['end']].strip()
                if token != '':
                    tokens.append(token)
            if sentencewise:
                trl_output = self._translate_text(model, ' '.join(tokens),
                                                  debug)
                trl_sentences.append(trl_output['text'])
                if debug:
                    scores.append(trl_output['score'])
            else:
                sentences.append(' '.join(tokens))
        item = {}
        if sentencewise:
            item['translation'] = ' '.join(trl_sentences)
            if debug:
                item['prediction_score'] = scores
        else:
            trl_output = self._translate_text(model, ' '.join(sentences),
                                              debug)
            item['translation'] = trl_output['text']
            item['prediction_score'] = trl_output['score']
        return item
    else:
        raise MissingLanguage(lang_pair)
def run(self, request, response):
    lang = request['lang']
    if lang in self.models:
        model = self.models[lang]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        result = []
        for sentence in parsing:
            prediction = self._predict_sentence(model, sentence, text, debug)
            result.append(prediction)
        return result
    else:
        raise MissingLanguage(lang)
def run(self, request, response):
    lang = request['lang']
    if lang in self.predictors:
        reader = self.readers[lang]
        predictor = self.predictors[lang]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        result = []
        # Track the running character offset so annotations from later
        # sentences point at the right place in the full text.
        offset = 0
        for sentence in parsing:
            prediction = self._predict_sentence(reader, predictor, sentence,
                                                text)
            result.extend(_to_annotations(prediction, text, offset=offset,
                                          debug=debug))
            offset = sentence[-1]['end']
        return result
    else:
        raise MissingLanguage(lang)
def run(self, request, response):
    if request['lang'] == 'en':
        text = request['text']
        debug = request.get('debug', False)
        result = []
        # Split into sentences first, then tokenize each sentence; token
        # offsets are shifted back into document coordinates.
        for sent_s, sent_e in self.punktSentenceTokenizer.span_tokenize(text):
            tokens = []
            sentence = text[sent_s:sent_e]
            for token_s, token_e in self.treebankWordTokenizer.span_tokenize(
                    sentence):
                item = {'start': token_s + sent_s, 'end': token_e + sent_s}
                if debug:
                    item['text'] = sentence[token_s:token_e]
                tokens.append(item)
            result.append(tokens)
        return result
    else:
        raise MissingLanguage(request['lang'])
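# A standalone sketch of the two NLTK tokenizers assumed above; both
# expose span_tokenize, which yields (start, end) offsets rather than
# strings, so nothing has to be re-located in the text afterwards.
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

text = 'Offsets matter. They survive round-trips.'
sent_tok = PunktSentenceTokenizer()
word_tok = TreebankWordTokenizer()
for sent_s, sent_e in sent_tok.span_tokenize(text):
    sentence = text[sent_s:sent_e]
    for token_s, token_e in word_tok.span_tokenize(sentence):
        print(token_s + sent_s, token_e + sent_s,
              repr(sentence[token_s:token_e]))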
def run(self, request, response):
    lang = request['lang']
    if lang in self.stopwords:
        # Extract request data
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        num_sentences = request.get('num-extractive-sentences', 3)

        # Cleaning and normalization: keep alphanumeric tokens longer than
        # two characters that are not stopwords.
        stopwords = self.stopwords[lang]
        keep = lambda x: (any(char.isalpha() or char.isdigit() for char in x)
                          and len(x) > 2 and x not in stopwords)
        normalize = lambda x: x.lower()

        # Create the sentence graph and compute PageRank.
        sentences = _parsing_to_tokens(parsing, text, keep, normalize)
        graph = _sentence_graph(sentences)
        weights = networkx.pagerank(graph)

        # Assemble result: the num_sentences highest-ranked sentences,
        # reported as character spans over the original text.
        topk = nlargest(num_sentences, weights.items(), key=lambda x: x[1])
        result = []
        for i, _ in topk:
            sentence = parsing[i]
            start = sentence[0]['start']
            end = sentence[-1]['end']
            item = {'start': start, 'end': end}
            if debug:
                item['text'] = text[start:end]
            result.append(item)
        return result
    else:
        raise MissingLanguage(lang)
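# _sentence_graph is not shown; a plausible TextRank-style construction
# (an assumption, not the author's verbatim code) indexes nodes by
# sentence position -- matching the parsing[i] lookup above -- and weights
# edges by normalized token overlap, so PageRank favours sentences that
# share vocabulary with many others.
import math

import networkx


def sentence_graph_sketch(sentences):
    # sentences: list of lists of normalized, filtered tokens.
    graph = networkx.Graph()
    graph.add_nodes_from(range(len(sentences)))
    for i, a in enumerate(sentences):
        for j in range(i + 1, len(sentences)):
            b = sentences[j]
            # Skip single-token sentences to keep log() positive.
            if len(a) > 1 and len(b) > 1:
                overlap = len(set(a) & set(b))
                if overlap:
                    weight = overlap / (math.log(len(a)) + math.log(len(b)))
                    graph.add_edge(i, j, weight=weight)
    return graph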