コード例 #1
0
ファイル: sklearn.py プロジェクト: phillswope/charade
    def run(self, request, response):
        """Classify the request text with a named, pre-loaded model.

        Reads ``text``, ``lang``, the optional ``debug`` flag and the
        mandatory ``classification-model`` entry from ``request``. The
        lower-cased text is turned into a single feature row by
        ``get_features`` (using the patterns registered for the chosen
        model) and handed to the model's ``predict`` and ``predict_proba``.
        ``response`` is not used.

        Returns a dict with ``category`` and ``category_probability`` (the
        largest predicted probability). When ``debug`` is truthy it also
        contains ``distribution``, pairing ``model.classes_`` with the
        predicted probabilities.

        Raises ``MissingLanguage`` when ``lang`` is not in ``self.langs``,
        ``MissingParameter`` when no model name is supplied, and
        ``MissingModel`` when the name is not a key of ``self.models``.
        """
        text = request['text']
        lang = request['lang']
        debug = request.get('debug', False)

        # Validate everything up front, then run the happy path unindented.
        if lang not in self.langs:
            raise MissingLanguage(lang)
        if 'classification-model' not in request:
            raise MissingParameter(self.task, self.name,
                                   'classification-model')

        model_name = request['classification-model']
        if model_name not in self.models:
            raise MissingModel(self.task, self.name, model_name,
                               list(self.models.keys()))
        model = self.models[model_name]
        patterns = self.patterns[model_name]
        extra_patterns = self.extra_patterns[model_name]

        # The model is fed a batch of exactly one row.
        row = get_features(text.lower(), patterns, extra_patterns)
        features = np.array([row])
        category = model.predict(features)[0]
        probs = model.predict_proba(features)[0]

        result = {
            'category': category,
            'category_probability': probs.max(),
        }
        if debug:
            result['distribution'] = dict(zip(model.classes_, probs))
        return result
コード例 #2
0
 def run(self, request, response):
     """Locate date expressions in the request text.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request``; ``response`` is not used. ``search_dates`` is called
     once with ``languages=self.langs`` and every ``(chunk, date)`` pair
     it yields is mapped back to character offsets in ``text``.

     Returns a list of dicts with ``start``, ``end`` and ``date`` (the
     date formatted with the module-level ``_format``); with ``debug``
     each dict also carries the matched ``text``.

     Raises ``MissingLanguage`` when ``lang`` is not in ``self.langs``.
     ``str.index`` raises ``ValueError`` if a returned chunk cannot be
     found in ``text`` after the previous match.
     """
     lang = request['lang']
     if lang not in self.langs:
         raise MissingLanguage(lang)

     text = request['text']
     debug = request.get('debug', False)

     # Run the (potentially expensive) search a single time. The original
     # guarded against a None result, so treat None as "no matches".
     matches = search_dates(text, languages=self.langs) or []

     result = []
     end = 0
     for chunk, date in matches:
         # Search from the end of the previous match so repeated chunks
         # resolve to successive occurrences rather than the first one.
         start = text.index(chunk, end)
         end = start + len(chunk)
         item = {'text': chunk} if debug else {}
         item.update(start=start, end=end, date=date.strftime(_format))
         result.append(item)
     return result
コード例 #3
0
ファイル: sklearn.py プロジェクト: phillswope/charade
    def run(self, request, response):
        """Score the request text against the topics of a named NMF model.

        Reads ``text``, ``lang``, the optional ``debug`` flag and the
        mandatory ``nmf-model`` entry from ``request``; ``response`` is not
        used. The text is vectorized with the vectorizer registered for the
        model and projected with ``model.transform``.

        Returns a dict with ``distribution`` (the flattened scores as a
        list), ``best-topic`` (index of the largest score) and
        ``best-score`` (the largest score). With ``debug`` it also contains
        ``topics``: for each row of ``model.components_`` the ten
        highest-weighted vocabulary terms, strongest first.

        Raises ``MissingLanguage`` when ``lang`` is not in ``self.langs``,
        ``MissingParameter`` when no model name is supplied, and
        ``MissingModel`` when the name is not a key of ``self.models``.
        """
        text = request['text']
        lang = request['lang']
        debug = request.get('debug', False)

        if lang not in self.langs:
            raise MissingLanguage(lang)
        if 'nmf-model' not in request:
            raise MissingParameter(self.task, self.name, 'nmf-model')

        model_name = request['nmf-model']
        if model_name not in self.models:
            raise MissingModel(self.task, self.name, model_name,
                               list(self.models.keys()))
        model = self.models[model_name]
        vectorizer = self.vectorizers[model_name]

        vectors = vectorizer.transform([text])
        probs = model.transform(vectors)
        result = {
            'distribution': probs.flatten().tolist(),
            'best-topic': int(probs.argmax()),
            'best-score': probs.max()
        }
        if debug:
            # Newer scikit-learn releases dropped get_feature_names in
            # favour of get_feature_names_out; support both so the debug
            # path keeps working regardless of the installed version.
            get_names = getattr(vectorizer, 'get_feature_names_out', None)
            if get_names is None:
                get_names = vectorizer.get_feature_names
            vocab = get_names()
            # argsort is ascending, so walk it backwards for the top ten.
            result['topics'] = [[vocab[i] for i in np.argsort(row)[:-11:-1]]
                                for row in model.components_]
        return result
コード例 #4
0
    def run(self, request, response):
        """Extract the top-ranked keywords from the already parsed text.

        Reads ``lang``, ``text`` and the optional ``num-keywords`` (default
        3) from ``request`` and the ``parse`` entry from ``response``.
        Tokens are filtered and lower-cased, a word graph is built from them
        with ``_word_graph`` and ranked with ``networkx.pagerank``.

        Returns a list of ``{'text': word}`` dicts for the highest-ranked
        words, best first.

        Raises ``MissingLanguage`` when no stopword list is registered for
        ``lang``.
        """
        lang = request['lang']
        if lang not in self.stopwords:
            raise MissingLanguage(lang)

        # Extract request data
        text = request['text']
        parsing = response['parse']
        num_keywords = request.get('num-keywords', 3)
        stopwords = self.stopwords[lang]

        def keep(word):
            # Needs a letter or digit, more than two characters, and must
            # not be a stopword.
            informative = any(ch.isalpha() or ch.isdigit() for ch in word)
            return informative and len(word) > 2 and word not in stopwords

        def normalize(word):
            return word.lower()

        # Build the word graph and rank its nodes
        sentences = _parsing_to_tokens(parsing, text, keep, normalize)
        weights = networkx.pagerank(_word_graph(sentences))

        # Keep only the best-scoring words
        best = nlargest(num_keywords, weights.items(),
                        key=lambda pair: pair[1])
        return [{'text': word} for word, _ in best]
コード例 #5
0
 def run(self, request, response):
     """Split the request text into sentences of token offsets.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request``; ``response`` is not used. The callable registered under
     ``self.models[lang]`` is applied to the text and its ``sents`` are
     walked token by token.

     Returns one list per sentence; each token is a dict with ``start``
     (``token.idx``) and ``end`` (``start + len(token)``), plus ``text``
     when ``debug`` is truthy.

     Raises ``MissingLanguage`` when ``lang`` has no registered model.
     """
     lang = request['lang']
     if lang not in self.models:
         raise MissingLanguage(lang)

     text = request['text']
     debug = request.get('debug', False)
     doc = self.models[lang](text)

     def describe(token):
         begin = token.idx
         finish = begin + len(token)
         if debug:
             return {'text': token.text, 'start': begin, 'end': finish}
         return {'start': begin, 'end': finish}

     return [[describe(token) for token in sentence]
             for sentence in doc.sents]
コード例 #6
0
 def run(self, request, response):
     """Split every recognised person entity into name and surname.

     Reads ``lang`` and ``text`` from ``request`` and the ``ner`` entities
     (plus the optional ``fiscal_code`` entries) from ``response``. Only
     entities labelled ``PER`` are processed. The entity string is the
     entity's own ``text`` when present, otherwise the slice of the
     request text between its ``start`` and ``end``.

     Returns a list of dicts with ``start``, ``end``, ``name`` and
     ``surname`` as produced by ``self.split_name``.

     Raises ``MissingLanguage`` for any language other than ``'it'``.
     """
     lang = request['lang']
     if lang != 'it':
         raise MissingLanguage(lang)

     text = request['text']

     # The fiscal-code strings do not depend on the entity, so collect
     # them once instead of once per person.
     fiscal_codes = []
     if 'fiscal_code' in response:
         fiscal_codes = [x['text'] for x in response['fiscal_code']]

     result = []
     for entity in response['ner']:
         if entity['label'] != 'PER':
             continue
         if 'text' in entity:
             s = entity['text']
         else:
             s = text[entity['start']:entity['end']]
         # split_name is not visible here; hand it its own copy so each
         # call still receives an independent list, as before.
         name, surname = self.split_name(s, list(fiscal_codes))
         result.append({
             'start': entity['start'],
             'end': entity['end'],
             'name': name,
             'surname': surname
         })
     return result
コード例 #7
0
 def run(self, request, response):
     """List the entities the language model finds in the request text.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request``; ``response`` is not used. The callable registered under
     ``self.models[lang]`` is applied to the text and its ``ents`` are
     reported.

     Returns a list of dicts with ``start`` (``ent.start_char``), ``end``
     (``ent.end_char``) and ``label`` (``ent.label_``); with ``debug``
     each dict also carries the entity ``text``.

     Raises ``MissingLanguage`` when ``lang`` has no registered model.
     """
     lang = request['lang']
     if lang not in self.models:
         raise MissingLanguage(lang)

     text = request['text']
     debug = request.get('debug', False)
     doc = self.models[lang](text)

     entities = []
     for ent in doc.ents:
         # Debug output only differs by the leading 'text' entry.
         record = {'text': ent.text} if debug else {}
         record['start'] = ent.start_char
         record['end'] = ent.end_char
         record['label'] = ent.label_
         entities.append(record)
     return entities
コード例 #8
0
 def run(self, request, response):
     """Merge per-token ``B-``/``I-`` tags into entity spans.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request`` and the ``parse`` sentences from ``response``. Each
     sentence's annotations are converted with ``_annotation2token`` and
     tagged by ``_conll``, which is expected to yield one ``(token, tag)``
     pair per annotation.

     A ``B-`` tag opens a new entity and a following ``I-`` tag extends
     it. An ``I-`` tag with no open entity in the current sentence opens
     a new entity as well, instead of failing on an empty result or
     stretching an unrelated earlier entity across the gap.

     Returns a list of dicts with ``start``, ``end`` and ``label`` (the
     tag text after the first ``-``); with ``debug`` each dict also
     carries the covered ``text``.

     Raises ``MissingLanguage`` for any language other than ``'en'``.
     """
     lang = request['lang']
     if lang != 'en':
         raise MissingLanguage(lang)

     text = request['text']
     parsing = response['parse']
     debug = request.get('debug', False)

     result = []
     for sentence in parsing:
         tokens = [_annotation2token(t, text) for t in sentence]
         # Entity currently being extended. Reset per sentence so a stray
         # I- tag can never reach back into a previous sentence.
         current = None
         for ann, (_, tag) in zip(sentence, _conll(tokens)):
             if tag.startswith('I-') and current is not None:
                 current['end'] = ann['end']
                 if debug:
                     current['text'] = text[current['start']:current['end']]
             elif tag.startswith(('B-', 'I-')):
                 current = {
                     'start': ann['start'],
                     'end': ann['end'],
                     'label': tag.partition('-')[2]
                 }
                 if debug:
                     # Annotations may omit 'text'; fall back to the
                     # slice of the request text they cover.
                     if 'text' in ann:
                         current['text'] = ann['text']
                     else:
                         current['text'] = text[ann['start']:ann['end']]
                 result.append(current)
             else:
                 # Any other tag closes the open entity.
                 current = None
     return result
コード例 #9
0
    def run(self, request, response):
        """Summarize the parsed request text with the language's model.

        Reads ``lang``, ``text`` and the optional ``debug`` flag from
        ``request`` and the ``parse`` sentences from ``response``. Each
        sentence is rebuilt from its non-blank tokens, wrapped in
        ``<t> ... </t>`` markers, and the space-joined sentences are passed
        to ``self._summarize_text``.

        Returns a dict with ``summary``. With ``debug`` it also contains
        ``summarization_ratio`` (summary length divided by input length,
        ``0.0`` for an empty input) and ``prediction_score``.

        Raises ``MissingLanguage`` when ``lang`` has no registered model.
        """
        lang = request['lang']
        if lang not in self.models:
            raise MissingLanguage(lang)

        model = self.models[lang]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)

        sentences = []
        for sentence in parsing:
            tokens = []
            for item in sentence:
                # Tokens may carry their own text; otherwise slice it out
                # of the request text by offsets.
                if 'text' in item:
                    token = item['text'].strip()
                else:
                    token = text[item['start']:item['end']].strip()
                if token:
                    tokens.append(token)
            sentences.append('<t> ' + ' '.join(tokens) + ' </t>')

        summary_response = self._summarize_text(model, ' '.join(sentences),
                                                debug)
        summary = summary_response['text']
        result = {'summary': summary}

        if debug:
            # Guard the ratio: an empty input would divide by zero.
            if len(text) > 0:
                result['summarization_ratio'] = len(summary) / len(text)
            else:
                result['summarization_ratio'] = 0.0
            result['prediction_score'] = summary_response['score']

        return result
コード例 #10
0
ファイル: allen.py プロジェクト: phillswope/charade
 def run(self, request, response):
     """Run the model on the whole request text and return annotations.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request``; ``response`` is not used. The text is passed to
     ``self.model.predict`` as ``sentence`` and the prediction is
     converted by ``_to_annotations``.

     Raises ``MissingLanguage`` for any language other than ``'en'``.
     """
     lang = request['lang']
     if lang != 'en':
         raise MissingLanguage(lang)

     text = request['text']
     debug = request.get('debug', False)
     return _to_annotations(self.model.predict(sentence=text), text,
                            debug=debug)
コード例 #11
0
 def run(self, request, response):
     """Validate and enrich the fiscal codes found by an earlier step.

     Reads ``lang`` from ``request`` and the ``codes`` entries from
     ``response``. Only entries whose ``type`` is ``'FISCAL_CODE'`` are
     kept. Each is copied (the input dicts are left untouched), given a
     ``correct`` entry from ``self.check_fiscal_code`` and updated with
     whatever ``self.birth_date_and_gender`` returns for the code text.

     Returns the list of enriched copies.

     Raises ``MissingLanguage`` for any language other than ``'it'``.
     """
     lang = request['lang']
     if lang != 'it':
         raise MissingLanguage(lang)

     result = []
     for code in response['codes']:
         if code['type'] != 'FISCAL_CODE':
             continue
         value = code['text']
         enriched = dict(code)
         enriched['correct'] = self.check_fiscal_code(value)
         enriched.update(self.birth_date_and_gender(value))
         result.append(enriched)
     return result
コード例 #12
0
    def run(self, request, response):
        """Translate the parsed request text into the target language.

        The model is selected by the ``lang`` and ``target-lang`` entries
        of ``request`` joined with ``-``. Also reads ``text`` and the
        optional ``debug`` flag from ``request`` and the ``parse``
        sentences from ``response``. Each sentence is rebuilt from its
        non-blank tokens and passed to ``self._translate_text``.

        Returns a dict with ``translation`` (the translated sentences
        joined by two spaces). With ``debug`` it also contains
        ``prediction_score``, one score per sentence.

        Raises ``MissingLanguage`` when no model is registered for the
        language pair.
        """
        lang_pair = request['lang'] + '-' + request['target-lang']
        if lang_pair not in self.models:
            raise MissingLanguage(lang_pair)

        # Translate sentence by sentence: given the whole text at once the
        # model seems to translate and summarize. The whole-text path is
        # kept below for experimentation.
        sentencewise = True

        model = self.models[lang_pair]
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)

        sentences = []
        for sentence in parsing:
            tokens = []
            for item in sentence:
                # Tokens may carry their own text; otherwise slice it out
                # of the request text by offsets.
                if 'text' in item:
                    token = item['text'].strip()
                else:
                    token = text[item['start']:item['end']].strip()
                if token:
                    tokens.append(token)
            sentences.append(' '.join(tokens))

        if sentencewise:
            outputs = [self._translate_text(model, sentence, debug)
                       for sentence in sentences]
            item = {'translation': '  '.join(out['text'] for out in outputs)}
            if debug:
                item['prediction_score'] = [out['score'] for out in outputs]
        else:
            output = self._translate_text(model, ' '.join(sentences), debug)
            item = {'translation': output['text']}
            # Scores are debug-only, matching the sentence-wise path.
            if debug:
                item['prediction_score'] = output['score']

        return item
コード例 #13
0
ファイル: allen.py プロジェクト: phillswope/charade
 def run(self, request, response):
     """Run the language's model over every parsed sentence.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request`` and the ``parse`` sentences from ``response``.

     Returns a list holding, for each sentence in order, whatever
     ``self._predict_sentence(model, sentence, text, debug)`` returns.

     Raises ``MissingLanguage`` when ``lang`` has no registered model.
     """
     lang = request['lang']
     if lang not in self.models:
         raise MissingLanguage(lang)

     model = self.models[lang]
     text = request['text']
     parsing = response['parse']
     debug = request.get('debug', False)
     # No running offset is tracked: nothing ever read it, and computing
     # it from sentence[-1] failed on an empty sentence.
     return [self._predict_sentence(model, sentence, text, debug)
             for sentence in parsing]
コード例 #14
0
ファイル: allen.py プロジェクト: phillswope/charade
 def run(self, request, response):
     """Predict each parsed sentence and collect the annotations.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request`` and the ``parse`` sentences from ``response``. Each
     sentence is predicted with the reader and predictor registered for
     the language, and the prediction is converted by ``_to_annotations``.

     Returns one flat list with the annotations of all sentences in
     order.

     Raises ``MissingLanguage`` when ``lang`` has no registered
     predictor. An empty sentence raises ``IndexError`` when its last
     token is looked up for the next offset.
     """
     lang = request['lang']
     if lang not in self.predictors:
         raise MissingLanguage(lang)

     reader = self.readers[lang]
     predictor = self.predictors[lang]
     text = request['text']
     parsing = response['parse']
     debug = request.get('debug', False)

     result = []
     # NOTE(review): the offset handed to _to_annotations is the end of
     # the *previous* sentence (0 for the first), not the start of the
     # current one; confirm that is what _to_annotations expects.
     offset = 0
     for sentence in parsing:
         prediction = self._predict_sentence(reader, predictor,
                                             sentence, text)
         # Extend in place; rebuilding the list with + on every sentence
         # re-copies everything collected so far.
         result.extend(_to_annotations(
             prediction, text, offset=offset, debug=debug))
         offset = sentence[-1]['end']
     return result
コード例 #15
0
 def run(self, request, response):
     """Tokenize the request text into sentences of token offsets.

     Reads ``lang``, ``text`` and the optional ``debug`` flag from
     ``request``; ``response`` is not used. Sentence spans come from
     ``self.punktSentenceTokenizer.span_tokenize`` and, within each
     sentence, token spans from
     ``self.treebankWordTokenizer.span_tokenize``.

     Returns one list per sentence; each token is a dict with ``start``
     and ``end`` offsets relative to the whole text, plus ``text`` when
     ``debug`` is truthy.

     Raises ``MissingLanguage`` for any language other than ``'en'``.
     """
     lang = request['lang']
     if lang != 'en':
         raise MissingLanguage(lang)

     text = request['text']
     debug = request.get('debug', False)

     def token_record(sentence, shift, lo, hi):
         # Token spans are sentence-relative; shift them to text offsets.
         record = {'start': lo + shift, 'end': hi + shift}
         if debug:
             record['text'] = sentence[lo:hi]
         return record

     result = []
     for sent_start, sent_end in self.punktSentenceTokenizer.span_tokenize(
             text):
         sentence = text[sent_start:sent_end]
         word_spans = self.treebankWordTokenizer.span_tokenize(sentence)
         result.append([token_record(sentence, sent_start, lo, hi)
                        for lo, hi in word_spans])
     return result
コード例 #16
0
    def run(self, request, response):
        """Pick the top-ranked sentences of the already parsed text.

        Reads ``lang``, ``text``, the optional ``debug`` flag and the
        optional ``num-extractive-sentences`` (default 3) from ``request``
        and the ``parse`` sentences from ``response``. Tokens are filtered
        and lower-cased, a graph is built from them with
        ``_sentence_graph`` and ranked with ``networkx.pagerank``; the
        ranked keys are used as indexes into the parsed sentences.

        Returns a list of dicts, best sentence first, with ``start`` (start
        of the sentence's first token) and ``end`` (end of its last token),
        plus the covered ``text`` when ``debug`` is truthy.

        Raises ``MissingLanguage`` when no stopword list is registered for
        ``lang``.
        """
        lang = request['lang']
        if lang not in self.stopwords:
            raise MissingLanguage(lang)

        # Extract request data
        text = request['text']
        parsing = response['parse']
        debug = request.get('debug', False)
        num_sentences = request.get('num-extractive-sentences', 3)
        stopwords = self.stopwords[lang]

        def keep(word):
            # Needs a letter or digit, more than two characters, and must
            # not be a stopword.
            informative = any(ch.isalpha() or ch.isdigit() for ch in word)
            return informative and len(word) > 2 and word not in stopwords

        def normalize(word):
            return word.lower()

        # Build the sentence graph and rank its nodes
        sentences = _parsing_to_tokens(parsing, text, keep, normalize)
        weights = networkx.pagerank(_sentence_graph(sentences))
        ranked = nlargest(num_sentences, weights.items(),
                          key=lambda pair: pair[1])

        # Map each selected sentence back to its character span
        result = []
        for index, _ in ranked:
            chosen = parsing[index]
            span = {'start': chosen[0]['start'], 'end': chosen[-1]['end']}
            if debug:
                span['text'] = text[span['start']:span['end']]
            result.append(span)
        return result