Exemple #1
0
    def __init__(self,
                 tags_only=True,
                 input='content',
                 encoding='utf-8',
                 charset=None,
                 decode_error='strict',
                 charset_error=None,
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 analyzer='word',
                 stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=np.int64,
                 norm='l2',
                 use_idf=True,
                 smooth_idf=True,
                 sublinear_tf=False):
        """Configure the parent vectorizer and create the NER client.

        Every keyword argument except ``tags_only`` is forwarded unchanged
        to the parent class constructor (the parameter set matches a
        TF-IDF style vectorizer -- NOTE(review): confirm the base class).

        ``tags_only`` is stored on the instance as-is; how it is used is
        decided by code outside this method.

        The instance also gets a ``tagger`` attribute: a ``SocketNER``
        client targeting ``localhost`` on ``config.NER_PORT`` with
        ``collapse=False``.
        """
        super(NERVectorizer, self).__init__(
            input=input,
            charset=charset,
            charset_error=charset_error,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            analyzer=analyzer,
            stop_words=stop_words,
            token_pattern=token_pattern,
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            # Forward the caller's value; a hard-coded False here would
            # silently discard an explicit binary=True.
            binary=binary,
            dtype=dtype,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf,
        )

        self.tags_only = tags_only
        # Alternative tagger kept for reference:
        # NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(
            host='localhost',
            port=config.NER_PORT,
            collapse=False,
        )
Exemple #2
0
 def preprocess(self, pos=False, ner=False, tok_q=True):
     """Run the requested preprocessing stages on the question and docs.

     Each flag enables one independent stage:

     tok_q -- translate ``self.question`` through
              ``self.delete_punctuation_map``, lowercase it, tokenize it
              with ``nltk.word_tokenize`` and lemmatize every token with
              ``self.lem``; the result is stored in ``self.tok_question``.
     pos   -- tokenize and POS-tag every entry of ``self.docs``; the
              tagged documents are stored in ``self.pos_docs``.
     ner   -- create a ``SocketNER`` client (kept as ``self.ner``) and
              store its ``get_entities`` output for every entry of
              ``self.docs`` in ``self.ne_docs``.
     """
     log.debug("preprocessing documents")

     if tok_q:
         stripped = unicode(self.question).translate(
             self.delete_punctuation_map)
         tokens = nltk.word_tokenize(stripped.lower())
         lemmas = []
         for token in tokens:
             lemmas.append(self.lem.lemmatize(token))
         self.tok_question = lemmas

     if pos:
         tagged_docs = []
         for document in self.docs:
             words = nltk.word_tokenize(document)
             tagged_docs.append(nltk.pos_tag(words))
         self.pos_docs = tagged_docs

     if ner:
         self.ner = SocketNER(
             host='localhost',
             port=config.NER_PORT,
             collapse=False,
         )
         entities_per_doc = []
         for document in self.docs:
             entities_per_doc.append(self.ner.get_entities(document))
         self.ne_docs = entities_per_doc