def __init__(self, tags_only=True, input='content', encoding='utf-8',
             charset=None, decode_error='strict', charset_error=None,
             strip_accents=None, lowercase=True, preprocessor=None,
             tokenizer=None, analyzer='word', stop_words=None,
             token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
             max_df=1.0, min_df=1, max_features=None, vocabulary=None,
             binary=False, dtype=np.int64, norm='l2', use_idf=True,
             smooth_idf=True, sublinear_tf=False):
    """Initialize the NER-aware vectorizer.

    All TF-IDF parameters are forwarded verbatim to the parent
    vectorizer (presumably a TfidfVectorizer subclass -- the base
    class is defined elsewhere; confirm against the class header).

    :param tags_only: if True, only NER tags are kept (consumed by
        the analyzer elsewhere in this class -- not used here beyond
        storage on the instance).
    :param binary: forwarded to the parent vectorizer.  NOTE(review):
        the original code hard-coded ``binary=False`` in the super()
        call, silently ignoring this argument; it is now passed
        through as the signature advertises.

    Side effect: opens a socket-based NER tagger on localhost at
    ``config.NER_PORT``.
    """
    super(NERVectorizer, self).__init__(
        input=input, charset=charset, charset_error=charset_error,
        encoding=encoding, decode_error=decode_error,
        strip_accents=strip_accents, lowercase=lowercase,
        preprocessor=preprocessor, tokenizer=tokenizer,
        analyzer=analyzer, stop_words=stop_words,
        token_pattern=token_pattern, ngram_range=ngram_range,
        max_df=max_df, min_df=min_df, max_features=max_features,
        vocabulary=vocabulary,
        # Bug fix: was hard-coded to False, discarding the caller's value.
        binary=binary,
        dtype=dtype, norm=norm, use_idf=use_idf,
        smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    self.tags_only = tags_only
    # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
    self.tagger = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
def preprocess(self, pos=False, ner=False, tok_q=True):
    """Build tokenised/tagged views of the question and documents.

    Operates purely through side effects on ``self``; returns nothing.

    :param tok_q: when True, set ``self.tok_question`` -- the question
        with punctuation stripped, lower-cased, word-tokenised and
        lemmatised.
    :param pos: when True, set ``self.pos_docs`` -- a POS-tagged token
        list for every document in ``self.docs``.
    :param ner: when True, open a fresh socket NER tagger on
        ``self.ner`` and set ``self.ne_docs`` -- extracted named
        entities for every document.
    """
    log.debug("preprocessing documents")

    if tok_q:
        # Strip punctuation first, then tokenise the lower-cased text.
        cleaned = unicode(self.question).translate(self.delete_punctuation_map)
        tokens = nltk.word_tokenize(cleaned.lower())
        self.tok_question = [self.lem.lemmatize(tok) for tok in tokens]

    if pos:
        self.pos_docs = [nltk.pos_tag(nltk.word_tokenize(d)) for d in self.docs]

    if ner:
        self.ner = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
        self.ne_docs = [self.ner.get_entities(d) for d in self.docs]