Ejemplo n.º 1
0
    def __init__(self,
                 tags_only=True,
                 input='content',
                 encoding='utf-8',
                 charset=None,
                 decode_error='strict',
                 charset_error=None,
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 analyzer='word',
                 stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=np.int64,
                 norm='l2',
                 use_idf=True,
                 smooth_idf=True,
                 sublinear_tf=False):
        """Initialize the vectorizer and its NER tagger backend.

        All parameters except ``tags_only`` mirror sklearn's
        ``TfidfVectorizer`` and are forwarded unchanged to the superclass.
        ``tags_only`` presumably selects whether the analyzer keeps only
        entity tags or full entity output — confirm in build_analyzer.
        """
        # BUG FIX: the original passed a hard-coded binary=False to the
        # superclass, silently ignoring the caller's ``binary`` argument.
        super(NERVectorizer, self).__init__(input=input,
                                            charset=charset,
                                            charset_error=charset_error,
                                            encoding=encoding,
                                            decode_error=decode_error,
                                            strip_accents=strip_accents,
                                            lowercase=lowercase,
                                            preprocessor=preprocessor,
                                            tokenizer=tokenizer,
                                            analyzer=analyzer,
                                            stop_words=stop_words,
                                            token_pattern=token_pattern,
                                            ngram_range=ngram_range,
                                            max_df=max_df,
                                            min_df=min_df,
                                            max_features=max_features,
                                            vocabulary=vocabulary,
                                            binary=binary,
                                            dtype=dtype,
                                            norm=norm,
                                            use_idf=use_idf,
                                            smooth_idf=smooth_idf,
                                            sublinear_tf=sublinear_tf)

        self.tags_only = tags_only
        # Remote Stanford NER server on localhost; collapse=False keeps
        # individual entity results rather than merging adjacent ones.
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost',
                                port=config.NER_PORT,
                                collapse=False)
Ejemplo n.º 2
0
class NERVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose "tokens" are named-entity results.

    Each document is sent to a Stanford NER server (``SocketNER``); the
    entity output replaces the normal word stream before standard TF-IDF
    weighting is applied by the superclass.
    """

    def __init__(self, tags_only=True, input='content', encoding='utf-8', charset=None,
                 decode_error='strict', charset_error=None,
                 strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        """Forward TfidfVectorizer parameters and set up the NER tagger.

        ``tags_only``: when True the analyzer keeps only the first element
        of each entity tuple; otherwise it flattens the full tuples.
        """
        # BUG FIX: the original hard-coded binary=False here, silently
        # ignoring the caller's ``binary`` argument.
        super(NERVectorizer, self).__init__(
            input=input, charset=charset, charset_error=charset_error,
            encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)

        self.tags_only = tags_only
        # Remote Stanford NER server on localhost.
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)

    def build_analyzer(self):
        """Return a callable that maps a document to n-grams of NER output.

        Overrides the TfidfVectorizer analyzer so that the n-grams are built
        over entity results instead of the usual preprocessed word tokens.
        """
        # preprocess = self.build_preprocessor()
        # tokenizer = self.build_tokenizer()
        # tokenize = lambda doc: tokenizer(preprocess(self.decode(doc)))

        # get_tags = lambda doc: [tag for tag in self.tagger.get_entities(doc).iterkeys()]

        if self.tags_only:
            # Keep only the first element of each entity tuple — presumably
            # the entity tag; confirm against SocketNER.get_entities output.
            get_tags = lambda doc: [t[0] for t in self.tagger.get_entities(doc)]
        else:
            # Flatten the full entity tuples into a single token stream.
            get_tags = lambda doc: list(chain.from_iterable(self.tagger.get_entities(doc)))

        # if self.tags_only:
        # get_tags = lambda doc: [t[1] for t in self.tagger.tag(tokenize(doc))]
        # else:
        #     get_tags = lambda doc: list(chain.from_iterable(self.tagger.tag(tokenize(doc))))
        return lambda doc: self._word_ngrams(get_tags(doc))
Ejemplo n.º 3
0
 def preprocess(self, pos=False, ner=False, tok_q=True):
     """Tokenize/lemmatize the question and optionally annotate the docs.

     tok_q: strip punctuation, lowercase, tokenize and lemmatize the
            question into ``self.tok_question``.
     pos:   POS-tag every document into ``self.pos_docs``.
     ner:   run every document through the NER server into ``self.ne_docs``.
     """
     log.debug("preprocessing documents")
     if tok_q:
         stripped = unicode(self.question).translate(
             self.delete_punctuation_map)
         tokens = nltk.word_tokenize(stripped.lower())
         self.tok_question = [self.lem.lemmatize(tok) for tok in tokens]
     if pos:
         self.pos_docs = [
             nltk.pos_tag(nltk.word_tokenize(d)) for d in self.docs
         ]
     if ner:
         self.ner = SocketNER(host='localhost',
                              port=config.NER_PORT,
                              collapse=False)
         self.ne_docs = [self.ner.get_entities(d) for d in self.docs]
Ejemplo n.º 4
0
 def preprocess(self, pos=False, ner=False, tok_q=True):
     """Prepare question tokens and, on request, POS/NER views of the docs."""
     log.debug("preprocessing documents")
     if tok_q:
         question = unicode(self.question).translate(self.delete_punctuation_map).lower()
         self.tok_question = [self.lem.lemmatize(w) for w in nltk.word_tokenize(question)]
     if pos:
         self.pos_docs = []
         for doc in self.docs:
             self.pos_docs.append(nltk.pos_tag(nltk.word_tokenize(doc)))
     if ner:
         tagger = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
         self.ner = tagger
         self.ne_docs = [tagger.get_entities(doc) for doc in self.docs]
Ejemplo n.º 5
0
class BaseExtractor(object):
    """Shared plumbing for answer extractors: tokenization, tagging, ranking."""

    def __init__(self, question, docs):
        self.docs = docs
        self.question = question
        self.lem = nltk.stem.wordnet.WordNetLemmatizer()
        # Translation table mapping every punctuation character to None
        # (deletion) for unicode.translate().
        self.delete_punctuation_map = dict(
            (ord(ch), None) for ch in string.punctuation)

    def preprocess(self, pos=False, ner=False, tok_q=True):
        """Tokenize/lemmatize the question; optionally POS- and NER-tag docs."""
        log.debug("preprocessing documents")
        if tok_q:
            stripped = unicode(self.question).translate(self.delete_punctuation_map)
            tokens = nltk.word_tokenize(stripped.lower())
            self.tok_question = [self.lem.lemmatize(tok) for tok in tokens]
        if pos:
            self.pos_docs = [nltk.pos_tag(nltk.word_tokenize(d)) for d in self.docs]
        if ner:
            self.ner = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
            self.ne_docs = [self.ner.get_entities(d) for d in self.docs]

    def clean(self, s):
        """Normalize a candidate string: strip punctuation, lowercase, lemmatize."""
        stripped = unicode(s).translate(self.delete_punctuation_map)
        return self.lem.lemmatize(stripped.lower())

    def sort_candidates(self, candidates):
        """
        Takes a dict with frequencies {'a':2, 'b':4, 'c':1} and sorts them.
        Returns the list of sorted candidates with percentages.
        """
        if not candidates:
            return None
        # Group surface forms that normalize to the same cleaned key.
        grouped = defaultdict(dict)
        for surface, freq in candidates.items():
            grouped[self.clean(surface)][surface] = freq
        # For each group keep the most frequent surface form, credited with
        # the whole group's count.
        tallies = {}
        for variants in grouped.values():
            best, best_freq, group_total = None, 0, 0
            for variant, freq in variants.items():
                group_total += freq
                if freq > best_freq:
                    best, best_freq = variant, freq
            tallies[best] = group_total
        ranked = sorted(tallies.items(), key=itemgetter(1), reverse=True)
        total = float(sum(freq for _, freq in ranked))
        # Normalize to fractions and keep the ten strongest candidates.
        return [(cand, freq / total) for cand, freq in ranked][:10]

    def answer(self):
        """
        Answer should return a sorted list of answer tuples with their confidence
        """
        return "I don't know how to answer that type of question yet"
Ejemplo n.º 6
0
    def __init__(self, tags_only=True, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        """Initialize the vectorizer and its NER tagger backend.

        All parameters except ``tags_only`` mirror sklearn's
        ``TfidfVectorizer`` and are forwarded unchanged to the superclass.
        ``tags_only`` presumably selects whether the analyzer keeps only
        entity tags or full entity output — confirm in build_analyzer.
        """
        # BUG FIX: the original passed a hard-coded binary=False to the
        # superclass, silently ignoring the caller's ``binary`` argument.
        super(NERVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)

        self.tags_only = tags_only
        # Remote Stanford NER server on localhost.
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost', port=config.NER_PORT, collapse=False)
Ejemplo n.º 7
0
class NERVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose "tokens" are named-entity results.

    Documents are run through a Stanford NER server (``SocketNER``); the
    entity output replaces the normal word stream before standard TF-IDF
    weighting is applied by the superclass.
    """

    def __init__(self,
                 tags_only=True,
                 input='content',
                 encoding='utf-8',
                 charset=None,
                 decode_error='strict',
                 charset_error=None,
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 analyzer='word',
                 stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=np.int64,
                 norm='l2',
                 use_idf=True,
                 smooth_idf=True,
                 sublinear_tf=False):
        """Forward TfidfVectorizer parameters and set up the NER tagger.

        ``tags_only``: when True the analyzer keeps only the first element
        of each entity tuple; otherwise it flattens the full tuples.
        """
        # BUG FIX: the original hard-coded binary=False here, silently
        # ignoring the caller's ``binary`` argument.
        super(NERVectorizer, self).__init__(input=input,
                                            charset=charset,
                                            charset_error=charset_error,
                                            encoding=encoding,
                                            decode_error=decode_error,
                                            strip_accents=strip_accents,
                                            lowercase=lowercase,
                                            preprocessor=preprocessor,
                                            tokenizer=tokenizer,
                                            analyzer=analyzer,
                                            stop_words=stop_words,
                                            token_pattern=token_pattern,
                                            ngram_range=ngram_range,
                                            max_df=max_df,
                                            min_df=min_df,
                                            max_features=max_features,
                                            vocabulary=vocabulary,
                                            binary=binary,
                                            dtype=dtype,
                                            norm=norm,
                                            use_idf=use_idf,
                                            smooth_idf=smooth_idf,
                                            sublinear_tf=sublinear_tf)

        self.tags_only = tags_only
        # Remote Stanford NER server on localhost; collapse=False keeps
        # individual entity results rather than merging adjacent ones.
        # self.tagger = NERTagger(config.NER_MODEL_PATH, config.NER_JAR, encoding=self.encoding)
        self.tagger = SocketNER(host='localhost',
                                port=config.NER_PORT,
                                collapse=False)

    def build_analyzer(self):
        """Return a callable that maps a document to n-grams of NER output.

        Overrides the TfidfVectorizer analyzer so that the n-grams are built
        over entity results instead of the usual preprocessed word tokens.
        """
        # preprocess = self.build_preprocessor()
        # tokenizer = self.build_tokenizer()
        # tokenize = lambda doc: tokenizer(preprocess(self.decode(doc)))

        # get_tags = lambda doc: [tag for tag in self.tagger.get_entities(doc).iterkeys()]

        if self.tags_only:
            # Keep only the first element of each entity tuple — presumably
            # the entity tag; confirm against SocketNER.get_entities output.
            get_tags = lambda doc: [
                t[0] for t in self.tagger.get_entities(doc)
            ]
        else:
            # Flatten the full entity tuples into a single token stream.
            get_tags = lambda doc: list(
                chain.from_iterable(self.tagger.get_entities(doc)))

        # if self.tags_only:
        # get_tags = lambda doc: [t[1] for t in self.tagger.tag(tokenize(doc))]
        # else:
        #     get_tags = lambda doc: list(chain.from_iterable(self.tagger.tag(tokenize(doc))))
        return lambda doc: self._word_ngrams(get_tags(doc))
Ejemplo n.º 8
0
class BaseExtractor(object):
    """Base class for answer extractors over a question and a document set."""

    def __init__(self, question, docs):
        self.question = question
        self.docs = docs
        self.lem = nltk.stem.wordnet.WordNetLemmatizer()
        # unicode.translate() table: every punctuation codepoint -> None
        # (i.e. deleted).
        self.delete_punctuation_map = dict.fromkeys(
            ord(ch) for ch in string.punctuation)

    def preprocess(self, pos=False, ner=False, tok_q=True):
        """Build question tokens and optional POS/NER annotations of the docs."""
        log.debug("preprocessing documents")
        if tok_q:
            question = unicode(self.question).translate(
                self.delete_punctuation_map).lower()
            self.tok_question = [
                self.lem.lemmatize(word)
                for word in nltk.word_tokenize(question)
            ]
        if pos:
            self.pos_docs = []
            for doc in self.docs:
                self.pos_docs.append(nltk.pos_tag(nltk.word_tokenize(doc)))
        if ner:
            tagger = SocketNER(host='localhost', port=config.NER_PORT,
                               collapse=False)
            self.ner = tagger
            self.ne_docs = [tagger.get_entities(doc) for doc in self.docs]

    def clean(self, s):
        """Punctuation-strip, lowercase and lemmatize one candidate string."""
        normalized = unicode(s).translate(self.delete_punctuation_map).lower()
        return self.lem.lemmatize(normalized)

    def sort_candidates(self, candidates):
        """
        Takes a dict with frequencies {'a':2, 'b':4, 'c':1} and sorts them.
        Returns the list of sorted candidates with percentages.
        """
        if not candidates:
            return None
        # Bucket surface forms by their cleaned (normalized) key.
        buckets = defaultdict(dict)
        for surface, freq in candidates.items():
            buckets[self.clean(surface)][surface] = freq
        # Each bucket is represented by its most frequent surface form,
        # credited with the bucket's combined frequency.
        merged = {}
        for forms in buckets.values():
            representative = max(forms.items(), key=itemgetter(1))[0]
            merged[representative] = sum(forms.values())

        ordered = sorted(merged.items(), key=itemgetter(1), reverse=True)
        grand_total = float(sum(n for _, n in ordered))
        # Convert counts to fractions; keep at most the top ten.
        return [(form, n / grand_total) for form, n in ordered][:10]

    def answer(self):
        """
        Answer should return a sorted list of answer tuples with their confidence
        """
        return "I don't know how to answer that type of question yet"