Code example #1
    def __init__(self, config=dict()):
        super(SIFFeaturizer, self).__init__()

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = word_tokenize
        self.use_tokenizer = config.get('use_tokenizer', False)

        self.tokenizer = Tokenizer(num_words=self.num_words)
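
A note on the config=dict() (and config={}) defaults used throughout these constructors: a mutable default value is created once at function definition time and shared across all calls, so mutating it inside __init__ would leak state between instances. The snippets here only read from it via config.get(...), so they work as written, but the conventional safer idiom looks like the following sketch (illustrative, not the repository's code):

    def __init__(self, config=None):
        super(SIFFeaturizer, self).__init__()
        config = config or {}  # fresh dict per call when no config is given
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
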
Code example #2
    def __init__(self, config=dict()):
        super(FastTextFeaturizer, self).__init__()

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = wordpunct_tokenize
        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.token_indice = None
        self.indice_token = None
        self.max_features = MAX_NUM_WORDS
        self.max_len = MAX_SEQUENCE_LENGTH
        self.ngrams = 3
Code example #3
File: model.py Project: luungoc2005/nlp-test
    def __init__(self, config={}, *args, **kwargs):
        super(OvrClassifierWrapper, self).__init__(model_class=OvrClassifier,
                                                   config=config,
                                                   *args,
                                                   **kwargs)

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.n_classes = config.get('num_classes', 10)

        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()
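
LabelEncoder here is presumably scikit-learn's sklearn.preprocessing.LabelEncoder; a minimal sketch of how string labels are typically mapped to integer class ids with it (the label strings below are invented for illustration):

    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(['greeting', 'goodbye', 'greeting'])  # -> array([1, 0, 1])
    label_encoder.inverse_transform(y)  # -> back to ['greeting', 'goodbye', 'greeting']
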
Code example #4
File: model.py Project: luungoc2005/nlp-test
    def __init__(self, config={}):
        super(StarspaceClassifierWrapper,
              self).__init__(model_class=StarSpaceClassifier, config=config)
        self.config = config

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.topk = config.get('top_k', 5)

        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.loss_margin = config.get('loss_margin', .8)

        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()
Code example #5
    def transform(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tokens = self.add_ngram(tokens, self.token_indice, self.ngrams)

        max_len = max([len(seq) for seq in tokens])
        if max_len > self.max_len:
            warnings.warn('Max training sequence length is %s, which is higher than max length setting %s' % \
                (max_len, self.max_len), UserWarning)

        tokens = pad_sequences(tokens, maxlen=self.max_len)

        return to_gpu(torch.LongTensor(tokens))
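
add_ngram is not shown in these excerpts. A plausible implementation, modeled on the Keras FastText example (append the index of every known n-gram to each word-id sequence), is sketched below; treat it as an assumption about the repository's helper, not its actual code:

    def add_ngram(sequences, token_indice, ngram_range=2):
        # For each sequence, append the indices of the 2..ngram_range n-grams
        # that were registered in token_indice during fit().
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(input_list) - ngram_value + 1):
                    ngram = tuple(input_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences
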
Code example #6
    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(
                num_words=self.num_words,
                lower=self.lower,
                char_level=self.char_level,
                reserved_tokens=self.reserved_tokens
            )
        if self.char_level: print('Using char-level tokenizer')

        try:
            _ = (it for it in data)
            if len(data) < 1: return  # Must have at least 1 item
        except:
            pass  # data is not an iterable

        self.tokenizer.fit_on_texts(self.tokenize(data))
Code example #7
    def __init__(self, config=dict()):
        super(BasicFeaturizer, self).__init__()

        self.lower = config.get('lower', True)
        self.char_level = config.get('char_level', False)
        self.num_words = config.get('num_words', n_letters + LM_CHAR_RESERVED if self.char_level else LM_VOCAB_SIZE)
        self.append_sos_eos = config.get('append_sos_eos', False)
        self.featurizer_seq_len = config.get('featurizer_seq_len', MAX_SEQUENCE_LENGTH)
        self.reserved_tokens = config.get('featurizer_reserved_tokens', [START_TAG, STOP_TAG, UNK_TAG])
        self.to_tensor = config.get('to_tensor', True) # pad sequences and return tensors
        self.return_mask = config.get('return_mask', False)

        self.tokenize_fn = config.get('tokenize_fn', wordpunct_tokenize)
        
        self.tokenizer = Tokenizer(
            num_words=self.num_words, 
            lower=self.lower, 
            char_level=self.char_level,
            reserved_tokens=self.reserved_tokens
        )
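
A hypothetical construction call, just to show which config keys this featurizer reads (the values are illustrative, not defaults from the project), followed by a fit() call as in code example #6:

    featurizer = BasicFeaturizer({
        'lower': True,
        'char_level': False,
        'append_sos_eos': True,
        'featurizer_seq_len': 128,
        'to_tensor': True,
    })
    featurizer.fit(['first training sentence', 'second training sentence'])
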
Code example #8
File: model.py Project: luungoc2005/nlp-test
    def preprocess_input(self, X):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in X]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        maxlen = max([len(sent) for sent in tokens])
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(tokens):
            for j, token in enumerate(seq):
                if token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]

        # convert token ids back to texts
        # this guarantees that the tf-idf matrix and X have the same length (with OOV words omitted)
        embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()
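
SIF_embedding is not reproduced in these excerpts. With rmpc=0 (no principal-component removal) it presumably reduces to a weighted average of the word vectors of each sentence, using the tf-idf weights built above. A rough sketch of that reduction, assuming embs is padded to shape (n_sentences, max_len, emb_dim):

    import numpy as np

    def weighted_average(embs, weights):
        # Each sentence vector is the weight-normalized sum of its word vectors.
        totals = weights.sum(axis=1, keepdims=True) + 1e-8  # guard against all-zero rows
        return (embs * weights[:, :, None]).sum(axis=1) / totals
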
Code example #9
    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)

        if self.ngrams > 1:
            # print('Adding {}-gram features'.format(self.ngrams))
            # Create the set of unique n-grams from the training set.
            ngram_set = set()
            for input_list in data:
                for i in range(2, self.ngrams + 1):
                    set_of_ngram = self.create_ngram_set(input_list,
                                                         ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Map each n-gram to an index above the word-level vocabulary
            # so that n-gram ids never collide with word ids.
            start_index = self.num_words + 1
            token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
            indice_token = {token_indice[k]: k for k in token_indice}
            self.token_indice = token_indice
            self.indice_token = indice_token

            max_features = np.max(list(indice_token.keys())) + 1
            self.max_features = max_features
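
create_ngram_set is also not shown; the conventional implementation from the Keras FastText example, which this method most likely mirrors, extracts the set of contiguous n-grams of a given length from one sequence:

    def create_ngram_set(input_list, ngram_value=2):
        # e.g. create_ngram_set([1, 4, 9, 4], ngram_value=2) -> {(1, 4), (4, 9), (9, 4)}
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))
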
Code example #10
    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)
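
Putting the FastText-style pieces together, a hypothetical end-to-end call sequence (class and method names taken from code examples #2, #9 and #5; the texts are invented):

    texts = ['this is a test sentence', 'another short example']
    featurizer = FastTextFeaturizer()
    featurizer.fit(texts)                # builds the word index and the n-gram lookup tables
    batch = featurizer.transform(texts)  # padded LongTensor, moved to GPU by to_gpu()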