Example no. 1
    def _create_features(self, sentences):
        """Tokenize and add special [CLS] ans [SEP] tokens"""
        examples = []
        for sent in sentences:
            line = bert.tokenization.convert_to_unicode(sent)
            if not line:
                print('WARNING! Blank sentence after converting to unicode!')
                continue  # skip the blank sentence rather than dropping the rest
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                bert.InputExample(unique_id=self._unique_id,
                                  text_a=text_a,
                                  text_b=text_b))
            self._unique_id += 1

        return bert.convert_examples_to_features(examples, self.max_seq_length,
                                                 self.tokenizer)
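A note on the input format handled above: the regex splits a line on a literal " ||| " delimiter, so one line can carry a sentence pair for text_a/text_b. A minimal illustration with hypothetical inputs:

sentences = [
    "The cat sat on the mat.",                          # text_a only
    "The cat sat on the mat. ||| It looked content.",   # text_a ||| text_b
]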
Example no. 2

def batcher(params, batch):
    # Replace empty sentences with a single '.' so tokenization never sees
    # an empty input, then join tokens back into strings.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    batch = [' '.join(sent) for sent in batch]
    examples = []
    unique_id = 0
    for sent in batch:
        sent = sent.strip()
        text_b = None
        text_a = sent
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1

    features = convert_examples_to_features(examples,
                                            params['bert'].seq_length,
                                            tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)

    all_encoder_layers, _ = params['bert'](all_input_ids,
                                           token_type_ids=None,
                                           attention_mask=all_input_mask)

    # Get z_vec: take the output of the layer preceding `layer_no` and run
    # the self-attention sub-module of layer `layer_no` on it below.
    prev_out = all_encoder_layers[params['bert'].layer_no - 1]

    # Build the additive attention mask: 0.0 where tokens are real,
    # -10000.0 where they are padding (assumes a CUDA device is available).
    extended_attention_mask = all_input_mask.cuda().unsqueeze(1).unsqueeze(2)
    extended_attention_mask = extended_attention_mask.to(
        dtype=next(params['bert'].parameters()).dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    embeddings = next(params['bert'].children()).encoder.layer[
        params['bert'].layer_no].attention.self(prev_out,
                                                extended_attention_mask)
    # Mean-pool the attended representations over the sequence dimension.
    embeddings = embeddings.detach().mean(1).cpu().numpy()
    if params['bert'].head_no is not None:
        if params['bert'].head_no == 'random':
            embeddings = embeddings[:, params['bert'].randidx]
        else:
            # Each attention head spans 64 dimensions (768 / 12 for BERT-base).
            embeddings = embeddings[:, 64 * params['bert'].head_no:
                                       64 * (params['bert'].head_no + 1)]

    return embeddings
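This batcher matches the SentEval callback signature, so it can be plugged into the evaluation engine directly. A minimal wiring sketch, assuming the facebookresearch/SentEval package; PATH_TO_DATA, the wrapped bert_model, and the task list are placeholders:

import senteval

def prepare(params, samples):
    return  # nothing to precompute for a frozen encoder

params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
params['bert'] = bert_model  # the wrapped model the batcher above expects
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(['MR', 'CR', 'STS12'])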
Example no. 3

def batcher(params, batch):
    # Replace empty sentences with a single '.' before joining tokens.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    batch = [' '.join(sent) for sent in batch]
    examples = []
    unique_id = 0
    for sent in batch:
        sent = sent.strip()
        text_b = None
        text_a = sent
        examples.append(
            IE(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1

    features = convert_examples_to_features(examples, 128, params['DEbert'].tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(params['DEbert'].device)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long).to(params['DEbert'].device)
    
    embeddings, _ = params['DEbert'](all_input_ids, token_type_ids=None,
                                     attention_mask=all_input_mask)

    # Mean-pool the last encoder layer over the sequence dimension.
    final_embeddings = embeddings[-1].detach().mean(1).cpu().numpy()
    
    return final_embeddings
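One caveat: .mean(1) averages over every position, including [PAD] tokens, so short sentences get diluted by padding. A masked mean is a common alternative; a minimal sketch reusing the tensors above:

mask = all_input_mask.unsqueeze(-1).float()   # (batch, seq_len, 1)
summed = (embeddings[-1] * mask).sum(1)       # zero out padded positions
counts = mask.sum(1).clamp(min=1e-9)          # real tokens per sentence
final_embeddings = (summed / counts).detach().cpu().numpy()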
Example no. 4
    def predict(self, sentences):
        """
        Predic Model.

        Args:
            sentences (str): A list of sentences

        Return:
            A dict of word embedding
        """
        examples = self.get_examples(sentences)

        features = convert_examples_to_features(examples=examples,
                                                seq_length=self.max_seq_length,
                                                tokenizer=self.tokenizer)

        unique_id_to_feature = {}
        for feature in features:
            unique_id_to_feature[feature.unique_id] = feature

        input_fn = input_fn_builder(features=features,
                                    seq_length=self.max_seq_length)

        predictions = []
        for result in self.estimator.predict(input_fn,
                                             yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(self.layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(float(x), 6)
                        for x in layer_output[i:(i + 1)].flat
                    ]
                    all_layers.append(layers)
                # Use a distinct name to avoid shadowing the outer `features` list.
                token_features = collections.OrderedDict()
                token_features["token"] = token
                token_features["layers"] = all_layers
                all_features.append(token_features)
            output_json["features"] = all_features
            predictions.append(output_json)

        # Post processing
        result = []
        for i in range(len(predictions)):
            prediction = [(elem["token"], elem["layers"][0]["values"])
                          for elem in predictions[i]["features"]]
            result.append(prediction)

        return result
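A hypothetical call, assuming an instance named extractor of the class this method belongs to:

embeddings = extractor.predict(["BERT embeddings are contextual."])
token, vector = embeddings[0][0]  # first token of the first sentence, e.g. "[CLS]"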
Example no. 5

def preprocess(text):
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len,
                                           tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
Example no. 6
def preprocess(text):

    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    feature = convert_examples_to_features([example], max_token_len,
                                           tokenizer)[0]
    """4: 从上一步的信息可知,输入的key是input_ids,维度是(1,400),同时外包[]表示batch size 为1 """
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return input_ids
Example no. 7
def get_features(input_text, dim=768, code_num=1):
    tf.logging.set_verbosity(tf.logging.ERROR)
    layer_indexes = LAYERS
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
    _normalizer = None
    tokenizer = tokenization.JapaneseTweetTokenizer(
        vocab_file=VOCAB_FILE,
        model_file=VOCAB_MODEL,
        normalizer=_normalizer,
        do_lower_case=False)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=is_per_host))
    
    examples = read_sequence(input_text)

    features = convert_examples_to_features(
          examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=INIT_CHECKPOINT,
        layer_indexes=layer_indexes,
        use_tpu=False,
        use_one_hot_embeddings=False)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=BATCH_SIZE)

    input_fn = input_fn_builder(
        features=features, seq_length=MAX_SEQ_LENGTH)

    # Get features
    if code_num == 1:
        # Note: `output` is overwritten on every iteration, so only the last
        # example's vector at position 0 (the [CLS] token) is returned.
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            layer_output = result["layer_output_0"]
            layer_output_flat = np.array([x for x in layer_output[0].flat])
            output = layer_output_flat[:dim]
    else:
        output = np.array([
            result["layer_output_0"][0]
            for result in estimator.predict(input_fn, yield_single_examples=True)
        ])[-1][:dim]
    return output
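A hypothetical call: with code_num=1, the function returns the vector at position 0 (the [CLS] token) of the last processed example, truncated to dim values:

vec = get_features("今日はいい天気ですね", dim=768)
print(vec.shape)  # (768,)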
Example no. 8
    def __getitem__(self, task_index):
        # choose the task indicated by index
        pos_examples, neg_examples = self.tasks[task_index]

        # for now just choose randomly among examples
        pos_indices = np.random.choice(range(len(pos_examples)), size=self.K)
        neg_indices = np.random.choice(range(len(neg_examples)), size=self.K)

        # # interleave randomly - DIFFERENT FROM RANDOMLY SHUFFLING
        # examples = np.empty((self.K*2, 2), dtype=pos.dtype)
        # if np.random.uniform() > .5:
        #     examples[0::2, :] = pos
        #     examples[1::2, :] = neg
        # else:
        #     examples[0::2, :] = neg
        #     examples[1::2, :] = pos

        # Randomly shuffle positive and negative examples for now
        all_examples = pd.concat([pos_examples.iloc[pos_indices, :],
                                  neg_examples.iloc[neg_indices, :]]).sample(frac=1)
        train_examples = self.data_processor.get_examples(input_df=all_examples.iloc[:self.K, :], set_type='train')
        test_examples = self.data_processor.get_examples(input_df=all_examples.iloc[self.K:, :], set_type='test')

        train_features = convert_examples_to_features(train_examples, label_list=['0','1'], max_seq_length=128,
                                                      tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                      model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in train_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in train_features], dtype=torch.long)
        task_train_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                           'input_masks':  all_input_masks, 'label_ids': all_label_ids}

        test_features = convert_examples_to_features(test_examples, label_list=['0','1'], max_seq_length=128,
                                                     tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                     model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in test_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in test_features], dtype=torch.long)
        task_test_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                          'input_masks':  all_input_masks, 'label_ids': all_label_ids}

        return task_train_data, task_test_data
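A hypothetical usage: each item is a (support, query) pair of tensor dicts for one task, sized for a K-shot adaptation step in a meta-learning loop (meta_dataset stands for an instance of this class):

task_train_data, task_test_data = meta_dataset[0]
print(task_train_data['input_ids'].shape)  # (K, 128)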
Example no. 9

def preprocess(text):
    """
 function: preprocess text into input numpy array
    """
    vocab_file = os.environ.get("vocab_file", "./dependency/vocab.txt")
    max_token_len = os.environ.get("max_token_len", 128)
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
Example no. 10
    def vectorize(self, text, log_verbosity=tf.logging.INFO):
        ''' Converts text to features '''
        features = convert_examples_to_features(
            examples=self.__convert_text_to_examples(text), 
            seq_length=self.max_seq_length, 
            tokenizer=self.tokenizer
        )

        
        tf.logging.set_verbosity(log_verbosity)

        # Inverse mapping from unique_id to feature.
        unique_id_to_feature = {}
        for feature in features:
            unique_id_to_feature[feature.unique_id] = feature

        # Build the input function that feeds the features to the estimator.
        input_fn = input_fn_builder(features=features, seq_length=self.max_seq_length)

        # Predict embeddings.
        results = self.estimator.predict(input_fn, yield_single_examples=True)

        # Under the assumption that the text contains multiple paragraphs, this
        # list collects one array per paragraph.
        paragraphs = []

        for result in results:
            unique_id = int(result['unique_id'])
            feature = unique_id_to_feature[unique_id]
            
            all_features = []

            # Iterate over the tokens in the paragraph.
            for (i, token) in enumerate(feature.tokens):

                all_layers = []
                
                # Iterate over all requested layer outputs.
                for (j, layer_index) in enumerate(self.layer_indexes):
                    layer_output = result['layer_output_%d' % j]
                    layer_values = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
                    if self.concatenate_layers:
                        all_layers += layer_values
                    else:
                        all_layers.append(layer_values)

                all_features.append(np.asarray(all_layers))
            paragraphs.append(np.asarray(all_features))
        return paragraphs
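A hypothetical call, assuming a vectorizer instance configured with four layer_indexes and a BERT-base checkpoint: with concatenate_layers=True each token's vectors from the selected layers are joined end to end, otherwise each token keeps one vector per layer.

paragraphs = vectorizer.vectorize(text)
print(paragraphs[0].shape)  # (num_tokens, 4 * 768) when concatenating four layers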
Example no. 11
def create_examples(sents, tokenizer):
    inputs = _create_example(sents)
    features = extract_features.convert_examples_to_features(examples=inputs, seq_length=MAXLENGTH, tokenizer=tokenizer)
    return features
Example no. 12
    def make_predictions(self,
                         input_df=None,
                         labelled_data=True,
                         data_processor=None):
        """
        Make predictions using the Language Model

        Parameters
        ----------
        input_df : pandas dataframe, optional
            If provided, this is the dataframe we make predictions from. If not given,
            the system automatically looks in the processed_data folder in datascience-projects
            and gets the test data corresponding to the folder defined by the processor
            object, by default None
        labelled_data : bool, optional
            Indicates whether or not the data is labelled, by default True
        data_processor : DataProcessor object
            A data processor object from data_processing.py that processes the data. For the
            Streetbees Mood data we use this to get the test examples, the task name, the
            labels, etc.

        Returns
        -------
        array or tuple
            If labelled_data: tuple (y_preds, test_accuracy)
            If not labelled_data: array y_preds
        """
        processor = data_processor(data_dir=self.data_dir,
                                   labelled_data=labelled_data)
        task_name = processor.data_name
        label_list = processor.get_labels()
        test_examples = processor.get_examples(input_df=input_df,
                                               set_type="test")
        max_seq_length = 128  # All models trained with this max_seq_length

        # Load in the testing data
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     max_seq_length,
                                                     self.tokenizer, task_name,
                                                     self.baseLM_model_name)
        all_input_ids = torch.tensor(
            [feature.input_ids for feature in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor(
            [feature.segment_ids for feature in test_features],
            dtype=torch.long)
        all_input_masks = torch.tensor(
            [feature.input_mask for feature in test_features],
            dtype=torch.long)
        if labelled_data:
            all_label_ids = torch.tensor(
                [feature.label_id for feature in test_features],
                dtype=torch.long)
            test_data = TensorDataset(all_input_ids, all_segment_ids,
                                      all_input_masks, all_label_ids)
        else:
            test_data = TensorDataset(all_input_ids, all_segment_ids,
                                      all_input_masks)

        test_dataloader = DataLoader(test_data)
        # Calculate the accuracies, losses (if labelled_data) and return the predictions
        y_pred = []
        if labelled_data:
            y_true = []
            test_total_correct = 0

        # Evaluate the model with batchwise accuracy like in training
        for batch in tqdm(test_dataloader, desc="Getting Predictions"):
            if labelled_data:
                input_ids, segment_ids, input_masks, label_ids = tuple(
                    model_input.to(self.device) for model_input in batch)
                with torch.no_grad():
                    _, logits = self.model(input_ids, segment_ids, input_masks,
                                           task_name, label_ids)
            else:
                input_ids, segment_ids, input_masks = tuple(
                    model_input.to(self.device) for model_input in batch)
                with torch.no_grad():
                    logits = self.model(input_ids, segment_ids, input_masks,
                                        task_name)

            logits = logits.detach().cpu().numpy()
            y_pred_batch = np.argmax(logits, axis=1)
            y_pred = np.concatenate([y_pred, y_pred_batch])
            if labelled_data:
                y_true_batch = label_ids.to('cpu').numpy()
                y_true = np.concatenate([y_true, y_true_batch])
                batch_num_correct = np.sum(y_pred_batch == y_true_batch)
                test_total_correct += batch_num_correct

        if labelled_data:
            test_accuracy = test_total_correct / len(test_data)
            LOGGER.info(
                '\n' +
                classification_report(y_true, y_pred, target_names=label_list))
            return y_pred, test_accuracy
        else:
            return y_pred
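A hypothetical call, assuming model_wrapper is an instance of this class and the processor class matches one defined in data_processing.py:

y_pred, test_accuracy = model_wrapper.make_predictions(
    input_df=test_df,
    labelled_data=True,
    data_processor=StreetbeesMoodProcessor)  # hypothetical processor class name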