Example #1
def evaluate():
    # Score the 90/10 train/validation splits and yield an ROC AUC for each.
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    df = pd.read_csv('corpus/SAD.csv', header=None)
    df = df.dropna()
    # df = df.head()
    idx = list(df.index)
    random.seed(SEED)
    random.shuffle(idx)
    df = df.loc[idx]  # .ix is removed in modern pandas; .loc works on the shuffled index labels
    cut_by = int(0.9 * df.shape[0])
    train_df = df.iloc[:cut_by]
    val_df = df.iloc[cut_by:]
    for df in [train_df, val_df]:
        sentences = df[3]  # column 3: sentence text
        answers = df[1]    # column 1: 0/1 label
        scores = []
        for i, sentence in enumerate(sentences):
            if i % 1000 == 0:
                print(i)
            token_ids = dataset.convert_to_token(sentence, vocab_map)
            encoder_input, encoder_length, _ = Model.get_batch(
                [(0, token_ids)], shuffle=False)
            score = Model.step(sess, encoder_input, encoder_length)
            #print(i,score)
            scores.append(score)
        scores = [s[0][0] for s in scores]  # unwrap each [1, 1] score array to a scalar
        auc = roc_auc_score(answers, scores)
        yield auc
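
The generator above yields two values (train AUC, then validation AUC), so it can be unpacked directly: train_auc, val_auc = evaluate(). For reference, a tiny standalone sketch of the roc_auc_score call it relies on, assuming the usual scikit-learn import and made-up labels and scores:

from sklearn.metrics import roc_auc_score

labels = [0, 1, 1, 0, 1]            # ground-truth 0/1 answers (column 1 of SAD.csv above)
scores = [0.2, 0.8, 0.6, 0.4, 0.9]  # sigmoid scores in (0, 1) from the model
print(roc_auc_score(labels, scores))  # 1.0 for this perfectly ranked toy example
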
def evaluate(cut_mode):
    # Interactive variant: read sentences from stdin, cut them, and print a score for each.
    if cut_mode == "word":
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    sys.stdout.write('>')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    sentence = sentence_cutter(sentence, cut_mode)

    while sentence:
        print('sentence: ', sentence)
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        print('token_ids: ', token_ids)
        encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
        print('encoder_input: ', encoder_input, encoder_input.shape)
        print('encoder_length: ', encoder_length)
        score = Model.step(sess, encoder_input, encoder_length)
        print('Score: ', score[0][0])
        print('>', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sentence = sentence_cutter(sentence, cut_mode)
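
sentence_cutter is defined elsewhere in the repository and not shown in these examples. A plausible minimal sketch, assuming that "word" mode segments with jieba (matching the jieba_fast import above) and that any other mode falls back to character-level splitting:

import jieba  # jieba_fast exposes the same cut() API; plain jieba used here as a stand-in

def sentence_cutter(sentence, cut_mode):
    # Hypothetical re-implementation: return a whitespace-joined token string.
    sentence = sentence.strip()
    if cut_mode == "word":
        return " ".join(jieba.cut(sentence))  # word-level segmentation
    return " ".join(list(sentence))           # character-level fallback
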
Example #3
    def build_model(self):
        cell = tf.contrib.rnn.GRUCell(self.unit_size)
        params = tf.get_variable('embedding',
                                 [self.vocab_size, self.unit_size])
        self.encoder_input = tf.placeholder(tf.int32, [None, self.max_length])
        embedding = tf.nn.embedding_lookup(params, self.encoder_input)
        self.seq_length = tf.placeholder(tf.int32, [None])

        # hidden_state is the final GRU state for each sequence: [batch, unit_size]
        _, hidden_state = tf.nn.dynamic_rnn(cell,
                                            embedding,
                                            sequence_length=self.seq_length,
                                            dtype=tf.float32)

        # Linear projection of the final state to a single score, squashed to (0, 1).
        w = tf.get_variable('w', [self.unit_size, 1])
        b = tf.get_variable('b', [1])
        output = tf.matmul(hidden_state, w) + b

        self.logit = tf.nn.sigmoid(output)

        if self.mode != 'test':
            self.target = tf.placeholder(tf.float32, [None, 1])
            # Mean-squared error between the 0/1 target and the sigmoid output.
            self.loss = tf.reduce_mean(tf.square(self.target - self.logit))

            self.opt = tf.train.AdamOptimizer().minimize(self.loss)
        else:
            self.vocab_map, _ = dataset.read_map(
                'sentiment_analysis/corpus/mapping')
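
To make the shapes concrete, here is a self-contained sketch (TensorFlow 1.x assumed, toy dimensions, random ids) of the same GRU-encoder-plus-sigmoid architecture that build_model wires up:

import numpy as np
import tensorflow as tf

vocab_size, unit_size, max_length = 100, 8, 5   # toy sizes, not the repo's settings

tf.reset_default_graph()
encoder_input = tf.placeholder(tf.int32, [None, max_length])
seq_length = tf.placeholder(tf.int32, [None])
params = tf.get_variable('embedding', [vocab_size, unit_size])
embedding = tf.nn.embedding_lookup(params, encoder_input)          # [batch, max_length, unit_size]
cell = tf.nn.rnn_cell.GRUCell(unit_size)
_, hidden_state = tf.nn.dynamic_rnn(cell, embedding,
                                    sequence_length=seq_length,
                                    dtype=tf.float32)               # [batch, unit_size]
w = tf.get_variable('w', [unit_size, 1])
b = tf.get_variable('b', [1])
logit = tf.nn.sigmoid(tf.matmul(hidden_state, w) + b)               # [batch, 1] scores in (0, 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ids = np.random.randint(0, vocab_size, size=(2, max_length)).astype(np.int32)
    print(sess.run(logit, {encoder_input: ids, seq_length: np.array([5, 3], np.int32)}))
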
def train():
    # Build the vocabulary mapping and the tokenized corpus once, then train with periodic validation.
    if gfile.Exists('corpus/mapping') and gfile.Exists('corpus/SAD.csv.token'):
        print('Files have already been formed!')
    else:
        dataset.form_vocab_mapping(50000)
        vocab_map, _ = dataset.read_map('corpus/mapping')
        dataset.file_to_token('corpus/SAD.csv', vocab_map)

    d = dataset.read_data('corpus/SAD.csv.token')
    random.seed(SEED)
    random.shuffle(d)

    train_set = d[:int(0.9 * len(d))]   # first 90% of the shuffled data for training
    valid_set = d[int(-0.1 * len(d)):]  # last 10% for validation

    sess = tf.Session()

    Model = create_model(sess, 'train')
    #Model = create_model(sess, 'valid')
    step = 0
    loss = 0

    while True:
        step += 1
        encoder_input, encoder_length, target = Model.get_batch(train_set)
        # Debug: inspect a batch and stop.
        # print(encoder_input)
        # print(encoder_length)
        # print(target)
        # exit()
        loss_train = Model.step(sess, encoder_input, encoder_length, target)
        loss += loss_train / CHECK_STEP  # running average over CHECK_STEP steps
        if step % CHECK_STEP == 0:
            Model.mode = 'valid'
            temp_loss = 0
            for _ in range(100):
                encoder_input, encoder_length, target = Model.get_batch(
                    valid_set)
                loss_valid = Model.step(sess, encoder_input, encoder_length,
                                        target)
                temp_loss += loss_valid / 100.
            Model.mode = 'train'
            print("Train Loss: %s" % loss)
            print("Valid Loss: %s" % temp_loss)
            checkpoint_path = os.path.join('saved_model/', 'dis.ckpt')
            Model.saver.save(sess, checkpoint_path, global_step=step)
            print("Model Saved!")
            loss = 0
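
train() writes checkpoints under saved_model/ every CHECK_STEP steps; create_model (not shown in these excerpts) presumably restores from that directory when scoring. A minimal sketch of the usual TensorFlow 1.x restore-or-initialize pattern, with a hypothetical helper name:

import tensorflow as tf

def restore_or_init(sess, saver, ckpt_dir='saved_model/'):
    # Hypothetical helper: resume from the latest dis.ckpt-* checkpoint if one
    # exists, otherwise start from freshly initialized variables.
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
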
Example #5
  def top_layer(self, outputs):
    # Project the encoder output to a single sigmoid score; add the MSE loss and optimizer when training.
    w = tf.get_variable('w', [self.unit_size, 1])
    b = tf.get_variable('b', [1])
    output = tf.matmul(outputs, w) + b

    self.logit = tf.nn.sigmoid(output)

    if self.mode != 'test':
      self.target = tf.placeholder(tf.float32, [None, 1])
      self.loss = tf.reduce_mean(tf.square(self.target - self.logit))

      self.opt = tf.train.AdamOptimizer().minimize(self.loss)
    else:
      #self.vocab_map, _ = dataset.read_map('sentiment_analysis/corpus/mapping')
      self.vocab_map, _ = dataset.read_map('./corpus/mapping')
def evaluate():
  # Interactive loop: read raw sentences from stdin and print a score for each.
  vocab_map, _ = dataset.read_map('corpus/mapping')
  sess = tf.Session()
  Model = create_model(sess, 'test')
  Model.batch_size = 1
  
  sys.stdout.write('>')
  sys.stdout.flush()
  sentence = sys.stdin.readline()

  while sentence:
    token_ids = dataset.convert_to_token(sentence, vocab_map)
    encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
    score = Model.step(sess, encoder_input, encoder_length)
    print('Score: ' + str(score[0][0]))
    print('>', end='')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
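
For batch use without the stdin loop, the same calls can be wrapped in a small helper. A sketch that reuses only the functions appearing in the examples above and assumes the same dataset and create_model helpers are importable (the wrapper name itself is hypothetical):

def score_sentences(sentences):
    # Hypothetical wrapper: score a list of raw sentences with the trained model.
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    model = create_model(sess, 'test')
    model.batch_size = 1
    scores = []
    for sentence in sentences:
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        encoder_input, encoder_length, _ = model.get_batch([(0, token_ids)])
        score = model.step(sess, encoder_input, encoder_length)
        scores.append(float(score[0][0]))
    return scores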