plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


if len(sys.argv) > 2:
  # Two prediction CSVs are given on the command line; both are ordered by
  # 'id' so that their rows line up.
  file1, file2 = sys.argv[1], sys.argv[2]

  df1 = pd.read_csv(file1).sort_values('id')
  df2 = pd.read_csv(file2).sort_values('id')

  # Parse each 'score' string, view the result as
  # [rows, len(ATTRIBUTES), 4] and pass it through gezi.softmax.
  scores1 = gezi.softmax(
      np.reshape([gezi.str2scores(x) for x in df1['score'].values],
                 [-1, len(ATTRIBUTES), 4]))
  scores2 = gezi.softmax(
      np.reshape([gezi.str2scores(x) for x in df2['score'].values],
                 [-1, len(ATTRIBUTES), 4]))

  ndf1, ndf2 = pd.DataFrame(), pd.DataFrame()

  # One column per attribute: that attribute's 4 values for every row,
  # flattened into a single vector.
  for i, attr in enumerate(ATTRIBUTES):
    score1 = np.reshape(scores1[:, i], -1)
    score2 = np.reshape(scores2[:, i], -1)
    ndf1[attr], ndf2[attr] = score1, score2
Esempio n. 2
0
# NOTE: an optional per-row softmax over `iscores` (reshape each row to
# [num_attrs, 4], softmax on the last axis, flatten again) is disabled here.
iscores = np.array(iscores)

print(valid_file)
df = pd.read_csv(valid_file, sep=',').sort_values('id')

# Column layout used below: `num_attrs` label columns starting at `idx`,
# immediately followed by `num_attrs` prediction columns.
labels = df.iloc[:, idx:idx + num_attrs].values
predicts = df.iloc[:, idx + num_attrs:idx + 2 * num_attrs].values

# The 'logit' column is the one in use; 'score' and 'prob' are the
# alternatives that were tried. The same optional per-row softmax noted
# above is disabled for `scores` as well.
scores = np.array([gezi.str2scores(score) for score in df['logit']])
ids = df.iloc[:, 0].values

# Four names per attribute: '<attr>_0' .. '<attr>_3'.
cnames = [f'{attr}_{i}' for attr in ATTRIBUTES for i in range(4)]

print('---------', cnames)
Esempio n. 3
0
# Model registry: every row except the 'ensemble' entry.
df = pd.read_csv('./models.csv')
df = df.loc[df['model'] != 'ensemble']

models_, files_ = df['model'].values, df['file'].values
metrics = df['adjusted_f1/mean'].values

# Only models whose prediction file exists on disk are kept.
models, files = [], []
for file, model in tqdm(zip(files_, models_), ascii=True):
    if os.path.exists(file):
        df = pd.read_csv(file).sort_values('id')
        # Parse the 'score' strings, view them as
        # [rows, len(ATTRIBUTES), 4] and apply gezi.softmax.
        scores = gezi.softmax(
            np.reshape([gezi.str2scores(x) for x in df['score'].values],
                       [-1, len(ATTRIBUTES), 4]))
        # A single flat 'score' column per model.
        ndf = pd.DataFrame({'score': np.reshape(scores, [-1])})
        dfs.append(ndf)
        files.append(file)
        models.append(model)


def calc_correlation(x, y, method):
    if method.startswith('ks'):
        ks_stat, p_value = ks_2samp(x, y)
        if method == 'ks_s':
            score = ks_stat
        else:
Esempio n. 4
0
def _print_mean_metrics(names, vals):
    """Print every (name, value) pair whose metric name contains 'mean'."""
    for name, val in zip(names, vals):
        if 'mean' in name:
            print(name, val)


def evaluate_file(file):
    """Evaluate one prediction CSV and print its metrics.

    The file is read with pandas. The 'score' column is parsed with
    ``gezi.str2scores`` and reshaped to
    ``[-1, NUM_ATTRIBUTES, NUM_CLASSES]``; labels are taken from the
    ``NUM_ATTRIBUTES`` columns starting at column 2 and shifted by +2.
    Metrics come from ``evaluate(labels, predicts)`` and are reported for
    the whole file, then separately for rows whose 'content' length is
    below / at-or-above ``FLAGS.len_thre``.

    Args:
        file: path of the CSV file to evaluate.

    Returns:
        The ``(vals, names)`` pair returned by ``evaluate`` for the whole
        file.

    Raises:
        AssertionError: if the file does not contain exactly 15000 rows.
    """
    print('-------------------------', file)
    df = pd.read_csv(file)

    scores = np.array([gezi.str2scores(score) for score in df['score']])
    predicts = np.reshape(scores, [-1, NUM_ATTRIBUTES, NUM_CLASSES])

    # for auc might need to do this
    #predicts /= 26

    idx = 2
    length = NUM_ATTRIBUTES

    # Add out of place: `.values` may expose the DataFrame's own buffer, so an
    # in-place `+=` could silently modify `df`, and it raises on the read-only
    # arrays pandas returns when Copy-on-Write is enabled.
    labels = df.iloc[:, idx:idx + length].values + 2

    #print(labels.shape, predicts.shape)
    assert labels.shape[0] == 15000, labels.shape[0]
    vals, names = evaluate(labels, predicts)

    if FLAGS.show_detail:
        for name, val in zip(names, vals):
            print(name, val)

    print('---------------------------------')
    _print_mean_metrics(names, vals)

    # Split the rows into two buckets by the length of their 'content' text.
    lens = [len(x) for x in df['content'].values]
    predicts1, labels1 = [], []  # len < FLAGS.len_thre
    predicts2, labels2 = [], []  # len >= FLAGS.len_thre
    for len_, label, predict in zip(lens, labels, predicts):
        if len_ >= FLAGS.len_thre:
            predicts2.append(predict)
            labels2.append(label)
        else:
            predicts1.append(predict)
            labels1.append(label)

    predicts1 = np.array(predicts1)
    labels1 = np.array(labels1)
    print('num docs len < ', FLAGS.len_thre, len(predicts1))
    vals1, names1 = evaluate(labels1, predicts1)
    _print_mean_metrics(names1, vals1)

    predicts2 = np.array(predicts2)
    labels2 = np.array(labels2)
    print('num docs len >= ', FLAGS.len_thre, len(predicts2))
    vals2, names2 = evaluate(labels2, predicts2)
    _print_mean_metrics(names2, vals2)

    return vals, names
def build_features(index):
  """Write one TFRecord shard of examples built from the module-level `df`.

  The shard covers rows `start..end` as returned by
  `gezi.get_fold(len(df), num_records, index)` and is written to
  `<dirname(FLAGS.vocab_)>/<mode>/<index + FLAGS.start_index>.record`.
  Besides `FLAGS`, the function reads these names from the enclosing module:
  `df`, `seg_result`, `pos_result`, `ner_result`, `vocab`, `char_vocab`,
  `pos_vocab`, `ner_vocab`, `filter`, `text2ids_`, and it increments the
  shared counters `counter` and `total_words`.

  A row that raises an `Exception` is skipped after its traceback has been
  printed to stderr, so one bad row does not abort the shard.

  Args:
    index: zero-based shard index.
  """
  # Declared once up front; both are updated under their own lock per record.
  global counter
  global total_words

  mode = get_mode(FLAGS.input)

  start_index = FLAGS.start_index

  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
  # os.makedirs instead of shelling out to `mkdir -p`: no shell quoting issues
  # and a failure surfaces as an exception instead of being ignored.
  os.makedirs(os.path.dirname(out_file), exist_ok=True)
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(df)
  num_records = FLAGS.num_records_
  ## TODO FIXME why is this still None here? FLAGS.num_records has been modified before in main as 7 ...
  #print('---------', num_records, FLAGS.num_records_)
  if not num_records:
    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
      num_records = 1
    else:
      num_records = 7
  #print('------------------', num_records, FLAGS.num_records_)
  start, end = gezi.get_fold(total, num_records, index)

  print('total', total, 'infile', FLAGS.input, 'out_file', out_file)

  max_len = 0
  max_num_ids = 0
  num = 0
  with melt.tfrecords.Writer(out_file) as writer:
    for i in tqdm(range(start, end), ascii=True):
      try:
        row = df.iloc[i]
        # Explicit positional access; plain integer keys on a Series are
        # deprecated as positional lookups in recent pandas.
        doc_id = str(row.iloc[0])

        if seg_result:
          if doc_id not in seg_result:
            print('id %s ot found in seg_result' % doc_id)
            continue
          words = seg_result[doc_id]

          if FLAGS.content_limit_:
            # NOW only for bert!
            if len(words) + 2 > FLAGS.content_limit_:
              # Keep the head and the last 50 tokens, joined by one [MASK].
              words = words[:FLAGS.content_limit_ - 3 - 50] + ['[MASK]'] + words[-50:]
              #print(words)
          if FLAGS.add_start_end_:
            words = gezi.add_start_end(words, FLAGS.start_mark, FLAGS.end_mark)
        if pos_result:
          pos = pos_result[doc_id]
          if FLAGS.add_start_end_:
            pos = gezi.add_start_end(pos)
        if ner_result:
          ner = ner_result[doc_id]
          if FLAGS.add_start_end_:
            ner = gezi.add_start_end(ner)

        # Prefix the id for shards written with a non-zero start_index. This
        # was `id == 't' + id`, a comparison whose result was discarded. The
        # lookups above intentionally use the unprefixed id.
        if start_index > 0:
          doc_id = 't' + doc_id

        content = row.iloc[1]
        content_ori = content
        content = filter.filter(content)

        if not FLAGS.use_soft_label_:
          if 'test' in mode:
            label = [-2] * 20
          else:
            label = list(row.iloc[2:])

          #label = [x + 2 for x in label]
          #num_labels = len(label)
        else:
          label = [0.] * 80
          if not FLAGS.is_soft_label:
            # NOTE(review): a negative `val` would wrap around to a different
            # slot here - confirm the label range for this mode.
            for idx, val in enumerate(row.iloc[2:]):
              label[idx * 4 + val] = 1.
          else:
            logits = np.array(gezi.str2scores(row['score']))
            logits = np.reshape(logits, [20, 4])
            probs = gezi.softmax(logits)
            label = list(np.reshape(probs, [-1]))

        if not seg_result:
          content_ids, words = text2ids_(content, preprocess=False, return_words=True)
          assert len(content_ids) == len(words)
        else:
          content_ids = [vocab.id(x) for x in words]
          #print(words, content_ids)
          #exit(0)

        if len(content_ids) > max_len:
          max_len = len(content_ids)
          print('max_len', max_len)

        # NOTE(review): with `and` this only fires when FLAGS.word_limit < 4;
        # `or` may have been intended - left unchanged pending confirmation.
        if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
          print('{} {} {}'.format(doc_id, len(content_ids), content_ori))
        #if len(content_ids) > FLAGS.word_limit:
        #  print(id, content)
        #  if mode not in ['test', 'valid']:
        #    continue

        #if len(content_ids) < 5 and mode not in ['test', 'valid']:
        #  continue

        content_ids = content_ids[:FLAGS.word_limit]
        words = words[:FLAGS.word_limit]

        # NOTICE different from tf, pytorch do not allow all 0 seq for rnn.. if using padding mode
        if FLAGS.use_char:
          chars = [list(word) for word in words]
          char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)

          vocab_ = char_vocab if char_vocab else vocab

          # At most FLAGS.char_limit characters are encoded per word.
          for wi, token in enumerate(chars):
            for j, ch in enumerate(token):
              if j == FLAGS.char_limit:
                break
              char_ids[wi, j] = vocab_.id(ch)

          char_ids = list(char_ids.reshape(-1))
          if np.sum(char_ids) == 0:
            print('------------------------bad id', doc_id)
            print(content_ids)
            print(words)
            exit(0)
        else:
          char_ids = [0]

        if pos_vocab:
          assert pos
          pos = pos[:FLAGS.word_limit]
          pos_ids = [pos_vocab.id(x) for x in pos]
        else:
          pos_ids = [0]

        if ner_vocab:
          assert ner
          if pos_vocab:
            assert len(pos) == len(ner)
          ner = ner[:FLAGS.word_limit]

          ner_ids = [ner_vocab.id(x) for x in ner]
        else:
          ner_ids = [0]

        wlen = [len(word) for word in words]

        feature = {
                    'id': melt.bytes_feature(doc_id),
                    'content':  melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'pos': melt.int64_feature(pos_ids), # might also be position info for mix seg
                    'ner': melt.int64_feature(ner_ids),
                    'wlen': melt.int64_feature(wlen),
                    'source': melt.bytes_feature(mode),
                  }
        feature['label'] = melt.int64_feature(label) if not FLAGS.use_soft_label_ else melt.float_feature(label)

        # TODO currently not get exact info whether show 1 image or 3 ...
        record = tf.train.Example(features=tf.train.Features(feature=feature))

        writer.write(record)
        num += 1
        with counter.get_lock():
          counter.value += 1
        with total_words.get_lock():
          total_words.value += len(content_ids)
      except Exception:
        # Best effort: report the failing row and carry on with the next one.
        print(traceback.format_exc(), file=sys.stderr)