Python int64_feature_listの例

プログラミング言語: Python

名前空間/パッケージ名: melt

メソッド/関数: int64_feature_list

hotexamples.comのコード掲載数: 6

Python int64_feature_list - 6件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのmelt.int64_feature_listの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def deal_file(file):
    """Convert one tab-separated input file into a TFRecord file.

    The output path is ``FLAGS.output_directory`` joined with
    ``FLAGS.name`` and the last ``-``-separated piece of the input file's
    base name. Every input line is split on tabs: column 0 is the image
    name, ``FLAGS.text_index`` and ``FLAGS.input_text_index`` hold
    ``\\x01``-separated texts, and ``FLAGS.image_feature_index`` holds
    ``\\x01``-separated floats. One ``tf.train.Example`` (or
    ``tf.train.SequenceExample`` when ``FLAGS.write_sequence_example`` is
    set) is written per (input_text, text) pair that survives filtering.

    Module-level shared counters (``record_counter``, ``max_num_words``,
    ``sum_words``, ``image_counter``) and, when ``FLAGS.np_save`` is set,
    the module-level lists/dicts are updated as a side effect.

    Args:
        file: Path of the input text file to read.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # The input file is opened in the same ``with`` as the writer so that its
    # handle is closed deterministically, even on an early break or error.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0
        for line in reader:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            # Malformed feature vectors are reported and skipped instead of
            # aborting the whole file; note that ``num`` is not advanced.
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = l[FLAGS.input_text_index].split('\x01')
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(
                    input_text, FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue

                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                is_top_text = True
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    # Length is captured before truncation/padding so the
                    # statistics below use the raw id count.
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    if is_luanma(words, word_ids):
                        print('luanma',
                              img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature': melt.float_feature(
                                    image_feature),
                                'input_text_str': melt.bytes_feature(
                                    input_text),
                                'input_text': melt.int64_feature(
                                    input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'image_feature':
                                melt.float_feature(image_feature),
                                'input_text_str':
                                melt.bytes_feature(input_text),
                                'text_str':
                                melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                melt.int64_feature_list(input_word_ids),
                                'text':
                                melt.int64_feature_list(word_ids)
                            }))
                    writer.write(example)

                # NOTE(review): this bookkeeping runs once per input_text,
                # after the ``for text`` loop, and reads ``text``,
                # ``word_ids`` and ``word_ids_length`` as left behind by that
                # loop. If every text so far was skipped as blank these names
                # may be unbound or stale -- confirm whether this block was
                # meant to live inside the text loop.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The in-memory lists are plain Python objects, so they
                    # are only filled when a single thread is configured.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        #if fixed valid only get one click for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break

コード例 #2

ファイルを表示

ファイル: gen-records.py プロジェクト: buptpriswang/hasky

def deal_file(file):
    """Convert one tab-separated image/text file into a TFRecord file.

    The output path is ``FLAGS.output_directory`` joined with
    ``FLAGS.name`` and the last ``-``-separated piece of the input file's
    base name. Each input line is split on tabs: column 0 is the image
    name, ``FLAGS.text_index`` holds ``\\x01``-separated texts and
    ``FLAGS.image_feature_index`` holds ``\\x01``-separated floats. One
    ``tf.train.Example`` (or ``tf.train.SequenceExample`` when
    ``FLAGS.write_sequence_example`` is set) is written per non-empty text.

    Module-level shared counters (``record_counter``, ``max_num_words``,
    ``sum_words``, ``image_counter``) and, when ``FLAGS.np_save`` is set,
    the module-level lists/dicts are updated as a side effect.

    Args:
        file: Path of the input text file to read.

    Raises:
        AssertionError: If a line's image feature does not contain exactly
            ``IMAGE_FEATURE_LEN`` values, or if ``FLAGS.np_save`` is set
            while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    # The input file is opened in the same ``with`` as the writer so that its
    # handle is closed deterministically, even on an early break or error.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0
        for line in reader:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length is captured before truncation/padding so the
                # statistics below use the id count returned by _text2ids.
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                # Shared statistics: each counter is updated under its own
                # lock.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The in-memory lists are plain Python objects, so they
                    # are only filled when a single thread is configured.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    # First accepted text of this image: count the image once.
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #if fixed valid only get one click for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break

コード例 #3

ファイルを表示

ファイル: gen-records.py プロジェクト: buptpriswang/hasky

def deal_imgtextfile(file):
    """Convert a file of URL-quoted image data into a TFRecord file.

    Each input line is split on tabs: column 0 is the image name and the
    last column is the image data, which is passed through
    ``urllib.unquote_plus``. The texts for an image are looked up in the
    module-level ``pic_info_map`` (``\\x01``-separated); lines whose image
    is not in the map are skipped. One ``tf.train.Example`` (or
    ``tf.train.SequenceExample`` when ``FLAGS.write_sequence_example`` is
    set) is written per non-empty text.

    Original author's note: the image text / encoded image is large (about
    18G for 2w pictures, versus about 373M for image features of shape
    (23820, 2048)), so this path is rarely used -- only when image metric
    evaluation (recall@1, ...) is not needed and the image binaries should
    not be converted and stored in a separate preprocessing step.

    Args:
        file: Path of the input text file to read.

    Raises:
        AssertionError: If ``pic_info_map`` is empty, or if
            ``FLAGS.np_save`` is set while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    # The input file is opened in the same ``with`` as the writer so that its
    # handle is closed deterministically, even on an early break or error.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0
        for line in reader:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            # Images without text info are skipped; ``num`` is not advanced.
            if img not in pic_info_map:
                continue

            img_text = l[-1]
            encoded_image = urllib.unquote_plus(img_text)

            text_info = pic_info_map[img]
            texts = text_info.split('\x01')

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length is captured before truncation/padding so the
                # statistics below use the id count returned by _text2ids.
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                # Shared statistics: each counter is updated under its own
                # lock.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The in-memory lists are plain Python objects, so they
                    # are only filled when a single thread is configured.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    # First accepted text of this image: count the image once.
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        # TODO: encoded_image is too big to keep in memory, so
                        # it is not stored for evaluation here.
                        if FLAGS.image_dir:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #if fixed valid only get one click for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break

コード例 #4

ファイルを表示

ファイル: gen-records.py プロジェクト: tangqiqi123/hasky

def _parse_line(line, writer, thread_index = 0):
  """Turn one feature line into serialized examples and write them.

  The line is split on tabs: column 0 is the image name and the remaining
  columns are parsed as floats (the image feature). Images that are missing
  from the module-level ``text_map`` are reported and ignored. For every
  ``(text, ori_text)`` pair of the image one ``tf.train.Example`` (or
  ``tf.train.SequenceExample`` when ``FLAGS.write_sequence_example`` is
  set) is built and written with ``writer.write``.

  Args:
    line: One raw input line.
    writer: Object whose ``write`` method accepts a serialized example.
    thread_index: Index into ``gtexts`` / ``gtext_strs`` that selects the
      per-thread list to append to when ``FLAGS.np_save`` is set.
  """
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name), len(image_name.strip()))
    return
  else:
    image_path =  FLAGS.image_dir + '/' + image_name
    #print(image_path)

    if FLAGS.write_raw_image_bytes:
      # Image files are binary data, so read them in binary mode; text mode
      # ("r") may try to decode the bytes as text.
      with tf.gfile.FastGFile(image_path, "rb") as f:
        encoded_image = f.read()
    else:
      encoded_image = ''

    # Original author's note: validating the JPEG here (decoder.decode_jpeg)
    # hangs when run with multiple processes, so it is left disabled.
    #try:
    #  decoder.decode_jpeg(encoded_image)
    #except (tf.errors.InvalidArgumentError, AssertionError):
    #  print("Skipping file with invalid JPEG data: %s" % image_path)
    #  return

    for text, ori_text in text_map[image_name]:
      # Words missing from the vocabulary are dropped unless ENCODE_UNK is set.
      word_ids = [vocabulary.id(word) for word in text.split(WORDS_SEP) if vocabulary.has(word) or ENCODE_UNK]
      if not word_ids:
        continue
      # Length is captured before truncation/padding for the statistics below.
      word_ids_length = len(word_ids)
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'image_name': melt.bytes_feature(image_name),
          'image_data': melt.bytes_feature(encoded_image),
          'image_feature': melt.float_feature(image_feature),
          'text': melt.int64_feature(word_ids),
          'text_str': melt.bytes_feature(ori_text),
          }))
      else:
        example = tf.train.SequenceExample(
          context=melt.features(
            {
              'image_name': melt.bytes_feature(image_name),
              'image_data': melt.bytes_feature(encoded_image),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(ori_text),
             }),
          feature_lists=melt.feature_lists(
          {
            'text': melt.int64_feature_list(word_ids)
          }))

      if FLAGS.np_save:
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(ori_text)


      #NOTICE not test here for num_threads > 1
      if FLAGS.num_records:
        # Fixed-size mode: write one example per not-yet-seen image and
        # terminate the process once FLAGS.num_records images are written.
        if image_name not in images:
          images[image_name] = 1
          print(image_name, len(images))
          writer.write(example.SerializeToString())
          if len(images) == FLAGS.num_records:
            print('Done')
            exit(1)
      else:
        writer.write(example.SerializeToString())
        global counter, max_num_words, sum_words
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

コード例 #5

ファイルを表示

def deal_file(file):
  """Convert one tab-separated text-pair file into a TFRecord file.

  The output path is ``FLAGS.output_directory`` joined with ``FLAGS.name``
  and the last ``-``-separated piece of the input file's base name. Each
  input line is split on tabs; ``FLAGS.text_index`` selects the text column
  and ``FLAGS.input_text_index`` the input text column. Both are segmented
  and mapped to word ids, and one ``tf.train.Example`` (or
  ``tf.train.SequenceExample`` when ``FLAGS.write_sequence_example`` is
  set) is written per line that survives filtering.

  Module-level shared counters (``record_counter``, ``max_num_words``,
  ``sum_words``) and, when ``FLAGS.np_save`` is set, ``gtexts`` /
  ``gtext_strs`` are updated as a side effect.

  Args:
    file: Path of the input text file to read.
  """
  out_file = '{}/{}'.format(FLAGS.output_directory, '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  # The input file is opened in the same ``with`` as the writer so that its
  # handle is closed deterministically, even on an early break or error.
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num)

      l = line.rstrip('\n').split('\t')

      text = l[FLAGS.text_index]

      input_text = l[FLAGS.input_text_index]

      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(input_words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      # Skipped lines do not advance ``num``.
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      # Length is captured before truncation/padding for the statistics below.
      word_ids_length = len(word_ids)
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if word_ids_length == 0:
        continue
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        continue

      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'input_text_str': melt.bytes_feature(input_text),
          'input_text': melt.int64_feature(input_word_ids),
          'text_str': melt.bytes_feature(text),
          'text': melt.int64_feature(word_ids),
          }))
      else:
        example = tf.train.SequenceExample(
              context=melt.features(
              {
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
              {
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
              }))
      writer.write(example)

      # Shared statistics: each counter is updated under its own lock.
      with record_counter.get_lock():
        record_counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length

      if FLAGS.np_save:
        # The in-memory lists are plain Python objects, so they are only
        # filled when a single thread is configured.
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)

      num += 1
      if num == FLAGS.num_max_records:
        break

コード例 #6

ファイルを表示

ファイル: gen-records.py プロジェクト: tangqiqi123/hasky

def deal_file(file):
    """Convert one tab-separated click-log file into a TFRecord file.

    The output path is ``FLAGS.output_directory`` joined with
    ``FLAGS.name`` and the last ``-``-separated piece of the input file's
    base name. Each input line is split on tabs and read positionally:
    columns 0-5 hold cs, objurl, fromurl, simid, keyword and extended
    keyword; the next ``IDL4W_FEATURE_LEN`` columns are one float feature;
    then come the click query, titles and descs; all remaining columns are
    the second float feature. Click queries are separated by ``$*$`` and
    one ``tf.train.Example`` (or ``tf.train.SequenceExample`` when
    ``FLAGS.write_sequence_example`` is set) is written per accepted query.
    Lines whose click query is ``'noclickquery'`` are skipped.

    Module-level shared counters (``record_counter``, ``max_num_words``,
    ``sum_words``, ``image_counter``) and, when ``FLAGS.np_save`` is set,
    the module-level lists/dicts are updated as a side effect.

    Args:
        file: Path of the input text file to read.

    Raises:
        AssertionError: If the trailing feature does not contain exactly
            ``INCEPTION_FEATURE_LEN`` values, or if ``FLAGS.np_save`` is
            set while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # The input file is opened in the same ``with`` as the writer so that its
    # handle is closed deterministically, even on an early break or error.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
        num = 0
        for line in reader:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            cs = l[0]  #cs
            simid = l[3]
            objurl = l[1]
            fromurl = l[2]  # not used below
            keyword = l[4].split('\x01')[0]
            extended_keyword = l[5].split('\x01')[0]

            # The object URL is used as the image identifier.
            img = objurl
            #img = cs

            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in l[6:idl4w_end]]

            titles = l[idl4w_end + 1]
            descs = l[idl4w_end + 2]

            inception_feature = [float(x) for x in l[idl4w_end + 3:]]

            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = l[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)
            if click_query == 'noclickquery':
                #TODO now only consider click_query
                # Skipped lines do not advance ``num``.
                continue
            else:
                click_queries = click_query.split('$*$')
                is_top_text = True
                for click_query in click_queries:
                    if click_query.strip() == '':
                        continue

                    text_str = '{} {}'.format(click_query, show_str)

                    text = click_query
                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    # Length is captured before truncation/padding so the
                    # statistics below use the raw id count.
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(cs,
                              simid,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(idl4w_feature),
                              len(inception_feature),
                              file=sys.stderr)
                    if len(word_ids) == 0:
                        continue
                    if is_bad(words, word_ids):
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                                'text':
                                melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                            }),
                            feature_lists=melt.feature_lists(
                                {'text': melt.int64_feature_list(word_ids)}))
                    writer.write(example)

                    # Shared statistics: each counter is updated under its
                    # own lock.
                    with record_counter.get_lock():
                        record_counter.value += 1
                    if word_ids_length > max_num_words.value:
                        with max_num_words.get_lock():
                            max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        # The in-memory lists are plain Python objects, so
                        # they are only filled when a single thread is
                        # configured.
                        assert FLAGS.threads == 1
                        texts.append(word_ids)
                        text_strs.append(text)

                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        # First accepted query of this image: count the image
                        # once.
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1

                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()

                            image_names.append(img)
                            idl4w_features.append(idl4w_feature)
                            inception_features.append(inception_feature)

                        if FLAGS.num_max_records > 0:
                            #if fixed valid only get one click for each image
                            break

            num += 1
            if num == FLAGS.num_max_records:
                break