Example 1
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done'%num)
    l = line.rstrip().split()
    
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    example = tf.train.Example(
      features=tf.train.Features(
        feature={
        'id': melt.int_feature(id), 
        'label': melt.int_feature(label),
        'feature': melt.float_feature(feature),
        }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
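
The records written above can be read back with the TF 1.x parsing ops (tf.python_io implies TF 1.x). A minimal reader sketch, assuming 'output.tfrecord' as a placeholder path and that melt.int_feature / melt.float_feature wrap values into Int64List / FloatList as their names suggest; since the length of 'feature' is not fixed here, VarLenFeature is the safe choice:

import tensorflow as tf

def parse_example(serialized):
  features = tf.parse_single_example(
      serialized,
      features={
          'id': tf.FixedLenFeature([], tf.int64),
          'label': tf.FixedLenFeature([], tf.int64),
          'feature': tf.VarLenFeature(tf.float32),
      })
  #VarLenFeature yields a SparseTensor; densify it for downstream ops
  features['feature'] = tf.sparse_tensor_to_dense(features['feature'])
  return features

dataset = tf.data.TFRecordDataset('output.tfrecord').map(parse_example)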
Example 2
def main(argv):
    writer = tf.python_io.TFRecordWriter(argv[2])
    num = 0
    for line in open(argv[1]):
        if line[0] == '#':
            continue
        if num % 10000 == 0:
            print('%d lines done' % num)
        l = line.rstrip().split()

        label_index = 0
        if l[0][0] == '_':
            label_index = 1
            id = int(l[0][1:])
        else:
            id = num
        label = int(l[label_index])

        start = label_index + 1
        #notice: np.array of Python floats will be float64, not float32
        feature = np.array([float(x) for x in l[start:]])
        if num == 0:
            print('len feature', len(feature))
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'id': melt.int_feature(id),
                'label': melt.int_feature(label),
                'feature': melt.bytes_feature(feature.tostring()),
                'length': melt.int_feature(len(feature)),
            }))
        writer.write(example.SerializeToString())
        num += 1
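
Since this variant stores the vector as raw bytes, a reader has to decode with the matching dtype. A hedged sketch (TF 1.x; tf.decode_raw must use tf.float64 because, per the comment above, the numpy array defaults to float64):

def parse_example(serialized):
    parsed = tf.parse_single_example(
        serialized,
        features={
            'id': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'feature': tf.FixedLenFeature([], tf.string),
            'length': tf.FixedLenFeature([], tf.int64),
        })
    #dtype must match what .tostring() serialized, i.e. float64
    parsed['feature'] = tf.decode_raw(parsed['feature'], tf.float64)
    return parsed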
Example 3
def deal_file(file, thread_index):
  writer = melt.tfrecords.Writer('{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index))
  num = 0
  for line in open(file):
    #if num % 1000 == 0:
    #  print(num)
    l = line.rstrip().split('\t')
    img = l[0]
    #print(img)
    img_feature = [float(x) for x in l[1:1001]]
    #print(img_feature)
    text = l[-1].split('\x01')[0]
    #print(text)
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    #print(word_ids)
    if len(word_ids) == 0:
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    #print(word_ids)
    #gtexts[thread_index].append(word_ids)
    #gtext_strs[thread_index].append(text)

    example = tf.train.Example(features=tf.train.Features(feature={
      'image_name': melt.bytes_feature(img),
      'image_feature': melt.float_feature(img_feature),
      'text': melt.int_feature(word_ids),
      'text_str': melt.bytes_feature(text),
      }))
    writer.write(example)
    #print(example.SerializeToString())
    num += 1
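
gezi.pad is a project-specific helper; judging from its call sites (ids, a maximum length, a pad id), a plausible minimal equivalent would be:

def pad(ids, max_len, pad_id=0):
    #truncate to max_len, then right-pad with pad_id up to max_len
    ids = ids[:max_len]
    return ids + [pad_id] * (max_len - len(ids))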
Example 4
def deal_file(file, writer):
  num = 0
  for line in open(file):
    if num % 1000 == 0:
      print('num:', num)
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      num += 1
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    
    texts.append(word_ids)
    text_strs.append(text)
    example = tf.train.Example(features=tf.train.Features(feature={
      'image_name': melt.bytes_feature(img),
      'image_feature': melt.float_feature(img_feature),
      'text': melt.int_feature(word_ids),
      'text_str': melt.bytes_feature(text),
      }))
    #writer.write(example.SerializeToString())
    writer.write(example)
    num += 1
Example 5
def main(argv):
    writer = tf.python_io.TFRecordWriter(argv[2])
    num = 0
    for line in open(argv[1]):
        if line[0] == '#':
            continue
        if num % 10000 == 0:
            print('%d lines done' % num)
        l = line.rstrip().split()

        label_index = 0
        if l[0][0] == '_':
            label_index = 1
            id = int(l[0][1:])
        else:
            id = num
        label = int(l[label_index])

        start = label_index + 1
        feature = [float(x) for x in l[start:]]

        if FLAGS.fake_var_len:
            #fake variable-length input by truncating based on id
            if id % 2 == 0:
                feature = feature[:10]
            elif id % 3 == 0:
                feature = feature[:20]

        example = tf.train.SequenceExample(
            context=melt.features({
                'id': melt.int_feature(id),
                'label': melt.int_feature(label)
            }),
            feature_lists=melt.feature_lists({
                #see sequence_test.py: each single value becomes one feature and all of them are stacked into a feature list
                #can this deal with a variable-length sequence? (see the reader sketch below)
                'feature':
                melt.feature_list(
                    [melt.float_feature(item) for item in feature])
                #'feature': melt.feature_list(melt.float_feature(feature))
            }))

        writer.write(example.SerializeToString())

        num += 1
        if FLAGS.num_examples and num == FLAGS.num_examples:
            break
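
To answer the inline question: yes, a SequenceExample written this way supports variable-length sequences, since the number of entries in the feature list can differ per example. A hedged reader sketch (TF 1.x; assumes melt.float_feature wraps a single float per step, giving each step shape []):

context_features = {
    'id': tf.FixedLenFeature([], tf.int64),
    'label': tf.FixedLenFeature([], tf.int64),
}
sequence_features = {
    #one float per FeatureList entry; the sequence length T varies per example
    'feature': tf.FixedLenSequenceFeature([], tf.float32),
}
context, sequence = tf.parse_single_sequence_example(
    serialized,
    context_features=context_features,
    sequence_features=sequence_features)
#sequence['feature'] has shape [T]; batching needs padded_batch or bucketing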
Example 6
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)
            l = line.rstrip().split('\t')
            img = l[0]
            img_end = IMAGE_FEATURE_LEN + 1
            img_feature = [float(x) for x in l[1:img_end]]
            texts = [x.split('\x01')[0] for x in l[img_end:]]
            for text in texts:
                #@TODO the text -> ids conversion should move out so online code can share it for evaluation or for feed_dict use
                words = segmentor.Segment(text, FLAGS.seg_method)
                word_ids = [
                    vocabulary.id(word) for word in words
                    if vocabulary.has(word) or ENCODE_UNK
                ]
                word_ids_length = len(word_ids)
                if len(word_ids) == 0:
                    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if FLAGS.np_save:
                    gtexts[thread_index].append(word_ids)
                    gtext_strs[thread_index].append(text)

                assert img and img_feature and word_ids and text, line
                assert len(img_feature) == IMAGE_FEATURE_LEN
                #add pos info? weight info? or @TODO add click num info
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image_name': melt.bytes_feature(img),
                        'image_feature': melt.float_feature(img_feature),
                        'text': melt.int_feature(word_ids),
                        'text_str': melt.bytes_feature(text),
                    }))
                writer.write(example)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
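
The shared counter, max_num_words and sum_words used above expose get_lock(), which matches the multiprocessing.Value API. A hedged sketch of how they could be set up (the 'i' typecode is an assumption):

import multiprocessing

counter = multiprocessing.Value('i', 0)
max_num_words = multiprocessing.Value('i', 0)
sum_words = multiprocessing.Value('i', 0)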
Example 7
def _parse_line(line, writer, thread_index=0):
    l = line.rstrip().split('\t')
    image_name = l[0]
    image_feature = [float(x) for x in l[1:]]
    if image_name not in text_map:
        print('image %s ignored' % image_name)
        return
    else:
        for text, ori_text in text_map[image_name]:
            word_ids = [
                vocabulary.id(word) for word in text.split(WORDS_SEP)
                if vocabulary.has(word)
            ]
            if not word_ids:
                continue
            word_ids_length = len(word_ids)
            word_ids = word_ids[:TEXT_MAX_WORDS]
            if FLAGS.pad:
                word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

            if FLAGS.np_save:
                gtexts[thread_index].append(word_ids)
                gtext_strs[thread_index].append(ori_text)

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(image_name),
                    'image_feature': melt.float_feature(image_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(ori_text),
                }))

            #NOTICE: not tested here for num_threads > 1
            if FLAGS.num_records:
                if image_name not in images:
                    images[image_name] = 1
                    print(image_name, len(images))
                    writer.write(example.SerializeToString())
                    if len(images) == FLAGS.num_records:
                        print('Done')
                        exit(1)
            else:
                writer.write(example.SerializeToString())
                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
Example 8
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    writer = melt.tfrecords.Writer(out_file)
    num = 0
    for line in open(file):
        if num % 1000 == 0:
            print(num)
        l = line.rstrip().split('\t')
        img = l[0]
        img_end = IMAGE_FEATURE_LEN + 1
        img_feature = [float(x) for x in l[1:img_end]]
        texts = [x.split('\x01')[0] for x in l[img_end:]]
        for text in texts:
            words = Segmentor.Segment(text)
            word_ids = [
                vocabulary.id(word) for word in words if vocabulary.has(word)
            ]
            if len(word_ids) == 0:
                continue
            if FLAGS.pad:
                word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

            gtexts[thread_index].append(word_ids)
            gtext_strs[thread_index].append(text)

            #add pos info? weight info? or @TODO add click num info
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(img_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(text),
                }))
            writer.write(example)
        num += 1

    writer.close()

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
Example 9
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_dir,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            #line = line.lower()
            if num % 1000 == 0:
                print(num)
            if FLAGS.max_lines and num >= FLAGS.max_lines:
                break
            l = line.rstrip().split('\t')

            if len(l) != 2:
                continue

            ltext, rtext_list = l

            #ltext is constant for the line, so convert it once outside the loop
            lword_ids = _text2ids(ltext, TEXT_MAX_WORDS)

            for rtext in rtext_list.split('\x01'):
                rword_ids = _text2ids(rtext, TEXT_MAX_WORDS)

                if not lword_ids or not rword_ids:
                    continue

                if num % 1000 == 0:
                    print(ltext,
                          lword_ids,
                          text2ids.ids2text(lword_ids),
                          file=sys.stderr)
                    print(rtext,
                          rword_ids,
                          text2ids.ids2text(rword_ids),
                          file=sys.stderr)

                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'ltext_str': melt.bytes_feature(ltext),
                        'ltext': melt.int_feature(lword_ids),
                        'rtext_str': melt.bytes_feature(rtext),
                        'rtext': melt.int_feature(rword_ids),
                    }))
                writer.write(example)

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    ltexts.append(lword_ids)
                    ltext_strs.append(ltext)
                    rtexts.append(rword_ids)
                    rtext_strs.append(rtext)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1

                word_ids = lword_ids
                word_ids_length = len(word_ids)
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1
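
Once all workers finish, the shared counters yield simple corpus statistics. A hedged sketch, assuming the multiprocessing.Value setup shown after Example 6:

print('num examples:', counter.value)
print('max words per text:', max_num_words.value)
print('avg words per text:', sum_words.value / max(counter.value, 1))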
Example 10
        if num % 1000 == 0:
            #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
            print('\t'.join(words), file=sys.stderr)
            print(word_ids, file=sys.stderr)
        if len(word_ids) == 0:
            continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if writer is not None:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(img_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(text),
                }))
            writer.write(example)
        else:
            count += 1

if FLAGS.mode != 1:
    if writer is not None:
        count = writer.count
    print('count\t%d' % (count), file=sys.stderr)
    #-------- for calculating the total count
    print('count\t%d' % (count))

#do not forget to close! NOTICE
if writer is not None:
    writer.close()
Example 11
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            line = line.lower()
            if num % 1000 == 0:
                print(num)
            if FLAGS.max_lines and num >= FLAGS.max_lines:
                break
            l = line.strip().split('\t')
            #NOTE: clickquery, ct0, title and real_title come from fixed columns of l; the exact unpacking is elided in this snippet
            #@TODO the text -> ids conversion should move out so online code can share it for evaluation or for feed_dict use
            #words = segmentor.Segment(text, FLAGS.seg_method)
            #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]

            #text is what to predict (the decoder target), which right now is clickquery
            #input text is what to predict from (the encoder input); here it may be ct0, title or real_title

            if not title.strip():
                title = real_title

            if clickquery.startswith('http://'):
                clickquery = l[3]

            text = clickquery
            word_ids = _text2ids(text, TEXT_MAX_WORDS)

            if not word_ids:
                continue

            if FLAGS.np_save:
                gtexts[thread_index].append(word_ids)
                gtext_strs[thread_index].append(text)

            ct0_ids = _text2ids(ct0, INPUT_TEXT_MAX_WORDS)

            title_ids = _text2ids(title, INPUT_TEXT_MAX_WORDS)
            real_title_ids = _text2ids(real_title, INPUT_TEXT_MAX_WORDS)

            if len(ct0_ids) == 0:
                ct0_ids = real_title_ids
                ct0 = real_title

            if num % 1000 == 0:
                print(text,
                      word_ids,
                      text2ids.ids2text(word_ids),
                      file=sys.stderr)
                print(ct0,
                      ct0_ids,
                      text2ids.ids2text(ct0_ids),
                      file=sys.stderr)

            image = l[1]
            url = l[2]

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(image),
                    'url': melt.bytes_feature(url),
                    'text_str': melt.bytes_feature(text),
                    'ct0_str': melt.bytes_feature(ct0),
                    'title_str': melt.bytes_feature(title),
                    'real_title_str': melt.bytes_feature(real_title),
                    'text': melt.int_feature(word_ids),
                    'ct0': melt.int_feature(ct0_ids),
                    'title': melt.int_feature(title_ids),
                    'real_title': melt.int_feature(real_title_ids),
                }))
            writer.write(example)

            global counter, max_num_words, sum_words
            with counter.get_lock():
                counter.value += 1
            word_ids_length = len(word_ids)
            with max_num_words.get_lock():
                if word_ids_length > max_num_words.value:
                    max_num_words.value = word_ids_length
            with sum_words.get_lock():
                sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
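
deal_file takes a thread_index and writes a per-index output file, which suggests a process-per-shard driver. A hedged sketch (names are assumptions; for the final texts_dict / text_strs_dict assignments to be visible in the parent, those would need to be multiprocessing.Manager().dict() instances rather than plain dicts):

import multiprocessing

def run_all(files):
    procs = []
    for thread_index, file in enumerate(files):
        p = multiprocessing.Process(target=deal_file, args=(file, thread_index))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()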