Example #1
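# Fragment of a text-to-TFRecord conversion loop: each text is converted to
# word ids, truncated (and optionally padded) to TEXT_MAX_WORDS, and written as
# a tf.train.Example holding the image name, image feature, ids and raw text.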
            continue
        word_ids = text2ids.text2ids(text,
                                     seg_method=FLAGS.seg_method,
                                     feed_single=FLAGS.feed_single,
                                     allow_all_zero=True,
                                     pad=False)
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
            #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
            #print('\t'.join(words), file=sys.stderr)
            print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if len(word_ids) == 0:
            continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if writer is not None:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(img_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(text),
                }))
            writer.write(example)
        else:
            count += 1

if FLAGS.mode != 1:
    if writer is not None:
Example #2
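# Deduplicate the saved texts by their string form, pad each kept id sequence to
# the longest length seen, optionally cap the count and shuffle the result.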
texts = np.load(FLAGS.dir + '/texts.npy')
text_strs = np.load(FLAGS.dir + '/text_strs.npy')

distinct_texts = []
distinct_text_strs = []

maxlen = 0
for text in texts:
    if len(text) > maxlen:
        maxlen = len(text)

text_set = set()
for text, text_str in zip(list(texts), list(text_strs)):
    if text_str not in text_set:
        text_set.add(text_str)
        distinct_texts.append(gezi.pad(text, maxlen))
        distinct_text_strs.append(text_str)

        if len(distinct_texts) == FLAGS.max_texts:
            print('stop at', FLAGS.max_texts, file=sys.stderr)
            break

print('num ori texts:', len(texts))
print('num distinct texts:', len(distinct_texts))

distinct_texts = np.array(distinct_texts)
distinct_text_strs = np.array(distinct_text_strs)

if FLAGS.shuffle:
    distinct_texts, distinct_text_strs = gezi.unison_shuffle(
        distinct_texts, distinct_text_strs)
Example #3
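# Load a trained predictor and a vocabulary, convert the candidate texts in
# test.txt to padded id lists, then score image features against those texts.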
IMAGE_FEATURE_LEN = 1000

predictor = melt.Predictor('./model.ckpt-12000')

vocabulary.init()
vocab = vocabulary.vocab 
#vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)

ids_list = []
text_list = []
for line in open('./test.txt'): 
  text = line.strip().split('\t')[-1]
  text_list.append(text)
  words = line.split()
  ids = [vocab.id(word) for word in text.split(WORDS_SEP) if vocab.has(word) or ENCODE_UNK]
  ids = gezi.pad(ids, TEXT_MAX_WORDS)
  ids_list.append(ids)
#ids_list = np.array(ids_list)


def bulk_predict(predictor, images, texts):
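  """Fetch the 'score' output for a batch of image features and text id lists,
  feeding the model's image and text placeholders."""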
  scores = predictor.inference('score', 
                               { '%s/%s'%(FLAGS.algo, FLAGS.image_feature_place): images,
                                 '%s/%s'%(FLAGS.algo, FLAGS.text_place): texts })
  return scores

def predict():
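  """Score tab-separated image feature lines from stdin against the pre-built
  text id lists."""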
  for line in sys.stdin:
    l = line.strip().split('\t')
    image_name = l[0]
    #image_feature = np.array([[float(x) for x in l[1:]]])
Example #4
def deal_file(file):
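    """Convert one tab-separated input file into TFRecords.

    Each input line holds an image name, a 0x01-separated image feature vector
    and a 0x01-separated list of texts; every valid text becomes one Example
    (or SequenceExample) record and updates the shared counters.
    """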
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d'%(img, len(image_feature))
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

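            # only the first valid text of an image updates the image-level
            # counters and (optionally) the numpy arrays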
            is_top_text = True
            for text in texts:
                if text.strip() == '':
                    continue

                words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                word_ids = text2ids.words2ids(words,
                                              feed_single=FLAGS.feed_single,
                                              allow_all_zero=True,
                                              pad=False)
                word_ids_length = len(word_ids)
                if num % 1000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    continue
                if is_luanma(words, word_ids):  # luanma = garbled / mojibake text
                    print('luanma',
                          img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        # for a fixed validation set, only take one click per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
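For reference, records written by this script can be read back with standard TensorFlow parsing. A minimal sketch, assuming the variable-length 'text' layout from the Example branch above; the feature names and the feature-vector length must match what deal_file actually wrote, and record_files is a placeholder for the output files:

import tensorflow as tf

IMAGE_FEATURE_LEN = 1000  # must match the length used when writing

def parse_record(serialized):
    # names and types mirror the features written in deal_file above
    features = tf.io.parse_single_example(
        serialized,
        features={
            'image_name': tf.io.FixedLenFeature([], tf.string),
            'image_feature': tf.io.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
            'text': tf.io.VarLenFeature(tf.int64),
            'text_str': tf.io.FixedLenFeature([], tf.string),
        })
    # 'text' was written without padding, so it comes back as a sparse tensor
    features['text'] = tf.sparse.to_dense(features['text'])
    return features

# usage sketch:
# dataset = tf.data.TFRecordDataset(record_files).map(parse_record)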
Example #5
def deal_file(file):
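    """Variant of the conversion above that reads the image file itself and
    stores the encoded image bytes ('image_data') in each record instead of a
    precomputed 'image_feature' vector.
    """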
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_path = os.path.join(FLAGS.image_dir, img.replace('/', '_'))
            encoded_image = melt.read_image(image_path)

            is_top_text = True
            for text in texts:
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is not used anymore
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        # this variant keeps the encoded image bytes (no precomputed feature vector here)
                        image_features.append(encoded_image)

                    if FLAGS.num_max_records > 0:
                        # for a fixed validation set, only take one click per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example #6
def build_features(index):
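    """Build TFRecords for one fold of the impression log.

    Every impression row yields one record per kept candidate news item, with
    user history, entity, category and freshness features. Records are written
    per day when FLAGS.train_by_day is set, and negatives can be sharded across
    FLAGS.neg_parts.
    """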
    total = len(df)
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
    df_ = df.iloc[start:end]

    num_records = 0

    buffer_size = None if (
        FLAGS.mark != 'train'
        or not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
    folder_name = FLAGS.mark
    if FLAGS.neg_parts > 1:
        folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
        os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
        ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
    writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size)

    if FLAGS.mark == 'train' and FLAGS.train_by_day:
        # 2019 11 9 -> 11 14
        num_days = 7
        num_records_list = [0] * num_days
        ofiles = []
        writers = []
        for i in range(num_days):
            os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
            ofiles += [
                f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP'
            ]
            writers += [
                melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)
            ]

    for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
        time_ = row['time']
        day = int(time_.split()[0].split('/')[1]) - 9
        if FLAGS.day is not None and day != FLAGS.day:
            continue

        x = to_datetime(time_)
        weekday = x.weekday()
        hour = x.hour
        # timestamp = to_timestamp(x)
        timestamp = row['timestamp']

        impressions = row['impressions'].split()
        impression_id = row['impression_id']
        uid = uid_vocab.id(row['uid'])

        try:
            history = [
                did_vocab.id(x) for x in reversed(row['history'].split())
            ]
        except Exception:
            # print(row['history'], row['impression_id'])
            history = []

        feature = {}
        feature['uid_'] = row['uid']
        feature['uid'] = uid
        feature['day'] = day
        feature['weekday'] = weekday
        feature['hour'] = hour
        feature['impression_id'] = impression_id
        feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
        feature['impression_len'] = len(impressions)
        feature['hist_len'] = len(history)
        feature['history'] = history
        if FLAGS.record_padded:
            feature['history'] = gezi.pad(feature['history'],
                                          FLAGS.max_history)
        else:
            feature['history'] = feature['history'][:FLAGS.max_history]

        if FLAGS.use_impressions:
            feature['impressions'] = [
                did_vocab.id(x.split('-')[0]) for x in impressions
            ]

        # cat and sub_cat of the current doc

        # entities and entity types of the current doc

        feature['history_title_entities'] = []
        feature['history_title_entity_types'] = []
        feature['history_abstract_entities'] = []
        feature['history_abstract_entity_types'] = []
        for did in history:
            if did == 0:
                break
            did = did_vocab.key(did)
            news = news_info[did]

            try:
                title_entities = json.loads(news['title_entities'])
                for i, m in enumerate(title_entities):
                    if i == 2:
                        break
                    entity = m['WikidataId']
                    feature['history_title_entities'] += [
                        entity_vocab.id(entity)
                    ]
                    feature['history_title_entity_types'] += [
                        entity_type_vocab.id(m['Type'])
                    ]
            except Exception:
                pass

            try:
                abstract_entities = json.loads(news['abstract_entities'])
                for i, m in enumerate(abstract_entities):
                    if i == 2:
                        break
                    entity = m['WikidataId']
                    feature['history_abstract_entities'] += [
                        entity_vocab.id(entity)
                    ]
                    feature['history_abstract_entity_types'] += [
                        entity_type_vocab.id(m['Type'])
                    ]
            except Exception:
                pass

        if FLAGS.record_padded:
            feature['history_title_entities'] = pad(
                feature['history_title_entities'],
                FLAGS.max_history * FLAGS.max_his_title_entities)
            feature['history_title_entity_types'] = gezi.pad(
                feature['history_title_entity_types'],
                FLAGS.max_history * FLAGS.max_his_title_entities)
            feature['history_abstract_entities'] = pad(
                feature['history_abstract_entities'],
                FLAGS.max_history * FLAGS.max_his_abstract_entities)
            feature['history_abstract_entity_types'] = pad(
                feature['history_abstract_entity_types'],
                FLAGS.max_history * FLAGS.max_his_abstract_entities)
        else:
            feature['history_title_entities'] = feature[
                'history_title_entities'][:FLAGS.max_history *
                                          FLAGS.max_his_title_entities]
            feature['history_title_entity_types'] = feature[
                'history_title_entity_types'][:FLAGS.max_history *
                                              FLAGS.max_his_title_entities]
            feature['history_abstract_entities'] = feature[
                'history_abstract_entities'][:FLAGS.max_history *
                                             FLAGS.max_his_abstract_entities]
            feature['history_abstract_entity_types'] = feature[
                'history_abstract_entity_types'][:FLAGS.max_history * FLAGS.
                                                 max_his_abstract_entities]

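        # when negatives are sharded across neg_parts, shuffle the impression
        # indexes so each shard keeps a random subset of the non-clicked items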
        if FLAGS.neg_parts > 1:
            indexes = list(range(len(impressions)))
            np.random.shuffle(indexes)

        prev_cat, prev_sub_cat = X, X
        recall_cats, recall_sub_cats = defaultdict(int), defaultdict(int)
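        # count how often each category / sub-category occurs among the recalled
        # impressions (used by the commented-out ratio features below)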
        for i, impression in enumerate(impressions):
            did_ = impression.split('-')[0]
            news = news_info[did_]
            cat, sub_cat = news['cat'], news['sub_cat']
            recall_cats[cat] += 1
            recall_sub_cats[sub_cat] += 1

        for i, impression in enumerate(impressions):
            if '-' in impression:
                did_, click = impression.split('-')
            else:
                did_, click = impression, '0'
            click = int(click)

            if FLAGS.neg_parts > 1:
                if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
                    continue

            start_timestamp = start_timestamps[did_]
            fresh = timestamp - start_timestamp
            did = did_vocab.id(did_)

            feature['fresh'] = fresh
            feature['did_in_train'] = int(did_vocab2.has(did_))

            feature['click'] = click
            feature['did_'] = did_
            feature['did'] = did
            feature['id'] = impression_id * 100 + i
            feature['position'] = i

            news = news_info[did_]

            feature['cat'] = cat_vocab.id(news['cat'])
            feature['sub_cat'] = scat_vocab.id(news['sub_cat'])
            feature['title_len'] = len(news['title'].split())
            try:
                feature['abstract_len'] = len(news['abstract'].split())
            except Exception:
                # Nan
                feature['abstract_len'] = 0

            feature['title_entities'] = []
            feature['title_entity_types'] = []
            feature['abstract_entities'] = []
            feature['abstract_entity_types'] = []

            try:
                title_entities = json.loads(news['title_entities'])
                for m in title_entities:
                    entity = m['WikidataId']
                    feature['title_entities'].append(entity_vocab.id(entity))
                    feature['title_entity_types'].append(
                        entity_type_vocab.id(m['Type']))
            except Exception:
                pass

            try:
                abstract_entities = json.loads(news['abstract_entities'])
                for m in abstract_entities:
                    entity = m['WikidataId']
                    feature['abstract_entities'].append(
                        entity_vocab.id(entity))
                    feature['abstract_entity_types'].append(
                        entity_type_vocab.id(m['Type']))
            except Exception:
                pass

            if FLAGS.record_padded:
                for key in ['title_entities', 'title_entity_types']:
                    feature[key] = pad(feature[key], FLAGS.max_title_entities)

                for key in ['abstract_entities', 'abstract_entity_types']:
                    feature[key] = pad(feature[key],
                                       FLAGS.max_abstract_entities)

            # feature['impression_prev_cat'] = prev_cat
            # feature['impression_prev_sub_cat'] = prev_sub_cat

            # prev_cat = cat_vocab.id(news['cat'])
            # prev_sub_cat = scat_vocab.id(news['sub_cat'])

            # feature['impression_cat_ratio'] = recall_cats[news['cat']] / len(impressions)
            # feature['impression_sub_cat_ratio'] = recall_sub_cats[news['sub_cat']] / len(impressions)

            if FLAGS.use_impressions:
                feature['impressions'] = feature['impressions'][
                    max(0, i - 5):min(len(impressions), i + 4)]
                if FLAGS.record_padded:
                    feature['impressions'] = gezi.pad(feature['impressions'],
                                                      FLAGS.max_impressions)

            feature_ = {}
            for key in feature:
                feature_[key] = feature[key]
                if isinstance(feature[key], (list, tuple)) and not feature[key]:
                    feature_[key] = [X]
            for key in feature_:
                try:
                    feature_[key] = melt.gen_feature(feature_[key])
                except Exception:
                    print(key, feature[key])
                    print(traceback.format_exc())
                    exit(0)

            record = tf.train.Example(features=tf.train.Features(
                feature=feature_))

            if FLAGS.mark == 'train' and FLAGS.train_by_day:
                writer = writers[day]

            writer.write(record)

            if FLAGS.mark == 'train' and FLAGS.train_by_day:
                num_records_list[day] += 1
            else:
                num_records += 1

    if FLAGS.mark == 'train' and FLAGS.train_by_day:
        for i in range(num_days):
            writers[i].close()
            if num_records_list[i] == 0:
                os.system('rm -rf %s' % ofiles[i])
            else:
                ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
                os.system('mv %s %s' % (ofiles[i], ofile2))
    else:
        writer.close()
        if num_records == 0:
            os.system('rm -rf %s' % ofile)
        else:
            ofile2 = ofile.replace('.TMP', f'.{num_records}')
            os.system('mv %s %s' % (ofile, ofile2))
Example #7
def deal_file(file, thread_index):
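    """Per-thread conversion: each line holds an image name, a feature vector and
    its texts; every non-empty text is converted to word ids and written as one
    Example record.
    """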
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)
            l = line.rstrip().split('\t')
            img = l[0]
            img_end = IMAGE_FEATURE_LEN + 1
            img_feature = [float(x) for x in l[1:img_end]]
            texts = [x.split('\x01')[0] for x in l[img_end:]]
            for text in texts:
                if text.strip() == '':
                    continue
                # @TODO: the text -> ids conversion should be moved out so online code can share it for evaluation or for feed_dict use
                #words = segmentor.Segment(text, FLAGS.seg_method)
                #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]
                word_ids = text2ids.text2ids(text,
                                             seg_method=FLAGS.seg_method,
                                             feed_single=FLAGS.feed_single,
                                             allow_all_zero=True,
                                             pad=False)
                word_ids_length = len(word_ids)
                if num % 1000 == 0:
                    print(text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if FLAGS.np_save:
                    gtexts[thread_index].append(word_ids)
                    gtext_strs[thread_index].append(text)

                # add position info? weight info? or @TODO add click count info
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image_name': melt.bytes_feature(img),
                        'image_feature': melt.float_feature(img_feature),
                        'text': melt.int_feature(word_ids),
                        'text_str': melt.bytes_feature(text),
                    }))
                writer.write(example)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]