def main(_):
    """Build description, attribute and value vocabularies and write them out.

    Reads every JSON file listed in ``FLAGS.inputs``, counts description
    tokens (from each product's ``clean_text`` and ``clean_title``),
    attribute names and per-attribute values (from each product's ``specs``
    mapping), prunes the counts using ``FLAGS.min_value``,
    ``FLAGS.max_attr`` and ``FLAGS.min_desc``, and finally writes
    ``desc.txt``, ``attr.txt``, ``value.txt`` and ``stats.txt`` into
    ``FLAGS.output_dir`` (created if it does not exist).

    Args:
        _: Unused positional argument (presumably the argv passed by the
            application runner -- confirm against the caller).
    """
    if not os.path.exists(FLAGS.output_dir):
        print('Creating directory: %s' % FLAGS.output_dir)
        # makedirs (unlike mkdir) also creates missing parent directories.
        os.makedirs(FLAGS.output_dir)

    desc_counter = Counter()
    attr_counter = Counter()
    partial_counts = defaultdict(Counter)

    print('Processing data...')
    n = len(FLAGS.inputs)
    # Count from 1 so progress reads "File 1 of n" through "File n of n".
    for i, fname in enumerate(FLAGS.inputs, 1):
        print('File %i of %i: %s' % (i, n, fname))
        # Only the parse needs the file handle; release it before counting.
        with open(fname, 'r') as f:
            data = json.load(f)
        for product in data:
            desc_counter.update(product['clean_text'].split())
            desc_counter.update(product['clean_title'].split())
            for attr, value in product['specs'].items():
                attr_counter[attr] += 1
                partial_counts[attr][value] += 1

    # Filter values: drop values seen fewer than FLAGS.min_value times.
    partial_counts = {
        attr: {
            value: count
            for value, count in value_counts.items()
            if count >= FLAGS.min_value
        }
        for attr, value_counts in partial_counts.items()
    }

    # Remove singular attributes: after value filtering, an attribute with
    # at most one remaining value is dropped from both tables.
    singular = {
        attr
        for attr, value_counts in partial_counts.items()
        if len(value_counts) <= 1
    }
    attr_counter = Counter({
        attr: count
        for attr, count in attr_counter.items()
        if attr not in singular
    })
    partial_counts = {
        attr: value_counts
        for attr, value_counts in partial_counts.items()
        if attr not in singular
    }

    # Filter attrs: keep only the FLAGS.max_attr most frequent attributes.
    if FLAGS.max_attr is not None:
        # Stay a Counter so Vocab.build_from_counter receives the same type
        # whether or not this filter is applied.
        attr_counter = Counter(dict(attr_counter.most_common(FLAGS.max_attr)))
        # NOTE(review): partial_counts is not restricted to the attributes
        # kept here, so the value set may contain attributes that are absent
        # from the attribute vocab -- confirm this is intended.

    # Filter desc: drop tokens seen fewer than FLAGS.min_desc times.
    desc_counter = Counter({
        token: count
        for token, count in desc_counter.items()
        if count >= FLAGS.min_desc
    })

    desc_vocab = Vocab.build_from_counter(desc_counter)
    attr_vocab = Vocab.build_from_counter(attr_counter)
    value_set = ValueSet.build_from_partial_counts(partial_counts)

    print('Writing to disk...')
    desc_fname = os.path.join(FLAGS.output_dir, 'desc.txt')
    with open(desc_fname, 'w') as f:
        desc_vocab.write(f)

    attr_fname = os.path.join(FLAGS.output_dir, 'attr.txt')
    with open(attr_fname, 'w') as f:
        attr_vocab.write(f)

    value_fname = os.path.join(FLAGS.output_dir, 'value.txt')
    with open(value_fname, 'w') as f:
        value_set.write(f)

    # Summary of the final vocabulary sizes.
    stats_fname = os.path.join(FLAGS.output_dir, 'stats.txt')
    with open(stats_fname, 'w') as f:
        f.write('num_attrs: %i\n' % len(attr_vocab))
        f.write('num_vals: %i\n' % len(value_set))
        f.write('num_words: %i\n' % len(desc_vocab))

    print('Done')