def _iter_trectext_documents_multiprocessing_worker_(document_path):
    """Parse a TRECTEXT file and push its documents onto a result queue.

    Worker configuration (encoding, document_ids, strip_html,
    replace_digits, digit_regex, tokenize, ignore_words, result_queue)
    is carried as attributes on the function object itself — presumably
    set by the multiprocessing pool initializer (not visible here).

    Returns the number of documents emitted for this file.
    """
    logging.info('Iterating over %s.', document_path)

    # Alias the function object once; all worker state lives in its
    # attributes and the long name makes repeated access unreadable.
    state = _iter_trectext_documents_multiprocessing_worker_

    num_documents = 0

    with open(document_path, 'r', encoding=state.encoding) as f:
        for doc_id, text in _parse_trectext(f):
            # Honor the optional document-id whitelist.
            if state.document_ids and doc_id not in state.document_ids:
                continue

            # Concatenate document lines.
            text = ' '.join(text)

            if state.strip_html:
                text = io_utils.strip_html(text)

            if state.replace_digits:
                text = state.digit_regex.sub('<num>', text)

            if state.tokenize:
                tokens = io_utils.tokenize_text(
                    text, ignore_words=state.ignore_words)
                state.result_queue.put((doc_id, tokens))
            else:
                state.result_queue.put((doc_id, text))

            num_documents += 1

    return num_documents
def main():
    """Convert a gzipped Amazon reviews JSON file into sharded TRECTEXT.

    One TRECTEXT document is written per review, identified by
    ``<asin>_<reviewerID>_<unixReviewTime>``; an optional product
    whitelist restricts which reviews are kept.

    Returns -1 when logging cannot be configured, otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--loglevel', type=str, default='INFO')
    parser.add_argument('--shard_size',
                        type=argparse_utils.positive_int,
                        default=(1 << 14))
    parser.add_argument('reviews_file',
                        type=argparse_utils.existing_file_path)
    parser.add_argument('--product_list',
                        type=argparse_utils.existing_file_path,
                        default=None)
    parser.add_argument('--trectext_out',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    # Optional whitelist of product identifiers (one per line).
    product_whitelist = None
    if args.product_list:
        with open(args.product_list, 'r') as f_product_list:
            product_whitelist = set(
                line.strip() for line in f_product_list)

        logging.info('Only considering white list of %d products.',
                     len(product_whitelist))

    writer = trec_utils.ShardedTRECTextWriter(
        args.trectext_out, args.shard_size)

    with gzip.open(args.reviews_file, 'r') as f_reviews:
        for idx, raw_line in enumerate(f_reviews):
            review = json.loads(raw_line.decode('utf8'))

            product_id = review['asin']
            if product_whitelist and product_id not in product_whitelist:
                continue

            document_id = '{product_id}_{reviewer_id}_{review_time}'.format(
                product_id=product_id,
                reviewer_id=review['reviewerID'],
                review_time=review['unixReviewTime'])

            # Summary first, then the full review body.
            raw_document = '{0} \n{1}'.format(
                review['summary'], review['reviewText'])
            tokens = io_utils.tokenize_text(raw_document)

            writer.write_document(document_id, ' '.join(tokens))

            if (idx + 1) % 1000 == 0:
                logging.info('Processed %d reviews.', idx + 1)

    writer.close()

    logging.info('All done!')
def tokenizer_fn(text):
    """Tokenize *text*, keeping only in-dictionary tokens.

    ``None`` input passes through as ``None`` (sentinel for a missing
    document).
    """
    if text is None:
        return None

    return [token
            for token in io_utils.tokenize_text(text)
            if dictionary.has_token(token)]
def main():
    """Convert a gzipped Amazon product metadata file into sharded TRECTEXT.

    Products with both a title and a description are written as one
    TRECTEXT document each (keyed by ASIN); an optional product
    whitelist restricts which products are kept.

    Returns -1 when logging cannot be configured, otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--loglevel', type=str, default='INFO')
    parser.add_argument('--shard_size',
                        type=argparse_utils.positive_int,
                        default=(1 << 14))
    parser.add_argument('meta_file',
                        type=argparse_utils.existing_file_path)
    parser.add_argument('--product_list',
                        type=argparse_utils.existing_file_path,
                        default=None)
    parser.add_argument('--trectext_out',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    if args.product_list:
        # Optional whitelist of product identifiers (one per line).
        with open(args.product_list, 'r') as f_product_list:
            product_list = set(product_id.strip()
                               for product_id in f_product_list)

        logging.info('Only considering white list of %d products.',
                     len(product_list))
    else:
        product_list = None

    writer = trec_utils.ShardedTRECTextWriter(args.trectext_out,
                                              args.shard_size)

    # Derive the department name from the file name, e.g.
    # "meta_Clothing_Shoes_and_Jewelry.json.gz" -> "Clothing Shoes & Jewelry".
    department = ' '.join(
        os.path.basename(args.meta_file).split('.')[0].split('_')[1:]).replace(
        ' and ', ' & ')

    logging.info('Department: %s', department)

    with gzip.open(args.meta_file, 'r') as f_meta:
        for i, raw_line in enumerate(f_meta):
            raw_line = raw_line.decode('utf8')

            # The metadata dump is Python-literal formatted, not strict
            # JSON; literal_eval parses it without executing code.
            product = ast.literal_eval(raw_line)

            product_id = product['asin']

            if product_list and product_id not in product_list:
                continue

            if 'description' in product and 'title' in product:
                product_title = product['title']
                product_description = \
                    io_utils.strip_html(product['description'])

                product_document = '{0} \n{1}'.format(product_title,
                                                      product_description)
                product_document = io_utils.tokenize_text(product_document)

                # BUG FIX: the original call passed only one argument for
                # two placeholders, which raised a formatting error at
                # DEBUG level; product_id was missing.
                logging.debug('Product %s has description of %d tokens.',
                              product_id, len(product_document))

                writer.write_document(product_id, ' '.join(product_document))
            else:
                logging.debug(
                    'Filtering product %s due to missing description.',
                    product_id)

                continue

            if (i + 1) % 1000 == 0:
                logging.info('Processed %d products.', i + 1)

    writer.close()

    logging.info('All done!')
def main():
    """Convert a gzipped Amazon product metadata file into sharded TRECTEXT.

    Products with both a title and a description are written as one
    TRECTEXT document each (keyed by ASIN); an optional product
    whitelist restricts which products are kept.

    Returns -1 when logging cannot be configured, otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--loglevel', type=str, default='INFO')
    parser.add_argument('--shard_size',
                        type=argparse_utils.positive_int,
                        default=(1 << 14))
    parser.add_argument('meta_file',
                        type=argparse_utils.existing_file_path)
    parser.add_argument('--product_list',
                        type=argparse_utils.existing_file_path,
                        default=None)
    parser.add_argument('--trectext_out',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    if args.product_list:
        # Optional whitelist of product identifiers (one per line).
        with open(args.product_list, 'r') as f_product_list:
            product_list = set(
                product_id.strip() for product_id in f_product_list)

        logging.info('Only considering white list of %d products.',
                     len(product_list))
    else:
        product_list = None

    writer = trec_utils.ShardedTRECTextWriter(
        args.trectext_out, args.shard_size)

    # Derive the department name from the file name, e.g.
    # "meta_Clothing_Shoes_and_Jewelry.json.gz" -> "Clothing Shoes & Jewelry".
    department = ' '.join(
        os.path.basename(args.meta_file).split('.')[0]
        .split('_')[1:]).replace(' and ', ' & ')

    logging.info('Department: %s', department)

    with gzip.open(args.meta_file, 'r') as f_meta:
        for i, raw_line in enumerate(f_meta):
            raw_line = raw_line.decode('utf8')

            # The metadata dump is Python-literal formatted, not strict
            # JSON; literal_eval parses it without executing code.
            product = ast.literal_eval(raw_line)

            product_id = product['asin']

            if product_list and product_id not in product_list:
                continue

            if 'description' in product and 'title' in product:
                product_title = product['title']
                product_description = \
                    io_utils.strip_html(product['description'])

                product_document = '{0} \n{1}'.format(
                    product_title, product_description)
                product_document = io_utils.tokenize_text(product_document)

                # BUG FIX: the original call passed only one argument for
                # two placeholders, which raised a formatting error at
                # DEBUG level; product_id was missing.
                logging.debug('Product %s has description of %d tokens.',
                              product_id, len(product_document))

                writer.write_document(
                    product_id, ' '.join(product_document))
            else:
                logging.debug(
                    'Filtering product %s due to missing description.',
                    product_id)

                continue

            if (i + 1) % 1000 == 0:
                logging.info('Processed %d products.', i + 1)

    writer.close()

    logging.info('All done!')
def tokenize_fn(text):
    """Return the tokens of *text* that are present in the dictionary."""
    tokens = io_utils.tokenize_text(text)

    return [token for token in tokens if dictionary.has_token(token)]