def _read_data(variant):
    """
    Lazily yield ``(uid, iid, timestamp)`` rows from a YooChoose data file.

    The archive path comes from ``_common.get_data``. For each of the
    ``buys`` and ``clicks`` files that is not found next to the archive,
    the external ``7z`` program is invoked to extract the archive into
    that same directory.

    Parameters
    ----------

    variant: str
        Substituted into ``yoochoose-{}.dat`` to select the file to read.

    Yields
    ------

    tuple of (int, int, float)
        The first and third comma-separated fields of a line converted
        to ``int``, followed by the second field parsed as a timestamp
        and converted to seconds with ``time.mktime``.

    Notes
    -----

    ``time.mktime`` interprets the parsed fields as local time even
    though the format string ends in a literal ``Z``, so the numeric
    value depends on the timezone of the machine running this code.
    """

    archive = _common.get_data(
        'https://s3-eu-west-1.amazonaws.com/' 'yc-rdata/yoochoose-data.7z',
        'yoochoose',
        'yoochoose.7z')
    target_dir = os.path.dirname(archive)

    # Extraction is attempted once per missing file, mirroring the
    # per-suffix existence check.
    for kind in ('buys', 'clicks'):
        extracted = os.path.join(target_dir, 'yoochoose-{}.dat'.format(kind))
        if os.path.exists(extracted):
            continue
        subprocess.check_call(['7z', '-o{}'.format(target_dir), 'x', archive])

    data_path = os.path.join(target_dir, 'yoochoose-{}.dat'.format(variant))

    with open(data_path, 'r') as handle:
        for row in handle:
            # Only the first three fields are used; any others are ignored.
            uid, stamp, iid = row.split(',')[:3]
            moment = datetime.datetime.strptime(stamp,
                                                '%Y-%m-%dT%H:%M:%S.%fZ')
            seconds = time.mktime(moment.timetuple())
            yield int(uid), int(iid), seconds
def read_movielens_latest():
    """
    Yield the ratings of the ``ml-latest`` MovieLens archive.

    The first item produced by ``_read_data`` is skipped (presumably the
    CSV header line -- confirm against ``_read_data``); the remaining
    items are passed through ``_make_contiguous`` with ``','`` as the
    separator and yielded one by one.
    """

    archive = _common.get_data(URL_PREFIX + URL_LATEST,
                               'movielens',
                               'movielens_latest.zip')
    member = os.path.join('ml-latest', 'ratings.csv')

    raw_rows = _read_data(archive, member)
    rows_after_first = itertools.islice(raw_rows, 1, None)

    for record in _make_contiguous(rows_after_first, separator=','):
        yield record
def read_movielens_100K():
    """
    Yield the parsed ratings of the MovieLens 100K archive.

    Every item produced by ``_read_data`` for ``ml-100k/u.data`` is
    handed to ``_parse_line`` with a tab separator, and the result is
    yielded.
    """

    archive = _common.get_data(URL_PREFIX + URL_100K,
                               'movielens',
                               'movielens_100k.zip')
    member = os.path.join('ml-100k', 'u.data')

    parsed_rows = (_parse_line(raw, separator='\t')
                   for raw in _read_data(archive, member))

    for record in parsed_rows:
        yield record
def _read_data(variant):
    """
    Lazily yield ``(uid, iid, rating, timestamp)`` rows of a ratings CSV.

    The file path comes from ``_common.get_data``. The remote name uses
    ``variant.title()`` while the local file name uses ``variant``
    unchanged.

    Parameters
    ----------

    variant: str
        Substituted into ``ratings_{}.csv`` to select the file.

    Yields
    ------

    tuple of (str, str, float, float)
        The four comma-separated fields of a line; the first two are
        kept as strings and the last two are converted with ``float``.
        A line that does not split into exactly four fields raises
        ``ValueError`` from the unpacking.
    """

    remote_url = BASE_URL + 'ratings_{}.csv'.format(variant.title())
    local_name = 'ratings_{}.csv'.format(variant)
    csv_path = _common.get_data(remote_url, 'amazon', local_name)

    with open(csv_path, 'r') as handle:
        for row in handle:
            fields = row.split(',')
            uid, iid, rating, timestamp = fields
            yield uid, iid, float(rating), float(timestamp)
def read_movielens_20M():
    """
    Yield the ratings of the MovieLens 20M archive.

    The first item produced by ``_read_data`` is skipped (presumably the
    CSV header line -- confirm against ``_read_data``); the remaining
    items are passed through ``_make_contiguous`` with ``','`` as the
    separator and yielded one by one.
    """

    archive = _common.get_data(URL_PREFIX + URL_20M,
                               'movielens',
                               'movielens_20M.zip')
    member = os.path.join('ml-20m', 'ratings.csv')

    raw_rows = _read_data(archive, member)
    rows_after_first = itertools.islice(raw_rows, 1, None)

    for record in _make_contiguous(rows_after_first, separator=','):
        yield record
def read_movielens_10M():
    """
    Yield the ratings of the MovieLens 10M archive.

    All items produced by ``_read_data`` for ``ml-10M100K/ratings.dat``
    are passed through ``_make_contiguous`` with ``'::'`` as the
    separator and yielded one by one. Unlike the CSV-based readers, no
    leading item is skipped.
    """

    archive = _common.get_data(URL_PREFIX + URL_10M,
                               'movielens',
                               'movielens_10M.zip')
    member = os.path.join('ml-10M100K', 'ratings.dat')

    raw_rows = _read_data(archive, member)

    for record in _make_contiguous(raw_rows, separator='::'):
        yield record
def read_amazon_co_purchasing():
    """
    Read the Amazon co-purchasing metadata into flat numpy arrays.

    The file path comes from ``_common.get_data``; ``_read_blocks``
    splits it into blocks and ``_parse_block`` turns each block into
    ``(item_id, categories, user_ids, ratings, dates)``. User
    identifiers and categories are remapped to consecutive integers in
    order of first appearance.

    Blocks that fail to parse are counted, reported on stdout and
    skipped; they contribute nothing to the returned arrays.

    Returns
    -------

    tuple of six np.ndarray
        ``(interaction_user_ids, interaction_item_ids,
        interaction_ratings, interaction_timestamps,
        feature_item_ids, feature_ids)``. The first four arrays have one
        entry per interaction; the last two have one entry per
        (item, category) pair. Ids are accumulated with typecode ``'i'``
        and ratings/timestamps with typecode ``'f'``.
    """

    path = _common.get_data(
        'https://snap.stanford.edu/data/bigdata/amazon/amazon-meta.txt.gz',
        'amazon',
        'amazon_co_purchasing.gz')

    # Raw identifier -> contiguous integer index, assigned on first sight.
    user_dict = {}
    feature_dict = {}

    interaction_user_ids = array.array('i')
    interaction_item_ids = array.array('i')
    interaction_ratings = array.array('f')
    interaction_timestamps = array.array('f')

    feature_item_ids = array.array('i')
    feature_ids = array.array('i')

    failed_parses = []
    total_parses = 0

    for block in _read_blocks(path):
        total_parses += 1

        try:
            (item_id, categories,
             user_ids, ratings, dates) = _parse_block(block)
        except Exception as e:
            # Best-effort parsing: remember the failure and move on.
            print('Parse failed')
            failed_parses.append((e, block))
            # Without this, the code below would run with the values left
            # over from the previous block (or raise NameError when the
            # very first block fails) and silently duplicate data.
            continue

        user_ids = [user_dict.setdefault(x, len(user_dict))
                    for x in user_ids]

        interaction_user_ids.extend(user_ids)
        interaction_item_ids.extend([item_id] * len(user_ids))
        interaction_ratings.extend(ratings)
        # Dates are converted to whole seconds via the local-time
        # interpretation used by time.mktime.
        interaction_timestamps.extend(
            [int(time.mktime(x.timetuple())) for x in dates])

        categories = [feature_dict.setdefault(x, len(feature_dict))
                      for x in categories]

        feature_item_ids.extend([item_id] * len(categories))
        feature_ids.extend(categories)

    print('Num of failed parses: {} (out of {})'.format(
        len(failed_parses), total_parses))

    return (np.array(interaction_user_ids),
            np.array(interaction_item_ids),
            np.array(interaction_ratings),
            np.array(interaction_timestamps),
            np.array(feature_item_ids),
            np.array(feature_ids))
def read_gowalla():
    """
    Yield every item that ``_read_data`` produces for the Gowalla file.

    The file path comes from ``_common.get_data``, called with ``URL``
    and the local name ``gowalla.txt.gz``.
    """

    archive = _common.get_data(URL,
                               'gowalla',
                               'gowalla.txt.gz')

    records = _read_data(archive)

    for record in records:
        yield record