Example #1
0
def _read_data(variant):
    """Yield ``(user_id, item_id, timestamp)`` triples from the YooChoose data.

    Downloads the 7z archive if necessary, extracts it once when any of the
    expected ``.dat`` files is missing, then streams the requested
    ``variant`` ('buys' or 'clicks') line by line.
    """

    zip_path = _common.get_data('https://s3-eu-west-1.amazonaws.com/'
                                'yc-rdata/yoochoose-data.7z',
                                'yoochoose',
                                'yoochoose.7z')

    dest_dir = os.path.dirname(zip_path)

    # Bug fix: extract at most once. The original ran the extraction inside
    # the per-suffix loop, unpacking the whole archive up to twice when both
    # files were missing.
    if any(not os.path.exists(os.path.join(
            dest_dir, 'yoochoose-{}.dat'.format(suffix)))
           for suffix in ('buys', 'clicks')):
        subprocess.check_call(['7z',
                               '-o{}'.format(dest_dir),
                               'x',
                               zip_path])

    fname = os.path.join(dest_dir, 'yoochoose-{}.dat'.format(variant))
    with open(fname, 'r') as datafile:
        for line in datafile:
            # Only the first three CSV fields are used.
            uid, timestamp, iid = line.split(',')[:3]

            # Timestamps look like '2014-04-07T10:51:09.277Z'; convert to a
            # POSIX timestamp via the local-time struct, as the original did.
            timestamp = time.mktime(
                datetime.datetime.strptime(timestamp,
                                           '%Y-%m-%dT%H:%M:%S.%fZ')
                .timetuple())

            yield int(uid), int(iid), timestamp
Example #2
0
def read_movielens_latest():
    """Yield rows of the Movielens 'latest' ratings, skipping the CSV header."""

    # Reformatted for consistency with the other movielens readers
    # (single quotes, one argument per line, spaced arguments).
    zip_path = _common.get_data(URL_PREFIX + URL_LATEST,
                                'movielens',
                                'movielens_latest.zip')

    archive_path = os.path.join('ml-latest', 'ratings.csv')

    # islice(..., 1, None) drops the header row of ratings.csv.
    data = itertools.islice(_read_data(zip_path, archive_path), 1, None)

    for line in _make_contiguous(data, separator=','):
        yield line
Example #3
0
def read_movielens_100K():
    """Yield parsed interactions from the Movielens 100K dataset."""

    archive = _common.get_data(URL_PREFIX + URL_100K,
                               'movielens',
                               'movielens_100k.zip')

    # u.data is tab-separated inside the ml-100k directory of the archive.
    inner_path = os.path.join('ml-100k', 'u.data')

    for raw_line in _read_data(archive, inner_path):
        yield _parse_line(raw_line, separator='\t')
def _read_data(variant):
    """Stream ``(user, item, rating, timestamp)`` tuples for an Amazon variant."""

    file_path = _common.get_data(
        BASE_URL + 'ratings_{}.csv'.format(variant.title()), 'amazon',
        'ratings_{}.csv'.format(variant))

    with open(file_path, 'r') as datafile:
        for row in datafile:
            # Strict 4-way unpack: malformed rows raise ValueError.
            user, item, score, when = row.split(',')

            yield user, item, float(score), float(when)
Example #5
0
def read_movielens_20M():
    """Yield contiguous-id interactions from the Movielens 20M ratings."""

    archive = _common.get_data(URL_PREFIX + URL_20M,
                               'movielens',
                               'movielens_20M.zip')

    ratings_path = os.path.join('ml-20m', 'ratings.csv')

    # Drop the CSV header row before remapping ids to contiguous ranges.
    rows = itertools.islice(_read_data(archive, ratings_path), 1, None)

    yield from _make_contiguous(rows, separator=',')
Example #6
0
def read_movielens_10M():
    """Yield contiguous-id interactions from the Movielens 10M ratings."""

    archive = _common.get_data(URL_PREFIX + URL_10M,
                               'movielens',
                               'movielens_10M.zip')

    # ratings.dat has no header row and uses '::' as its field separator.
    ratings_path = os.path.join('ml-10M100K', 'ratings.dat')

    yield from _make_contiguous(_read_data(archive, ratings_path),
                                separator='::')
Example #7
0
def read_amazon_co_purchasing():
    """Parse the SNAP Amazon co-purchasing metadata dump.

    Returns a tuple of numpy arrays:
    ``(user_ids, item_ids, ratings, timestamps, feature_item_ids, feature_ids)``.
    Blocks that fail to parse are counted and skipped.
    """

    path = _common.get_data(
        'https://snap.stanford.edu/data/bigdata/amazon/amazon-meta.txt.gz',
        'amazon', 'amazon_co_purchasing.gz')

    # Map raw user / category identifiers to dense integer ids.
    user_dict = {}
    feature_dict = {}

    # Typed arrays keep the (large) intermediate buffers compact.
    interaction_user_ids = array.array('i')
    interaction_item_ids = array.array('i')
    interaction_ratings = array.array('f')
    interaction_timestamps = array.array('f')

    feature_item_ids = array.array('i')
    feature_ids = array.array('i')

    failed_parses = []
    total_parses = 0

    for block in _read_blocks(path):
        total_parses += 1
        try:
            (item_id, categories, user_ids, ratings,
             dates) = _parse_block(block)
        except Exception as e:
            print('Parse failed')
            failed_parses.append((e, block))
            # Bug fix: skip this block. The original fell through and reused
            # the previous iteration's item_id/user_ids/... (duplicating
            # data), or raised NameError if the very first block failed.
            continue

        user_ids = [user_dict.setdefault(x, len(user_dict)) for x in user_ids]

        interaction_user_ids.extend(user_ids)
        interaction_item_ids.extend([item_id] * len(user_ids))
        interaction_ratings.extend(ratings)
        interaction_timestamps.extend(
            [int(time.mktime(x.timetuple())) for x in dates])

        categories = [
            feature_dict.setdefault(x, len(feature_dict)) for x in categories
        ]

        feature_item_ids.extend([item_id] * len(categories))
        feature_ids.extend(categories)

    print('Num of failed parses: {} (out of {})'.format(
        len(failed_parses), total_parses))

    return (np.array(interaction_user_ids), np.array(interaction_item_ids),
            np.array(interaction_ratings), np.array(interaction_timestamps),
            np.array(feature_item_ids), np.array(feature_ids))
Example #8
0
def read_gowalla():
    """Stream interactions from the Gowalla check-in dataset."""

    archive = _common.get_data(URL, 'gowalla', 'gowalla.txt.gz')

    yield from _read_data(archive)