Exemple #1
0
def boston_housing(root):
    """Fetch the Boston housing dataset (Housing Values in Suburbs of Boston).

    References:
        Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand
        for clean air. J. Environ. Economics and Management 5, 81–102.

        Belsley D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics.
        Identifying Influential Data and Sources of Collinearity.
        New York: Wiley.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/boston_housing/boston_housing.txt`
        `root/boston_housing/boston_housing.json`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in `/user/.../mydata/boston_housing`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by `assert_dirs(root, 'boston_housing')`, i.e. the
        dataset directory `root/boston_housing`.
    """
    begin = time.time()
    task_path = assert_dirs(root, 'boston_housing')

    # JSON file first, then the text table; each destination is resolved
    # right before its request is issued.
    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/dm/boston_house/boston_housing.json',
        gfile.path_join(task_path, 'boston_housing.json'),
    )
    rq.table(
        'https://raw.githubusercontent.com/Hourout/datasets/master/dm/boston_house/boston_housing.txt',
        gfile.path_join(task_path, 'boston_housing.txt'),
    )

    # Report the wall-clock duration as whole minutes plus remaining seconds.
    minutes, seconds = divmod(time.time() - begin, 60)
    print('boston_housing dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
Exemple #2
0
def adult(root):
    """Fetch the Adult census-income dataset.

    This data was extracted from the census bureau database found at
    http://www.census.gov/ftp/pub/DES/www/welcome.html

    Dataset facts:
        48842 instances, mix of continuous and discrete
        (train=32561, test=16281).
        45222 if instances with unknown values are removed
        (train=30162, test=15060).
        Duplicate or conflicting instances: 6.
        Class probabilities for the adult.all file:
        label '>50K'  : 23.93% / 24.78% (without unknowns)
        label '<=50K' : 76.07% / 75.22% (without unknowns)

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/adult/adult.txt`
        `root/adult/adult.json`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in `/user/.../mydata/adult`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by `assert_dirs(root, 'adult')`, i.e. the dataset
        directory `root/adult`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'adult')

    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/adult/adult.json'
    table_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/adult/adult.txt'

    json_target = gfile.path_join(task_path, 'adult.json')
    rq.json(json_source, json_target)
    table_target = gfile.path_join(task_path, 'adult.txt')
    rq.table(table_source, table_target)

    # divmod splits the elapsed seconds into (minutes, remaining seconds).
    elapsed = time.time() - t0
    print('adult dataset download completed, run time %d min %.2f sec' % divmod(elapsed, 60))
    return task_path
Exemple #3
0
def online_shopping_10_cats(root):
    """Fetch the Chinese online shopping reviews dataset.

    The dataset contains 60,000+ samples across about 10 categories (books,
    tablets, mobile phones, fruits, shampoos, water heaters, Mengniu,
    clothes, computers, hotels), including more than 30,000 positive reviews
    and more than 30,000 negative reviews.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json`
        `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_online_shopping_10_cats`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by
        `assert_dirs(root, 'chinese_reviews_online_shopping_10_cats')`, i.e.
        the dataset directory `root/chinese_reviews_online_shopping_10_cats`.
    """
    started = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_online_shopping_10_cats')

    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json',
        path_join(task_path, 'chinese_reviews_online_shopping_10_cats.json'),
    )
    # The remote table is published with a .csv suffix but stored locally
    # under a .txt name.
    rq.table(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.csv',
        path_join(task_path, 'chinese_reviews_online_shopping_10_cats.txt'),
    )

    minutes, seconds = divmod(time.time() - started, 60)
    print('chinese_reviews_online_shopping_10_cats dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
Exemple #4
0
def douban_movies(root):
    """Fetch the Chinese douban movies reviews dataset.

    The dataset includes 28 movies, over 700,000 users and over 2 million
    ratings.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json`
        `root/chinese_reviews_douban_movies/movies.txt`
        `root/chinese_reviews_douban_movies/ratings.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_douban_movies`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by
        `assert_dirs(root, 'chinese_reviews_douban_movies')`, i.e. the
        dataset directory `root/chinese_reviews_douban_movies`.
    """
    t_start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_douban_movies')

    json_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json'
    movies_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/movies.txt'
    ratings_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/ratings.txt'

    rq.json(json_url, path_join(task_path, 'chinese_reviews_douban_movies.json'))
    rq.table(movies_url, path_join(task_path, 'movies.txt'))

    # The ratings are requested as 13 numbered shards (ratings0.txt ...
    # ratings12.txt): the shard index is spliced in before the 4-character
    # '.txt' suffix.
    stem, suffix = ratings_url[:-4], ratings_url[-4:]
    shard_urls = []
    for shard in range(13):
        shard_urls.append(stem + str(shard) + suffix)

    # Request every shard through a process pool and stitch the results into
    # one table, keeping the order of shard_urls.
    with concurrent.futures.ProcessPoolExecutor() as pool:
        shard_frames = pool.map(_request_txt, shard_urls)
        ratings = pd.concat(shard_frames)
    ratings.to_csv(path_join(task_path, 'ratings.txt'), index=False)

    minutes, seconds = divmod(time.time() - t_start, 60)
    print('chinese_reviews_douban_movies dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
Exemple #5
0
def takeaway(root):
    """Fetch the Chinese takeaway reviews dataset.

    datasets url:`https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k/waimai_10k.csv`

    The dataset contains 12,000+ samples, including more than 4,000 positive
    reviews and more than 8,000 negative reviews.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_takeaway/chinese_reviews_takeaway.json`
        `root/chinese_reviews_takeaway/chinese_reviews_takeaway.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_takeaway`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by `assert_dirs(root, 'chinese_reviews_takeaway')`,
        i.e. the dataset directory `root/chinese_reviews_takeaway`.
    """
    clock_start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_takeaway')

    # The JSON file comes from the Hourout mirror; the review table is taken
    # from the upstream SophonPlus corpus and saved under a .txt name.
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_takeaway/chinese_reviews_takeaway.json'
    table_source = 'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/waimai_10k/waimai_10k.csv'

    json_target = path_join(task_path, 'chinese_reviews_takeaway.json')
    rq.json(json_source, json_target)
    table_target = path_join(task_path, 'chinese_reviews_takeaway.txt')
    rq.table(table_source, table_target)

    elapsed = time.time() - clock_start
    print('chinese_reviews_takeaway dataset download completed, run time %d min %.2f sec' % divmod(elapsed, 60))
    return task_path
Exemple #6
0
def ctrip_hotel(root):
    """Fetch the Ctrip hotel reviews dataset.

    datasets url:`https://github.com/SophonPlus/ChineseNlpCorpus/blob/
    master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv`

    Ctrip's review data set contains 7000+ samples, including more than
    5,000 positive reviews and more than 2,000 negative reviews.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.txt`
        `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_ctrip_hotel`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by
        `assert_dirs(root, 'chinese_reviews_ctrip_hotel')`, i.e. the dataset
        directory `root/chinese_reviews_ctrip_hotel`.
    """
    began = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_ctrip_hotel')

    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json',
        path_join(task_path, 'chinese_reviews_ctrip_hotel.json'),
    )
    # The review table comes from the upstream SophonPlus corpus (.csv) and
    # is saved locally under a .txt name.
    rq.table(
        'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv',
        path_join(task_path, 'chinese_reviews_ctrip_hotel.txt'),
    )

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_reviews_ctrip_hotel dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
Exemple #7
0
def sina_weibo_emotion4(root):
    """Fetch the Chinese Sina weibo 4-emotion reviews dataset.

    The dataset contains 360,000+ samples labelled with 4 emotions,
    including about 200,000 joys; anger, disgust, and low have more than
    50,000 each.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json`
        `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_sina_weibo_emotion4`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by
        `assert_dirs(root, 'chinese_reviews_sina_weibo_emotion4')`, i.e. the
        dataset directory `root/chinese_reviews_sina_weibo_emotion4`.
    Raises:
        requests.HTTPError: if one of the table parts answers with an HTTP
            error status.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_sina_weibo_emotion4')
    url_json = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json'
    # The table is published in three parts that are merged into one file.
    url_txt = ['https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_01.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_02.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_03.txt',]
    rq.json(url_json, path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.json'))

    frames = []
    for url in url_txt:
        response = requests.get(url)
        # Fail loudly on an HTTP error instead of parsing the error page as
        # CSV and silently writing a corrupt dataset.
        response.raise_for_status()
        frames.append(pd.read_csv(io.StringIO(response.content.decode('utf-8'))))
    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # accumulated rows on every pass, and seeding it with an empty frame
    # depends on deprecated pandas dtype handling for empty entries.
    data = pd.concat(frames)
    data.to_csv(path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.txt'), index=False)
    print('chinese_reviews_sina_weibo_emotion4 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
Exemple #8
0
def sina_weibo(root):
    """Fetch the Chinese Sina weibo reviews dataset.

    The dataset contains 110,000+ samples, including more than 59,000
    positive reviews and more than 59,000 negative reviews.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json`
        `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in
              `/user/.../mydata/chinese_reviews_sina_weibo`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by
        `assert_dirs(root, 'chinese_reviews_sina_weibo')`, i.e. the dataset
        directory `root/chinese_reviews_sina_weibo`.
    """
    t_begin = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_sina_weibo')

    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json'
    # Published as .csv remotely, stored locally under a .txt name.
    table_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.csv'

    json_target = path_join(task_path, 'chinese_reviews_sina_weibo.json')
    rq.json(json_source, json_target)
    table_target = path_join(task_path, 'chinese_reviews_sina_weibo.txt')
    rq.table(table_source, table_target)

    minutes, seconds = divmod(time.time() - t_begin, 60)
    print('chinese_reviews_sina_weibo dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
Exemple #9
0
def movielens(root):
    """Fetch the Movielens movies dataset.

    This dataset (ml-latest) describes 5-star rating and free-text tagging
    activity from [MovieLens](http://movielens.org), a movie recommendation
    service. It contains 27753444 ratings and 1108997 tag applications across
    58098 movies. These data were created by 283228 users between
    January 09, 1995 and September 26, 2018. This dataset was generated on
    September 26, 2018.

    Users were selected at random for inclusion. All selected users had rated
    at least 1 movies. No demographic information is included. Each user is
    represented by an id, and no other information is provided.

    The upstream data are contained in the files `genome-scores.csv`,
    `genome-tags.csv`, `links.csv`, `movies.csv`, `ratings.csv` and
    `tags.csv`.

    This is a *development* dataset. As such, it may change over time and is
    not an appropriate dataset for shared research results. See available
    *benchmark* datasets if that is your intent.

    This and other GroupLens data sets are publicly available for download at
    <http://grouplens.org/datasets/>.

    Data storage directory:
        With root = `/user/.../mydata` the files are stored as
        `root/movielens/movielens.json`
        `root/movielens/links.txt`
        `root/movielens/genome_tags.txt`
        `root/movielens/movies.txt`
        `root/movielens/genome_scores.txt`
        `root/movielens/ratings.txt`
        `root/movielens/tags.txt`

    Args:
        root: str, absolute path of the parent data directory.
              Example: to have the data in `/user/.../mydata/movielens`,
              pass `/user/.../mydata`.
    Returns:
        The path produced by `assert_dirs(root, 'movielens')`, i.e. the
        dataset directory `root/movielens`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'movielens')
    url_json = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/movielens.json'
    url_link_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/links.txt'
    url_genome_tags_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/genome_tags.txt'
    url_movies_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/movies.txt'
    url_genome_scores_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/genome_scores.txt'
    url_ratings_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/ratings.txt'
    url_tags_txt = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/imdb/tags.txt'

    # Files that are fetched with a single request each.
    rq.json(url_json, gfile.path_join(task_path, 'movielens.json'))
    rq.table(url_link_txt, gfile.path_join(task_path, 'links.txt'))
    rq.table(url_movies_txt, gfile.path_join(task_path, 'movies.txt'))
    rq.table(url_genome_tags_txt, gfile.path_join(task_path, 'genome_tags.txt'))

    # Files that are fetched as numbered shards (<name>0.txt, <name>1.txt,
    # ...) and merged into a single table:
    # (base URL, number of shards, local file name).
    sharded_tables = (
        (url_genome_scores_txt, 16, 'genome_scores.txt'),
        (url_ratings_txt, 21, 'ratings.txt'),
        (url_tags_txt, 2, 'tags.txt'),
    )
    for base_url, shard_count, filename in sharded_tables:
        # Splice the shard index in before the 4-character '.txt' suffix.
        stem, suffix = base_url[:-4], base_url[-4:]
        shard_urls = [stem + str(i) + suffix for i in range(shard_count)]
        with concurrent.futures.ProcessPoolExecutor() as executor:
            data = pd.concat(executor.map(_request_txt, shard_urls))
        # Every destination goes through gfile.path_join; the original used a
        # bare `path_join` for genome_scores.txt only, unlike every other
        # path in this function.
        data.to_csv(gfile.path_join(task_path, filename), index=False)

    print('movielens dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path