Example #1
def load_problem_movielens_1m(all_features=False):
    '''
    Standard test dataset for recommendation systems
    From http://grouplens.org/datasets/movielens/
    '''
    folder = '../data/ml-1m'
    ratings = pandas.read_csv(folder + '/ratings.dat', sep='::', engine='python',
                              names=['user', 'movie', 'rating', 'timestamp'], header=None)
    ratings = ratings.drop('timestamp', axis=1)
    
    if all_features:
        users = pandas.read_csv(folder + '/users.dat', sep='::', engine='python',
                                names=['user', 'gender', 'age', 'occupation', 'zip'], header=None)
        movies = pandas.read_csv(folder + '/movies.dat', sep='::', engine='python',
                                 names=['movie', 'title', 'genres'], header=None)
        sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
        sparse_genres = pandas.DataFrame(sparse_genres.todense())
        movies = pandas.concat([movies[['movie']], sparse_genres], axis=1)    
        ratings = pandas.merge(pandas.merge(ratings, users, on='user'), movies, on='movie')

    answers = ratings['rating'].values
    ratings = ratings.drop('rating', axis=1)

    for feature in ratings.columns:
        _, ratings[feature] = numpy.unique(ratings[feature], return_inverse=True)
        
    trainX, testX, trainY, testY = train_test_split(ratings, answers, train_size=0.75, random_state=42)
    return trainX, testX, trainY, testY
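The all_features branch above uses CountVectorizer to turn pipe-separated genre strings into a multi-hot genre matrix. A minimal standalone sketch of that trick, on made-up genre strings rather than the MovieLens movies.dat file:

import pandas
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical MovieLens-style 'genres' values, only for illustration
genres = pandas.Series(['Action|Comedy', 'Comedy|Romance', 'Action'])

vectorizer = CountVectorizer()
sparse_genres = vectorizer.fit_transform(genres.map(lambda x: x.replace('|', ' ')))

# Each column is one genre, each row is one movie's indicator vector
# (get_feature_names_out needs scikit-learn >= 1.0; older releases expose get_feature_names)
print(vectorizer.get_feature_names_out())
print(sparse_genres.todense())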
Example #2
def make_reuters_data():
    np.random.seed(1234)
    random.seed(1234)
    from sklearn.feature_extraction.text import CountVectorizer
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open('../reuters/rcv1-v2.topics.qrels') as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        for did in list(did_to_cat):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open('../reuters/' + dat) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    X = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    Y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
    X = np.asarray(X.todense()) * np.sqrt(X.shape[1])

    p = np.random.permutation(X.shape[0])
    X = X[p]
    Y = Y[p]

    N = X.shape[0]
    save_hdf5(X[:N], Y[:N], 'reutersidf_train')
    save_hdf5(X[int(N * 4 / 5):N], Y[int(N * 4 / 5):N], 'reutersidf_test')
    save_hdf5(X[:N], Y[:N], 'reutersidf_total')
Example #3
def load_data():
    print('loading data....')
    from sklearn.feature_extraction.text import CountVectorizer
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open('dataset/reuters/rcv1-v2.topics.qrels') as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        copy_dc = copy.copy(did_to_cat)
        for did in copy_dc.keys():
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        print(dat + '....')
        with open('dataset/reuters/' + dat) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat.keys():
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    X = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    Y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
    X = np.asarray(X.todense()) * np.sqrt(X.shape[1])
    X = preprocessing.normalize(X, norm='l2') * 200
    X = X.astype('float32')
    X = X[:685000]  # for 100 minibatch training
    Y = Y[:685000]
    return X, Y
Example #4
def load_jdData(path='jd_Data/extract_comments4.txt',
                binary=False,
                threeClassed=True):
    print('loading JD reviews...')
    if binary:
        tfidfFile = 'jdReview_TOP2000tfidf' + '2class' + '.npy'
    if threeClassed:
        tfidfFile = 'jdReview_TOP2000tfidf' + '3class' + '.npy'
    data_dir = 'jd_Data/'

    import os
    if os.path.exists(os.path.join(data_dir, tfidfFile)):
        data = np.load(os.path.join(data_dir, tfidfFile), allow_pickle=True).item()
        x = data['data']
        y = data['label']
        return x, y

    df = loadSplitRawData(path, binary, threeClassed)

    data = np.array(df["data"])
    target = np.array(df["label"])

    from sklearn.feature_extraction.text import CountVectorizer
    # Convert a collection of text documents to a matrix of token counts
    # CountVectorizer turns the texts into a term-frequency matrix; fit_transform counts each term's occurrences
    # Terms are ranked by term frequency in descending order and only the top max_features are kept as the vocabulary
    x = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    # Transform a count matrix to a normalized tf or tf-idf representation
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(
        x)  # returns a scipy.sparse.csr_matrix; todense below converts it to a dense matrix
    x = x[:].astype(np.float32)
    print(x.dtype, x.size)
    x = np.asarray(x.todense()) * np.sqrt(
        x.shape[1])  # ensures each row's squared sum divided by D equals 1? D = x.shape[1]
    print('todense succeed')

    # p = np.random.permutation(x.shape[0])  # shuffle: randomly permute a sequence, or return a permuted range.
    # x = x[p]
    # y = y[p]
    # print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))

    print(x.shape, y.shape)

    np.save(os.path.join(data_dir, tfidfFile), {
        'data': x,
        'label': y
    })  #Save an array to a binary file in NumPy .npy format.
    return x, y
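The translated comment above asks whether multiplying the l2-normalized TF-IDF rows by sqrt(D) makes each row's squared sum divided by D equal 1. It does; a quick sanity-check sketch on a toy corpus (hypothetical snippets, not the JD review data):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Hypothetical review snippets, only for illustration
corpus = ['good phone fast delivery', 'bad battery', 'good battery good price']

counts = CountVectorizer(dtype=np.float64).fit_transform(corpus)
x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(counts)
x = np.asarray(x.todense()) * np.sqrt(x.shape[1])  # same scaling as above

# With unit-l2 rows scaled by sqrt(D), sum(row**2) / D == 1 for every row
print((x ** 2).sum(axis=1) / x.shape[1])  # approximately [1. 1. 1.]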
Example #5
def load_problem_movielens_1m(all_features=False):
    '''
    Standard test dataset for recommendation systems
    From http://grouplens.org/datasets/movielens/
    '''
    folder = '/Users/scottcronin/Dropbox/data/benchmark-movielens-1m'
    ratings = pandas.read_csv(folder + '/ratings.dat',
                              sep='::',
                              names=['user', 'movie', 'rating', 'timestamp'],
                              header=None,
                              engine='python')
    ratings = ratings.drop('timestamp', axis=1)

    if all_features:
        users = pandas.read_csv(
            folder + '/users.dat',
            sep='::',
            names=['user', 'gender', 'age', 'occupation', 'zip'],
            header=None,
            engine='python')
        movies = pandas.read_csv(folder + '/movies.dat',
                                 sep='::',
                                 names=['movie', 'title', 'genres'],
                                 header=None,
                                 engine='python')
        sparse_genres = CountVectorizer().fit_transform(
            movies.genres.map(lambda x: x.replace('|', ' ')))
        sparse_genres = pandas.DataFrame(sparse_genres.todense())
        movies = pandas.concat([movies[['movie']], sparse_genres], axis=1)
        ratings = pandas.merge(pandas.merge(ratings, users, on='user'),
                               movies,
                               on='movie')

    answers = ratings['rating'].values
    ratings = ratings.drop('rating', axis=1)

    for feature in ratings.columns:
        _, ratings[feature] = numpy.unique(ratings[feature],
                                           return_inverse=True)

    trainX, testX, trainY, testY = train_test_split(ratings,
                                                    answers,
                                                    train_size=0.75,
                                                    random_state=42)
    return trainX, testX, trainY, testY
Example #6
def build_movielens(folder, test_size, with_genres=True, with_users_info=True, with_rated_movies=True):
    print('load ratings....')
    rating_path = [name for name in os.listdir(folder) if 'ratings' in name][0]
    if 'csv' in rating_path:
        ratings = pd.read_csv(folder + rating_path, sep=',', header=0, dtype=dtypes,
                      names=['user', 'movie', 'rating', 'timestamp'])
        ratings['rating'] = (ratings['rating'] + 0.5).astype('int8')
    else:
        ratings = pd.read_csv(folder + rating_path, sep='::', header=None, engine='python',
                              names=['user', 'movie', 'rating', 'timestamp'], dtype=dtypes)
    ratings.rating = ratings.rating.astype('int8')
    
    print('calculation of monthes....')
    ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
    min_date = ratings.timestamp.min()
    ratings['monthes'] = (ratings.timestamp - min_date).dt.days // 28
    ratings.monthes /= ratings.monthes.max()
    ratings.monthes = ratings.monthes.astype('float16')
    dataset = ratings.drop('timestamp', axis=1)
    del(ratings); gc.collect()
    
    if with_genres:
        print('load movies....')
        movie_path = [name for name in os.listdir(folder) if 'movies' in name][0]
        if 'csv' in movie_path:
            movies = pd.read_csv(folder + movie_path, sep=',', header=0,
                                 names=['movie', 'title', 'genres'], 
                                 usecols=['movie', 'genres'], dtype=dtypes)
        else:
            movies = pd.read_csv(folder + movie_path, sep='::', engine='python',
                                 names=['movie', 'title', 'genres'], 
                                 usecols=['movie', 'genres'], header=None, dtype=dtypes)

        print('build genres ohe....')
        sparse_genres = CountVectorizer().fit_transform(movies.genres.map(lambda x: x.replace('|', ' ')))
        colnames = ['genre_{}'.format(col) for col in range(sparse_genres.shape[1])]
        sparse_genres = pd.DataFrame(sparse_genres.todense().astype('uint8'), columns=colnames)
        movies = pd.concat([movies[['movie']], sparse_genres], axis=1)
        del(sparse_genres); gc.collect()        

        print('join dataframes....')
        dataset = pd.merge(dataset, movies, on='movie', how='inner')
        del(movies); gc.collect()
    else:
        print('genres skipped')
    
    if with_users_info and 'users.dat' in os.listdir(folder):
        print('load users info....')
        users = pd.read_csv(folder + 'users.dat', sep='::', 
                            header=None, names=['user', 'gender', 'age', 'occupation', 'zip'],
                            engine='python')
        users.age    = (users.age / users.age.max()).astype('float16')
        users.gender = users.gender.apply(lambda x: 1 if x=='M' else 0).astype('int8')
        users.occupation = users.occupation.astype('int8')
        users.zip    = np.unique(users.zip.values, return_inverse=True)[1]
        users.zip = users.zip.astype('int16')
        dataset = pd.merge(dataset, users, on='user', how='left')
        del(users); gc.collect()
    else:
        print('users info skipped')

    np.random.seed(42)
    print('train/test split...')
    test_indexes = np.random.choice(dataset.index, int(test_size * dataset.shape[0]), replace=False)
    test = dataset.loc[test_indexes]
    train = dataset.drop(test_indexes)
    del(dataset); gc.collect();
    
    if with_rated_movies:
        print('building rated movies history (on train)....')
        rated_movies = train.groupby('user')['movie'].agg(lambda x: list(x))
        train.loc[:, 'ratedMovies'] = train.user.map(rated_movies)
        test.loc[:, 'ratedMovies']  = test.user.map(rated_movies)
        del(rated_movies); gc.collect()
    else:
        print('rated movies history skipped')
        
    print('preprocessing done....')
    return train, test
Example #7
def make_reuters_data(data_dir):
    np.random.seed(1234)
    from sklearn.feature_extraction.text import CountVectorizer
    from os.path import join
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x[:10000]
    y = y[:10000]
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))
    np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})
Example #8
File: dec.py Project: VikingMew/dec
def make_reuters_data():
  np.random.seed(1234)
  random.seed(1234)
  from sklearn.feature_extraction.text import CountVectorizer
  did_to_cat = {}
  cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
  with open('../reuters/rcv1-v2.topics.qrels') as fin:
    for line in fin.readlines():
      line = line.strip().split(' ')
      cat = line[0]
      did = int(line[1])
      if cat in cat_list:
        did_to_cat[did] = did_to_cat.get(did, []) + [cat]
    for did in list(did_to_cat.keys()):
      if len(did_to_cat[did]) > 1:
        del did_to_cat[did]

  dat_list = ['lyrl2004_tokens_test_pt0.dat', 
              'lyrl2004_tokens_test_pt1.dat',
              'lyrl2004_tokens_test_pt2.dat',
              'lyrl2004_tokens_test_pt3.dat',
              'lyrl2004_tokens_train.dat']
  data = []
  target = []
  cat_to_cid = {'CCAT':0, 'GCAT':1, 'MCAT':2, 'ECAT':3}
  del did
  for dat in dat_list:
    with open('../reuters/'+dat) as fin:
      for line in fin.readlines():
        if line.startswith('.I'):
          if 'did' in locals():
            assert doc != ''
            if did in did_to_cat:
              data.append(doc)
              target.append(cat_to_cid[did_to_cat[did][0]])
          did = int(line.strip().split(' ')[1])
          doc = ''
        elif line.startswith('.W'):
          assert doc == ''
        else:
          doc += line

  assert len(data) == len(did_to_cat)

  X = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
  Y = np.asarray(target)

  from sklearn.feature_extraction.text import TfidfTransformer
  X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
  X = np.asarray(X.todense())*np.sqrt(X.shape[1])

  p = np.random.permutation(X.shape[0])
  X = X[p]
  Y = Y[p]

  N = X.shape[0]
  write_db(X[:N], Y[:N], 'reutersidf_train')
  write_db(X[N*4//5:N], Y[N*4//5:N], 'reutersidf_test')
  write_db(X[:N], Y[:N], 'reutersidf_total')
  np.save('reutersidf.npy', Y[:N])

  N = 10000
  write_db(X[:N], Y[:N], 'reutersidf10k_train')
  write_db(X[N*4//5:N], Y[N*4//5:N], 'reutersidf10k_test')
  write_db(X[:N], Y[:N], 'reutersidf10k_total')
Example #9
def mk_reuters_data(data_dir='./', rate=None, skip=None):
    from os.path import join
    from sklearn.feature_extraction.text import CountVectorizer
    import os
    import wget

    for c, i in enumerate(REU_URLS):
        if os.path.exists(REU_DAT_gz[c].split('.gz')[0]):
            continue
        else:
            wget.download(i)
            os.system('gunzip ' + REU_DAT_gz[c])

    print('Generating TF-IDF features....')

    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]
    dat_list = ['lyrl2004_tokens_test_pt0.dat',
                'lyrl2004_tokens_test_pt1.dat',
                'lyrl2004_tokens_test_pt2.dat',
                'lyrl2004_tokens_test_pt3.dat',
                'lyrl2004_tokens_train.dat']
    data = []
    target = []
    ids = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                            ids.append(did)
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)

    X = x.todense()
    y = np.asarray(y)
    ids = np.asarray(ids)

    np.random.seed(1234)
    p = np.random.permutation(X.shape[0])
    np.save('document_vectors.npy', X[p])
    np.save('document_labels.npy', y[p])
    np.save('document_ids.npy', ids[p])
    np.save('perm.npy', p)
    mk_reuters_sublabels(rate=rate, skip=skip)

    return data, target, ids
Example #10
    def __preprocess_raw_data(self, data_dir):
        '''
        Pre-process raw reuters dataset.
        Args:
            data_dir: path to the dataset directory containing the tokenized data files and the qrels file.
        '''
        print('Start preprocessing reuters dataset...')
        print('Reading...')

        doc_id_to_cat = {}
        cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
        with open(os.path.join(data_dir, 'rcv1-v2.topics.qrels')) as f:
            for line in f.readlines():
                line = line.strip().split(' ')
                cat = line[0]
                doc_id = int(line[1])
                if cat in cat_list:
                    doc_id_to_cat[doc_id] = doc_id_to_cat.get(doc_id, []) + [cat]
            
            for doc_id in list(doc_id_to_cat.keys()):
                if len(doc_id_to_cat[doc_id]) > 1:
                    del doc_id_to_cat[doc_id]
        
        data_list = ['lyrl2004_tokens_test_pt0.dat',
                    'lyrl2004_tokens_test_pt1.dat',
                    'lyrl2004_tokens_test_pt2.dat',
                    'lyrl2004_tokens_test_pt3.dat',
                    'lyrl2004_tokens_train.dat']

        data = []
        target = []
        cat_to_id = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
        for data_file in data_list:
            is_start = True
            with open(os.path.join(data_dir, data_file)) as f:
                for line in f.readlines():
                    if line.startswith('.I'):
                        if not is_start:
                            assert doc != ''
                            if doc_id in doc_id_to_cat:
                                data.append(doc)
                                target.append(cat_to_id[doc_id_to_cat[doc_id][0]])
                        doc_id = int(line.strip().split(' ')[1])
                        is_start = False
                        doc = ''
                    elif line.startswith('.W'):
                        assert doc == ''
                    else:
                        doc += line

        print(len(data), 'and', len(doc_id_to_cat), 'and', len(target))
        assert len(data) == len(doc_id_to_cat)

        # Use bag-of-words features for each document
        x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
        y = np.asarray(target)

        x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
        x = x[:10000].astype(np.float32)
        y = y[:10000]
        x = np.asarray(x.todense()) * np.sqrt(x.shape[1])

        p = np.random.permutation(x.shape[0])
        x = x[p]
        y = y[p]

        print('Permutation finished')
        assert x.shape[0] == y.shape[0]
        x = x.reshape((x.shape[0], -1))
        np.save(os.path.join(data_dir, 'processed', 'reutersidf10k.npy'),
            {'data': x, 'label': y})
Example #11
def make_reuters_data(path, N):
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(osp.join(path, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open(osp.join(path, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    X = CountVectorizer(dtype=np.float64, max_features=2000,
                        max_df=0.90).fit_transform(data)
    Y = np.asarray(target)

    X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
    X = np.asarray(X.todense())

    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    p = np.random.permutation(X.shape[0])
    X = X[p]
    Y = Y[p]

    fo = h5py.File(osp.join(path, 'traindata.h5'), 'w')
    fo.create_dataset('X', data=X[:N * 6 // 7])
    fo.create_dataset('Y', data=Y[:N * 6 // 7])
    fo.close()

    fo = h5py.File(osp.join(path, 'testdata.h5'), 'w')
    fo.create_dataset('X', data=X[N * 6 // 7:N])
    fo.create_dataset('Y', data=Y[N * 6 // 7:N])
    fo.close()
Example #12
        for z in range(0, 780):
            if (bagOfWords[x][y] == stopwords[0][z]):
                bagOfWords[x][y] = ''
print("Filtering: ", bagOfWords)

for i in range(0, len(bagOfWords)):
    bagOfWords[i] = list(filter(bool, bagOfWords[i]))
    dataSet[i] = ' '.join(bagOfWords[i])
print("Clean words: ", dataSet)

#VSM & TFIDF#
VSM = CountVectorizer().fit_transform(dataSet)
TFIDF = TfidfTransformer().fit_transform(VSM)
#print (CountVectorizer().vocabulary)
print("VSM: ", VSM)
print("", VSM.todense())
print("TFIDF: ", TFIDF)
print(TFIDF.todense())

#LABEL CONVERSION#
#Pendidikan (Education) = 0, RPL = 1, TKJ = 2, MM = 3#
label_manual = [
    1,
    1,
    1,
    2,
    3,
    3,
    1,
    1,
    0,
Example #13
#print("\nClean words:")
#print(dataSet[4])
#print(dataSet[2])
#print(dataSet[1])

#VSM & TFIDF#
VSM = CountVectorizer().fit_transform(
    dataSet
)  # the vector space model (CountVectorizer) from scikit-learn converts the texts into count vectors
#tfidf = TfidfTransformer()  # the TF-IDF transformer from scikit-learn, assigned to the variable tfidf
TFIDF = TfidfTransformer().fit_transform(
    VSM
)  # the TF-IDF transformer from scikit-learn converts the counts into TF-IDF weights
#print (CountVectorizer().vocabulary)
print("\nVSM: \n", VSM)
print("\n", VSM.todense())
print("\nTFIDF: \n", TFIDF)
#print(TFIDF.todense())

#LABEL CONVERSION#
#Pendidikan (Education) = 0, RPL = 1, TKJ = 2, MM = 3#
label_manual = [
    1,
    1,
    1,
    2,
    3,
    3,
    1,
    1,
    0,
Example #14
def make_reuters_data(data_dir):
    """
    NOTE: RCV1-V2 data is heavy and not included.
    The data can be downloaded from http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
    Necessary files are:
        'rcv1-v2.topics.qrels'
        'lyrl2004_tokens_test_pt0.dat'
        'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat',
        'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    """
    np.random.seed(1234)
    from sklearn.feature_extraction.text import CountVectorizer
    from os.path import join
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        # did_to_cat = {k: did_to_cat[k] for k in list(did_to_cat.keys()) if len(did_to_cat[k]) > 1}
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = ['lyrl2004_tokens_test_pt0.dat',
                'lyrl2004_tokens_test_pt1.dat',
                'lyrl2004_tokens_test_pt2.dat',
                'lyrl2004_tokens_test_pt3.dat',
                'lyrl2004_tokens_train.dat']
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    print(len(data), 'and', len(did_to_cat))
    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x[:10000].astype(np.float32)
    print(x.dtype, x.size)
    y = y[:10000]
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))
    np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})
Example #15
def make_reuters_data(data_dir):

    # download reuters data
    data_path = data_dir

    print('Downloading data...')
    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt0.dat.gz -P %s'
        % data_path)
    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt1.dat.gz -P %s'
        % data_path)
    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt2.dat.gz -P %s'
        % data_path)
    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt3.dat.gz -P %s'
        % data_path)
    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_train.dat.gz -P %s'
        % data_path)

    os.system(
        'wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz -P %s'
        % data_path)

    print('Unzipping data...')
    os.system('gunzip %s/lyrl2004_tokens_test_pt0.dat.gz' % data_path)
    os.system('gunzip %s/lyrl2004_tokens_test_pt1.dat.gz' % data_path)
    os.system('gunzip %s/lyrl2004_tokens_test_pt2.dat.gz' % data_path)
    os.system('gunzip %s/lyrl2004_tokens_test_pt3.dat.gz' % data_path)
    os.system('gunzip %s/lyrl2004_tokens_train.dat.gz' % data_path)
    os.system('gunzip %s/rcv1-v2.topics.qrels.gz' % data_path)

    np.random.seed(1234)

    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(os.path.join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        # did_to_cat = {k: did_to_cat[k] for k in list(did_to_cat.keys()) if len(did_to_cat[k]) > 1}
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did
    for dat in dat_list:
        with open(os.path.join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line
        print(len(data), len(target))

    print(len(data), 'and', len(did_to_cat))
    #     assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x[:50000].astype(np.float32)
    print(x.dtype, x.size)
    y = y[:50000]
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))
    np.save(os.path.join(data_dir, 'reutersidf10k.npy'), {
        'data': x,
        'label': y
    })
Example #16
print(cutcorpus)

from sklearn.feature_extraction.text import CountVectorizer
# load stop words
vectorizer = CountVectorizer(stop_words=["我", "是"])
counts = vectorizer.fit_transform(cutcorpus).todense()
print(counts)
print(vectorizer.vocabulary_)

# import Euclidean distance
from sklearn.metrics.pairwise import euclidean_distances
vectorizer = CountVectorizer()
for x, y in [[0, 1], [0, 2], [1, 2]]:
    dist = euclidean_distances(counts[x], counts[y])
    print('Distance between document {} and document {}: {}'.format(x, y, dist))

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'Today is Friday. The weather is fine',
    'Because of the fine weather today, I decided to go out',
    'It rained on Friday afternoon and I had to go home '
]

words = CountVectorizer().fit_transform(corpus)
tfidf = TfidfTransformer().fit_transform(words)

print(words.todense())
print(tfidf)
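As a closing note, the CountVectorizer + TfidfTransformer pair used throughout these examples can be collapsed into a single TfidfVectorizer; with the default parameters used in this last snippet the result is the same. A minimal sketch on the same toy corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'Today is Friday. The weather is fine',
    'Because of the fine weather today, I decided to go out',
    'It rained on Friday afternoon and I had to go home '
]

# Equivalent to CountVectorizer().fit_transform(corpus) followed by TfidfTransformer()
tfidf = TfidfVectorizer().fit_transform(corpus)
print(tfidf.todense())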