Example #1
import xmltodict

from dbm import DBM  # local database-manager module (see Example #5), not the stdlib dbm


def load_users(path):
    """Parse one XML <row> element per line and store each user in the DB."""
    dbm = DBM()
    cnt = 0
    with open(path, mode='r', encoding='utf-8') as fin:
        for line in fin:
            cnt += 1
            try:
                user = xmltodict.parse(line)['row']
                dbm.add_user(user)
                print(cnt)
            except Exception:
                print("ERROR")
                print(line)
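A minimal usage sketch; the file name and the one-<row>-per-line format are assumptions inferred from xmltodict.parse(line)['row'], not stated in the snippet:

load_users('Users.xml')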
Example #2
import xmltodict

from dbm import DBM


def load_posts(path):
    """Parse one XML <row> element per line and store each post in the DB."""
    dbm = DBM()
    dcnt = 0
    with open(path, mode='r', encoding='utf-8') as fin:
        for line in fin:
            dcnt += 1
            try:
                post = xmltodict.parse(line)['row']
                dbm.add_post(post)
            except Exception:
                print("ERROR")
                print(line)  # print the raw line: 'post' may be unbound here

            # Periodically recreate the handler, presumably to release memory.
            if dcnt > 1000000:
                dcnt = 0
                del dbm
                dbm = DBM()
Example #3
assert cfg.pretrain or not cfg.continue_learning
### append node types
while len(cfg.utype) < len(cfg.l_size):
    cfg.utype.append(pyrbm.UnitType.binary)

print("Shuffling data...")
if not cfg.gethidrep:
    dataset.shuffle()

print("Initializing RBM...")
pyrbm.initialize(cfg)
print("ready.")

if cfg.dbm:
    rbmstack = DBM(cfg)
else:
    rbmstack = pyrbm.RBMStack(cfg)

rbmstack.saveOptions(cfg.get_serialization_obj())

mbp = minibatch_provider.MNISTMiniBatchProvider(dataset.data, dataset.teacher)

print("Calculating statistics for minibatch...")
mbs = minibatch_provider.MiniBatchStatistics(mbp, rbmstack.layers[0].act)
if cfg.utype[0] == pyrbm.UnitType.gaussian:
    mbp.norm = lambda x: mbs.normalize_zmuv(x)
else:
    mbp.norm = lambda x: mbs.normalize_255(x)
if "test_data" in dataset.__dict__:
    mbp_test = minibatch_provider.MNISTMiniBatchProvider(
        dataset.test_data, dataset.test_teacher)
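For reference, "zmuv" stands for zero-mean, unit-variance. A minimal sketch of what the two normalizers conventionally compute; the real MiniBatchStatistics methods presumably bake the minibatch statistics in and may differ:

def normalize_zmuv(x, mean, std):
    # Standardize gaussian-unit inputs to zero mean and unit variance.
    return (x - mean) / std

def normalize_255(x):
    # Map raw 8-bit pixel values into [0, 1] for binary units.
    return x / 255.0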
Example #4
import argparse
import pickle

# DBM and plot_samples are assumed to be defined or imported earlier in
# the source file this snippet comes from.

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--gauss', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=50)
    parser.add_argument('--hidden', type=int, default=8)
    parser.add_argument('--steps', type=int, default=1)
    parser.add_argument('--recon', type=int, default=50)

    args = parser.parse_args()
    data_file = 'exp1_gauss' + str(args.gauss)

    with open(data_file, 'rb') as f:

        samples = pickle.load(f, encoding='bytes')
        n_visible, n_hidden = 2, args.hidden
        n_steps = args.steps
        n_epochs = args.epoch
        n_gibbs = args.recon
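        # Flag-to-hyper-parameter mapping implied by the constructor call
        # below: --steps sets the contrastive-divergence step count and
        # --recon the number of Gibbs steps used for reconstruction.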

        dbm = DBM(num_visible=n_visible,
                  num_hidden=n_hidden,
                  CD_steps=n_steps,
                  gibb_steps=n_gibbs,
                  num_epochs=n_epochs)

        train = samples[:8000]
        validation = samples[8000:]
        dbm.fit(train, validation)
        plot_samples(dbm.reconstruction)
Example #5
'''
This script assigns a set of tags to every user based on their interactions.
'''
from dbm import DBM
from tqdm import tqdm

db = DBM()
upd_handler = DBM()
# The following line adds a '@Tags' attribute to each user doc:
# db.db.users.update({}, {'$set': {'@Tags': []}}, upsert=False, multi=True)


def append_tags(user_id, tags):
    '''
    This method appends a set of tags to a specific user's list of tags.
    '''
    upd_handler.db.users.update({'@Id': user_id},
                                {'$push': {
                                    '@Tags': {
                                        '$each': tags
                                    }
                                }})
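# Hypothetical usage; the user id and tag values are made up for
# illustration:
#   append_tags('42', ['python', 'mongodb'])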


def tags_list_to_dict(user):
    '''
    Collapse a user's tag list into a {tag: occurrence_count} dict.
    '''
    if '@Id' not in user or '@Tags' not in user:
        return
    tag_dict = {}
    for tag in user['@Tags']:
        if tag not in tag_dict:
            tag_dict[tag] = 1
        else:  # completion inferred: the source snippet is cut off here
            tag_dict[tag] += 1
    return tag_dict
Example #6
from dbm import DBM
import networkx as nx

db = DBM()
db2 = DBM()
# Alternative filters tried during development:
# post_filter = {'@CreationDate': {'$gt': '2019'}}
# post_filter = {'@Tags': {'$type': 'string', '$not': {'$type': 'array'}}}
post_filter = {}

posts = db.get_post(post_filter)

graph = nx.Graph()
# cnt = 0
# for post in posts:
#     if cnt%100==0:
#         print(cnt, post['@Tags'])
#     cnt += 1
#     tags = post['@Tags'][1:-1].replace("><", " ").split(" ")
#     # print(tags)
#     post['@Tags'] = tags
#     update = { "$set": {"@Tags": tags} }
#     db2.update_post({'_id':post['_id']}, update)
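# For reference, the migration commented out above converts a raw tag
# string into a list, e.g. (illustrative value, not from the data):
#   '<python><mongodb>'[1:-1].replace('><', ' ').split(' ')
#   -> ['python', 'mongodb']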

cnt = 0
for post in posts:
    if '@Tags' not in post or isinstance(post['@Tags'], str):
        continue
    tags = post['@Tags']
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt, tags)
Example #7
    print(i, ' cycle entropy: ', entropy[-1], ' cycle accuracy: ', accuracy[-1])
    joblib.dump(entropy, 'output/dbm_entropy')
    joblib.dump(accuracy, 'output/dbm_accuracy')


dataset = np.round(np.random.rand(10000, 1))
labels = 1 - dataset
dataset = np.append(dataset, 1 - dataset, axis=1)
dataset = np.append(dataset, np.ones((dataset.shape[0], 1)), axis=1)
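# Columns are [x, 1 - x, 1]: a random bit, its complement, and a constant
# ones column acting as a bias unit; the labels (1 - x) make this a NOT task.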
print('dataset shape: ', dataset.shape)

energy = []
entropy = []
accuracy = []
print('initializing model')
dbm_test = DBM(dataset, layers=[30, 20])
#render_output(1,1)

for k in range(1, 3):
    for i in range(10):
        print('beginning boltzmann training of model')
        dbm_test.train_unsupervised(k)
        render_output(i, k)

dbm_test.learning_rate = 1.0
dbm_test.add_layer(1)
dbm_test.labels = labels
#Adapt the output layer to the network
render_output(-1, 4)
render_supervised(-1)
for i in range(20):
Example #8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dbm import DBM

dbm = DBM()

# Get all tags from all users
all_users = dbm.get_user(flt={'tags': {'$exists': 1}})
all_tags = ""
norm_tags = {}
for user in all_users:
    for tag in user['tags']:
        all_tags += tag['name'] + " "
        if tag['name'] in norm_tags:
            norm_tags[tag['name']] += tag['count']
        else:
            norm_tags[tag['name']] = tag['count']

print("Number of distinct tags:", len(norm_tags))

# Generate a non-normalized tag cloud image
wordcloud = WordCloud(width=700,
                      height=500,
                      stopwords=['n'],
                      normalize_plurals=False,
                      max_words=1000).generate(all_tags)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Non Normalized Tag-Cloud")

# Generate a Normalized tag cloud image
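The snippet is cut off here. A plausible continuation for the normalized cloud, built from the norm_tags frequency dict; this is an assumption, not the original code:

wordcloud_norm = WordCloud(width=700,
                           height=500,
                           normalize_plurals=False,
                           max_words=1000).generate_from_frequencies(norm_tags)
plt.figure()
plt.imshow(wordcloud_norm, interpolation='bilinear')
plt.axis("off")
plt.title("Normalized Tag-Cloud")
plt.show()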