Esempio n. 1
0
def loadDataset(filename, usermap, itemmap, parser, shape=None):
    """Load a ratings file into a sparse user-by-item matrix.

    Every line of ``filename`` is handed to ``parser.parse``, which must
    return a ``(userid, itemid, rating)`` triple.  Triples whose user is not
    in ``usermap`` or whose item is not in ``itemmap`` are kept out of the
    matrix and collected, unconverted, in the returned ``cold`` list.

    Args:
        filename: path of the text file to read, one rating per line.
        usermap: mapping from raw user id to matrix row index.
        itemmap: mapping from raw item id to matrix column index.
        parser: object exposing ``parse(line)``.
        shape: optional ``(rows, cols)`` of the matrix; defaults to
            ``(len(usermap), len(itemmap))``.

    Returns:
        ``(R, cold)``: ``R`` is whatever ``coo_tocsr`` returns for the
        assembled COO matrix (presumably its CSR form -- confirm against the
        helper), ``cold`` is the list of skipped triples.
    """
    # Count lines in-process instead of shelling out to ``wc -l``: that broke
    # on paths containing spaces, was not portable, and under-counted files
    # without a trailing newline (giving maxval == 0 and a ZeroDivisionError
    # in the progress update for a single-line file).
    with open(filename) as f:
        num_lines = sum(1 for _ in f)
    bar = progressbar.ProgressBar(maxval=num_lines,
                                  widgets=[
                                      "Loading data: ",
                                      progressbar.Bar('=', '[', ']'), ' ',
                                      progressbar.Percentage(), ' ',
                                      progressbar.ETA()
                                  ]).start()
    I, J, V = [], [], []
    cold = []
    with open(filename) as f:
        for i, line in enumerate(f):
            # Refresh the bar only every 1000 lines to keep overhead low.
            # i is always < num_lines, so no wrap-around is needed.
            if i % 1000 == 0:
                bar.update(i)
            userid, itemid, rating = parser.parse(line)
            if userid not in usermap or itemid not in itemmap:
                # Unknown user or item: report it back rather than drop it.
                cold.append((userid, itemid, rating))
                continue
            I.append(usermap[userid])
            J.append(itemmap[itemid])
            V.append(float(rating))
    bar.finish()
    if shape is None:
        shape = (len(usermap), len(itemmap))
    R = coo_tocsr(scipy.sparse.coo_matrix((V, (I, J)), shape=shape))

    return R, cold
Esempio n. 2
0
 def import_data(self,
                 filename,
                 parser,
                 shape=None,
                 num_headers=0,
                 debug=False):
     """Read a ratings file, build the sparse matrix and store it in self.R.

     The first ``num_headers`` lines are skipped.  Each remaining line is
     parsed with ``parser.parse`` into ``(userid, itemid, rating)``;
     ``self.update_user_item`` is called for the pair (presumably registering
     new ids in ``self.users`` / ``self.items`` -- confirm against that
     method) and the indices are then read back from those mappings.  Lines
     that raise during any of these steps are skipped.

     Args:
         filename: path of the text file to read.
         parser: object exposing ``parse(line)``.
         shape: optional ``(rows, cols)``; a ``None`` entry is replaced by
             ``self.nusers`` / ``self.nitems`` respectively.  When ``shape``
             itself is ``None`` the matrix is ``(self.nusers, self.nitems)``.
         num_headers: number of leading lines to discard.
         debug: when true, echo every skipped line to stdout.

     Returns:
         ``self.R``, the result of ``coo_tocsr`` on the assembled COO matrix.
     """
     # Count lines in-process rather than via ``wc -l``: the external command
     # failed on paths with spaces and under-counted files lacking a trailing
     # newline, which could make ``i % bar.maxval`` divide by zero.
     with open(filename) as f:
         num_lines = sum(1 for _ in f)
     bar = progressbar.ProgressBar(maxval=num_lines,
                                   widgets=[
                                       "Loading data: ",
                                       progressbar.Bar('=', '[', ']'), ' ',
                                       progressbar.Percentage(), ' ',
                                       progressbar.ETA()
                                   ]).start()
     I, J, V = [], [], []
     with open(filename) as f:
         for _ in range(num_headers):
             f.readline()
         for i, line in enumerate(f):
             # i counts data lines only, so it never reaches num_lines.
             if i % 1000 == 0:
                 bar.update(i)
             try:
                 userid, itemid, rating = parser.parse(line)
                 self.update_user_item(userid, itemid)
                 uid = self.users[userid]
                 iid = self.items[itemid]
                 I.append(uid)
                 J.append(iid)
                 V.append(float(rating))
             except Exception:
                 # Best-effort load: malformed lines are skipped, but
                 # KeyboardInterrupt/SystemExit are no longer swallowed.
                 if debug:
                     # Same bytes the old ``print "...", line,`` statement
                     # emitted, but valid on both Python 2 and 3.
                     sys.stdout.write("Ignoring Input:  " + line)
                 continue
     bar.finish()
     if shape is not None:
         _shape = (self.nusers if shape[0] is None else shape[0],
                   self.nitems if shape[1] is None else shape[1])
     else:
         _shape = (self.nusers, self.nitems)
     R = scipy.sparse.coo_matrix((V, (I, J)), shape=_shape)
     self.R = coo_tocsr(R)
     sys.stdout.flush()
     return self.R
Esempio n. 3
0
 def import_data(self, filename, parser, shape=None, num_headers=0, debug=False):
     """Import ratings from ``filename`` into ``self.R``.

     After discarding ``num_headers`` leading lines, every line is parsed by
     ``parser.parse`` into ``(userid, itemid, rating)``.  The pair is passed
     to ``self.update_user_item`` (assumed to add unseen ids to
     ``self.users`` / ``self.items`` -- verify in that method), and the row
     and column indices are looked up in those mappings.  Any line that
     raises along the way is ignored.

     Args:
         filename: path of the text file to read.
         parser: object exposing ``parse(line)``.
         shape: optional ``(rows, cols)``; ``None`` components fall back to
             ``self.nusers`` / ``self.nitems``.  ``None`` as a whole means
             ``(self.nusers, self.nitems)``.
         num_headers: number of header lines to skip.
         debug: when true, write each ignored line to stdout.

     Returns:
         ``self.R``, i.e. ``coo_tocsr`` applied to the assembled COO matrix.
     """
     # The line total only sizes the progress bar.  It is computed here in
     # Python because the former ``wc -l`` call broke on paths with spaces
     # and returned 0 for a one-line file without a trailing newline, which
     # turned ``i % bar.maxval`` into a division by zero.
     with open(filename) as f:
         num_lines = sum(1 for _ in f)
     bar = progressbar.ProgressBar(
         maxval=num_lines,
         widgets=[
             "Loading data: ",
             progressbar.Bar("=", "[", "]"),
             " ",
             progressbar.Percentage(),
             " ",
             progressbar.ETA(),
         ],
     ).start()
     I, J, V = [], [], []
     with open(filename) as f:
         for _ in range(num_headers):
             f.readline()
         for i, line in enumerate(f):
             if i % 1000 == 0:
                 # Safe without clamping: i < num_lines for every data line.
                 bar.update(i)
             try:
                 userid, itemid, rating = parser.parse(line)
                 self.update_user_item(userid, itemid)
                 uid = self.users[userid]
                 iid = self.items[itemid]
                 I.append(uid)
                 J.append(iid)
                 V.append(float(rating))
             except Exception:
                 # Skip bad input but let KeyboardInterrupt/SystemExit
                 # propagate, which the previous bare ``except:`` did not.
                 if debug:
                     # Byte-for-byte what ``print "Ignoring Input: ", line,``
                     # wrote, without relying on the Python 2 statement.
                     sys.stdout.write("Ignoring Input:  " + line)
                 continue
     bar.finish()
     if shape is None:
         _shape = (self.nusers, self.nitems)
     else:
         _shape = (
             self.nusers if shape[0] is None else shape[0],
             self.nitems if shape[1] is None else shape[1],
         )
     R = scipy.sparse.coo_matrix((V, (I, J)), shape=_shape)
     self.R = coo_tocsr(R)
     sys.stdout.flush()
     return self.R
Esempio n. 4
0
def loadSideInfo(filename, targetmap, parser, shape=None):
    """Load key/feature pairs into a sparse binary indicator matrix.

    Each line of ``filename`` is parsed by ``parser.parse`` into a
    ``(keyid, featureid)`` pair.  Pairs whose key is missing from
    ``targetmap`` are silently skipped.  Feature ids receive consecutive
    column indices (0, 1, 2, ...) in order of first appearance among the
    kept pairs, and every kept pair contributes a value of 1.0.

    Args:
        filename: path of the text file to read, one pair per line.
        targetmap: mapping from raw key id to matrix row index.
        parser: object exposing ``parse(line)``.
        shape: optional ``(rows, cols)``; defaults to
            ``(len(targetmap), len(feature_map))``.

    Returns:
        ``(R, feature_map)``: ``R`` is whatever ``coo_tocsr`` returns for the
        assembled COO matrix, ``feature_map`` maps each feature id to its
        column index.
    """
    # Count lines in-process; the previous ``wc -l`` subprocess failed on
    # paths with spaces and under-counted files without a trailing newline,
    # which could make ``i % bar.maxval`` raise ZeroDivisionError.
    with open(filename) as f:
        num_lines = sum(1 for _ in f)
    bar = progressbar.ProgressBar(
        maxval=num_lines,
        widgets=[
            "Loading data: ",
            progressbar.Bar("=", "[", "]"),
            " ",
            progressbar.Percentage(),
            " ",
            progressbar.ETA(),
        ],
    ).start()
    I, J, V = [], [], []
    feature_map = {}
    with open(filename) as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                bar.update(i)
            keyid, featureid = parser.parse(line)
            if keyid not in targetmap:
                continue
            # len(feature_map) is read before the insert, so a new feature
            # gets the next unused column index.
            fid = feature_map.setdefault(featureid, len(feature_map))
            I.append(targetmap[keyid])
            J.append(fid)
            V.append(1.0)
    bar.finish()
    if shape is None:
        shape = (len(targetmap), len(feature_map))
    R = coo_tocsr(scipy.sparse.coo_matrix((V, (I, J)), shape=shape))

    return R, feature_map
Esempio n. 5
0
def loadSideInfo(filename, targetmap, parser, shape=None):
    """Build a sparse 0/1 side-information matrix from a key/feature file.

    ``parser.parse`` turns each line of ``filename`` into a
    ``(keyid, featureid)`` pair.  A pair is kept only if ``keyid`` is in
    ``targetmap``.  Columns are allocated to feature ids in order of first
    appearance among kept pairs, starting at 0, and each kept pair stores
    1.0 at (row of key, column of feature).

    Args:
        filename: path of the text file to read, one pair per line.
        targetmap: mapping from raw key id to matrix row index.
        parser: object exposing ``parse(line)``.
        shape: optional ``(rows, cols)``; when omitted the matrix is
            ``(len(targetmap), len(feature_map))``.

    Returns:
        ``(R, feature_map)``: the ``coo_tocsr`` conversion of the assembled
        COO matrix and the feature-id-to-column mapping built while reading.
    """
    # Size the progress bar with an in-process line count.  Shelling out to
    # ``wc -l`` broke on paths containing spaces and reported 0 for a
    # one-line file without a trailing newline, so the old
    # ``i % bar.maxval`` could divide by zero.
    with open(filename) as f:
        num_lines = sum(1 for _ in f)
    bar = progressbar.ProgressBar(maxval=num_lines,
                                  widgets=[
                                      "Loading data: ",
                                      progressbar.Bar('=', '[', ']'), ' ',
                                      progressbar.Percentage(), ' ',
                                      progressbar.ETA()
                                  ]).start()
    I, J, V = [], [], []
    feature_map = {}
    with open(filename) as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                # i never reaches num_lines, so it is a valid bar value.
                bar.update(i)
            keyid, featureid = parser.parse(line)
            if keyid not in targetmap:
                continue
            if featureid not in feature_map:
                # Next free column index equals the current map size.
                feature_map[featureid] = len(feature_map)
            I.append(targetmap[keyid])
            J.append(feature_map[featureid])
            V.append(1.0)
    bar.finish()
    if shape is not None:
        R = scipy.sparse.coo_matrix((V, (I, J)), shape=shape)
    else:
        R = scipy.sparse.coo_matrix((V, (I, J)),
                                    shape=(len(targetmap), len(feature_map)))
    R = coo_tocsr(R)

    return R, feature_map
Esempio n. 6
0
def loadDataset(filename, usermap, itemmap, parser, shape=None):
    """Read ratings for known users and items into a sparse matrix.

    ``parser.parse`` is applied to each line of ``filename`` and must yield
    ``(userid, itemid, rating)``.  A triple goes into the matrix only when
    both ids are present in their maps; otherwise the raw triple is appended
    to the ``cold`` list that is returned alongside the matrix.

    Args:
        filename: path of the text file to read, one rating per line.
        usermap: mapping from raw user id to matrix row index.
        itemmap: mapping from raw item id to matrix column index.
        parser: object exposing ``parse(line)``.
        shape: optional ``(rows, cols)``; when omitted the matrix is
            ``(len(usermap), len(itemmap))``.

    Returns:
        ``(R, cold)``: the ``coo_tocsr`` conversion of the assembled COO
        matrix and the list of triples that referenced an unknown id.
    """
    # The total is needed only to size the progress bar.  Counting here
    # replaces a ``wc -l`` subprocess that failed on paths with spaces and
    # returned 0 for a single-line file without a trailing newline, making
    # the old ``i % bar.maxval`` raise ZeroDivisionError.
    with open(filename) as f:
        num_lines = sum(1 for _ in f)
    bar = progressbar.ProgressBar(
        maxval=num_lines,
        widgets=[
            "Loading data: ",
            progressbar.Bar("=", "[", "]"),
            " ",
            progressbar.Percentage(),
            " ",
            progressbar.ETA(),
        ],
    ).start()
    I, J, V = [], [], []
    cold = []
    with open(filename) as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                # Always within [0, num_lines), so no clamping is required.
                bar.update(i)
            userid, itemid, rating = parser.parse(line)
            if userid in usermap and itemid in itemmap:
                I.append(usermap[userid])
                J.append(itemmap[itemid])
                V.append(float(rating))
            else:
                # Keep the untouched triple so the caller can inspect it.
                cold.append((userid, itemid, rating))
    bar.finish()
    if shape is not None:
        R = scipy.sparse.coo_matrix((V, (I, J)), shape=shape)
    else:
        R = scipy.sparse.coo_matrix((V, (I, J)), shape=(len(usermap), len(itemmap)))
    R = coo_tocsr(R)

    return R, cold