def parse(self):
    """Fill ``self.datadic`` with ``(movie, rank)`` records keyed by user id.

    Every item yielded by ``self.load()`` is treated as the text of one
    file and split on newlines. A line with fewer than three
    comma-separated fields for which ``common.canfind(line, ':')`` is true
    is read as a movie-id header (the integer before the first ``:``).
    Any other line is read as a user record whose first two comma-separated
    fields are the user id and the rank; lines whose fields are missing or
    not integers are skipped.

    Raises:
        AssertionError: if a movie id, user id or rank is out of range.
        ValueError: if a movie-id header does not start with an integer.
    """
    for i, c in enumerate(self.load()):
        print('parse %d th file' % i)
        # Progress report once every 100 files (the previous test,
        # ``if i % 100:``, fired on every file *except* multiples of 100).
        if i % 100 == 0:
            print('.. has parsed %d files' % i)

        # Records seen before any movie header in this file get movie id 0.
        movie = 0
        for l in c.split('\n'):
            words = l.split(',')
            if len(words) < 3 and common.canfind(l, ':'):
                # movie id
                movie = int(l.split(':')[0])
                # NOTE(review): the bound checked (17779) does not match the
                # maximum quoted in the message (17770) -- confirm which one
                # is intended.
                assert movie < 17779, 'Error: movieid greater than max 17770'
                continue

            # user record: "<userid>,<rank>,..."
            try:
                userid = int(words[0])
                rank = int(words[1])
            except (ValueError, IndexError):
                # Blank or malformed line: skip it.
                continue
            assert userid < 2649440, 'Error: userid greater than max'
            assert rank <= 5, 'Error: rank greater than max 5'

            # Add the record, creating the per-user list on first sight.
            self.datadic.setdefault(userid, []).append((movie, rank))
def parse(self):
    """Collect user ids into ``self.set`` and store them sorted in ``self._list``.

    Every item yielded by ``self.load()`` is split on newlines. Lines with
    fewer than three comma-separated fields, or for which
    ``common.canfind(line, ':')`` is true, are ignored. For the remaining
    lines the first field is converted to ``int`` and added to ``self.set``.
    An id above 2650000 is reported and the rest of that item's lines are
    abandoned. Afterwards ``self._list`` is set to the ascending list of
    the ids in ``self.set``.
    """
    for file_no, content in enumerate(self.load()):
        for row in content.split('\n'):
            fields = row.split(',')
            if len(fields) < 3 or common.canfind(row, ':'):
                continue

            # The line carries a user id in its first field.
            uid = int(fields[0])
            if uid > 2650000:
                # Same output as the two-item print statement: the values
                # separated by a single space.
                print('.. Error: get userid' + ' ' + str(uid))
                break
            self.set.add(uid)

        if file_no % 100 == 0:
            print('.. has parsed %d files' % file_no)

    print('.. begin sort ids')
    print('.. get list')
    self._list = list(self.set)
    print('.. list sort')
    self._list.sort()
    print('.. end parse.')