class Misc:
    """Ad-hoc helpers for inspecting the raw posts CSV dump."""

    def __init__(self):
        self.timeHandler = TimeHandler()

    def earliestPostTime(self, dataFile):
        """Scan ``dataFile`` for the earliest post timestamp.

        Each CSV row is expected to carry the post's raw timestamp string in
        its sixth column (index 5).  Rows with fewer than six fields (for
        example blank lines) are ignored instead of raising ``IndexError``.

        A row's timestamp replaces the current candidate whenever
        ``timeHandler.diffDays(rowTime, candidate)`` is positive.
        NOTE(review): this presumably means "rowTime is at least one day
        before candidate" -- confirm against TimeHandler.diffDays.

        The number of rows read is printed every 10000 rows, and the final
        result is printed before being returned.

        :param dataFile: path to the CSV file (``"`` quoting, ``\\`` escapes).
        :returns: the earliest timestamp string found, or the built-in seed
            value when no row is earlier than it.
        """
        # Seed value: any post earlier than this replaces it.
        earliestTime = "November 17th, 2017, 3:07 am"
        with open(dataFile) as handle:
            reader = csv.reader(handle, quotechar='"', escapechar="\\")
            for count, row in enumerate(reader, 1):
                if len(row) > 5:
                    postTime = row[5]
                    if self.timeHandler.diffDays(postTime, earliestTime) > 0:
                        earliestTime = postTime
                if count % 10000 == 0:
                    print(count)
        print(earliestTime)
        return earliestTime
class TableMaker:
    """Builds the MySQL ``posts`` table from the forum CSV dump.

    Typical use, as far as the methods' data dependencies show: load the
    user name / join-date maps, call ``createPostPages`` (which also caches
    the raw rows), then ``createActiveForums``, then ``makeATable``.
    """

    def __init__(self):
        # str(postId) -> zero-based page index of the post inside its thread.
        self.post2Page = {}
        # Raw CSV rows cached by createPostPages.
        self.posts = []
        # user (first CSV column) -> forum in which that user posted most.
        self.activeForums = {}
        # Reference timestamp that all day/hour/minute offsets are taken from.
        self.earliestDate = "November 22nd, 2006, 9:51 pm"
        self.timeHandler = TimeHandler()
        # First TSV column -> second TSV column, see loadUserNames/loadUserJoins.
        self.userNames = {}
        self.userJoins = {}

    def _loadTwoColumnFile(self, path, target):
        """Fill ``target`` with ``first column -> second column`` from a TSV file.

        Lines that do not yield at least two tab-separated fields after
        stripping (for example blank lines) are skipped.
        """
        with open(path) as handle:
            for rawLine in handle:
                fields = rawLine.strip().split('\t')
                if len(fields) < 2:
                    continue
                target[fields[0]] = fields[1]

    def loadUserNames(self, userNames):
        """Load the tab-separated file ``userNames`` into ``self.userNames``."""
        self._loadTwoColumnFile(userNames, self.userNames)

    def loadUserJoins(self, userJoins):
        """Load the tab-separated file ``userJoins`` into ``self.userJoins``."""
        self._loadTwoColumnFile(userJoins, self.userJoins)

    def daysHoursMinutes(self, rawTime):
        """Return ``(days, hours, minutes)`` between ``earliestDate`` and ``rawTime``.

        Each element is whatever the matching ``TimeHandler.diff*`` call
        returns for ``(self.earliestDate, rawTime)``.
        """
        days = self.timeHandler.diffDays(self.earliestDate, rawTime)
        hours = self.timeHandler.diffHours(self.earliestDate, rawTime)
        minutes = self.timeHandler.diffMinutes(self.earliestDate, rawTime)
        return days, hours, minutes

    def createActiveForums(self):
        """Record, per user, the forum in which the user posted most often.

        Works on the rows cached in ``self.posts``; the user is column 0 and
        the forum is the first whitespace-separated token of column 4.  When
        several forums tie, the first one met while iterating the per-user
        counts wins.
        """
        userPostedForums = dd(lambda: dd(int))
        for post in self.posts:
            userPostedForums[post[0]][post[4].split()[0]] += 1
        for user, forumCounts in userPostedForums.items():
            # Every per-user mapping holds at least one forum, so max() is safe.
            self.activeForums[user] = max(forumCounts, key=forumCounts.get)

    def createPostPages(self, dataFile):
        """Read ``dataFile``, cache its rows and compute each post's page.

        Column 1 is the post id and column 2 the thread id.  Posts of a thread
        are ordered by id and split into pages of 50; the zero-based page
        index is stored in ``self.post2Page`` under ``str(postId)``.  Empty
        CSV rows (blank lines) are skipped.  The number of mapped posts is
        printed at the end.
        """
        threads = dd(list)   # threadId -> list of postIds
        postThread = {}      # postId -> threadId (last one seen wins)
        with open(dataFile) as handle:
            reader = csv.reader(handle, quotechar='"', escapechar="\\")
            for row in reader:
                if not row:
                    continue
                self.posts.append(row)
                postId = int(row[1].strip('"'))
                threadId = int(row[2].strip('"'))
                threads[threadId].append(postId)
                postThread[postId] = threadId
        for threadId, postIds in threads.items():
            placed = set()
            # One pass over the sorted ids replaces a list.index() per post.
            for position, postId in enumerate(sorted(postIds)):
                if postThread[postId] != threadId or postId in placed:
                    # Post belongs to a later-seen thread, or is a duplicate
                    # whose first position has already been recorded.
                    continue
                placed.add(postId)
                self.post2Page[str(postId)] = position // 50
        print(len(self.post2Page))

    def formUrl(self, postId, threadId):
        """Return the forum URL that points at ``postId`` inside ``threadId``.

        Raises ``KeyError`` when ``postId`` has no entry in ``self.post2Page``.
        """
        baseUrl = 'http://thehoodup.com/board/viewtopic.php?'
        start = int(self.post2Page[str(postId)]) * 50
        return baseUrl + 't=' + str(threadId) + '&start=' + str(start) + '#p' + str(postId)

    def _buildRow(self, post):
        """Turn one six-field CSV row into the 12-value tuple for ``posts``."""
        rawTime = post[-1]
        row = post[:-1]
        # Keep only the first token of the forum column.
        row[-1] = row[-1].split()[0]
        userId = row[0]
        row.append(self.activeForums[userId])
        try:
            regDay = self.timeHandler.diffDays(self.earliestDate, self.userJoins[userId])
        except Exception:
            # Best effort: unknown user or unusable join date -> -1.
            regDay = -1
        row.append(regDay)
        row.extend(self.daysHoursMinutes(rawTime))
        row.append(self.formUrl(row[1], row[2]))
        row.insert(0, self.userNames.get(userId, '__N_U_L_L__'))
        return tuple(row)

    def makeATable(self, dataFile, host='localhost', user='phani', passwd='phani', db='hoodup'):
        """Create the ``posts`` table and insert one row per post in ``dataFile``.

        Rows that do not have exactly six fields are printed and skipped.
        A progress counter is printed every 10000 inserted rows.  The inserts
        are committed once all rows are written, and the connection is closed
        even if an insert fails.

        Requires ``self.activeForums`` and ``self.post2Page`` to be populated
        for the users/posts in ``dataFile``; missing entries raise ``KeyError``.

        :param dataFile: path to the posts CSV file.
        :param host, user, passwd, db: connection settings passed
            positionally to ``M.connect``.
        """
        with open(dataFile) as handle:
            reader = csv.reader(handle, quotechar='"', escapechar="\\")
            conn = M.connect(host, user, passwd, db)
            try:
                cursor = conn.cursor()
                cursor.execute(
                    "create table posts(userName VARCHAR(1000), userId INT, "
                    "postId INT, threadId INT, postBody VARCHAR(10000), "
                    "postForum VARCHAR(30), activeForum VARCHAR(30), "
                    "userRegDay INT, days INT,hours INT, minutes INT, "
                    "HoodupLink VARCHAR(100))"
                )
                status = 0
                for post in reader:
                    if len(post) != 6:
                        print('skipping')
                        print(post)
                        continue
                    cursor.execute(
                        "insert into posts values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                        self._buildRow(post),
                    )
                    status += 1
                    if status % 10000 == 0:
                        print(status)
                # Without an explicit commit a transactional engine would
                # discard every insert when the connection closes.
                conn.commit()
            finally:
                conn.close()