コード例 #1
0
ファイル: MiscFromData.py プロジェクト: phanigadde/CSRelated
class Misc:
  """Ad-hoc helpers that derive summary facts from the posts CSV dump.

  The code is written so that it runs unchanged on Python 2 and Python 3.
  """

  def __init__(self):
    # TimeHandler is defined elsewhere in the project; only its diffDays()
    # method is used by this class.
    self.timeHandler = TimeHandler()

  def earliestPostTime(self, dataFile):
    """Scan a posts CSV and report the earliest post timestamp found.

    The timestamp is read from column index 5 of every row.  Progress (the
    number of rows seen so far) is printed every 10000 rows and the final
    result is printed once the scan is complete.

    Args:
      dataFile: path of the CSV file (double-quote quoting, backslash
        escapes).

    Returns:
      The winning timestamp string.  If no row beats the built-in seed value
      (including when the file is empty) the seed itself is returned.

    Raises:
      IndexError: if a row has fewer than six columns.
    """
    # Seed for the comparison; every row that compares as earlier replaces it.
    earliestTime = "November 17th, 2017, 3:07 am"
    # The context manager guarantees the file is closed even when a row is
    # malformed and the scan aborts.
    with open(dataFile) as handle:
      reader = csv.reader(handle, quotechar='"', escapechar="\\")
      for count, line in enumerate(reader, 1):
        postTime = line[5]
        # NOTE(review): presumably diffDays(a, b) is positive when b is later
        # than a, so a positive value means postTime precedes the current
        # best -- confirm against TimeHandler.
        if self.timeHandler.diffDays(postTime, earliestTime) > 0:
          earliestTime = postTime
        if count % 10000 == 0:
          print(count)
    print(earliestTime)
    return earliestTime
コード例 #2
0
ファイル: makeATable.py プロジェクト: phanigadde/CSRelated
class TableMaker:
  """Builds the MySQL ``posts`` table from a CSV dump of forum posts.

  Expected call order (each step fills state the next one reads):
  loadUserNames / loadUserJoins, createPostPages, createActiveForums and
  finally makeATable.

  The code is written so that it runs unchanged on Python 2 and Python 3.
  """

  # Number of posts per thread page; used both to assign a page to every post
  # and to build the ``start=`` offset of the post URL.
  POSTS_PER_PAGE = 50

  def __init__(self):
    self.post2Page = {}      # str(post id) -> zero-based page index in its thread
    self.posts = []          # raw CSV rows in file order
    self.activeForums = {}   # user id -> forum key the user posted in most
    # Reference timestamp; all day/hour/minute offsets are measured from it.
    self.earliestDate = "November 22nd, 2006, 9:51 pm"
    self.timeHandler = TimeHandler()
    self.userNames = {}      # first TSV column -> second TSV column
    self.userJoins = {}      # first TSV column -> second TSV column

  def _loadTwoColumnFile(self, path, target):
    """Copy the first two tab-separated fields of every line into target."""
    with open(path) as handle:
      for line in handle:
        fields = line.strip().split('\t')
        # Blank lines and lines without a second field carry no mapping;
        # skip them instead of failing with IndexError.
        if len(fields) < 2:
          continue
        target[fields[0]] = fields[1]

  def loadUserNames(self, userNames):
    """Fill self.userNames from a tab-separated file (key, value per line)."""
    self._loadTwoColumnFile(userNames, self.userNames)

  def loadUserJoins(self, userJoins):
    """Fill self.userJoins from a tab-separated file (key, value per line)."""
    self._loadTwoColumnFile(userJoins, self.userJoins)

  def daysHoursMinutes(self, rawTime):
    """Return (diffDays, diffHours, diffMinutes) from earliestDate to rawTime.

    The three values are whatever TimeHandler reports for the pair
    (self.earliestDate, rawTime).
    NOTE(review): whether these are independent totals or components of one
    duration depends on TimeHandler -- confirm there.
    """
    days = self.timeHandler.diffDays(self.earliestDate, rawTime)
    hours = self.timeHandler.diffHours(self.earliestDate, rawTime)
    minutes = self.timeHandler.diffMinutes(self.earliestDate, rawTime)
    return days, hours, minutes

  def createActiveForums(self):
    """Record, for every user in self.posts, the forum they posted in most.

    The user is column 0 of a row and the forum key is the first
    whitespace-separated word of column 4.  On a tie the forum encountered
    first while iterating the per-user counts wins.
    """
    userPostedForums = dd(lambda: dd(int))
    for post in self.posts:
      userPostedForums[post[0]][post[4].split()[0]] += 1
    for user, forumCounts in userPostedForums.items():
      # Every count is at least 1, so max() always has something to return.
      self.activeForums[user] = max(forumCounts, key=forumCounts.get)

  def createPostPages(self, dataFile):
    """Read the posts CSV and compute the thread page of every post.

    Every row is appended to self.posts.  Column 1 is the post id and
    column 2 the thread id.  A post's page is its zero-based rank among the
    ids of its thread divided (integer division) by POSTS_PER_PAGE, stored
    in self.post2Page under str(post id).  The number of mapped posts is
    printed at the end.

    Args:
      dataFile: path of the CSV file (double-quote quoting, backslash
        escapes).
    """
    threads = dd(list)   # thread id -> post ids seen in that thread
    owningThread = {}    # post id -> thread id of its last occurrence
    with open(dataFile) as handle:
      reader = csv.reader(handle, quotechar='"', escapechar="\\")
      for line in reader:
        self.posts.append(line)
        post = int(line[1].strip('"'))
        thread = int(line[2].strip('"'))
        threads[thread].append(post)
        owningThread[post] = thread
    for thread, postIds in threads.items():
      postIds.sort()
      previous = None
      # One pass over the sorted ids replaces a list.index() call per post.
      for position, post in enumerate(postIds):
        # Use only the first occurrence of a duplicated id, and only in the
        # thread that id was last seen in.
        if post != previous and owningThread[post] == thread:
          # Floor division keeps the page an int on Python 3 as well.
          self.post2Page[str(post)] = position // self.POSTS_PER_PAGE
        previous = post
    print(len(self.post2Page))

  def formUrl(self, postId, threadId):
    """Return the board URL that jumps straight to the given post.

    Raises:
      KeyError: if postId was not mapped by createPostPages.
    """
    baseUrl = 'http://thehoodup.com/board/viewtopic.php?'
    start = int(self.post2Page[str(postId)]) * self.POSTS_PER_PAGE
    return baseUrl + 't=%s&start=%s#p%s' % (threadId, start, postId)

  def makeATable(self, dataFile, host='localhost', user='phani',
                 passwd='phani', db='hoodup'):
    """Create the ``posts`` table and insert one row per well-formed CSV row.

    Rows that do not have exactly six columns are printed and skipped.  A
    progress counter is printed every 10000 inserted rows.

    Args:
      dataFile: path of the posts CSV file.
      host, user, passwd, db: connection settings passed positionally to
        M.connect; the defaults are the values that used to be hard-coded.

    Raises:
      KeyError: if a row's user has no entry in self.activeForums or its
        post id has no entry in self.post2Page (createActiveForums and
        createPostPages must have been run on the same data first).
    """
    conn = M.connect(host, user, passwd, db)
    try:
      cursor = conn.cursor()
      cursor.execute("""create table posts(userName VARCHAR(1000), userId INT, postId INT, threadId INT, postBody VARCHAR(10000), postForum VARCHAR(30),
        activeForum VARCHAR(30), userRegDay INT, days INT,hours INT, minutes INT, HoodupLink VARCHAR(100))""")
      status = 0
      with open(dataFile) as handle:
        reader = csv.reader(handle, quotechar='"', escapechar="\\")
        for post in reader:
          if len(post) != 6:
            print('skipping')
            print(post)
            continue
          userId, postId, threadId, body, forum, rawTime = post
          forumKey = forum.split()[0]
          activeForum = self.activeForums[userId]
          try:
            regDay = self.timeHandler.diffDays(self.earliestDate, self.userJoins[userId])
          except Exception:
            # Best effort: an unknown user or an unparsable join date is
            # stored as -1 rather than aborting the whole import.
            regDay = -1
          days, hours, minutes = self.daysHoursMinutes(rawTime)
          url = self.formUrl(postId, threadId)
          userName = self.userNames.get(userId, '__N_U_L_L__')
          row = (userName, userId, postId, threadId, body, forumKey,
                 activeForum, regDay, days, hours, minutes, url)
          cursor.execute("""insert into posts values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""", row)
          status += 1
          if status % 10000 == 0:
            print(status)
      # DB-API connections do not autocommit by default; without this the
      # inserts are discarded on close for transactional storage engines.
      conn.commit()
    finally:
      # Release the connection even when the import fails part-way.
      conn.close()