Exemple #1
0
class TimeCheckHandler:
    """Answer whether a requested part of the day currently applies in a city.

    ``value`` is a mapping with ``'location'`` and ``'daytime'`` entries, each
    a list whose first element is a dict carrying a ``'value'`` string.
    The same mapping is handed to ``TimeHandler`` (defined elsewhere), which
    is expected to expose ``checkDataBase()``, ``lookupNoPrint()`` and a
    ``results`` mapping whose values support ``strftime``.
    """

    def __init__(self, value):
        self.value = value
        self.city = value['location'][0]['value'].title()
        self.daytime = value['daytime'][0]['value'].title()
        self.th = TimeHandler(self.value)
        self.th.checkDataBase()

    def _describeHour(self, hour):
        """Return ``(isDay, label)`` for an hour of the day (0-23).

        ``isDay`` is True for hours 8..20.  ``label`` is one of 'Midnight',
        'Night', 'Midday', 'Morning', 'Afternoon', 'Evening' or 'Day'.
        """
        isDay = 7 < hour < 21

        if hour <= 1 or hour >= 23:
            label = 'Midnight'
        elif hour < 3 or hour > 21:
            label = 'Night'
        elif 11 <= hour <= 13:
            label = 'Midday'
        elif 3 < hour < 11:
            label = 'Morning'
        elif 13 <= hour <= 17:
            label = 'Afternoon'
        elif 17 <= hour <= 21:
            label = 'Evening'
        else:
            # NOTE(review): only hour == 3 reaches this branch and is labelled
            # 'Day' although isDay is False -- looks like a gap in the ranges
            # above; kept as-is, confirm the intended label.
            label = 'Day'
        return isDay, label

    def Result(self):
        """Yield the output lines for every entry in the handler's results.

        For each result the generator yields, in order: ``PILOT:START``, a
        city label, a Yes/No label, a label with the local time and its
        part-of-day name, and ``PILOT:END``.  All lines are yielded (none are
        printed directly), so the caller decides where they go.
        """
        # A request for 'Daytime' is treated as the generic 'Day' check.
        if self.daytime == 'Daytime':
            self.daytime = 'Day'

        self.th.lookupNoPrint()
        self.results = self.th.results
        for key in self.results:
            moment = self.results[key]
            isDay, label = self._describeHour(int(moment.strftime('%H')))

            if self.daytime == 'Day':
                matches = isDay
            else:
                matches = (label == self.daytime)

            yield "PILOT:START"
            yield "LABEL:<b><big>" + self.city + "</big></b>"
            if matches:
                yield "LABEL:<b>Yes</b>"
            else:
                yield "LABEL:<b>No</b>"
            yield "LABEL:<small>" + moment.strftime('%H:%M') + "  (" + label + ")" + "</small>"
            yield "PILOT:END"
 def __init__(self, dataFile, usersData):
   """Create the empty containers, then load posts and user-join data.

   ``dataFile`` is passed to ``__justRead`` and ``usersData`` to
   ``__loadUsersJoins``; afterwards the loaded records are split per user
   and per user/month.  Those helpers live outside this snippet.
   """
   # Raw records and vocabulary counters.
   self.__data = []
   self.__users = set()
   self.__backGround = {}
   self.__vocab = dd(int)
   self.__vocabDocCount = dd(int)
   self.__communutyWiseVocab = dd(lambda: dd(int))

   # Index structures, filled by the split helpers.
   self.__commWiseIndices = {}
   self.__commWiseTimeSplitIndices = {}
   self.__userWiseIndices = {}
   self.__userWiseTimeSplitIndices = {}
   self.__timeWiseUserSplitIndices = dd(lambda: dd(int))

   # Collaborators (both types are defined elsewhere).
   self._tok = Tokenizer(preserve_case=False)
   self.timeHandler = TimeHandler()

   # Per-user bookkeeping; unknown users report a join value of -1.
   self.__userJoins = dd(lambda: -1)
   self.sampledUsers = set()
   self.activeForums = {}
   self.activeUsersInForums = dd(set)

   # Load the data and build the per-user indices.  The vocabulary-building
   # reader and the time-wise / community-wise splits are not run here.
   self.__justRead(dataFile)
   self.__loadUsersJoins(usersData)
   self.__splitUserWise()
   self.__userWiseTimeSplit()

   # Extra data structures
   self.postingFreq = dd(int)
Exemple #3
0
 def __init__(self):
   """Start with empty lookup tables and a fixed reference date."""
   # Reference timestamp string stored for later time-difference calls.
   self.earliestDate = "November 22nd, 2006, 9:51 pm"
   self.timeHandler = TimeHandler()

   self.posts = []
   self.post2Page = {}
   self.activeForums = {}
   self.userNames = {}
   self.userJoins = {}
Exemple #4
0
class Misc:
  """Small utilities built on top of ``TimeHandler`` (defined elsewhere)."""

  def __init__(self):
    self.timeHandler = TimeHandler()

  def earliestPostTime(self, dataFile):
    """Scan a CSV of posts and report the earliest post time found.

    The time string is read from column 5 of every row.  A row replaces the
    current candidate whenever ``timeHandler.diffDays(rowTime, candidate)``
    is positive.  Progress is printed every 10000 rows and the final value
    is printed and also returned.

    NOTE(review): the starting candidate is a hard-coded timestamp; rows that
    do not compare as earlier than it are never selected.
    """
    earliestTime = "November 17th, 2017, 3:07 am"
    # The context manager guarantees the file is closed even if a row is
    # malformed and raises.
    with open(dataFile) as handle:
      reader = csv.reader(handle, quotechar='"', escapechar="\\")
      for count, line in enumerate(reader, 1):
        postTime = line[5]
        if self.timeHandler.diffDays(postTime, earliestTime) > 0:
          earliestTime = postTime
        if count % 10000 == 0:
          print(count)
    print(earliestTime)
    return earliestTime
Exemple #5
0
class TableMaker:
  """Build a ``posts`` database table from a CSV dump of forum posts.

  CSV rows are used with this layout: column 0 user id, 1 post id,
  2 thread id, 3 post body, 4 forum name, 5 raw post time.
  ``TimeHandler`` and the database module ``M`` are defined elsewhere.
  """

  def __init__(self):
    self.post2Page = {}      # str(post id) -> page number inside its thread
    self.posts = []          # every CSV row read by createPostPages
    self.activeForums = {}   # user id -> forum with the most posts
    self.earliestDate = "November 22nd, 2006, 9:51 pm"
    self.timeHandler = TimeHandler()
    self.userNames = {}
    self.userJoins = {}

  def _loadTabSeparatedPairs(self, path, target):
    """Fill ``target`` with key/value pairs from a two-column TSV file.

    Lines with fewer than two tab-separated fields (e.g. a trailing blank
    line) are skipped instead of aborting the load.
    """
    with open(path) as handle:
      for line in handle:
        fields = line.strip().split('\t')
        if len(fields) < 2:
          continue
        target[fields[0]] = fields[1]

  def loadUserNames(self, userNames):
    """Load ``user id <TAB> user name`` lines into ``self.userNames``."""
    self._loadTabSeparatedPairs(userNames, self.userNames)

  def loadUserJoins(self, userJoins):
    """Load ``user id <TAB> join time`` lines into ``self.userJoins``."""
    self._loadTabSeparatedPairs(userJoins, self.userJoins)

  def daysHoursMinutes(self, rawTime):
    """Return (days, hours, minutes) between ``earliestDate`` and ``rawTime``.

    Each component comes from the corresponding ``TimeHandler.diff*`` call.
    """
    days = self.timeHandler.diffDays(self.earliestDate, rawTime)
    hours = self.timeHandler.diffHours(self.earliestDate, rawTime)
    minutes = self.timeHandler.diffMinutes(self.earliestDate, rawTime)
    return days, hours, minutes

  def createActiveForums(self):
    """Record, for every user in ``self.posts``, the forum they post in most.

    The forum is the first whitespace-separated word of column 4.  On a tie
    the forum encountered first while iterating the counts wins.
    """
    userPostedForums = dd(lambda: dd(int))
    for post in self.posts:
      user = post[0]
      forum = post[4].split()[0]
      userPostedForums[user][forum] += 1
    for user, forumCounts in userPostedForums.items():
      topForum = max(forumCounts.items(), key=lambda pair: pair[1])[0]
      self.activeForums[user] = topForum

  def createPostPages(self, dataFile):
    """Read the post CSV and compute the page (50 posts each) of every post.

    Rows are appended to ``self.posts``.  Within a thread, posts are ordered
    by numeric post id; a post's page is its position in that order divided
    by 50.  Prints the number of posts that received a page.
    """
    threadPosts = dd(list)
    postThread = {}
    with open(dataFile) as handle:
      for line in csv.reader(handle, quotechar='"', escapechar="\\"):
        self.posts.append(line)
        post = int(line[1].strip('"'))
        thread = int(line[2].strip('"'))
        threadPosts[thread].append(post)
        postThread[post] = thread  # a repeated post id keeps its last thread

    for thread, members in threadPosts.items():
      ordered = sorted(members)
      for position, post in enumerate(ordered):
        # Only the thread a post was last seen in determines its page.
        if postThread[post] != thread:
          continue
        # For duplicated ids use the first position, as list.index() would.
        if position and ordered[position - 1] == post:
          continue
        self.post2Page[str(post)] = position // 50
    print(len(self.post2Page))

  def formUrl(self, postId, threadId):
    """Return the viewtopic URL pointing at ``postId`` inside ``threadId``."""
    baseUrl = 'http://thehoodup.com/board/viewtopic.php?'
    start = int(self.post2Page[str(postId)]) * 50
    return baseUrl + 't=' + str(threadId) + '&start=' + str(start) + '#p' + str(postId)

  def makeATable(self, dataFile):
    """Create the ``posts`` table and insert one row per well-formed CSV row.

    Rows that do not have exactly 6 columns are printed and skipped.  The
    inserts are committed once at the end; the connection is always closed.

    NOTE(review): connection parameters are hard-coded in this method.
    """
    conn = M.connect('localhost', 'phani', 'phani', 'hoodup')
    try:
      cursor = conn.cursor()
      cursor.execute("""create table posts(userName VARCHAR(1000), userId INT, postId INT, threadId INT, postBody VARCHAR(10000), postForum VARCHAR(30),
        activeForum VARCHAR(30), userRegDay INT, days INT,hours INT, minutes INT, HoodupLink VARCHAR(100))""")
      status = 0
      with open(dataFile) as handle:
        for post in csv.reader(handle, quotechar='"', escapechar="\\"):
          if len(post) != 6:
            print('skipping')
            print(post)
            continue
          rawTime = post[-1]
          post = post[:-1]
          post[-1] = post[-1].split()[0]  # keep only the forum's first word
          post.append(self.activeForums[post[0]])
          try:
            post.append(self.timeHandler.diffDays(self.earliestDate, self.userJoins[post[0]]))
          except Exception:
            # Unknown user or unparsable join time: store -1 as a marker.
            post.append(-1)
          post.extend(self.daysHoursMinutes(rawTime))
          post.append(self.formUrl(post[1], post[2]))
          try:
            post.insert(0, self.userNames[post[0]])
          except KeyError:
            post.insert(0, '__N_U_L_L__')
          cursor.execute("""insert into posts values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""", tuple(post))
          status += 1
          if status % 10000 == 0:
            print(status)
      # Closing a DB-API connection without committing discards the pending
      # transaction, so commit explicitly once all rows are in.
      conn.commit()
    finally:
      conn.close()
class DataHandler:
  """In-memory index over a CSV of forum posts, split per user and per month.

  Record layout as used by the methods below: field 0 is the post id,
  field 1 the post body, field 3 the forum/community name, field 4 the post
  time and field 5 the user id.  "Month" keys are whatever
  ``timeHandler.diffMonths(postTime, userJoinTime)`` returns.

  ``Tokenizer``, ``TimeHandler`` and ``userSampling`` are defined elsewhere.
  """

  # Month keys treated as a user's "initial" and "matured" periods.
  _INITIAL_MONTHS = tuple(range(1, 4))
  _MATURED_MONTHS = tuple(range(25, 31))

  def __init__(self, dataFile, usersData):
    """Load ``dataFile`` and ``usersData`` and build the per-user indices."""
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSplitIndices = {}
    self.__communutyWiseVocab = dd(lambda:dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self.__userWiseTimeSplitIndices = {}
    self.__timeWiseUserSplitIndices = dd(lambda:dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda:-1)
    self.timeHandler = TimeHandler()
    self.sampledUsers = set()
    self.activeForums = {}
    self.activeUsersInForums = dd(set)

    ## Processing/dealing with data
    # The vocabulary-building reader (__read) and the time-wise and
    # community-wise splits are available but not run here.
    self.__justRead(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__userWiseTimeSplit()

    ## Extra data structures
    self.postingFreq = dd(int)

  # ------------------------------------------------------------------
  # Small internal helpers
  # ------------------------------------------------------------------

  def _floorToTen(self, count):
    """Round ``count`` down to a multiple of 10."""
    return count - count % 10

  def _collectPostBodies(self, users, months):
    """Return the post bodies of ``users`` for the given month keys.

    Unknown users and months without posts are skipped.  Lookups never
    insert empty month entries into the index.
    """
    bodies = []
    for user in users:
      if user not in self.__userWiseTimeSplitIndices:
        continue
      monthly = self.__userWiseTimeSplitIndices[user]
      for month in months:
        if month not in monthly:
          continue
        for index in monthly[month]:
          try:
            bodies.append(self.__data[index][1])
          except IndexError:
            # A broken index abandons the rest of that month only.
            break
    return bodies

  # ------------------------------------------------------------------
  # Export / tokenisation
  # ------------------------------------------------------------------

  def printMonthlyDataForUser(self, user, outFile):
    """Write one CSV per month of ``user``'s records to ``outFile.<month>``."""
    userTimeIndices = self.__userWiseTimeSplitIndices[user]
    for month in userTimeIndices:
      with open(outFile + "." + str(month), "w") as handle:
        writer = csv.writer(handle)
        for index in userTimeIndices[month]:
          writer.writerow(self.__data[index])

  def tokenizeRecord(self, record):
    """Return a list copy of ``record`` with field 1 tokenised and re-joined.

    Returns -1 when the body is missing or cannot be tokenised; callers
    compare against that sentinel.
    """
    fields = list(record)
    try:
      fields[1] = ' '.join(self._tokenize(fields[1]))
    except Exception:
      return -1
    return fields

  def getTokenizedCSV(self):
    """Return the tokenised form of every record that could be tokenised."""
    tokenizedRecords = []
    for record in self.__data:
      newRecord = self.tokenizeRecord(record)
      if newRecord != -1:
        tokenizedRecords.append(newRecord)
    return tokenizedRecords

  def getBasicUserMonthRecord(self, user, month):
    """Return ``[user, month, active forum of user, []]``."""
    return [user, month, self.activeForums[user], []]

  def getTokenizedUserMonthCSV(self):
    """Return ``user -> month -> [tokenised post body, ...]``."""
    tokenizedRecords = dd(lambda:dd(list))
    for user, monthly in self.__userWiseTimeSplitIndices.items():
      for month, indices in monthly.items():
        for index in indices:
          newRecord = self.tokenizeRecord(self.__data[index])
          if newRecord != -1:
            tokenizedRecords[user][month].append(newRecord[1]) ## Only postBody being given!
    return tokenizedRecords

  def getTokenizedUserMonthForumCSV(self):
    """Return ``user -> month -> forum -> [tokenised post body, ...]``."""
    tokenizedRecords = dd(lambda:dd(lambda:dd(list)))
    for user, monthly in self.__userWiseTimeSplitIndices.items():
      for month, indices in monthly.items():
        for index in indices:
          newRecord = self.tokenizeRecord(self.__data[index])
          if newRecord != -1:
            forum = newRecord[3]
            tokenizedRecords[user][month][forum].append(newRecord[1]) ## Only postBody being given!
    return tokenizedRecords

  # ------------------------------------------------------------------
  # Lookup tables
  # ------------------------------------------------------------------

  def getPost2Month(self):
    """Return ``post id -> month`` for every indexed post."""
    post2Month = {}
    for monthly in self.__userWiseTimeSplitIndices.values():
      for month, indices in monthly.items():
        for index in indices:
          post2Month[self.__data[index][0]] = month
    return post2Month

  def getDoc2Post(self):
    """Return ``1-based record number -> post id``."""
    doc2Post = {}
    for position, record in enumerate(self.__data, 1):
      doc2Post[position] = record[0]
    return doc2Post

  def getPost2User(self):
    """Return ``post id -> user id`` for every indexed post."""
    post2User = {}
    for user, indices in self.__userWiseIndices.items():
      for index in indices:
        post2User[self.__data[index][0]] = user
    return post2User

  # ------------------------------------------------------------------
  # Posting-frequency statistics
  # ------------------------------------------------------------------

  def getPostingFreq(self):
    """Return ``bucket -> number of users``, bucket = post count floored to 10.

    Also stores the histogram in ``self.postingFreq``; a copy is returned.
    """
    self.postingFreq = dd(int)
    for indices in self.__userWiseIndices.values():
      self.postingFreq[self._floorToTen(len(indices))] += 1
    return copy.deepcopy(self.postingFreq)

  def getCumulativePostingFreq(self):
    """Return ``threshold -> number of users whose bucket is >= threshold``.

    Thresholds are multiples of 10.  Also stored in ``self.postingFreq``.
    """
    sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n")
    self.postingFreq = dd(int)
    for indices in self.__userWiseIndices.values():
      userPosts = self._floorToTen(len(indices))
      for num in range(0, userPosts+1, 10):
        self.postingFreq[num] += 1
    return copy.deepcopy(self.postingFreq)

  def getCutoffPostingFreq(self):
    """Return ``threshold -> percentage of (bucketed) posts from users at or
    above that threshold``, rounded to two decimals.

    When the bucketed total is zero (every user has fewer than 10 posts) all
    percentages are reported as 0.0 instead of dividing by zero.
    """
    totalPosts = 0
    cdfFreqPosting = dd(int)
    for indices in self.__userWiseIndices.values():
      userPosts = self._floorToTen(len(indices))
      totalPosts += userPosts
      for num in range(0, userPosts+1, 10):
        cdfFreqPosting[num] += userPosts
    for num in cdfFreqPosting:
      if totalPosts:
        cdfFreqPosting[num] = round(cdfFreqPosting[num]*100.0/float(totalPosts), 2)
      else:
        cdfFreqPosting[num] = 0.0
    sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n")
    sys.stderr.write("Total Posts:"+str(totalPosts)+"\n")
    return cdfFreqPosting

  def getMonthwisePostingFrequency(self):
    """Return ``month -> number of users with an entry in that month``."""
    timeWisePostedUsers = dd(int)
    for time, userIndices in self.__timeWiseUserSplitIndices.items():
      timeWisePostedUsers[time] = len(userIndices)
    return timeWisePostedUsers

  def getMonthwiseBinnedPostingFrequency(self):
    """Return ``month -> (n -> number of that month's users with >= n posts)``.

    The post count used is each user's overall total, not the month's.
    """
    timeWisePostedUsers = dd(int)
    for time, userIndices in self.__timeWiseUserSplitIndices.items():
      postingFreq = dd(int)
      for user in userIndices:
        userPosts = len(self.__userWiseIndices[user])
        for num in range(0, userPosts+1):
          postingFreq[num] += 1
      timeWisePostedUsers[time] = postingFreq
    return timeWisePostedUsers

  def getBasicTable(self):
    """Return ``(user, month, post count, active forum)`` rows.

    Skipped: users without a known active forum or whose forum is 'NULL',
    months that are not integers or exceed 100, and users left with fewer
    than three rows.
    """
    table = []
    for user, monthly in self.__userWiseTimeSplitIndices.items():
      userSubtable = []
      for month, indices in monthly.items():
        try:
          activeForum = self.activeForums[user]
          if activeForum == 'NULL' or int(month) > 100:
            continue
        except (KeyError, ValueError, TypeError):
          continue
        userSubtable.append((user, month, len(indices), activeForum))
      if len(userSubtable) >= 3:
        table.extend(userSubtable)
    return table

  def totalPostsByUsers(self):
    """Return the number of posts indexed across all valid users."""
    return sum(len(indices) for indices in self.__userWiseIndices.values())

  def getTopPosterCoverage(self):
    """Unfinished: currently computes nothing and returns None.

    TODO: the original body only initialised a total and a zero counter.
    """
    return None

  # ------------------------------------------------------------------
  # Loading
  # ------------------------------------------------------------------

  def __loadUsersJoins(self, usersData):
    """Load ``user id <TAB> join time`` lines into the join table.

    Lines with fewer than two fields (e.g. a trailing blank line) are
    skipped.
    """
    with open(usersData) as handle:
      for line in handle:
        fields = line.strip().split('\t')
        if len(fields) < 2:
          continue
        self.__userJoins[fields[0]] = fields[1] ## Correct the indices
    sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

  def loadActiveForums(self, activeForums):
    """Load ``user id, forum`` CSV rows into both active-forum lookups.

    Rows with fewer than two columns are ignored.
    """
    with open(activeForums) as handle:
      for line in csv.reader(handle):
        if len(line) < 2:
          continue
        self.activeForums[line[0]] = line[1]
        self.activeUsersInForums[line[1]].add(line[0])

  def __validUserId(self, userId):
    """Return True when ``userId`` parses as an int in 1..45037.

    Uses an explicit comparison rather than ``assert`` so the check still
    works when Python runs with assertions disabled (-O).
    """
    try:
      return 1 <= int(userId) <= 45037
    except (TypeError, ValueError):
      return False

  def __splitUserWise(self):
    """Group record positions by user id (field 5), valid ids only."""
    grouped = dd(list)
    for index, record in enumerate(self.__data):
      if len(record) <= 5:
        continue
      user = record[5]
      if self.__validUserId(user):
        grouped[user].append(index)
    for user, indices in grouped.items():
      self.__userWiseIndices[user] = indices

  def __userWiseTimeSplit(self):
    """Split every user's record positions by month."""
    for user, indices in self.__userWiseIndices.items():
      self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(indices)

  def __timeWiseUserSplit(self):
    """Build and return a copy of ``month -> user -> record positions``."""
    for user, indices in self.__userWiseIndices.items():
      for time, monthIndices in self.divideBasedOnMonths(indices).items():
        self.__timeWiseUserSplitIndices[time][user] = monthIndices
    return copy.deepcopy(self.__timeWiseUserSplitIndices)

  def __commWiseTimeSplit(self):
    """Split every community's record positions by month."""
    for comm, indices in self.__commWiseIndices.items():
      self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(indices)

  def __justRead(self, dataFile):
    """Append every CSV row after the header line to the record list."""
    with open(dataFile) as handle:
      handle.readline()  # skip the header line
      for record in csv.reader(handle, quotechar='"', escapechar="\\"):
        self.__data.append(tuple(record))

  def __read(self, dataFile):
    """Read the CSV, keeping only records accepted by ``__updateVocab``.

    Accepted records are appended to the record list and their position in
    that list is indexed under their community name and under 'AllTalk'.
    The position is taken from the record list itself, so it stays correct
    when earlier rows were rejected.
    """
    recordsRead = 0
    commIndices = dd(list)
    with open(dataFile) as handle:
      handle.readline()  # skip the header line
      for record in csv.reader(handle, quotechar='"', escapechar="\\"):
        try:
          accepted = self.__updateVocab(record)
        except Exception:
          # Best effort: a record that cannot be processed is skipped.
          continue
        recordsRead += 1
        if not accepted:
          continue
        position = len(self.__data)
        self.__data.append(tuple(record))
        commIndices[record[3]].append(position)
        commIndices['AllTalk'].append(position)
        self.__users.add(record[5])
    for key, value in commIndices.items():
      if key.find("Talk") >= 0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read " + str(recordsRead) + " records\n")
    sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
    sys.stderr.write("Users: " + str(len(self.__users)) + "\n")

  # ------------------------------------------------------------------
  # Vocabulary
  # ------------------------------------------------------------------

  def _tokenize(self, text):
    """Collapse whitespace runs to single spaces and tokenise the text."""
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return self._tok.tokenize(text)

  def freqVector(self, tokens):
    """Return ``token -> occurrence count`` for ``tokens``."""
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector

  def __updateVocab(self, record):
    """Add a record's words to the vocabularies; return 1 if it was used.

    Returns 0 for records that do not have 7 fields, whose community name
    lacks 'Talk', or whose body contains "http" or "<blockquote>".
    """
    if len(record) != 7:
      return 0
    comm = record[3]
    if comm.find('Talk') < 0:
      return 0
    text = record[1]
    if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
      return 0
    for word, freq in self.freqVector(self._tokenize(text)).items():
      self.__vocab[word] += freq
      self.__communutyWiseVocab[comm][word] += freq
      self.__vocabDocCount[word] += 1
    return 1

  def preprocessVocab(self, stopWords):
    """Drop rare words and stop words from all vocabularies.

    A word is kept when it occurs at least 5 times, appears in at least 50
    records and is not in ``stopWords``; kept words form the background.
    """
    self.__backGround = {}
    # Iterate over snapshots: entries are deleted while looping.
    for word in list(self.__vocab):
      freq = self.__vocab[word]
      if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
        self.__backGround[word] = freq
      else:
        del self.__vocab[word]
    for commVocab in self.__communutyWiseVocab.values():
      for word in list(commVocab):
        if word not in self.__vocab:
          del commVocab[word]
    sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

  # ------------------------------------------------------------------
  # Users and time splits
  # ------------------------------------------------------------------

  def getAllUsers(self):
    """Return a copy of the set of user ids collected by ``__read``."""
    return set(self.__users)

  def userStats(self, outFile):
    """Write ``user <TAB> number of distinct months`` lines to ``outFile``."""
    with open(outFile, 'w') as handle:
      for user, indices in self.__userWiseIndices.items():
        monthCount = len(self.divideBasedOnMonths(indices))
        handle.write('\t'.join([str(user), str(monthCount)]) + '\n')

  def getUserDataIndices(self, user):
    """Return a copy of the record positions of ``user`` ([] if unknown)."""
    if user not in self.__userWiseIndices:
      return []
    return list(self.__userWiseIndices[user])

  def divideBasedOnMonths(self, data):
    """Group record positions by month; return ``month -> [positions]``.

    Positions whose month cannot be computed, or is negative, are dropped.
    """
    timeDividedIndices = dd(list)
    for index in data:
      try:
        timeDiff = self.__timeDiff(index)
      except Exception:
        # Best effort: diffMonths may fail on unknown or malformed times.
        continue
      if timeDiff is not None and timeDiff >= 0:
        timeDividedIndices[timeDiff].append(index)
    return timeDividedIndices

  def __timeDiff(self, recordIndex):
    """Return ``diffMonths(post time, join time of the post's user)``."""
    record = self.__data[recordIndex]
    postTime = str(record[4])
    user = str(record[5])
    userJoin = self.__userJoins[user]
    return self.timeHandler.diffMonths(postTime, userJoin)

  # ------------------------------------------------------------------
  # Distributions and sampling
  # ------------------------------------------------------------------

  def makeDist(self, data):
    """Return an add-one-smoothed word distribution over the vocabulary.

    ``data`` is an iterable of texts (not whole records).  Words outside the
    vocabulary are ignored.
    """
    totalWords = 0
    dist = dd(lambda:1)
    for text in data:
      for word, freq in self.freqVector(self._tokenize(text)).items():
        if word in self.__vocab:
          dist[word] += freq
          totalWords += freq
    # Every vocabulary word contributes its smoothing count of 1.
    totalWords += len(self.__vocab)
    for word in self.__vocab:
      dist[word] = dist[word] / float(totalWords)
    return dist

  def isValid(self, dist):
    """Print the sum of ``dist``'s values; always returns True."""
    print(sum(dist[key] for key in dist))
    return True

  def sampleUsers(self):
    """Run ``userSampling`` and adopt its users and per-user/month index."""
    US = userSampling(self.__userWiseTimeSplitIndices)
    self.sampledUsers = US.finalizeUsers()
    self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices)
    return copy.deepcopy(self.sampledUsers)

  def getUserMonths(self, user):
    """Return ``user``'s months outside the initial and matured periods."""
    excluded = set(self._INITIAL_MONTHS) | set(self._MATURED_MONTHS)
    return [month for month in self.__userWiseTimeSplitIndices[user]
            if month not in excluded]

  def getUserDataForDivergence(self, user, month):
    """Return the post bodies of ``user`` in ``month``."""
    return [self.__data[index][1]
            for index in self.__userWiseTimeSplitIndices[user][month]]

  def getUserInitialData(self, user):
    """Return ``user``'s post bodies from months 1-3."""
    return self._collectPostBodies([user], self._INITIAL_MONTHS)

  def getUserMaturedData(self, user):
    """Return ``user``'s post bodies from months 25-30."""
    return self._collectPostBodies([user], self._MATURED_MONTHS)

  def getActiveForum(self, userNum):
    """Return the active forum recorded for ``userNum``."""
    return self.activeForums[userNum]

  def getForumInitialData(self, comm):
    """Return months 1-3 post bodies of users whose active forum is ``comm``."""
    users = self.activeUsersInForums.get(comm, ())
    return self._collectPostBodies(users, self._INITIAL_MONTHS)

  def getForumMaturedData(self, comm):
    """Return months 25-30 post bodies of users whose active forum is ``comm``."""
    users = self.activeUsersInForums.get(comm, ())
    return self._collectPostBodies(users, self._MATURED_MONTHS)
Exemple #7
0
 def __init__(self):
   """Create the TimeHandler instance this object keeps in self.timeHandler."""
   self.timeHandler = TimeHandler()
Exemple #8
0
 def __init__(self, value):
     """Store the request and prepare a TimeHandler for it.

     ``value`` must provide ``'location'`` and ``'daytime'`` entries, each a
     sequence whose first item is a mapping with a ``'value'`` string; both
     strings are kept title-cased.
     """
     self.value = value

     location = value['location'][0]['value']
     self.city = location.title()

     requested = value['daytime'][0]['value']
     self.daytime = requested.title()

     # Build the handler from the same request, then run its database check.
     handler = TimeHandler(self.value)
     self.th = handler
     handler.checkDataBase()
Exemple #9
0
    # Obtain a response from wit using one of three input sources: voice,
    # a command-line argument (sys.argv[textpos]), or an interactive prompt.
    if(voice):
        response = wit.voice_query_auto(TOKEN)
    elif(usetext):
        response = wit.text_query(sys.argv[textpos], TOKEN)
    else:
        time.sleep(1)
        response = wit.text_query(raw_input(">> "), TOKEN)

    # Done with wit for this request; short pause before continuing.
    wit.close()
    time.sleep(0.1)

    # Extract the intent name from the response.
    inp = InputHandler(response)
    task = inp.getIntent()

    # Dispatch on the intent. The first three handlers return iterables of
    # output lines that are printed one by one; the power handler is simply
    # asked to perform its action.
    if(task == 'time_places'):
        handle = TimeHandler(inp.getValue())
        for x in handle.lookup():
            print(x)
    elif(task == 'weather_in'):
        handle = WeatherHandler(inp.getValue())
        for x in handle.printData():
            print(x)
    elif(task == 'check_time'):
        handle = TimeCheckHandler(inp.getValue())
        for x in handle.Result():
            print(x)
    elif(task == 'computer_power'):
        handle = PowerHandler(inp.getValue())
        handle.perform()
    else:
        # Unknown intent: report it instead of failing silently.
        print("ERR: task not recognized [" + task + "]")