Example #1
 def __geo_to_dict(self, filename, stopwords):
     counter = 0
     id_to_geotok = dict()
     tok = Tokenizer(preserve_case=False)
     geo_functions = geo.GeoFunctions()
     with codecs.open(filename, 'r', "utf-8") as json_file:
         for line in json_file:
             try:
                 json_data = json.loads(line, 'utf-8')
                 tweet_id = json_data['id']
                 tweet = json_data['text']
                 coordinates = json_data['geo']['coordinates']
                 region = geo_functions.get_region((float(coordinates[0]),float(coordinates[1])))
                 # Remove stopwords
                 if region != -1:
                     tokenized_tweet = tok.tokenize(tweet)
                     id_to_geotok[tweet_id] = (
                         [token for token in tokenized_tweet if token not in stopwords],
                         region)
                 counter += 1
                 # if counter % 1000 == 0:
                     # sys.stdout.write('- ')
             except:
                 # Skip lines that fail to parse or lack usable geo data
                 pass
     return id_to_geotok
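
For reference, a minimal sketch of the kind of per-line JSON this method expects; the field names ('id', 'text', 'geo' -> 'coordinates') are the ones read above, while the concrete values are invented for illustration.

import json

# Illustrative input line; only 'id', 'text' and 'geo'->'coordinates' are read by the method above
sample_line = '{"id": 123, "text": "hello from berlin", "geo": {"coordinates": [52.52, 13.40]}}'
json_data = json.loads(sample_line)
print json_data['id'], json_data['geo']['coordinates'], json_data['text']
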
Example #2
 def __classify_tweet(self,tweet_text):
     tweet_vector = array([0.0,0.0,0.0,0.0,0.0,0.0,0.0])
     tok = Tokenizer(preserve_case=False)
     for token in tok.tokenize(tweet_text):
         if token in self.__wv:
             tweet_vector += self.__wv[token]
     if self.__cosine_sim(tweet_vector, self.__average_distribution) > self.__sim_threshold:
         return None
     tweet_vector_normalized = self.__normalize_len(tweet_vector)
     tweet_vector_diff = tweet_vector_normalized - self.__average_distribution
     return tweet_vector_diff
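
The helpers __cosine_sim and __normalize_len are not included in this example; a plausible minimal sketch of what they compute (standard cosine similarity and L2 normalization over numpy arrays) is:

from numpy import dot
from numpy.linalg import norm

def cosine_sim(a, b):
    # Cosine similarity of two vectors; returns 0.0 when either vector is all zeros
    denom = norm(a) * norm(b)
    return dot(a, b) / denom if denom > 0 else 0.0

def normalize_len(v):
    # Scale a vector to unit (L2) length
    length = norm(v)
    return v / length if length > 0 else v
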
class DataSampler():
  
  def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
    
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      record = filter(lambda x:x.strip(), record)
      try:
        self.__data.append(tuple(record))
        tempDD[record[3]].append(index)
        index+=1
      except:
        pass
        ##print record
        ##sys.exit()
    for key, value in tempDD.iteritems():
      if key.find("Talk")>=0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read "+str(index)+" records\n")
        
  def _tokenize(self, text):
    text =  text.strip()
    text = re.sub('[\s\n]+',' ', text)
    return self._tok.tokenize(text)
  
  def communityWiseSample(self):
    numPosts = 1000
    for key in self.__commWiseIndices.iterkeys():
      self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)
      
  def prepareOutput(self, outputFile):
    outputFile = open(outputFile,'w')
    csvWriter = csv.writer(outputFile)
    for key in self.__commWiseSampleIndices.iterkeys():
      for index in self.__commWiseSampleIndices[key]:
        #tokens = self._tokenize(self.__data[index][1])
        csvWriter.writerow(self.__data[index])
    outputFile.close()
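
A minimal usage sketch for the class above; the file names are illustrative and the input CSV is assumed to have the community name in the fourth column (record[3]), as __read expects:

if __name__ == '__main__':
    sampler = DataSampler('posts.csv')          # illustrative input path
    sampler.communityWiseSample()               # 1000 random posts per "Talk" community
    sampler.prepareOutput('sampled_posts.csv')  # illustrative output path
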
Example #4
 def __jsons_to_dict(self, tweet_file, stopwords):
     counter = 0
     id_to_tok = dict()
     tok = Tokenizer(preserve_case=False)
     with codecs.open(tweet_file, 'r', "utf-8") as json_file:
         for line in json_file:
             try:
                 json_data = json.loads(line, 'utf-8')
                 tweet = json_data['text']
                 tweet_id = json_data['id']
                 tokenized_tweet = tok.tokenize(tweet)
                 # Remove stopwords
                 id_to_tok[tweet_id] = [token for token in tokenized_tweet if token not in stopwords]
                 counter += 1
                 # if counter % 1000 == 0:
                     # sys.stdout.write('+ ')
             except:
                 # Skip lines that fail to parse
                 pass
     return id_to_tok
Example #5
def read_and_count():
    dictionary = {}
    tweetfolder = '/home/gontrum/april-corpus-raw'

    tok = Tokenizer(preserve_case=False)
    
    for tweetfile in [name for name in os.listdir(tweetfolder) if name.startswith('tweets')]:
        tweetfile = os.path.join(tweetfolder, tweetfile)
        with open(tweetfile, 'r') as f:
            for line in f:
                try:
                    tw = json.loads(line, 'latin1')['text']
                except:
                    # Skip lines that cannot be parsed
                    continue
                for each in tok.tokenize(tw):
                    dictionary[each] = dictionary.get(each, 0) + 1

    return dictionary
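
A small usage sketch; it assumes the hard-coded tweet folder above exists and simply prints the ten most frequent tokens:

if __name__ == '__main__':
    counts = read_and_count()
    top_tokens = sorted(counts.iteritems(), key=lambda kv: kv[1], reverse=True)[:10]
    for token, freq in top_tokens:
        print token, freq
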
def GetWordDictionary(filePath):
    csv.field_size_limit(sys.maxsize)
    ifile = open(filePath, "rb")
    reader = csv.reader(ifile)
    word_dictionary = {}
    tok = Tokenizer(preserve_case=False)
    for row in reader:
        tokens = []
        try:
            tokens = tok.tokenize(row[3])
        except Exception, e:
            print e
        for token in tokens:
            # Count every token occurrence across all rows
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    ifile.close()
    return word_dictionary
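
A usage sketch, assuming the input CSV holds the tweet text in the fourth column (row[3]) as the function expects; the path is illustrative:

if __name__ == '__main__':
    word_counts = GetWordDictionary('tweets.csv')
    print len(word_counts), "distinct tokens"
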
def main():
    args = parseArgs()

    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.data_dir[-1] != '/':
        args.data_dir+='/'

    uri = "mongodb://%s:%d/%s"%(args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s"%uri)
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s"%uri)

    files = glob.glob(args.data_dir+'*.data')
    for file in files:
        logger.info("reading %s"%file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s"%(len(tweets),file))
        if len(tweets)>0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)

    logger.info("done.")
def NaiveBesianClassifer(positive_word_frequency,
                         negative_words_frequency,
                         count_pos_words,
                         count_neg_words,
                         tweet,
                         class_pos_prob,
                         class_neg_prob):
    tok = Tokenizer(preserve_case=False)   
    tokens=tok.tokenize(tweet)
    positiveClassProb=1.00
    negativeClassProb=1.00    
            
    for token in tokens:
        positiveClassProb=positiveClassProb*LaplaceSmoothingValue(token,positive_word_frequency,count_pos_words)
        negativeClassProb=negativeClassProb*LaplaceSmoothingValue(token,negative_words_frequency,count_neg_words)
    positiveClassProb=positiveClassProb*class_pos_prob
    negativeClassProb=negativeClassProb*class_neg_prob            
    
    if positiveClassProb >= negativeClassProb:
        print positiveClassProb, 1
        return positiveClassProb, 1
    else:
        print negativeClassProb, -1
        return negativeClassProb, -1
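
LaplaceSmoothingValue is referenced above but not shown; a plausible sketch of add-one (Laplace) smoothing, assuming the vocabulary size is simply folded into the denominator, would be:

VOCABULARY_SIZE = 10000  # illustrative; in practice this comes from the training data

def LaplaceSmoothingValue(token, word_frequency, count_words):
    # Add-one smoothed estimate of P(token | class)
    return (word_frequency.get(token, 0) + 1.0) / (count_words + VOCABULARY_SIZE)
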
class FakeMatcher:
  def __init__(self):
    self.posts = []
    self.userwiseThreads = dd(set)
    self.userwisePosts = dd(set) # Stores indices
    self.threads = dd(list)
    self.userNames = {}
    self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
    self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
    self.tok = Tokenizer()
    self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
    self.fakeUsers = {} # Stores the postId of the previous fake annotation we did
    
  def loadData(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    postIndex = 0
    for line in reader:
      self.posts.append(line)
      thread = line[3]
      user = line[1]
      username = line[0]
      self.userNames[user] = ' '.join(self.tok.tokenize(username))
      self.threads[thread].append(postIndex)
      self.userwiseThreads[user].add(thread)  
      self.userwisePosts[user].add(postIndex)
      postIndex += 1
  
  def loadFakeUsers(self, fakeAnnotation):
    fakeAnnotation = csv.reader(open(fakeAnnotation))
    for line in fakeAnnotation:
      try:
        dummy = int(line[1])
        dummy = int(line[2])
      except:
        continue
      self.fakeUsers[line[1]] = int(line[2])
  
  def filterUsers(self):
    allUsers = self.userwisePosts.keys()
    for user in allUsers:
      if user not in self.fakeUsers.iterkeys():
        del self.userwisePosts[user]
        del self.userwiseThreads[user]
        del self.userNames[user]
  
  def hasFake(self, postId):
    postText = self.posts[postId][4]
    return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)
  
  def printFakeUsers(self, fakersDir):
    for user in self.fakeUsers:
      fakePostIds = []
      for thread in self.userwiseThreads[user]:
        for postIndex in self.threads[thread]:
          if self.hasFake(postIndex):
            fakePostIds.append(postIndex)
      fakePostIds = sorted(fakePostIds, cmp=lambda x, y:int(self.posts[x][2]) - int(self.posts[y][2]))
      #print user, self.posts[fakePostIds[0]][2], self.fakeUsers[user]
      if len(fakePostIds) > 0 and self.posts[fakePostIds[0]][2] != str(self.fakeUsers[user]):
        #self.printPosts(user, fakePostIds, fakersDir)
        dummy = 1
      else:
        print user
  
  def printPosts(self, user, fakePostIds, fakersDir):
    fakersFile = open(fakersDir + user, 'w', 1)
    for postIndex in fakePostIds:
      postId = self.posts[postIndex][2]
      postBody = self.posts[postIndex][4]
      fakersFile.write(postId + '\t' + postBody + '\n')
    fakersFile.close()
  
  def sanityCheck(self):
    print "Posts:", len(self.posts)
    print "Users:", len(self.userwiseThreads)
    print "Fake users:", len(self.fakeUsers)
    for user in self.fakeUsers:
      if user not in self.userwiseThreads.iterkeys():
        print user
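
A usage sketch for the workflow implied by the methods above; all file and directory names are illustrative:

if __name__ == '__main__':
    matcher = FakeMatcher()
    matcher.loadData('posts.csv')
    matcher.loadFakeUsers('fake_annotations.csv')
    matcher.filterUsers()
    matcher.sanityCheck()
    matcher.printFakeUsers('fakers/')
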
class FakeMatcher:
  def __init__(self):
    self.posts = []
    self.userwiseThreads = dd(lambda:dd(lambda:-1))
    self.userwisePosts = dd(set) # Stores indices
    self.userLastPost = dd(lambda:-1)
    self.threads = dd(list)
    self.userStart = dd(lambda:5000)
    self.userNames = {}
    self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
    self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
    self.tok = Tokenizer()
    self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
    
  def loadData(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    postIndex = 0
    for line in reader:
      self.posts.append(line)
      thread = line[3]
      user = line[1]
      username = line[0]
      self.userNames[user] = ' '.join(self.tok.tokenize(username))
      self.threads[thread].append(postIndex)
      if self.userwiseThreads[user][thread] < 0 or self.userwiseThreads[user][thread] > postIndex:  
        self.userwiseThreads[user][thread] = postIndex
      self.userwisePosts[user].add(postIndex)
      days = int(line[8])
      if self.userLastPost[user] < days:
        self.userLastPost[user] = days
      if self.userStart[user] > int(line[8]):
        self.userStart[user] = int(line[8])
      postIndex += 1
    self.sortThreads()
  
  def sortThreads(self):
    for thread in self.threads.iterkeys():
      self.threads[thread] = sorted(self.threads[thread], cmp=lambda x, y:x - y)
  
  def filterUsers(self):
    allUsers = self.userwisePosts.keys()
    for user in allUsers:
      if len(self.userwisePosts[user]) < 20 or len(self.userwisePosts[user]) > 150 or (self.userStart[user] - self.userLastPost[user]) > 120:
        del self.userwisePosts[user]
        del self.userwiseThreads[user]
        del self.userNames[user]
  
  def hasFake(self, postId):
    postText = self.posts[postId][4]
    #if postText.find(" you a fake ")>=0:
    #  print postText
    return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)
  
  def printFakePosts(self, logFile):
    logFile = open(logFile, 'w')
    index = 0
    for post in self.posts:
      if self.hasFake(index):
        logFile.write('\t'.join(post[:5]) + '\n')
      index += 1 
  
  def printFakeUsers(self, fakersFile):
    fakersFile = open(fakersFile, 'w', 1)
    for user in self.userwiseThreads.iterkeys():
      fakePostCount = 0
      fakePostIds = set()
      for thread in self.userwiseThreads[user].iterkeys():
        userFirstPost = self.userwiseThreads[user][thread]
        postIndex = self.threads[thread].index(userFirstPost) + 1
        while postIndex < len(self.threads[thread]):
          postId = self.threads[thread][postIndex]
          if self.hasFake(postId):
            #print 'here'
            fakePostCount += 1
            fakePostIds.add(postId)
          postIndex += 1
      if fakePostCount > 5:
        fakersFile.write(user + '\t' + ' '.join(map(lambda x:str(x), list(fakePostIds))) + '\n')
    fakersFile.close()
  
  def makeRECompatible(self, userName):
    for char in self.badChars:
      if char != '\\':
        userName = userName.replace(char, "\\" + char)
    return userName
  
  def bigRESearch(self, logFile):
    logFile = open(logFile, 'w', 1)
    bigUserName = "\\b("  # opening group for the user-name alternation assembled below
    for userName in self.userNames.itervalues():
      if userName in ["dat n***a", "bitch"]:
        continue
      if userName.strip() != "":
        if self.considerUserName(userName): 
          userName = self.makeRECompatible(userName)
          bigUserName += userName + " |"
    bigUserName = bigUserName[:-1] + ")"
    bigUserName += "(is )(a )?(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)"
    print len(bigUserName)
    print bigUserName
    P = re.compile(bigUserName)
    #sampleText = "i wanna see wat dat n***a about but i aint gonna fite him im on parole . but dat n***a fake so i dont even matter"
    #while 1:
    #  sampleText = raw_input("Enter the text: ")
    #  if sampleText == 'exit':
    #    break
    #  print "Full match:",P.search(sampleText).group(), " username match:",P.search(sampleText).group(1)
    for post in self.posts:
      text = post[4]
      if P.search(text) != None:
        logFile.write('\t'.join(post[:5]) + '\n')
    logFile.close()
  
  def printNonChars(self):
    nonChars = set()
    for userName in self.userNames.itervalues():
      userName = userName.lower()
      for char in userName:
        if ord(char) >= 32 and ord(char) <= 126 and (ord(char) < 97 or ord(char) > 122) and ord(char) not in range(48, 58):
          nonChars.add(char)
    print "Users:", len(self.userNames)
    print nonChars
  
  def contentToLookAt(self):
    uniqThreads = set()
    uniqPosts = set()
    for userId in self.userNames.iterkeys():
      for thread in self.userwiseThreads[userId]:
        uniqThreads.add(thread)
        for post in self.threads[thread]:
          uniqPosts.add(post)
    print "Users to look at:", len(self.userNames)
    print "Unique threads to look at:", len(uniqThreads)
    print "Unique posts to look at:", len(uniqPosts)
  
  def isAllLetters(self, userName):
    for char in userName:
      if ord(char) < 97 or ord(char) > 122:
        return False
    return True
  
  def considerUserName(self, userName):
    for char in userName:
      o = ord(char)
      if o < 32 or o > 126:
        return False
    return True
  
  def matchUserNamesInPosts(self, logFile):
    logFile = open(logFile, 'w', 1)
    for userId in self.userNames.iterkeys():
      userName = self.userNames[userId]
      if not self.isAllLetters(userName):
        continue
      for post in self.posts:
        if post[4].find(userName) >= 0:
          logFile.write(str(userId) + '\t' + userName + '\t' + post[4] + '\n')
    logFile.close()
class DataHandler:
  def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSplitIndices = {}
    self.__communutyWiseVocab = dd(lambda:dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self.__userWiseTimeSplitIndices = {}
    self.__timeWiseUserSplitIndices = dd(lambda:dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda:-1)
    self.timeHandler = TimeHandler()
    self.sampledUsers = set()
    self.activeForums = {}
    self.activeUsersInForums = dd(set)
    
    ## Processing/dealing with data
    #self.__read(dataFile)
    self.__justRead(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__userWiseTimeSplit()
    #self.__timeWiseUserSplit()
    #self.__commWiseTimeSplit()
    
    ## Extra data structures
    self.postingFreq = dd(int)
  
  def printMonthlyDataForUser(self, user, outFile):
    userTimeIndices = self.__userWiseTimeSplitIndices[user]
    for month in userTimeIndices.iterkeys():
      f = csv.writer(open(outFile+"."+str(month),"w"))
      for index in userTimeIndices[month]:
        f.writerow(self.__data[index])
  
  def tokenizeRecord(self, record):
    record = list(copy.deepcopy(record))
    #print record
    try:
      text = record[1]
      tokenizedText = ' '.join(self._tokenize(text))
      record[1] = tokenizedText
      #print tokenizedText
      return record
    except:
      return -1
  
  def getTokenizedCSV(self):
    tokenizedRecords = []
    for index in range(len(self.__data)):
      newRecord = self.tokenizeRecord(self.__data[index])
      if newRecord != -1:
        tokenizedRecords.append(newRecord)
    return tokenizedRecords
 
  def getBasicUserMonthRecord(self, user, month):
    record = []
    record.append(user)
    record.append(month)
    record.append(self.activeForums[user])
    record.append([])
    return record

  def getTokenizedUserMonthCSV(self):
    tokenizedRecords = dd(lambda:dd(list))
    for user in self.__userWiseTimeSplitIndices.iterkeys():
      for month in self.__userWiseTimeSplitIndices[user].iterkeys():
        for index in self.__userWiseTimeSplitIndices[user][month]:
          newRecord = self.tokenizeRecord(self.__data[index])
          if newRecord != -1:
            tokenizedRecords[user][month].append(newRecord[1]) ## Only postBody being given!
    return tokenizedRecords
  
  def getTokenizedUserMonthForumCSV(self):
    tokenizedRecords = dd(lambda:dd(lambda:dd(list)))
    for user in self.__userWiseTimeSplitIndices.iterkeys():
      for month in self.__userWiseTimeSplitIndices[user].iterkeys():
        for index in self.__userWiseTimeSplitIndices[user][month]:
          newRecord = self.tokenizeRecord(self.__data[index])
          if newRecord != -1:
            forum = newRecord[3]
            tokenizedRecords[user][month][forum].append(newRecord[1]) ## Only postBody being given!
    return tokenizedRecords
  
  
  def getPost2Month(self):
    post2Month = {}
    for user in self.__userWiseTimeSplitIndices.iterkeys():
      for month in self.__userWiseTimeSplitIndices[user].iterkeys():
        for index in self.__userWiseTimeSplitIndices[user][month]:
          postId = self.__data[index][0]
          post2Month[postId] = month
    return copy.deepcopy(post2Month)
  
  def getDoc2Post(self):
    doc2Post = {}
    for index in range(len(self.__data)):
      doc2Post[index+1] = self.__data[index][0]
    return copy.deepcopy(doc2Post)
    
  def getPost2User(self):
    post2User = {}
    for user in self.__userWiseIndices.iterkeys():
      for index in self.__userWiseIndices[user]:
        postId = self.__data[index][0]
        post2User[postId] = user
    return copy.deepcopy(post2User)
    
  def getPostingFreq(self):
    self.postingFreq = dd(int)
    for user in self.__userWiseIndices.iterkeys():
      self.postingFreq[len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10] += 1
    return copy.deepcopy(self.postingFreq)
  
  def getCumulativePostingFreq(self):
    sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n")    
    self.postingFreq = dd(int)
    for user in self.__userWiseIndices.iterkeys():
      userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10
      for num in range(0,userPosts+1,10):
        self.postingFreq[num] += 1
    return copy.deepcopy(self.postingFreq)
  
  def getCutoffPostingFreq(self):
    totalPosts = 0
    cdfFreqPosting = dd(int)
    for user in self.__userWiseIndices.iterkeys():
      userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10
      totalPosts += userPosts
      for num in range(0,userPosts+1,10):
        cdfFreqPosting[num] += userPosts
    for num in cdfFreqPosting.iterkeys():
      cdfFreqPosting[num] = round(cdfFreqPosting[num]*100.0/float(totalPosts),2)
    sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n")    
    sys.stderr.write("Total Posts:"+str(totalPosts)+"\n")
    return copy.deepcopy(cdfFreqPosting)
  
  def getMonthwisePostingFrequency(self):
    timeWisePostedUsers = dd(int)
    for time in self.__timeWiseUserSplitIndices.iterkeys():
      timeWisePostedUsers[time] = len(self.__timeWiseUserSplitIndices[time])
    return copy.deepcopy(timeWisePostedUsers)
  
  def getMonthwiseBinnedPostingFrequency(self):
    timeWisePostedUsers = dd(int)
    for time in self.__timeWiseUserSplitIndices.iterkeys():
      userWiseIndices = self.__timeWiseUserSplitIndices[time]
      postingFreq = dd(int)
      for user in userWiseIndices.iterkeys():
        userPosts = len(self.__userWiseIndices[user])
        for num in range(0,userPosts+1):
          postingFreq[num] += 1
      timeWisePostedUsers[time] = copy.deepcopy(postingFreq)
    return copy.deepcopy(timeWisePostedUsers)
  
  def getBasicTable(self):
    table = []
    for user in self.__userWiseTimeSplitIndices.iterkeys():
      userSubtable = []
      for month in self.__userWiseTimeSplitIndices[user].iterkeys():
        try:
          activeForum = self.activeForums[user]
          if activeForum == 'NULL':
            continue
          if int(month) >100:
            continue
          content = (user, month, len(self.__userWiseTimeSplitIndices[user][month]), self.activeForums[user])
          userSubtable.append(content)
        except:
          pass
      if len(userSubtable) >= 3:
        table.extend(userSubtable)
    return table
  
  def totalPostsByUsers(self):
    total = 0
    for user in self.__userWiseIndices.iterkeys():
      total += len(self.__userWiseIndices[user])
    return total
  
  def getTopPosterCoverage(self):
    totalPosts = self.totalPostsByUsers()
    postsTillTopN = 0
    
    
  def __loadUsersJoins(self, usersData):
    dataFile = open(usersData)
    for line in dataFile:
      line = line.strip().split('\t')
      self.__userJoins[line[0]] = line[1] ## Correct the indices
    sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

  def loadActiveForums(self, activeForums):
    for line in csv.reader(open(activeForums)):
      try:
        self.activeForums[line[0]] = line[1]
        self.activeUsersInForums[line[1]].add(line[0])
      except:
        pass
  
  def __validUserId(self, userId):
    try:
      userId = int(userId)
      assert userId >= 1 and userId <= 45037
      return True
    except:
      return False

  def __splitUserWise(self):
    tempDD = dd(list)
    for index in range(len(self.__data)):
      try:
        user = self.__data[index][5]
      except:
        continue
      if not self.__validUserId(user):
        continue
      tempDD[user].append(index)
    for user in tempDD.iterkeys():
      self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
    del tempDD

  def __userWiseTimeSplit(self):
    for user in self.__userWiseIndices.iterkeys():
      self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user])
  
  def __timeWiseUserSplit(self):
    for user in self.__userWiseIndices.iterkeys():
      timeDividedUserData = self.divideBasedOnMonths(self.__userWiseIndices[user])
      for time in timeDividedUserData.iterkeys():
        self.__timeWiseUserSplitIndices[time][user] = timeDividedUserData[time]
    return copy.deepcopy(self.__timeWiseUserSplitIndices)
  
  def __commWiseTimeSplit(self):
    for comm in self.__commWiseIndices.iterkeys():
      self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm])
  
  def __justRead(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    for record in csvReader:
      #self.__data.append(tuple(record[1:]))
      self.__data.append(tuple(record))
  
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      try:
        succ = self.__updateVocab(record)
        if succ:
          self.__data.append(tuple(record))
          tempDD[record[3]].append(index)
          tempDD['AllTalk'].append(index)
          self.__users.add(record[5])
        index += 1
      except:
        pass
    for key, value in tempDD.iteritems():
      if key.find("Talk") >= 0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read " + str(index) + " records\n")
    sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
    sys.stderr.write("Users: " + str(len(self.__users)) + "\n")
    
  def _tokenize(self, text):
    text = text.strip()
    text = re.sub('[\s\n]+', ' ', text)
    return self._tok.tokenize(text)
  
  def freqVector(self, tokens):
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector
  
  def __updateVocab(self, record):
    if len(record)!=7:
      return
    comm = record[3]
    if comm.find('Talk') < 0:
      return 0
    text = record[1]
    if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
      return 0
    tokenDict = self.freqVector(self._tokenize(text))
    for word, freq in tokenDict.iteritems():
      self.__vocab[word] += freq
      self.__communutyWiseVocab[comm][word] += freq
      self.__vocabDocCount[word] += 1 
    return 1
    ##print self.__vocab
  
  def preprocessVocab(self, stopWords):
    self.__backGround = {}
    totalVocab = self.__vocab.keys()
    for word in totalVocab:
      freq = self.__vocab[word]
      if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
        self.__backGround[word] = freq
      else:
        del self.__vocab[word]
    for comm in self.__communutyWiseVocab.iterkeys():
      commVocab = self.__communutyWiseVocab[comm].keys()
      for word in commVocab:
        if word in self.__vocab:
          continue
        del self.__communutyWiseVocab[comm][word]
    sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

  def getAllUsers(self):
    return copy.deepcopy(self.__users)

  def userStats(self, outFile):
    outFile = open(outFile,'w')
    for user in self.__userWiseIndices.iterkeys():
      userDataIndices = self.__userWiseIndices[user]
      timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices)
      outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n')
    outFile.close()

  def getUserDataIndices(self, user):
    userDataIndices = []
    for index in range(len(self.__data)):
      userDataIndices.append(index)
    return copy.deepcopy(userDataIndices)
  
  def divideBasedOnMonths(self, data):
    timeDividedIndices = dd(list)
    for index in data:
      timeDiff  = -1
      try:
        timeDiff = self.__timeDiff(index)
      except:
        continue
      if timeDiff >= 0:
        timeDividedIndices[timeDiff].append(index)
      #else:
      #  print timeDiff
    return copy.deepcopy(timeDividedIndices)
    
  def __timeDiff(self, recordIndex):
    #try:
      #print recordIndex
      record = self.__data[recordIndex]
      postTime = str(record[4])
      user = str(record[5])
      userJoin = self.__userJoins[user]
      return self.timeHandler.diffMonths(postTime, userJoin)
    #except:
    #  return -1
  
  def makeDist(self, data):
    totalWords = 0
    dist = dd(lambda:1)
    for text in data: ## I just expect an array of texts, not the entire records
      tokenDict = self.freqVector(self._tokenize(text))
      for word, freq in tokenDict.iteritems():
        if word in self.__vocab:
          dist[word] += freq
          totalWords += freq
    for word in self.__vocab:
      dist[word] += 0
    totalWords += len(self.__vocab)
    for word in self.__vocab:
      dist[word] /= float(totalWords)
      ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!!
    #assert self.isValid(dist)
    return dist

  def isValid(self, dist):
    sumProb = 0
    for x in dist.iterkeys():
      sumProb += dist[x]
    print sumProb
    return True

  def sampleUsers(self):
    US = userSampling(self.__userWiseTimeSplitIndices)
    self.sampledUsers = US.finalizeUsers()
    self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices)
    return copy.deepcopy(self.sampledUsers)

  def getUserMonths(self, user):
    months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys())
    for i in range(1,4):
      try:
        months.remove(i)
      except:
        pass
    for i in range(25,31):
      try:
        months.remove(i)
      except:
        pass
    return months

  def getUserDataForDivergence(self, user, month):
    return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]]

  def getUserInitialData(self, user):
    data = []
    for month in range(1,4):
      try:
        for index in self.__userWiseTimeSplitIndices[user][month]:
          data.append(self.__data[index][1])
      except:
        pass
    return data

  def getUserMaturedData(self, user):
    data = []
    for month in range(25,31):
      try:
        for index in self.__userWiseTimeSplitIndices[user][month]:
          data.append(self.__data[index][1])
      except:
        pass
    return data

  def getActiveForum(self, userNum):
    return self.activeForums[userNum]

  def getForumInitialData(self, comm):
    #assert comm in self.__commWiseIndices
    data = []
    #for user in self.__users:
    for user in self.activeUsersInForums[comm]:
      for month in range(1,4):
        try:
          for index in self.__userWiseTimeSplitIndices[user][month]:
            data.append(self.__data[index][1])
        except:
          pass
    return data

  def getForumMaturedData(self, comm):
    #assert comm in self.__commWiseIndices
    data = []
    #for user in self.__users:
    for user in self.activeUsersInForums[comm]:
      for month in range(25,31):
        try:
          for index in self.__userWiseTimeSplitIndices[user][month]:
            data.append(self.__data[index][1])
        except:
          pass
    return data
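
A usage sketch for DataHandler; the file names are illustrative, and loadActiveForums is assumed to take a CSV mapping user id to the user's most active forum, as the code above reads it:

if __name__ == '__main__':
    handler = DataHandler('posts.csv', 'user_joins.tsv')
    handler.loadActiveForums('active_forums.csv')
    table = handler.getBasicTable()
    print len(table), "user-month rows"
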
def main():
    args = parseArgs()

    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir+='/'

    if args.database:
        database = args.database
    else:
        database = "snapshot_"+args.snapshot_dir.split('/')[-2]
    uri = "mongodb://%s:%d/%s"%(args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s"%uri)
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s"%uri)


    files = glob.glob(args.snapshot_dir+'*.data')
    for file in files:
        logger.info("reading %s"%file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s"%(len(tweets),file))
        if len(tweets)>0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)

    logger.info("Loading users from %susers.db"%args.snapshot_dir)
    connection = sqlite3.connect("%susers.db"%args.snapshot_dir)
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    logger.info('fetching users')
    cursor.execute('SELECT id,friends FROM users where friends is not NULL')
    users = cursor.fetchall()
    logger.info('%d users fetched'%len(users))
    bulk_size=25000
    nUsersInserted=0
    usersToBeInserted=[]
    for user in users:
        id = user['id']
        friends = ujson.loads(user['friends'])
        usersToBeInserted.append({'id':id, 'friends':friends})
        if len(usersToBeInserted)>=bulk_size:
            client['users'].insert(usersToBeInserted)
            usersToBeInserted=[]
            nUsersInserted+=bulk_size
            logger.info("%d users inserted"%nUsersInserted)
    client['users'].insert(usersToBeInserted)
    logger.info("all users inserted.")

    logger.info("done.")
Example #13
class preprocessor:
  def __init__(self, tweetsFile):
    self._tweets = []
    self._tok = Tokenizer(preserve_case=False)
    self.loadTweets(tweetsFile)
    sys.stderr.write("preprocessor instance created\n")
    sys.stderr.write("@ Mentions removed\n")
    
  def anonnimize(self, tweet):
    tweet = tweet.split('\t')[-1] ## Assumption about the format
    tweet = self._tok.tokenize(tweet) ## Tokenization
    anonTweet = []
    for word in tweet:
      if word[0] != '@':
        anonTweet.append(word)
    return anonTweet
    
  def loadTweets(self, tweetsFile):
    for tweet in open(tweetsFile):
      tweet = tweet.strip()
      tokenizedTweet = self.anonnimize(tweet)
      if len(tokenizedTweet) == 0 or ' '.join(tokenizedTweet).strip() == '':
        continue
      self._tweets.append([tweet.split('\t')[0], tweet.split('\t')[1], tokenizedTweet])
      
  def removeRetweets(self):
    newTweets = []
    for tweet in self._tweets:
      flag = 0
      for word in tweet[2]:
        if word[:2] == 'rt':
          flag = 1
          break  
      if flag == 0:
        newTweets.append(tweet)
    self._tweets = [t for t in newTweets]
    sys.stderr.write("Retweets removed\n")
    
  def filterAuthors(self):
    authorDict = dd(int)
    for tweet in self._tweets:
      authorDict[tweet[0]] += 1
    
    filteredAuthors = []
    for auth,tweets in authorDict.iteritems():
      if tweets >= 50:
        filteredAuthors.append(auth)
    filteredAuthors = set(filteredAuthors)
    filteredTweets = []
    for tweet in self._tweets:
      if tweet[0] in filteredAuthors:
        filteredTweets.append(tweet)
    
    self._tweets = [t for t in filteredTweets]
    
  def authorStats(self):
    authorDict = dd(int)
    for tweet in self._tweets:
      authorDict[tweet[0]] += 1
      
    numDict = dd(int)
    for auth, numTweets in authorDict.iteritems():
      numDict[numTweets-(numTweets%10)] += 1
      
    self.drawGraph(numDict)
    
  def drawGraph(self, authorDict):
    #try:
      authors = [x for x in authorDict.iterkeys()]
      authors = sorted(authors, cmp=lambda x,y:x-y)
      numTweets = [authorDict[x] for x in authors]
      width = 0.2
      fig = plt.figure()
      ax = fig.add_subplot(111)
      # bar chart of the data
      rects = ax.bar(np.arange(len(authors)), numTweets, width, color='r')
      ax.set_xlabel('User')
      ax.set_ylabel('Number of tweets')
      ax.set_xticks(np.arange(len(authors))+width/2)
      ax.set_xticklabels( map(lambda x:str(x), authors))
      
      def autolabel(rects):
        # attach some text labels
        for rect in rects:
            height = rect.get_height()
            ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height),
                    ha='center', va='bottom')
      
      autolabel(rects)
      plt.savefig("/usr0/home/pgadde/Work/Ethnic/AAEness/Data/RealTweets/PreProcessing/aaeAuthorTweets.png")
      plt.show()
      
  def printInFile(self, output, label):
    output = open(output,'w')
    for tweet in self._tweets:
      tweet[2].insert(0,label)
      try:
        output.write(tweet[0]+"\t"+tweet[1]+"\t"+"\t".join(tweet[2])+"\n")
      except UnicodeEncodeError:
        pass
    output.close()
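
A usage sketch for the preprocessor class; the input file is assumed to be tab-separated with the author and tweet ids in the first two fields and the tweet text last, as loadTweets expects, and both the output path and the label string are illustrative:

if __name__ == '__main__':
    prep = preprocessor('tweets.tsv')
    prep.removeRetweets()
    prep.filterAuthors()                           # keep authors with >= 50 tweets
    prep.printInFile('tweets.labeled.tsv', 'AAE')  # label string is illustrative
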
class DataHandler:
  def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSplitIndices = {}
    self.__communutyWiseVocab = dd(lambda:dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self.__userWiseTimeSplitIndices = {}
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda:-1)
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.timeHandler = TimeHandler()
    self.__userWiseTimeSplit()
    #self.__commWiseTimeSplit()
    self.sampledUsers = set()
    self.activeForums = {}
    self.activeUsersInForums = dd(set)
    
  def __loadUsersJoins(self, usersData):
    dataFile = open(usersData)
    for line in dataFile:
      line = line.strip().split('\t')
      self.__userJoins[line[0]] = line[1] ## Correct the indices
    sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

  def loadActiveForums(self, activeForums):
    for line in open(activeForums):
      line = line.strip().split("\t")
      try:
        self.activeForums[line[0]] = line[1]
        self.activeUsersInForums[line[1]].add(line[0])
      except:
        pass
  
  def __validUserId(self, userId):
    try:
      userId = int(userId)
      assert userId >= 1 and userId <= 45037
      return True
    except:
      return False

  def __splitUserWise(self):
    tempDD = dd(list)
    for index in range(len(self.__data)):
      user = self.__data[index][5]
      if not self.__validUserId(user):
        continue
      tempDD[user].append(index)
    for user in tempDD.iterkeys():
      self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
    del tempDD

  def __userWiseTimeSplit(self):
    for user in self.__userWiseIndices.iterkeys():
      self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user])
  
  def __commWiseTimeSplit(self):
    for comm in self.__commWiseIndices.iterkeys():
      self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm])
  
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      try:
        succ = self.__updateVocab(record)
        if succ:
          self.__data.append(tuple(record))
          tempDD[record[3]].append(index)
          tempDD['AllTalk'].append(index)
          self.__users.add(record[5])
        index += 1
      except:
        pass
    for key, value in tempDD.iteritems():
      if key.find("Talk") >= 0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read " + str(index) + " records\n")
    sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
    sys.stderr.write("Users: " + str(len(self.__users)) + "\n")
    
  def _tokenize(self, text):
    text = text.strip()
    text = re.sub('[\s\n]+', ' ', text)
    return self._tok.tokenize(text)
  
  def freqVector(self, tokens):
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector
  
  def __updateVocab(self, record):
    if len(record)!=7:
      return
    comm = record[3]
    if comm.find('Talk') < 0:
      return 0
    text = record[1]
    if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
      return 0
    tokenDict = self.freqVector(self._tokenize(text))
    for word, freq in tokenDict.iteritems():
      self.__vocab[word] += freq
      self.__communutyWiseVocab[comm][word] += freq
      self.__vocabDocCount[word] += 1 
    return 1
    ##print self.__vocab
  
  def preprocessVocab(self, stopWords):
    self.__backGround = {}
    totalVocab = self.__vocab.keys()
    for word in totalVocab:
      freq = self.__vocab[word]
      if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
        self.__backGround[word] = freq
      else:
        del self.__vocab[word]
    for comm in self.__communutyWiseVocab.iterkeys():
      commVocab = self.__communutyWiseVocab[comm].keys()
      for word in commVocab:
        if word in self.__vocab:
          continue
        del self.__communutyWiseVocab[comm][word]
    sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

  def getAllUsers(self):
    return copy.deepcopy(self.__users)

  def userStats(self, outFile):
    outFile = open(outFile,'w')
    for user in self.__userWiseIndices.iterkeys():
      userDataIndices = self.__userWiseIndices[user]
      timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices)
      outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n')
    outFile.close()

  def getUserDataIndices(self, user):
    userDataIndices = []
    for index in range(len(self.__data)):
      userDataIndices.append(index)
    return copy.deepcopy(userDataIndices)
  
  def divideBasedOnMonths(self, data):
    timeDividedIndices = dd(list)
    for index in data:
      timeDiff = self.__timeDiff(index)
      if timeDiff >= 0:
        timeDividedIndices[timeDiff].append(index)
      #else:
      #  print timeDiff
    return copy.deepcopy(timeDividedIndices)
    
  def __timeDiff(self, recordIndex):
    #try:
      #print recordIndex
      record = self.__data[recordIndex]
      postTime = str(record[4])
      user = str(record[5])
      userJoin = self.__userJoins[user]
      return self.timeHandler.diffMonths(postTime, userJoin)
    #except:
    #  return -1
  
  def makeDist(self, data):
    totalWords = 0
    dist = dd(lambda:1)
    for text in data: ## I just expect an array of texts, not the entire records
      tokenDict = self.freqVector(self._tokenize(text))
      for word, freq in tokenDict.iteritems():
        if word in self.__vocab:
          dist[word] += freq
          totalWords += freq
    for word in self.__vocab:
      dist[word] += 0
    totalWords += len(self.__vocab)
    for word in self.__vocab:
      dist[word] /= float(totalWords)
      ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!!
    #assert self.isValid(dist)
    return dist

  def isValid(self, dist):
    sumProb = 0
    for x in dist.iterkeys():
      sumProb += dist[x]
    print sumProb
    return True

  def sampleUsers(self):
    US = userSampling(self.__userWiseTimeSplitIndices)
    self.sampledUsers = US.finalizeUsers()
    self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices)
    return copy.deepcopy(self.sampledUsers)

  def getUserMonths(self, user):
    months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys())
    for i in range(1,4):
      try:
        months.remove(i)
      except:
        pass
    for i in range(25,31):
      try:
        months.remove(i)
      except:
        pass
    return months

  def getUserDataForDivergence(self, user, month):
    return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]]

  def getUserInitialData(self, user):
    data = []
    for month in range(1,4):
      try:
        for index in self.__userWiseTimeSplitIndices[user][month]:
          data.append(self.__data[index][1])
      except:
        pass
    return data

  def getUserMaturedData(self, user):
    data = []
    for month in range(25,31):
      try:
        for index in self.__userWiseTimeSplitIndices[user][month]:
          data.append(self.__data[index][1])
      except:
        pass
    return data

  def getActiveForum(self, userNum):
    return self.activeForums[userNum]

  def getForumInitialData(self, comm):
    #assert comm in self.__commWiseIndices
    data = []
    #for user in self.__users:
    for user in self.activeUsersInForums[comm]:
      for month in range(1,4):
        try:
          for index in self.__userWiseTimeSplitIndices[user][month]:
            data.append(self.__data[index][1])
        except:
          pass
    return data

  def getForumMaturedData(self, comm):
    #assert comm in self.__commWiseIndices
    data = []
    #for user in self.__users:
    for user in self.activeUsersInForums[comm]:
      for month in range(25,31):
        try:
          for index in self.__userWiseTimeSplitIndices[user][month]:
            data.append(self.__data[index][1])
        except:
          pass
    return data
import glob
import ujson
from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

path_to_data='../data/snapshots/2014-10-20/'
files = glob.glob(path_to_data+'2014-1*.data')

tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger', path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')


for fileName in files:
    print fileName
    file = open(fileName)
    tweets=[ujson.loads(l) for l in file]
    tokenized_tweets= [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i in range(len(tweets)):
        tweets[i]['tagged_tweet'] = tagged_tweets[i]
    output_file_name = fileName[:-5]+'-tagged.data'
    file = open(output_file_name, 'w')
    for tweet in tweets:
        file.write("%s\n" % ujson.dumps(tweet))
    file.close()
class EmpiricalAnalyzer:
    def __init__(self, dataFile):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub("[\s\n]+", " ", text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        comm = record[3]
        if comm.find("Talk") < 0:
            return
        text = record[1]
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1
        ##print self.__vocab

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                succ = self.__updateVocab(record)
                if succ:
                    tempDD[record[3]].append(index)
                index += 1
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")

    def preprocessVocab(self):
        stopWords = [w.strip() for w in open("stopWords")]
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]

        totalWords = 0
        for word, freq in self.__backGround.iteritems():
            totalWords += freq
        for word, freq in self.__backGround.iteritems():
            self.__backGround[word] = self.__backGround[word] / float(totalWords)

        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            totalWords = 0
            for word in commVocab:
                if word in self.__vocab:
                    totalWords += self.__communutyWiseVocab[comm][word]
                    continue
                del self.__communutyWiseVocab[comm][word]
            for word in self.__communutyWiseVocab[comm].iterkeys():
                self.__communutyWiseVocab[comm][word] = self.__communutyWiseVocab[comm][word] / float(totalWords)
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def printTop1000InBack(self, outFile):
        outFile = open(outFile, "w")
        wordFreqs = [(word, freq) for word, freq in self.__backGround.iteritems()]
        wordFreqs = sorted(wordFreqs, cmp=lambda x, y: y[1] - x[1])[:1000]
        for wordFreq in wordFreqs:
            outFile.write(wordFreq[0] + "\n")
        outFile.close()

    def printTop1000(self, D, outFile):
        outFile = open(outFile, "w")
        wordFreqs = [(word, freq) for word, freq in D.iteritems()]
        wordFreqs = sorted(wordFreqs, cmp=myCMP)
        for wordFreq in wordFreqs:
            outFile.write(wordFreq[0] + "\t" + str(wordFreq[1]) + "\n")
        outFile.close()

    def __logOdd(self, word, commFreq):
        try:
            return math.log(commFreq * 1.0 / self.__backGround[word])
        except ZeroDivisionError:
            return 100000
        except ValueError:
            return -100000
        except:
            print word, commFreq, self.__backGround[word]
            sys.exit("Error while calculating logodds")

    def prepareCommunityWiseVocab(self):
        for word in self.__backGround.iterkeys():
            for comm in self.__communutyWiseVocab.iterkeys():
                self.__communutyWiseVocab[comm][word] = self.__logOdd(word, self.__communutyWiseVocab[comm][word])

    def printTopDeviations(self, baseDir):
        backFile = baseDir + "/" + "background"
        self.printTop1000(self.__backGround, backFile)
        for comm in self.__communutyWiseVocab.iterkeys():
            self.printTop1000(self.__communutyWiseVocab[comm], baseDir + "/" + comm.strip().replace(" ", ""))
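
printTop1000 relies on a comparator myCMP that is not part of this example; a plausible sketch that orders (word, value) pairs by descending value (and works for float values, unlike integer subtraction) is:

def myCMP(x, y):
    # cmp-style comparator: sort (word, value) pairs by descending value
    if x[1] > y[1]:
        return -1
    if x[1] < y[1]:
        return 1
    return 0
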
Example #17
class DataSampler():
  def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__commWiseSampleWordFreq = dd(lambda:dd(int))
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
    
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      #print record
      try:
        self.__data.append(tuple(record))
        tempDD[record[3]].append(index)
        index+=1
      except:
        print record
        sys.exit()
    for key, value in tempDD.iteritems():
      if key.find("Talk")>=0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read "+str(index)+" records\n")
    
  def sanityCheck(self):
    self.__printDictSizes(self.__commWiseIndices)
    self.__printDictSizes(self.__commWiseSampleIndices)
    #print filter(lambda x:x[0]=="74526", self.__data)
    
  def __printDictSizes(self, D):
    for key in D.iterkeys():
      sys.stdout.write(key+"\t"+str(len(D[key]))+"\n")
      
  def _tokenize(self, text):
    text =  text.strip()
    text = re.sub('[\s\n]+',' ', text)
    return self._tok.tokenize(text)
  
  def communityWiseSample(self):
    numPosts = 30000
    for key in self.__commWiseIndices.iterkeys():
      self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)
    sys.stderr.write("Sampled "+str(numPosts*5)+"\n")
      
  def freqVector(self, tokens):
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector
  
  def __filterWords(self, backGroundVector):
    lexicon = set()
    for word, freq in backGroundVector.iteritems():
      if freq > 9:
        lexicon.add(word)
    return lexicon
  
  def preparePosts(self, outputFile):
    outputFile = open(outputFile,'w')
    backGroundVector = dd(int)
    for key in self.__commWiseSampleIndices.iterkeys():
      for index in self.__commWiseSampleIndices[key]:
        tokens = self._tokenize(self.__data[index][1])
        freqVector = self.freqVector(tokens)
        for token, freq in freqVector.iteritems():
          backGroundVector[token] += freq
    
    print "Background words:",len(backGroundVector)
    filteredLexicon = self.__filterWords(backGroundVector)
    print "Filtered Words:",len(filteredLexicon)
    ##sys.exit()
    
    for key in self.__commWiseSampleIndices.iterkeys():
      for index in self.__commWiseSampleIndices[key]:
        tokens = self._tokenize(self.__data[index][1])
        freqVector = self.freqVector(tokens)
        words = [x+"$:$:"+str(y) for x,y in freqVector.iteritems() if x in filteredLexicon]
        if len(words) > 0:
          outputFile.write(key+'\t'+'  '.join(words)+'\n')
    outputFile.write('background'+'\t'+'  '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n')
    outputFile.close()

  
  def analyzeLexicon(self, lexicon, background):
    words = [(w,f) for w,f in background.iteritems() if w in lexicon]
    words = sorted(words,cmp=lambda x,y:y[1]-x[1])
    index = 0
    while 1:
      print words[index]
      dummy = raw_input()
      index += 1
  
  def preparePostsSingleDoc(self, outputFile):
    outputFile = open(outputFile,'w')
    backGroundVector = dd(int)
    for key in self.__commWiseSampleIndices.iterkeys():
      for index in self.__commWiseSampleIndices[key]:
        tokens = self._tokenize(self.__data[index][1])
        freqVector = self.freqVector(tokens)
        for token, freq in freqVector.iteritems():
          backGroundVector[token] += freq
    
    print "Background words:",len(backGroundVector)
    filteredLexicon = self.__filterWords(backGroundVector)
    print "Filtered Words:",len(filteredLexicon)
    ##self.analyzeLexicon(filteredLexicon, backGroundVector)
    ##sys.exit()
    
    for key in self.__commWiseSampleIndices.iterkeys():
      globalFreqVector = dd(int)
      for index in self.__commWiseSampleIndices[key]:
        tokens = self._tokenize(self.__data[index][1])
        freqVector = self.freqVector(tokens)
        for word, freq in freqVector.iteritems():
          globalFreqVector[word] += freq 
      words = [x+"$:$:"+str(y) for x,y in globalFreqVector.iteritems() if x in filteredLexicon]
      if len(words) > 0:
        outputFile.write(key+'\t'+'  '.join(words)+'\n')
    outputFile.write('background'+'\t'+'  '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n')
    outputFile.close()
Example #18
0
class ThreadCreator:
  def __init__(self):
    self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup')
    self.tok = Tokenizer()
    
  def __getMaxPage(self, page):
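    # Read the page count from the thread's pagination widget; -1 if it cannot be parsed.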
    soup = BS(open(page).read())
    try:
      pagesTag = int(soup.findAll('a', onclick="jumpto(); return false;")[0].findChildren('strong')[-1].contents[0])
      return pagesTag
    except:
      return -1
  
  def __getPostId(self, postProfile):
    postId = -1
    try:
      postId = int(postProfile.find('dl', {'class':'postprofile'})['id'].split('profile')[1])
    except:
      postId = -1
      pass
    return postId
  
  
  def getSmileyText(self, smileyTag):
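    # Turn a smiley <img> title into a ___token___ placeholder.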
    title = smileyTag["title"]
    title = title.lower()
    title = title.replace(" ", "_")
    title = re.sub("[^a-z_]", "", title)
    return "___" + title + "___"
  
  def constructString(self, contentTag):
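    # Recursively collect the visible text of a post, expanding nested <span>s and smiley images.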
    content = ""
    for cont in contentTag.contents:
      if type(cont) == bs4.element.NavigableString:
        content += " " + cont
        #print cont
      elif type(cont) == bs4.element.Tag and cont.name == 'span':
        content += " " + self.constructString(cont)
      elif type(cont) == bs4.element.Tag and cont.name == 'img' and cont["src"].find("./images/smilies/") == 0:
        content += " " + self.getSmileyText(cont)
    return content
  
  def __getPostBody(self, postBodyTag):
    postBody = ""
    try:
      contentDiv = postBodyTag.find('div', {'class':'content'})
      postBody = self.constructString(contentDiv)
      ##for cont in contentDiv:
      ##  #print cont, type(cont)
      ##  if type(cont) == bs4.element.NavigableString:
      ##    postBody += cont
    except:
      pass
    if postBody == "":
      return "NULL"
    
    return postBody  # bs4 NavigableStrings are already unicode; re-decoding can raise on non-ASCII text
  
  def __getUser(self, postProfile):
    user = -1
    try:
      user = int(postProfile.find('a')['href'].split('u=')[1].split('&')[0])
    except:
      user = -1
      pass
    return user
  
  def __getForum(self, soup):
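    # Read the active forum name from the navigation bar; "NULL" if it is not present.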
    forum = "NULL"
    try:
      forum = soup.find('li', {'class':'nav-forum active'}).find('a').find('span').contents[0]
    except:
      pass
    return forum
  
  def __getPostTime(self, postBody):
    postTime = "NULL"
    try:
      postTime = ' '.join(postBody.find('p').contents[-1].strip().split(" ")[1:])
    except:
      pass
    return postTime
  
  def __getPostBodyTag(self, postProfile):
    # Walk the siblings of the profile tag until the post-body <div> is found;
    # hitting a <span> first means the expected <div> is missing for this post.
    postBody = postProfile.nextSibling
    while getattr(postBody, 'name', None) != 'div':
      if getattr(postBody, 'name', None) == 'span':
        break
      postBody = postBody.nextSibling
    return postBody
  
  def getPosts(self, page, tId):
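    # Extract (user, postId, threadId, body, forum, time) tuples from one saved thread page.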
    #print 'In getPosts'
    soup = BS(open(page).read())
    ##print soup
    postProfiles = soup.findAll('div', {'class':"profile"})
    #print "NUM:",len(postProfiles)
    posts = []
    threadId = tId
    forum = self.__getForum(soup)
    for postProfile in postProfiles:
      #print 'Inside post profiles'
      postId = self.__getPostId(postProfile)
      user = self.__getUser(postProfile)
      postBodyTag = self.__getPostBodyTag(postProfile)
      ##print postBodyTag
      if postBodyTag != None:
        time = self.__getPostTime(postBodyTag)
        postBody = self.__getPostBody(postBodyTag)
        postBody = re.sub("\[youtube\].*?\[/youtube\]", "", postBody)
        if postBody.find("quote") > postBody.find("/quote"):
            postBody = postBody[postBody.find("/quote") + 6:]
        #inReply = -1
        
        postBody = ' '.join(self.tok.tokenize(postBody))
        postBody = postBody.replace("\\", "")
        ##print postId, postBody, threadId, forum, time, user,inReply 
        ##sys.exit()
        posts.append((user, postId, threadId, postBody, forum, time))
    return posts

  def getPostsInThread(self, baseDir, fId, tId):
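    # Walk every locally saved page of a thread (50 posts per page) and collect its posts.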
    posts = []
    firstPage = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=0'
    ##os.system('wget -P '+baseDir+' "'+firstPage+'"')
    page = baseDir + firstPage.split('/board/')[1]
    ###print page
    posts.extend(self.getPosts(page, tId))
    ##sys.exit()
    maxPages = self.__getMaxPage(page)
    for pageIndex in range(1, maxPages):
      offset = pageIndex * 50
      url = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=' + str(offset)
      ##os.system('wget -P '+baseDir+' "'+url+'"')
      ##continue
      page = baseDir + url.split('/board/')[1]
      pagePosts = self.getPosts(page, tId)
      posts.extend(pagePosts)
    return posts
    
  def createThreadsTable(self, baseDir, threads, outFile):
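    # Dump the posts of every (forumId, threadId) pair into a CSV file.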
    outFile = open(outFile, 'w')
    writer = csv.writer(outFile, quotechar='"', escapechar="\\")
    ##cursor = self.__conn.cursor()
    for thread in threads:
      fId, tId = thread
      try:
        posts = self.getPostsInThread(baseDir, fId, tId)
      except:
        continue  # skip threads whose pages cannot be parsed instead of reusing stale posts
      #print posts
      #sys.exit()
      ##continue
      try:
        for post in posts:
          writer.writerow(post)   
          ##cursor.execute("""insert into allThreads values(%s,%s,%s,%s,%s,%s,%s)""",post)
      except:
        pass
    ##self.__conn.commit()
    outFile.close()
class TimeDividedData:
  def __init__(self, dataFile, usersData):
    sys.stderr.write('In Constructor\n')
    self.__data = []
    self.__userJoins = dd(lambda:-1)
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSeparatedIndices = dd(lambda:dd(list))
    self.__communutyWiseVocab = dd(lambda:dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__users = set()
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__months = {'January':1, 'February':2, 'March':3,'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}
  
  def __loadUsersJoins(self, usersData):
    dataFile = open(usersData)
    for line in dataFile:
      line = line.strip().split('\t')
      self.__userJoins[line[0]] = line[1] ## Correct the indices
    sys.stderr.write("Loaded "+str(len(self.__userJoins))+" users' joins\n")

  def _tokenize(self, text):
    text =  text.strip()
    text = re.sub('[\s\n]+',' ', text)
    return self._tok.tokenize(text)
  
  def freqVector(self, tokens):
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector
  
  def __updateVocab(self, record):
    comm = record[3]
    if comm.find('Talk')<0:
      return 0
    text = record[1]
    if text.find("http")>=0 or text.find("<blockquote>")>=0:
      return 0
    tokenDict = self.freqVector(self._tokenize(text))
    for word, freq in tokenDict.iteritems():
      self.__vocab[word] += freq
      self.__communutyWiseVocab[comm][word] += freq
      self.__vocabDocCount[word] += 1 
    return 1
    ##print self.__vocab
  
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      try:
        self.__data.append(tuple(record))
        succ = self.__updateVocab(record)
        if succ:
          tempDD[record[3]].append(index)
          tempDD['AllTalk'].append(index)
          self.__users.add(record[5])
        index+=1
      except:
        pass
    for key, value in tempDD.iteritems():
      if key.find("Talk")>=0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read "+str(index)+" records\n")
    sys.stderr.write("Word types "+str(len(self.__vocab))+"\n")
    sys.stderr.write("Users: "+str(len(self.__users))+"\n")
  
  def preprocessVocab(self, stopWords):
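    # Keep words with corpus frequency >= 5 and document count >= 50 that are not stopwords; prune the rest.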
    stopWords = set(w.strip() for w in open(stopWords))
    self.__backGround = {}
    totalVocab = self.__vocab.keys()
    for word in totalVocab:
      freq = self.__vocab[word]
      if freq >=5 and self.__vocabDocCount[word]>=50 and word not in stopWords:
        self.__backGround[word] = freq
      else:
        del self.__vocab[word]
    for comm in self.__communutyWiseVocab.iterkeys():
      commVocab = self.__communutyWiseVocab[comm].keys()
      for word in commVocab:
        if word in self.__vocab:
          continue
        del self.__communutyWiseVocab[comm][word]
    sys.stderr.write("Filtered Word types "+str(len(self.__backGround))+"\n")
  
  def __timeDiff(self, record):
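    # Months between a post's timestamp and the posting user's join date.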
    postTime = str(record[4])
    user = str(record[5])
    userJoin = self.__userJoins[user]
    return self.__diff(postTime, userJoin)
  
  def test(self):
    sampleTime = "November 17th, 2007, 4:21 pm"
    sampleTime2 = "October 11th, 2006, 3:15 am"
    print self.__diff(sampleTime, sampleTime2)
    
  def __diff(self, time1, time2):
    # Absolute difference between two forum timestamps, in whole months.
    year1 = int(time1.split(',')[1].strip())
    year2 = int(time2.split(',')[1].strip())
    month1 = self.__months[time1.split(' ')[0]]
    month2 = self.__months[time2.split(' ')[0]]
    if year1 == year2:
      return abs(month2 - month1)
    if year1 > year2:
      year1, year2 = year2, year1
      month1, month2 = month2, month1
    return (year2 - year1 - 1)*12 + 12 - month1 + month2
  
  def numUsers(self, comm, time):
    usersSet = set()
    for index in self.__commWiseTimeSeparatedIndices[comm][time]:
      user =  self.__data[index][5]
      usersSet.add(user)
    return len(usersSet)
    
  
  def divideBasedOnTimes(self):
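    # Bucket each community's post indices by months since the author joined.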
    for comm in self.__commWiseIndices.iterkeys():
      for index in self.__commWiseIndices[comm]:
        timeDiff = self.__timeDiff(self.__data[index])
        self.__commWiseTimeSeparatedIndices[comm][timeDiff].append(index)
    return
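    ## The diagnostics below are unreachable because of the early return above; kept for reference.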
    for comm in self.__commWiseTimeSeparatedIndices.iterkeys():
      for time in self.__commWiseTimeSeparatedIndices[comm].iterkeys():
        if time < 25:
          print comm, time, len(self.__commWiseTimeSeparatedIndices[comm][time]), self.numUsers(comm, time)
  
  def __wordDist(self, data):
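    # Add-one smoothed word distribution over the filtered vocabulary for a set of records.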
    totalWords = 0
    dist = dd(lambda:1)
    for record in data:
      #print record
      ##record = self.__data[record] ## Change this based on analysis.. Bad code!!
      text = record[1]
      #print text
      tokenDict = self.freqVector(self._tokenize(text))
      for word, freq in tokenDict.iteritems():
        if word in self.__vocab:
          dist[word] += freq
          totalWords += freq
    for word in self.__vocab:
      dist[word] += 0
    totalWords += len(self.__vocab)
    for word in self.__vocab:
      dist[word] /= float(totalWords)
      ##dist[word] = round(-1*self.myLog(dist[word]),2)
    #print dist
    return dist

  def splitUserWise(self, data):
    userWise = dd(list)
    for record in data:
      userWise[record[5]].append(record)
    return userWise

  def KLDAnalysis(self, comm):
    # KL divergence of each month's word distribution against the first month's.
    print comm
    month1Data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][1]]
    m1Dist = self.__wordDist(month1Data)
    for time in range(2,25):
      data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][time]]
      ## The per-user split (self.splitUserWise(data)) in the original draft was computed
      ## but never used, so the comparison here stays at the month level.
      monthDist = self.__wordDist(data)
      kld = self.KLD(m1Dist, monthDist)
      sys.stdout.write(str(time)+'\t'+str(kld)+'\n')

  def myLog(self, x):
    #try:
      return math.log(x)
    #except ValueError:
    #  return -100000
    #except ZeroDivisionError:
    #  return 100000

  def KLD(self, P, Q):
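    # Kullback-Leibler divergence D(P || Q), summed over the words of P.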
    kld = 0
    for word in P.iterkeys():
      p = P[word]
      pbyq = P[word]/Q[word]
      kld += p*self.myLog(pbyq)
    return kld

  def KLDivergenceAnalysis(self):
    ##for comm in self.__commWiseTimeSeparatedIndices.iterkeys():
    self.KLDAnalysis('AllTalk')

  '''def regress(self):
    for comm in self.__commWiseTimeSeparatedIndices.iterkeys():
      instances = dd(list)
      #instances = []
      users = set()
      for time in range(1,25):
        for index in self.__commWiseTimeSeparatedIndices[comm][time]:
          record = self.__data[index]
          user = str(record[5])
          users.add(user)
      #print len(users)
      users = set(random.sample(list(users),min(len(users),1500)))
      for time in range(1,25):
        for index in self.__commWiseTimeSeparatedIndices[comm][time]:
          record = self.__data[index]
          user = str(record[5])
          if user not in users:
            continue
          instances[user+'_'+str(time)].append(index)
          #instances.append((index,time))
      #instances = random.sample(instances, 1000)
      regInstances = self.createRegInstances(instances)
      print comm, len(regInstances)
      model = creg.LinearRegression()
      model.fit(creg.RealvaluedDataset(regInstances), l1=0.1)
      outFile = open("weights_"+comm.strip().replace(' ',''),"w")
      weights = sorted([(W,w) for W,w in model.weights],cmp=myCMP)
      for weight in weights:
        outFile.write(weight[0]+'\t'+str(weight[1])+'\n')
      outFile.close()
      del regInstances
      del instances'''

  def createRegInstances(self, instances):
    regInstances = []
    for userTime in instances.iterkeys():
      dataIndices = instances[userTime]
      data = [self.__data[index] for index in dataIndices]
      time = int(userTime.split('_')[1])
      wordDist = self.__wordDist(data)  # __wordDist expects records, not indices
      #print wordDist
      regInstances.append((wordDist,-1*self.myLog(time)))
      #print len(regInstances)
    return regInstances
class DataProcessor:
  def __init__(self, dataFile):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__communutyWiseVocab = dd(lambda:dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__read(dataFile)

  def _tokenize(self, text):
    text =  text.strip()
    text = re.sub('[\s\n]+',' ', text)
    return self._tok.tokenize(text)
  
  def freqVector(self, tokens):
    tempFreqVector = dd(int)
    for token in tokens:
      tempFreqVector[token] += 1
    return tempFreqVector
  
  def __updateVocab(self, record):
    comm = record[3]
    if comm.find('Talk')<0:
      return
    text = record[1]
    tokenDict = self.freqVector(self._tokenize(text))
    for word, freq in tokenDict.iteritems():
      self.__vocab[word] += freq
      self.__communutyWiseVocab[comm][word] += freq
      self.__vocabDocCount[word] += 1 
    ##print self.__vocab
  
  def __read(self, dataFile):
    dataFile = open(dataFile)
    dataFile.readline()
    csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
    index = 0
    tempDD = dd(list)
    for record in csvReader:
      try:
        self.__data.append(tuple(record))
        self.__updateVocab(record)
        tempDD[record[3]].append(index)
        index+=1
      except:
        pass
    for key, value in tempDD.iteritems():
      if key.find("Talk")>=0:
        self.__commWiseIndices[key] = value
    sys.stderr.write("Read "+str(index)+" records\n")
    sys.stderr.write("Word types "+str(len(self.__vocab))+"\n")
    
  def preprocessVocab(self):
    self.__backGround = {}
    totalVocab = self.__vocab.keys()
    for word in totalVocab:
      freq = self.__vocab[word]
      if freq >=5 and self.__vocabDocCount[word]>=5:
        self.__backGround[word] = freq
      else:
        del self.__vocab[word]
    for comm in self.__communutyWiseVocab.iterkeys():
      commVocab = self.__communutyWiseVocab[comm].keys()
      for word in commVocab:
        if word in self.__vocab:
          continue
        del self.__communutyWiseVocab[comm][word]
    sys.stderr.write("Filtered Word types "+str(len(self.__backGround))+"\n")
class DataHandler:
    def __init__(self, dataFile, usersData):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self.__users = set()
        self.__userWiseIndices = {}
        self._tok = Tokenizer(preserve_case=False)
        self.__userJoins = dd(lambda: -1)
        self.__read(dataFile)
        self.__loadUsersJoins(usersData)
        self.__splitUserWise()

        self.__timeHandler = TimeHandler()

    def __loadUsersJoins(self, usersData):
        dataFile = open(usersData)
        for line in dataFile:
            line = line.strip().split("\t")
            self.__userJoins[line[0]] = line[1]  ## Correct the indices
        sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

    def __validUserId(self, userId):
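        # User ids in this crawl fall in [1, 45037]; anything else is treated as invalid.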
        try:
            userId = int(userId)
            assert userId >= 1 and userId <= 45037
            return True
        except:
            return False

    def __splitUserWise(self):
        tempDD = dd(list)
        for index in range(len(self.__data)):
            user = self.__data[index][5]
            if not self.__validUserId(user):
                continue
            tempDD[user].append(index)
        for user in tempDD.iterkeys():
            self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
        del tempDD

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                succ = self.__updateVocab(record)
                if succ:
                    self.__data.append(tuple(record))
                    tempDD[record[3]].append(index)
                    tempDD["AllTalk"].append(index)
                    self.__users.add(record[5])
                    index += 1  # advance only for kept records so stored indices match self.__data
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
        sys.stderr.write("Users: " + str(len(self.__users)) + "\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub("[\s\n]+", " ", text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
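        # Update global and per-community counts; returns 1 if the record was counted, falsy otherwise.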
        if len(record) != 7:
            return
        comm = record[3]
        if comm.find("Talk") < 0:
            return 0
        text = record[1]
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1
        ##print self.__vocab

    def preprocessVocab(self, stopWords):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def getAllUsers(self):
        return copy.deepcopy(self.__users)

    def getUserDataIndices(self, user):
        # Return a copy of the indices of the given user's posts (empty if the user is unknown).
        return copy.deepcopy(self.__userWiseIndices.get(user, []))

    def divideBasedOnMonths(self, data):
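        # Group post indices by months elapsed since the posting user joined.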
        timeDividedIndices = dd(list)
        for index in data:
            timeDiff = self.__timeDiff(index)
            if timeDiff >= 0:
                timeDividedIndices[timeDiff].append(index)
        return copy.deepcopy(timeDividedIndices)

    def __timeDiff(self, recordIndex):
        try:
            record = self.__data[recordIndex]
            postTime = str(record[4])
            user = str(record[5])
            userJoin = self.__userJoins[user]
            return self.__timeHandler.diffMonths(postTime, userJoin)
        except:
            return -1

    def makeDist(self, data):
        totalWords = 0
        dist = dd(lambda: 1)
        for text in data:  ## I just expect an array of texts, not the entire records
            tokenDict = self.freqVector(self._tokenize(text))
            for word, freq in tokenDict.iteritems():
                if word in self.__vocab:
                    dist[word] += freq
                    totalWords += freq
        for word in self.__vocab:
            dist[word] += 0
        totalWords += len(self.__vocab)
        for word in self.__vocab:
            dist[word] /= float(totalWords)
            ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!!
        return dist