def __geo_to_dict(self, filename, stopwords):
    id_to_geotok = dict()
    tok = Tokenizer(preserve_case=False)
    geo_functions = geo.GeoFunctions()
    with codecs.open(filename, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line)
                tweet_id = json_data['id']
                tweet = json_data['text']
                coordinates = json_data['geo']['coordinates']
                region = geo_functions.get_region((float(coordinates[0]), float(coordinates[1])))
                # Remove stopwords
                if region != -1:
                    tokenized_tweet = tok.tokenize(tweet)
                    id_to_geotok[tweet_id] = (
                        [token for token in tokenized_tweet if token not in stopwords],
                        region)
            except (KeyError, TypeError, ValueError):
                # Skip tweets without geo information or with malformed JSON
                continue
    return id_to_geotok
def __classify_tweet(self, tweet_text):
    tweet_vector = array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    tok = Tokenizer(preserve_case=False)
    for token in tok.tokenize(tweet_text):
        if token in self.__wv:
            tweet_vector += self.__wv[token]
    if self.__cosine_sim(tweet_vector, self.__average_distribution) > self.__sim_threshold:
        return None
    tweet_vector_normalized = self.__normalize_len(tweet_vector)
    tweet_vector_diff = tweet_vector_normalized - self.__average_distribution
    return tweet_vector_diff
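# __cosine_sim and __normalize_len are used above but not shown in this file.
# A minimal sketch of what they presumably compute, assuming NumPy arrays
# (names inferred from the call sites, not confirmed against the class):
from numpy import dot
from numpy.linalg import norm

def cosine_sim(v1, v2):
    # Cosine similarity: dot product divided by the product of vector lengths
    denom = norm(v1) * norm(v2)
    return dot(v1, v2) / denom if denom else 0.0

def normalize_len(v):
    # Scale a vector to unit Euclidean length
    n = norm(v)
    return v / n if n else v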
class DataSampler():
    def __init__(self, dataFile):
        self.__data = []
        self.__commWiseIndices = {}
        self.__commWiseSampleIndices = {}
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            record = filter(lambda x: x.strip(), record)
            try:
                self.__data.append(tuple(record))
                tempDD[record[3]].append(index)
                index += 1
            except IndexError:
                continue
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def communityWiseSample(self):
        numPosts = 1000
        for key in self.__commWiseIndices.iterkeys():
            self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)

    def prepareOutput(self, outputFile):
        outputFile = open(outputFile, 'w')
        csvWriter = csv.writer(outputFile)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                #tokens = self._tokenize(self.__data[index][1])
                csvWriter.writerow(self.__data[index])
        outputFile.close()
def __jsons_to_dict(self, tweet_file, stopwords):
    id_to_tok = dict()
    tok = Tokenizer(preserve_case=False)
    with codecs.open(tweet_file, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line)  # parse each line once
                tweet = json_data['text']
                tweet_id = json_data['id']
                tokenized_tweet = tok.tokenize(tweet)
                # Remove stopwords
                id_to_tok[tweet_id] = [token for token in tokenized_tweet if token not in stopwords]
            except (KeyError, ValueError):
                continue
    return id_to_tok
def read_and_count():
    dictionary = {}
    tweetfolder = '/home/gontrum/april-corpus-raw'
    tok = Tokenizer(preserve_case=False)
    for tweetfile in [f for f in os.listdir(tweetfolder) if f.startswith('tweets')]:
        tweetfile = os.path.join(tweetfolder, tweetfile)
        with open(tweetfile, 'r') as f:
            for line in f:
                try:
                    tw = json.loads(line, 'latin1')['text']
                except (KeyError, ValueError):
                    continue  # skip malformed lines instead of re-counting the previous tweet
                for each in tok.tokenize(tw):
                    dictionary[each] = dictionary.get(each, 0) + 1
    return dictionary
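# An equivalent, slightly more idiomatic version of the counting loop above
# using collections.Counter (a sketch; the same folder layout is assumed):
from collections import Counter

def read_and_count_counter(tweetfolder='/home/gontrum/april-corpus-raw'):
    counts = Counter()
    tok = Tokenizer(preserve_case=False)
    for name in os.listdir(tweetfolder):
        if not name.startswith('tweets'):
            continue
        with open(os.path.join(tweetfolder, name), 'r') as f:
            for line in f:
                try:
                    tw = json.loads(line, 'latin1')['text']
                except (KeyError, ValueError):
                    continue
                counts.update(tok.tokenize(tw))
    return dict(counts)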
def GetWordDictionary(filePAth):
    csv.field_size_limit(sys.maxsize)
    ifile = open(filePAth, "rb")
    reader = csv.reader(ifile)
    word_dictionary = {}
    tok = Tokenizer(preserve_case=False)
    for row in reader:
        tokens = []
        try:
            tokens = tok.tokenize(row[3])
        except Exception as e:
            print e
        for token in tokens:
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    ifile.close()
    return word_dictionary
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)
    if args.data_dir[-1] != '/':
        args.data_dir += '/'
    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s" % uri)
    files = glob.glob(args.data_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                                    path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)
    logger.info("done.")
def NaiveBesianClassifer(positive_word_frequency, negative_words_frequency,
                         count_pos_words, count_neg_words, tweet,
                         class_pos_prob, class_neg_prob):
    tok = Tokenizer(preserve_case=False)
    tokens = tok.tokenize(tweet)
    positiveClassProb = 1.00
    negativeClassProb = 1.00
    for token in tokens:
        positiveClassProb *= LaplaceSmoothingValue(token, positive_word_frequency, count_pos_words)
        negativeClassProb *= LaplaceSmoothingValue(token, negative_words_frequency, count_neg_words)
    positiveClassProb *= class_pos_prob
    negativeClassProb *= class_neg_prob
    if positiveClassProb >= negativeClassProb:
        print positiveClassProb, 1
        return positiveClassProb, 1
    else:
        print negativeClassProb, -1
        return negativeClassProb, -1
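# LaplaceSmoothingValue is called above but not defined in this file. A
# minimal sketch of the add-one estimate it presumably computes, assuming
# word_frequency maps token -> count and total_words is the class's total
# token count; the vocabulary size below is a placeholder assumption:
def LaplaceSmoothingValue(token, word_frequency, total_words, vocab_size=10000):
    # P(token | class) = (count(token) + 1) / (total_words + |V|)
    return (word_frequency.get(token, 0) + 1.0) / (total_words + vocab_size)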
class FakeMatcher:
    def __init__(self):
        self.posts = []
        self.userwiseThreads = dd(set)
        self.userwisePosts = dd(set)  # Stores indices
        self.threads = dd(list)
        self.userNames = {}
        self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
        self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
        self.tok = Tokenizer()
        self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
        self.fakeUsers = {}  # Stores the postId of the previous fake annotation we did

    def loadData(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        postIndex = 0
        for line in reader:
            self.posts.append(line)
            thread = line[3]
            user = line[1]
            username = line[0]
            self.userNames[user] = ' '.join(self.tok.tokenize(username))
            self.threads[thread].append(postIndex)
            self.userwiseThreads[user].add(thread)
            self.userwisePosts[user].add(postIndex)
            postIndex += 1

    def loadFakeUsers(self, fakeAnnotation):
        fakeAnnotation = csv.reader(open(fakeAnnotation))
        for line in fakeAnnotation:
            try:
                int(line[1])
                int(line[2])
            except (IndexError, ValueError):
                continue
            self.fakeUsers[line[1]] = int(line[2])

    def filterUsers(self):
        allUsers = self.userwisePosts.keys()
        for user in allUsers:
            if user not in self.fakeUsers:
                del self.userwisePosts[user]
                del self.userwiseThreads[user]
                del self.userNames[user]

    def hasFake(self, postId):
        postText = self.posts[postId][4]
        return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)

    def printFakeUsers(self, fakersDir):
        for user in self.fakeUsers:
            fakePostIds = []
            for thread in self.userwiseThreads[user]:
                for postIndex in self.threads[thread]:
                    if self.hasFake(postIndex):
                        fakePostIds.append(postIndex)
            fakePostIds = sorted(fakePostIds, cmp=lambda x, y: int(self.posts[x][2]) - int(self.posts[y][2]))
            if len(fakePostIds) > 0 and self.posts[fakePostIds[0]][2] != str(self.fakeUsers[user]):
                #self.printPosts(fakersDir, user, fakePostIds)
                pass
            else:
                print user

    def printPosts(self, fakersDir, user, fakePostIds):
        fakersFile = open(fakersDir + user, 'w', 1)
        for postIndex in fakePostIds:
            postId = self.posts[postIndex][2]
            postBody = self.posts[postIndex][4]
            fakersFile.write(postId + '\t' + postBody + '\n')
        fakersFile.close()

    def sanityCheck(self):
        print "Posts:", len(self.posts)
        print "Users:", len(self.userwiseThreads)
        print "Fake users:", len(self.fakeUsers)
        for user in self.fakeUsers:
            if user not in self.userwiseThreads:
                print user
class FakeMatcher:
    def __init__(self):
        self.posts = []
        self.userwiseThreads = dd(lambda: dd(lambda: -1))
        self.userwisePosts = dd(set)  # Stores indices
        self.userLastPost = dd(lambda: -1)
        self.threads = dd(list)
        self.userStart = dd(lambda: 5000)
        self.userNames = {}
        self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
        self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
        self.tok = Tokenizer()
        self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])

    def loadData(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        postIndex = 0
        for line in reader:
            self.posts.append(line)
            thread = line[3]
            user = line[1]
            username = line[0]
            self.userNames[user] = ' '.join(self.tok.tokenize(username))
            self.threads[thread].append(postIndex)
            if self.userwiseThreads[user][thread] < 0 or self.userwiseThreads[user][thread] > postIndex:
                self.userwiseThreads[user][thread] = postIndex
            self.userwisePosts[user].add(postIndex)
            days = int(line[8])
            if self.userLastPost[user] < days:
                self.userLastPost[user] = days
            if self.userStart[user] > days:
                self.userStart[user] = days
            postIndex += 1
        self.sortThreads()

    def sortThreads(self):
        for thread in self.threads.iterkeys():
            self.threads[thread] = sorted(self.threads[thread])

    def filterUsers(self):
        allUsers = self.userwisePosts.keys()
        for user in allUsers:
            if (len(self.userwisePosts[user]) < 20 or len(self.userwisePosts[user]) > 150
                    or (self.userStart[user] - self.userLastPost[user]) > 120):
                del self.userwisePosts[user]
                del self.userwiseThreads[user]
                del self.userNames[user]

    def hasFake(self, postId):
        postText = self.posts[postId][4]
        return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)

    def printFakePosts(self, logFile):
        logFile = open(logFile, 'w')
        index = 0
        for post in self.posts:
            if self.hasFake(index):
                logFile.write('\t'.join(post[:5]) + '\n')
            index += 1

    def printFakeUsers(self, fakersFile):
        fakersFile = open(fakersFile, 'w', 1)
        for user in self.userwiseThreads.iterkeys():
            fakePostCount = 0
            fakePostIds = set()
            for thread in self.userwiseThreads[user].iterkeys():
                userFirstPost = self.userwiseThreads[user][thread]
                postIndex = self.threads[thread].index(userFirstPost) + 1
                while postIndex < len(self.threads[thread]):
                    postId = self.threads[thread][postIndex]
                    if self.hasFake(postId):
                        fakePostCount += 1
                        fakePostIds.add(postId)
                    postIndex += 1
            if fakePostCount > 5:
                fakersFile.write(user + '\t' + ' '.join(map(str, list(fakePostIds))) + '\n')
        fakersFile.close()

    def makeRECompatible(self, userName):
        for char in self.badChars:
            if char != '\\':
                userName = userName.replace(char, "\\" + char)
        return userName

    def bigRESearch(self, logFile):
        logFile = open(logFile, 'w', 1)
        bigUserName = "******"
        for userName in self.userNames.itervalues():
            if userName in ["dat n***a", "bitch"]:
                continue
            if userName.strip() != "":
                if self.considerUserName(userName):
                    userName = self.makeRECompatible(userName)
                    bigUserName += userName + " |"
        bigUserName = bigUserName[:-1] + ")"
        bigUserName += "(is )(a )?(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)"
        print len(bigUserName)
        print bigUserName
        P = re.compile(bigUserName)
        #sampleText = "i wanna see wat dat n***a about but i aint gonna fite him im on parole . but dat n***a fake so i dont even matter"
        #while 1:
        #    sampleText = raw_input("Enter the text: ")
        #    if sampleText == 'exit':
        #        break
        #    print "Full match:", P.search(sampleText).group(), " username match:", P.search(sampleText).group(1)
        for post in self.posts:
            text = post[4]
            if P.search(text) != None:
                logFile.write('\t'.join(post[:5]) + '\n')
        logFile.close()

    def printNonChars(self):
        nonChars = set()
        for userName in self.userNames.itervalues():
            userName = userName.lower()
            for char in userName:
                if ord(char) >= 32 and ord(char) <= 126 and (ord(char) < 97 or ord(char) > 122) and ord(char) not in range(48, 58):
                    nonChars.add(char)
        print "Users:", len(self.userNames)
        print nonChars

    def contentToLookAt(self):
        uniqThreads = set()
        uniqPosts = set()
        for userId in self.userNames.iterkeys():
            for thread in self.userwiseThreads[userId]:
                uniqThreads.add(thread)
                for post in self.threads[thread]:
                    uniqPosts.add(post)
        print "Users to look at:", len(self.userNames)
        print "Unique threads to look at:", len(uniqThreads)
        print "Unique posts to look at:", len(uniqPosts)

    def isAllLetters(self, userName):
        for char in userName:
            if ord(char) < 97 or ord(char) > 122:
                return False
        return True

    def considerUserName(self, userName):
        for char in userName:
            o = ord(char)
            if o < 32 or o > 126:
                return False
        return True

    def matchUserNamesInPosts(self, logFile):
        logFile = open(logFile, 'w', 1)
        for userId in self.userNames.iterkeys():
            userName = self.userNames[userId]
            if not self.isAllLetters(userName):
                continue
            for post in self.posts:
                if post[4].find(userName) >= 0:
                    logFile.write(str(userId) + '\t' + userName + '\t' + post[4] + '\n')
        logFile.close()
class DataHandler:
    def __init__(self, dataFile, usersData):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__commWiseTimeSplitIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self.__users = set()
        self.__userWiseIndices = {}
        self.__userWiseTimeSplitIndices = {}
        self.__timeWiseUserSplitIndices = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__userJoins = dd(lambda: -1)
        self.timeHandler = TimeHandler()
        self.sampledUsers = set()
        self.activeForums = {}
        self.activeUsersInForums = dd(set)
        ## Processing/dealing with data
        #self.__read(dataFile)
        self.__justRead(dataFile)
        self.__loadUsersJoins(usersData)
        self.__splitUserWise()
        self.__userWiseTimeSplit()
        #self.__timeWiseUserSplit()
        #self.__commWiseTimeSplit()
        ## Extra data structures
        self.postingFreq = dd(int)

    def printMonthlyDataForUser(self, user, outFile):
        userTimeIndices = self.__userWiseTimeSplitIndices[user]
        for month in userTimeIndices.iterkeys():
            f = csv.writer(open(outFile + "." + str(month), "w"))
            for index in userTimeIndices[month]:
                f.writerow(self.__data[index])

    def tokenizeRecord(self, record):
        record = list(copy.deepcopy(record))
        try:
            text = record[1]
            record[1] = ' '.join(self._tokenize(text))
            return record
        except (IndexError, UnicodeDecodeError):
            return -1

    def getTokenizedCSV(self):
        tokenizedRecords = []
        for index in range(len(self.__data)):
            newRecord = self.tokenizeRecord(self.__data[index])
            if newRecord != -1:
                tokenizedRecords.append(newRecord)
        return tokenizedRecords

    def getBasicUserMonthRecord(self, user, month):
        record = []
        record.append(user)
        record.append(month)
        record.append(self.activeForums[user])
        record.append([])
        return record

    def getTokenizedUserMonthCSV(self):
        tokenizedRecords = dd(lambda: dd(list))
        for user in self.__userWiseTimeSplitIndices.iterkeys():
            for month in self.__userWiseTimeSplitIndices[user].iterkeys():
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    newRecord = self.tokenizeRecord(self.__data[index])
                    if newRecord != -1:
                        tokenizedRecords[user][month].append(newRecord[1])  ## Only postBody being given!
        return tokenizedRecords

    def getTokenizedUserMonthForumCSV(self):
        tokenizedRecords = dd(lambda: dd(lambda: dd(list)))
        for user in self.__userWiseTimeSplitIndices.iterkeys():
            for month in self.__userWiseTimeSplitIndices[user].iterkeys():
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    newRecord = self.tokenizeRecord(self.__data[index])
                    if newRecord != -1:
                        forum = newRecord[3]
                        tokenizedRecords[user][month][forum].append(newRecord[1])  ## Only postBody being given!
        return tokenizedRecords

    def getPost2Month(self):
        post2Month = {}
        for user in self.__userWiseTimeSplitIndices.iterkeys():
            for month in self.__userWiseTimeSplitIndices[user].iterkeys():
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    postId = self.__data[index][0]
                    post2Month[postId] = month
        return copy.deepcopy(post2Month)

    def getDoc2Post(self):
        doc2Post = {}
        for index in range(len(self.__data)):
            doc2Post[index + 1] = self.__data[index][0]
        return copy.deepcopy(doc2Post)

    def getPost2User(self):
        post2User = {}
        for user in self.__userWiseIndices.iterkeys():
            for index in self.__userWiseIndices[user]:
                postId = self.__data[index][0]
                post2User[postId] = user
        return copy.deepcopy(post2User)

    def getPostingFreq(self):
        self.postingFreq = dd(int)
        for user in self.__userWiseIndices.iterkeys():
            numPosts = len(self.__userWiseIndices[user])
            self.postingFreq[numPosts - numPosts % 10] += 1
        return copy.deepcopy(self.postingFreq)

    def getCumulativePostingFreq(self):
        sys.stderr.write("Total Users:" + str(len(self.__userWiseIndices)) + "\n")
        self.postingFreq = dd(int)
        for user in self.__userWiseIndices.iterkeys():
            userPosts = len(self.__userWiseIndices[user]) - len(self.__userWiseIndices[user]) % 10
            for num in range(0, userPosts + 1, 10):
                self.postingFreq[num] += 1
        return copy.deepcopy(self.postingFreq)

    def getCutoffPostingFreq(self):
        totalPosts = 0
        cdfFreqPosting = dd(int)
        for user in self.__userWiseIndices.iterkeys():
            userPosts = len(self.__userWiseIndices[user]) - len(self.__userWiseIndices[user]) % 10
            totalPosts += userPosts
            for num in range(0, userPosts + 1, 10):
                cdfFreqPosting[num] += userPosts
        for num in cdfFreqPosting.iterkeys():
            cdfFreqPosting[num] = round(cdfFreqPosting[num] * 100.0 / float(totalPosts), 2)
        sys.stderr.write("Total Users:" + str(len(self.__userWiseIndices)) + "\n")
        sys.stderr.write("Total Posts:" + str(totalPosts) + "\n")
        return copy.deepcopy(cdfFreqPosting)

    def getMonthwisePostingFrequency(self):
        timeWisePostedUsers = dd(int)
        for time in self.__timeWiseUserSplitIndices.iterkeys():
            timeWisePostedUsers[time] = len(self.__timeWiseUserSplitIndices[time])
        return copy.deepcopy(timeWisePostedUsers)

    def getMonthwiseBinnedPostingFrequency(self):
        timeWisePostedUsers = dd(int)
        for time in self.__timeWiseUserSplitIndices.iterkeys():
            userWiseIndices = self.__timeWiseUserSplitIndices[time]
            postingFreq = dd(int)
            for user in userWiseIndices.iterkeys():
                userPosts = len(self.__userWiseIndices[user])
                for num in range(0, userPosts + 1):
                    postingFreq[num] += 1
            timeWisePostedUsers[time] = copy.deepcopy(postingFreq)
        return copy.deepcopy(timeWisePostedUsers)

    def getBasicTable(self):
        table = []
        for user in self.__userWiseTimeSplitIndices.iterkeys():
            userSubtable = []
            for month in self.__userWiseTimeSplitIndices[user].iterkeys():
                try:
                    activeForum = self.activeForums[user]
                    if activeForum == 'NULL':
                        continue
                    if int(month) > 100:
                        continue
                    content = (user, month, len(self.__userWiseTimeSplitIndices[user][month]), self.activeForums[user])
                    userSubtable.append(content)
                except (KeyError, ValueError):
                    pass
            if len(userSubtable) >= 3:
                table.extend(userSubtable)
        return table

    def totalPostsByUsers(self):
        total = 0
        for user in self.__userWiseIndices.iterkeys():
            total += len(self.__userWiseIndices[user])
        return total

    def getTopPosterCoverage(self):
        totalPosts = self.totalPostsByUsers()
        postsTillTopN = 0

    def __loadUsersJoins(self, usersData):
        dataFile = open(usersData)
        for line in dataFile:
            line = line.strip().split('\t')
            self.__userJoins[line[0]] = line[1]  ## Correct the indices
        sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

    def loadActiveForums(self, activeForums):
        for line in csv.reader(open(activeForums)):
            try:
                self.activeForums[line[0]] = line[1]
                self.activeUsersInForums[line[1]].add(line[0])
            except IndexError:
                pass

    def __validUserId(self, userId):
        try:
            userId = int(userId)
            assert userId >= 1 and userId <= 45037
            return True
        except (ValueError, AssertionError):
            return False

    def __splitUserWise(self):
        tempDD = dd(list)
        for index in range(len(self.__data)):
            try:
                user = self.__data[index][5]
            except IndexError:
                continue
            if not self.__validUserId(user):
                continue
            tempDD[user].append(index)
        for user in tempDD.iterkeys():
            self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
        del tempDD

    def __userWiseTimeSplit(self):
        for user in self.__userWiseIndices.iterkeys():
            self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user])

    def __timeWiseUserSplit(self):
        for user in self.__userWiseIndices.iterkeys():
            timeDividedUserData = self.divideBasedOnMonths(self.__userWiseIndices[user])
            for time in timeDividedUserData.iterkeys():
                self.__timeWiseUserSplitIndices[time][user] = timeDividedUserData[time]
        return copy.deepcopy(self.__timeWiseUserSplitIndices)

    def __commWiseTimeSplit(self):
        for comm in self.__commWiseIndices.iterkeys():
            self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm])

    def __justRead(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        for record in csvReader:
            #self.__data.append(tuple(record[1:]))
            self.__data.append(tuple(record))

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                succ = self.__updateVocab(record)
                if succ:
                    self.__data.append(tuple(record))
                    tempDD[record[3]].append(index)
                    tempDD['AllTalk'].append(index)
                    self.__users.add(record[5])
                    index += 1
            except IndexError:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
        sys.stderr.write("Users: " + str(len(self.__users)) + "\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        if len(record) != 7:
            return 0
        comm = record[3]
        if comm.find('Talk') < 0:
            return 0
        text = record[1]
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1

    def preprocessVocab(self, stopWords):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def getAllUsers(self):
        return copy.deepcopy(self.__users)

    def userStats(self, outFile):
        outFile = open(outFile, 'w')
        for user in self.__userWiseIndices.iterkeys():
            userDataIndices = self.__userWiseIndices[user]
            timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices)
            outFile.write('\t'.join(map(str, [user, len(timeDividedUserIndices)])) + '\n')
        outFile.close()

    def getUserDataIndices(self, user):
        # NOTE: as written this returns every record index, not just this user's
        userDataIndices = []
        for index in range(len(self.__data)):
            userDataIndices.append(index)
        return copy.deepcopy(userDataIndices)

    def divideBasedOnMonths(self, data):
        timeDividedIndices = dd(list)
        for index in data:
            try:
                timeDiff = self.__timeDiff(index)
            except Exception:
                continue
            if timeDiff >= 0:
                timeDividedIndices[timeDiff].append(index)
        return copy.deepcopy(timeDividedIndices)

    def __timeDiff(self, recordIndex):
        record = self.__data[recordIndex]
        postTime = str(record[4])
        user = str(record[5])
        userJoin = self.__userJoins[user]
        return self.timeHandler.diffMonths(postTime, userJoin)

    def makeDist(self, data):
        ## I just expect an array of texts, not the entire records
        totalWords = 0
        dist = dd(lambda: 1)
        for text in data:
            tokenDict = self.freqVector(self._tokenize(text))
            for word, freq in tokenDict.iteritems():
                if word in self.__vocab:
                    dist[word] += freq
                    totalWords += freq
        for word in self.__vocab:
            dist[word] += 0
        totalWords += len(self.__vocab)
        for word in self.__vocab:
            dist[word] /= float(totalWords)
            ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!!
        #assert self.isValid(dist)
        return dist

    def isValid(self, dist):
        sumProb = 0
        for x in dist.iterkeys():
            sumProb += dist[x]
        print sumProb
        return True

    def sampleUsers(self):
        US = userSampling(self.__userWiseTimeSplitIndices)
        self.sampledUsers = US.finalizeUsers()
        self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices)
        return copy.deepcopy(self.sampledUsers)

    def getUserMonths(self, user):
        months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys())
        for i in range(1, 4):
            try:
                months.remove(i)
            except ValueError:
                pass
        for i in range(25, 31):
            try:
                months.remove(i)
            except ValueError:
                pass
        return months

    def getUserDataForDivergence(self, user, month):
        return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]]

    def getUserInitialData(self, user):
        data = []
        for month in range(1, 4):
            try:
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    data.append(self.__data[index][1])
            except KeyError:
                pass
        return data

    def getUserMaturedData(self, user):
        data = []
        for month in range(25, 31):
            try:
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    data.append(self.__data[index][1])
            except KeyError:
                pass
        return data

    def getActiveForum(self, userNum):
        return self.activeForums[userNum]

    def getForumInitialData(self, comm):
        #assert comm in self.__commWiseIndices
        data = []
        for user in self.activeUsersInForums[comm]:
            for month in range(1, 4):
                try:
                    for index in self.__userWiseTimeSplitIndices[user][month]:
                        data.append(self.__data[index][1])
                except KeyError:
                    pass
        return data

    def getForumMaturedData(self, comm):
        #assert comm in self.__commWiseIndices
        data = []
        for user in self.activeUsersInForums[comm]:
            for month in range(25, 31):
                try:
                    for index in self.__userWiseTimeSplitIndices[user][month]:
                        data.append(self.__data[index][1])
                except KeyError:
                    pass
        return data
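# A hedged usage sketch for DataHandler (file names and the user id are
# placeholders, not paths from the original project):
# dh = DataHandler("posts.csv", "userJoins.tsv")
# dh.loadActiveForums("activeForums.csv")
# early = dh.getUserInitialData("42")   # posts from months 1-3 after joining
# late = dh.getUserMaturedData("42")    # posts from months 25-30
# dist = dh.makeDist(early)             # smoothed unigram distribution over the vocabulary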
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)
    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir += '/'
    if args.database:
        database = args.database
    else:
        database = "snapshot_" + args.snapshot_dir.split('/')[-2]
    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s" % uri)
    files = glob.glob(args.snapshot_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                                    path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)
    logger.info("Loading users from %susers.db" % args.snapshot_dir)
    connection = sqlite3.connect("%susers.db" % args.snapshot_dir)
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    logger.info('fetching users')
    cursor.execute('SELECT id,friends FROM users where friends is not NULL')
    users = cursor.fetchall()
    logger.info('%d users fetched' % len(users))
    bulk_size = 25000
    nUsersInserted = 0
    usersToBeInserted = []
    for user in users:
        user_id = user['id']
        friends = ujson.loads(user['friends'])
        usersToBeInserted.append({'id': user_id, 'friends': friends})
        if len(usersToBeInserted) >= bulk_size:
            client['users'].insert(usersToBeInserted)
            usersToBeInserted = []
            nUsersInserted += bulk_size
            logger.info("%d users inserted" % nUsersInserted)
    if usersToBeInserted:  # flush the final partial batch
        client['users'].insert(usersToBeInserted)
    logger.info("all users inserted.")
    logger.info("done.")
class preprocessor:
    def __init__(self, tweetsFile):
        self._tweets = []
        self._tok = Tokenizer(preserve_case=False)
        self.loadTweets(tweetsFile)
        sys.stderr.write("preprocessor instance created\n")
        sys.stderr.write("@ Mentions removed\n")

    def anonnimize(self, tweet):
        tweet = tweet.split('\t')[-1]  ## Assumption about the format
        tweet = self._tok.tokenize(tweet)  ## Tokenization
        anonTweet = []
        for word in tweet:
            if word[0] != '@':
                anonTweet.append(word)
        return anonTweet

    def loadTweets(self, tweetsFile):
        for tweet in open(tweetsFile):
            tweet = tweet.strip()
            tokenizedTweet = self.anonnimize(tweet)
            if len(tokenizedTweet) == 0 or ' '.join(tokenizedTweet).strip() == '':
                continue
            self._tweets.append([tweet.split('\t')[0], tweet.split('\t')[1], tokenizedTweet])

    def removeRetweets(self):
        newTweets = []
        for tweet in self._tweets:
            flag = 0
            for word in tweet[2]:
                if word[:2] == 'rt':
                    flag = 1
                    break
            if flag == 0:
                newTweets.append(tweet)
        self._tweets = newTweets
        sys.stderr.write("Retweets removed\n")

    def filterAuthors(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        filteredAuthors = set(auth for auth, tweets in authorDict.iteritems() if tweets >= 50)
        self._tweets = [tweet for tweet in self._tweets if tweet[0] in filteredAuthors]

    def authorStats(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        numDict = dd(int)
        for auth, numTweets in authorDict.iteritems():
            numDict[numTweets - (numTweets % 10)] += 1
        self.drawGraph(numDict)

    def drawGraph(self, authorDict):
        authors = sorted(authorDict.iterkeys())
        numTweets = [authorDict[x] for x in authors]
        width = 0.2
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # bar chart of the data
        rects = ax.bar(np.arange(len(authors)), numTweets, width, color='r')
        ax.set_xlabel('User')
        ax.set_ylabel('Number of tweets')
        ax.set_xticks(np.arange(len(authors)) + width / 2)
        ax.set_xticklabels(map(str, authors))

        def autolabel(rects):
            # attach some text labels
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                        '%d' % int(height), ha='center', va='bottom')

        autolabel(rects)
        plt.savefig("/usr0/home/pgadde/Work/Ethnic/AAEness/Data/RealTweets/PreProcessing/aaeAuthorTweets.png")
        plt.show()

    def printInFile(self, output, label):
        output = open(output, 'w')
        for tweet in self._tweets:
            tweet[2].insert(0, label)
            try:
                output.write(tweet[0] + "\t" + tweet[1] + "\t" + "\t".join(tweet[2]) + "\n")
            except UnicodeEncodeError:
                pass
        output.close()
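# A hedged usage sketch (the input path and label are placeholders; the input
# file is assumed to be tab-separated: author, tweet id, tweet text):
# pp = preprocessor("tweets.tsv")
# pp.removeRetweets()   # drop tweets containing a token starting with 'rt'
# pp.filterAuthors()    # keep authors with at least 50 tweets
# pp.printInFile("tweets.labeled.tsv", "aae")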
class DataHandler:
    def __init__(self, dataFile, usersData):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__commWiseTimeSplitIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self.__users = set()
        self.__userWiseIndices = {}
        self.__userWiseTimeSplitIndices = {}
        self._tok = Tokenizer(preserve_case=False)
        self.__userJoins = dd(lambda: -1)
        self.__read(dataFile)
        self.__loadUsersJoins(usersData)
        self.__splitUserWise()
        self.timeHandler = TimeHandler()
        self.__userWiseTimeSplit()
        #self.__commWiseTimeSplit()
        self.sampledUsers = set()
        self.activeForums = {}
        self.activeUsersInForums = dd(set)

    def __loadUsersJoins(self, usersData):
        dataFile = open(usersData)
        for line in dataFile:
            line = line.strip().split('\t')
            self.__userJoins[line[0]] = line[1]  ## Correct the indices
        sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

    def loadActiveForums(self, activeForums):
        for line in open(activeForums):
            line = line.strip().split("\t")
            try:
                self.activeForums[line[0]] = line[1]
                self.activeUsersInForums[line[1]].add(line[0])
            except IndexError:
                pass

    def __validUserId(self, userId):
        try:
            userId = int(userId)
            assert userId >= 1 and userId <= 45037
            return True
        except (ValueError, AssertionError):
            return False

    def __splitUserWise(self):
        tempDD = dd(list)
        for index in range(len(self.__data)):
            user = self.__data[index][5]
            if not self.__validUserId(user):
                continue
            tempDD[user].append(index)
        for user in tempDD.iterkeys():
            self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
        del tempDD

    def __userWiseTimeSplit(self):
        for user in self.__userWiseIndices.iterkeys():
            self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user])

    def __commWiseTimeSplit(self):
        for comm in self.__commWiseIndices.iterkeys():
            self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm])

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                succ = self.__updateVocab(record)
                if succ:
                    self.__data.append(tuple(record))
                    tempDD[record[3]].append(index)
                    tempDD['AllTalk'].append(index)
                    self.__users.add(record[5])
                    index += 1
            except IndexError:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
        sys.stderr.write("Users: " + str(len(self.__users)) + "\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        if len(record) != 7:
            return 0
        comm = record[3]
        if comm.find('Talk') < 0:
            return 0
        text = record[1]
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1

    def preprocessVocab(self, stopWords):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def getAllUsers(self):
        return copy.deepcopy(self.__users)

    def userStats(self, outFile):
        outFile = open(outFile, 'w')
        for user in self.__userWiseIndices.iterkeys():
            userDataIndices = self.__userWiseIndices[user]
            timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices)
            outFile.write('\t'.join(map(str, [user, len(timeDividedUserIndices)])) + '\n')
        outFile.close()

    def getUserDataIndices(self, user):
        # NOTE: as written this returns every record index, not just this user's
        userDataIndices = []
        for index in range(len(self.__data)):
            userDataIndices.append(index)
        return copy.deepcopy(userDataIndices)

    def divideBasedOnMonths(self, data):
        timeDividedIndices = dd(list)
        for index in data:
            timeDiff = self.__timeDiff(index)
            if timeDiff >= 0:
                timeDividedIndices[timeDiff].append(index)
        return copy.deepcopy(timeDividedIndices)

    def __timeDiff(self, recordIndex):
        record = self.__data[recordIndex]
        postTime = str(record[4])
        user = str(record[5])
        userJoin = self.__userJoins[user]
        return self.timeHandler.diffMonths(postTime, userJoin)

    def makeDist(self, data):
        ## I just expect an array of texts, not the entire records
        totalWords = 0
        dist = dd(lambda: 1)
        for text in data:
            tokenDict = self.freqVector(self._tokenize(text))
            for word, freq in tokenDict.iteritems():
                if word in self.__vocab:
                    dist[word] += freq
                    totalWords += freq
        for word in self.__vocab:
            dist[word] += 0
        totalWords += len(self.__vocab)
        for word in self.__vocab:
            dist[word] /= float(totalWords)
            ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!!
        #assert self.isValid(dist)
        return dist

    def isValid(self, dist):
        sumProb = 0
        for x in dist.iterkeys():
            sumProb += dist[x]
        print sumProb
        return True

    def sampleUsers(self):
        US = userSampling(self.__userWiseTimeSplitIndices)
        self.sampledUsers = US.finalizeUsers()
        self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices)
        return copy.deepcopy(self.sampledUsers)

    def getUserMonths(self, user):
        months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys())
        for i in range(1, 4):
            try:
                months.remove(i)
            except ValueError:
                pass
        for i in range(25, 31):
            try:
                months.remove(i)
            except ValueError:
                pass
        return months

    def getUserDataForDivergence(self, user, month):
        return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]]

    def getUserInitialData(self, user):
        data = []
        for month in range(1, 4):
            try:
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    data.append(self.__data[index][1])
            except KeyError:
                pass
        return data

    def getUserMaturedData(self, user):
        data = []
        for month in range(25, 31):
            try:
                for index in self.__userWiseTimeSplitIndices[user][month]:
                    data.append(self.__data[index][1])
            except KeyError:
                pass
        return data

    def getActiveForum(self, userNum):
        return self.activeForums[userNum]

    def getForumInitialData(self, comm):
        #assert comm in self.__commWiseIndices
        data = []
        for user in self.activeUsersInForums[comm]:
            for month in range(1, 4):
                try:
                    for index in self.__userWiseTimeSplitIndices[user][month]:
                        data.append(self.__data[index][1])
                except KeyError:
                    pass
        return data

    def getForumMaturedData(self, comm):
        #assert comm in self.__commWiseIndices
        data = []
        for user in self.activeUsersInForums[comm]:
            for month in range(25, 31):
                try:
                    for index in self.__userWiseTimeSplitIndices[user][month]:
                        data.append(self.__data[index][1])
                except KeyError:
                    pass
        return data
import glob
import ujson
from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

path_to_data = '../data/snapshots/2014-10-20/'
files = glob.glob(path_to_data + '2014-1*.data')
tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger',
                    path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')
for fileName in files:
    print fileName
    file = open(fileName)
    tweets = [ujson.loads(l) for l in file]
    file.close()
    tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i in range(len(tweets)):
        tweets[i]['tagged_tweet'] = tagged_tweets[i]
    output_file_name = fileName[:-5] + '-tagged.data'
    file = open(output_file_name, 'w')
    for tweet in tweets:
        file.write("%s\n" % ujson.dumps(tweet))
    file.close()
class EmpiricalAnalyzer:
    def __init__(self, dataFile):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub("[\s\n]+", " ", text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        comm = record[3]
        if comm.find("Talk") < 0:
            return 0
        text = record[1]
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                succ = self.__updateVocab(record)
                if succ:
                    tempDD[record[3]].append(index)
                index += 1
            except IndexError:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")

    def preprocessVocab(self):
        stopWords = [w.strip() for w in open("stopWords")]
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        totalWords = 0
        for word, freq in self.__backGround.iteritems():
            totalWords += freq
        for word in self.__backGround.iterkeys():
            self.__backGround[word] = self.__backGround[word] / float(totalWords)
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            totalWords = 0
            for word in commVocab:
                if word in self.__vocab:
                    totalWords += self.__communutyWiseVocab[comm][word]
                    continue
                del self.__communutyWiseVocab[comm][word]
            for word in self.__communutyWiseVocab[comm].iterkeys():
                self.__communutyWiseVocab[comm][word] = self.__communutyWiseVocab[comm][word] / float(totalWords)
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def printTop1000InBack(self, outFile):
        outFile = open(outFile, "w")
        wordFreqs = [(word, freq) for word, freq in self.__backGround.iteritems()]
        wordFreqs = sorted(wordFreqs, cmp=lambda x, y: y[1] - x[1])[:1000]
        for wordFreq in wordFreqs:
            outFile.write(wordFreq[0] + "\n")
        outFile.close()

    def printTop1000(self, D, outFile):
        outFile = open(outFile, "w")
        wordFreqs = [(word, freq) for word, freq in D.iteritems()]
        wordFreqs = sorted(wordFreqs, cmp=myCMP)
        for wordFreq in wordFreqs:
            outFile.write(wordFreq[0] + "\t" + str(wordFreq[1]) + "\n")
        outFile.close()

    def __logOdd(self, word, commFreq):
        try:
            return math.log(commFreq * 1.0 / self.__backGround[word])
        except ZeroDivisionError:
            return 100000
        except ValueError:
            return -100000
        except:
            print word, commFreq, self.__backGround[word]
            sys.exit("Error while calculating logodds")

    def prepareCommunityWiseVocab(self):
        for word in self.__backGround.iterkeys():
            for comm in self.__communutyWiseVocab.iterkeys():
                self.__communutyWiseVocab[comm][word] = self.__logOdd(word, self.__communutyWiseVocab[comm][word])

    def printTopDeviations(self, baseDir):
        backFile = baseDir + "/" + "background"
        self.printTop1000(self.__backGround, backFile)
        for comm in self.__communutyWiseVocab.iterkeys():
            self.printTop1000(self.__communutyWiseVocab[comm], baseDir + "/" + comm.strip().replace(" ", ""))
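# myCMP is used by printTop1000 above (and by other scripts in this
# collection) but is not defined in this file. It presumably orders
# (word, score) pairs by descending score; a minimal sketch:
def myCMP(x, y):
    # negative when x should sort before y, i.e. descending on the score
    if x[1] > y[1]:
        return -1
    elif x[1] < y[1]:
        return 1
    return 0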
class DataSampler():
    def __init__(self, dataFile):
        self.__data = []
        self.__commWiseIndices = {}
        self.__commWiseSampleIndices = {}
        self.__commWiseSampleWordFreq = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                tempDD[record[3]].append(index)
                index += 1
            except IndexError:
                print record
                sys.exit()
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")

    def sanityCheck(self):
        self.__printDictSizes(self.__commWiseIndices)
        self.__printDictSizes(self.__commWiseSampleIndices)

    def __printDictSizes(self, D):
        for key in D.iterkeys():
            sys.stdout.write(key + "\t" + str(len(D[key])) + "\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def communityWiseSample(self):
        numPosts = 30000
        for key in self.__commWiseIndices.iterkeys():
            self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)
        sys.stderr.write("Sampled " + str(numPosts * 5) + "\n")

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __filterWords(self, backGroundVector):
        lexicon = set()
        for word, freq in backGroundVector.iteritems():
            if freq > 9:
                lexicon.add(word)
        return lexicon

    def preparePosts(self, outputFile):
        outputFile = open(outputFile, 'w')
        backGroundVector = dd(int)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                tokens = self._tokenize(self.__data[index][1])
                freqVector = self.freqVector(tokens)
                for token, freq in freqVector.iteritems():
                    backGroundVector[token] += freq
        print "Background words:", len(backGroundVector)
        filteredLexicon = self.__filterWords(backGroundVector)
        print "Filtered Words:", len(filteredLexicon)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                tokens = self._tokenize(self.__data[index][1])
                freqVector = self.freqVector(tokens)
                words = [x + "$:$:" + str(y) for x, y in freqVector.iteritems() if x in filteredLexicon]
                if len(words) > 0:
                    outputFile.write(key + '\t' + ' '.join(words) + '\n')
        outputFile.write('background' + '\t' + ' '.join([x + "$:$:" + str(y) for x, y in backGroundVector.iteritems() if x in filteredLexicon]) + '\n')
        outputFile.close()

    def analyzeLexicon(self, lexicon, background):
        words = [(w, f) for w, f in background.iteritems() if w in lexicon]
        words = sorted(words, cmp=lambda x, y: y[1] - x[1])
        index = 0
        while 1:
            print words[index]
            dummy = raw_input()
            index += 1

    def preparePostsSingleDoc(self, outputFile):
        outputFile = open(outputFile, 'w')
        backGroundVector = dd(int)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                tokens = self._tokenize(self.__data[index][1])
                freqVector = self.freqVector(tokens)
                for token, freq in freqVector.iteritems():
                    backGroundVector[token] += freq
        print "Background words:", len(backGroundVector)
        filteredLexicon = self.__filterWords(backGroundVector)
        print "Filtered Words:", len(filteredLexicon)
        ##self.analyzeLexicon(filteredLexicon, backGroundVector)
        for key in self.__commWiseSampleIndices.iterkeys():
            globalFreqVector = dd(int)
            for index in self.__commWiseSampleIndices[key]:
                tokens = self._tokenize(self.__data[index][1])
                freqVector = self.freqVector(tokens)
                for word, freq in freqVector.iteritems():
                    globalFreqVector[word] += freq
            words = [x + "$:$:" + str(y) for x, y in globalFreqVector.iteritems() if x in filteredLexicon]
            if len(words) > 0:
                outputFile.write(key + '\t' + ' '.join(words) + '\n')
        outputFile.write('background' + '\t' + ' '.join([x + "$:$:" + str(y) for x, y in backGroundVector.iteritems() if x in filteredLexicon]) + '\n')
        outputFile.close()
class ThreadCreator:
    def __init__(self):
        self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup')
        self.tok = Tokenizer()

    def __getMaxPage(self, page):
        soup = BS(open(page).read())
        try:
            pagesTag = int(soup.findAll('a', onclick="jumpto(); return false;")[0].findChildren('strong')[-1].contents[0])
            return pagesTag
        except (IndexError, ValueError):
            return -1

    def __getPostId(self, postProfile):
        try:
            return int(postProfile.find('dl', {'class': 'postprofile'})['id'].split('profile')[1])
        except (TypeError, KeyError, IndexError, ValueError):
            return -1

    def getSmileyText(self, smileyTag):
        title = smileyTag["title"]
        title = title.lower()
        title = title.replace(" ", "_")
        title = re.sub("[^a-z_]", "", title)
        return "___" + title + "___"

    def constructString(self, contentTag):
        content = ""
        for cont in contentTag.contents:
            if type(cont) == bs4.element.NavigableString:
                content += " " + cont
            elif type(cont) == bs4.element.Tag and cont.name == 'span':
                content += " " + self.constructString(cont)
            elif type(cont) == bs4.element.Tag and cont.name == 'img' and cont["src"].find("./images/smilies/") == 0:
                content += " " + self.getSmileyText(cont)
        return content

    def __getPostBody(self, postBodyTag):
        postBody = ""
        try:
            contentDiv = postBodyTag.find('div', {'class': 'content'})
            postBody = self.constructString(contentDiv)
        except Exception:
            pass
        if postBody == "":
            return "NULL"
        return postBody.decode('utf8')

    def __getUser(self, postProfile):
        try:
            return int(postProfile.find('a')['href'].split('u=')[1].split('&')[0])
        except Exception:
            return -1

    def __getForum(self, soup):
        forum = "NULL"
        try:
            forum = soup.find('li', {'class': 'nav-forum active'}).find('a').find('span').contents[0]
        except Exception:
            pass
        return forum

    def __getPostTime(self, postBody):
        postTime = "NULL"
        try:
            postTime = ' '.join(postBody.find('p').contents[-1].strip().split(" ")[1:])
        except Exception:
            pass
        return postTime

    def __getPostBodyTag(self, postProfile):
        # Walk forward through the siblings until the post-body div,
        # stopping early if a span is hit first
        postBody = postProfile.nextSibling
        while getattr(postBody, 'name', None) != 'div':
            if getattr(postBody, 'name', None) == 'span':
                break
            postBody = postBody.nextSibling
        return postBody

    def getPosts(self, page, tId):
        soup = BS(open(page).read())
        postProfiles = soup.findAll('div', {'class': "profile"})
        posts = []
        threadId = tId
        forum = self.__getForum(soup)
        for postProfile in postProfiles:
            postId = self.__getPostId(postProfile)
            user = self.__getUser(postProfile)
            postBodyTag = self.__getPostBodyTag(postProfile)
            if postBodyTag != None:
                time = self.__getPostTime(postBodyTag)
                postBody = self.__getPostBody(postBodyTag)
                postBody = re.sub("\[youtube\].*?\[/youtube\]", "", postBody)
                # Drop an unbalanced leading quote block
                if postBody.find("quote") > postBody.find("/quote"):
                    postBody = postBody[postBody.find("/quote") + 6:]
                postBody = ' '.join(self.tok.tokenize(postBody))
                postBody = postBody.replace("\\", "")
                posts.append((user, postId, threadId, postBody, forum, time))
        return posts

    def getPostsInThread(self, baseDir, fId, tId):
        posts = []
        firstPage = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=0'
        ##os.system('wget -P ' + baseDir + ' "' + firstPage + '"')
        page = baseDir + firstPage.split('/board/')[1]
        posts.extend(self.getPosts(page, tId))
        maxPages = self.__getMaxPage(page)
        for pageIndex in range(1, maxPages):
            offset = pageIndex * 50
            url = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=' + str(offset)
            ##os.system('wget -P ' + baseDir + ' "' + url + '"')
            page = baseDir + url.split('/board/')[1]
            pagePosts = self.getPosts(page, tId)
            posts.extend(pagePosts)
        return posts

    def createThreadsTable(self, baseDir, threads, outFile):
        outFile = open(outFile, 'w')
        writer = csv.writer(outFile, quotechar='"', escapechar="\\")
        for thread in threads:
            fId, tId = thread
            try:
                posts = self.getPostsInThread(baseDir, fId, tId)
            except Exception:
                continue  # skip threads whose pages fail to parse
            try:
                for post in posts:
                    writer.writerow(post)
            except Exception:
                pass
        outFile.close()
class TimeDividedData: def __init__(self, dataFile, usersData): sys.stderr.write('In Constructor\n') self.__data = [] self.__userJoins = dd(lambda:-1) self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSeparatedIndices = dd(lambda:dd(list)) self.__communutyWiseVocab = dd(lambda:dd(int)) self._tok = Tokenizer(preserve_case=False) self.__users = set() self.__read(dataFile) self.__loadUsersJoins(usersData) self.__months = {'January':1, 'February':2, 'March':3,'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12} def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded "+str(len(self.__userJoins))+" users' joins\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+',' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): comm = record[3] if comm.find('Talk')<0: return 0 text = record[1] if text.find("http")>=0 or text.find("<blockquote>")>=0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: self.__data.append(tuple(record)) succ = self.__updateVocab(record) if succ: tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index+=1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk")>=0: self.__commWiseIndices[key] = value sys.stderr.write("Read "+str(index)+" records\n") sys.stderr.write("Word types "+str(len(self.__vocab))+"\n") sys.stderr.write("Users: "+str(len(self.__users))+"\n") def preprocessVocab(self, stopWords): stopWords = [w.strip() for w in open(stopWords)] self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >=5 and self.__vocabDocCount[word]>=50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types "+str(len(self.__backGround))+"\n") def __timeDiff(self, record): postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.__diff(postTime, userJoin) def test(self): sampleTime = "November 17th, 2007, 4:21 pm" sampleTime2 = "October 11th, 2006, 3:15 am" print self.__diff(sampleTime, sampleTime2) def __diff(self, time1, time2): #print time1, time2 year1 = int(time1.split(',')[1].strip()) year2 = int(time2.split(',')[1].strip()) diff = 0 if year1 > year2: temp = time1 time1 = time2 time2 = temp diff = (year1 - year2 - 1)*12 elif year1 < year2: diff = (year2 - year1 - 1)*12 month1 = time1.split(' ')[0] month2 = time2.split(' ')[0] diff += 12 - self.__months[month1] + self.__months[month2] #if diff < 0: # print year1, year2, time1, time2 return diff def 
    def numUsers(self, comm, time):
        usersSet = set()
        for index in self.__commWiseTimeSeparatedIndices[comm][time]:
            user = self.__data[index][5]
            usersSet.add(user)
        return len(usersSet)

    def divideBasedOnTimes(self):
        for comm in self.__commWiseIndices.iterkeys():
            for index in self.__commWiseIndices[comm]:
                timeDiff = self.__timeDiff(self.__data[index])
                self.__commWiseTimeSeparatedIndices[comm][timeDiff].append(index)
        return
        # Disabled diagnostics (unreachable because of the return above):
        # per-community post and user counts for the first 24 months.
        #for comm in self.__commWiseTimeSeparatedIndices.iterkeys():
        #    for time in self.__commWiseTimeSeparatedIndices[comm].iterkeys():
        #        if time < 25:
        #            print comm, time, len(self.__commWiseTimeSeparatedIndices[comm][time]), self.numUsers(comm, time)

    def __wordDist(self, data):
        # Add-one smoothed unigram distribution over the filtered vocabulary;
        # expects a list of records with the post text in field 1.
        totalWords = 0
        dist = dd(lambda: 1)
        for record in data:
            text = record[1]
            tokenDict = self.freqVector(self._tokenize(text))
            for word, freq in tokenDict.iteritems():
                if word in self.__vocab:
                    dist[word] += freq
                    totalWords += freq
        # Touch every vocabulary word so it materializes with its smoothing
        # count of 1, then add the vocabulary size to the total to match.
        for word in self.__vocab:
            dist[word] += 0
        totalWords += len(self.__vocab)
        for word in self.__vocab:
            dist[word] /= float(totalWords)
        return dist

    def splitUserWise(self, data):
        userWise = dd(list)
        for record in data:
            userWise[record[5]].append(record)
        return userWise

    def KLDAnalysis(self, comm):
        # For each user active in months 2..24, print the KL divergence of the
        # month-1 community distribution from that user's monthly distribution.
        # (Reconstructed: the original referenced undefined m1Dist/monthDist
        # left over from a commented-out sampling experiment.)
        print comm
        data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][1]]
        m1Dist = self.__wordDist(data)
        for time in range(2, 25):
            data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][time]]
            userWiseMonthData = self.splitUserWise(data)
            for user in userWiseMonthData.iterkeys():
                userDist = self.__wordDist(userWiseMonthData[user])
                kld = self.KLD(m1Dist, userDist)
                sys.stdout.write(str(time) + '\t' + str(kld) + '\n')

    def myLog(self, x):
        return math.log(x)

    def KLD(self, P, Q):
        # KL(P || Q) over the words in P; both distributions are add-one
        # smoothed, so Q[word] is never zero.
        kld = 0
        for word in P.iterkeys():
            p = P[word]
            pbyq = P[word] / Q[word]
            kld += p * self.myLog(pbyq)
        return kld

    def KLDivergenceAnalysis(self):
        self.KLDAnalysis('AllTalk')

    '''def regress(self):
        for comm in self.__commWiseTimeSeparatedIndices.iterkeys():
            instances = dd(list)
            users = set()
            for time in range(1, 25):
                for index in self.__commWiseTimeSeparatedIndices[comm][time]:
                    users.add(str(self.__data[index][5]))
            users = set(random.sample(list(users), min(len(users), 1500)))
            for time in range(1, 25):
                for index in self.__commWiseTimeSeparatedIndices[comm][time]:
                    user = str(self.__data[index][5])
                    if user not in users:
                        continue
                    instances[user + '_' + str(time)].append(index)
            regInstances = self.createRegInstances(instances)
            print comm, len(regInstances)
            model = creg.LinearRegression()
            model.fit(creg.RealvaluedDataset(regInstances), l1=0.1)
            outFile = open("weights_" + comm.strip().replace(' ', ''), "w")
            weights = sorted([(W, w) for W, w in model.weights], cmp=myCMP)
            for weight in weights:
                outFile.write(weight[0] + '\t' + str(weight[1]) + '\n')
            outFile.close()
            del regInstances
            del instances'''

    def createRegInstances(self, instances):
        regInstances = []
        for userTime in instances.iterkeys():
            dataIndices = instances[userTime]
            # __wordDist expects records, not indices, so resolve them first
            # (restores the line the original had commented out).
            data = [self.__data[index] for index in dataIndices]
            time = int(userTime.split('_')[1])
            wordDist = self.__wordDist(data)
            # Regression target: negative log of the month index.
            regInstances.append((wordDist, -1 * self.myLog(time)))
        return regInstances
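# A minimal, self-contained sketch of the add-one smoothing and KL-divergence
# scheme used by TimeDividedData.KLDAnalysis above; the toy vocabulary and
# counts below are illustrative only, not drawn from the real data.
import math

def _smoothed_dist(counts, vocab):
    # Every vocabulary word starts with pseudo-count 1, so the resulting
    # distribution never assigns zero probability (mirroring __wordDist).
    total = float(sum(counts.get(w, 0) for w in vocab) + len(vocab))
    return dict((w, (counts.get(w, 0) + 1) / total) for w in vocab)

def _kld(P, Q):
    # KL(P || Q); safe because both inputs are smoothed.
    return sum(p * math.log(p / Q[w]) for w, p in P.iteritems())

def _demo_kld():
    vocab = ['article', 'revert', 'please', 'thanks']
    month1 = _smoothed_dist({'article': 5, 'please': 3, 'thanks': 2}, vocab)
    later = _smoothed_dist({'article': 4, 'revert': 6}, vocab)
    print _kld(month1, later)  # larger when the two months diverge more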
class DataProcessor:

    def __init__(self, dataFile):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub(r'[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        comm = record[3]
        if comm.find('Talk') < 0:
            return
        text = record[1]
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                self.__updateVocab(record)
                tempDD[record[3]].append(index)
                index += 1
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")

    def preprocessVocab(self):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 5:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")
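# Minimal usage sketch for DataProcessor; 'talk_posts.csv' is a hypothetical
# path standing in for the real CSV export, which must have the post text in
# column 1, the community name in column 3, and a header line to skip.
def _demo_data_processor():
    processor = DataProcessor('talk_posts.csv')
    # Keep only words with corpus frequency >= 5 that occur in >= 5 posts.
    processor.preprocessVocab()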
class DataHandler:

    def __init__(self, dataFile, usersData):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self.__users = set()
        self.__userWiseIndices = {}
        self._tok = Tokenizer(preserve_case=False)
        self.__userJoins = dd(lambda: -1)
        self.__read(dataFile)
        self.__loadUsersJoins(usersData)
        self.__splitUserWise()
        self.__timeHandler = TimeHandler()

    def __loadUsersJoins(self, usersData):
        dataFile = open(usersData)
        for line in dataFile:
            line = line.strip().split("\t")
            self.__userJoins[line[0]] = line[1]  ## Correct the indices
        sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n")

    def __validUserId(self, userId):
        # User ids are expected to be integers in the range 1..45037.
        try:
            userId = int(userId)
            assert userId >= 1 and userId <= 45037
            return True
        except:
            return False

    def __splitUserWise(self):
        tempDD = dd(list)
        for index in range(len(self.__data)):
            user = self.__data[index][5]
            if not self.__validUserId(user):
                continue
            tempDD[user].append(index)
        for user in tempDD.iterkeys():
            self.__userWiseIndices[user] = copy.deepcopy(tempDD[user])
        del tempDD

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()  # skip the header line
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                succ = self.__updateVocab(record)
                if succ:
                    self.__data.append(tuple(record))
                    tempDD[record[3]].append(index)
                    tempDD["AllTalk"].append(index)
                    self.__users.add(record[5])
                    index += 1
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")
        sys.stderr.write("Users: " + str(len(self.__users)) + "\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub(r"[\s\n]+", " ", text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        if len(record) != 7:
            return 0
        comm = record[3]
        if comm.find("Talk") < 0:
            return 0
        text = record[1]
        # Skip posts that contain links or quoted material.
        if text.find("http") >= 0 or text.find("<blockquote>") >= 0:
            return 0
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        return 1

    def preprocessVocab(self, stopWords):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        # Keep a word only if it occurs >= 5 times overall, appears in
        # >= 50 posts, and is not a stop word.
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")

    def getAllUsers(self):
        return copy.deepcopy(self.__users)

    def getUserDataIndices(self, user):
        # Return the indices of this user's posts; the original version
        # looped over every record and ignored the user argument.
        return copy.deepcopy(self.__userWiseIndices.get(user, []))

    def divideBasedOnMonths(self, data):
        timeDividedIndices = dd(list)
        for index in data:
            timeDiff = self.__timeDiff(index)
            if timeDiff >= 0:
                timeDividedIndices[timeDiff].append(index)
        return copy.deepcopy(timeDividedIndices)

    def __timeDiff(self, recordIndex):
        # Months between the post time and the author's join date;
        # -1 signals a record that cannot be aligned.
        try:
            record = self.__data[recordIndex]
            postTime = str(record[4])
            user = str(record[5])
            userJoin = self.__userJoins[user]
            return self.__timeHandler.diffMonths(postTime, userJoin)
        except:
            return -1

    def makeDist(self, data):
        # Expects a list of raw texts, not whole records.
        totalWords = 0
        dist = dd(lambda: 1)
        for text in data:
            tokenDict = self.freqVector(self._tokenize(text))
            for word, freq in tokenDict.iteritems():
                if word in self.__vocab:
                    dist[word] += freq
                    totalWords += freq
        # Touch every vocabulary word so it materializes with its smoothing
        # count of 1, then add the vocabulary size to the total to match.
        for word in self.__vocab:
            dist[word] += 0
        totalWords += len(self.__vocab)
        for word in self.__vocab:
            dist[word] /= float(totalWords)
        ## Log transformation could go here instead:
        ## dist[word] = round(-1 * math.log(dist[word]), 2)
        return dist
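# Minimal end-to-end sketch for DataHandler; 'talk_posts.csv' and
# 'user_joins.tsv' are hypothetical file names, and the stop-word argument is
# an in-memory set, matching the signature of DataHandler.preprocessVocab
# above (unlike TimeDividedData, which takes a stop-word file name).
def _demo_data_handler():
    handler = DataHandler('talk_posts.csv', 'user_joins.tsv')
    handler.preprocessVocab(set(['the', 'a', 'of']))
    for user in handler.getAllUsers():
        # Bucket the user's posts by months elapsed since the user joined.
        byMonth = handler.divideBasedOnMonths(handler.getUserDataIndices(user))
        for month in sorted(byMonth.iterkeys()):
            print user, month, len(byMonth[month])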