def __geo_to_dict(self, filename, stopwords):
    counter = 0
    id_to_geotok = dict()
    tok = Tokenizer(preserve_case=False)
    geo_functions = geo.GeoFunctions()
    with codecs.open(filename, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line, 'utf-8')
                tweet_id = json_data['id']
                tweet = json_data['text']
                coordinates = json_data['geo']['coordinates']
                region = geo_functions.get_region((float(coordinates[0]), float(coordinates[1])))
                # Remove stopwords
                if region != -1:
                    tokenized_tweet = tok.tokenize(tweet)
                    id_to_geotok[tweet_id] = (
                        [token for token in tokenized_tweet if token not in stopwords],
                        region)
                    counter += 1
                    # if counter % 1000 == 0:
                    #     sys.stdout.write('- ')
            except:
                # skip tweets that cannot be parsed or lack geo information
                pass
    return id_to_geotok
def __classify_tweet(self, tweet_text):
    tweet_vector = array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    tok = Tokenizer(preserve_case=False)
    for token in tok.tokenize(tweet_text):
        if token in self.__wv:
            tweet_vector += self.__wv[token]
    if self.__cosine_sim(tweet_vector, self.__average_distribution) > self.__sim_threshold:
        return None
    tweet_vector_normalized = self.__normalize_len(tweet_vector)
    tweet_vector_diff = tweet_vector_normalized - self.__average_distribution
    return tweet_vector_diff
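# The helpers __cosine_sim and __normalize_len are not shown in this snippet. A minimal
# sketch of what they presumably compute, written as standalone numpy functions (the
# names and exact behaviour here are assumptions, not the original implementation):
import numpy as np

def _cosine_sim(a, b):
    # Cosine similarity between two vectors; returns 0.0 if either vector has zero length.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

def _normalize_len(v):
    # Scale a vector to unit length (the zero vector is returned unchanged).
    norm = np.linalg.norm(v)
    return v / norm if norm else v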
def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSplitIndices = {}
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self.__userWiseTimeSplitIndices = {}
    self.__timeWiseUserSplitIndices = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda: -1)
    self.timeHandler = TimeHandler()
    self.sampledUsers = set()
    self.activeForums = {}
    self.activeUsersInForums = dd(set)
    ## Processing/dealing with data
    #self.__read(dataFile)
    self.__justRead(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__userWiseTimeSplit()
    #self.__timeWiseUserSplit()
    #self.__commWiseTimeSplit()
    ## Extra data structures
    self.postingFreq = dd(int)
def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__commWiseSampleWordFreq = dd(lambda: dd(int))
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
def __jsons_to_dict(self, tweet_file, stopwords):
    counter = 0
    id_to_tok = dict()
    tok = Tokenizer(preserve_case=False)
    with codecs.open(tweet_file, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line, 'utf-8')
                tweet = json_data['text']
                tweet_id = json_data['id']
                tokenized_tweet = tok.tokenize(tweet)
                # Remove stopwords
                id_to_tok[tweet_id] = [token for token in tokenized_tweet if token not in stopwords]
                counter += 1
                # if counter % 1000 == 0:
                #     sys.stdout.write('+ ')
            except:
                # skip lines that cannot be parsed
                pass
    return id_to_tok
def read_and_count():
    dictionary = {}
    tweetfolder = '/home/gontrum/april-corpus-raw'
    tok = Tokenizer(preserve_case=False)
    for tweetfile in [folder for folder in os.listdir(tweetfolder) if folder.startswith('tweets')]:
        tweetfile = os.path.join(tweetfolder, tweetfile)
        with open(tweetfile, 'r') as f:
            for line in f:
                try:
                    tw = json.loads(line, 'latin1')['text']
                except:
                    continue  # skip lines that cannot be parsed
                for each in tok.tokenize(tw):
                    dictionary[each] = dictionary.get(each, 0) + 1
    return dictionary
def __init__(self, dataFile):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__read(dataFile)
def GetWordDictionary(filePAth):
    csv.field_size_limit(sys.maxsize)
    ifile = open(filePAth, "rb")
    reader = csv.reader(ifile)
    word_dictionary = {}
    tok = Tokenizer(preserve_case=False)
    for row in reader:
        tokens = []
        try:
            tokens = tok.tokenize(row[3])
        except Exception as e:
            print e
        for token in tokens:
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    return word_dictionary
def __init__(self):
    self.posts = []
    self.userwiseThreads = dd(set)
    self.userwisePosts = dd(set)  # Stores indices
    self.threads = dd(list)
    self.userNames = {}
    self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
    self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
    self.tok = Tokenizer()
    self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
    self.fakeUsers = {}  # Stores the postId of the previous fake annotation we did
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.data_dir[-1] != '/':
        args.data_dir += '/'

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s" % uri)

    files = glob.glob(args.data_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)
    logger.info("done.")
def __init__(self, dataFile, usersData):
    sys.stderr.write('In Constructor\n')
    self.__data = []
    self.__userJoins = dd(lambda: -1)
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__users = set()
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__months = {'January': 1, 'February': 2, 'March': 3, 'April': 4,
                     'May': 5, 'June': 6, 'July': 7, 'August': 8,
                     'September': 9, 'October': 10, 'November': 11, 'December': 12}
def NaiveBesianClassifer(positive_word_frequency, negative_words_frequency, count_pos_words,
                         count_neg_words, tweet, class_pos_prob, class_neg_prob):
    tok = Tokenizer(preserve_case=False)
    tokens = tok.tokenize(tweet)
    positiveClassProb = 1.00
    negativeClassProb = 1.00
    for token in tokens:
        positiveClassProb = positiveClassProb * LaplaceSmoothingValue(token, positive_word_frequency, count_pos_words)
        negativeClassProb = negativeClassProb * LaplaceSmoothingValue(token, negative_words_frequency, count_neg_words)
    positiveClassProb = positiveClassProb * class_pos_prob
    negativeClassProb = negativeClassProb * class_neg_prob
    if positiveClassProb >= negativeClassProb:
        print positiveClassProb, 1
        return positiveClassProb, 1
    else:
        print negativeClassProb, -1
        return negativeClassProb, -1
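# LaplaceSmoothingValue is referenced above but not defined in this snippet. A minimal
# sketch of the add-one (Laplace) smoothed class-conditional probability it presumably
# returns; the vocabulary-size term and default value are assumptions:
def LaplaceSmoothingValue(token, word_frequency, total_word_count, vocab_size=100000):
    # P(token | class) with add-one smoothing over an assumed vocabulary size.
    return (word_frequency.get(token, 0) + 1.0) / (total_word_count + vocab_size)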
class DataSampler():
    def __init__(self, dataFile):
        self.__data = []
        self.__commWiseIndices = {}
        self.__commWiseSampleIndices = {}
        self.__read(dataFile)
        self._tok = Tokenizer(preserve_case=False)

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            record = filter(lambda x: x.strip(), record)
            try:
                self.__data.append(tuple(record))
                tempDD[record[3]].append(index)
                index += 1
            except:
                pass
                ##print record
                ##sys.exit()
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def communityWiseSample(self):
        numPosts = 1000
        for key in self.__commWiseIndices.iterkeys():
            self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)

    def prepareOutput(self, outputFile):
        outputFile = open(outputFile, 'w')
        csvWriter = csv.writer(outputFile)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                #tokens = self._tokenize(self.__data[index][1])
                csvWriter.writerow(self.__data[index])
        outputFile.close()
def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda: -1)
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__timeHandler = TimeHandler()
class Classifier(nltk.ClassifierI):
    normalizer = N1()
    tokenizer = Tokenizer()

    def __init__(self, modelfile="model.unigram.nb.bool.politics.unbiased"):
        modelpath = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models", modelfile)
        f = open(modelpath, 'rb')
        self.model = pickle.load(f)
        f.close()

    def classify(self, features):
        return self.model.classify(features)

    def prob_classify(self, features):
        return self.model.prob_classify(features)

    def labels(self):
        return self.model.labels()

    def valence(self, features):
        hyp = self.model.classify(features)
        posterior = self.model.prob_classify(features)
        # print "valence=%f"%(posterior.prob("positive")/(1-posterior.prob("positive")) - posterior.prob("negative")/(1-posterior.prob("negative")))
        # print posterior.prob("positive")/(1-posterior.prob("positive"))
        # print posterior.prob("negative")/(1-posterior.prob("negative"))
        if hyp == "negative":
            valence = -posterior.prob("negative")
        elif hyp == "positive":
            valence = posterior.prob("positive")
        else:
            valence = 0
        return valence

    def classifyFromText(self, text):
        def features(text, n=1):
            feats = defaultdict(bool)
            words = ['<s>'] + self.normalizer.normalize(self.tokenizer.tokenize(text)) + ['</s>']
            for i in range(len(words)):
                for j in range(i + 1, i + n + 1):
                    feat = " ".join(words[i:j])
                    feats[feat] = True
            return feats

        features = features(text)
        label = self.model.classify(features)
        valence = self.valence(features)
        post = self.prob_classify(features)
        return label, valence, post
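# A hedged usage sketch for the classifier above, assuming the pickled NLTK model is
# available at the default path ("models/model.unigram.nb.bool.politics.unbiased");
# the example text is invented:
clf = Classifier()
label, valence, posterior = clf.classifyFromText("I really enjoyed this debate :)")
print label, valence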
def __init__(self):
    self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup')
    self.tok = Tokenizer()
for u in chat_utterances:
    if u.get('class') == a:
        print "Example of {}: {}".format(a, u.text)
        break

# This kind of language is pretty different from the edited writing that many NLP tools assume.
# For machine learning it hardly matters what the input to the classifier is, but it does pay to
# be smarter about dividing the text up into its tokens (the words or other meaningful elements).
# So we'll load in [the tokenizer that Chris Potts wrote][1] to analyze twitter feeds. Some of the
# things that it does nicely:
#
# - Handles emoticons, hashtags, twitter user names and other items that mix letters and punctuation
# - Merges dates, URLs, phone numbers and similar items into single tokens
# - Handles ordinary punctuation in an intelligent way as well
#
# [1]:http://sentiment.christopherpotts.net/tokenizing.html

# In[11]:

from happyfuntokenizing import Tokenizer
chat_tokenize = Tokenizer(preserve_case=False).tokenize

# Now we set up the features for this data set. The code is closely analogous to what we did with
# the sentiment classifier earlier. The big differences are the tokenization and the fact that we
# skip stopword elimination: content-free words and weird punctuation bits like `what` and `:)` are
# going to be very important for understanding what dialogue act somebody is performing, so we need
# to keep those features around!

# In[12]:

def chat_feature_generator(category):
    return (word for post in chat_utterances
            if post.get('class') == category
            for word in chat_tokenize(post.text))

best_act_words = compute_best_features(dialogue_acts, chat_feature_generator, 2000)
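# As a quick sanity check on the tokenizer loaded above (the example text is invented, and the
# expected tokens are only indicative of the behaviour described earlier, not a verified output):
print chat_tokenize("@dan_b can't wait for 10/28 :-) #nlproc http://example.com")
# roughly: ['@dan_b', "can't", 'wait', 'for', '10/28', ':-)', '#nlproc', 'http://example.com']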
def __init__(self, tweetsFile):
    self._tweets = []
    self._tok = Tokenizer(preserve_case=False)
    self.loadTweets(tweetsFile)
    sys.stderr.write("preprocessor instance created\n")
    sys.stderr.write("@ Mentions removed\n")
class preprocessor:
    def __init__(self, tweetsFile):
        self._tweets = []
        self._tok = Tokenizer(preserve_case=False)
        self.loadTweets(tweetsFile)
        sys.stderr.write("preprocessor instance created\n")
        sys.stderr.write("@ Mentions removed\n")

    def anonnimize(self, tweet):
        tweet = tweet.split('\t')[-1]  ## Assumption about the format
        tweet = self._tok.tokenize(tweet)  ## Tokenization
        anonTweet = []
        for word in tweet:
            if word[0] != '@':
                anonTweet.append(word)
        return anonTweet

    def loadTweets(self, tweetsFile):
        for tweet in open(tweetsFile):
            tweet = tweet.strip()
            tokenizedTweet = self.anonnimize(tweet)
            if len(tokenizedTweet) == 0 or ' '.join(tokenizedTweet).strip() == '':
                continue
            self._tweets.append([tweet.split('\t')[0], tweet.split('\t')[1], tokenizedTweet])

    def removeRetweets(self):
        newTweets = []
        for tweet in self._tweets:
            flag = 0
            for word in tweet[2]:
                if word[:2] == 'rt':
                    flag = 1
                    break
            if flag == 0:
                newTweets.append(tweet)
        self._tweets = [t for t in newTweets]
        sys.stderr.write("Retweets removed\n")

    def filterAuthors(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        filteredAuthors = []
        for auth, tweets in authorDict.iteritems():
            if tweets >= 50:
                filteredAuthors.append(auth)
        filteredAuthors = set(filteredAuthors)
        filteredTweets = []
        for tweet in self._tweets:
            if tweet[0] in filteredAuthors:
                filteredTweets.append(tweet)
        self._tweets = [t for t in filteredTweets]

    def authorStats(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        numDict = dd(int)
        for auth, numTweets in authorDict.iteritems():
            numDict[numTweets - (numTweets % 10)] += 1
        self.drawGraph(numDict)

    def drawGraph(self, authorDict):
        #try:
        authors = [x for x in authorDict.iterkeys()]
        authors = sorted(authors, cmp=lambda x, y: x - y)
        numTweets = [authorDict[x] for x in authors]
        width = 0.2
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # bar chart of the data
        rects = ax.bar(np.arange(len(authors)), numTweets, width, color='r')
        ax.set_xlabel('User')
        ax.set_ylabel('Number of tweets')
        ax.set_xticks(np.arange(len(authors)) + width / 2)
        ax.set_xticklabels(map(lambda x: str(x), authors))

        def autolabel(rects):
            # attach some text labels
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%d' % int(height),
                        ha='center', va='bottom')

        autolabel(rects)
        plt.savefig(open("/usr0/home/pgadde/Work/Ethnic/AAEness/Data/RealTweets/PreProcessing/aaeAuthorTweets.png", "w"))
        plt.show()

    def printInFile(self, output, label):
        output = open(output, 'w')
        for tweet in self._tweets:
            tweet[2].insert(0, label)
            try:
                output.write(tweet[0] + "\t" + tweet[1] + "\t" + "\t".join(tweet[2]) + "\n")
            except UnicodeEncodeError:
                pass
        output.close()
class DataProcessor:
    def __init__(self, dataFile):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        comm = record[3]
        if comm.find('Talk') < 0:
            return
        text = record[1]
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        ##print self.__vocab

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                self.__updateVocab(record)
                tempDD[record[3]].append(index)
                index += 1
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")

    def preprocessVocab(self):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 5:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")
def init(preserve_case=True):
    global tokenizer
    tokenizer = Tokenizer(preserve_case=preserve_case)
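# A hedged usage sketch for this module-level initializer; the calling context and the
# example text are assumptions:
init(preserve_case=False)
tokens = tokenizer.tokenize("RT @user: loving the new #NLP tools :D")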
class FakeMatcher: def __init__(self): self.posts = [] self.userwiseThreads = dd(lambda:dd(lambda:-1)) self.userwisePosts = dd(set) # Stores indices self.userLastPost = dd(lambda:-1) self.threads = dd(list) self.userStart = dd(lambda:5000) self.userNames = {} self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b") self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b") self.tok = Tokenizer() self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|']) def loadData(self, dataFile): dataFile = open(dataFile) dataFile.readline() reader = csv.reader(dataFile, quotechar='"', escapechar="\\") postIndex = 0 for line in reader: self.posts.append(line) thread = line[3] user = line[1] username = line[0] self.userNames[user] = ' '.join(self.tok.tokenize(username)) self.threads[thread].append(postIndex) if self.userwiseThreads[user][thread] < 0 or self.userwiseThreads[user][thread] > postIndex: self.userwiseThreads[user][thread] = postIndex self.userwisePosts[user].add(postIndex) days = int(line[8]) if self.userLastPost[user] < days: self.userLastPost[user] = days if self.userStart[user] > int(line[8]): self.userStart[user] = int(line[8]) postIndex += 1 self.sortThreads() def sortThreads(self): for thread in self.threads.iterkeys(): self.threads[thread] = sorted(self.threads[thread], cmp=lambda x, y:x - y) def filterUsers(self): allUsers = self.userwisePosts.keys() for user in allUsers: if len(self.userwisePosts[user]) < 20 or len(self.userwisePosts[user]) > 150 or (self.userStart[user] - self.userLastPost[user]) > 120: del self.userwisePosts[user] del self.userwiseThreads[user] del self.userNames[user] def hasFake(self, postId): postText = self.posts[postId][4] #if postText.find(" you a fake ")>=0: # print postText return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None) def printFakePosts(self, logFile): logFile = open(logFile, 'w') index = 0 for post in self.posts: if self.hasFake(index): logFile.write('\t'.join(post[:5]) + '\n') index += 1 def printFakeUsers(self, fakersFile): fakersFile = open(fakersFile, 'w', 1) for user in self.userwiseThreads.iterkeys(): fakePostCount = 0 fakePostIds = set() for thread in self.userwiseThreads[user].iterkeys(): userFirstPost = self.userwiseThreads[user][thread] postIndex = self.threads[thread].index(userFirstPost) + 1 while postIndex < len(self.threads[thread]): postId = self.threads[thread][postIndex] if self.hasFake(postId): #print 'here' fakePostCount += 1 fakePostIds.add(postId) postIndex += 1 if fakePostCount > 5: fakersFile.write(user + '\t' + ' '.join(map(lambda x:str(x), list(fakePostIds))) + '\n') fakersFile.close() def makeRECompatible(self, userName): for char in self.badChars: if char != '\\': userName = userName.replace(char, "\\" + char) return userName def bigRESearch(self, logFile): logFile = open(logFile, 'w', 1) bigUserName = "******" for userName in self.userNames.itervalues(): if userName in ["dat n***a", "bitch"]: continue if userName.strip() != "": if self.considerUserName(userName): userName = self.makeRECompatible(userName) bigUserName += userName + " |" bigUserName = bigUserName[:-1] + ")" bigUserName += "(is )(a )?(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)" print len(bigUserName) print bigUserName P = 
re.compile(bigUserName) #sampleText = "i wanna see wat dat n***a about but i aint gonna fite him im on parole . but dat n***a fake so i dont even matter" #while 1: # sampleText = raw_input("Enter the text: ") # if sampleText == 'exit': # break # print "Full match:",P.search(sampleText).group(), " username match:",P.search(sampleText).group(1) for post in self.posts: text = post[4] if P.search(text) != None: logFile.write('\t'.join(post[:5]) + '\n') logFile.close() def printNonChars(self): nonChars = set() for userName in self.userNames.itervalues(): userName = userName.lower() for char in userName: if ord(char) >= 32 and ord(char) <= 126 and (ord(char) < 97 or ord(char) > 122) and ord(char) not in range(48, 58): nonChars.add(char) print "Users:", len(self.userNames) print nonChars def contentToLookAt(self): uniqThreads = set() uniqPosts = set() for userId in self.userNames.iterkeys(): for thread in self.userwiseThreads[userId]: uniqThreads.add(thread) for post in self.threads[thread]: uniqPosts.add(post) print "Users to look at:", len(self.userNames) print "Unique threads to look at:", len(uniqThreads) print "Unique posts to look at:", len(uniqPosts) def isAllLetters(self, userName): for char in userName: if ord(char) < 97 or ord(char) > 122: return False return True def considerUserName(self, userName): for char in userName: o = ord(char) if o < 32 or o > 126: return False return True def matchUserNamesInPosts(self, logFile): logFile = open(logFile, 'w', 1) for userId in self.userNames.iterkeys(): userName = self.userNames[userId] if not self.isAllLetters(userName): continue for post in self.posts: if post[4].find(userName) >= 0: logFile.write(str(userId) + '\t' + userName + '\t' + post[4] + '\n') logFile.close()
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir += '/'
    if args.database:
        database = args.database
    else:
        database = "snapshot_" + args.snapshot_dir.split('/')[-2]

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s" % uri)

    files = glob.glob(args.snapshot_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)

    logger.info("Loading users from %susers.db" % args.snapshot_dir)
    connection = sqlite3.connect("%susers.db" % args.snapshot_dir)
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    logger.info('fetching users')
    cursor.execute('SELECT id,friends FROM users where friends is not NULL')
    users = cursor.fetchall()
    logger.info('%d users fetched' % len(users))

    bulk_size = 25000
    nUsersInserted = 0
    usersToBeInserted = []
    for user in users:
        id = user['id']
        friends = ujson.loads(user['friends'])
        usersToBeInserted.append({'id': id, 'friends': friends})
        if len(usersToBeInserted) >= bulk_size:
            client['users'].insert(usersToBeInserted)
            usersToBeInserted = []
            nUsersInserted += bulk_size
            logger.info("%d users inserted" % nUsersInserted)
    client['users'].insert(usersToBeInserted)
    logger.info("all users inserted.")
    logger.info("done.")
class EmpiricalAnalyzer: def __init__(self, dataFile): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__communutyWiseVocab = dd(lambda: dd(int)) self._tok = Tokenizer(preserve_case=False) self.__read(dataFile) def _tokenize(self, text): text = text.strip() text = re.sub("[\s\n]+", " ", text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): comm = record[3] if comm.find("Talk") < 0: return text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: self.__data.append(tuple(record)) succ = self.__updateVocab(record) if succ: tempDD[record[3]].append(index) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") def preprocessVocab(self): stopWords = [w.strip() for w in open("stopWords")] self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] totalWords = 0 for word, freq in self.__backGround.iteritems(): totalWords += freq for word, freq in self.__backGround.iteritems(): self.__backGround[word] = self.__backGround[word] / float(totalWords) for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() totalWords = 0 for word in commVocab: if word in self.__vocab: totalWords += self.__communutyWiseVocab[comm][word] continue del self.__communutyWiseVocab[comm][word] for word in self.__communutyWiseVocab[comm].iterkeys(): self.__communutyWiseVocab[comm][word] = self.__communutyWiseVocab[comm][word] / float(totalWords) sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def printTop1000InBack(self, outFile): outFile = open(outFile, "w") wordFreqs = [(word, freq) for word, freq in self.__backGround.iteritems()] wordFreqs = sorted(wordFreqs, cmp=lambda x, y: y[1] - x[1])[:1000] for wordFreq in wordFreqs: outFile.write(wordFreq[0] + "\n") outFile.close() def printTop1000(self, D, outFile): outFile = open(outFile, "w") wordFreqs = [(word, freq) for word, freq in D.iteritems()] wordFreqs = sorted(wordFreqs, cmp=myCMP) for wordFreq in wordFreqs: outFile.write(wordFreq[0] + "\t" + str(wordFreq[1]) + "\n") outFile.close() def __logOdd(self, word, commFreq): try: return math.log(commFreq * 1.0 / self.__backGround[word]) except ZeroDivisionError: return 100000 except ValueError: return -100000 except: print word, commFreq, self.__backGround[word] sys.exit("Error while calculating logodds") def prepareCommunityWiseVocab(self): for word in self.__backGround.iterkeys(): for comm in self.__communutyWiseVocab.iterkeys(): self.__communutyWiseVocab[comm][word] = self.__logOdd(word, self.__communutyWiseVocab[comm][word]) def 
printTopDeviations(self, baseDir): backFile = baseDir + "/" + "background" self.printTop1000(self.__backGround, backFile) for comm in self.__communutyWiseVocab.iterkeys(): self.printTop1000(self.__communutyWiseVocab[comm], baseDir + "/" + comm.strip().replace(" ", ""))
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list)) self.__communutyWiseVocab = dd(lambda: dd(int)) self.__users = set() self.__userWiseIndices = {} self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda: -1) self.__read(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.__timeHandler = TimeHandler() def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split("\t") self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): user = self.__data[index][5] if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD["AllTalk"].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub("[\s\n]+", " ", text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record) != 7: return comm = record[3] if comm.find("Talk") < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = self.__timeDiff(index) if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) return copy.deepcopy(timeDividedIndices) def __timeDiff(self, 
recordIndex): try: record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.__timeHandler.diffMonths(postTime, userJoin) except: return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda: 1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! return dist
def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
class FakeMatcher:
    def __init__(self):
        self.posts = []
        self.userwiseThreads = dd(set)
        self.userwisePosts = dd(set)  # Stores indices
        self.threads = dd(list)
        self.userNames = {}
        self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
        self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
        self.tok = Tokenizer()
        self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
        self.fakeUsers = {}  # Stores the postId of the previous fake annotation we did

    def loadData(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        postIndex = 0
        for line in reader:
            self.posts.append(line)
            thread = line[3]
            user = line[1]
            username = line[0]
            self.userNames[user] = ' '.join(self.tok.tokenize(username))
            self.threads[thread].append(postIndex)
            self.userwiseThreads[user].add(thread)
            self.userwisePosts[user].add(postIndex)
            postIndex += 1

    def loadFakeUsers(self, fakeAnnotation):
        fakeAnnotation = csv.reader(open(fakeAnnotation))
        for line in fakeAnnotation:
            try:
                dummy = int(line[1])
                dummy = int(line[2])
            except:
                continue
            self.fakeUsers[line[1]] = int(line[2])

    def filterUsers(self):
        allUsers = self.userwisePosts.keys()
        for user in allUsers:
            if user not in self.fakeUsers.iterkeys():
                del self.userwisePosts[user]
                del self.userwiseThreads[user]
                del self.userNames[user]

    def hasFake(self, postId):
        postText = self.posts[postId][4]
        return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)

    def printFakeUsers(self, fakersDir):
        for user in self.fakeUsers:
            fakePostIds = []
            for thread in self.userwiseThreads[user]:
                for postIndex in self.threads[thread]:
                    if self.hasFake(postIndex):
                        fakePostIds.append(postIndex)
            fakePostIds = sorted(fakePostIds, cmp=lambda x, y: int(self.posts[x][2]) - int(self.posts[y][2]))
            #print user, self.posts[fakePostIds[0]][2], self.fakeUsers[user]
            if len(fakePostIds) > 0 and self.posts[fakePostIds[0]][2] != str(self.fakeUsers[user]):
                #self.printPosts(user, fakePostIds)
                dummy = 1
            else:
                print user

    def printPosts(self, user, fakePostIds):
        # NOTE: fakersDir is not defined in this method's scope; it would need to be passed in
        # (see the commented-out call in printFakeUsers above).
        fakersFile = open(fakersDir + user, 'w', 1)
        for postIndex in fakePostIds:
            postId = self.posts[postIndex][2]
            postBody = self.posts[postIndex][4]
            fakersFile.write(postId + '\t' + postBody + '\n')
        fakersFile.close()

    def sanityCheck(self):
        print "Posts:", len(self.posts)
        print "Users:", len(self.userwiseThreads)
        print "Fake users:", len(self.fakeUsers)
        for user in self.fakeUsers:
            if user not in self.userwiseThreads.iterkeys():
                print user
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSplitIndices = {} self.__communutyWiseVocab = dd(lambda:dd(int)) self.__users = set() self.__userWiseIndices = {} self.__userWiseTimeSplitIndices = {} self.__timeWiseUserSplitIndices = dd(lambda:dd(int)) self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda:-1) self.timeHandler = TimeHandler() self.sampledUsers = set() self.activeForums = {} self.activeUsersInForums = dd(set) ## Processing/dealing with data #self.__read(dataFile) self.__justRead(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.__userWiseTimeSplit() #self.__timeWiseUserSplit() #self.__commWiseTimeSplit() ## Extra data structures self.postingFreq = dd(int) def printMonthlyDataForUser(self, user, outFile): userTimeIndices = self.__userWiseTimeSplitIndices[user] for month in userTimeIndices.iterkeys(): f = csv.writer(open(outFile+"."+str(month),"w")) for index in userTimeIndices[month]: f.writerow(self.__data[index]) def tokenizeRecord(self, record): record = list(copy.deepcopy(record)) #print record try: text = record[1] tokenizedText = ' '.join(self._tokenize(text)) record[1] = tokenizedText #print tokenizedText return record except: return -1 def getTokenizedCSV(self): tokenizedRecords = [] for index in range(len(self.__data)): newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: tokenizedRecords.append(newRecord) return tokenizedRecords def getBasicUserMonthRecord(self, user, month): record = [] record.append(user) record.append(month) record.append(self.activeForums[user]) record.append([]) return record def getTokenizedUserMonthCSV(self): tokenizedRecords = dd(lambda:dd(list)) for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: tokenizedRecords[user][month].append(newRecord[1]) ## Only postBody being given! return tokenizedRecords def getTokenizedUserMonthForumCSV(self): tokenizedRecords = dd(lambda:dd(lambda:dd(list))) for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: forum = newRecord[3] tokenizedRecords[user][month][forum].append(newRecord[1]) ## Only postBody being given! 
return tokenizedRecords def getPost2Month(self): post2Month = {} for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: postId = self.__data[index][0] post2Month[postId] = month return copy.deepcopy(post2Month) def getDoc2Post(self): doc2Post = {} for index in range(len(self.__data)): doc2Post[index+1] = self.__data[index][0] return copy.deepcopy(doc2Post) def getPost2User(self): post2User = {} for user in self.__userWiseIndices.iterkeys(): for index in self.__userWiseIndices[user]: postId = self.__data[index][0] post2User[postId] = user return copy.deepcopy(post2User) def getPostingFreq(self): self.postingFreq = dd(int) for user in self.__userWiseIndices.iterkeys(): self.postingFreq[len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10] += 1 return copy.deepcopy(self.postingFreq) def getCumulativePostingFreq(self): sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n") self.postingFreq = dd(int) for user in self.__userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10 for num in range(0,userPosts+1,10): self.postingFreq[num] += 1 return copy.deepcopy(self.postingFreq) def getCutoffPostingFreq(self): totalPosts = 0 cdfFreqPosting = dd(int) for user in self.__userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10 totalPosts += userPosts for num in range(0,userPosts+1,10): cdfFreqPosting[num] += userPosts for num in cdfFreqPosting.iterkeys(): cdfFreqPosting[num] = round(cdfFreqPosting[num]*100.0/float(totalPosts),2) sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n") sys.stderr.write("Total Posts:"+str(totalPosts)+"\n") return copy.deepcopy(cdfFreqPosting) def getMonthwisePostingFrequency(self): timeWisePostedUsers = dd(int) for time in self.__timeWiseUserSplitIndices.iterkeys(): timeWisePostedUsers[time] = len(self.__timeWiseUserSplitIndices[time]) return copy.deepcopy(timeWisePostedUsers) def getMonthwiseBinnedPostingFrequency(self): timeWisePostedUsers = dd(int) for time in self.__timeWiseUserSplitIndices.iterkeys(): userWiseIndices = self.__timeWiseUserSplitIndices[time] postingFreq = dd(int) for user in userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user]) for num in range(0,userPosts+1): postingFreq[num] += 1 timeWisePostedUsers[time] = copy.deepcopy(postingFreq) return copy.deepcopy(timeWisePostedUsers) def getBasicTable(self): table = [] for user in self.__userWiseTimeSplitIndices.iterkeys(): userSubtable = [] for month in self.__userWiseTimeSplitIndices[user].iterkeys(): try: activeForum = self.activeForums[user] if activeForum == 'NULL': continue if int(month) >100: continue content = (user, month, len(self.__userWiseTimeSplitIndices[user][month]), self.activeForums[user]) userSubtable.append(content) except: pass if len(userSubtable) >= 3: table.extend(userSubtable) return table def totalPostsByUsers(self): total = 0 for user in self.__userWiseIndices.iterkeys(): total += len(self.__userWiseIndices[user]) return total def getTopPosterCoverage(self): totalPosts = self.totalPostsByUsers() postsTillTopN = 0 def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def 
loadActiveForums(self, activeForums): for line in csv.reader(open(activeForums)): try: self.activeForums[line[0]] = line[1] self.activeUsersInForums[line[1]].add(line[0]) except: pass def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): try: user = self.__data[index][5] except: continue if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __userWiseTimeSplit(self): for user in self.__userWiseIndices.iterkeys(): self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user]) def __timeWiseUserSplit(self): for user in self.__userWiseIndices.iterkeys(): timeDividedUserData = self.divideBasedOnMonths(self.__userWiseIndices[user]) for time in timeDividedUserData.iterkeys(): self.__timeWiseUserSplitIndices[time][user] = timeDividedUserData[time] return copy.deepcopy(self.__timeWiseUserSplitIndices) def __commWiseTimeSplit(self): for comm in self.__commWiseIndices.iterkeys(): self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm]) def __justRead(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") for record in csvReader: #self.__data.append(tuple(record[1:])) self.__data.append(tuple(record)) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+', ' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record)!=7: return comm = record[3] if comm.find('Talk') < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def userStats(self, outFile): outFile = open(outFile,'w') for user in 
self.__userWiseIndices.iterkeys(): userDataIndices = self.__userWiseIndices[user] timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices) outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n') outFile.close() def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = -1 try: timeDiff = self.__timeDiff(index) except: continue if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) #else: # print timeDiff return copy.deepcopy(timeDividedIndices) def __timeDiff(self, recordIndex): #try: #print recordIndex record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.timeHandler.diffMonths(postTime, userJoin) #except: # return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda:1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! #assert self.isValid(dist) return dist def isValid(self, dist): sumProb = 0 for x in dist.iterkeys(): sumProb += dist[x] print sumProb return True def sampleUsers(self): US = userSampling(self.__userWiseTimeSplitIndices) self.sampledUsers = US.finalizeUsers() self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices) return copy.deepcopy(self.sampledUsers) def getUserMonths(self, user): months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys()) for i in range(1,4): try: months.remove(i) except: pass for i in range(25,31): try: months.remove(i) except: pass return months def getUserDataForDivergence(self, user, month): return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]] def getUserInitialData(self, user): data = [] for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getUserMaturedData(self, user): data = [] for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getActiveForum(self, userNum): return self.activeForums[userNum] def getForumInitialData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getForumMaturedData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data
class ThreadCreator: def __init__(self): self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup') self.tok = Tokenizer() def __getMaxPage(self, page): soup = BS(open(page).read()) try: pagesTag = int(soup.findAll('a', onclick="jumpto(); return false;")[0].findChildren('strong')[-1].contents[0]) return pagesTag except: return -1 def __getPostId(self, postProfile): postId = -1 try: postId = int(postProfile.find('dl', {'class':'postprofile'})['id'].split('profile')[1]) except: postId = -1 pass return postId def getSmileyText(self, smileyTag): title = smileyTag["title"] title = title.lower() title.replace(" ", "_") title = re.sub("[^a-z_]", "", title) return "___" + title + "___" def constructString(self, contentTag): content = "" for cont in contentTag.contents: if type(cont) == bs4.element.NavigableString: content += " " + cont #print cont elif type(cont) == bs4.element.Tag and cont.name == 'span': content += " " + self.constructString(cont) elif type(cont) == bs4.element.Tag and cont.name == 'img' and cont["src"].find("./images/smilies/") == 0: content += " " + self.getSmileyText(cont) return content def __getPostBody(self, postBodyTag): postBody = "" try: contentDiv = postBodyTag.find('div', {'class':'content'}) postBody = self.constructString(contentDiv) ##for cont in contentDiv: ## #print cont, type(cont) ## if type(cont) == bs4.element.NavigableString: ## postBody += cont except: pass if postBody == "": return "NULL" return postBody.decode('utf8') def __getUser(self, postProfile): user = -1 try: user = int(postProfile.find('a')['href'].split('u=')[1].split('&')[0]) except: user = -1 pass return user def __getForum(self, soup): forum = "NULL" try: forum = soup.find('li', {'class':'nav-forum active'}).find('a').find('span').contents[0] except: pass return forum def __getPostTime(self, postBody): postTime = "NULL" try: postTime = ' '.join(postBody.find('p').contents[-1].strip().split(" ")[1:]) except: pass return postTime def __getPostBodyTag(self, postProfile): postBody = postProfile.nextSibling flag = 0 while getattr(postBody, 'name', None) != 'div': if getattr(postBody, 'name', None) == 'span': flag = 1 break postBody = postBody.nextSibling if flag: None return postBody def getPosts(self, page, tId): #print 'In getPosts' soup = BS(open(page).read()) ##print soup postProfiles = soup.findAll('div', {'class':"profile"}) #print "NUM:",len(postProfiles) posts = [] threadId = tId forum = self.__getForum(soup) for postProfile in postProfiles: #print 'Inside post profiles' postId = self.__getPostId(postProfile) user = self.__getUser(postProfile) postBodyTag = self.__getPostBodyTag(postProfile) ##print postBodyTag if postBodyTag != None: time = self.__getPostTime(postBodyTag) postBody = self.__getPostBody(postBodyTag) postBody = re.sub("\[youtube\].*?\[/youtube\]", "", postBody) if postBody.find("quote") > postBody.find("/quote"): postBody = postBody[postBody.find("/quote") + 6:] #inReply = -1 postBody = ' '.join(self.tok.tokenize(postBody)) postBody = postBody.replace("\\", "") ##print postId, postBody, threadId, forum, time, user,inReply ##sys.exit() posts.append((user, postId, threadId, postBody, forum, time)) return posts def getPostsInThread(self, baseDir, fId, tId): posts = [] firstPage = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=0' ##os.system('wget -P '+baseDir+' "'+firstPage+'"') page = baseDir + firstPage.split('/board/')[1] ###print page posts.extend(self.getPosts(page, tId)) ##sys.exit() maxPages = self.__getMaxPage(page) for pageIndex in 
range(1, maxPages): offset = pageIndex * 50 url = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=' + str(offset) ##os.system('wget -P '+baseDir+' "'+url+'"') ##continue page = baseDir + url.split('/board/')[1] pagePosts = self.getPosts(page, tId) posts.extend(pagePosts) return posts def createThreadsTable(self, baseDir, threads, outFile): outFile = open(outFile, 'w') writer = csv.writer(outFile, quotechar='"', escapechar="\\") ##cursor = self.__conn.cursor() for thread in threads: fId, tId = thread try: posts = self.getPostsInThread(baseDir, fId, tId) except: pass #print posts #sys.exit() ##continue try: for post in posts: writer.writerow(post) ##cursor.execute("""insert into allThreads values(%s,%s,%s,%s,%s,%s,%s)""",post) except: pass ##self.__conn.commit() outFile.close()
import glob
import ujson
from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

path_to_data = '../data/snapshots/2014-10-20/'
files = glob.glob(path_to_data + '2014-1*.data')
tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger',
                    path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')

for fileName in files:
    print fileName
    file = open(fileName)
    tweets = [ujson.loads(l) for l in file]
    tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i in range(len(tweets)):
        tweets[i]['tagged_tweet'] = tagged_tweets[i]
    output_file_name = fileName[:-5] + '-tagged.data'
    file = open(output_file_name, 'w')
    for tweet in tweets:
        file.write("%s\n" % ujson.dumps(tweet))
def __init__(self, tweet):
    self.tweet = tweet
    self.lookup = LookupService(model='bing-body/apr10/5')
    self.toz = Tokenizer()
    self.tokens = self.tokenize(tweet)
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSplitIndices = {} self.__communutyWiseVocab = dd(lambda:dd(int)) self.__users = set() self.__userWiseIndices = {} self.__userWiseTimeSplitIndices = {} self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda:-1) self.__read(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.timeHandler = TimeHandler() self.__userWiseTimeSplit() #self.__commWiseTimeSplit() self.sampledUsers = set() self.activeForums = {} self.activeUsersInForums = dd(set) def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def loadActiveForums(self, activeForums): for line in open(activeForums): line = line.strip().split("\t") try: self.activeForums[line[0]] = line[1] self.activeUsersInForums[line[1]].add(line[0]) except: pass def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): user = self.__data[index][5] if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __userWiseTimeSplit(self): for user in self.__userWiseIndices.iterkeys(): self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user]) def __commWiseTimeSplit(self): for comm in self.__commWiseIndices.iterkeys(): self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm]) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+', ' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record)!=7: return comm = record[3] if comm.find('Talk') < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in 
self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def userStats(self, outFile): outFile = open(outFile,'w') for user in self.__userWiseIndices.iterkeys(): userDataIndices = self.__userWiseIndices[user] timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices) outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n') outFile.close() def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = self.__timeDiff(index) if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) #else: # print timeDiff return copy.deepcopy(timeDividedIndices) def __timeDiff(self, recordIndex): #try: #print recordIndex record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.timeHandler.diffMonths(postTime, userJoin) #except: # return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda:1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! 
#assert self.isValid(dist) return dist def isValid(self, dist): sumProb = 0 for x in dist.iterkeys(): sumProb += dist[x] print sumProb return True def sampleUsers(self): US = userSampling(self.__userWiseTimeSplitIndices) self.sampledUsers = US.finalizeUsers() self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices) return copy.deepcopy(self.sampledUsers) def getUserMonths(self, user): months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys()) for i in range(1,4): try: months.remove(i) except: pass for i in range(25,31): try: months.remove(i) except: pass return months def getUserDataForDivergence(self, user, month): return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]] def getUserInitialData(self, user): data = [] for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getUserMaturedData(self, user): data = [] for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getActiveForum(self, userNum): return self.activeForums[userNum] def getForumInitialData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getForumMaturedData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data
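## Standalone sketch of the add-one smoothed unigram distribution that DataHandler.makeDist
## builds above: every vocabulary word starts with a pseudo-count of 1, observed in-vocabulary
## frequencies are added on top, and the result is normalised by (token count + |V|),
## i.e. P(w) = (c(w) + 1) / (N + |V|). Function and variable names here are illustrative only.
def smoothed_unigram_dist(token_freqs, vocab):
    dist = {}
    total = 0
    for word in vocab:
        dist[word] = 1          # add-one pseudo-count for every vocabulary word
        total += 1
    for word, freq in token_freqs.items():
        if word in dist:        # out-of-vocabulary tokens are ignored, as in makeDist
            dist[word] += freq
            total += freq
    for word in dist:
        dist[word] /= float(total)   # normalise to a probability distribution
    return dist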
class DataSampler(): def __init__(self, dataFile): self.__data = [] self.__commWiseIndices = {} self.__commWiseSampleIndices = {} self.__commWiseSampleWordFreq = dd(lambda:dd(int)) self.__read(dataFile) self._tok = Tokenizer(preserve_case=False) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: #print record try: self.__data.append(tuple(record)) tempDD[record[3]].append(index) index+=1 except: print record sys.exit() for key, value in tempDD.iteritems(): if key.find("Talk")>=0: self.__commWiseIndices[key] = value sys.stderr.write("Read "+str(index)+" records\n") def sanityCheck(self): self.__printDictSizes(self.__commWiseIndices) self.__printDictSizes(self.__commWiseSampleIndices) #print filter(lambda x:x[0]=="74526", self.__data) def __printDictSizes(self, D): for key in D.iterkeys(): sys.stdout.write(key+"\t"+str(len(D[key]))+"\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+',' ', text) return self._tok.tokenize(text) def communityWiseSample(self): numPosts = 30000 for key in self.__commWiseIndices.iterkeys(): self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts) sys.stderr.write("Sampled "+str(numPosts*5)+"\n") def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __filterWords(self, backGroundVector): lexicon = set() for word, freq in backGroundVector.iteritems(): if freq > 9: lexicon.add(word) return lexicon def preparePosts(self, outputFile): outputFile = open(outputFile,'w') backGroundVector = dd(int) for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for token, freq in freqVector.iteritems(): backGroundVector[token] += freq print "Background words:",len(backGroundVector) filteredLexicon = self.__filterWords(backGroundVector) print "Filtered Words:",len(filteredLexicon) ##sys.exit() for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) words = [x+"$:$:"+str(y) for x,y in freqVector.iteritems() if x in filteredLexicon] if len(words) > 0: outputFile.write(key+'\t'+' '.join(words)+'\n') outputFile.write('background'+'\t'+' '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n') outputFile.close() def analyzeLexicon(self, lexicon, background): words = [(w,f) for w,f in background.iteritems() if w in lexicon] words = sorted(words,cmp=lambda x,y:y[1]-x[1]) index = 0 while 1: print words[index] dummy = raw_input() index += 1 def preparePostsSingleDoc(self, outputFile): outputFile = open(outputFile,'w') backGroundVector = dd(int) for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for token, freq in freqVector.iteritems(): backGroundVector[token] += freq print "Background words:",len(backGroundVector) filteredLexicon = self.__filterWords(backGroundVector) print "Filtered Words:",len(filteredLexicon) ##self.analyzeLexicon(filteredLexicon, backGroundVector) ##sys.exit() for key in self.__commWiseSampleIndices.iterkeys(): globalFreqVector = dd(int) for index in 
self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for word, freq in freqVector.iteritems(): globalFreqVector[word] += freq words = [x+"$:$:"+str(y) for x,y in globalFreqVector.iteritems() if x in filteredLexicon] if len(words) > 0: outputFile.write(key+'\t'+' '.join(words)+'\n') outputFile.write('background'+'\t'+' '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n') outputFile.close()
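## Hedged sketch of a reader for the files written by preparePosts / preparePostsSingleDoc
## above: one community per line, a tab after the community name, then space-separated
## tokens encoded as word$:$:count. The function name is illustrative, not part of the code above.
def read_community_counts(path):
    comm_counts = {}
    with open(path) as f:
        for line in f:
            comm, _, rest = line.rstrip('\n').partition('\t')
            counts = {}
            for entry in rest.split(' '):
                word, sep, freq = entry.rpartition('$:$:')
                if sep:                      # skip anything not in word$:$:count form
                    counts[word] = int(freq)
            comm_counts[comm] = counts
    return comm_counts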
class TimeDividedData: def __init__(self, dataFile, usersData): sys.stderr.write('In Constructor\n') self.__data = [] self.__userJoins = dd(lambda:-1) self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSeparatedIndices = dd(lambda:dd(list)) self.__communutyWiseVocab = dd(lambda:dd(int)) self._tok = Tokenizer(preserve_case=False) self.__users = set() self.__read(dataFile) self.__loadUsersJoins(usersData) self.__months = {'January':1, 'February':2, 'March':3,'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12} def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded "+str(len(self.__userJoins))+" users' joins\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+',' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): comm = record[3] if comm.find('Talk')<0: return 0 text = record[1] if text.find("http")>=0 or text.find("<blockquote>")>=0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: self.__data.append(tuple(record)) succ = self.__updateVocab(record) if succ: tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index+=1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk")>=0: self.__commWiseIndices[key] = value sys.stderr.write("Read "+str(index)+" records\n") sys.stderr.write("Word types "+str(len(self.__vocab))+"\n") sys.stderr.write("Users: "+str(len(self.__users))+"\n") def preprocessVocab(self, stopWords): stopWords = [w.strip() for w in open(stopWords)] self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >=5 and self.__vocabDocCount[word]>=50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types "+str(len(self.__backGround))+"\n") def __timeDiff(self, record): postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.__diff(postTime, userJoin) def test(self): sampleTime = "November 17th, 2007, 4:21 pm" sampleTime2 = "October 11th, 2006, 3:15 am" print self.__diff(sampleTime, sampleTime2) def __diff(self, time1, time2): #print time1, time2 year1 = int(time1.split(',')[1].strip()) year2 = int(time2.split(',')[1].strip()) diff = 0 if year1 > year2: temp = time1 time1 = time2 time2 = temp diff = (year1 - year2 - 1)*12 elif year1 < year2: diff = (year2 - year1 - 1)*12 month1 = time1.split(' ')[0] month2 = time2.split(' ')[0] diff += 12 - self.__months[month1] + self.__months[month2] #if diff < 0: # print year1, year2, time1, time2 return diff def 
numUsers(self, comm, time): usersSet = set() for index in self.__commWiseTimeSeparatedIndices[comm][time]: user = self.__data[index][5] usersSet.add(user) return len(usersSet) def divideBasedOnTimes(self): for comm in self.__commWiseIndices.iterkeys(): for index in self.__commWiseIndices[comm]: timeDiff = self.__timeDiff(self.__data[index]) self.__commWiseTimeSeparatedIndices[comm][timeDiff].append(index) return for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): for time in self.__commWiseTimeSeparatedIndices[comm].iterkeys(): if time < 25: print comm, time, len(self.__commWiseTimeSeparatedIndices[comm][time]), self.numUsers(comm, time) def __wordDist(self, data): totalWords = 0 dist = dd(lambda:1) for record in data: #print record ##record = self.__data[record] ## Change this based on analysis.. Bad code!! text = record[1] #print text tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) #print dist return dist def splitUserWise(self, data): userWise = dd(list) for record in data: userWise[record[5]].append(record) return userWise def KLDAnalysis(self, comm): print comm userWiseKLD = dd(lambda:dd(int)) data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][1]] userWiseData = self.splitUserWise(data) #sampledData = random.sample(data, 1000) #m1Dist = self.__wordDist(sampledData) ##data25 = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][25]] ##m25Dist = self.__wordDist(data25) #dataSecondYear = [] #for for time in range(2,25): data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][time]] userWiseMonthData = self.splitUserWise(data) #sampledData = random.sample(data, 1000) for user in userWiseMonthData.iterkeys(): userDist = self.__wordDist(userWiseMonthData[user]) #monthDist = self.__wordDist(sampledData) kld = self.KLD(m1Dist, monthDist) #kldWith25 = self.KLD(monthDist, m25Dist) #sys.stdout.write(str(time)+'\t'+str(kld)+'\t'+str(kldWith25)+'\n') sys.stdout.write(str(time)+'\t'+str(kld)+'\n') def myLog(self, x): #try: return math.log(x) #except ValueError: # return -100000 #except ZeroDivisionError: # return 100000 def KLD(self, P, Q): kld = 0 for word in P.iterkeys(): p = P[word] pbyq = P[word]/Q[word] kld += p*self.myLog(pbyq) return kld def KLDivergenceAnalysis(self): ##for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): self.KLDAnalysis('AllTalk') '''def regress(self): for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): instances = dd(list) #instances = [] users = set() for time in range(1,25): for index in self.__commWiseTimeSeparatedIndices[comm][time]: record = self.__data[index] user = str(record[5]) users.add(user) #print len(users) users = set(random.sample(list(users),min(len(users),1500))) for time in range(1,25): for index in self.__commWiseTimeSeparatedIndices[comm][time]: record = self.__data[index] user = str(record[5]) if user not in users: continue instances[user+'_'+str(time)].append(index) #instances.append((index,time)) #instances = random.sample(instances, 1000) regInstances = self.createRegInstances(instances) print comm, len(regInstances) model = creg.LinearRegression() model.fit(creg.RealvaluedDataset(regInstances), l1=0.1) outFile = open("weights_"+comm.strip().replace(' ',''),"w") 
weights = sorted([(W,w) for W,w in model.weights],cmp=myCMP) for weight in weights: outFile.write(weight[0]+'\t'+str(weight[1])+'\n') outFile.close() del regInstances del instances''' def createRegInstances(self, instances): regInstances = [] for userTime in instances.iterkeys(): dataIndices = instances[userTime] data = [self.__data[index] for index in dataIndices] ## __wordDist expects records, not indices time = int(userTime.split('_')[1]) wordDist = self.__wordDist(data) regInstances.append((wordDist,-1*self.myLog(time))) return regInstances
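## Worked toy example of the divergence computed by TimeDividedData.KLD above:
## KLD(P || Q) = sum_w P(w) * log(P(w) / Q(w)).  The two distributions below are
## made-up values purely for illustration; both sum to 1 over the same vocabulary.
import math

def kld(P, Q):
    return sum(P[w] * math.log(P[w] / Q[w]) for w in P)

P = {'forum': 0.5, 'post': 0.3, 'user': 0.2}
Q = {'forum': 0.4, 'post': 0.4, 'user': 0.2}
# kld(P, Q) = 0.5*log(1.25) + 0.3*log(0.75) + 0.2*log(1.0) ~= 0.0253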