def __init__(self, corpusdirectory):
    """Load a Twitter corpus and compute the relations between its users.

    Every file reported by ``preprocess.find_corpus_files`` for
    ``corpusdirectory`` is read and parsed line by line. A valid line holds
    three tab-separated fields: user name, time and tweet message. Lines
    with fewer fields are skipped.

    Args:
        corpusdirectory: Location passed to ``preprocess.find_corpus_files``.
    """
    # Maps user name -> TwitterUser object.
    self.users = {}

    # Load the twitter corpus.
    for filepath in preprocess.find_corpus_files(corpusdirectory):
        text = preprocess.read_corpus_file(filepath)
        for line in text.split("\n"):
            try:
                # At most two splits yield exactly three fields, so a tab
                # inside the message stays part of the message instead of
                # producing a fourth field and invalidating the line.
                user, timestamp, tweetmessage = line.split("\t", 2)
            except ValueError:
                # Invalid line in the data (too few fields): ignore it.
                continue

            if user not in self.users:
                # New user: create a TwitterUser and register it.
                self.users[user] = TwitterUser(user)

            # Only tweets containing '@' may use the @recipient syntax;
            # all other tweets are of no interest and are ignored.
            if '@' in tweetmessage:
                self.users[user].append(Tweet(tweetmessage, timestamp))

    # Compute relations between users. Iterating over ``self`` relies on
    # iteration support defined elsewhere in the class (presumably it
    # yields the TwitterUser objects -- confirm against ``__iter__``).
    for user in self:
        user.computerelations(self)
def __init__(self, corpusdirectory):
    """Load a Twitter corpus and compute the relations between its users.

    Every file reported by ``preprocess.find_corpus_files`` for
    ``corpusdirectory`` is read and parsed line by line. A valid line holds
    three tab-separated fields: user name, time and tweet message. Lines
    with fewer fields are skipped.

    Args:
        corpusdirectory: Location passed to ``preprocess.find_corpus_files``.
    """
    # Maps user name -> TwitterUser object.
    self.users = {}

    # Load the twitter corpus.
    for filepath in preprocess.find_corpus_files(corpusdirectory):
        text = preprocess.read_corpus_file(filepath)
        for line in text.split("\n"):
            try:
                # At most two splits yield exactly three fields, so a tab
                # inside the message stays part of the message instead of
                # producing a fourth field and invalidating the line.
                user, timestamp, tweetmessage = line.split("\t", 2)
            except ValueError:
                # Invalid line in the data (too few fields): ignore it.
                continue

            if user not in self.users:
                # New user: create a TwitterUser and register it.
                self.users[user] = TwitterUser(user)

            # Only tweets containing '@' may use the @recipient syntax;
            # all other tweets are of no interest and are ignored.
            if '@' in tweetmessage:
                self.users[user].append(Tweet(tweetmessage, timestamp))

    # Compute relations between users. Iterating over ``self`` relies on
    # iteration support defined elsewhere in the class (presumably it
    # yields the TwitterUser objects -- confirm against ``__iter__``).
    for user in self:
        user.computerelations(self)
def extract_features(filename):
    """Read the corpus file at ``filename`` and return its tokenised contents."""
    contents = read_corpus_file(filename)
    return tokenise(contents)