def readToks(phrasemodel=False):
    tweets = []
    for line in open(FILE, 'r', errors='ignore'):
        tweets.append(json.loads(line))

    #tweets_on_topic = defaultdict(list)
    #for topic in TOPICS:
    #    for index, tweet in enumerate(tweets):
    #        for keyword in KEYWORDS[topic]:
    #            if keyword in tweet['text'].lower():
    #                tweets_on_topic[topic].append(index)
    #                break

    tokens_pb = Tokens()
    if not phrasemodel:
        with open(TOKENS, "rb") as f:
            tokens_pb.ParseFromString(f.read())
    else:
        with open(TOKENSPHRASE, "rb") as f:
            tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        if token_pb.count == 1:  # counts are stored in descending order, so stop at the first singleton
            break
        tokens.append(token_pb.token)

    print("Reading counts for", len(tokens), "tokens")
    return tokens, tweets, tweets  # third value duplicates tweets because the topic grouping above is disabled
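# The commented-out block above grouped tweet indices by topic keyword. A minimal standalone
# sketch of that grouping, assuming a keyword dictionary mapping topic name -> list of keyword
# strings (as in this module) and that defaultdict is imported from collections.
# groupTweetsByTopic is a hypothetical helper, not part of the original code.
def groupTweetsByTopic(tweets, topics, keywords):
    tweets_on_topic = defaultdict(list)
    for topic in topics:
        for index, tweet in enumerate(tweets):
            for keyword in keywords[topic]:
                if keyword in tweet['text'].lower():
                    tweets_on_topic[topic].append(index)
                    break  # one matching keyword is enough; move on to the next tweet
    return tweets_on_topic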
def findTokensJson():
    tokens = Counter()

    for line in open(INPUT, 'r'):
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    output.write(tokens_pb.SerializeToString())
    output.close()
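# The Tokens message used above is generated from a .proto definition that is not shown in
# this file. Judging from the attribute access (tokens_pb.tokens.add(), token_pb.token,
# token_pb.count), the schema is assumed to look roughly like:
#
#   message Token  { string token = 1; int64 count = 2; }
#   message Tokens { repeated Token tokens = 1; }
#
# A minimal read-back sketch to inspect what findTokensJson wrote; printTopTokens is a
# hypothetical helper, assuming Tokens is the generated class already imported in this module.
def printTopTokens(path, n=10):
    tokens_pb = Tokens()
    with open(path, "rb") as f:
        tokens_pb.ParseFromString(f.read())
    for token_pb in tokens_pb.tokens[:n]:
        print(token_pb.token, token_pb.count)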
def findTokensAll():
    tokens = Counter()
    twcntr = 0
    supercntr = 0
    trumpcntr = 0

    for line in open(INPUT, 'r'):
        twcntr += 1
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1

    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):  # skip the header line
            continue
        for token in tokenize(line.split("\t")[2]):  # tweet text is in the third column
            supercntr += 1
            tokens[token] += 1

    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):  # the Trump file is utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[1]):  # for the Trump file the tweet text is in the second column
            trumpcntr += 1
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    print("Saving token counts for", len(tokens), "tokens.", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets")
    output.write(tokens_pb.SerializeToString())
    output.close()
def readToks2(dimension, usephrasemodel=False):
    tokens_pb = Tokens()
    if not usephrasemodel:
        with open(TOKENS, "rb") as f:
            tokens_pb.ParseFromString(f.read())
    else:
        with open(TOKENSPHRASE, "rb") as f:
            tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        if token_pb.count == 1:  # counts are stored in descending order, so stop at the first singleton
            break
        tokens.append(token_pb.token)

    print("Reading counts for", len(tokens), "tokens, taking most frequent", dimension)
    return tokens[:dimension]
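# A usage sketch: the list returned by readToks2 defines the feature space, so a tweet can be
# turned into a bag-of-words count vector by mapping each token to its index. buildBowVector
# is a hypothetical helper, not part of the original code; it assumes tokenize is the same
# tokeniser used elsewhere in this module.
def buildBowVector(text, dimension):
    toks = readToks2(dimension)
    index = {tok: i for i, tok in enumerate(toks)}
    vect = [0] * len(toks)
    for token in tokenize(text):
        if token in index:
            vect[index[token]] += 1
    return vect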
def findTokensOfficial():
    tokens = Counter()

    for line in io.open(INPUT, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):  # skip the header line
            continue
        for token in tokenize(line.split("\t")[2]):  # tweet text is in the third column
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    output.write(tokens_pb.SerializeToString())
    output.close()
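# findTokensOfficial and the FILETRAIN/FILEDEV loops above assume a tab-separated layout with
# an 'ID' header row and the tweet text in column 2 (column 1 for the Trump file). This is
# consistent with the SemEval-2016 Task 6 format (ID, Target, Tweet, Stance), although the
# exact column meanings are an assumption here. readTweetColumn is a hypothetical helper that
# makes the column choice explicit.
def readTweetColumn(path, column, encoding='windows-1252'):
    texts = []
    for line in io.open(path, encoding=encoding, mode='r'):
        if line.startswith('ID\t'):  # skip the header line
            continue
        texts.append(line.split("\t")[column])
    return texts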
#TWEETS = './small.tweets'
TWEETS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/all.tweets'
TOKENS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/tokensFinal'

keywords = {'clinton': ['hillary', 'clinton'],
            'obama': ['barack', 'obama'],
            'climate': ['climate'],
            'feminism': ['feminism', 'feminist'],
            'abortion': ['abortion', 'aborting'],
            'atheism': ['atheism', 'atheist']
            }
topics = keywords.keys()

tokens_pb = Tokens()
with open(TOKENS, "rb") as f:
    tokens_pb.ParseFromString(f.read())

tokens = []
for token_pb in tokens_pb.tokens:
    if token_pb.count == 1:
        break
    tokens.append(token_pb.token)

print(len(tokens))
sys.exit()  # note: everything below this call is never executed

tweets_on_topic = defaultdict(list)
for topic in topics:
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    tokencnt = Counter()
    bigram = Phrases.load(phrasemodel)  # load the saved phrase (bigram) model rather than passing the path as training data
    twcntr = 0
    supercntr = 0
    trumpcntr = 0

    for line in open(INPUT, 'r'):
        twcntr += 1
        tokenised = tokenize(json.loads(line)['text'].lower())
        tokens = filterStopwords(tokenised)  # filter stopwords
        for token in bigram[tokens]:  # apply the phrase model: frequently co-occurring pairs become bigrams, the rest stay single tokens
            tokencnt[token] += 1

    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):  # skip the header line
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))  # tweet text is in the third column
        for token in bigram[tokens]:
            supercntr += 1
            tokencnt[token] += 1

    if useDev == True:
        for line in io.open(tokenize_tweets.FILEDEV, encoding='windows-1252', mode='r'):
            if line.startswith('ID\t'):
                continue
            tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))
            for token in bigram[tokens]:
                supercntr += 1
                tokencnt[token] += 1

    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):  # the Trump file is utf-8
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[1].lower()))  # for the Trump file the tweet text is in the second column
        for token in bigram[tokens]:
            trumpcntr += 1
            tokencnt[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    print("Saving token counts for", len(tokencnt), "tokens.", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets")
    output.write(tokens_pb.SerializeToString())
    output.close()
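# findTokensPhrases only loads an existing phrase model. A minimal sketch of how such a model
# could be built and saved with gensim's Phrases, assuming the same tokenised, stopword-filtered
# unlabelled tweets are used as training sentences. trainPhraseModel is a hypothetical helper;
# the min_count and threshold values are illustrative, not taken from the original code.
def trainPhraseModel(outpath="phrase.model"):
    sentences = []
    for line in open(INPUT, 'r'):
        sentences.append(filterStopwords(tokenize(json.loads(line)['text'].lower())))
    bigram = Phrases(sentences, min_count=5, threshold=10.0)  # detect frequently co-occurring token pairs
    bigram.save(outpath)
    return bigram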