Example #1
0
def readToks(phrasemodel=False):
    """Load the raw tweets and the saved token vocabulary.

    Every line of ``FILE`` is parsed as one JSON document. The vocabulary is
    read from the serialized ``Tokens`` message stored in ``TOKENS`` or, when
    a phrase model is requested, in ``TOKENSPHRASE``.

    Args:
        phrasemodel: ``False`` (the default) reads ``TOKENS``; any value that
            does not compare equal to ``False`` reads ``TOKENSPHRASE``.

    Returns:
        A ``(tokens, tweets, tweets)`` tuple. ``tokens`` is the list of token
        strings that precede the first entry whose count is 1, and ``tweets``
        is the list of parsed JSON objects. The tweet list is returned twice
        because the per-topic filtering below is disabled; the three-value
        shape is kept so existing callers can still unpack three values.
    """
    # Use a context manager so the tweet file is closed deterministically;
    # iterating a bare open(...) call left the handle open.
    with open(FILE, 'r', errors='ignore') as tweet_file:
        tweets = [json.loads(line) for line in tweet_file]

    # Disabled per-topic keyword filter, kept for reference:
    #tweets_on_topic = defaultdict(list)
    #for topic in TOPICS:
    #    for index, tweet in enumerate(tweets):
    #        for keyword in KEYWORDS[topic]:
    #            if keyword in tweet['text'].lower():
    #                tweets_on_topic[topic].append(index)
    #                break

    tokens_pb = Tokens()
    # The comparison with False is kept on purpose so that every argument
    # value selects the same file as before.
    token_path = TOKENS if phrasemodel == False else TOKENSPHRASE
    with open(token_path, "rb") as f:
        tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        # Stop at the first singleton.
        # NOTE(review): this presumably relies on the entries being stored in
        # descending count order -- confirm against the code that writes them.
        if token_pb.count == 1:
            break
        tokens.append(token_pb.token)

    print("Reading counts for ", str(len(tokens)), "tokens")
    return tokens, tweets, tweets
def findTokensJson():
    """Count the tokens of the JSON tweets in ``INPUT`` and save the counts.

    Each line of ``INPUT`` is parsed as JSON and its ``'text'`` field is
    passed to ``tokenize``. The resulting counts are stored, most frequent
    first, as a serialized ``Tokens`` message in ``OUTPUT``.
    """
    tokens = Counter()

    # Context manager: the input handle is released as soon as counting ends.
    with open(INPUT, 'r') as tweet_file:
        for line in tweet_file:
            for token in tokenize(json.loads(line)['text']):
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    # The original ended with ``output.close`` (no parentheses), which never
    # called close(); the ``with`` block guarantees the data is flushed and
    # the file is closed.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
Example #3
0
def findTokensAll():
    """Count tokens across the unlabelled, labelled and Trump tweet files.

    Three sources are read:

    * ``INPUT``: one JSON document per line; the ``'text'`` field is used.
    * ``tokenize_tweets.FILETRAIN``: tab-separated, windows-1252 encoded;
      the tweet text is taken from column 2.
    * ``tokenize_tweets.FILETRUMP``: tab-separated, utf-8 encoded; the tweet
      text is taken from column 1.

    Lines starting with ``'ID\\t'`` in the tab-separated files are skipped.
    Tokens seen more than once are written, most frequent first, to
    ``OUTPUT`` as a serialized ``Tokens`` message, and a summary line is
    printed.
    """
    tokens = Counter()

    twcntr = 0     # tweets read from INPUT
    supercntr = 0  # tweets read from FILETRAIN
    trumpcntr = 0  # tweets read from FILETRUMP

    with open(INPUT, 'r') as tweet_file:
        for line in tweet_file:
            twcntr += 1
            for token in tokenize(json.loads(line)['text']):
                tokens[token] += 1

    with io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252',
                 mode='r') as train_file:
        for line in train_file:
            if line.startswith('ID\t'):
                continue
            # The summary reports tweets, so count once per line rather than
            # once per token as the original did.
            supercntr += 1
            for token in tokenize(line.split("\t")[2]):
                tokens[token] += 1

    with io.open(tokenize_tweets.FILETRUMP, encoding='utf-8',
                 mode='r') as trump_file:
        for line in trump_file:
            if line.startswith('ID\t'):
                continue
            trumpcntr += 1
            for token in tokenize(line.split("\t")[1]):
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    # Report how many entries are saved; the original printed
    # ``tokens.__sizeof__()``, which is the Counter's size in bytes.
    print("Saving token counts for ", len(tokens_pb.tokens), ". ", twcntr,
          " unlabelled tweets, ", trumpcntr, " Donald Trump tweets, ",
          supercntr, " labelled tweets")

    # ``output.close`` without parentheses never closed the file; the
    # context manager does.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
Example #4
0
def readToks2(dimension, usephrasemodel=False):
    """Return the first ``dimension`` tokens of the saved vocabulary.

    The vocabulary is a serialized ``Tokens`` message read from ``TOKENS``
    when ``usephrasemodel`` compares equal to ``False`` and from
    ``TOKENSPHRASE`` otherwise. Entries are collected in stored order until
    the first one whose count is 1.

    Args:
        dimension: upper bound of the slice applied to the collected tokens.
        usephrasemodel: selects which token file is read (see above).

    Returns:
        A list holding at most ``dimension`` token strings.
    """
    message = Tokens()
    source_path = TOKENS if usephrasemodel == False else TOKENSPHRASE
    with open(source_path, "rb") as handle:
        message.ParseFromString(handle.read())

    vocabulary = []
    for entry in message.tokens:
        if entry.count == 1:
            # Stop collecting at the first entry that occurs only once.
            break
        vocabulary.append(entry.token)

    print("Reading counts for ", str(len(vocabulary)),
          "tokens, taking most frequent ", dimension)
    return vocabulary[:dimension]
def findTokensOfficial():
    """Count the tokens of the tab-separated tweets in ``INPUT``.

    ``INPUT`` is read as windows-1252 text. Lines starting with ``'ID\\t'``
    are skipped and the tweet text is taken from column 2 (the inline note
    in the original says the Trump file instead uses utf-8 and column 1).
    All counts are written, most frequent first, to ``OUTPUT`` as a
    serialized ``Tokens`` message.
    """
    tokens = Counter()

    # Context manager so the input handle does not stay open.
    with io.open(INPUT, encoding='windows-1252', mode='r') as tweet_file:
        for line in tweet_file:
            if line.startswith('ID\t'):
                continue
            for token in tokenize(line.split("\t")[2]):
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    # The original's trailing ``output.close`` lacked parentheses and so
    # never closed the file; ``with`` closes it reliably.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
#TWEETS = './small.tweets'
# Machine-specific absolute paths. TOKENS is read below as a serialized
# Tokens message; TWEETS is not used before the sys.exit() call further down.
TWEETS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/all.tweets'
TOKENS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/tokensFinal'

# Topic name -> lowercase keywords associated with that topic.
keywords = {'clinton': ['hillary', 'clinton'], 
            'obama' : ['barack', 'obama'],
            'climate': ['climate'],
            'feminism': ['feminism', 'feminist'],
            'abortion': ['abortion', 'aborting'],
            'atheism': ['atheism', 'atheist']
}

# The topic names are the dictionary keys.
topics = keywords.keys()

# Parse the stored token counts.
tokens_pb = Tokens()
with open(TOKENS, "rb") as f:
    tokens_pb.ParseFromString(f.read())

# Collect token strings in stored order until the first entry with count 1.
# NOTE(review): presumably the entries are sorted by descending count, so
# this drops all singletons -- confirm against the code that wrote the file.
tokens = []
for token_pb in tokens_pb.tokens:
    if token_pb.count == 1:
        break
    tokens.append(token_pb.token)

print (len(tokens))

# The script stops here: nothing after this call is executed.
sys.exit()

tweets_on_topic = defaultdict(list)
for topic in topics:
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    """Count phrase-merged tokens across all tweet files and save them.

    Each tweet text is lower-cased, tokenised with ``tokenize``, filtered
    with ``filterStopwords`` and passed through the phrase model, which
    leaves some tokens single and merges frequently co-occurring ones into
    bigrams. The sources are:

    * ``INPUT``: one JSON document per line; the ``'text'`` field is used.
    * ``tokenize_tweets.FILETRAIN`` (windows-1252, text in column 2).
    * ``tokenize_tweets.FILEDEV`` (windows-1252, text in column 2), only
      when ``useDev`` is true.
    * ``tokenize_tweets.FILETRUMP`` (utf-8, text in column 1).

    Lines starting with ``'ID\\t'`` in the tab-separated files are skipped.
    Tokens seen more than once are written, most frequent first, to
    ``OUTPUT`` as a serialized ``Tokens`` message, and a summary line is
    printed.

    Args:
        phrasemodel: value handed to the ``Phrases`` constructor.
        useDev: when true, the development set is counted as well and its
            tweets are added to the "labelled tweets" total.
    """
    tokencnt = Counter()
    # NOTE(review): this passes the path string straight to the Phrases
    # constructor. If "phrase.model" is a previously saved model,
    # Phrases.load(phrasemodel) is probably what is intended -- confirm
    # against the gensim version in use before changing it.
    bigram = Phrases(phrasemodel)

    def count_phrases(text):
        # Normalise one tweet and add its phrase tokens to the counter.
        tokens = filterStopwords(tokenize(text.lower()))
        for token in bigram[tokens]:
            tokencnt[token] += 1

    def count_tsv(path, encoding, column):
        # Count every non-header line of a tab-separated file and return the
        # number of tweets processed.
        tweets = 0
        with io.open(path, encoding=encoding, mode='r') as tsv_file:
            for line in tsv_file:
                if line.startswith('ID\t'):
                    continue
                count_phrases(line.split("\t")[column])
                tweets += 1
        return tweets

    twcntr = 0  # tweets read from INPUT
    with open(INPUT, 'r') as tweet_file:
        for line in tweet_file:
            twcntr += 1
            count_phrases(json.loads(line)['text'])

    # The summary reports tweets, so these totals count lines rather than
    # tokens as the original did.
    supercntr = count_tsv(tokenize_tweets.FILETRAIN, 'windows-1252', 2)

    if useDev:
        # In the original the counting statements were dedented out of the
        # loop, so only the last line of the dev file was ever counted.
        supercntr += count_tsv(tokenize_tweets.FILEDEV, 'windows-1252', 2)

    trumpcntr = count_tsv(tokenize_tweets.FILETRUMP, 'utf-8', 1)

    tokens_pb = Tokens()
    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    # print() call instead of the Python 2 print statement, and the number
    # of saved entries instead of ``tokencnt.__sizeof__()`` (a byte size).
    print("Saving token counts for ", len(tokens_pb.tokens), ". ", twcntr,
          " unlabelled tweets, ", trumpcntr, " Donald Trump tweets, ",
          supercntr, " labelled tweets")

    # ``output.close`` without parentheses never closed the file; the
    # context manager does.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())