コード例 #1
0
def findTokensJson():
    """Count tokens in the JSON-lines file INPUT and save the counts to OUTPUT.

    Each line of INPUT is parsed with ``json.loads`` and its ``'text'`` field
    is passed to ``tokenize``. Every token's frequency is accumulated in a
    ``Counter`` and then copied, most frequent first, into a ``Tokens``
    message whose serialised bytes are written to OUTPUT.

    Raises whatever ``open``, ``json.loads`` or the ``'text'`` lookup raise
    on unreadable files or malformed lines.
    """
    tokens = Counter()

    # Context manager so the input handle is released even if a line fails
    # to parse.
    with open(INPUT, 'r') as source:
        for line in source:
            for token in tokenize(json.loads(line)['text']):
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    # The original ended with a bare ``output.close`` (no call), so the file
    # was never explicitly flushed/closed; ``with`` guarantees it.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
コード例 #2
0
def findTokensAll():
    """Count tokens across three tweet sources and save the counts to OUTPUT.

    Sources read, in order:

    * INPUT -- one JSON object per line; the ``'text'`` field is tokenised.
    * ``tokenize_tweets.FILETRAIN`` -- tab-separated, decoded as
      windows-1252; the third column is tokenised.
    * ``tokenize_tweets.FILETRUMP`` -- tab-separated, decoded as utf-8; the
      second column is tokenised.

    Lines of the tab-separated files that start with ``'ID\\t'`` are skipped.
    Only tokens seen more than once are copied into the ``Tokens`` message
    that is serialised to OUTPUT. A summary line is printed before writing.
    """
    tokens = Counter()

    twcntr = 0      # number of lines read from INPUT
    supercntr = 0   # number of tokens seen in FILETRAIN
    trumpcntr = 0   # number of tokens seen in FILETRUMP

    # All inputs are opened with ``with`` so their handles are closed
    # deterministically instead of being left to garbage collection.
    with open(INPUT, 'r') as unlabelled:
        for line in unlabelled:
            twcntr += 1
            for token in tokenize(json.loads(line)['text']):
                tokens[token] += 1

    with io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252',
                 mode='r') as labelled:
        for line in labelled:
            if line.startswith('ID\t'):
                continue
            # Text is in the third column of this file.
            for token in tokenize(line.split("\t")[2]):
                supercntr += 1
                tokens[token] += 1

    with io.open(tokenize_tweets.FILETRUMP, encoding='utf-8',
                 mode='r') as trump:
        for line in trump:
            if line.startswith('ID\t'):
                continue
            # Text is in the second column of this file.
            for token in tokenize(line.split("\t")[1]):
                trumpcntr += 1
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    # NOTE(review): ``tokens.__sizeof__()`` is the Counter object's size in
    # bytes, not a token count -- ``len(tokens)`` may have been intended.
    # NOTE(review): trumpcntr and supercntr are incremented per token, while
    # the message labels them as tweets -- confirm which is wanted.
    print("Saving token counts for ", tokens.__sizeof__(), ". ", twcntr,
          " unlabelled tweets, ", trumpcntr, " Donald Trump tweets, ",
          supercntr, " labelled tweets")

    # The original ended with a bare ``output.close`` (no call), so the file
    # was never explicitly flushed/closed; ``with`` guarantees it.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
コード例 #3
0
def findTokensOfficial():
    """Count tokens in the tab-separated file INPUT and save them to OUTPUT.

    INPUT is decoded as windows-1252. Lines starting with ``'ID\\t'`` are
    skipped; for every other line the third tab-separated column is passed
    to ``tokenize``. All token counts, most frequent first, are copied into
    a ``Tokens`` message whose serialised bytes are written to OUTPUT.

    Raises ``IndexError`` if a non-header line has fewer than three columns.
    """
    tokens = Counter()

    # Context manager so the input handle is closed deterministically.
    with io.open(INPUT, encoding='windows-1252', mode='r') as source:
        for line in source:
            if line.startswith('ID\t'):
                continue
            # Text is in the third column of this file.
            for token in tokenize(line.split("\t")[2]):
                tokens[token] += 1

    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    # The original ended with a bare ``output.close`` (no call), so the file
    # was never explicitly flushed/closed; ``with`` guarantees it.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())
コード例 #4
0
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    """Count phrase-merged tokens across the tweet sources and save to OUTPUT.

    Every text is lower-cased, tokenised with ``tokenize``, filtered with
    ``filterStopwords`` and passed through the phrase model (``bigram[...]``)
    before counting, so frequent word pairs are counted as one token.

    Sources read, in order:

    * INPUT -- one JSON object per line; the ``'text'`` field is used.
    * ``tokenize_tweets.FILETRAIN`` -- tab-separated, windows-1252, third
      column.
    * ``tokenize_tweets.FILEDEV`` -- same layout as FILETRAIN; read only
      when ``useDev`` is true.
    * ``tokenize_tweets.FILETRUMP`` -- tab-separated, utf-8, second column.

    Lines of the tab-separated files that start with ``'ID\\t'`` are skipped.
    Only tokens seen more than once are copied into the ``Tokens`` message
    that is serialised to OUTPUT. A summary line is printed before writing.

    Args:
        phrasemodel: Value handed to ``Phrases(...)`` to obtain the model.
        useDev: When true, also count tokens from the dev file.
    """
    tokencnt = Counter()
    # NOTE(review): a path string is passed straight to the ``Phrases``
    # constructor. If this is gensim's ``Phrases``, loading a saved model is
    # normally done with ``Phrases.load(path)`` -- confirm against the import.
    bigram = Phrases(phrasemodel)

    twcntr = 0      # number of lines read from INPUT
    supercntr = 0   # number of tokens seen in the labelled files
    trumpcntr = 0   # number of tokens seen in FILETRUMP

    with open(INPUT, 'r') as unlabelled:
        for line in unlabelled:
            twcntr += 1
            tokenised = tokenize(json.loads(line)['text'].lower())
            tokens = filterStopwords(tokenised)
            # The phrase model leaves some tokens single and merges
            # frequently co-occurring ones into bigrams.
            for token in bigram[tokens]:
                tokencnt[token] += 1

    # FILETRAIN and FILEDEV share one format, so they share one loop. In the
    # original the dev-file counting code was dedented out of its ``for``
    # loop, so only the last line read was ever counted; here every line is.
    labelled_paths = [tokenize_tweets.FILETRAIN]
    if useDev:
        labelled_paths.append(tokenize_tweets.FILEDEV)

    for path in labelled_paths:
        with io.open(path, encoding='windows-1252', mode='r') as labelled:
            for line in labelled:
                if line.startswith('ID\t'):
                    continue
                # Text is in the third column of these files.
                tokens = filterStopwords(
                    tokenize(line.split("\t")[2].lower()))
                for token in bigram[tokens]:
                    supercntr += 1
                    tokencnt[token] += 1

    with io.open(tokenize_tweets.FILETRUMP, encoding='utf-8',
                 mode='r') as trump:
        for line in trump:
            if line.startswith('ID\t'):
                continue
            # Text is in the second column of this file.
            tokens = filterStopwords(
                tokenize(line.split("\t")[1].lower()))
            for token in bigram[tokens]:
                trumpcntr += 1
                tokencnt[token] += 1

    tokens_pb = Tokens()
    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    # NOTE(review): ``tokencnt.__sizeof__()`` is the Counter object's size in
    # bytes, not a token count -- ``len(tokencnt)`` may have been intended.
    # NOTE(review): trumpcntr and supercntr are incremented per token, while
    # the message labels them as tweets -- confirm which is wanted.
    # Joined into one string so the single-argument print() emits the same
    # space-separated text as the old print statement on Python 2 and 3.
    summary = ("Saving token counts for ", tokencnt.__sizeof__(), ". ",
               twcntr, " unlabelled tweets, ", trumpcntr,
               " Donald Trump tweets, ", supercntr, " labelled tweets")
    print(" ".join(str(part) for part in summary))

    # The original ended with a bare ``output.close`` (no call), so the file
    # was never explicitly flushed/closed; ``with`` guarantees it.
    with open(OUTPUT, "wb") as output:
        output.write(tokens_pb.SerializeToString())