def extractFeatureVocab(tweets, keyword="all", usephrasemodel=True, phrasemodel="phrase.model", anon_targets=False):
    tokencounts = Counter()
    features_final = []
    bigram = Phrases.load(phrasemodel)  # load the previously saved phrase (bigram) model
    #tokens_topic = []

    #if keyword == "all":
    #    for top in tokenize_tweets.TOPICS:
    #        if top != 'clinton':
    #            for tok in tokenize(tokenize_tweets.TOPICS_LONG[top]):
    #                tokens_topic.append(tok)
    #else:
    #    tokens_topic = tokenize(tokenize_tweets.TOPICS_LONG[keyword])

    for tweet in tweets:
        if usephrasemodel == False:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:  #unigram features
                tokencounts[token] += 1
                #for toktopic in tokens_topic:
                #    tokencounts[toktopic + '|' + token] += 1
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]): #bigram features
                tokencounts["_".join(l)] += 1
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    tokencounts["_".join(ltop) + '|' + "_".join(l)] += 1
        else:
            # this includes unigrams and frequent bigrams
            tokens = filterStopwords(tokenize(tweet.lower()))  #For Trump it's [1]
            phrasetoks = bigram[tokens]
            target_keywords = []
            if anon_targets==True:
                for top in tokenize_tweets.TOPICS:
                    if top == "climate": # hack, this is the only non-list value
                        target_keywords.append("climate")
                    else:
                        #for keyw in tokenize_tweets.KEYWORDS[top]:
                        target_keywords.extend(tokenize_tweets.KEYWORDS[top])

                phrasetoks_new = []
                for token in phrasetoks:
                    for keyw in target_keywords:
                        if keyw in token:
                            token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new

            for token in phrasetoks:
                tokencounts[token] += 1
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                tokencounts["_".join(l)] += 1

    for token, count in tokencounts.most_common():
        if count > 1:
            features_final.append(token)
            #print token, count

    return features_final
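# ---------------------------------------------------------------------------
# Assumed setup (not part of the original excerpt): the snippets below rely on
# roughly the following standard/third-party imports. Project-specific helpers
# (tokenize, filterStopwords, tokenize_tweets, readToks, readTweetsOfficial,
# the protobuf classes Tokens/Tweets, the INPUT/OUTPUT paths, ...) come from
# the surrounding repository and are not reproduced here.
# ---------------------------------------------------------------------------
# from collections import Counter, defaultdict
# import io, json, re, sys
# import numpy as np
# from gensim.models.phrases import Phrases
# from gensim.models import word2vec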
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    tokencnt = Counter()
    bigram = Phrases.load(phrasemodel)  # load the previously saved phrase (bigram) model

    twcntr = 0
    supercntr = 0
    trumpcntr = 0

    for line in open(INPUT, 'r'):
        twcntr += 1
        tokenised = tokenize(json.loads(line)['text'].lower())
        tokens = filterStopwords(tokenised) # filter stopwords
        for token in bigram[tokens]: # calling the phrase model, this leaves some as single tokens and merges frequently co-occurring ones into bigrams
            tokencnt[token] += 1


    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'): #for the Trump file it's utf-8
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))  #For Trump it's [1]
        for token in bigram[tokens]:
            supercntr += 1
            tokencnt[token] += 1

    if useDev == True:
        for line in io.open(tokenize_tweets.FILEDEV, encoding='windows-1252', mode='r'): #for the Trump file it's utf-8
            if line.startswith('ID\t'):
                continue
            tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))  #For Trump it's [1]
            for token in bigram[tokens]:
                supercntr += 1
                tokencnt[token] += 1

    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'): #for the Trump file it's utf-8
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[1].lower()))  #For Trump it's [1]
        for token in bigram[tokens]:
            trumpcntr += 1
            tokencnt[token] += 1


    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()

    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    print "Saving token counts for ", tokencnt.__sizeof__(), ". ", twcntr, " unlabelled tweets, ", trumpcntr, " Donald Trump tweets, ", supercntr, " labelled tweets"

    output.write(tokens_pb.SerializeToString())
    output.close()
def extractFeaturesBOW(tweets,
                       targets,
                       features_final,
                       anon_targets=False,
                       usephrasemodel=False,
                       phrasemodel="phrase.model"):

    bigram = Phrases.load(phrasemodel)  # load the previously saved phrase (bigram) model

    matrix = []  # np.zeros((len(features_final), len(tweets)))

    for i, tweet in enumerate(tweets):
        vect = np.zeros((len(features_final)))
        if usephrasemodel == False:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:
                insertIntoVect(features_final, vect, token)
                #for toktopic in tokens_topic:
                #    insertIntoVect(features_final, vect, toktopic + '|' + token)
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    insertIntoVect(features_final, vect, "_".join(ltop) + '|' + "_".join(l))
        else:
            inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
            target_keywords = tokenize_tweets.KEYWORDS.get(
                inv_topics.get(targets[i]))

            tokens = filterStopwords(tokenize(
                tweet.lower()))  #For Trump it's [1]
            phrasetoks = bigram[tokens]

            if anon_targets == True:
                phrasetoks_new = []
                for token in phrasetoks:
                    if target_keywords == "climate":
                        if target_keywords in token:
                            token = token.replace(target_keywords, "TARGET")
                    else:
                        for keyw in target_keywords:
                            if keyw in token:
                                token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new

            for token in phrasetoks:
                insertIntoVect(features_final, vect, token)
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))

        matrix.append(vect)
        #print " ".join(str(v) for v in vect), "\n"

    return matrix
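# `insertIntoVect` is used above but not defined in this excerpt. A minimal
# sketch of what it is assumed to do (look a feature up in the vocabulary and
# mark it in the bag-of-words vector); the original helper may instead count
# occurrences rather than set a binary flag:
def insertIntoVect(feats, vect, token):
    try:
        idx = feats.index(token)   # position of the feature in the vocabulary list
        vect[idx] = 1              # mark the feature as present in this tweet
    except ValueError:
        pass                       # feature not in the vocabulary: ignore it
    return vect


# Illustrative pipeline (variable names are placeholders): build the vocabulary
# once, then turn tweets into bag-of-words rows.
# features_final = extractFeatureVocab(tweets)
# bow_matrix = extractFeaturesBOW(tweets, targets, features_final, usephrasemodel=True)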
def countHashTags(tweets, labels):
    neut = Counter()
    neg = Counter()
    pos = Counter()
    all = Counter()

    for it, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        label = labels[it]
        for token in tokenised_tweet:
            if token.startswith("#"):
                all[token] += 1
                if label == "NONE":
                    neut[token] += 1
                elif label == "AGAINST":
                    neg[token] += 1
                elif label == "FAVOR":
                    pos[token] += 1

    print("Hashtags\tAll\tNeut\tNeg\tPos")
    for token, count in all.most_common():
        neutrcnt, poscnt, negcnt = 0, 0, 0
        if token in neut:
            neutrcnt = neut[token]
        if token in neg:
            negcnt = neg[token]
        if token in pos:
            poscnt = pos[token]
        print(token, "\t", count, "\t", neutrcnt, "\t", negcnt, "\t", poscnt)
def writeToksToFile():

    tokens,tweets_on_topic,tweets = readToks()


    for topic in TOPICS:

        tokenized_tweets = Tweets()

        for index in tweets_on_topic[topic]:

            tweet = tweets[index]

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    index = tokens.index(token)
                    tokenized.tokens.append(index)
                except ValueError:
                    tokenized.tokens.append(-1)

            print(tokenized.tokens)
            f = open(topic + '.tweets', "wb")
            f.write(tokenized_tweets.SerializeToString())
            f.close()
def prepData(stopfilter, multiword, useDev=False):
    print("Preparing data...")

    ret = [] # list of lists

    print("Reading data...")
    tweets = readTweets()
    tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(tokenize_tweets.FILETRUMP, 'utf-8', 1)
    print(str(len(tweets)))
    tweets.extend(tweets_train)
    print(str(len(tweets_train)), "\t" , str(len(tweets)))
    tweets.extend(tweets_trump)
    print(str(len(tweets_trump)), "\t" , str(len(tweets)))
    if useDev == True:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets.extend(tweets_dev)
        print(str(len(tweets_dev)), "\t" , str(len(tweets)))


    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)

    if multiword:
        return learnMultiword(ret)
    else:
        return ret
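# `learnMultiword` is called above but not defined in this excerpt. A minimal
# sketch under the assumption that it trains a gensim phrase (collocation)
# model on the tokenised tweets, saves it, and returns the phrase-merged token
# lists; the path "phrase.model" mirrors the default used elsewhere in this
# file, and the min_count/threshold values are placeholders:
def learnMultiword(tokenised_tweets, outpath="phrase.model"):
    bigram = Phrases(tokenised_tweets, min_count=5, threshold=10.0)
    bigram.save(outpath)                     # later loaded via Phrases.load(outpath)
    return [bigram[toks] for toks in tokenised_tweets]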
def prepData(filepath, stopfilter, multiword):
    print("Preparing data...")

    ret = [] # list of lists

    print("Reading data...")
    # this reads file in JSON format
    #tweets = readTweets(jsonfilepath)

    # this reads SemEval format tweets
    tweets, _, _, _ = readTweetsOfficial(filepath)
    #tweets = "\n".join(tweets)

    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)

    if multiword:
        return learnMultiword(ret)
    else:
        return ret
def extractW2VAggrFeatures(w2vmodel, phrasemodel, tweets, targets, labels):

    feats = []
    # for each tweet, multiply the word vectors
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)
        numvects = 0
        vect = []
        for token in phrasemodel[words]:
            try:
                s = w2vmodel[token]
                vect.append(s)
                numvects += 1
            except KeyError:
                s = 0.0
        if vect.__len__() > 0:
            mtrmean = np.average(vect, axis=0)
            if i == 0:
                feats = mtrmean
            else:
                feats = np.vstack((feats, mtrmean))
        else:
            feats = np.vstack(
                (feats, np.zeros(300)))  # 300-dimensional vector for now

    return feats
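# Illustrative call (paths and variable names are assumptions, not from the
# source): load the saved phrase and word2vec models with gensim, then each
# tweet becomes the mean of its in-vocabulary word vectors. Note that
# `w2vmodel[token]` is the pre-gensim-4 lookup; current gensim versions use
# `w2vmodel.wv[token]` instead.
# phmodel = Phrases.load("phrase.model")
# w2v = word2vec.Word2Vec.load("w2v.model")            # hypothetical model path
# feats = extractW2VAggrFeatures(w2v, phmodel, tweets, targets, labels)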
def main(tweet_fp):
    pos_tagger = mallet_wrapper.MalletPOSTagger(_MODEL_FP, _TOKEN2POS_MAPS,
                                                _TOKEN_MAPS, _BIGRAM,
                                                _TEMP_DIR)
    tweet_tokens_list = []
    sys.stderr.write('Creating mallet test file.\n')
    for line in open(tweet_fp):
        tweet_tokens_list.append(twokenize_wrapper.tokenize(line.rstrip('\n')))
    return pos_tagger.pos_tag_tweets(tweet_tokens_list)
def convertTweetsOfficialToVec(numtoks,
                               tokens,
                               tweets,
                               filtering=False,
                               phrasemodelpath="phrase.model"):

    tokens_sub = tokens[:numtoks]
    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []

    if filtering == True:
        bigram = Phrases.load(phrasemodelpath)  # load the previously saved phrase (bigram) model

    for tweet in tweets:

        vect = np.zeros(
            numtoks
        )  # dimensionality. the most frequent tokens have a low index, then we can do a cutoff. original: 93988
        norm_tweet = []

        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet
        if filtering == False:
            tokenised_tweet = tokenize(tokenized.tweet)
        else:
            tokens = filterStopwords(tokenize(tokenized.tweet.lower()))
            tokenised_tweet = bigram[tokens]
        for token in tokenised_tweet:
            try:
                index = tokens_sub.index(token)
            except ValueError:
                index = -1
            if index > -1:
                vect[index] = 1
                norm_tweet.append(token)
            else:
                norm_tweet.append('NULL')

        #print(norm_tweet)
        norm_tweets.append(norm_tweet)
        vects.append(vect)

    return vects, norm_tweets
def findTokensAll():
    tokens = Counter()

    twcntr = 0
    supercntr = 0
    trumpcntr = 0

    for line in open(INPUT, 'r'):
        twcntr += 1
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1

    for line in io.open(tokenize_tweets.FILETRAIN,
                        encoding='windows-1252',
                        mode='r'):  #for the Trump file it's utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):  #For Trump it's [1]
            supercntr += 1
            tokens[token] += 1

    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8',
                        mode='r'):  #for the Trump file it's utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[1]):  #For Trump it's [1]
            trumpcntr += 1
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()

    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count

    print("Saving token counts for ", tokens.__sizeof__(), ". ", twcntr,
          " unlabelled tweets, ", trumpcntr, " Donald Trump tweets, ",
          supercntr, " labelled tweets")

    output.write(tokens_pb.SerializeToString())
    output.close()
    def cleanHelper(self, body):
        tokens = tokenize(body)
        tokens = [x.lower().strip() for x in tokens]
        tokens = [x for x in tokens if emoticons(x) == "NA"]
        tokens = [x.strip(" #\-*!._(){}~,^") for x in tokens]
        tokens = [self.normalizer.replace(x) for x in tokens]
        tokens = [x for x in tokens if re.search("\w", x)]

        body = " ".join(tokens)
        return tokens, body
def thinposts(lines):

    for line in lines:
        m = re.search(subreddit_re, line)
        if not m:
            continue

        comment = json.loads(line)

        if comment['body'] == '[deleted]':
            continue

        if comment['subreddit'].lower() in subreddits:
            reformed_text = ' '.join(twokenize_wrapper.tokenize(comment['body']))
            yield reformed_text.strip() + ' <EOS>'
def thinposts(lines):

    posts = []

    for line in lines:
#        if not re.search(subreddit_re, line):
#            continue

        comment = json.loads(line)

        if comment['text'] == '[deleted]':
            continue
    
        if comment['community'][1]['name'].lower() in subreddits:
            tokens = twokenize_wrapper.tokenize(comment['text'].strip())
            yield ' '.join(tokens) + ' <EOS> '
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}


    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0


        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        # guard against zero counts (no in-vocabulary tokens) to avoid division by zero
        neutsc_tweet = neutsc / neutcnt if neutcnt else 0.0
        possc_tweet = possc / poscnt if poscnt else 0.0
        negsc_tweet = negsc / negcnt if negcnt else 0.0
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
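# Note: `w2vmodel.similarity(a, b)` above is the pre-gensim-4 API; on current
# gensim the equivalent call is `w2vmodel.wv.similarity(a, b)`, with the same
# KeyError behaviour for out-of-vocabulary tokens.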
def findTokensJson():
    tokens = Counter()

    for line in open(INPUT, 'r'):
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()

    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    output.write(tokens_pb.SerializeToString())
    output.close()
def findTokensOfficial():
    tokens = Counter()

    for line in io.open(INPUT, encoding='windows-1252', mode='r'): #for the Trump file it's utf-8
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):  #For Trump it's [1]
            tokens[token] += 1

    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()

    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count

    output.write(tokens_pb.SerializeToString())
    output.close()
def extractFeaturesCrossTweetTarget(tweets, targets):
    ret = []
    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
    #TOPICS = inv_topics.keys()
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        target_keywords = tokenize_tweets.KEYWORDS.get(inv_topics.get(targets[i]))
        target_in_tweet = 0
        for key in target_keywords:
            if key in tweet:
                target_in_tweet = 1
                break
        # option below cares for tokenisation, but since hashtags are not tokenised at the moment, the above works better
        #for tweettok in tokenised_tweet:
        #    if tweettok in target_keywords:
        #        target_in_tweet = 1
        #        break
        ret.append(target_in_tweet)
    return ret
def thinposts(lines):

    posts = []

    for line in lines:
#        if not re.search(subreddit_re, line):
#            continue

        comment = json.loads(line)

        if comment['text'] == '[deleted]':
            continue
    
        if comment['community'][1]['name'].lower() in subreddits:
            out_comment = {}
            tokens = twokenize_wrapper.tokenize(comment['text'])
            out_comment['body'] = comment['text']
            out_comment['subreddit'] = comment['community'][1]['name']
            out_comment['author'] = comment['user']['username']
            out_comment['created_utc'] = str(int(comment['createdAt']) / 1000)
            out_comment['tokens'] = tokens
            out_comment['id'] = 'x'
            yield(json.dumps(out_comment))
def main(argv):
	import nltk
	import random, re
	import twokenize_wrapper as tok
	import pickle


	stopWords = {}
	st = open('stopWordsNew.txt', 'r')
	inputFile = open(argv[0],'r')
	outputFile = open(argv[1],'w')
	maxEntObjectFile = open('maxEntObject.pkl','rb')

	for line in st:
		line = line.strip('\n')
		if(not stopWords.has_key(line)):
			stopWords[line] = 1

	def featureFunc(tweet):
		feat = {}
		for word in tweet:
			feat[word] = 1
		return feat

	wnl = nltk.stem.WordNetLemmatizer()
	wordListMap = {}
	tokenizedTweets = []
	totalTweets = 0

	for line in inputFile:
		tweet = line.strip('\n')
		tweet = tweet.lower()
		tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 				#HASH TAG
		tweet = re.sub(r'(@[\w]+)','_HANDLE_',tweet)			#HANDLE
		tweet = re.sub(r'http[\w://.~?=%&-]+','_URL_',tweet)	#URL
		tweet = re.sub(r'(/|:|&|\(|\))',' ',tweet) 				# / : & ( ) spaced
		tweet = re.sub(r'(\d+)',r' \1 ',tweet) 					# Digit clusters spaced
		tweet = re.sub(r'(\w+)(-|;)(\w+)',r'\1 \3',tweet)		# words(-|;)word separated
		tokens = tok.tokenize(tweet)
		
		pattern = re.compile(r"(.)\1{2,}", re.DOTALL) # hunggggryy -> hungryy

		newTokens = []
		flag = 0
		for word in tokens:
			word = pattern.sub(r"\1", word)
			# word = word.strip('\'"?,.!')
			word = word.strip('.,();-*~[]_=|+%')
			word = re.sub(r'(\w+)[..|.](\w+)',r'\1 \2',word)
			newWord = word.split()
			for word in newWord:
				word = wnl.lemmatize(word)
		# 		if(stopWords.has_key(word) or word == ''or word.isdigit())):
				if(stopWords.has_key(word) or word == '' or word.isdigit() or word=='\''):	
					continue
				else:
					if(flag == 1):
						word = "NOT_" + word
						flag = 0
					if(word == "n't" and flag == 0):
						flag = 1
						word = "not"
					newTokens.append(word)
		for word in newTokens:
			# str = str + word + ' '
			if(not wordListMap.has_key(word)):
				wordListMap[word] = 1
		if(len(newTokens)>0):
			totalTweets = totalTweets + 1
			tokenizedTweets.append(newTokens)

	classifier = pickle.load(maxEntObjectFile)
	for i in range(0,len(tokenizedTweets)):
		testTweet = tokenizedTweets[i]
		pred = classifier.classify(featureFunc(testTweet))
		outputFile.write(str(pred)+"\n")
	return
    i += 1
dictMap = {}
i = 1
for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
    dictionary = line.rstrip('\n')
    dictMap[i] = dictionary
    i += 1
dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
    (dictionary, label) = line.rstrip('\n').split(' ')
    dict2label[dictionary] = label

nLines = 1
for line in sys.stdin:
    line = line.rstrip('\n')
    words = twokenize_wrapper.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    # POS Tagging the tweet
    if posTagger:
        pos = posTagger.TagSentence(words)
        pos = [p.split(':')[0] for p in pos]  # remove weights
    else:
        pos = fields[-1].split(' ')

    # Chunking the tweet
    if chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
    def process_text(self, sentence):
        sentence = sentence.strip()
        sentence = re.sub(self.open_a, '', sentence)
        sentence = re.sub(self.close_a, '', sentence)

        return tokenize(sentence.lower())
def convertTweetsToVec(topic="all", numtoks='all', phrasemodel=False, phrasemodelpath="phrase.model"):

    print("Reading tokens")
    tokens,tweets_on_topic,tweets = readToks(phrasemodel)

    if phrasemodel==True:
        bigram = Phrases.load(phrasemodelpath)  # load the previously saved phrase (bigram) model

    if numtoks != "all":
        tokens_sub = tokens[:numtoks]
    else:
        tokens_sub = tokens
        numtoks = len(tokens)  # use the full vocabulary size as the vector dimensionality

    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []

    print("Converting JSON tweets")
    if topic=='all':
        #for topic in TOPICS:
        for tweet in tweets:

            vect = np.zeros(numtoks, dtype=bool)  # dimensionality. the most frequent tokens have a low index, then we can do a cutoff. original: 93988
            norm_tweet = []

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            if phrasemodel == False:
                tokenised_tweet = tokenize(tweet['text'])
            else:
                tokens = filterStopwords(tokenize(tweet['text'].lower()))
                tokenised_tweet = bigram[tokens]
            for token in tokenised_tweet:
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')

            #print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    else:  # discouraged, needs to be updated
        for index in tweets_on_topic[topic]:

            tweet = tweets[index]
            vect = np.zeros(numtoks)  # dimensionality. the most frequent tokens have a low index, then we can do a cutoff. original: 93988
            norm_tweet = []

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')

            print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)

    print("Finished converting JSON tweets")
    return tokens_sub,vects,norm_tweets
def extractW2VHashFeatures(w2vmodel, phrasemodel, mode, tweets, targets,
                           labels):
    features = []

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        neutsim = w2vmodel.most_similar(neut, topn=60)
        possim = w2vmodel.most_similar(pos, topn=60)
        negsim = w2vmodel.most_similar(neg, topn=60)

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp = 0, 0, 0, 0, 0, 0

        # transform, as earlier, with the phrase model
        for token in phrasemodel[words]:
            if neut == token:
                neutsimp = 1
            if pos == token:
                possimp = 1
            if neg == token:
                negsimp = 1
            for n, sc in neutsim:
                if sc >= 0.4 and n == token:
                    neutcnt += 1
            for n, sc in possim:
                if sc >= 0.4 and n == token:
                    poscnt += 1
            for n, sc in negsim:
                if sc >= 0.4 and n == token:
                    negcnt += 1

        #print targets[i], "\t", labels[i], "\t", neutcnt, "\t", poscnt, "\t", negcnt, "\t", neutsimp, "\t", possimp, "\t", negsimp
        #featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp]
        pn = 0
        if possim and negsim:
            pn = 1
            possimp = 0
            negsimp = 0
        if mode == "hash":
            featint = [neutsimp, possimp, negsimp, pn]
            features.append(featint)
        if mode == "w2v_hash":
            featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp, pn]
            features.append(featint)

    featlabels = []
    if mode == "hash":
        featlabels = ["neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    if mode == "w2v_hash":
        featlabels = [
            "neut_extw2v", "pos_extw2v", "neg_extw2v", "neut_hash", "pos_hash",
            "neg_hash", "posneg_hash"
        ]

    return features, featlabels
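# Illustrative call (model variables are assumptions; see the loading sketch
# after extractW2VAggrFeatures above): mode="hash" returns only the
# hashtag-presence flags, while mode="w2v_hash" additionally counts tokens
# whose word2vec similarity to the stance hashtags is at least 0.4. As noted
# earlier, `most_similar` lives on `w2vmodel.wv` in current gensim versions.
# feats, featlabels = extractW2VHashFeatures(w2v, phmodel, "w2v_hash", tweets, targets, labels)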
            self.GetTagger()

        feat_list = []
        for word in words:
            feat_list.append(self.fe.get_features(word))
        # Add context features
        feat_list = features.add_context_features(feat_list)
        self.fe.add_bigram_features(feat_list)

        # Create string to feed into Mallet
        feat_list_str = []
        for word_feats in feat_list:
            feat_list_str.append(' '.join(word_feats))

        self.tagger.stdin.write(("\t".join(feat_list_str) + "\n").encode('utf8'))
        pos = []
        for i in range(len(feat_list)):
            pos.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))
        self.nTagged += 1
        return pos


if __name__ == "__main__":
    posTagger = PosTagger()
    for line in sys.stdin:
        words = twokenize_wrapper.tokenize(line.strip())
        if not words:
            continue
        pos = posTagger.TagSentence(words)
        print "%s\t%s\t%s" % (line, " ".join(words), " ".join(pos))
def create_project(sent_file, template_folder, new_proj_folder, new_proj_name):
    # Check that directories/files exist (or don't)
    if not os.path.isdir(template_folder):
        print 'Template folder does not exist: ', template_folder
        return
    elif os.path.isdir(new_proj_folder):
        print 'Output project folder already exists, not overwriting: ', new_proj_folder
        return
    elif not os.path.exists(sent_file):
        print 'Sentence file does not exist: ', sent_file
        return

    # Make sure the project name doesn't have spaces
    new_proj_name = re.sub(' ', '_', new_proj_name)

    # Copy over all standard mmax2 files
    shutil.copytree(template_folder, new_proj_folder)
    f = open(os.path.join(new_proj_folder, new_proj_name + '.mmax'), 'w')
    f.write(MMAX_STRING % new_proj_name)
    f.close()

    # Creating word, pos, and sentence files
    f_word = open(os.path.join(new_proj_folder,
                               new_proj_name + '_words.xml'), 'w')
    f_pos = open(os.path.join(new_proj_folder,
                              new_proj_name + '_POS_level.xml'), 'w')
    f_sent = open(os.path.join(new_proj_folder,
                               new_proj_name + '_sentence_level.xml'), 'w')

    # Add headers
    f_word.write(WORDS_HEADER_STRING)
    f_pos.write(POS_HEADER_STRING)
    f_sent.write(SENT_HEADER_STRING)

    # For each sentence(tweet) in the file.
    tweet_tokens_list = []
    for tweet in open(sent_file):
        tweet_tokens_list.append(twokenize_wrapper.tokenize(tweet.strip()))

    # Load POS tagger and tag the tweets
    pos_tagger = mallet_wrapper.MalletPOSTagger(_model_location, _token2pos,
                                                _token, _temp_dir)
    pos_tagged_tweets = pos_tagger.pos_tag_tweets(tweet_tokens_list)

    # Add markables  
    word_count = 1
    tweet_count = 1
    for tagged_tweet in pos_tagged_tweets:
        start_count = word_count
        # For each word/pos in the sentence
        for word, pos in tagged_tweet:
            # Check if the word is a user, RT, or hash tag
            new_pos = symbol_tag.tag_token(word)
            if new_pos:
                pos = new_pos
            f_word.write(WORD_STRING % (word_count, word))
            f_pos.write(POS_STRING % (word_count, word_count, pos.lower()))
            word_count += 1

        f_sent.write(SENT_STRING % (tweet_count, start_count, word_count - 1))
        tweet_count += 1

    # Add closing tags
    f_word.write('</words>')
    f_pos.write('</markables>')
    f_sent.write('</markables>')

    f_word.close()
    f_pos.close()
    f_sent.close()
tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break


for topic in topics:

    tokenized_tweets = Tweets()

    for index in tweets_on_topic[topic]:

        tweet = tweets[index]

        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet['text']
        for token in tokenize(tweet['text']):
            try:
                index = tokens.index(token)
                tokenized.tokens.append(index)
            except ValueError:
                tokenized.tokens.append(-1)

        f = open(topic + '.tweets', "wb")
        f.write(tokenized_tweets.SerializeToString())
        f.close()