Example #1
def prepData(stopfilter, multiword, useDev=False):
    print("Preparing data...")

    ret = []  # list of lists

    print("Reading data...")
    tweets = readTweets()
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    print(len(tweets))
    tweets.extend(tweets_train)
    print(len(tweets_train), "\t", len(tweets))
    tweets.extend(tweets_trump)
    print(len(tweets_trump), "\t", len(tweets))
    if useDev:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets.extend(tweets_dev)
        print(len(tweets_dev), "\t", len(tweets))

    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)

    if multiword:
        return learnMultiword(ret)
    else:
        return ret
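

# Hedged usage sketch (parameter values are assumptions; the save name mirrors
# the w2vmodel default used elsewhere in these examples): prepData returns one
# token list per tweet, which is the shape gensim's word2vec training expects.
#
#   from gensim.models import word2vec
#   tokenised = prepData(stopfilter=True, multiword=True)
#   model = word2vec.Word2Vec(tokenised, min_count=10, workers=4)
#   model.save("skip_nostop_multi_300features_10minwords_10context")
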
def deep_test():
    sess = tf.Session()

    start_dim = 50000

    x = tf.placeholder("float", [None, start_dim])
    autoencoder = create(
        x, [500]
    )  # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
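    # create() (defined elsewhere) is assumed, from its use below, to return a
    # dict of tensors keyed 'encoded', 'decoded' and 'cost'.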

    tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim)
    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev)
    devbatch = []
    for v in vects_dev:
        devbatch.append(v)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Restore variables from disk.
    saver.restore(sess, "model.ckpt")
    print("Model restored.")

    decoded = sess.run(autoencoder['decoded'],
                       feed_dict={x: devbatch})  # apply to dev
    encoded = sess.run(autoencoder['encoded'],
                       feed_dict={x: devbatch})  # apply to dev

    sampnr = 12  # which of the dev samples to display for a sanity check
    print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
    print(vects_dev[sampnr])

    # map the decoded vector back to vocabulary tokens: keep every dimension
    # whose activation exceeds 0.1
    dec_tweet = []
    for n, r in enumerate(decoded[sampnr]):
        if r > 0.1:
            dec_tweet.append(tokens[n])

    print(" cost", sess.run(autoencoder['cost'], feed_dict={x: devbatch}))
    #print i, " original", batch[0]
    print(
        " encoded",
        encoded[sampnr])  # latent representation of input, feed this to SVM(s)
    print(" decoded", decoded[sampnr])
    print(" decoded bow", dec_tweet)


if __name__ == '__main__':
    useDev = True
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    if not useDev:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        # train on train+dev and evaluate on the test file
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    # alternative checkpoint: "model_phrase_100_samp500_it2000.ckpt"
    features_train, labels_train, features_dev, labels_dev = extractFeaturesAutoencoder(
        "model_trump_phrase_100_samp500_it2600.ckpt",
        tweets_train, targets_train, labels_train,
        tweets_dev, targets_dev, labels_dev, "false", True)

    #train_classifiers(features_train, labels_train, features_dev, labels_dev, "out_auto_added.txt") # train and predict two 2-way models
    train_classifier_3way(features_train, labels_train, features_dev, labels_dev,
                          "out_trump_postprocess.txt", [], "false", "false",
                          useDev=useDev)
def extractFeaturesMulti(
        features=[
            "auto_false", "bow", "targetInTweet", "emoticons", "affect", "w2v",
            "bow_phrase"
        ],
        automodel="model.ckpt",
        w2vmodel="skip_nostop_multi_300features_10minwords_10context",
        phrasemodel="phrase.model",
        useDev=True):
    # train is always read; with useDev the dev set joins training and the
    # test set becomes the evaluation data
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    if not useDev:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []

    if features.__contains__("bow"):
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train,
                                            features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev,
                                          features_final)
    elif features.__contains__("targetInTweet"):
        features_train = extractFeaturesCrossTweetTarget(
            tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if features.__contains__("bow_phrase") or features.__contains__(
            "bow_phrase_anon"):
        if features.__contains__("bow_phrase"):
            features_vocab = extractFeatureVocab(tweets_train,
                                                 usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train,
                                                       targets_train,
                                                       features_vocab,
                                                       usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev,
                                                     targets_dev,
                                                     features_vocab,
                                                     usephrasemodel=True)
        elif features.__contains__("bow_phrase_anon"):
            features_vocab = extractFeatureVocab(tweets_train,
                                                 usephrasemodel=True,
                                                 anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train,
                                                       targets_train,
                                                       features_vocab,
                                                       usephrasemodel=True,
                                                       anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev,
                                                     targets_dev,
                                                     features_vocab,
                                                     usephrasemodel=True,
                                                     anon_targets=True)
        features_final.extend(features_vocab)

    if features.__contains__("auto_added"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(
            automodel,
            tweets_train,
            targets_train,
            labels_train,
            tweets_dev,
            targets_dev,
            labels_dev,
            "added",
            usephrasemodel=useph)
    elif features.__contains__("auto_true"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(
            automodel,
            tweets_train,
            targets_train,
            labels_train,
            tweets_dev,
            targets_dev,
            labels_dev,
            "true",
            usephrasemodel=useph)
    elif features.__contains__("auto_false"):
        useph = False
        if "phrase" in automodel:
            useph = True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(
            automodel,
            tweets_train,
            targets_train,
            labels_train,
            tweets_dev,
            targets_dev,
            labels_dev,
            "false",
            usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if features.__contains__("targetInTweet") and features.__contains__("bow"):
        targetInTweetTrain = extractFeaturesCrossTweetTarget(
            tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(
            tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if features.__contains__("emoticons"):
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if features.__contains__("affect"):
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)

    if features.__contains__("hash"):
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "hash", tweets_train, targets_train,
            labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif features.__contains__("w2v_hash"):  # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train,
            labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features: append each enabled feature block to the base vectors
    use_auto = ("auto_added" in features or "auto_true" in features
                or "auto_false" in features)
    use_tinf = "targetInTweet" in features and "bow" in features
    use_phrbow = "bow_phrase" in features or "bow_phrase_anon" in features
    use_w2v = "w2v_hash" in features or "hash" in features
    for i, featvec in enumerate(features_train):
        if use_auto:
            # np.append on 1-d arrays behaves like list.extend on Python lists
            features_train[i] = np.append(features_train[i],
                                          features_train_auto[i])
        if use_tinf:
            features_train[i] = np.append(features_train[i],
                                          targetInTweetTrain[i])
        if use_phrbow:
            features_train[i] = np.append(features_train[i],
                                          features_train_phrbow[i])
        if "emoticons" in features:
            features_train[i] = np.append(features_train[i],
                                          emoticons_train[i])
        if "affect" in features:
            features_train[i] = np.append(features_train[i], affect_train[i])
        if use_w2v:
            features_train[i] = np.append(features_train[i],
                                          features_train_w2v[i])
    for i, featvec in enumerate(features_dev):
        if use_auto:
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if use_tinf:
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if use_phrbow:
            features_dev[i] = np.append(features_dev[i],
                                        features_dev_phrbow[i])
        if "emoticons" in features:
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if "affect" in features:
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if use_w2v:
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])

    return features_train, labels_train, features_dev, labels_dev, features_final
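
# Hedged usage sketch (feature list and output file name are assumptions):
# wiring the combined features into the 3-way classifier from Example #8.
#
#   feats_train, labs_train, feats_dev, labs_dev, vocab = extractFeaturesMulti(
#       features=["bow", "targetInTweet", "emoticons"], useDev=True)
#   train_classifier_3way(feats_train, labs_train, feats_dev, labs_dev,
#                         "out_multi.txt", vocab, useDev=True)
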
Example #7
from collections import Counter  # assumed import: the counters below use most_common()


def countHashTags(tweets, labels):
    # scaffolding reconstructed (assumed) from the call in __main__ below:
    # count hashtag occurrences overall and per stance label
    all, neut, neg, pos = Counter(), Counter(), Counter(), Counter()
    for it, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        label = labels[it]
        for token in tokenised_tweet:
            if token.startswith("#"):
                all[token] += 1
                if label == "NONE":
                    neut[token] += 1
                elif label == "AGAINST":
                    neg[token] += 1
                elif label == "FAVOR":
                    pos[token] += 1

    print("Hashtags\tAll\tNeut\tNeg\tPos")
    for token, count in all.most_common():
        neutrcnt = neut[token] if token in neut else 0
        negcnt = neg[token] if token in neg else 0
        poscnt = pos[token] if token in pos else 0
        print(token, "\t", count, "\t", neutrcnt, "\t", negcnt, "\t", poscnt)


if __name__ == '__main__':
    #tweets_train, targets_dev, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    #tweets_train, targets_dev, labels_train = readTweetsOfficial(tokenize_tweets.FILETRUMP, 'utf-8', 1)
    tweets_train, targets_dev, labels_train = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    countHashTags(tweets_train, labels_train)
Example #8
def train_classifier_3way(feats_train,
                          labels_train,
                          feats_dev,
                          labels_dev,
                          outfilepath,
                          feature_vocab=[],
                          debug='false',
                          auto_thresh='false',
                          useDev=True,
                          postprocess=True):
    labels = []  # -1 for NONE, 0 for AGAINST, 1 for FAVOR
    labels_dev_tr = []  #transformed from "NONE" etc to -1,0,1

    for i, lab in enumerate(labels_train):
        if lab == 'NONE':
            labels.append(-1)
        elif lab == 'FAVOR':
            labels.append(1)
        elif lab == 'AGAINST':
            labels.append(0)

    for i, lab in enumerate(labels_dev):
        if lab == 'NONE' or lab == 'UNKNOWN':
            labels_dev_tr.append(-1)
        elif lab == 'FAVOR':
            labels_dev_tr.append(1)
        elif lab == 'AGAINST':
            labels_dev_tr.append(0)

    print("Training classifier...")

    model = LogisticRegression(penalty='l2')
    # alternatives tried: class_weight='balanced', and svm.SVC(class_weight={1: weight})
    model.fit(feats_train, labels)
    preds = model.predict(feats_dev)
    preds_prob = model.predict_proba(feats_dev)
    coef = model.coef_
    print("Label options", model.classes_)

    print("Labels", labels_dev_tr)
    print("Predictions", preds)
    print("Predictions prob", preds_prob)
    print("Feat length ", feats_train[0].__len__())
    #print "Features ", feature_vocab.__len__(), "\t", feature_vocab
    #print "Weights "
    #for co in coef:
    #    print co.__len__(), "\t", co

    if not useDev:
        tweets_test_file = tokenize_tweets.FILEDEV
        target_short = "clinton"
    else:
        tweets_test_file = tokenize_tweets.FILETEST
        target_short = "trump"

    if auto_thresh == "true":
        print("Number dev samples:\t", len(labels_dev_tr))
        # integer division preserves the original Python 2 semantics of "/"
        optlabels = optimiseThresh(labels_dev_tr, preds_prob,
                                   len(labels_dev_tr) // 2)
        printPredsToFileOneModel(tweets_test_file, outfilepath, optlabels,
                                 len(labels_dev_tr) // 2)
    else:
        printPredsToFileOneModel(tweets_test_file, outfilepath, preds)

    if postprocess:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tweets_test_file, 'windows-1252', 2)
        targetInTweet = {}  #istargetInTweet(tweets_dev, targets_dev)
        for i, tweet in enumerate(tweets_dev):
            target_keywords = tokenize_tweets.KEYWORDS.get(target_short)
            target_in_tweet = False
            for key in target_keywords:
                if key.lower() in tweet.lower():
                    target_in_tweet = True
                    break
            targetInTweet[i] = target_in_tweet

        predictions_new = []
        for i, pred_prob in enumerate(preds_prob):
            inTwe = targetInTweet[i]
            if inTwe:
                # target mentioned: never predict NONE; choose between AGAINST (0)
                # and FAVOR (1). preds_prob columns follow model.classes_ =
                # [-1, 0, 1], i.e. NONE / AGAINST / FAVOR.
                pred = 0
                if pred_prob[2] > pred_prob[1]:
                    pred = 1
                predictions_new.append(pred)
            else:
                # otherwise take the argmax over all three classes, shifted back
                # to the -1/0/1 label encoding
                plist = pred_prob.tolist()
                pred = plist.index(max(plist)) - 1
                predictions_new.append(pred)
        printPredsToFileOneModel(tweets_test_file, outfilepath,
                                 predictions_new)

    if debug == "true":

        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)

        #    printProbsToFileOneModel(tokenize_tweets.FILEDEV, outfilepath.replace(".txt", ".debug.txt"), preds_prob, preds)
        print("\nFeature analysis\nFeature\tNone\tAgainst\tFavor")
        for i, feat in enumerate(feature_vocab):
            print(feat, "\t", coef[0][i], "\t", coef[1][i], "\t", coef[2][i])

        print(
            "\nActive features on dev (Hillary Clinton) per instance, coef for None/Against/Favour"
        )
        for i, featvect in enumerate(feats_dev):
            featprint = []
            for ii, feat in enumerate(featvect):
                featname = feature_vocab[ii]
                if feat == 1.0:
                    featprint.append("[" + featname + " " + str(coef[0][ii]) +
                                     " / " + str(coef[1][ii]) + " / " +
                                     str(coef[2][ii]) + "]")
Example #10
def extractFeaturesCrossTweetTarget(tweets, targets):
    # name and scaffolding are reconstructed assumptions; the original snippet
    # begins mid-loop. For each tweet, emit 1 if any keyword for its target
    # appears verbatim in the tweet, else 0.
    ret = []
    for tweet, target in zip(tweets, targets):
        target_keywords = tokenize_tweets.KEYWORDS.get(target)  # assumed lookup
        target_in_tweet = 0
        for key in target_keywords:
            if key in tweet:
                target_in_tweet = 1
                break
        # option below cares for tokenisation, but since hashtags are not
        # tokenised at the moment, the above works better
        #for tweettok in tokenised_tweet:
        #    if tweettok in target_keywords:
        #        target_in_tweet = 1
        #        break
        ret.append(target_in_tweet)
    return ret


def deep(modelname, layers, phrasem=True, useDev=True):
    sess = tf.Session()

    #load and convert tweets
    tokens, vects, norm_tweets = convertTweetsToVec('all',
                                                    50000,
                                                    phrasemodel=phrasem)

    start_dim = 50000  # input dimensionality; ~129887 tokens remain after dropping singletons, so keep this as large as feasible
    x = tf.placeholder("float", [None, start_dim])
    print("Creating autoencoder")
    autoencoder = create(
        x, layers
    )  # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    print("Creating Adam")
    train_step = tf.train.AdamOptimizer(0.1).minimize(autoencoder['cost'])

    print("Initialising all variables")
    init = tf.initialize_all_variables()
    sess.run(init)

    print("Converting official training data to vectors")
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_trump, norm_tweets_trump = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_trump, filtering=True)
    for v in vects_train:
        vects.append(v)
    for v in vects_trump:
        vects.append(v)

    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)

    devbatch = []
    if not useDev:
        # hold out the dev set as the monitoring batch
        for v in vects_dev:
            devbatch.append(v)
    else:
        # dev joins the training pool; the test set becomes the monitoring batch
        for v in vects_dev:
            vects.append(v)
        tweets_test, targets_test, labels_test = readTweetsOfficial(
            tokenize_tweets.FILETEST)
        vects_test, norm_tweets_test = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_test, filtering=True)
        for v in vects_test:
            devbatch.append(v)

    # start training
    sampnr = 12  # which of the dev samples to display for a sanity check
    print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
    print(vects[sampnr])

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    cost = 1.0
    # train until reconstruction cost on the held-out batch drops below 0.01
    # (an earlier variant ran a fixed number of steps: for i in range(2000))
    i = 0
    while cost > 0.01:
        # sample a random batch of 500 tweet vectors
        batch = []
        for j in range(500):
            num = random.randint(0, len(vects) - 1)
            batch.append(vects[num])
        sess.run(train_step, feed_dict={x: np.array(batch)})
        if i % 100 == 0:
            decoded = sess.run(autoencoder['decoded'],
                               feed_dict={x: devbatch})  # apply to dev
            encoded = sess.run(autoencoder['encoded'],
                               feed_dict={x: devbatch})  # apply to dev

            #dec_tweet = []
            #n = 0
            #for r in decoded[sampnr]:  # display first result
            #    if r > 0.1:
            #        dec_tweet.append(tokens[n])
            #    n+=1

            cost = sess.run(autoencoder['cost'], feed_dict={x: devbatch})
            print(i, " cost", cost)
            #print i, " original", batch[0]
            #print i, " encoded", encoded[sampnr] # latent representation of input, feed this to SVM(s)
            print(i, " decoded", decoded[sampnr])
            #print i, " decoded bow", dec_tweet

            save_path = saver.save(
                sess, modelname.replace(".ckpt", "_it" + str(i) + ".ckpt"))
            print("Model saved in file: %s" % save_path)
        i += 1
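
# Hedged invocation sketch: the checkpoint name and layer size are
# illustrative, mirroring names used elsewhere in these examples.
#
#   deep("model_trump_phrase_100_samp500.ckpt", [100], phrasem=True, useDev=True)
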
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons", "affect", "w2v", "bow_phrase"]
        , automodel="model.ckpt", w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model",
        useDev=True):
    if useDev==False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []

    if features.__contains__("bow"):
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train, features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final)
    elif features.__contains__("targetInTweet"):
        features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
        if features.__contains__("bow_phrase"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True)
        elif features.__contains__("bow_phrase_anon"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True, anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True, anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True, anon_targets=True)
        features_final.extend(features_vocab)

    if features.__contains__("auto_added"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "added", usephrasemodel=useph)
    elif features.__contains__("auto_true"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "true", usephrasemodel=useph)
    elif features.__contains__("auto_false"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "false", usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if features.__contains__("targetInTweet") and features.__contains__("bow"):
        targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if features.__contains__("emoticons"):
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if features.__contains__("affect"):
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)

    if features.__contains__("hash"):
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif features.__contains__("w2v_hash"): # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features
    for i, featvec in enumerate(features_train):#features_train_auto)
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_train[i] = np.append(features_train[i], features_train_auto[i])  # numpy append works as extend works for python lists
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_train[i] = np.append(features_train[i], targetInTweetTrain[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_train[i] = np.append(features_train[i], features_train_phrbow[i])
        if features.__contains__("emoticons"):
            features_train[i] = np.append(features_train[i], emoticons_train[i])
        if features.__contains__("affect"):
            features_train[i] = np.append(features_train[i], affect_train[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_train[i] = np.append(features_train[i], features_train_w2v[i])
    for i, featvec in enumerate(features_dev):#features_dev_auto):
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i])
        if features.__contains__("emoticons"):
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if features.__contains__("affect"):
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])


    return features_train, labels_train, features_dev, labels_dev, features_final
Example #13
def printInOutFiles(inlist, filepath, outfilepathIn, outfilepathOut):
    # signature and setup reconstructed (assumed) from the calls in __main__
    # below: copy each non-header line of filepath into one of two files
    # depending on whether its index appears in inlist
    outfIn = open(outfilepathIn, 'w')
    outfOut = open(outfilepathOut, 'w')
    cntr = 0
    for line in open(filepath, encoding='windows-1252'):
        if line.startswith('ID\t'):
            outfIn.write(line)
            outfOut.write(line)
        else:
            if cntr in inlist:
                outfIn.write(line)
            else:
                outfOut.write(line)
            cntr += 1

    outfIn.close()
    outfOut.close()


if __name__ == '__main__':
    tweets_gold, targets_gold, labels_gold = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    tweets_res, targets_res, labels_res = readTweetsOfficial(
        "out_hillary_auto_false_targetInTweet.txt", 'windows-1252', 2)

    inlist = selectTrainData(tweets_gold, targets_gold)
    printInOutFiles(inlist, "out_hillary_auto_false_targetInTweet.txt",
                    "out_hillary_inTwe.txt", "out_hillary_outTwe.txt")
    printInOutFiles(inlist, tokenize_tweets.FILEDEV, "_gold_hillary_inTwe.txt",
                    "_gold_hillary_outTwe.txt")

    print("Inlist")
    eval("_gold_hillary_inTwe.txt", "out_hillary_inTwe.txt")

    print("Outlist")
    eval("_gold_hillary_outTwe.txt", "out_hillary_outTwe.txt")
        if line.startswith("ID\t"):
            outfIn.write(line)
            outfOut.write(line)
        else:
            if cntr in inlist:
                outfIn.write(line)
            else:
                outfOut.write(line)
            cntr += 1

    outfIn.close()
    outfOut.close()


if __name__ == "__main__":
    tweets_gold, targets_gold, labels_gold = readTweetsOfficial(tokenize_tweets.FILEDEV, "windows-1252", 2)
    tweets_res, targets_res, labels_res = readTweetsOfficial(
        "out_hillary_auto_false_targetInTweet.txt", "windows-1252", 2
    )

    inlist = selectTrainData(tweets_gold, targets_gold)
    printInOutFiles(
        inlist, "out_hillary_auto_false_targetInTweet.txt", "out_hillary_inTwe.txt", "out_hillary_outTwe.txt"
    )
    printInOutFiles(inlist, tokenize_tweets.FILEDEV, "_gold_hillary_inTwe.txt", "_gold_hillary_outTwe.txt")

    print("Inlist")
    eval("_gold_hillary_inTwe.txt", "out_hillary_inTwe.txt")

    print("Outlist")
    eval("_gold_hillary_outTwe.txt", "out_hillary_outTwe.txt")