def prepData(stopfilter, multiword, useDev=False):
    print("Preparing data...")
    ret = []  # list of token lists, one per tweet
    print("Reading data...")
    tweets = readTweets()
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    print(str(len(tweets)))
    tweets.extend(tweets_train)
    print(str(len(tweets_train)), "\t", str(len(tweets)))
    tweets.extend(tweets_trump)
    print(str(len(tweets_trump)), "\t", str(len(tweets)))
    if useDev:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets.extend(tweets_dev)
        print(str(len(tweets_dev)), "\t", str(len(tweets)))
    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)
    if multiword:
        return learnMultiword(ret)
    return ret
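# Hedged usage sketch (added; not part of the original script): prepData() returns one
# token list per tweet, which is the input format gensim's word2vec training expects.
# The hyperparameters below are only guessed from the model name
# "skip_nostop_multi_300features_10minwords_10context" used elsewhere in this code;
# they are an assumption, not the authors' actual training configuration.
def _demo_train_word2vec():
    from gensim.models import word2vec
    corpus = prepData(stopfilter=True, multiword=True)
    model = word2vec.Word2Vec(corpus, size=300, min_count=10, window=10, sg=1)  # sg=1: skip-gram
    model.save("skip_nostop_multi_300features_10minwords_10context")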
def deep_test():
    sess = tf.Session()
    start_dim = 50000
    x = tf.placeholder("float", [None, start_dim])
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, [500])
    tokens, vects, norm_tweets = convertTweetsToVec('all', start_dim)
    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev)
    devbatch = []
    for v in vects_dev:
        devbatch.append(v)

    # Add ops to save and restore all the variables, then restore them from disk.
    saver = tf.train.Saver()
    saver.restore(sess, "model.ckpt")
    print("Model restored.")

    decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})  # apply to dev
    encoded = sess.run(autoencoder['encoded'], feed_dict={x: devbatch})  # apply to dev

    sampnr = 12  # which one of the dev samples to display for a sanity check
    print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
    # print("\noriginal", norm_tweets[2])
    print(vects_dev[sampnr])

    # reconstruct a bag of words from the decoded vector of the displayed sample
    dec_tweet = []
    n = 0
    for r in decoded[sampnr]:
        if r > 0.1:
            dec_tweet.append(tokens[n])
        n += 1

    print(" cost", sess.run(autoencoder['cost'], feed_dict={x: devbatch}))
    print(" encoded", encoded[sampnr])  # latent representation of the input, feed this to the SVM(s)
    print(" decoded", decoded[sampnr])
    print(" decoded bow", dec_tweet)
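# Hedged sketch (added): deep_test() above and deep() below only rely on create(x, layers)
# returning a dict with 'encoded', 'decoded' and 'cost' entries; create() itself is defined
# elsewhere. A minimal single-hidden-layer autoencoder with that interface could look like
# the following. The weight initialisation and the squared-error cost are assumptions, not
# necessarily what the original create() uses.
def create_sketch(x, layers):
    input_dim = int(x.get_shape()[1])
    hidden_dim = layers[0]  # only the first hidden layer is used in this sketch
    W_enc = tf.Variable(tf.random_uniform([input_dim, hidden_dim], -0.1, 0.1))
    b_enc = tf.Variable(tf.zeros([hidden_dim]))
    encoded = tf.nn.sigmoid(tf.matmul(x, W_enc) + b_enc)
    W_dec = tf.Variable(tf.random_uniform([hidden_dim, input_dim], -0.1, 0.1))
    b_dec = tf.Variable(tf.zeros([input_dim]))
    decoded = tf.nn.sigmoid(tf.matmul(encoded, W_dec) + b_dec)
    cost = tf.reduce_mean(tf.square(x - decoded))  # reconstruction error
    return {'encoded': encoded, 'decoded': decoded, 'cost': cost}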
            if key in tweet:
                target_in_tweet = 1
                break
        # The option below cares for tokenisation, but since hashtags are not tokenised
        # at the moment, the substring check above works better.
        #for tweettok in tokenised_tweet:
        #    if tweettok in target_keywords:
        #        target_in_tweet = 1
        #        break
        ret.append(target_in_tweet)
    return ret


if __name__ == '__main__':
    useDev = True
    if not useDev:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    # alternative checkpoint: "model_phrase_100_samp500_it2000.ckpt"
    features_train, labels_train, features_dev, labels_dev = extractFeaturesAutoencoder(
        "model_trump_phrase_100_samp500_it2600.ckpt",
        tweets_train, targets_train, labels_train,
        tweets_dev, targets_dev, labels_dev, "false", True)

    # train and predict two 2-way models:
    #train_classifiers(features_train, labels_train, features_dev, labels_dev, "out_auto_added.txt")
    train_classifier_3way(features_train, labels_train, features_dev, labels_dev,
                          "out_trump_postprocess.txt", [], "false", "false", useDev=useDev)
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons",
                                   "affect", "w2v", "bow_phrase"],
                         automodel="model.ckpt",
                         w2vmodel="skip_nostop_multi_300features_10minwords_10context",
                         phrasemodel="phrase.model", useDev=True):
    if not useDev:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(
            tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []
    if "bow" in features:
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train, features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final)
    elif "targetInTweet" in features:
        features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if "bow_phrase" in features or "bow_phrase_anon" in features:
        if "bow_phrase" in features:
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train,
                                                       features_vocab, usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev,
                                                     features_vocab, usephrasemodel=True)
        elif "bow_phrase_anon" in features:
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True,
                                                 anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train,
                                                       features_vocab, usephrasemodel=True,
                                                       anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev,
                                                     features_vocab, usephrasemodel=True,
                                                     anon_targets=True)
        features_final.extend(features_vocab)

    if "auto_added" in features:
        useph = "phrase" in automodel
        features_train_auto, labels_train, features_dev_auto, labels_dev = \
            extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train,
                                       tweets_dev, targets_dev, labels_dev, "added",
                                       usephrasemodel=useph)
    elif "auto_true" in features:
        useph = "phrase" in automodel
        features_train_auto, labels_train, features_dev_auto, labels_dev = \
            extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train,
                                       tweets_dev, targets_dev, labels_dev, "true",
                                       usephrasemodel=useph)
    elif "auto_false" in features:
        useph = "phrase" in automodel
        features_train_auto, labels_train, features_dev_auto, labels_dev = \
            extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train,
                                       tweets_dev, targets_dev, labels_dev, "false",
                                       usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if "targetInTweet" in features and "bow" in features:
        targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if "emoticons" in features:
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if "affect" in features:
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)
    if "hash" in features:
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif "w2v_hash" in features:  # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(
            w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features
    for i, featvec in enumerate(features_train):
        if "auto_added" in features or "auto_true" in features or "auto_false" in features:
            # numpy append works as extend works for python lists
            features_train[i] = np.append(features_train[i], features_train_auto[i])
        if "targetInTweet" in features and "bow" in features:
            features_train[i] = np.append(features_train[i], targetInTweetTrain[i])
        if "bow_phrase" in features or "bow_phrase_anon" in features:
            features_train[i] = np.append(features_train[i], features_train_phrbow[i])
        if "emoticons" in features:
            features_train[i] = np.append(features_train[i], emoticons_train[i])
        if "affect" in features:
            features_train[i] = np.append(features_train[i], affect_train[i])
        if "w2v_hash" in features or "hash" in features:
            features_train[i] = np.append(features_train[i], features_train_w2v[i])
    for i, featvec in enumerate(features_dev):
        if "auto_added" in features or "auto_true" in features or "auto_false" in features:
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if "targetInTweet" in features and "bow" in features:
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if "bow_phrase" in features or "bow_phrase_anon" in features:
            features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i])
        if "emoticons" in features:
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if "affect" in features:
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if "w2v_hash" in features or "hash" in features:
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])

    return features_train, labels_train, features_dev, labels_dev, features_final
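# Hedged usage sketch (added): extractFeaturesMulti() and train_classifier_3way()
# (defined further below) are typically chained; the feature list and the output file
# name here are illustrative, not the configuration used in the original experiments.
def _demo_multi_feature_run():
    feats_train, labs_train, feats_dev, labs_dev, vocab = extractFeaturesMulti(
        features=["bow", "targetInTweet", "emoticons", "affect"], useDev=True)
    train_classifier_3way(feats_train, labs_train, feats_dev, labs_dev,
                          "out_demo.txt", feature_vocab=vocab, useDev=True)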
def train_classifier_3way(feats_train, labels_train, feats_dev, labels_dev, outfilepath,
                          feature_vocab=[], debug='false', auto_thresh='false',
                          useDev=True, postprocess=True):
    labels = []  # -1 for NONE, 0 for AGAINST, 1 for FAVOR
    labels_dev_tr = []  # transformed from "NONE" etc. to -1/0/1
    for i, lab in enumerate(labels_train):
        if lab == 'NONE':
            labels.append(-1)
        elif lab == 'FAVOR':
            labels.append(1)
        elif lab == 'AGAINST':
            labels.append(0)
    for i, lab in enumerate(labels_dev):
        if lab == 'NONE' or lab == 'UNKNOWN':
            labels_dev_tr.append(-1)
        elif lab == 'FAVOR':
            labels_dev_tr.append(1)
        elif lab == 'AGAINST':
            labels_dev_tr.append(0)

    print("Training classifier...")
    model = LogisticRegression(penalty='l2')  #, class_weight='balanced') #svm.SVC(class_weight={1: weight})
    model.fit(feats_train, labels)
    preds = model.predict(feats_dev)
    preds_prob = model.predict_proba(feats_dev)
    coef = model.coef_
    print("Label options", model.classes_)
    print("Labels", labels_dev_tr)
    print("Predictions", preds)
    print("Predictions prob", preds_prob)
    print("Feat length ", len(feats_train[0]))
    #print("Features ", len(feature_vocab), "\t", feature_vocab)
    #print("Weights ")
    #for co in coef:
    #    print(len(co), "\t", co)

    if not useDev:
        tweets_test_file = tokenize_tweets.FILEDEV
        target_short = "clinton"
    else:
        tweets_test_file = tokenize_tweets.FILETEST
        target_short = "trump"

    if auto_thresh == "true":
        print("Number dev samples:\t", len(labels_dev_tr))
        # integer division so the sample count stays an int under Python 3
        optlabels = optimiseThresh(labels_dev_tr, preds_prob, len(labels_dev_tr) // 2)
        printPredsToFileOneModel(tweets_test_file, outfilepath, optlabels,
                                 len(labels_dev_tr) // 2)
    else:
        printPredsToFileOneModel(tweets_test_file, outfilepath, preds)

    if postprocess:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tweets_test_file, 'windows-1252', 2)
        targetInTweet = {}  # istargetInTweet(tweets_dev, targets_dev)
        for i, tweet in enumerate(tweets_dev):
            target_keywords = tokenize_tweets.KEYWORDS.get(target_short)
            target_in_tweet = False
            for key in target_keywords:
                if key.lower() in tweet.lower():
                    target_in_tweet = True
                    break
            targetInTweet[i] = target_in_tweet
        predictions_new = []
        for i, pred_prob in enumerate(preds_prob):
            inTwe = targetInTweet[i]
            if inTwe:
                # probability columns are ordered NONE/AGAINST/FAVOUR; if the target is
                # mentioned in the tweet, never predict NONE
                pred = 0
                if pred_prob[2] > pred_prob[1]:
                    pred = 1
                predictions_new.append(pred)
            else:
                plist = pred_prob.tolist()
                pred = plist.index(max(plist)) - 1
                predictions_new.append(pred)
        printPredsToFileOneModel(tweets_test_file, outfilepath, predictions_new)

    if debug == "true":
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        # printProbsToFileOneModel(tokenize_tweets.FILEDEV, outfilepath.replace(".txt", ".debug.txt"), preds_prob, preds)
        print("\nFeature analysis\nFeature\tNone\tAgainst\tFavor")
        for i, feat in enumerate(feature_vocab):
            print(feat, "\t", coef[0][i], "\t", coef[1][i], "\t", coef[2][i])
        print("\nActive features on dev (Hillary Clinton) per instance, coef for None/Against/Favour")
        for i, featvect in enumerate(feats_dev):
            featprint = []
            for ii, feat in enumerate(featvect):
                featname = feature_vocab[ii]
                if feat == 1.0:
                    featprint.append("[" + featname + " " + str(coef[0][ii]) + " / " +
                                     str(coef[1][ii]) + " / " + str(coef[2][ii]) + "]")
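# Hedged note (added): because the training labels are encoded as -1 (NONE), 0 (AGAINST)
# and 1 (FAVOR), scikit-learn sorts model.classes_ in ascending order, so predict_proba()
# columns come back as [NONE, AGAINST, FAVOR]. That is what the post-processing above
# relies on when it compares pred_prob[2] (FAVOR) against pred_prob[1] (AGAINST).
# A tiny self-contained check with made-up toy data:
def _demo_class_order():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    X = np.array([[0.0], [1.0], [2.0], [0.1], [1.1], [2.1]])
    y = [-1, 0, 1, -1, 0, 1]  # NONE, AGAINST, FAVOR
    clf = LogisticRegression(penalty='l2').fit(X, y)
    print(clf.classes_)              # [-1  0  1]
    print(clf.predict_proba(X[:1]))  # columns follow classes_ order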
    for it, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet)
        label = labels[it]
        for token in tokenised_tweet:
            if token.startswith("#"):
                all[token] += 1
                if label == "NONE":
                    neut[token] += 1
                elif label == "AGAINST":
                    neg[token] += 1
                elif label == "FAVOR":
                    pos[token] += 1

    print("Hashtags\tAll\tNeut\tNeg\tPos")
    for token, count in all.most_common():
        neutrcnt, poscnt, negcnt = 0, 0, 0
        if token in neut:
            neutrcnt = neut[token]
        if token in neg:
            negcnt = neg[token]
        if token in pos:
            poscnt = pos[token]
        print(token, "\t", count, "\t", neutrcnt, "\t", negcnt, "\t", poscnt)


if __name__ == '__main__':
    #tweets_train, targets_dev, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    #tweets_train, targets_dev, labels_train = readTweetsOfficial(tokenize_tweets.FILETRUMP, 'utf-8', 1)
    tweets_train, targets_dev, labels_train = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    countHashTags(tweets_train, labels_train)
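# Hedged note (added): the counting loop above relies on hashtag counters created earlier
# in countHashTags(), which is not shown in this excerpt. Since all.most_common() is
# called, they are presumably collections.Counter objects. A self-contained version of the
# same idea, using a plain whitespace split instead of the project's tokenize():
def _demo_hashtag_counts(tweets, labels):
    from collections import Counter
    all_tags, neut, neg, pos = Counter(), Counter(), Counter(), Counter()
    by_label = {"NONE": neut, "AGAINST": neg, "FAVOR": pos}
    for tweet, label in zip(tweets, labels):
        for token in tweet.split():
            if token.startswith("#"):
                all_tags[token] += 1
                if label in by_label:
                    by_label[label][token] += 1
    return all_tags, neut, neg, pos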
def deep(modelname, layers, phrasem=True, useDev=True):
    sess = tf.Session()

    # load and convert tweets
    tokens, vects, norm_tweets = convertTweetsToVec('all', 50000, phrasemodel=phrasem)
    # 129887 tokens without singletons. Dimensionality of the input: keep as big as
    # possible, but throw singletons away.
    start_dim = 50000  # tokens.__sizeof__()

    x = tf.placeholder("float", [None, start_dim])
    print("Creating autoencoder")
    # Dimensionality of the hidden layers. To start with, only use 1 hidden layer.
    autoencoder = create(x, layers)
    print("Creating Adam")
    train_step = tf.train.AdamOptimizer(0.1).minimize(autoencoder['cost'])
    print("Initialising all variables")
    init = tf.initialize_all_variables()
    sess.run(init)

    print("Converting official training data to vectors")
    tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    vects_train, norm_tweets_train = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_train, filtering=True)
    vects_trump, norm_tweets_trump = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_trump, filtering=True)
    for v in vects_train:
        vects.append(v)
    for v in vects_trump:
        vects.append(v)

    tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV)
    vects_dev, norm_tweets_dev = tokenize_tweets.convertTweetsOfficialToVec(
        start_dim, tokens, tweets_dev, filtering=True)

    # if the dev set is added to the training data, monitor cost on the test set instead
    devbatch = []
    if not useDev:
        for v in vects_dev:
            devbatch.append(v)
    else:
        for v in vects_dev:
            vects.append(v)
        tweets_test, targets_test, labels_test = readTweetsOfficial(tokenize_tweets.FILETEST)
        vects_test, norm_tweets_test = tokenize_tweets.convertTweetsOfficialToVec(
            start_dim, tokens, tweets_test, filtering=True)
        for v in vects_test:
            devbatch.append(v)

    # start training
    sampnr = 12  # which one of the dev samples to display for a sanity check
    print("\noriginal", labels_dev[sampnr], norm_tweets_dev[sampnr])
    # print("\noriginal", norm_tweets[2])
    print(vects[sampnr])

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # originally a fixed number of steps (#for i in range(2000):); now train until the
    # held-out cost drops below 0.01
    cost = 1.0
    i = 0
    while cost > 0.01:
        # make a batch of 500 randomly sampled tweet vectors
        batch = []
        for j in range(500):
            num = random.randint(0, len(vects) - 1)
            batch.append(vects[num])
        sess.run(train_step, feed_dict={x: np.array(batch)})
        if i % 100 == 0:
            decoded = sess.run(autoencoder['decoded'], feed_dict={x: devbatch})  # apply to dev
            encoded = sess.run(autoencoder['encoded'], feed_dict={x: devbatch})  # apply to dev
            #dec_tweet = []
            #n = 0
            #for r in decoded[sampnr]:  # display first result
            #    if r > 0.1:
            #        dec_tweet.append(tokens[n])
            #    n += 1
            cost = sess.run(autoencoder['cost'], feed_dict={x: devbatch})
            print(i, " cost", cost)
            #print(i, " original", batch[0])
            #print(i, " encoded", encoded[sampnr])  # latent representation of input, feed this to SVM(s)
            print(i, " decoded", decoded[sampnr])
            #print(i, " decoded bow", dec_tweet)
            save_path = saver.save(sess, modelname.replace(".ckpt", "_it" + str(i) + ".ckpt"))
            print("Model saved in file: %s" % save_path)
        i += 1
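# Hedged usage sketch (added): the checkpoint name passed to extractFeaturesAutoencoder()
# in the driver script above, "model_trump_phrase_100_samp500_it2600.ckpt", presumably
# comes from a call like the one below, since deep() appends "_it<iteration>" to the
# supplied name each time it saves. The layer size and base name here are inferred from
# that file name, not taken from the authors' actual invocation.
def _demo_train_autoencoder():
    deep("model_trump_phrase_100_samp500.ckpt", [100], phrasem=True, useDev=True)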
        if line.startswith('ID\t'):
            outfIn.write(line)
            outfOut.write(line)
        else:
            if cntr in inlist:
                outfIn.write(line)
            else:
                outfOut.write(line)
            cntr += 1
    outfIn.close()
    outfOut.close()


if __name__ == '__main__':
    tweets_gold, targets_gold, labels_gold = readTweetsOfficial(
        tokenize_tweets.FILEDEV, 'windows-1252', 2)
    tweets_res, targets_res, labels_res = readTweetsOfficial(
        "out_hillary_auto_false_targetInTweet.txt", 'windows-1252', 2)

    inlist = selectTrainData(tweets_gold, targets_gold)
    printInOutFiles(inlist, "out_hillary_auto_false_targetInTweet.txt",
                    "out_hillary_inTwe.txt", "out_hillary_outTwe.txt")
    printInOutFiles(inlist, tokenize_tweets.FILEDEV,
                    "_gold_hillary_inTwe.txt", "_gold_hillary_outTwe.txt")

    print("Inlist")
    eval("_gold_hillary_inTwe.txt", "out_hillary_inTwe.txt")
    print("Outlist")
    eval("_gold_hillary_outTwe.txt", "out_hillary_outTwe.txt")
if line.startswith("ID\t"): outfIn.write(line) outfOut.write(line) else: if cntr in inlist: outfIn.write(line) else: outfOut.write(line) cntr += 1 outfIn.close() outfOut.close() if __name__ == "__main__": tweets_gold, targets_gold, labels_gold = readTweetsOfficial(tokenize_tweets.FILEDEV, "windows-1252", 2) tweets_res, targets_res, labels_res = readTweetsOfficial( "out_hillary_auto_false_targetInTweet.txt", "windows-1252", 2 ) inlist = selectTrainData(tweets_gold, targets_gold) printInOutFiles( inlist, "out_hillary_auto_false_targetInTweet.txt", "out_hillary_inTwe.txt", "out_hillary_outTwe.txt" ) printInOutFiles(inlist, tokenize_tweets.FILEDEV, "_gold_hillary_inTwe.txt", "_gold_hillary_outTwe.txt") print("Inlist") eval("_gold_hillary_inTwe.txt", "out_hillary_inTwe.txt") print("Outlist") eval("_gold_hillary_outTwe.txt", "out_hillary_outTwe.txt")