import os
import time
import datetime
import lib.utility as u
import config as c


def predict_word_dataset(word, dataset):
    """ Predicts presence of word in each instance in dataset, and returns results array. """
    # If predictions for this word are already cached on disk, reuse them
    if os.path.isfile(c.rbf_data + word + '.txt'):
        results = u.get_lines(c.rbf_data + word + '.txt')
        return results
    else:
        start_time = time.clock()
        print 'starting training for', word, 'at:', datetime.datetime.now().time()
        clf = u.get_clf(word, 'rbf')
        # Predict presence of word in each sentence, saving results
        results = clf.predict(dataset)
        print len(results)
        print
        with open(c.rbf_data + word + '.txt', 'w') as file:
            file.write(word + ' ')
            for i in results:
                file.write(str(i))
            file.write('\n')
        time_taken = (time.clock() - start_time) / 60
        print 'Time taken to train and write predictions to file for', word,
        print 'is:', time_taken, 'minutes'
        return results
def get_word_dataset(word):
    """ Loads predicted presence of word for each instance in dataset from file,
    and returns results array. """
    file_name = c.rbf_data + word + '.txt'
    # Get predicted classes for word
    results = u.get_lines(file_name)
    results = results[1:]
    return results
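# Hedged usage sketch (not part of the original module): shows how the two helpers
# above might be driven for a list of target words. The function name below is an
# illustrative placeholder; 'dataset' is assumed to be the feature matrix built
# elsewhere in the pipeline.
def predict_all_words(dataset, words):
    all_results = {}
    for w in words:
        # predict_word_dataset reuses cached predictions from c.rbf_data if present
        all_results[w] = predict_word_dataset(w, dataset)
    return all_results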
# Simply combines Large Movie Review datasets (test and train) into one larger file.
import lib.utility as u

if __name__ == "__main__":
    sents1 = set(u.get_lines("imdb-sentences-neg.txt"))
    sents2 = set(u.get_lines("imdb-sentences-test-neg.txt"))
    l1 = len(sents1)
    l2 = len(sents2)
    all = sents1.union(sents2)
    l3 = len(all)
    #print l1, l2, l3

    #with open("imdb-all-sentences-neg.txt", "w") as file:
    #    for s in all:
    #        if len(s.split()) > 1:
    #            file.write(s + "\n")
import lib.utility as u
import config as c
import random
import itertools
import os
from time import time

if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()

    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir + "sentences-without.txt"))

    # Get all sentences used already - using a dict (so we only care about sentences used for specific word)
    used_sentences = {}
    for word in target_words:
        with open(c.train_data_dir + word + ".txt") as file:
            sents = set()
            for line in file:
                sents.add(line[3:].strip())
            used_sentences[word] = sents

    # Get sentences with that we haven't used, and sentences without that we haven't used, using set difference
    #sentences_with_available = list(sentences_with.difference(used_sentences))
    sentences_without_available = list(sentences_without.difference(used_sentences))
# Take in pairs of targets and their cosine scores and select the highest N pairs
# to then compute agreement in normal way
import lib.utility as u
import config as c

if __name__ == "__main__":
    # Read in all pairs with cosine scores
    pairs = u.get_lines(c.output_dir + "edges-cosine.txt")

    # Iterate each pair and add to dictionary, where key is tuple and
    # value is score - then we can sort by value
    pairs_dict = {}
    for p in pairs:
        p = p.split()
        t1 = p[0]
        t2 = p[1]
        score = float(p[2])
        # check reverse isn't in dict already before inserting
        if (t2, t1) not in pairs_dict:
            pairs_dict[(t1, t2)] = score

    #total = len(pairs_dict)
    #print "total number of pairs =", total

    # Set N (number of pairs to select)
    N = 5000
    print "chosen number of pairs =", N
# Inside the loop over target-word pairs (pair and ctr come from the enclosing loop):

# Get individual words
#pair = pair.split()[0:2]
ti = pair[0]  # i
tj = pair[1]  # j
print ctr, "- i:", ti, "j:", tj

# Get row and col for matrix
row = target_words.index(ti)
col = target_words.index(tj)
#print row, col

# Read pair test dataset into sents array - file could have the words in reversed order
dir = c.output_dir + "agreement-data/"
sents = []
try:
    sents = u.get_lines(dir + ti + "-" + tj + ".txt")
except:
    try:
        sents = u.get_lines(dir + tj + "-" + ti + ".txt")
    except:
        # skip this pair if no file found for these words
        print "No dataset for:", ti, tj
        #sys.exit("Can't find dataset file. Program terminating.")
        continue

# Convert sents to binary feature vectors
data = []
for sentence in sents:
    vect = numpy.zeros(feat_size)
    for w in sentence.strip().split()[1:]:  # ignore target words in feature vectors
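# Hedged sketch (assumption, not original code): a common way to finish the binary
# feature-vector construction started above is an index lookup into the feature space.
# 'feature_index' (term -> column) is hypothetical; the real scripts read the feature
# space from a feat-space file.
import numpy

def sentence_to_binary_vector(sentence, feature_index, feat_size):
    vect = numpy.zeros(feat_size)
    for w in sentence.strip().split()[1:]:  # skip the leading label token
        if w in feature_index:
            vect[feature_index[w]] = 1
    return vect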
# Gets highest related pairs and prints to new file.
import lib.utility as u
import numpy

if __name__ == "__main__":
    sim_a = u.get_lines("output-700/list-sim.txt")
    agr_a = u.get_lines("output-700/list-agr.txt")

    # create dict of tuples from two arrays (so we can sort together)
    d = {}
    for i in xrange(len(sim_a)):
        d[i] = (float(sim_a[i]), float(agr_a[i]))
    #print d

    # sort dict and take top half highest and put into new arrays
    sim_a_top = []
    agr_a_top = []
    half = len(d) / float(2)
    i = 0
    for key in sorted(d, key=d.get, reverse=True):
        if i > half:
            break
        #print d[key]
        sim_a_top.append(d[key][0])
        agr_a_top.append(d[key][1])
        i += 1
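# Hedged continuation sketch (assumption): numpy is imported above but unused in the
# fragment, so one plausible next step is a Pearson correlation between the retained
# similarity and agreement scores. Illustrative only.
import numpy

def top_half_correlation(sim_a_top, agr_a_top):
    sim_top = numpy.array(sim_a_top, dtype=float)
    agr_top = numpy.array(agr_a_top, dtype=float)
    return numpy.corrcoef(sim_top, agr_top)[0][1]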
import lib.utility as u
import config as c


def count_occurrences(f, words):
    """ Counts occurrences of each word in the given file
    and returns dictionary """
    counts = {}
    with open(f) as file:
        for line in file:
            for w in words:
                if w in line:
                    try:
                        counts[w] += 1
                    except:
                        counts[w] = 1
    return counts


if __name__ == "__main__":
    words = u.get_lines(c.output_dir + "frequent-words.txt")

    # Get count in positive sentences, and negative sentences, to use to compute LLR
    pos_counts = count_occurrences("imdb-all-sentences-pos.txt", words)
    neg_counts = count_occurrences("imdb-all-sentences-neg.txt", words)

    # Compute log likelihood ratio for each word and store in dictionary
    llrs = {}
    for w in words:
        # Get each count from dicts
        pos = pos_counts[w]
        neg = neg_counts[w]
        # Compute each conditional prob, then take log as score
        # P(t = +1 | w)
        p1 = pos / float(pos + neg)
        # P(t = -1 | w)
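# Hedged sketch (assumption): the fragment above stops after P(t = +1 | w); a standard
# way to finish a log likelihood ratio score is log(P(t = +1 | w) / P(t = -1 | w)).
# This is illustrative, not the original continuation.
import math

def llr_score(pos, neg):
    p1 = pos / float(pos + neg)  # P(t = +1 | w)
    p2 = neg / float(pos + neg)  # P(t = -1 | w)
    return math.log(p1 / p2)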
# Computes likelihoods of targets based on occurrences in corpus.
import lib.utility as u

if __name__ == "__main__":
    corpus_pos = u.get_lines("../data/imdb-sentences-pos.txt")
    corpus_neg = u.get_lines("../data/imdb-sentences-neg.txt")
    corpus = corpus_pos + corpus_neg

    # get total count of all words
    total_count = 0
    counts = {}
    for line in corpus:
        for word in set(line.strip().split()):
            if word.isalpha():
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1
                total_count += 1

    targets = u.get_target_words()
    likelihoods = {}
    for t in targets:
        likelihoods[t] = counts[t] / float(total_count)

    #for key in sorted(likelihoods, key=likelihoods.get, reverse=False):
    #    print key, likelihoods[key]
import lib.utility as u
import config as c
import random
import itertools

if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()

    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir + "sentences-without.txt"))

    # Get all sentences used already - using a dict (so we only care about sentences used for specific word)
    used_sentences = {}
    for word in target_words:
        with open(c.train_data_dir + word + ".txt") as file:
            sents = set()
            for line in file:
                sents.add(line[3:].strip())
            used_sentences[word] = sents

    # Get sentences with that we haven't used, and sentences without that we haven't used, using set difference
    #sentences_with_available = list(sentences_with.difference(used_sentences))
    sentences_without_available = list(sentences_without.difference(used_sentences))
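# Hedged sketch (assumption): with the available pools built, a typical next step is
# to draw fresh training sentences per word, e.g. with random.sample. The per-word
# filter and 'n' below are illustrative placeholders, not the original logic.
import random

def sample_unused(pool, word, used, n):
    available = [s for s in pool if word in s.split() and s not in used]
    return random.sample(available, min(n, len(available)))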
# Take in pairs of targets and their cosine scores and select the highest N pairs
# to then compute agreement in normal way
import lib.utility as u
import config as c

if __name__ == "__main__":
    # Read in all pairs with cosine scores
    pairs = u.get_lines(c.output_dir + "edges-cosine.txt")

    # Iterate each pair and add to dictionary, where key is tuple and
    # value is score - then we can sort by value
    pairs_dict = {}
    for p in pairs:
        p = p.split()
        t1 = p[0]
        t2 = p[1]
        score = float(p[2])
        # check reverse isn't in dict already before inserting
        if (t2, t1) not in pairs_dict:
            pairs_dict[(t1, t2)] = score

    #total = len(pairs_dict)
    #print "total number of pairs =", total

    # Set N (number of pairs to select)
    N = 5000
    print "chosen number of pairs =", N

    # Iterate through sorted pairs, and take top N
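# Hedged sketch (assumption): the top-N selection referenced above can be done by
# sorting pairs_dict by value, mirroring the sort-by-value pattern used elsewhere in
# these scripts. Illustrative only.
def top_n_pairs(pairs_dict, n):
    top = []
    for i, key in enumerate(sorted(pairs_dict, key=pairs_dict.get, reverse=True)):
        if i >= n:
            break
        top.append(key)
    return top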
# Get test data for sample of target words to evaluate accuracy.
# Similar to get-data-train.py, but more to do here as we have to ensure not to select those already in train
# set, meaning there are fewer potential sentences.
import lib.utility as u
import config as c
import random

if __name__ == "__main__":
    # Hard coded list of words to test with
    target_words = ['wonderful', 'love', 'excellent', 'great', 'classic',
                    'terrible', 'boring', 'worst', 'stupid', 'crap']

    # Store all sentences in memory
    sentences_with = u.get_lines(c.output_dir + "sentences-with.txt")
    sentences_without = u.get_lines(c.output_dir + "sentences-without.txt")
    all_sentences = sentences_with + sentences_without

    # For each target word, find sample sentences (pos and neg)
    # N is number of pos/neg instances to select
    N = 500
    i = 0
    for word in target_words:
        #print "for:", word
        # Shuffle data first
        random.shuffle(sentences_with)
        random.shuffle(all_sentences)
        # Get train set in an array to check we don't get a sentence we already have
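# Hedged sketch (assumption): one way to load the word's train set, mirroring the
# pattern used in the training-data scripts (the line[3:] slice follows those
# scripts), so that test sentences already used for training can be excluded.
import config as c

def load_train_sents(word):
    train_sents = set()
    with open(c.train_data_dir + word + ".txt") as file:
        for line in file:
            train_sents.add(line[3:].strip())
    return train_sents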
# Computes frequency of features (from feature space) in Large Movie Review dataset.
import lib.utility as u
import math

if __name__ == "__main__":
    corpus_neg = u.get_lines("imdb-all-sentences-neg.txt")
    corpus_pos = u.get_lines("imdb-all-sentences-pos.txt")
    corpus = corpus_neg + corpus_pos

    # get total count of all docs
    total_docs = len(corpus)
    print "total docs:", total_docs

    # get features from feat-space file
    features = {}
    with open("output-700/feat-space.txt") as file:
        line = file.readline()
        for term in line.strip().split():
            features[term] = 0

    # go through corpus and count in how many docs each term appears
    for line in corpus:
        # we need to remove all the labels and ":1" after each term on each line
        line = set(line.split()[1:])
        # go through each line as a set - so no duplicates
        for term in line:
            term = term.split(":")[0]
            try:
                features[term] += 1
            except:
                continue
# Computes idf of features to use for feature values for sentiment classifier dataset.
import lib.utility as u
import math

if __name__ == "__main__":
    corpus = u.get_lines("train-sample.txt")

    # get total count of all docs
    total_docs = len(corpus)

    # get features from feat-space file
    features = {}
    with open("feat-space-sent-prefix.txt") as file:
        line = file.readline()
        for term in line.strip().split():
            features[term] = 0

    # go through corpus and count in how many docs each term appears
    for line in corpus:
        # we need to remove all the labels and ":1" after each term on each line
        line = set(line.split()[1:])
        # go through each line as a set - so no duplicates
        for term in line:
            term = term.split(":")[0]
            try:
                features[term] += 1
            except:
                #print term, "not in feature space"
                continue
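# Hedged sketch (assumption): with document frequencies counted, the usual idf value
# for each feature is log(total_docs / df). The +1 smoothing guard here is an
# illustrative choice, not necessarily the original one.
import math

def compute_idfs(features, total_docs):
    idfs = {}
    for term, df in features.items():
        idfs[term] = math.log(total_docs / float(df + 1))
    return idfs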