def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        partId: Part number 1-10. Each odd/even pair selects one task
            (odd ==> dev, even ==> test): 1/2 inverted index, 3/4 boolean
            retrieval, 5/6 phrase query retrieval, 7/8 TF-IDF, 9/10 cosine
            similarity.
        ch_aux: The queries for the part, packed into one string. Parts 1-4
            and 9-10 separate queries with ", ", parts 5-6 with "," and
            parts 7-8 with "; " (each TF-IDF query itself being
            "word, docID").

    Returns:
        str() of the list [partId, version, answer, ...] holding one answer
        per query, or None (after printing a message) when partId is not
        one of the known parts.
    """
    version = 1
    test_parts = (2, 4, 6, 8, 10)

    # Reject an unknown part before doing any work: building the index is
    # pointless when there is nothing to compute.  The single-argument
    # parenthesised print behaves the same under Python 2 and Python 3.
    if partId not in range(1, 11):
        print("Unknown partId: %d" % partId)
        return None

    irsys = IRSystem()
    irsys.read_data('../data/RiderHaggard')
    irsys.index()
    irsys.compute_tfidf()

    answers = [partId, version]

    # Test parts run with stdout silenced so nothing printed while the
    # queries run reaches the console.
    saved_stdout = sys.stdout
    devnull = None
    if partId in test_parts:
        devnull = open(os.devnull, 'w')
        sys.stdout = devnull

    try:
        if partId in (1, 2):
            # Inverted Index. 1 ==> dev; 2 ==> test
            for query in ch_aux.split(", "):
                answers.append(list(irsys.get_posting_unstemmed(query)))
        elif partId in (3, 4):
            # Boolean Retrieval. 3 ==> dev; 4 ==> test
            for query in ch_aux.split(", "):
                answers.append(list(irsys.query_retrieve(query)))
        elif partId in (5, 6):
            # Phrase Query Retrieval. 5 ==> dev; 6 ==> test
            queries = ch_aux.split(",")
            print(queries)
            for query in queries:
                answers.append(list(irsys.phrase_query_retrieve(query)))
        elif partId in (7, 8):
            # TF-IDF. 7 ==> dev; 8 ==> test
            for query in ch_aux.split("; "):
                word, docID = query.split(", ")
                answers.append(irsys.get_tfidf_unstemmed(word, int(docID)))
        else:
            # Cosine Similarity. 9 ==> dev; 10 ==> test
            for query in ch_aux.split(", "):
                ranked = irsys.query_rank(query)
                # Only the top-ranked entry's first two fields are reported.
                answers.append([ranked[0][0], ranked[0][1]])
    finally:
        # Always undo the redirection and release the handle, even when the
        # student code raises; otherwise the process would stay muted.
        if devnull is not None:
            sys.stdout = saved_stdout
            devnull.close()

    # put in the part ID as well (hacky): the whole list is sent as a string
    return str(answers)
def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Args:
        self: Unused by this function; kept so existing callers still work.
        partId: Part number 1-10. Each odd/even pair selects one task
            (odd ==> dev, even ==> test): 1/2 inverted index, 3/4 boolean
            retrieval, 5/6 phrase query retrieval, 7/8 TF-IDF, 9/10 cosine
            similarity.
        ch_aux: The queries for the part, packed into one string. Parts 1-4
            and 9-10 separate queries with ", ", parts 5-6 with "," and
            parts 7-8 with "; " (each TF-IDF query itself being
            "word, docID").

    Returns:
        str() of the list [partId, version, answer, ...] holding one answer
        per query, or None (after printing a message) when partId is not
        one of the known parts.
    """
    version = 1
    silenced_parts = (2, 4, 6, 8, 10)

    # Bail out on an unknown part before the expensive corpus load.  The
    # single-argument parenthesised print is valid on Python 2 and 3 alike.
    if partId not in range(1, 11):
        print("Unknown partId: %d" % partId)
        return None

    irsys = IRSystem()
    irsys.read_data('../data/RiderHaggard')
    irsys.index()
    irsys.compute_tfidf()

    collected = [partId, version]

    # For the test parts stdout is swapped for os.devnull while the queries
    # run, so nothing printed in the meantime is shown.
    original_stdout = sys.stdout
    sink = None
    if partId in silenced_parts:
        sink = open(os.devnull, 'w')
        sys.stdout = sink

    try:
        if partId in (1, 2):
            # Inverted Index. 1 ==> dev; 2 ==> test
            for query in ch_aux.split(", "):
                collected.append(list(irsys.get_posting_unstemmed(query)))
        elif partId in (3, 4):
            # Boolean Retrieval. 3 ==> dev; 4 ==> test
            for query in ch_aux.split(", "):
                collected.append(list(irsys.query_retrieve(query)))
        elif partId in (5, 6):
            # Phrase Query Retrieval. 5 ==> dev; 6 ==> test
            queries = ch_aux.split(",")
            print(queries)
            for query in queries:
                collected.append(list(irsys.phrase_query_retrieve(query)))
        elif partId in (7, 8):
            # TF-IDF. 7 ==> dev; 8 ==> test
            for query in ch_aux.split("; "):
                word, docID = query.split(", ")
                collected.append(irsys.get_tfidf_unstemmed(word, int(docID)))
        else:
            # Cosine Similarity. 9 ==> dev; 10 ==> test
            for query in ch_aux.split(", "):
                ranking = irsys.query_rank(query)
                # Report only the first two fields of the top-ranked entry.
                collected.append([ranking[0][0], ranking[0][1]])
    finally:
        # Restore stdout and close the devnull handle no matter how the
        # query loop ended, so an exception cannot leave output muted.
        if sink is not None:
            sys.stdout = original_stdout
            sink.close()

    # put in the part ID as well (hacky): the whole list is sent as a string
    return str(collected)
if __name__ == '__main__': for labelfile in ("labels.hcnorms_misgoodfortune.all.txt", "labels.hcnorms_char.all.txt", "dream_acts.txt", 'dream_sets.txt', "all_labels.txt"): # First we'll do a regular IR experiment with BM25 documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")} labels = list(read_labels("data/" + labelfile)) y, X = zip(*match_labels_documents(documents, labels)) y, X = np.array(y), np.array(X) kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1) rank_scores = np.zeros(10) for i, (train, test) in enumerate(kf): X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] labels = Counter(flatten(list(y_train))) labels = [label for label, count in labels.items() if count >= 1] model = IRSystem(k1=1.2, b=0.75, cutoff=0) model.fit_raw(X_train, y_train, ngram_range=(1, 1), stop_words='english', min_df=2) ranking = model.rank_labels(X_test, raw=True) ranking = ranking.tolist() ranking = map(lambda r: list(unique_everseen(r)), map(flatten, ranking)) ranking, y_test = zip(*[(r, y_) for r, y_ in zip(ranking, y_test) if any(l in labels for l in y_)]) rank_scores[i] = mean_average_precision(ranking, y_test) print 'IR: (%s)' % (labelfile), rank_scores.mean(), rank_scores.std() # Next, we'll do an IR experiment with Big Documents documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")} labels = list(read_labels("data/" + labelfile)) y, X = zip(*match_labels_documents(documents, labels)) y, X = np.array(y), np.array(X) kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1) rank_scores = np.zeros(10)