Ejemplo n.º 1
0
def output(partId, ch_aux):
    """Run the student's IR system for one graded part and serialize the result.

    A fresh ``IRSystem`` is built over ``../data/RiderHaggard`` (data read,
    index built, tf-idf computed) and then queried according to ``partId``:

    * 1 / 2  -- inverted index: ``get_posting_unstemmed`` per query
    * 3 / 4  -- boolean retrieval: ``query_retrieve`` per query
    * 5 / 6  -- phrase retrieval: ``phrase_query_retrieve`` per query
    * 7 / 8  -- tf-idf: ``get_tfidf_unstemmed(word, docID)`` per pair
    * 9 / 10 -- cosine similarity: first entry of ``query_rank`` per query

    Odd ids are the dev parts and even ids the test parts. While a test part
    runs its queries, stdout is redirected to ``os.devnull``; the original
    stream is always restored afterwards, even if the student code raises.

    Args:
        partId: Integer part identifier (1-10).
        ch_aux: Query string. Items are separated by ``", "`` for parts 1-4
            and 9-10, by ``","`` for parts 5-6, and by ``"; "`` for parts
            7-8, where every item has the form ``"word, docID"``.

    Returns:
        ``str()`` of the list ``[partId, version, result, ...]``, or ``None``
        (after printing a message) when ``partId`` is not recognized.
    """
    version = 1
    # The part id and version lead the answer list so they end up in the
    # serialized string as well.
    output = [partId, version]

    irsys = IRSystem()
    irsys.read_data('../data/RiderHaggard')
    irsys.index()
    irsys.compute_tfidf()

    # Test parts run silently: anything printed while querying is discarded.
    silenced = partId in (2, 4, 6, 8, 10)
    saved_stdout = sys.stdout
    devnull = open(os.devnull, 'w') if silenced else None
    if silenced:
        sys.stdout = devnull

    try:
        if partId in (1, 2):
            # Inverted Index. 1 ==> dev; 2 ==> test
            for query in ch_aux.split(", "):
                posting = irsys.get_posting_unstemmed(query)
                output.append(list(posting))
        elif partId in (3, 4):
            # Boolean Retrieval. 3 ==> dev; 4 ==> test
            for query in ch_aux.split(", "):
                output.append(list(irsys.query_retrieve(query)))
        elif partId in (5, 6):
            # Phrase Query Retrieval. 5 ==> dev; 6 ==> test
            queries = ch_aux.split(",")
            # Parenthesized single-argument print emits the same text on
            # Python 2 and is valid syntax on Python 3.
            print(queries)
            for query in queries:
                output.append(list(irsys.phrase_query_retrieve(query)))
        elif partId in (7, 8):
            # TF-IDF. 7 ==> dev; 8 ==> test
            for query in ch_aux.split("; "):
                word, docID = query.split(", ")
                output.append(irsys.get_tfidf_unstemmed(word, int(docID)))
        elif partId in (9, 10):
            # Cosine Similarity. 9 ==> dev; 10 ==> test
            for query in ch_aux.split(", "):
                results = irsys.query_rank(query)
                # Only the first entry of the ranking is reported.
                output.append([results[0][0], results[0][1]])
        else:
            print("Unknown partId: %d" % partId)
            return None
    finally:
        # Undo the redirection and release the devnull handle no matter how
        # the query phase ended.
        if silenced:
            sys.stdout = saved_stdout
            devnull.close()

    return str(output)
Ejemplo n.º 2
0
  def output(self, partId, ch_aux):
      """Run the student's IR system for one graded part and serialize the result.

      A fresh ``IRSystem`` is built over ``../data/RiderHaggard`` (data read,
      index built, tf-idf computed) and then queried according to ``partId``:

      * 1 / 2  -- inverted index: ``get_posting_unstemmed`` per query
      * 3 / 4  -- boolean retrieval: ``query_retrieve`` per query
      * 5 / 6  -- phrase retrieval: ``phrase_query_retrieve`` per query
      * 7 / 8  -- tf-idf: ``get_tfidf_unstemmed(word, docID)`` per pair
      * 9 / 10 -- cosine similarity: first entry of ``query_rank`` per query

      Odd ids are the dev parts and even ids the test parts. While a test
      part runs its queries, stdout is redirected to ``os.devnull``; the
      original stream is always restored afterwards, even if the student
      code raises.

      Args:
          partId: Integer part identifier (1-10).
          ch_aux: Query string. Items are separated by ``", "`` for parts
              1-4 and 9-10, by ``","`` for parts 5-6, and by ``"; "`` for
              parts 7-8, where every item has the form ``"word, docID"``.

      Returns:
          ``str()`` of the list ``[partId, version, result, ...]``, or
          ``None`` (after printing a message) when ``partId`` is not
          recognized.
      """
      version = 1
      # The part id and version lead the answer list so they end up in the
      # serialized string as well.
      output = [partId, version]

      irsys = IRSystem()
      irsys.read_data('../data/RiderHaggard')
      irsys.index()
      irsys.compute_tfidf()

      # Test parts run silently: anything printed while querying is discarded.
      silenced = partId in (2, 4, 6, 8, 10)
      saved_stdout = sys.stdout
      devnull = open(os.devnull, 'w') if silenced else None
      if silenced:
          sys.stdout = devnull

      try:
          if partId in (1, 2):
              # Inverted Index. 1 ==> dev; 2 ==> test
              for query in ch_aux.split(", "):
                  posting = irsys.get_posting_unstemmed(query)
                  output.append(list(posting))
          elif partId in (3, 4):
              # Boolean Retrieval. 3 ==> dev; 4 ==> test
              for query in ch_aux.split(", "):
                  output.append(list(irsys.query_retrieve(query)))
          elif partId in (5, 6):
              # Phrase Query Retrieval. 5 ==> dev; 6 ==> test
              queries = ch_aux.split(",")
              # Parenthesized single-argument print emits the same text on
              # Python 2 and is valid syntax on Python 3.
              print(queries)
              for query in queries:
                  output.append(list(irsys.phrase_query_retrieve(query)))
          elif partId in (7, 8):
              # TF-IDF. 7 ==> dev; 8 ==> test
              for query in ch_aux.split("; "):
                  word, docID = query.split(", ")
                  output.append(irsys.get_tfidf_unstemmed(word, int(docID)))
          elif partId in (9, 10):
              # Cosine Similarity. 9 ==> dev; 10 ==> test
              for query in ch_aux.split(", "):
                  results = irsys.query_rank(query)
                  # Only the first entry of the ranking is reported.
                  output.append([results[0][0], results[0][1]])
          else:
              print("Unknown partId: %d" % partId)
              return None
      finally:
          # Undo the redirection and release the devnull handle no matter how
          # the query phase ended.
          if silenced:
              sys.stdout = saved_stdout
              devnull.close()

      return str(output)
Ejemplo n.º 3
0
if __name__ == '__main__':
    # Script entry point: repeat the experiments below once per label file
    # (each file name is resolved relative to data/).
    for labelfile in ("labels.hcnorms_misgoodfortune.all.txt",
                      "labels.hcnorms_char.all.txt",
                      "dream_acts.txt", 'dream_sets.txt', "all_labels.txt"):
        # First we'll do a regular IR experiment with BM25
        # Load the documents keyed by id, load the labels, and pair them up
        # into parallel arrays y (labels) and X (documents).
        documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
        labels = list(read_labels("data/" + labelfile))
        y, X = zip(*match_labels_documents(documents, labels))
        y, X = np.array(y), np.array(X)
        # 10 shuffled folds with a fixed seed; one score slot per fold.
        # NOTE(review): KFold(n, n_folds=...) looks like the legacy
        # scikit-learn cross_validation signature -- confirm against imports.
        kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1)
        rank_scores = np.zeros(10)
        for i, (train, test) in enumerate(kf):
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            # Labels occurring in this fold's training split (count >= 1
            # keeps every label that appears at all).
            labels = Counter(flatten(list(y_train)))
            labels = [label for label, count in labels.items() if count >= 1]
            model = IRSystem(k1=1.2, b=0.75, cutoff=0)
            model.fit_raw(X_train, y_train, ngram_range=(1, 1), stop_words='english', min_df=2)
            ranking = model.rank_labels(X_test, raw=True)
            ranking = ranking.tolist()
            # Flatten each test document's ranking and drop repeated labels
            # while keeping first-seen order.
            ranking = map(lambda r: list(unique_everseen(r)), map(flatten, ranking))
            # Evaluate only test documents that have at least one label seen
            # in the training split.
            ranking, y_test = zip(*[(r, y_) for r, y_ in zip(ranking, y_test) if any(l in labels for l in y_)])
            rank_scores[i] = mean_average_precision(ranking, y_test)
        # Report mean and standard deviation of the per-fold scores.
        print 'IR: (%s)' % (labelfile), rank_scores.mean(), rank_scores.std()

        # Next, we'll do an IR experiment with Big Documents
        # Same data loading and fold setup as the first experiment.
        documents = {doc_id: text for doc_id, text in read_dreams("data/dreambank.en.stanford.out")}
        labels = list(read_labels("data/" + labelfile))
        y, X = zip(*match_labels_documents(documents, labels))
        y, X = np.array(y), np.array(X)
        kf = KFold(len(y), n_folds=10, shuffle=True, random_state=1)
        rank_scores = np.zeros(10)