Example #1
    def testSource(self):
        corpus1 = annotationIo.load("data/corpusWithPathsSmall.v0.yaml")
        annotation1 = corpus1[0]
        esdc1 = annotation1.esdcs[0]
        annotation1.setSource(esdc1, "person 1")
        self.assertEqual(annotation1.getSource(esdc1), "person 1")

        annotationIo.save(corpus1, "data/corpusWithPathsSmall.v1.yaml")

        corpus2 = annotationIo.load("data/corpusWithPathsSmall.v1.yaml")
        annotation2 = corpus2[0]
        esdc2 = annotation2.esdcs[0]
        self.assertEqual(annotation1.getSource(esdc1),
                         annotation2.getSource(esdc2))
Example #2
def main():
    """
    Splits a corpus yaml file into multiple smaller files, for faster
    annotation saving and loading.
    """
    import os
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--corpus_fname", dest="corpus_fname")
    parser.add_option("--page_size", dest="page_size", type="int")
    parser.add_option("--result_dir", dest="result_dir", metavar="FILE")
    (options, args) = parser.parse_args()

    corpus = annotationIo.load(options.corpus_fname)

    pages = []
    current_page = []
    for annotation in corpus:
        current_page.append(annotation)
        if len(current_page) >= options.page_size:
            pages.append(current_page)
            current_page = []
    if len(current_page) != 0:
        pages.append(current_page)
    if not os.path.exists(options.result_dir):
        os.makedirs(options.result_dir)
    basename = os.path.basename(options.corpus_fname)
    name = basename[0:-5]
    extension = basename[-5:]
    assert extension == ".yaml"
    for page_i, page in enumerate(pages):
        fname = "%s/%s.page_%d.yaml" % (options.result_dir, name, page_i)
        annotationIo.save(page, fname)
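For completeness, the page files written above can be stitched back together in page order. A minimal sketch, assuming the "<name>.page_<i>.yaml" naming used by main() and the annotationIo.Corpus constructor seen in Example #8:

import glob
import os

def reassemble(result_dir, name):
    # Collect the page files and sort them by their numeric page index,
    # mirroring the "<name>.page_<i>.yaml" naming scheme above.
    fnames = glob.glob(os.path.join(result_dir, "%s.page_*.yaml" % name))
    fnames.sort(key=lambda f: int(f.rsplit("page_", 1)[1].split(".")[0]))
    annotations = []
    for fname in fnames:
        annotations.extend(annotationIo.load(fname))
    return annotationIo.Corpus(annotations)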
Example #3
    def testGroundings(self):
        corpus = annotationIo.load(SOURCE_FILE)
        annotation = corpus[0]

        esdc = annotation.flattenedEsdcs[0]

        annotation.addGrounding(
            esdc,
            PhysicalObject(
                Prism.from_points_xy(tp([(0, 0), (1, 0), (1, 1), (0, 1)]), 3,
                                     4), ["tire", "pallet"]))

        annotation.addGrounding(
            esdc,
            Place(
                Prism.from_points_xy(tp([(0, 0), (1, 0), (1, 1), (0, 1)]), 3,
                                     4)))

        annotation.addGrounding(
            esdc,
            Path.from_xyztheta(timestamps=[0, 1],
                               points_xyztheta=pts_to_xyzTheta([(0, 0),
                                                                (1, 1)])))

        yamlCorpus = annotationIo.toYaml(corpus)

        print "yaml", yamlCorpus
        newCorpus = annotationIo.fromYaml(yamlCorpus)

        esdc1 = corpus[0].flattenedEsdcs[0]
        esdc2 = newCorpus[0].flattenedEsdcs[0]
        null_ids(esdc1)
        null_ids(esdc2)
        self.assertEqual(esdc1, esdc2)
Example #4
    def load(self, fname=None):
        if fname is None:
            fname = QFileDialog.getOpenFileName(self, "Open File", self.fname)
        if fname != "":
            self.fname = fname
            annotations = annotationIo.load(self.fname)

            self.loadAnnotations(annotations)
Example #5
def main():
    import os

    trainer = Trainer()
    #dirName = "%s/tools/forklift/dataAnnotation/data"  % os.environ["SLU_HOME"]
    #positiveFname = dirName + "/forkliftMturkEsdcs.stefie10.yaml"

    fname = ("%s/tools/forklift/dataAnnotation/data/"
             "forkliftMturkEsdcs.stefie10.groundings.yaml" %
             os.environ["SLU_HOME"])

    corpus = annotationIo.load(fname)
    trainer.trainAndPlot(corpus)
Example #6
def evaluate_objects(model, corpus_fname, state_type):
    corpus = annotationIo.load(corpus_fname)
    state_cls = state_type_from_name(state_type)
    from g3.inference import nodeSearch
    taskPlanner = nodeSearch.BeamSearch(model)
    predictions = []
    done = False
    phrases = set()
    for i, annotation in enumerate(corpus):
        start_state = state_cls.from_context(annotation.context)

        for esdc in annotation.esdcs:
            #if esdc.text != "the pallet of boxes":
            #    continue
            #if esdc.text in phrases:
            #    continue
            isCorrect = annotation.isGroundingCorrect(esdc)
            if isCorrect is not None:
                ggg = ggg_from_esdc(esdc)
                groundings = annotation.getGroundings(esdc)
                assert len(groundings) == 1
                grounding = groundings[0]
                #if "generator" not in grounding.tags:
                #    continue
                prob = evaluate_ggg(ggg, grounding, start_state, taskPlanner)
                predicted_class = prob > 0.7
                predictions.append((predicted_class, isCorrect))
                #print "Query: Is object", " ".join(grounding.tags),
                #print "'" + esdc.text + "'?"
                #print "System: ",
                #if predicted_class:
                #    print "Yes."
                #else:
                #    print "No."
                #done = True
                phrases.add(esdc.text)
            if done:
                break
        if done:
            break

    tp = len([(p, l) for p, l in predictions if p and p == l])
    fp = len([(p, l) for p, l in predictions if p and p != l])
    tn = len([(p, l) for p, l in predictions if not p and p == l])
    fn = len([(p, l) for p, l in predictions if not p and p != l])
    cm = ConfusionMatrix(tp, fp, tn, fn)
    #cm.print_all()

    #if len(phrases) > 20:
    #    phrases = random.sample(phrases, 20)
    #for phrase in sorted(phrases):
    #    print phrase
    return cm
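The ConfusionMatrix above is built from (predicted, actual) pairs, and the usual summary metrics follow directly from the four counts. A minimal sketch (this helper is illustrative, not a ConfusionMatrix method):

def summarize(tp, fp, tn, fn):
    # Precision, recall, and accuracy, with guards against empty denominators.
    total = tp + fp + tn + fn
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    accuracy = float(tp + tn) / total if total else 0.0
    return precision, recall, accuracy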
Example #7
    def testDuplicateEsdcs(self):
        corpus = annotationIo.load(SOURCE_FILE)
        annotation = corpus[-1]
        self.assertEqual(len(annotation.esdcs), 3)
        for i, esdc in enumerate(annotation.esdcs):
            groundings = annotation.getGroundings(esdc)
            self.assertEqual(len(groundings), 1)
            grounding = groundings[0]
            print "id", esdc.id
            self.assertEqual(grounding.tags, ("trailer%d" % (i + 1), ))
Example #8
def main():

    corpus = annotationIo.load(
        "%s/tools/forklift/dataAnnotation/data/negativeEsdcs.yaml" % SLU_HOME)

    assignment_id_to_count = collections.defaultdict(lambda: 0)

    new_corpus = []
    for a in corpus:
        assignment_id_to_count[a.id] += 1
        if assignment_id_to_count[a.id] <= 2:
            new_corpus.append(a)
    annotationIo.save(annotationIo.Corpus(new_corpus), "test.yaml")
Example #9
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--cost-function-class", dest="cost_function_class")
    parser.add_option("--model", dest="model_fname")
    parser.add_option("--dataset", dest="dataset_fname", default=None)
    parser.add_option("--corpus-fname", dest="corpus_fnames")
    parser.add_option("--runid", dest="run_id", type="int", default=-1)
    parser.add_option("--userrt", dest="use_rrt", action="store_true", default=False)
    parser.add_option("--esdcmulti", dest="esdc_multi", type="string", default="False")
    parser.add_option("--state-type",dest="state_type",
                      help="State or Agent Type", metavar="FILE")


    parser.add_option("--esdc-extractor", type="string", dest="esdc_extractor")
    parser.add_option("--esdc-extractor-model", type="string",
                      dest="esdc_extractor_model")
    parser.add_option("-m", "--merging", dest="use_merging",
                      help="Use merging?", metavar="FILE")

    (options, args) = parser.parse_args()

    cost_function_cls = make_cost_function_class(options.cost_function_class)
    model = cost_function_cls(options.model_fname)
    if options.dataset_fname is not None:
        dataset = cPickle.load(open(options.dataset_fname))
    else:
        dataset = None

    runId = options.run_id
    multiEsdcs = eval(options.esdc_multi)
    use_merging = eval(options.use_merging)
    
    #########semantic map for d8##########
#    smap = CarmenSemanticMap("%s/data/directions/direction_floor_8_full/direction_floor_8_full.cmf.gz" % SLU_HOME,
#                             "%s/data/directions/direction_floor_8_full/tags/df8_full_tags.tag" % SLU_HOME,
#                             "%s/data/directions/direction_floor_8_full/partitions/d8_full_part.pck" % SLU_HOME)
    #state_cls = state_type_from_name("forklift")
    state_cls = state_type_from_name(options.state_type)

    corpus = annotationIo.load(options.corpus_fnames)
    print "corpus is", corpus

    extractor_func = extractor_utils.make_extractor_func(options.esdc_extractor,
                                                         options.esdc_extractor_model)
    print "func", extractor_func
    print "multi", multiEsdcs
    evaluateDataSet(model, corpus, dataset, extractor_func, state_cls,
                    useRrt=options.use_rrt, runId=runId, 
                    runMultiEsdcCommands=multiEsdcs, use_merging=use_merging)
Example #10
def main():

    corpus = annotationIo.load(
        "dataAnnotation/data/forkliftMturkEsdcs.stefie10.groundings.yaml")
    for annotation in corpus:
        esdcs = annotation.esdcs
        for esdc in esdcs.flattenedEsdcs:
            if (not esdc.childIsEmpty("r") and esdc.type == "OBJECT"
                    or esdc.type == "PLACE"):
                rwords = [str(e.text) for e in esdc.r]
                if (len(rwords) > 1 and "right" not in rwords
                        and "left" not in rwords and "front" not in rwords
                        and "next" not in rwords):
                    print esdc.text
Example #11
def main():
    corpus = annotationIo.load(
        "dataAnnotation/data/spatialRelations.stefie10.yaml")
    sr_to_num_positive = collections.defaultdict(lambda: 0)
    sr_to_num_negative = collections.defaultdict(lambda: 0)
    srs = set()
    total = 0
    for annotation in corpus:
        path_esdc = annotation.esdcs[0]
        sr = path_esdc.r[0].text
        srs.add(sr)
        if annotation.isGroundingCorrect(path_esdc):
            sr_to_num_positive[sr] += 1
        else:
            sr_to_num_negative[sr] += 1
        total += 1

    for sr in sorted(srs):
        print sr, sr_to_num_positive[sr], sr_to_num_negative[sr]
    print "total", total
Example #12
    def testReader(self):

        corpus = annotationIo.load(SOURCE_FILE)

        assignmentIds = set()
        for annotation in corpus:
            assignmentIds.add(annotation.assignmentId)

        self.assertEqual(len(assignmentIds), len(corpus))

        yamlCorpus = annotationIo.toYaml(corpus)
        newCorpus = annotationIo.fromYaml(yamlCorpus)

        esdc1 = corpus[0].esdcs[-1]
        esdc2 = newCorpus[0].esdcs[-1]

        null_ids(esdc1)
        null_ids(esdc2)

        self.assertEqual(esdc1, esdc2)

        self.assertEqual(corpus[0].esdcToGroundings[esdc1],
                         newCorpus[0].esdcToGroundings[esdc2])

        self.assertEqual(corpus[0], newCorpus[0])
        for a1, a2 in zip(corpus, newCorpus):
            print "***********"
            print "a1"
            print [str(e) for e in a1.esdcs]
            print "a2"
            print [str(e) for e in a2.esdcs]

            for esdc1, esdc2 in zip(a1.flattenedEsdcs, a2.flattenedEsdcs):
                self.assertEqual(esdc1.text, esdc2.text)
                groundings1 = a1.getGroundings(esdc1)
                groundings2 = a2.getGroundings(esdc2)
                self.assertEqual(len(groundings1), len(groundings2))
                for g1, g2 in zip(groundings1, groundings2):
                    self.assertEqual(g1, g2)
Example #13
def main(argv):
    app = basewindow.makeApp()

    from optparse import OptionParser
    parser = OptionParser()

    parser.add_option("--training_filename",
                      dest="training_fname",
                      help="Training Filename",
                      metavar="FILE")
    (options, args) = parser.parse_args()

    annotations = annotationIo.load(options.training_fname)
    annotation = annotations[0]
    state, esdc_to_ggg = annotation_to_ggg_map(annotation)
    ggg = esdc_to_ggg[annotation.esdcs[0]]

    wnd = MainWindow()
    wnd.show()

    wnd.load(ggg)
    app.exec_()
Example #14
def main():

    corpus = annotationIo.load(
        "dataAnnotation/data/forkliftMturkEsdcs.stefie10.groundings.yaml")

    wordHist = Histogram()
    fieldToHist = {}

    count = 0
    for annotation in corpus:
        for token in annotation.entireText.split():
            if token not in stopwords:
                wordHist.add(token.lower())
        esdcs = annotation.esdcs
        for esdc in esdcs:
            for fieldName in esdc.fieldNames:
                fieldToHist.setdefault(fieldName, Histogram())
                for word in esdc.childTokens(fieldName):
                    text = word.text
                    if text not in stopwords:
                        fieldToHist[fieldName].add(text.lower())
        count += 1
        if count >= 10:
            #break
            pass

    graphStacked({"words": wordHist}, "histogram", "Words", maxCols=10)

    for key in ExtendedSdc.fieldNames:
        if len(fieldToHist[key].bins) != 0:
            graphStacked(
                {key: fieldToHist[key]},
                "histogram",
                ExtendedSdc.fieldNamesToDescriptions[key].capitalize(),
                maxCols=10)

    mpl.show()
Example #15
def main():
    import sys
    fname = sys.argv[1]
    assignments = readCorpus.Corpus(
        "dataCollection/data/corpusCommandsForVideoSmallFilesOnly/")
    corpus = annotationIo.load(fname)

    word_cnt = 0
    workers = set()
    scenarios = set()

    for annotation in corpus:

        assignment = assignments.assignmentForId(annotation.assignmentId)
        word_cnt += len(annotation.entireText.split())
        workers.add(assignment.workerId)
        scenarios.add(assignment.scenario)
        if assignment.scenario.name == "put_tire_pallet_on_loaded_truck":
            print "command", assignment.scenario.name, annotation.entireText

    print len(scenarios), "scenarios"
    print len(workers), "annotators"
    print word_cnt, "words"
    print len(corpus), "commands"
Example #16
def main():
    import os

    extractor = Extractor()
    oldCorpus = annotationIo.load(
        "%s/dataAnnotation/data/forkliftMturkEsdcs.stefie10.groundings.withPaths.yaml"
        % os.environ['FORKLIFT_HOME'])
    annotations = []
    for i, a in enumerate(oldCorpus):
#        if i != 140:
#            continue
        print "doing", i, a.entireText
        automatic_esdcs_groups = extractor.extractTopNEsdcs(a.entireText, n=10)

        
        for automatic_esdc_group in automatic_esdcs_groups:
            annotation = annotationIo.Annotation(a.assignmentId,
                                                 automatic_esdc_group)
            
            for automatic_esdc in automatic_esdc_group.flattenedEsdcs:
                if automatic_esdc in a.flattenedEsdcs:
                    annotation.setGroundingIsCorrect(automatic_esdc, True)
                else:
                    annotation.setGroundingIsCorrect(automatic_esdc, False)

            annotations.append(annotation)
    annotationIo.save(annotationIo.Corpus(annotations), "negativeEsdcs.yaml")
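A quick way to sanity-check the label balance of the corpus written above, using only annotationIo.load and isGroundingCorrect as seen in the other examples (a sketch, not part of the original script):

def count_labels(fname="negativeEsdcs.yaml"):
    corpus = annotationIo.load(fname)
    n_pos = n_neg = 0
    for annotation in corpus:
        for esdc in annotation.flattenedEsdcs:
            # setGroundingIsCorrect above stored True/False for every ESDC.
            if annotation.isGroundingCorrect(esdc):
                n_pos += 1
            else:
                n_neg += 1
    print "positive", n_pos, "negative", n_neg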
Example #17
    def testLoadSave(self):
        corpus1 = annotationIo.load("data/corpusWithPathsSmall.v0.yaml")
        annotationIo.save(corpus1, "data/corpusWithPathsSmall.v1.yaml")
        corpus2 = annotationIo.load("data/corpusWithPathsSmall.v1.yaml")

        self.assertEqual(len(corpus1), len(corpus2))
Example #18
def evaluateParallel(options):
    cost_function_cls = make_cost_function_class(options.cost_function_class)
    model = cost_function_cls.from_mallet(options.model_fname,
                                          feature_extractor_cls=GGGFeatures,
                                          guiMode=False)
    if options.test_set_fname is not None:
        test_set = cPickle.load(open(options.test_set_fname))
    else:
        test_set = None

    runId = options.run_id
    multi_esdcs = eval(options.esdc_multi)
    use_merging = eval(options.use_merging)
    debug("use_merging: %s" % use_merging)
    force_sync = eval(options.force_sync)
    resolver_type = options.resolver_type

    # qa_corpus_fname_1 = "%s/tools/forklift/dataAnnotation/data/dialog_ambiguous_RSS_12/forklift_ambiguous_larger_corpus_all_questions.yaml" % SLU_HOME
    qa_corpus_fname_1 = options.qa_corpus_fname_1
    qa_corpus_fname_2 = options.qa_corpus_fname_2
    qa_corpora = []
    if qa_corpus_fname_1:
        qa_corpora.append(annotationIo.load(qa_corpus_fname_1))

    # qa_corpus_fname_2 = "%s/tools/forklift/dataAnnotation/data/dialog_ambiguous_RSS_12/forklift_ambiguous_larger_corpus_all_questions_set000.yaml" % SLU_HOME
    if qa_corpus_fname_2:
        qa_corpus_2 = annotationIo.load(qa_corpus_fname_2)
        qa_corpora.append(qa_corpus_2)

    if resolver_type == "bag_of_words":
        from coreference.bag_of_words_resolver import BagOfWordsResolver
        resolver = BagOfWordsResolver(
            "%s/tools/coreference/models/coref_1.5.pck" % SLU_HOME)
    elif resolver_type == "oracle":
        from coreference.oracle_resolver import OracleResolver
        resolver = OracleResolver(options.corpus_fname)
    else:
        resolver = None

    state_cls = state_type_from_name(options.state_type)
    corpus = annotationIo.load_all(options.corpus_fname)
    ########### UGLY HACK ##############
    # The corpus file is always a command set, so we set its ESDC sources to be
    # "command" in every case. This is necessary, because our commands_AAAI_11
    # datasets don't have their sources set at all.
    for annotation in corpus:
        for esdc in annotation.flattenedEsdcs:
            annotation.setSource(esdc, 'command')
    extractor_func = extractor_utils.make_extractor_func(
        options.esdc_extractor, options.esdc_extractor_model)
    info("num questions: %d" % options.num_questions)
    info("num answers: %d" % options.num_answers)
    evaluateDataSet(model,
                    corpus,
                    test_set,
                    extractor_func,
                    state_cls,
                    options.entropy_metric,
                    useRrt=options.use_rrt,
                    runId=runId,
                    multi_esdcs=multi_esdcs,
                    use_merging=use_merging,
                    resolver=resolver,
                    qa_corpora=qa_corpora,
                    num_questions=options.num_questions,
                    num_answers=options.num_answers,
                    force_sync=force_sync,
                    question_type=options.question_type,
                    random_seed=options.random_seed)
Example #19
def main():

    parser = OptionParser()

    parser.add_option("--outfile_training",
                      dest="training_fname",
                      help="Training Output Filename")
    parser.add_option("--outfile_test",
                      dest="testing_fname",
                      help="Test Output Filename")
    parser.add_option(
        "--infile_positive",
        dest="positive_fnames",
        action="append",
        default=[],
        help="Positive Filename; default to True if isGroundingCorrect is None"
    )
    parser.add_option(
        "--infile_negative",
        dest="negative_fnames",
        action="append",
        default=[],
        help="Negative Filename; default to False if isGroundingCorrect is None"
    )
    parser.add_option(
        "--infile_labeled",
        dest="labeled_fnames",
        action="append",
        default=[],
        help="Labeled examples; skip if isGroundingCorrect is None")

    parser.add_option("--infile_unlabeled",
                      dest="unlabeled_fnames",
                      action="append",
                      default=[],
                      help="unlabeld fnames")

    parser.add_option("--feature_extractor",
                      dest="feature_extractor",
                      help="Feature Extractor Class")

    parser.add_option("--split",
                      dest="split",
                      type="string",
                      help="'random' to split randomly; 'scenario' to split " +
                      "by scenario.")

    parser.add_option(
        "--training_examples",
        dest="training_examples",
        action="append",
        help="Examples that are in the training set; others go in the test "
        "set.  Can be passed more than once.")

    (options, args) = parser.parse_args()

    try:
        from g3.feature_extractor.esdc_features import EsdcFeatures
        from g3.feature_extractor.esdc_flattened_features import EsdcFlattenedFeatures
        from g3.feature_extractor.grounded_features import GGGFeatures
        from g3.feature_extractor.rl_features import RLFeatures
        from g3.feature_extractor.bolt_features import BoltFeatures
        from g3.feature_extractor.ikea_features import IkeaFeatures
        from g3.feature_extractor.sr_features import SrFeatures
        #feature_extractor = semantic_map.esdc_semantic_map2.esdc_semantic_map()
        feature_extractor_cls = eval(options.feature_extractor)
        feature_extractor = feature_extractor_cls()
    except:
        print "error doing", options.feature_extractor
        raise

    observations = list()

    for positive_fname in options.positive_fnames:
        corpus = annotationIo.load(positive_fname)
        new_examples = generate_examples(basename(positive_fname),
                                         corpus,
                                         feature_extractor,
                                         default_class_value=True)
        if len(new_examples) == 0:
            raise ValueError("No examples from" + ` positive_fname `)
        observations.extend(new_examples)

    for negative_fname in options.negative_fnames:
        corpus = annotationIo.load(negative_fname)
        new_examples = generate_examples(basename(negative_fname),
                                         corpus,
                                         feature_extractor,
                                         default_class_value=False)
        if len(new_examples) == 0:
            raise ValueError("No examples from" + ` negative_fname `)

        observations.extend(new_examples)

    for labeled_fname in options.labeled_fnames:
        corpus = annotationIo.load(labeled_fname, check=False)
        new_examples = generate_examples(basename(labeled_fname),
                                         corpus,
                                         feature_extractor,
                                         default_class_value=None)
        if len(new_examples) == 0:
            raise ValueError("No examples from" + ` labeled_fname `)
        observations.extend(new_examples)

    for unlabeled_fname in options.unlabeled_fnames:
        corpus = annotationIo.load(unlabeled_fname)
        new_examples = generate_examples(basename(unlabeled_fname),
                                         corpus,
                                         feature_extractor,
                                         default_class_value=None,
                                         force_default_class_value=True)
        if len(new_examples) == 0:
            raise ValueError("No examples from" + ` unlabeled_fname `)
        observations.extend(new_examples)

    if options.split == "scenario":
        mturkCorpus = readCorpus.Corpus(
            "%s/data/corpusCommandsForVideoSmallFilesOnly/" % SLU_HOME)
        scenario_names = list(
            set(
                mturkCorpus.assignmentForId(
                    obs.annotation.assignmentId.split("_")[0]).scenario.name
                for obs in observations))
        random.shuffle(scenario_names)

        n_training_scenarios = int(ceil(len(scenario_names) * 0.7))

        training_scenarios = scenario_names[:n_training_scenarios]
        testing_scenarios = scenario_names[n_training_scenarios:]

        training = [
            o for o in observations if mturkCorpus.assignmentForId(
                o.annotation.assignmentId.split("_")[0]).scenario.name in
            training_scenarios
        ]

        testing = [
            o for o in observations if mturkCorpus.assignmentForId(
                o.annotation.assignmentId.split("_")[0]).scenario.name in
            testing_scenarios
        ]
    elif options.split == "annotation":
        # Splits the examples, grouped by annotation.  If the spatial
        # relations corpus is included, that data goes in the training set
        # only.
        training = []
        testing = []
        sr_ids = []
        ids = []

        for o in observations:
            aid = o.annotation.id
            if ((aid not in ids) and ("sr_" not in aid)):
                ids.append(aid)
            elif "sr_" in aid:
                sr_ids.append(aid)

        random.shuffle(ids)
        n_training_ids = int(ceil(len(ids) * 0.7))

        training_ids = ids[:n_training_ids]
        testing_ids = ids[n_training_ids:]

        training = [
            o for o in observations if o.annotation.id in training_ids
            or o.annotation.assignmentId in sr_ids
        ]
        testing = [o for o in observations if o.annotation.id in testing_ids]
    elif options.split == "random":
        random.shuffle(observations)
        n_training = int(ceil(len(observations) * 0.7))
        training = observations[0:n_training]
        testing = observations[n_training:]
    elif options.split == "labeled_annotation":
        training_ids = set()
        training = []
        testing = []
        for training_fname in options.training_examples:
            ds = pickle_util.load(training_fname)
            for ex in ds.observations:

                training_ids.add(ex.annotation.id)
                training_ids.add(ex.annotation.id.split("_")[0])
        print "training", training_ids
        for example in observations:
            if example.annotation.id in training_ids:
                training.append(example)
            else:
                aid = example.annotation.id.split("_")[0]
                if aid in training_ids:
                    training.append(example)
                else:
                    print "skipping", example.annotation.id, aid
                    testing.append(example)
        print "labeled training", len(training)
        print "labeled testing", len(testing)
    elif options.split == "labeled_file":
        training = []
        testing = []
        for example in observations:
            if "training" in example.annotation.fname:
                training.append(example)
            elif "testing" in example.annotation.fname:
                testing.append(example)
            else:
                training.append(example)

    elif options.split == "labeled":
        training_ids = set()
        training = []
        testing = []
        for training_fname in options.training_examples:
            ds = pickle_util.load(training_fname)
            for ex in ds.observations:
                print "id", ex.id
                training_ids.add(ex.id)

        for example in observations:
            print "example", example.id
            if example.id in training_ids:

                training.append(example)
            else:
                testing.append(example)

    else:
        raise ValueError("Unexpected split type: " + ` options.split `)

    training_dataset = ContinuousDataset(training, feature_extractor_cls)
    testing_dataset = ContinuousDataset(testing, feature_extractor_cls)

    print "saving ", len(training), " examples to:", options.training_fname
    pickle_util.save(options.training_fname, training_dataset)

    print "saving ", len(testing), " examples to:", options.testing_fname
    pickle_util.save(options.testing_fname, testing_dataset)
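The saved datasets round-trip through pickle_util, just as this script does for its --training_examples inputs. A minimal sketch of reading one back (the path is illustrative, and only the observations attribute used above is assumed):

training_dataset = pickle_util.load("training.pck")
for obs in training_dataset.observations:
    print "annotation id", obs.annotation.id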