Example #1
def main(args):

    my_turk = turkdata.turkdata(args[0])
    read_it = turkdata.init_reader(my_turk)
    my_turk.read_all(read_it)

    yes_no_set = my_turk.training_set()
    if verbose:
        pprint.pprint(yes_no_set)

    ## do this if you've pickled the yes_no sets
    #fd = open("./mt_files/Batch_63422_result_training.pkl", 'rb')
    #ob = pickle.load(fd)

    #yes_no_set = [ob[0], ob[1]]
    ptree_classifier = ptree.mult_sample_learn_ccc(yes_no_set[0], yes_no_set[1],
                                                   n_depth)

    all_ngrams = n_grams.n_grams_depth_first(ptree_classifier, n_depth)

    print len(all_ngrams), " ngrams created"
    print '\nLOG LIKELIHOOD: ', n_grams.n_grams_prior(all_ngrams)
    for n in all_ngrams:
        n.print_ngram(ptree_classifier.words)


    ## Store the learned classifier as a pickle file.
    cl_name = str(n_depth) + '_classifier_' + time.strftime("%M-%H-%j") + '.pkl'
    out_name = re.sub(r'\.csv$', cl_name, my_turk.name)  # swap the trailing .csv for the classifier name
    try:
        outfd = open(out_name, 'wb')
        pickle.dump(ptree_classifier, outfd) # , pickle.HIGHEST_PROTOCOL)
        print >> sys.stderr, 'Wrote ', out_name
        outfd.close()
    except pickle.PicklingError:
        print >> sys.stderr, 'Failed to serialize ', out_name
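Example #1 ends by serializing the learned classifier with pickle. A minimal sketch of the reverse step, assuming only what the pickle.dump() call above guarantees; the file name is illustrative, following the "<csv base name><depth>_classifier_<minute-hour-dayofyear>.pkl" pattern built in main():

import pickle

# Hypothetical reload of a classifier written by the pickle.dump() call above.
# classifier_path is a placeholder; the real name follows the pattern built in main().
classifier_path = 'some_batch2_classifier_05-14-123.pkl'   # illustrative only
with open(classifier_path, 'rb') as fd:
    ptree_classifier = pickle.load(fd)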
Example #3
def build_class(yes_txt, no_txt):

    ## Be careful: ptree.mult_sample_learn_ccc destroys its argument lists, so work on copies.
    yes_txt = [x for x in yes_txt] # copy sequence
    no_txt = [x for x in no_txt] # copy sequence

    ptree_classifier = ptree.mult_sample_learn_ccc(yes_txt, no_txt, n_depth)
    if dbg:
        ptree_classifier.print_tree()
    return ptree_classifier
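Example #3 copies its inputs because ptree.mult_sample_learn_ccc consumes the lists it is given. A short usage sketch; the yes/no token lists are invented for illustration, and n_depth and dbg are assumed to be module-level settings as in the other examples:

# Hypothetical usage of build_class. The token lists below are made up;
# n_depth and dbg are assumed to be defined at module level, as in main().
yes_examples = [['fast', 'friendly', 'service'], ['great', 'food']]
no_examples = [['long', 'cold', 'wait'], ['rude', 'staff']]
classifier = build_class(yes_examples, no_examples)
# The caller's lists are untouched, since build_class learned from copies.
assert len(yes_examples) == 2 and len(no_examples) == 2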
Example #4
def main(args):

    # Create the turkdata training set.
    my_turk = turkdata.turkdata(args[0])
    read_it = turkdata.init_reader(my_turk)
    my_turk.read_all(read_it)

    yes_no_set = my_turk.training_set()
    if verbose:
        pprint.pprint(yes_no_set)

    ## Create the cross-validation folds from the pos & neg examples
    cv = cv_constructor.CrossValidationDataConstructor(yes_no_set[0], yes_no_set[1], numPartitions=n_folds)
    cv_set = cv.getDataSets()

    # pprint.pprint(cv_set.next)

    ## Run the plug-in classifier on each fold, computing fp & fn
    for (training_set, test_set) in cv_set:

        ptree_classifier = ptree.mult_sample_learn_ccc(yes_no_set[0], yes_no_set[1],
                                                   n_depth)
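
The loop in Example #4 is cut off before the evaluation it announces ("computing fp & fn"), and as shown it learns from the full yes_no_set rather than the fold's training half. A sketch of the per-fold bookkeeping, continuing from the variables above and under two assumptions the snippet does not confirm: each fold element is a (positives, negatives) pair, and classify_example() stands in for whatever prediction call ptree actually exposes:

# Hypothetical per-fold evaluation, continuing from cv_set above.
# classify_example() is a placeholder for the real ptree prediction call,
# and the (positives, negatives) fold layout is an assumption.
false_pos = false_neg = 0
for (training_set, test_set) in cv_set:
    fold_classifier = ptree.mult_sample_learn_ccc(training_set[0],
                                                  training_set[1], n_depth)
    for sample in test_set[0]:        # held-out positive examples
        if not classify_example(fold_classifier, sample):
            false_neg += 1
    for sample in test_set[1]:        # held-out negative examples
        if classify_example(fold_classifier, sample):
            false_pos += 1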