Example #1
    def save_necessities(self, file_name):
        # Persist every lookup structure the model needs in a single pickle
        # so it can be restored later without retraining.
        with open(file_name, 'wb') as f:
            dump((self.course_id_lookup_dict, self.class_number_lookup_dict,
                  self.course_cluster_probs_dict, self.k, self.vectorizer,
                  self.tfidf_mat, self.word_dict, self.course_doc_dict,
                  self.__get_course_id_list(), self.course_info_dict,
                  self.course_association_dictionary, self.pagerank_dict), f)
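    # Hedged counterpart (not part of the original example): the tuple dumped
    # above can be read back with the standard pickle module and unpacked in
    # the same order, provided the pickled classes are importable. The method
    # name load_necessities and the attribute course_id_list are illustrative.
    def load_necessities(self, file_name):
        import pickle
        with open(file_name, 'rb') as f:
            (self.course_id_lookup_dict, self.class_number_lookup_dict,
             self.course_cluster_probs_dict, self.k, self.vectorizer,
             self.tfidf_mat, self.word_dict, self.course_doc_dict,
             self.course_id_list, self.course_info_dict,
             self.course_association_dictionary, self.pagerank_dict) = pickle.load(f)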
def auto_phase_match(cfg, samples=30, restore=False, meter_base_names=None):
    data = []

    if 'PCKL_FILE' in os.environ:
        data = pickle.load(open(os.environ['PCKL_FILE'], "rb"))
    else:
        backupfile = "%s.conf.%d.backup.json" % (cfg.devurl.hostname, int(time.time()))
        print "Making a backup config for later restore"
        cfg.getregisters(ofile=backupfile)
        for i in range(3):
            data.append(measure_and_rotate(cfg, samples, meter_base_names=meter_base_names))
        try:
            from cloud.serialization.cloudpickle import dump
            dump(data, open("{}/tests/{}T{}.pckl".format(egauge_config.THISDIR, cfg.devurl.netloc,
                                                         int(time.time())), "wb"))
        except ImportError as ex:
            print "unable to save pckl file", ex

    team, updates = phase_match(data)

    for tt in team:
        print tt

    channels = data[0][0][0]
    totals = data[0][0][2]
    #from IPython.core.debugger import Pdb; Pdb().set_trace()

    if 'PCKL_FILE' not in os.environ:
        if restore is True:
            print "Restoring to original configuration on request"
            cfg.setregisters(ifile=backupfile, skip_backup=True)
            cfg.wait()
            cfg.reboot()
        else:
            if updates > 0:
                body = cfg.get_installation_POST(channels, team, totals)
                uri = "/cgi-bin/protected/egauge-cfg"
                resp, cont = cfg.request(uri, method="POST", body=body)
                cfg.wait()
                cfg.reboot()
            else:
                print "No recommended change to the original config"
    else:
        if updates > 0:
            obj = cfg._to_json(channels, team, totals)
            obj_str = cfg._format_json(obj)
            ofile = "%s.conf.%d.phase_checked.json" % (cfg.devurl.hostname, int(time.time()))
            with open(ofile, "wt") as of:
                print >> of, obj_str
            print "saved config to ", ofile

    return (channels, team, totals, data)
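# Usage note (a sketch, not from the original source): setting the PCKL_FILE
# environment variable makes auto_phase_match skip the measurement loop and the
# config backup and instead replay a previously dumped run. The path below is
# purely illustrative.
#
#   import os
#   os.environ['PCKL_FILE'] = 'tests/example-meter.pckl'
#   channels, team, totals, data = auto_phase_match(cfg)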
    def _save_results(self, output_path):
        """
        Pickle self.results to output_path, overwriting any existing file.

        Arguments:
        - `output_path`: destination file for the pickled results
        """

        if os.path.exists(output_path):
            os.remove(output_path)

        # pickle requires a binary-mode file handle
        with open(output_path, "wb") as fpout:
            cloudpickle.dump(self.results, fpout)
Example #4
    functions_dict['phi0'] = lambdify(syms, term)
    
    # first order
    
    functions_dict['ddx'] = lambdify(syms, term.diff(x))
    functions_dict['ddy'] = lambdify(syms, term.diff(y))
    functions_dict['ddz'] = lambdify(syms, term.diff(z))
    
    # second order
    
    functions_dict['d2dx2'] = lambdify(syms, term.diff(x,2))
    functions_dict['d2dy2'] = lambdify(syms, term.diff(y,2))
    functions_dict['d2dz2'] = lambdify(syms, term.diff(z,2))
    functions_dict['d2dxdy'] = lambdify(syms, term.diff(x,y))
    functions_dict['d2dxdz'] = lambdify(syms, term.diff(x,z))
    functions_dict['d2dydz'] = lambdify(syms, term.diff(y,z))
    
    # third order
    
    functions_dict['d3dz3'] = lambdify(syms, term.diff(z,3))
    functions_dict['d3dxdz2'] = lambdify(syms, term.diff(x,z,2))
    functions_dict['d3dydz2'] = lambdify(syms, term.diff(y,z,2))
    
    # fourth order
    
    functions_dict['d4dz4'] = lambdify(syms, term.diff(z,4))
    functions_dict['d4dx2dz2'] = lambdify(syms, term.diff(x,2,z,2))
    functions_dict['d4dy2dz2'] = lambdify(syms, term.diff(y,2,z,2))

    dump(functions_dict, open('ad_0.pickle', 'wb'))
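# Hedged usage sketch (not part of the original example): functions_dict maps
# derivative names to lambdified callables of the symbols in syms (assumed here
# to be x, y, z), so the pickle can be reloaded and evaluated pointwise as long
# as the cloudpickle package used for dumping is importable at load time. The
# helper name below is illustrative.
def _evaluate_saved_derivatives(path='ad_0.pickle'):
    import pickle
    with open(path, 'rb') as f:
        fns = pickle.load(f)
    value = fns['phi0'](1.0, 2.0, 3.0)   # phi0 evaluated at (x, y, z) = (1, 2, 3)
    slope = fns['ddx'](1.0, 2.0, 3.0)    # d(phi0)/dx at the same point
    return value, slope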
def main():
    start = time.time()
    ### READ ###########################################################################################################
    print '\n------------'
    print 'Reading data'
    print '------------\n'

    all_train_sentences = Paragraphs("Dataset/Train/").all_sentences()

    ###
    read_end = time.time()
    print 'Reading time:', read_end - start, 's'
    ####################################################################################################################

    ### PREPROCESS #####################################################################################################
    print '\n------------------'
    print 'Preprocessing data'
    print '------------------\n'

    used_fraction = 0.005
    train_fraction = 0.8
    none_fraction = 0.05

    print 'Fraction of data used:', used_fraction
    print 'Fraction of data for training:', train_fraction
    print 'Fraction of None-labelled samples used:', none_fraction

    (used_sentences, _) = all_train_sentences.split_randomly(used_fraction)
    (train_sentences, test_sentences) = used_sentences.split_randomly(train_fraction)

    all_train_tokens = train_sentences.tokens()
    subsampled_tokens = subsample_none(all_train_tokens, none_fraction)

    print 'Number of training tokens:', len(subsampled_tokens)

    class_dict = get_class_dict(subsampled_tokens)
    stem_dict = get_stem_dict(subsampled_tokens)
    word_dict = get_word_dict(subsampled_tokens)
    ngram_order = 2
    char_ngram_dict = get_char_ngram_dict(subsampled_tokens, ngram_order)
    ngram_dict = get_ngram_dict(all_train_tokens, ngram_order)
    trigger_dict = get_trigger_dict(subsampled_tokens)
    arg_word_dict = get_arg_word_dict(subsampled_tokens)

    feature_strings = ['word_template_feature',
                       'word_class_template_feature',
                       'capital_letter_feature',
                       'token_in_trigger_dict_feature',
                       'number_in_token_feature',
                       'token_in_protein_feature',
                       'token_is_after_dash_feature',
                       'pos_class_feature',
                       'character_ngram_feature']
    phi = partial(set_of_features, stem_dict, word_dict, class_dict, trigger_dict, ngram_order, char_ngram_dict,
                  ngram_dict, feature_strings)

    print 'Used features:', feature_strings

    ###
    preprocess_end = time.time()
    print 'Preprocessing time:', preprocess_end - read_end, 's'
    ####################################################################################################################

    ### TRAIN ##########################################################################################################
    print '\n-------------'
    print 'Training data'
    print '-------------\n'

    alpha = 0.2
    max_iterations = 10

    print 'Alpha =', alpha
    print 'Max iterations =', max_iterations

    classifier = LoglinearModel(lambda t: t.event_candidate, phi, class_dict.keys(), alpha, max_iterations)\
        .train(subsampled_tokens)

    ###
    train_end = time.time()
    print 'Training time:', train_end - preprocess_end, 's'
    ####################################################################################################################

    #### TEST ###########################################################################################################
    print '\n-------'
    print 'Testing'
    print '-------\n'

    all_test_tokens = test_sentences.tokens()
    subsampled_test_tokens = all_test_tokens

    print 'Number of test tokens:', len(subsampled_test_tokens)

    predictions = classifier.predict_all(subsampled_test_tokens)

    ###
    predict_end = time.time()
    print 'Predict time:', predict_end - train_end, 's'
    ####################################################################################################################

    ### ERROR ANALYSIS #################################################################################################
    print '\n-----------------'
    print 'Analysing results'
    print '-----------------\n'

    true_labels = []
    for token in all_test_tokens:
        true_labels.append(token.event_candidate)

    test_keys = class_dict.keys()
    for label in test_keys:
        print 'Analyzing label: ', label
        precision_recall_f1(true_labels, predictions, label)


    y_test = map(lambda t: t.event_candidate, all_test_tokens)
    y_pred = predictions

    # Compute the confusion matrix
    cm2 = confusion_matrix(class_dict, y_test, y_pred)
    pprint(cm2)

    print cm2

    none_index = class_dict['None']
    classes = class_dict.keys()

    for i in range(len(class_dict)):
        print '\nCLASS: ', classes[i]
        print 'Recall: ', label_recall(cm2, i)
        print 'Precision: ', label_precision(cm2, i)
        print 'F1: ', label_f1(cm2, i)

    print '\n'
    print 'Precision micro:', precision_micro(cm2, none_index)
    print 'Recall micro:', recall_micro(cm2, none_index)
    print 'F1 micro:', f1_micro(cm2, none_index)
    print '\n'
    print 'Precision macro:', precision_macro(cm2, none_index)
    print 'Recall macro:', recall_macro(cm2, none_index)
    print 'F1 macro:', f1_macro(cm2, none_index)

    ###
    analysis_end = time.time()
    print '\nAnalysis time:', analysis_end - predict_end, 's'
    # ####################################################################################################################
    #
    cp.dump(classifier, open('classifier_' + time.strftime("%Y%m%d-%H%M%S") + '.p', 'wb'))
    def save_model(self, fname):
        # binary mode is required for pickling
        with open(fname, "wb") as f:
            dump(self.model, f)
Example #7
    def save(self, filename):
        # binary mode is important for pickling
        with open(filename, 'wb') as f:
            dump(self, f)
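    # Hedged counterpart (not part of the original example): an object saved
    # this way can be restored with the standard pickle module, provided its
    # class is importable. The method name `load` is illustrative.
    @staticmethod
    def load(filename):
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)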
def main():
    start = time.time()
    ### READ ###########################################################################################################
    print '\n------------'
    print 'Reading data'
    print '------------\n'

    all_train_sentences = Paragraphs("Dataset/Train/").all_sentences()
    ###
    read_end = time.time()
    print 'Reading time:', read_end - start, 's'
    ####################################################################################################################

    ### PREPROCESS #####################################################################################################
    print '\n------------------'
    print 'Preprocessing data'
    print '------------------\n'

    used_fraction = 1
    train_fraction = 0.8
    none_fraction = 0.10

    print 'Fraction of data used:', used_fraction
    print 'Fraction of data for training:', train_fraction
    print 'Fraction of None-labelled samples used:', none_fraction

    (used_sentences, _) = all_train_sentences.split_randomly(used_fraction)
    (train_sentences, test_sentences) = used_sentences.split_randomly(train_fraction)

    all_train_tokens = train_sentences.tokens()
    subsampled_tokens = subsample_none(all_train_tokens, none_fraction)

    print 'Number of training tokens:', len(subsampled_tokens)

    class_dict = get_class_dict(subsampled_tokens)
    arg_dict = {'None': 0, 'Theme': 1, 'Cause': 2}
    stem_dict = get_stem_dict(subsampled_tokens)
    word_dict = get_word_dict(subsampled_tokens)
    ngram_order = 2
    char_ngram_dict = get_char_ngram_dict(subsampled_tokens, ngram_order)
    ngram_dict = get_ngram_dict(all_train_tokens, ngram_order)
    trigger_dict = get_trigger_dict(subsampled_tokens)
    arg_word_dict = get_arg_word_dict(subsampled_tokens)

    classes = dict(map(lambda c: (c, 0), class_dict.keys()))
    for token in subsampled_tokens:
        classes[token.event_candidate] += 1

    print classes

    feature_strings = [#'word_template_feature',
                       'word_class_template_feature',
                       'capital_letter_feature',
                       # 'token_in_trigger_dict_feature',
                       'number_in_token_feature',
                       'token_in_protein_feature',
                       # 'token_is_after_dash_feature',
                       'pos_class_feature']
                       # 'character_ngram_feature']
    phi = partial(set_of_features_structured, stem_dict, word_dict, arg_dict, class_dict, arg_word_dict, ngram_order, char_ngram_dict,
                  ngram_dict, feature_strings)

    print 'Used features:', feature_strings

    ###
    preprocess_end = time.time()
    print 'Preprocessing time:', preprocess_end - read_end, 's'
    ####################################################################################################################

    ### TRAIN ##########################################################################################################
    print '\n-------------'
    print 'Training data'
    print '-------------\n'

    alpha = 0.2
    max_iterations = 15
    arg_none_subsampling = 0.05

    def gold(trigger):
        args = [u'None'] * len(trigger.tokens_in_sentence)
        for (i, arg) in trigger.event_candidate_args:
            args[i] = arg
        return args

    print 'Alpha =', alpha
    print 'Max iterations =', max_iterations

    # classifier = SearchStructuredLoglinearModel(gold, phi, arg_dict.keys(), alpha, max_iterations)\
    #     .train(subsampled_tokens, average=True)

    classifier = StructuredLoglinearModel(gold, phi, arg_dict.keys(), alpha, arg_none_subsampling, max_iterations)\
        .train(subsampled_tokens, average=True)

    ###
    train_end = time.time()
    print 'Training time:', train_end - preprocess_end, 's'
    ####################################################################################################################

    #### TEST ###########################################################################################################
    print '\n-------'
    print 'Testing'
    print '-------\n'

    all_test_tokens = test_sentences.tokens()
    subsampled_test_tokens = subsample_none(all_test_tokens, 0)

    print 'Number of test tokens:', len(subsampled_test_tokens)

    predictions = classifier.predict_all(subsampled_test_tokens)

    predict_end = time.time()
    print 'Predict time:', predict_end - train_end, 's'
    ####################################################################################################################

    ### ERROR ANALYSIS #################################################################################################
    print '\n-----------------'
    print 'Analysing results'
    print '-----------------\n'


    n_args = len(arg_dict)
    confusion = mat(zeros((n_args, n_args)))

    hits = 0
    misses = 0

    for i in range(0, len(predictions)):
        truth = gold(subsampled_test_tokens[i])
        if truth == predictions[i]:
            hits += 1
        else:
            misses += 1
        for j in range(0, len(predictions[i])):
            confusion[arg_dict[predictions[i][j]], arg_dict[truth[j]]] += 1

    np.set_printoptions(suppress=True)
    print confusion

    print 'precision micro:', precision_micro(confusion, 0)
    print 'recall micro:', recall_micro(confusion, 0)
    print 'f1 micro:', f1_micro(confusion, 0)

    print 'precision macro:', precision_macro(confusion, 0)
    print 'recall macro:', recall_macro(confusion, 0)
    print 'f1 macro:', f1_macro(confusion, 0)

    ###
    analysis_end = time.time()
    print '\nAnalysis time:', analysis_end - predict_end, 's'
    # ####################################################################################################################
    #
    cp.dump(classifier, open('classifier_' + time.strftime("%Y%m%d-%H%M%S") + '.p', 'wb'))
    def to_pickle(self, filepath):
        from cloud.serialization.cloudpickle import dump
        # binary mode is required so the pickle stream is written unmodified
        with open(filepath, 'wb') as fh:
            dump(self, fh)