def process_projects(src_directory, glossary_description, glossary_file):
    corpus = Corpus(src_directory)
    corpus.process()

    reference_sources = ReferenceSources()
    reference_sources.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 5000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get,
                                   reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS] # Sorted by frequency

    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(corpus.documents, term, reference_sources)

    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html",
                                   glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term

    glossary = Glossary(glossary_description)
    for term in selected_terms:
        glossary_entry = GlossaryEntry(
            term,
            translations.create_for_word_sorted_by_frequency(corpus.documents,
                                                             term,
                                                             reference_sources)
        )
        glossary.entries.append(glossary_entry)

    glossary_entries = glossary.get_dict()
    process_template('templates/userglossary-html.mustache',
                     glossary_file + ".html", glossary_entries)
    process_template('templates/userglossary-csv.mustache',
                     glossary_file + ".csv", glossary_entries)

    generate_database(glossary, glossary_file)
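# Not from the original source: a minimal usage sketch for the function above. The source
# directory, glossary description and output basename are hypothetical placeholders.
if __name__ == "__main__":
    process_projects("po/", u"Software terminology glossary", "glossary")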
Example #2
def update_res(config_results, mydir, latest):
	if os.path.exists(latest):
		shutil.rmtree(latest)

	# Build a JavaScript array literal from the config results; join() also
	# handles an empty result list instead of raising IndexError on [-1].
	text = 'var configs = [' + ','.join(str(config) for config in config_results) + '];'

	if os.path.exists('config.js'):
		os.remove('config.js')

	Metrics.saveConfig('config.js', text)
	Metrics.copyDirectory(mydir, latest)
Example #3
    def get_average_throughput_for_node(self, node_id=0):
        data_set = []       
        data = self.f_receive_events_at(self._all_events, node_id)

        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            data = self.f_events_at_level(data, 'MAC')

        data = self.f_events_with_data_pkts(data)
        
        if self._sim_mode == MODE_WIRELESS or \
            self._sim_mode == MODE_WIRED:
            data_set = izip( self.f_get_cols(data, col_num=I_TIMESTAMP), self.f_get_cols(data, col_num=I_PKT_LEN) )
        else:
            c_ts = []
            c_len = []
            for e in data:
                #print e
                if is_wired(e):
                    c_ts.append( e[I_W_TIMESTAMP] )
                    c_len.append( e[I_W_PKT_LEN] )
                else:
                    c_ts.append( e[I_WL_TIMESTAMP] )
                    c_len.append( e[I_WL_PKT_LEN] )
                    
            data_set = izip(c_ts, c_len)
       
        #for e in data_set: print e        
        return Metrics.average_throughput( data_set, -1 )
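# Not part of the original example: a hedged sketch of an average-throughput helper over
# the (timestamp, packet_length) pairs built above, assuming throughput is total bytes
# divided by the observed time span (the real Metrics.average_throughput, including its
# second argument, may differ).
def average_throughput_sketch(data_set):
    pairs = [(float(ts), int(length)) for ts, length in data_set]
    if len(pairs) < 2:
        return 0.0
    duration = max(ts for ts, _ in pairs) - min(ts for ts, _ in pairs)
    total_bytes = sum(length for _, length in pairs)
    return total_bytes / duration if duration > 0 else 0.0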
Example #4
    def get_packet_retransmissions(self, src_node=0, dst_node=0):
        data = self.f_send_events(self._all_events)
        data = self.f_events_at_node(data, src_node)

        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            data = self.f_events_at_level(data, L_AGENT)
            
        data = self.f_events_with_data_pkts(data)
        
        # Check for destination node
        data = self.f_events_with_dst_node(data, dst_node)
                
        pkt_seq_num = []
        if self._sim_mode == MODE_WIRELESS:
            for event in data:
                try:
                    if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:   # Wireless, from AGT
                        pkt_seq_num.append( event[I_SEQ_NUM] )
                except IndexError:
                    continue
        elif self._sim_mode == MODE_WIRED:
            assert(I_TIMESTAMP_TOKEN == -1)
            for event in data:
                try:
                    pkt_seq_num.append( event[I_SEQ_NUM] )
                except IndexError:
                    continue

        #pkt_seq_num = self.f_get_cols(data, col_num=I_SEQ_NUM)
        #print pkt_seq_num[:40]
        return Metrics.packet_retransmissions(pkt_seq_num)
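# Not part of the original example: a hedged sketch of counting retransmissions from the
# sequence-number list built above, assuming every occurrence of a sequence number beyond
# the first counts as one retransmission (the real Metrics.packet_retransmissions may
# use a different definition).
from collections import Counter

def packet_retransmissions_sketch(pkt_seq_num):
    counts = Counter(pkt_seq_num)
    return sum(n - 1 for n in counts.values())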
Example #5
    def get_end2end_delay(self, src_node=0, dst_node=0):
        data = self.f_send_events(self._all_events)
        data = self.f_events_at_node(data, src_node)
        
        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            data = self.f_events_at_level(data, L_AGENT)

        data = self.f_events_with_data_pkts(data)
        
        pkt_seq_num = []
        pkt_timestamp = []
        #print data[:10]

        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            for event in data:
                try:
                    if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:   # Wireless, from AGT
                        pkt_seq_num.append( event[I_SEQ_NUM] )
                        pkt_timestamp.append( event[I_TIMESTAMP] )
                except IndexError:
                    continue
        else:
            assert(I_TIMESTAMP_TOKEN == -1)
            for event in data:
                try:                    
                    pkt_seq_num.append( event[I_SEQ_NUM] )
                    pkt_timestamp.append( event[I_TIMESTAMP] )
                except IndexError:
                    continue

        
        send_pkts = izip( pkt_seq_num, pkt_timestamp )
        
        data = self.__common_filters__(dst_node)
        pkt_seq_num = []
        pkt_timestamp = []
        
        #print data[:10]
        if self._sim_mode == MODE_WIRELESS:
            for event in data:
                try:
                    if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:   # Wireless, from AGT
                        pkt_seq_num.append( event[I_SEQ_NUM] )
                        pkt_timestamp.append( event[I_TIMESTAMP] )
                except IndexError:
                    continue
        else:
            for event in data:
                try:
                    pkt_seq_num.append( event[I_SEQ_NUM] )
                    pkt_timestamp.append( event[I_TIMESTAMP] )
                except IndexError:
                    continue
        
        rcvd_pkts = izip( pkt_seq_num, pkt_timestamp )

        return Metrics.end2end_delay(send_pkts, rcvd_pkts)
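# Not part of the original example: a hedged sketch of an end-to-end delay computation
# over the (seq_num, timestamp) pairs built above, assuming the delay of a packet is its
# receive timestamp minus its send timestamp, averaged over packets seen on both sides
# (the real Metrics.end2end_delay may aggregate differently).
def end2end_delay_sketch(send_pkts, rcvd_pkts):
    sent = {seq: float(ts) for seq, ts in send_pkts}
    delays = [float(ts) - sent[seq] for seq, ts in rcvd_pkts if seq in sent]
    return sum(delays) / len(delays) if delays else 0.0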
Example #6
def process_projects():
    global glossary_file
    global glossary_description

    corpus = Corpus(src_directory)
    corpus.process()

    reference_sources = ReferenceSources()
    reference_sources.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 1000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get,
                                   reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS] # Sorted by frequency

    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(corpus.documents, term, reference_sources)

    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html",
                                   glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term

    glossary = Glossary()
    glossary.description = glossary_description
    for term in selected_terms:
        glossary_entry = GlossaryEntry()
        glossary_entry.source_term = term
        glossary_entry.translations = translations.create_for_word_sorted_by_frequency(corpus.documents, term, reference_sources)
        glossary.entries.append(glossary_entry)

    user_glossary_serializer = UserGlossarySerializer()
    user_glossary_serializer.create(glossary_file, glossary.get_dict(),
                                    reference_sources)
Example #7
    def get_cumulative_bytes_received_for_node_at_layers(self, node_id=0, layers=[]):
        data = []        
        data_set = []
        if layers == []: layers = [L_AGENT,]
            
        #print 'layers:', layers
        
        if self._sim_mode == MODE_WIRELESS or \
            self._sim_mode == MODE_WIRED:
            data = self.f_receive_events_at(self._all_events, node_id)
            #print 'get_cumulative_bytes_received_for_node_at_layers:', len(data)            

            if self._sim_mode == MODE_WIRELESS:                            
                assert(I_TIMESTAMP_TOKEN == 1)
                data = self.f_events_at_levels(data, layers)           
            
            data = self.f_events_with_data_pkts(data)   # All trace types are taken care of
            data_set = izip( self.f_get_cols(data, col_num=I_TIMESTAMP), self.f_get_cols(data, col_num=I_PKT_LEN) )
            #print 'get_cumulative_bytes_received_for_node_at_layers:', len(data)
            
        else:
            # Mixed mode            
            wired_events = []
            wireless_events = []
            rcv_events = self.f_receive_events(self._all_events) # All receive events
            #print 'len(rcv_events):', len(rcv_events)
            
            for e in rcv_events:
                if is_wired(e):
                    if e[I_W_NXT_NODE_ID] == node_id:
                        wired_events.append(e)
                else:
                    if e[I_WL_NXT_NODE_ID] == node_id:
                        wireless_events.append(e)

            # Filter the accumulated wireless events by layer once, after the loop,
            # instead of re-filtering the whole list on every wireless event.
            wireless_events = self.f_events_at_levels(wireless_events, layers)
            
            data = wired_events + wireless_events
            #print 'len(data):', len(data)
            #data = self.f_events_with_data_pkts(data)   # All trace types are taken care of
            c_ts = self.f_get_cols(wired_events, col_num=I_W_TIMESTAMP) + \
                    self.f_get_cols(wireless_events, col_num=I_WL_TIMESTAMP)
            c_len = self.f_get_cols(wired_events, col_num=I_W_PKT_LEN) + \
                    self.f_get_cols(wireless_events, col_num=I_WL_PKT_LEN)                                  
          
            data_set = izip( c_ts, c_len )
            #print c_ts[:10]
            
          
        
        return Metrics.cumulative_bytes_received(data_set)
Example #8
    def get_instantaneous_throughput_for_node(self, node_id=0):
        data = self.f_receive_events_at(self._all_events, node_id)
        #data = self.f_events_at_node(data, node_id)

        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            data = self.f_events_at_level(data, 'AGT')

        data = self.f_events_with_data_pkts(data)

        data_set = []
        data_set = izip( self.f_get_cols(data, col_num=I_TIMESTAMP), self.f_get_cols(data, col_num=I_PKT_LEN) )
        #for e in data_set: print e
        
        return Metrics.instantaneous_throughput(data_set)
Example #9
 def setUp(self):
     self.labels_pred = {
         0: 1,
         1: 0,
         2: 1,
         3: 1,
         4: 1,
         5: 1,
         6: 1,
         7: 0,
         8: 0,
         9: 0,
         10: 0,
         11: 2,
         12: 1,
         13: 2,
         14: 1,
         15: 2,
         16: 2}
     self.labels_true = {
         0: 0,
         1: 0,
         2: 0,
         3: 0,
         4: 0,
         5: 0,
         6: 1,
         7: 1,
         8: 1,
         9: 1,
         10: 1,
         11: 1,
         12: 2,
         13: 2,
         14: 2,
         15: 2,
         16: 2}
     self._n = len(self.labels_pred)
     self.metrics = Metrics(self.labels_true, self.labels_pred)
Example #10
#!venv/bin/python
from metrics import Metrics
m = Metrics()
files = ['2010-13-080', '2010-00-072',
         '2010-00-094', '2010-26-075', '2010-58-011', '2010-08-078',
         '2010-94-034', '2010-71-034', '2010-40-008', '2010-08-069',
         '2010-92-061', '2010-70-013']
# tests the cutRecall
recall = m.cutRecall(sorted(files), '2010-001')
print(recall)
# tests the cutPrecision
precision = m.cutPrecision(sorted(files), '2010-001')
print(precision)
# tests the FMeasure
FMeasure = m.FMeasure(precision, recall)
print(FMeasure)
# tests the RRank1
RRank1 = m.RRank1(files, '2010-001')
print(RRank1)
# tests the RRank2
RRank2 = m.RRank2(files, '2010-001')
print(RRank2)
# tests the APrecision
Aprecision = m.APrecision(files, '2010-001')
print(Aprecision)
# tests the nDCG
nDCG = m.nDCG(files, '2010-001', 10)
print(nDCG)
Example #11
    if not os.path.exists(DUMPS_FOLDER):
        os.makedirs(DUMPS_FOLDER)

    if not os.environ.get(WEBSITE_HOSTNAME):
        os.environ[WEBSITE_HOSTNAME] = f'localhost-main-{now()}'

    if 'email' in watch_types:
        store = CredentialsStore()

        def start_mail_checker_thread(email):
            mail_checker = MailChecker(store, email, config, tg_bot)
            return mail_checker.start_loop(args.nb_attempts)

        mail_checkers = list(map(lambda data: start_mail_checker_thread(data['email']), form_data))
        logger.info(f"Waiting for {len(mail_checkers)} email checking threads to finish")
        for th in mail_checkers:
            th.join()

    if 'website' in watch_types:
        http_client = HttpClient()
        #browsers = list(map(lambda data: Browser(config, data, tg_bot, http_client), form_data))
        metrics = Metrics(export_metrics=False)
        watcher = WatcherMultislot(tg_bot, http_client, metrics, config, args.parallelism)
        watcher.start_loop(
            max_attempts=args.nb_attempts
        )
        logger.info(f"Waiting for {len(watcher.form_submit_threads)} submit actions to finish")
        for th in watcher.form_submit_threads:
            th.join()

    logger.info("Done. Exiting")
Example #12
data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)


# TRAIN
def lr_func_exp(step):
    return 0.95 ** step

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
if store.optims:
    optimizer.load_state_dict(store.optims)
scheduler = LambdaLR(optimizer, lr_lambda=lr_func_exp, last_epoch=epoch if store.optims else -1)
# criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
criterion = CrossEntropyLoss2d()

metrics = Metrics()
if store.metrics:
    metrics.load_state_dict(store.metrics)

if FAKE:
    print('STOP TRAINING')
    exit(0)

# LOOP
print(f'Starting ({now_str()})')
iter_count = len(data_set) // BATCH_SIZE
while epoch < first_epoch + EPOCH_COUNT:
    iter_metrics = Metrics()
    lr = scheduler.get_lr()[0]
    for i, (inputs, labels) in enumerate(data_loader):
        inputs = inputs.to(device)
Example #13
from config import Configuration
from reddit import Reddit
from data import Data
from metrics import Metrics
from logger import Logger

configuration = Configuration()
loggerInstance = Logger(configuration.logLocation)

metrics = Metrics(loggerInstance)
metrics.start()

r = Reddit(configuration.clientId, configuration.clientSecret,
           configuration.password, configuration.userAgent,
           configuration.username, configuration.call, metrics, loggerInstance)
data = Data(configuration.apiKey, loggerInstance)

r.parseUnreadItems(data)

metrics.end()
metrics.buildInboxReport()
Example #14
    def evaluate(
            args,
            model,
            iterator,
            vocab,
            optimizers,
            step=0,
            epoch=0,
            save_checkpoint=True,
            save_predictions=True,
            save_csv=True,
            sampled_evaluation=False,
            metrics=Metrics(),
    ):

        print()
        logging.info(
            f"Start evaluation on split {'test' if args.eval_on_test_only else 'valid'}"
        )

        model.eval()
        model.to(args.device, args.eval_device)

        all_words, all_tags, all_y, all_y_hat, all_predicted, all_token_ids = [], [], [], [], [], []
        with torch.no_grad():
            for iter, batch in enumerate(tqdm.tqdm(iterator)):
                (
                    batch_token_ids,
                    label_ids,
                    label_probs,
                    eval_mask,
                    label_id_to_entity_id_dict,
                    batch_entity_ids,
                    orig_batch,
                    _,
                ) = batch

                logits, y, y_hat, probs, _, _ = model(
                    batch_token_ids, None,
                    None)  # logits: (N, T, VOCAB), y: (N, T)

                tags = list()
                predtags = list()
                y_resolved_list = list()
                y_hat_resolved_list = list()
                token_list = list()

                chunk_len = args.create_integerized_training_instance_text_length
                chunk_overlap = args.create_integerized_training_instance_text_overlap

                for batch_id, seq in enumerate(label_probs.max(-1)[1]):
                    for tok_id, label_id in enumerate(
                            seq[chunk_overlap:-chunk_overlap]):
                        y_resolved = (vocab.PAD_ID
                                      if eval_mask[batch_id][tok_id +
                                                             chunk_overlap]
                                      == 0 else label_ids[label_id].item())
                        y_resolved_list.append(y_resolved)
                        tags.append(vocab.idx2tag[y_resolved])
                        if sampled_evaluation:
                            y_hat_resolved = (
                                vocab.PAD_ID
                                if eval_mask[batch_id][tok_id + chunk_overlap]
                                == 0 else label_ids[y_hat[batch_id][
                                    tok_id + chunk_overlap]].item())
                        else:
                            y_hat_resolved = y_hat[batch_id][
                                tok_id + chunk_overlap].item()
                        y_hat_resolved_list.append(y_hat_resolved)
                        predtags.append(vocab.idx2tag[y_hat_resolved])
                        token_list.append(
                            batch_token_ids[batch_id][tok_id +
                                                      chunk_overlap].item())

                all_y.append(y_resolved_list)
                all_y_hat.append(y_hat_resolved_list)
                all_tags.append(tags)
                all_predicted.append(predtags)
                all_words.append(
                    vocab.tokenizer.convert_ids_to_tokens(token_list))
                all_token_ids.append(token_list)

        ## calc metric
        y_true = numpy.array(list(chain(*all_y)))
        y_pred = numpy.array(list(chain(*all_y_hat)))
        all_token_ids = numpy.array(list(chain(*all_token_ids)))

        num_proposed = len(y_pred[(vocab.OUTSIDE_ID > y_pred)
                                  & (all_token_ids > 0)])
        num_correct = (((y_true == y_pred) & (vocab.OUTSIDE_ID > y_true) &
                        (all_token_ids > 0))).astype(int).sum()  # numpy.int was removed in NumPy 1.24+
        num_gold = len(y_true[(vocab.OUTSIDE_ID > y_true)
                              & (all_token_ids > 0)])

        new_metrics = Metrics(
            epoch=epoch,
            step=step,
            num_correct=num_correct,
            num_proposed=num_proposed,
            num_gold=num_gold,
        )

        if save_predictions:
            final = args.logdir + "/%s.P%.2f_R%.2f_F%.2f" % (
                "{}-{}".format(str(epoch), str(step)),
                new_metrics.precision,
                new_metrics.recall,
                new_metrics.f1,
            )
            with open(final, "w") as fout:

                for words, tags, y_hat, preds in zip(all_words, all_tags,
                                                     all_y_hat, all_predicted):
                    assert len(preds) == len(words) == len(tags)
                    for w, t, p in zip(words, tags, preds):
                        fout.write(f"{w}\t{t}\t{p}\n")
                    fout.write("\n")

                fout.write(f"num_proposed:{num_proposed}\n")
                fout.write(f"num_correct:{num_correct}\n")
                fout.write(f"num_gold:{num_gold}\n")
                fout.write(f"precision={new_metrics.precision}\n")
                fout.write(f"recall={new_metrics.recall}\n")
                fout.write(f"f1={new_metrics.f1}\n")

        if not args.dont_save_checkpoints:

            if save_checkpoint and metrics.was_improved(new_metrics):
                config = {
                    "args": args,
                    "optimizer_dense": optimizers[0].state_dict(),
                    "optimizer_sparse": optimizers[1].state_dict(),
                    "model": model.state_dict(),
                    "epoch": epoch,
                    "step": step,
                    "performance": new_metrics.dict(),
                }
                fname = os.path.join(args.logdir,
                                     "{}-{}".format(str(epoch), str(step)))
                torch.save(config, f"{fname}.pt")
                fname = os.path.join(
                    args.logdir, new_metrics.get_best_checkpoint_filename())
                torch.save(config, f"{fname}.pt")
                logging.info(f"weights were saved to {fname}.pt")

        if save_csv:
            new_metrics.to_csv(epoch=epoch, step=step, args=args)

        if metrics.was_improved(new_metrics):
            metrics.update(new_metrics)

        logging.info("Finished evaluation")

        return metrics
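# Not part of the original example: a hedged sketch of how precision, recall and F1 are
# presumably derived from the num_correct / num_proposed / num_gold counts this Metrics
# object is constructed with (the real Metrics class is not shown here).
def prf_sketch(num_correct, num_proposed, num_gold):
    precision = num_correct / num_proposed if num_proposed else 0.0
    recall = num_correct / num_gold if num_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1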
Example #15
from PIL import Image, ImageDraw, ImageFont
import codecs
from metrics import Metrics,A4_LANDSCAPE_IN_MM
import sys


msg = str(sys.argv[1]).upper()
file_name = str(sys.argv[2])


m = Metrics(300)

font = ImageFont.truetype("arial.ttf", m.mm2pt(10))
im = Image.new("RGB",m.mmpoint2px(A4_LANDSCAPE_IN_MM),"#ffffff")
draw = ImageDraw.Draw(im)


def draw_letter_rect(x,y,letter):
	draw.rectangle( (m.mmpoint2px((x,y)), m.mmpoint2px((x+12,y+10))),outline="#000000", fill=None)
	textsize = draw.textsize(letter)
	tx = m.mm2px(x+2)
	ty = m.mm2px(y)		
	draw.text((tx,ty), letter, font=font,fill="#000000")
	
def draw_empty_rect(x,y):
	draw.rectangle( (m.mmpoint2px((x,y)), m.mmpoint2px((x+12,y+10))),outline="#000000", fill=None)	
	
xpos = 10
for c in "ABCDEFGHIJKLM":
	draw_letter_rect(xpos,10,c)
	draw_letter_rect(xpos,20,codecs.encode(c,"rot_13"))
	xpos += 12  # advance by the 12 mm cell width so the boxes do not overlap

im.save(file_name)  # write the rendered sheet to the output path from argv[2]
Example #16
    def evaluate(self, dataloader):
        """ Evaluate a model on a validation dataloader.
        """
        print("Running Validation...")
        t0 = time.time()
        self.model.eval()

        # Tracking variables 
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0
        y_true = []
        y_pred = []

        # Evaluate data for one epoch
        for batch in dataloader:
            # `batch` contains three pytorch tensors (plus an optional fourth):
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            #   [3]: output_mask (optional)
            input_ids = batch[0].to(self.device)
            attention_mask = batch[1].to(self.device)
            label_ids = batch[2].to(self.device)
            output_mask = None 
            if self.use_output_mask:
                output_mask = batch[3].to(self.device)
            
            with torch.no_grad():        
                # The documentation for the BERT `models` are here: 
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html
                outputs = self.model(input_ids, 
                                    attention_mask=attention_mask,
                                    labels=label_ids)
            loss = outputs[0] 
            logits = outputs[1]
            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            if self.use_output_mask:
                output_mask = output_mask.to('cpu').numpy()
                active_loss = (output_mask == 1)
            else:
                active_loss = np.ones(label_ids.shape)
                active_loss = (active_loss == 1)
            pred_flat = np.argmax(logits, axis=-1)[active_loss].flatten()
            labels_flat = label_ids[active_loss].flatten()
            y_true.append(labels_flat)
            y_pred.append(pred_flat)

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += Metrics.flat_accuracy(label_ids, logits)

        # Report results
        report = Metrics.report(self.metric_name, 
                                [item for sublist in y_true for item in sublist], 
                                [item for sublist in y_pred for item in sublist])
        print(report)
        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(dataloader)
        
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)
        
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        return avg_val_accuracy, avg_val_loss, validation_time, report
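# Not part of the original example: a hedged sketch of a flat-accuracy helper like the
# Metrics.flat_accuracy call above, assuming it compares argmax predictions with the
# label ids and returns the fraction that match (the real implementation is not shown).
import numpy as np

def flat_accuracy_sketch(label_ids, logits):
    pred_flat = np.argmax(logits, axis=-1).flatten()
    labels_flat = np.asarray(label_ids).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)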
            
Example #17
class GMVAE:
    def __init__(self, args):
        self.num_epochs = args.epochs
        self.cuda = args.cuda
        self.verbose = args.verbose

        self.batch_size = args.batch_size
        self.batch_size_val = args.batch_size_val
        self.learning_rate = args.learning_rate
        self.decay_epoch = args.decay_epoch
        self.lr_decay = args.lr_decay
        self.w_cat = args.w_categ
        self.w_gauss = args.w_gauss
        self.w_rec = args.w_rec
        self.rec_type = args.rec_type

        self.num_classes = args.num_classes
        self.gaussian_size = args.gaussian_size
        self.input_size = args.input_size

        # gumbel
        self.init_temp = args.init_temp
        self.decay_temp = args.decay_temp
        self.hard_gumbel = args.hard_gumbel
        self.min_temp = args.min_temp
        self.decay_temp_rate = args.decay_temp_rate
        self.gumbel_temp = self.init_temp

        self.network = GMVAENet(self.input_size, self.gaussian_size,
                                self.num_classes)
        self.losses = LossFunctions()
        self.metrics = Metrics()

        if self.cuda:
            self.network = self.network.cuda()

    def unlabeled_loss(self, data, out_net):
        """Method defining the loss functions derived from the variational lower bound
    Args:
        data: (array) corresponding array containing the input data
        out_net: (dict) contains the graph operations or nodes of the network output

    Returns:
        loss_dic: (dict) contains the values of each loss function and predictions
    """
        # obtain network variables
        z, data_recon = out_net['gaussian'], out_net['x_rec']
        logits, prob_cat = out_net['logits'], out_net['prob_cat']
        y_mu, y_var = out_net['y_mean'], out_net['y_var']
        mu, var = out_net['mean'], out_net['var']

        # reconstruction loss
        loss_rec = self.losses.reconstruction_loss(data, data_recon,
                                                   self.rec_type)

        # gaussian loss
        loss_gauss = self.losses.gaussian_loss(z, mu, var, y_mu, y_var)

        # categorical loss
        loss_cat = -self.losses.entropy(logits, prob_cat) - np.log(0.1)

        # total loss
        loss_total = self.w_rec * loss_rec + self.w_gauss * loss_gauss + self.w_cat * loss_cat

        # obtain predictions
        _, predicted_labels = torch.max(logits, dim=1)

        loss_dic = {
            'total': loss_total,
            'predicted_labels': predicted_labels,
            'reconstruction': loss_rec,
            'gaussian': loss_gauss,
            'categorical': loss_cat
        }
        return loss_dic

    def train_epoch(self, optimizer, data_loader):
        """Train the model for one epoch

    Args:
        optimizer: (Optim) optimizer to use in backpropagation
        data_loader: (DataLoader) corresponding loader containing the training data

    Returns:
        average of all loss values, accuracy, nmi
    """
        self.network.train()
        total_loss = 0.
        recon_loss = 0.
        cat_loss = 0.
        gauss_loss = 0.

        accuracy = 0.
        nmi = 0.
        num_batches = 0.

        true_labels_list = []
        predicted_labels_list = []

        # iterate over the dataset
        for (data, labels) in data_loader:
            if self.cuda == 1:
                data = data.cuda()

            optimizer.zero_grad()

            # flatten data
            data = data.view(data.size(0), -1)

            # forward call
            out_net = self.network(data, self.gumbel_temp, self.hard_gumbel)
            unlab_loss_dic = self.unlabeled_loss(data, out_net)
            total = unlab_loss_dic['total']

            # accumulate values
            total_loss += total.item()
            recon_loss += unlab_loss_dic['reconstruction'].item()
            gauss_loss += unlab_loss_dic['gaussian'].item()
            cat_loss += unlab_loss_dic['categorical'].item()

            # perform backpropagation
            total.backward()
            optimizer.step()

            # save predicted and true labels
            predicted = unlab_loss_dic['predicted_labels']
            true_labels_list.append(labels)
            predicted_labels_list.append(predicted)

            num_batches += 1.

        # average per batch
        total_loss /= num_batches
        recon_loss /= num_batches
        gauss_loss /= num_batches
        cat_loss /= num_batches

        # concat all true and predicted labels
        true_labels = torch.cat(true_labels_list, dim=0).cpu().numpy()
        predicted_labels = torch.cat(predicted_labels_list,
                                     dim=0).cpu().numpy()

        # compute metrics
        accuracy = 100.0 * self.metrics.cluster_acc(predicted_labels,
                                                    true_labels)
        nmi = 100.0 * self.metrics.nmi(predicted_labels, true_labels)

        return total_loss, recon_loss, gauss_loss, cat_loss, accuracy, nmi

    def test(self, data_loader, return_loss=False):
        """Test the model with new data

    Args:
        data_loader: (DataLoader) corresponding loader containing the test/validation data
        return_loss: (boolean) whether to return the average loss values
          
    Return:
        accuracy and nmi for the given test data

    """
        self.network.eval()
        total_loss = 0.
        recon_loss = 0.
        cat_loss = 0.
        gauss_loss = 0.

        accuracy = 0.
        nmi = 0.
        num_batches = 0.

        true_labels_list = []
        predicted_labels_list = []

        with torch.no_grad():
            for data, labels in data_loader:
                if self.cuda == 1:
                    data = data.cuda()

                # flatten data
                data = data.view(data.size(0), -1)

                # forward call
                out_net = self.network(data, self.gumbel_temp,
                                       self.hard_gumbel)
                unlab_loss_dic = self.unlabeled_loss(data, out_net)

                # accumulate values
                total_loss += unlab_loss_dic['total'].item()
                recon_loss += unlab_loss_dic['reconstruction'].item()
                gauss_loss += unlab_loss_dic['gaussian'].item()
                cat_loss += unlab_loss_dic['categorical'].item()

                # save predicted and true labels
                predicted = unlab_loss_dic['predicted_labels']
                true_labels_list.append(labels)
                predicted_labels_list.append(predicted)

                num_batches += 1.

        # average per batch
        if return_loss:
            total_loss /= num_batches
            recon_loss /= num_batches
            gauss_loss /= num_batches
            cat_loss /= num_batches

        # concat all true and predicted labels
        true_labels = torch.cat(true_labels_list, dim=0).cpu().numpy()
        predicted_labels = torch.cat(predicted_labels_list,
                                     dim=0).cpu().numpy()

        # compute metrics
        accuracy = 100.0 * self.metrics.cluster_acc(predicted_labels,
                                                    true_labels)
        nmi = 100.0 * self.metrics.nmi(predicted_labels, true_labels)

        if return_loss:
            return total_loss, recon_loss, gauss_loss, cat_loss, accuracy, nmi
        else:
            return accuracy, nmi

    def train(self, train_loader, val_loader):
        """Train the model

    Args:
        train_loader: (DataLoader) corresponding loader containing the training data
        val_loader: (DataLoader) corresponding loader containing the validation data

    Returns:
        output: (dict) contains the history of train/val loss
    """
        optimizer = optim.Adam(self.network.parameters(),
                               lr=self.learning_rate)
        train_history_acc, val_history_acc = [], []
        train_history_nmi, val_history_nmi = [], []

        for epoch in range(1, self.num_epochs + 1):
            train_loss, train_rec, train_gauss, train_cat, train_acc, train_nmi = self.train_epoch(
                optimizer, train_loader)
            val_loss, val_rec, val_gauss, val_cat, val_acc, val_nmi = self.test(
                val_loader, True)

            # if verbose then print specific information about training
            if self.verbose == 1:
                print("(Epoch %d / %d)" % (epoch, self.num_epochs))
                print("Train - REC: %.5lf;  Gauss: %.5lf;  Cat: %.5lf;" % \
                      (train_rec, train_gauss, train_cat))
                print("Valid - REC: %.5lf;  Gauss: %.5lf;  Cat: %.5lf;" % \
                      (val_rec, val_gauss, val_cat))
                print("Accuracy=Train: %.5lf; Val: %.5lf   NMI=Train: %.5lf; Val: %.5lf   Total Loss=Train: %.5lf; Val: %.5lf" % \
                      (train_acc, val_acc, train_nmi, val_nmi, train_loss, val_loss))
            else:
                print('(Epoch %d / %d) Train_Loss: %.3lf; Val_Loss: %.3lf   Train_ACC: %.3lf; Val_ACC: %.3lf   Train_NMI: %.3lf; Val_NMI: %.3lf' % \
                      (epoch, self.num_epochs, train_loss, val_loss, train_acc, val_acc, train_nmi, val_nmi))

            # decay gumbel temperature
            if self.decay_temp == 1:
                self.gumbel_temp = np.maximum(
                    self.init_temp * np.exp(-self.decay_temp_rate * epoch),
                    self.min_temp)
                if self.verbose == 1:
                    print("Gumbel Temperature: %.3lf" % self.gumbel_temp)

            train_history_acc.append(train_acc)
            val_history_acc.append(val_acc)
            train_history_nmi.append(train_nmi)
            val_history_nmi.append(val_nmi)
        return {
            'train_history_nmi': train_history_nmi,
            'val_history_nmi': val_history_nmi,
            'train_history_acc': train_history_acc,
            'val_history_acc': val_history_acc
        }

    def latent_features(self, data_loader, return_labels=False):
        """Obtain latent features learnt by the model

    Args:
        data_loader: (DataLoader) loader containing the data
        return_labels: (boolean) whether to return true labels or not

    Returns:
       features: (array) array containing the features from the data
    """
        self.network.eval()
        N = len(data_loader.dataset)
        features = np.zeros((N, self.gaussian_size))
        if return_labels:
            true_labels = np.zeros(N, dtype=np.int64)
        start_ind = 0
        with torch.no_grad():
            for (data, labels) in data_loader:
                if self.cuda == 1:
                    data = data.cuda()
                # flatten data
                data = data.view(data.size(0), -1)
                out = self.network.inference(data, self.gumbel_temp,
                                             self.hard_gumbel)
                latent_feat = out['mean']
                end_ind = min(start_ind + data.size(0), N + 1)

                # return true labels
                if return_labels:
                    true_labels[start_ind:end_ind] = labels.cpu().numpy()
                features[start_ind:end_ind] = latent_feat.cpu().detach().numpy(
                )
                start_ind += data.size(0)
        if return_labels:
            return features, true_labels
        return features

    def reconstruct_data(self, data_loader, sample_size=-1):
        """Reconstruct Data

    Args:
        data_loader: (DataLoader) loader containing the data
        sample_size: (int) size of random data to consider from data_loader
      
    Returns:
        reconstructed: (array) array containing the reconstructed data
    """
        self.network.eval()

        # sample random data from loader
        indices = np.random.randint(0,
                                    len(data_loader.dataset),
                                    size=sample_size)
        test_random_loader = torch.utils.data.DataLoader(
            data_loader.dataset,
            batch_size=sample_size,
            sampler=SubsetRandomSampler(indices))

        # obtain values
        it = iter(test_random_loader)
        test_batch_data, _ = next(it)  # Python 3: iterators have no .next() method
        original = test_batch_data.data.numpy()
        if self.cuda:
            test_batch_data = test_batch_data.cuda()

        # obtain reconstructed data
        out = self.network(test_batch_data, self.gumbel_temp, self.hard_gumbel)
        reconstructed = out['x_rec']
        return original, reconstructed.data.cpu().numpy()

    def plot_latent_space(self, data_loader, save=False):
        """Plot the latent space learnt by the model

    Args:
        data: (array) corresponding array containing the data
        labels: (array) corresponding array containing the labels
        save: (bool) whether to save the latent space plot

    Returns:
        fig: (figure) plot of the latent space
    """
        # obtain the latent features together with the true labels used to color the plot
        features, labels = self.latent_features(data_loader, return_labels=True)

        # plot only the first 2 dimensions
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(features[:, 0],
                    features[:, 1],
                    c=labels,
                    marker='o',
                    edgecolor='none',
                    cmap=plt.cm.get_cmap('jet', 10),
                    s=10)
        plt.colorbar()
        if (save):
            fig.savefig('latent_space.png')
        return fig

    def random_generation(self, num_elements=1):
        """Random generation for each category

    Args:
        num_elements: (int) number of elements to generate

    Returns:
        generated data according to num_elements
    """
        # categories for each element
        arr = np.array([])
        for i in range(self.num_classes):
            arr = np.hstack([arr, np.ones(num_elements) * i])
        indices = arr.astype(int).tolist()

        categorical = F.one_hot(torch.tensor(indices),
                                self.num_classes).float()

        if self.cuda:
            categorical = categorical.cuda()

        # infer the gaussian distribution according to the category
        mean, var = self.network.generative.pzy(categorical)

        # gaussian random sample by using the mean and variance
        noise = torch.randn_like(var)
        std = torch.sqrt(var)
        gaussian = mean + noise * std

        # generate new samples with the given gaussian
        generated = self.network.generative.pxz(gaussian)

        return generated.cpu().detach().numpy()
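# Not from the original source: a minimal usage sketch for the GMVAE class above, using a
# hypothetical argument namespace. All field values are placeholders; the real training
# script defines them via argparse, and train_loader/val_loader are assumed to exist.
from argparse import Namespace

gmvae_args = Namespace(
    epochs=10, cuda=0, verbose=1,
    batch_size=64, batch_size_val=200, learning_rate=1e-3,
    decay_epoch=-1, lr_decay=0.5,
    w_categ=1.0, w_gauss=1.0, w_rec=1.0, rec_type='bce',
    num_classes=10, gaussian_size=64, input_size=784,
    init_temp=1.0, decay_temp=1, hard_gumbel=0, min_temp=0.5, decay_temp_rate=0.013,
)
gmvae = GMVAE(gmvae_args)
# history = gmvae.train(train_loader, val_loader)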
Example #18
 def setUp(self):
     self.metrics = Metrics()
     self.tile = Tile()
Example #19
def main(args):
    assert args.arch_gcn in ['firstchebnet'], '[ERROR] Architecture not implemented!'
    assert args.dataset == 'mit67', '[ERROR] Dataset not supported yet!'
    
    obj = args.storage + '/graph.bin'
    # load and preprocess dataset
    if not os.path.isfile(obj):
        print('Graph not found!')
        print('Creating graph...')
        gh = GNNHandler('dataset', args.pretrained_cnn)
        graph = gh.build_graph()
        gh.save_graph(obj, graph)
        obj = graph
        
    g, features, labels, train_mask, val_mask, test_mask, in_feats, n_classes = GNNHandler.get_info_from_graph(obj)
    n_edges = g.number_of_edges()
    
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.sum().item(),
              val_mask.sum().item(),
              test_mask.sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    # add self loop
    # if args.self_loop:
    #     g.remove_edges_from(g.selfloop_edges())
    #     g.add_edges_from(zip(g.nodes(), g.nodes()))
    
    n_edges = g.number_of_edges()
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    if args.arch_gcn == 'firstchebnet':
        model = FirstChebNet(g,
                            in_feats,
                            args.n_hidden,
                            n_classes,
                            args.n_layers,
                            F.relu,
                            args.dropout)
    
    else:
        print('ARCHITECTURE NOT IMPLEMENT! EXITING...')
        exit(1)

    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    mtc = Metrics(features, labels, val_mask, backend='pytorch')
    # initialize graph
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        acc = mtc.evaluate(model)
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss.item(),
                                             acc, n_edges / np.mean(dur) / 1000))

    print("Test Accuracy {:.2%}".format(mtc.evaluate(model)))
    
    mtc.save_metrics(args.save_path, model)
    torch.save(model.state_dict(), args.save_path + '/cp.pt')
    
    return mtc, model
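# Not part of the original example: a hedged sketch of what mtc.evaluate(model) might
# compute for this node-classification setup, assuming accuracy over the mask the
# Metrics object was constructed with (the real Metrics class is not shown here).
import torch

def evaluate_sketch(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        predictions = logits[mask].argmax(dim=1)
        correct = (predictions == labels[mask]).sum().item()
        return correct / mask.sum().item()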
Example #20
def validate(localizer,
             adversarial,
             dataloader,
             experiment_directory,
             labels,
             segmentation_map_threshold,
             num_classes,
             evaluate=False,
             save_results=False):
    """ Loop over the validation set (in batches) to acquire relevant metrics """
    print('Validating...')
    if evaluate:
        metrics = Metrics(20)
    localizer_criterion = torch.nn.BCELoss()
    adversarial_criterion = torch.nn.BCELoss()
    localizer_loss_meter = AverageMeter()
    adversarial_loss_meter = AverageMeter()
    for i, (inputs, targets) in enumerate(dataloader):
        if evaluate:
            # Segmentation maps are included in the targets
            targets, segmentation_maps = targets
        else:
            segmentation_maps = None

        if torch.cuda.is_available():
            inputs, targets = inputs.cuda(), targets.cuda()

        output, gcams = localizer(inputs, labels=targets)

        loss = localizer_criterion(output, targets)
        localizer_loss_meter.update(loss.item())

        gcams, new_images, new_targets, original_targets = gcams

        if adversarial is not None or save_results:
            new_batch_size = gcams.size(0)
            masks = gcam_to_mask(gcams)

            masked_image = erase_mask(new_images, masks)
            if adversarial is not None:
                adversarial_output = adversarial(masked_image)
                adversarial_output = torch.sigmoid(adversarial_output)
                adversarial_loss = adversarial_criterion(
                    adversarial_output, original_targets)
                adversarial_loss_meter.update(adversarial_loss.item())

            if save_results:
                for k in range(new_batch_size):
                    number = f'{i * new_batch_size + k}'  #TODO: fix
                    label_string = labels[new_targets[k]]
                    file_postfix = f'{number}_{label_string}'
                    save_location = os.path.join(
                        experiment_directory, f'heatmap_{file_postfix}.png')
                    save_gradcam(filename=save_location,
                                 gcam=gcams[k, 0].detach(),
                                 raw_image=new_images[k].clone())
                    save_location = os.path.join(
                        experiment_directory,
                        f'raw_heatmap_{file_postfix}.png')
                    save_gradcam(filename=save_location,
                                 gcam=gcams[k, 0].detach())
                    save_location = os.path.join(experiment_directory,
                                                 f'erased_{file_postfix}.png')
                    tensor2imwrite(save_location, denormalize(masked_image[k]))

        if evaluate:
            # Generate and visualize predicted segmentation map
            predicted_segmentation_maps = generate_segmentation_map(
                gcams,
                num_classes,
                segmentation_maps.shape[1:],
                new_targets,
                threshold=segmentation_map_threshold)
            metrics.update(predicted_segmentation_maps, segmentation_maps)

            if save_results:
                predicted_indices = predicted_segmentation_maps.unique()
                all_labels = ['background', *labels]
                predicted_labels = [
                    all_labels[idx] for idx in predicted_indices
                ]
                labels_string = '_'.join(predicted_labels)
                filename = f'map_{i:04d}_{labels_string}.png'
                save_location = os.path.join(experiment_directory, filename)
                save_segmentation_map(save_location,
                                      predicted_segmentation_maps,
                                      denormalize(new_images[k]).clone())
                filename = f'map_raw_{i:04d}_{labels_string}.png'
                save_location = os.path.join(experiment_directory, filename)
                save_segmentation_map(save_location,
                                      predicted_segmentation_maps)

    print('Validation localizer loss:', localizer_loss_meter.avg)
    print('Validation adversarial loss:', adversarial_loss_meter.avg)

    if evaluate:
        miou = metrics.miou().item()
        precision = metrics.precision(skip_background=True).item()
        recall = metrics.recall(skip_background=True).item()
        metrics.print_scores_per_class()
        print('mIoU:', miou)
        print('precision:', precision)
        print('recall:', recall)
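# Not part of the original example: a hedged sketch of a mean-IoU computation over
# per-class intersection and union counts, which is presumably what metrics.miou()
# reports for the predicted vs. ground-truth segmentation maps (the real Metrics
# implementation is not shown here).
def miou_sketch(predicted, target, num_classes):
    ious = []
    for c in range(num_classes):
        pred_c = (predicted == c)
        target_c = (target == c)
        union = (pred_c | target_c).sum().item()
        if union == 0:
            continue  # class absent from both maps, skip it
        intersection = (pred_c & target_c).sum().item()
        ious.append(intersection / union)
    return sum(ious) / len(ious) if ious else 0.0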
Example #21
 def __init__(self, interface):
     Thread.__init__(self)
     self.stop_event = Event()
     self.interface = interface
     self.metrics = Metrics()
Example #22
import json

from flask import Flask, request  # used below but not imported in the original snippet

import mongo_structure as mdb
from metrics_util import *   # assumed to provide Connection (otherwise import it explicitly, e.g. from mongokit)
from metrics import Metrics
from alldayplay import AllDayPlay

# Create the application object
app = Flask(__name__)
app.config.from_pyfile('yr_metrics_api.cfg', silent=False)

# Connect to the database
db = Connection(app.config["MONGODB_HOST"], app.config["MONGODB_PORT"])

# Set up API methods
metrics = Metrics(request=request, database_connection=db)
adp = AllDayPlay(request=request, database_connection=db)

# Mongo Schema. These objects all live in the mongo_structure import.
db.register(mdb.RootDocument)
db.register(mdb.Event)
db.register(mdb.Count)

# Add an event to the logging table.
app.add_url_rule('/event/<func>', 'event_add_or_touch', lambda func: metrics.addOrTouchEvent(func), methods=["GET", "POST"])

# AllDayPlay Metrics.
app.add_url_rule('/adp/songs/played', 'adp_last_songs_played', adp.lastSongsPlayed, methods=["GET"])
app.add_url_rule('/adp/songs/total', 'adp_total_songs_played', adp.totalSongsPlayed, methods=["GET"])
app.add_url_rule('/adp/sessions/current', 'adp_current_num_sessions', adp.currentNumberOfListeningSessions, methods=["GET"])
app.add_url_rule('/adp/sessions/bounced', 'adp_total_sessions_bounced', adp.totalSessionsBounced, methods=["GET"])
Example #23
 def get_instantaneous_throughput(self):
     return Metrics.instantaneousThroughput()
Example #24
					#Remove first row
					targetByClass = np.delete(targetByClass, (0), axis=0)

					test_mse = test_mse/float(len(predictions))

					"""
					PLOT AND CALCULATE METRICS
					"""

					pos_len = len(base['testing']['data'][base['testing']['target']==1])
					neg_len = len(base['testing']['data'][base['testing']['target']==0])
					confusion_matrix_percentage = calc_confusion_matrix(vp,fp,fn,vn,pos_len,neg_len)

					#Confusion Matrix
					Metrics.plot_confusion_matrix(confusion_matrix_percentage, configDir)

					#MSE (Training and Validation)
					Metrics.plot_mse_curve(np.array(error_train), np.array(error_valid), configDir)

					#Area Under ROC Curve
					roc_area = Metrics.plot_roc_curve(targetByClass, prob_predictions, configDir)

					#Accuracy
					accuracy = ((len(base['testing']['data'])-errors_total)/len(base['testing']['data']))*100

					print("accuracy:", accuracy,'%')
					print('errors',errors_total,'of', len(base['testing']['data']))
					
					configDesc = {'opt_samp':opt_samp.name, 'opt_learning':opt_learning, 'activation_function_options':opt_actvfunc, 'topology_options':opt_top}
Example #25
        # os.path.join('.', 'results_cifar10_pot_conv', 'checkpoints', 'trained-pot-1.meta'))
        os.path.join(ckpt_dir, 'trained-pot-126480.meta'))
    saver.restore(sess, os.path.join(ckpt_dir, 'trained-pot-126480'))
    # saver.restore(sess, os.path.join('.', 'results_cifar10_pot_conv', 'checkpoints', 'trained-pot-1'))
    noise_ph = tf.get_collection('noise_ph')[0]
    bn_ph = tf.get_collection('is_training_ph')[0]
    decoder = tf.get_collection('decoder')[0]

    mean = np.zeros(z_dim)
    cov = np.identity(z_dim)
    noise = pz_std * np.random.multivariate_normal(
        mean, cov, 16 * num_cols).astype(np.float32)

    # 1. Random samples
    res = sess.run(decoder, feed_dict={noise_ph: noise, bn_ph: False})
    metrics = Metrics()
    opts = {}
    opts['dataset'] = dataset
    opts['input_normalize_sym'] = normalyze
    opts['work_dir'] = output_dir
    metrics.make_plots(opts, 0, None, res, prefix='samples')

    # #2. Interpolations
    # ids = np.random.choice(16 * num_cols, num_pairs, replace=False)
    # for i in range(len(ids)):
    #     for j in range(i + 1, len(ids)):
    #         id1, id2 = ids[i], ids[j]
    #         a = np.reshape(noise[id1, :], (1, z_dim))
    #         b = np.reshape(noise[id2, :], (1, z_dim))
    #         _lambda = np.linspace(0., 1., 60)
    #         _lambda = np.reshape(_lambda, (60, 1))
Example #26
if layer == "LSTM":
    model.add(LSTM(units=rnn_units, activation=rnn_activation))
elif layer == "GRU":
    model.add(GRU(units=rnn_units, activation=rnn_activation))
else:
    print("ERROR: Invalid layer", layer)
    exit(1)
model.add(Dense(y.shape[1], activation=dense_activation))
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

print(model.summary())

# Filename: [layer]_1-layer_A-timesteps_B-[activation]-units_[optimizer]_[loss]_D-batch-size_[epoch]_[loss]_[accuracy]_[vLoss]_[vAccuracy].hdf5
filename = weights_dir+"/"+layer+"_1-layer_"+str(timesteps)+"-timesteps_"+str(rnn_units)+"-"+rnn_activation+"-units_"+\
        optimizer+"_"+loss+"_"+str(batch_size)+"-batch-size_{epoch:03d}_{loss:.4f}_{acc:.4f}_{val_loss:.4f}_{val_acc:.4f}.hdf5"

callback_metric = Metrics(trainX, trainY)
checkpoint = ModelCheckpoint(filename,
                             monitor='loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min')
checkpoint = [checkpoint, callback_metric]
if "dataset1" in dataset:
    checkpoint = [callback_metric]

history = model.fit(x=trainX,
                    y=trainY,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=verbosity,
                    validation_data=(validationX, validationY),
Example #27
 def get_average_throughput(self):
     return Metrics.average_throughput()
Example #28
def main():
    global args
    args = parse_args()

    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    print(args)
    train_dir = glob.glob(os.path.join(args.data, 'train/holistic/*.pt'))
    dev_dir = glob.glob(os.path.join(args.data, 'val/holistic/*.pt'))
    test_dir = glob.glob(os.path.join(args.data, 'test/holistic/*.pt'))

    train_dataset = Dataset(os.path.join(args.data, 'train'), train_dir)
    dev_dataset = Dataset(os.path.join(args.data, 'val'), dev_dir)
    test_dataset = Dataset(os.path.join(args.data, 'test'), test_dir)

    print('==> Size of train data   : %d ' % len(train_dataset))
    print('==> Size of val data   : %d ' % len(dev_dataset))
    print('==> Size of test data   : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    if args.pretrained_model == 'vgg16':
        pretrained_vgg16 = models.vgg16(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_vgg16.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_vgg16.DocClassificationHolistic(
                args, pretrained_vgg16)
        elif args.pretrained_holistic == 1:
            pretrained_orig_vgg16 = model_vgg16.DocClassificationHolistic(
                args, pretrained_vgg16)
            pretrained_holistic = model_vgg16.DocClassificationHolistic(
                args, pretrained_orig_vgg16.pretrained_model)
            checkpoint = torch.load('./checkpoints/vgg16.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_vgg16.DocClassificationRest(args,
                                                      pretrained_orig_vgg16,
                                                      pretrained_holistic)

    elif args.pretrained_model == 'vgg19':
        pretrained_vgg19 = models.vgg19(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_vgg19.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_vgg19.DocClassificationHolistic(
                args, pretrained_vgg19)
        elif args.pretrained_holistic == 1:
            pretrained_orig_vgg19 = model_vgg19.DocClassificationHolistic(
                args, pretrained_vgg19)
            pretrained_holistic = model_vgg19.DocClassificationHolistic(
                args, pretrained_orig_vgg19.pretrained_model)
            checkpoint = torch.load('./checkpoints/vgg19.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_vgg19.DocClassificationRest(args,
                                                      pretrained_orig_vgg19,
                                                      pretrained_holistic)

    elif args.pretrained_model == 'resnet50':
        pretrained_resnet50 = models.resnet50(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_resnet50.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_resnet50.DocClassificationHolistic(
                args, pretrained_resnet50)
        elif args.pretrained_holistic == 1:
            pretrained_orig_resnet50 = model_resnet50.DocClassificationHolistic(
                args, pretrained_resnet50)
            pretrained_holistic = model_resnet50.DocClassificationHolistic(
                args, pretrained_orig_resnet50.pretrained_model)
            checkpoint = torch.load('./checkpoints/resnet50.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_resnet50.DocClassificationRest(
                args, pretrained_orig_resnet50, pretrained_holistic)

    elif args.pretrained_model == 'densenet121':
        pretrained_densenet121 = models.densenet121(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_densenet121.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_densenet121.DocClassificationHolistic(
                args, pretrained_densenet121)
        elif args.pretrained_holistic == 1:
            pretrained_orig_densenet121 = model_densenet121.DocClassificationHolistic(
                args, pretrained_densenet121)
            pretrained_holistic = model_densenet121.DocClassificationHolistic(
                args, pretrained_orig_densenet121.pretrained_model)
            checkpoint = torch.load('./checkpoints/densenet121.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_densenet121.DocClassificationRest(
                args, pretrained_orig_densenet121, pretrained_holistic)

    elif args.pretrained_model == 'inceptionv3':
        pretrained_inceptionv3 = models.inception_v3(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_inceptionv3.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_inceptionv3)
        elif args.pretrained_holistic == 1:
            pretrained_orig_inceptionv3 = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_inceptionv3)
            pretrained_holistic = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_orig_inceptionv3.pretrained_model)
            checkpoint = torch.load('./checkpoints/inceptionv3.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_inceptionv3.DocClassificationRest(
                args, pretrained_orig_inceptionv3, pretrained_holistic)

    criterion = nn.CrossEntropyLoss(reduction='sum')

    parameters = filter(lambda p: p.requires_grad, model.parameters())

    if args.cuda:
        model.cuda(), criterion.cuda()

    if args.optim == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(parameters,
                                   lr=args.lr,
                                   weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    train_idx = list(np.arange(len(train_dataset)))
    dev_idx = list(np.arange(len(dev_dataset)))
    test_idx = list(np.arange(len(test_dataset)))

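    # Track the lowest test loss seen so far; an improvement triggers a checkpoint, otherwise the early-stop counter grows.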
    best = float('inf')
    columns = ['ExpName', 'ExpNo', 'Epoch', 'Loss', 'Accuracy']
    results = []
    early_stop_count = 0

    for epoch in range(args.epochs):

        train_loss = 0.0
        dev_loss = 0.0
        test_loss = 0.0

        train_predictions = []
        train_labels = []

        dev_predictions = []
        dev_labels = []

        test_predictions = []
        test_labels = []

        random.shuffle(train_idx)
        random.shuffle(dev_idx)
        random.shuffle(test_idx)

        batch_train_data = [
            train_idx[i:i + args.batchsize]
            for i in range(0, len(train_idx), args.batchsize)
        ]
        batch_dev_data = [
            dev_idx[i:i + args.batchsize]
            for i in range(0, len(dev_idx), args.batchsize)
        ]
        batch_test_data = [
            test_idx[i:i + args.batchsize]
            for i in range(0, len(test_idx), args.batchsize)
        ]

        for batch in tqdm(batch_train_data, desc='Training batches..'):
            train_batch_holistic, \
            train_batch_header, \
            train_batch_footer, \
            train_batch_left_body, \
            train_batch_right_body, \
            train_batch_labels = train_dataset[batch]

            if args.pretrained_holistic == 0:
                _ = trainer.train_holistic(train_batch_holistic,
                                           train_batch_labels)
            elif args.pretrained_holistic == 1:
                _ = trainer.train_rest(train_batch_holistic, \
                                        train_batch_header, \
                                        train_batch_footer, \
                                        train_batch_left_body, \
                                        train_batch_right_body, \
                                        train_batch_labels)

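        # Second pass over the training batches to collect loss and predictions for the epoch's training metrics.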
        for batch in tqdm(batch_train_data, desc='Training batches..'):
            train_batch_holistic, \
            train_batch_header, \
            train_batch_footer, \
            train_batch_left_body, \
            train_batch_right_body, \
            train_batch_labels = train_dataset[batch]

            if args.pretrained_holistic == 0:
                train_batch_loss, train_batch_predictions, train_batch_labels = trainer.test_holistic(
                    train_batch_holistic, train_batch_labels)
            elif args.pretrained_holistic == 1:
                train_batch_loss, train_batch_predictions, train_batch_labels = trainer.test_rest(train_batch_holistic, \
                                                                                            train_batch_header, \
                                                                                            train_batch_footer, \
                                                                                            train_batch_left_body, \
                                                                                            train_batch_right_body, \
                                                                                            train_batch_labels)

            train_predictions.append(train_batch_predictions)
            train_labels.append(train_batch_labels)
            train_loss = train_loss + train_batch_loss

        train_accuracy = metrics.accuracy(np.concatenate(train_predictions),
                                          np.concatenate(train_labels))

        for batch in tqdm(batch_dev_data, desc='Dev batches..'):
            dev_batch_holistic, \
            dev_batch_header, \
            dev_batch_footer, \
            dev_batch_left_body, \
            dev_batch_right_body, \
            dev_batch_labels = dev_dataset[batch]

            if args.pretrained_holistic == 0:
                dev_batch_loss, dev_batch_predictions, dev_batch_labels = trainer.test_holistic(
                    dev_batch_holistic, dev_batch_labels)
            elif args.pretrained_holistic == 1:
                dev_batch_loss, dev_batch_predictions, dev_batch_labels = trainer.test_rest(dev_batch_holistic, \
                                                                                        dev_batch_header, \
                                                                                        dev_batch_footer, \
                                                                                        dev_batch_left_body, \
                                                                                        dev_batch_right_body, \
                                                                                        dev_batch_labels)

            dev_predictions.append(dev_batch_predictions)
            dev_labels.append(dev_batch_labels)
            dev_loss = dev_loss + dev_batch_loss

        dev_accuracy = metrics.accuracy(np.concatenate(dev_predictions),
                                        np.concatenate(dev_labels))

        for batch in tqdm(batch_test_data, desc='Test batches..'):
            test_batch_holistic, \
            test_batch_header, \
            test_batch_footer, \
            test_batch_left_body, \
            test_batch_right_body, \
            test_batch_labels = test_dataset[batch]

            if args.pretrained_holistic == 0:
                test_batch_loss, test_batch_predictions, test_batch_labels = trainer.test_holistic(
                    test_batch_holistic, test_batch_labels)
            elif args.pretrained_holistic == 1:
                test_batch_loss, test_batch_predictions, test_batch_labels = trainer.test_rest(test_batch_holistic, \
                                                                                        test_batch_header, \
                                                                                        test_batch_footer, \
                                                                                        test_batch_left_body, \
                                                                                        test_batch_right_body, \
                                                                                        test_batch_labels)

            test_predictions.append(test_batch_predictions)
            test_labels.append(test_batch_labels)
            test_loss = test_loss + test_batch_loss

        test_accuracy = metrics.accuracy(np.concatenate(test_predictions),
                                         np.concatenate(test_labels))

        print('==> Training Epoch: %d,\nLoss: %f,\nAccuracy: %f'
              % (epoch + 1,
                 train_loss / (len(batch_train_data) * args.batchsize),
                 train_accuracy))
        print('==> Dev Epoch: %d,\nLoss: %f,\nAccuracy: %f'
              % (epoch + 1,
                 dev_loss / (len(batch_dev_data) * args.batchsize),
                 dev_accuracy))

        print('==> Test Epoch: %d,\nLoss: %f,\nAccuracy: %f'
              % (epoch + 1,
                 test_loss / (len(batch_test_data) * args.batchsize),
                 test_accuracy))
        #quit()
        results.append((args.expname, \
                        args.expno, \
                        epoch+1, \
                        test_loss/(len(batch_test_data) * args.batchsize), \
                        test_accuracy))

        if best > test_loss:
            best = test_loss
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'loss': test_loss,
                'accuracy': test_accuracy,
                'args': args,
                'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint,
                       '%s.pt' % os.path.join(args.save, args.expname))
            #np.savetxt("test_pred.csv", test_pred.numpy(), delimiter=",")
        else:
            early_stop_count = early_stop_count + 1

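            # Early stopping: abort after 20 consecutive epochs without an improvement in test loss.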
            if early_stop_count == 20:
                quit()
Example #29
0
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
                vocab.size(),
                args.input_dim,
                args.mem_dim,
                args.hidden_dim,
                args.num_classes,
                args.sparse,
                args.freeze_embed)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss             = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred     = trainer.test(dev_dataset)
        test_loss, test_pred   = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(), 
                'optim': trainer.optimizer,
                'pearson': test_pearson, 'mse': test_mse,
                'args': args, 'epoch': epoch
                }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
Example #30
0
File: node.py Project: sorinros/cxm
 def get_metrics(self):
     """Return the metrics instance of this node."""
     if self.__metrics is None:
         self.__metrics = Metrics(self)
     return self.__metrics
Example #31
0
class MyTestCase(unittest.TestCase):
    def setUp(self):
        self.labels_pred = {
            0: 1,
            1: 0,
            2: 1,
            3: 1,
            4: 1,
            5: 1,
            6: 1,
            7: 0,
            8: 0,
            9: 0,
            10: 0,
            11: 2,
            12: 1,
            13: 2,
            14: 1,
            15: 2,
            16: 2}
        self.labels_true = {
            0: 0,
            1: 0,
            2: 0,
            3: 0,
            4: 0,
            5: 0,
            6: 1,
            7: 1,
            8: 1,
            9: 1,
            10: 1,
            11: 1,
            12: 2,
            13: 2,
            14: 2,
            15: 2,
            16: 2}
        self._n = len(self.labels_pred)
        self.metrics = Metrics(self.labels_true, self.labels_pred)

    def test_pairwise_precision_recall_f1(self):
        precision, recall, f1 = self.metrics._pairwise_precision_recall_f1()
        self.assertEqual(precision, 20.0/44)
        self.assertEqual(recall, 20.0/40)
        self.assertEqual(f1, 10.0/21)

    def test_cluster_precision_recall_f1(self):
        precision, recall, f1 = self.metrics._cluster_precision_recall_f1()
        self.assertEqual(precision, 0)
        self.assertEqual(recall, 0)
        self.assertEqual(f1, 0)

    def test_closest_cluster_precision_recall_f1(self):
        precision, recall, f1 = self.metrics._closest_cluster_precision_recall_f1()
        x = (4.0/7.0 + 5.0/9.0 + 3.0/6.0)/3.0
        self.assertEqual(precision, x)
        self.assertEqual(recall, x)
        self.assertEqual(f1, 2*x*x/(x+x))

    def test_average_author_cluster_purity(self):
        aap = 149.0/255
        acp = 193.0/340
        average_author_purity, average_cluster_purity, k = self.metrics._average_author_cluster_purity()
        self.assertEqual(average_author_purity, aap)
        self.assertEqual(average_cluster_purity, acp)
        self.assertEqual(k, (aap*acp)**0.5)

    def test_homogeneity_completeness_vmeasure(self):
        labels_true, labels_pred = _linearize(self.labels_true, self.labels_pred)
        sk_homogeneity, sk_completeness, sk_vmeasure = skmetrics.homogeneity_completeness_v_measure(labels_true,
                                                                                                    labels_pred)
        homogeneity, completeness, vmeasure = self.metrics._homogeneity_completeness_vmeasure(1)
        self.assertEqual(homogeneity, sk_homogeneity)
        self.assertEqual(completeness, sk_completeness)
        self.assertEqual(sk_vmeasure, vmeasure)

    def test_cluster(self):
        clusters = frozenset({frozenset({0, 1, 2, 3, 4, 5}), frozenset({6, 7, 8, 9, 10, 11}),
                              frozenset({12, 13, 14, 15, 16})})

        self.assertSetEqual(_cluster(self.labels_true), clusters)

    def test_intersection_size(self):
        self.assertEqual(_intersection_size(_cluster(self.labels_true), _cluster(self.labels_pred)), 20)

    def test_number_pairs(self):
        self.assertEqual(_number_pairs(_cluster(self.labels_true)), 40)

    def test_jaccard(self):
        set1 = {1, 2, 3}
        set2 = {3, 4, 5}
        set3 = {6}
        self.assertEqual(_jaccard(set1, set2), 1.0/5)
        self.assertEqual(_jaccard(set1, set3), 0)

    def test_global_merge_distance(self):
        """
        This tests GMD using the relationship to other properties specified in the original paper
        """
        fs = lambda x, y: x*y
        fm = lambda x, y: 0
        independent = frozenset({frozenset({0}), frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}),
                                 frozenset({5}), frozenset({6}), frozenset({7}), frozenset({8}), frozenset({9}),
                                 frozenset({10}), frozenset({11}), frozenset({12}), frozenset({13}), frozenset({14}),
                                 frozenset({15}), frozenset({16})})
        gmd_pairwise_precision = 1 - self.metrics.global_merge_distance(fs, fm)/self.metrics.global_merge_distance(fs, fm, S=independent)
        fs = lambda x, y: 0
        fm = lambda x, y: x*y
        gmd_pairwise_recall = 1 - self.metrics.global_merge_distance(fs, fm)/self.metrics.global_merge_distance(fs, fm, R=independent)
        pairwise_precision, pairwise_recall, f1 = self.metrics._pairwise_precision_recall_f1()
        self.assertAlmostEqual(gmd_pairwise_precision, pairwise_precision)
        self.assertAlmostEqual(gmd_pairwise_recall, pairwise_recall)

    def test_mutual_information(self):
        labels_true, labels_pred = _linearize(self.labels_true, self.labels_pred)
        mi = self.metrics._mutual_information(self.metrics._clusters_pred, self.metrics._clusters_true)
        self.assertAlmostEqual(mi, skmetrics.mutual_info_score(labels_true, labels_pred))

    def test_variation_of_information(self):
        vi = self.metrics._variation_of_information()
        h = lambda x: float(x)/self._n*math.log(float(x)/self._n)
        fs = lambda x, y: h(x+y) - h(x) - h(y)
        fm = fs
        gmd_vi = self.metrics.global_merge_distance(fs, fm)
        self.assertEqual(vi, gmd_vi)

    def test_purity(self):
        purity = self.metrics._purity()
        self.assertEqual(purity, 12.0/17)

    def test_entity_sizes(self):
        sizes_true = np.array([[5, 1],
                              [6, 2]])
        sizes_pred = np.array([[4, 1],
                               [5, 1],
                               [8, 1]])
        self.assertTrue(np.array_equal(sizes_true, self.metrics._entity_sizes_true))
        self.assertTrue(np.array_equal(sizes_pred, self.metrics._entity_sizes_pred))

    def test_plot(self):
        self.metrics.display()
Example #32
0
class CTTTrainer(TensorboardMixin, WandBMixin, IOMixin, BaseExperiment):
    WANDB_PROJECT = "ctt"

    def __init__(self):
        super(CTTTrainer, self).__init__()
        self.auto_setup()
        self._build()

    def _build(self):
        self._build_loaders()
        self._build_model()
        self._build_criteria_and_optim()
        self._build_scheduler()

    def _build_model(self):
        self.model: nn.Module = to_device(
            ContactTracingTransformer(**self.get("model/kwargs", {})),
            self.device)

    def _build_loaders(self):
        train_path = self.get("data/paths/train", ensure_exists=True)
        validate_path = self.get("data/paths/validate", ensure_exists=True)
        self.train_loader = get_dataloader(path=train_path,
                                           **self.get("data/loader_kwargs",
                                                      ensure_exists=True))
        self.validate_loader = get_dataloader(path=validate_path,
                                              **self.get("data/loader_kwargs",
                                                         ensure_exists=True))

    def _build_criteria_and_optim(self):
        # noinspection PyArgumentList
        self.loss = WeightedSum.from_config(
            self.get("losses", ensure_exists=True))
        optim_cls = getattr(opts, self.get("optim/name", "Adam"))
        self.optim = optim_cls(self.model.parameters(),
                               **self.get("optim/kwargs"))
        self.metrics = Metrics()

    def _build_scheduler(self):
        # Set up an epoch-wise scheduler here if you want to, but the
        # recommendation is to use the one defined in opts.
        self.scheduler = None
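        # A minimal sketch (an assumption, not part of this codebase) of what an
        # epoch-wise scheduler could look like here, e.g.:
        #   self.scheduler = torch.optim.lr_scheduler.StepLR(
        #       self.optim, step_size=10, gamma=0.1)
        # step_scheduler() below would then advance it once per epoch.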

    @property
    def device(self):
        return self.get("device",
                        "cuda" if torch.cuda.is_available() else "cpu")

    @register_default_dispatch
    def train(self):
        if self.get("wandb/use", True):
            self.initialize_wandb()
        for epoch in self.progress(range(
                self.get("training/num_epochs", ensure_exists=True)),
                                   tag="epochs"):
            self.train_epoch()
            validation_stats = self.validate_epoch()
            self.checkpoint()
            self.log_progress("epochs", **validation_stats)
            self.step_scheduler(epoch)
            self.next_epoch()

    def train_epoch(self):
        self.clear_moving_averages()
        self.model.train()
        for model_input in self.progress(self.train_loader, tag="train"):
            # Evaluate model
            model_input = to_device(model_input, self.device)
            model_output = Dict(self.model(model_input))
            # Compute loss
            losses = self.loss(model_input, model_output)
            loss = losses.loss
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            # Log to wandb (if required)
            self.log_training_losses(losses)
            self.log_learning_rates()
            # Log to pbar
            self.accumulate_in_cache("moving_loss", loss.item(),
                                     momentum_accumulator(0.9))
            self.log_progress(
                "train",
                loss=self.read_from_cache("moving_loss"),
            )
            self.next_step()

    def validate_epoch(self):
        all_losses_and_metrics = defaultdict(list)
        self.metrics.reset()
        self.model.eval()
        for model_input in self.progress(self.validate_loader,
                                         tag="validation"):
            with torch.no_grad():
                model_input = to_device(model_input, self.device)
                model_output = Dict(self.model(model_input))
                losses = self.loss(model_input, model_output)
                self.metrics.update(model_input, model_output)
                all_losses_and_metrics["loss"].append(losses.loss.item())
                for key in losses.unweighted_losses:
                    all_losses_and_metrics[key].append(
                        losses.unweighted_losses[key].item())
        # Compute mean for all losses
        all_losses_and_metrics = Dict(
            {key: np.mean(val)
             for key, val in all_losses_and_metrics.items()})
        all_losses_and_metrics.update(Dict(self.metrics.evaluate()))
        self.log_validation_losses_and_metrics(all_losses_and_metrics)
        # Store the validation loss in cache. This will be used for checkpointing.
        self.write_to_cache("current_validation_metrics",
                            all_losses_and_metrics)
        return all_losses_and_metrics

    def log_training_losses(self, losses):
        if self.log_wandb_now and self.get("wandb/use", False):
            metrics = Dict({"training_loss": losses.loss})
            metrics.update({
                f"training_{k}": v
                for k, v in losses.unweighted_losses.items()
            })
            self.wandb_log(**metrics)
        if self.log_scalars_now:
            for key, value in losses.unweighted_losses.items():
                self.log_scalar(f"training/{key}", value)
        return self

    def checkpoint(self, force=False):
        # Checkpoint as required
        if force or self.epoch % self.get("training/checkpoint/every", 1) == 0:
            info_dict = {
                "model": self.model.state_dict(),
                "optim": self.optim.state_dict(),
            }
            torch.save(info_dict, self.checkpoint_path)
        if self.get("training/checkpoint/if_best", True):
            # Save a checkpoint if the validation loss is better than best
            self.checkpoint_if_best_validation_loss()
        return self

    def checkpoint_if_best_validation_loss(self):
        current_validation_loss = self.read_from_cache(
            "current_validation_loss", float("inf"))
        best_validation_loss = self.read_from_cache("best_validation_loss",
                                                    float("inf"))
        if current_validation_loss < best_validation_loss:
            self.write_to_cache("best_validation_loss",
                                current_validation_loss)
            ckpt_path = os.path.join(self.checkpoint_directory, "best.ckpt")
        else:
            ckpt_path = None
        if ckpt_path is not None:
            info_dict = {
                "model": self.model.state_dict(),
                "optim": self.optim.state_dict(),
            }
            torch.save(info_dict, ckpt_path)
        return self

    def load(self, device=None):
        ckpt_path = os.path.join(self.checkpoint_directory, "best.ckpt")
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError
        info_dict = torch.load(
            ckpt_path,
            map_location=torch.device(
                (self.device if device is None else device)),
        )
        self.model.load_state_dict(info_dict["model"])
        self.optim.load_state_dict(info_dict["optim"])
        return self

    def log_validation_losses_and_metrics(self, losses):
        if self.get("wandb/use", False):
            metrics = {f"validation_{k}": v for k, v in losses.items()}
            self.wandb_log(**metrics)
        for key, value in losses.items():
            self.log_scalar(f"validation/{key}", value)
        return self

    def clear_moving_averages(self):
        return self.clear_in_cache("moving_loss")

    def step_scheduler(self, epoch):
        if self.scheduler is not None:
            self.scheduler.step(epoch)
        return self

    def log_learning_rates(self):
        lrs = {
            f"lr_{i}": param_group["lr"]
            for i, param_group in enumerate(self.optim.param_groups)
        }
        if self.get("wandb/use", False):
            self.wandb_log(**lrs)
        for key, value in lrs.items():
            self.log_scalar(f"training/{key}", value)
        return self
Example #33
0
# Load data
u = Utils()
train_facile = u.load_matrix('data/data_train_facile.mat')

#generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_facile['label'], 1000, 0.1)
newX,newY = u.select_pairs_data(pairs_idx,train_facile['X'],train_facile['label'],c=700)
feat_idx = u._feat_idx

#test gradient
g = Gradient()
M_ini = g.generate_I(newX.shape[1])
M = g.sgd_metric_learning(newX, newY, 0.002, 50000, 0, M_ini)

# Calculate distance
m = Metrics()
X = u.select_features(train_facile['X'],feat_idx)
X -= X.mean(axis=0)
X /= X.std(axis=0)
X[np.isnan(X)] = 0.
dist = m.mahalanobis_dist(X, pairs_idx,M)
#dist[np.isnan(dist)] = 50.
## Evaluate model
e = Evaluate()
e.evaluation(pairs_label,dist)
## display results
e.display_roc()
e.easy_score()

# Evaluate test dataset and save it
test_facile = u.load_matrix('data/data_test_facile.mat')
Example #34
0
class Metric_Tests(unittest.TestCase):
    def setUp(self):
        self.metrics = Metrics()
        self.tile = Tile()

    def test_evaluations_exist(self):
        self.assertNotEqual(Metrics, None)

    # When the tile is one step away
    def test_displaced_tiles_yields_one(self):
        tile = Mover.move_left(Tile.duplicate(self.tile))
        self.assertEqual(self.metrics.displaced(tile), 1)

    # When the tile is two steps away
    def test_displaced_tiles_yields_two(self):
        tile = Mover.move_left(Tile.duplicate(self.tile))
        tile = Mover.move_up(tile)
        self.assertEqual(self.metrics.displaced(tile), 2)

    # When no corners are solved
    def test_corner_yields_two(self):
        results = Tile()
        results.layout = [[2, 5, 6], [1, 7, 8], [3, 4, 0]]
        self.assertEqual(self.metrics.subset(results), 2)

    # When upper left corner is solved
    def test_corner_yields_one(self):
        results = Tile()
        results.layout = [[1, 2, 3], [8, 5, 6], [7, 4, 0]]
        self.assertEqual(self.metrics.subset(results), 1)

    # When bottom right corner is solved
    def test_corner_yields_one_again(self):
        results = Tile()
        results.layout = [[2, 0, 3], [8, 1, 4], [7, 6, 5]]
        self.assertEqual(self.metrics.subset(results), 1)

    # When in the goal state
    def test_corner_yields_zero_again(self):
        results = Tile()
        self.assertEqual(self.metrics.subset(results), 0)

    # When one tile is out of place
    def test_manhattan_distance_yields_one(self):
        tile = Mover.move_down(Tile())
        self.assertEqual(self.metrics.manhattan(tile), 1)

    def test_manhattan_distance_yields_zero(self):
        self.assertEqual(self.metrics.manhattan(Tile()), 0)

    # When 14 tiles are out of place
    def test_manhattan_distance_yields_fourteen(self):
        results = Tile()
        results.layout = [
            [2, 4, 0],
            [8, 6, 7],
            [5, 1, 3],
        ]
        self.assertEqual(self.metrics.manhattan(results), 14)

    # When two tiles are out of place
    def test_manhattan_distance_yields_two(self):
        results = Tile()
        results.layout = [
            [1, 3, 0],
            [8, 2, 4],
            [7, 6, 5],
        ]
        self.assertEqual(self.metrics.manhattan(results), 2)
Example #35
0
class TestMetrics(MockTestCase):
  def setUp(self):
    self.metrics = Metrics()
    self.metrics.factory = self.mock()
    self.fileMetrics = self.mock()
    self.metrics.factory.expects(once()).create().will(return_value(self.fileMetrics))

  def node(self, name="test.cpp"):
    newNode = self.mock()
    newNode.expects(at_least_once()).file().will(return_value(name))
    return newNode

  def testAddFile(self):
    node = self.node()
    self.fileMetrics.expects(once()).addNode(same(node))
    self.metrics.addFile(node)
    assert(self.metrics.file("test.cpp") is self.fileMetrics)
    self.assertEqual(len(self.metrics.files()), 1)
    assert(self.metrics.files()[0] is self.fileMetrics)

  def testAddSameFileTwice(self):
    node = self.node()
    self.fileMetrics.expects(once()).addNode(same(node))
    self.fileMetrics.expects(once()).addNode(same(node))

    self.metrics.addFile(node)
    self.metrics.addFile(node)
    assert(self.metrics.file("test.cpp") is self.fileMetrics)
    self.assertEqual(len(self.metrics.files()), 1)

  def testTwoDifferentFiles(self):
    node1 = self.node("test1.h")
    node2 = self.node("test2.h")
    self.fileMetrics.expects(once()).addNode(same(node1))
    self.fileMetrics.expects(once()).addNode(same(node2))
    self.metrics.factory.expects(once()).create().will(return_value(self.fileMetrics))

    self.metrics.addFile(node1)
    self.metrics.addFile(node2)
    assert(self.metrics.file("test1.h") is self.fileMetrics)
    assert(self.metrics.file("test2.h") is self.fileMetrics)
    self.assertEqual(len(self.metrics.files()), 2)
Example #36
0
def main(argv):
    # Allow running multiple at once
    set_gpu_memory(FLAGS.gpumem)
    # Figure out the log and model directory filenames
    assert FLAGS.uid != "", "uid cannot be an empty string"
    model_dir, log_dir = get_directory_names()

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Write config file about what dataset we're using, sources, target, etc.
    file_utils.write_config_from_args(log_dir)

    # Load datasets
    source_datasets, target_dataset = load_datasets.load_da(FLAGS.dataset,
        FLAGS.sources, FLAGS.target, test=FLAGS.test)
    # for x in source_datasets:
    #     print (x)
    # source_train_iterators = [iter(x.train) for x in source_datasets]
    # print (len(source_train_iterators))
    # for x in source_train_iterators:
    #     a = next(x)
    #     print (a)
    # data_sources = [next(x) for x in source_train_iterators]
    # data_sources = [next(x) for x in source_train_iterators]
    # data_sources = [next(x) for x in source_train_iterators]

    # Need to know which iteration for learning rate schedule
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Load the method, model, etc.
    method = methods.get_method(FLAGS.method,
        source_datasets=source_datasets,
        target_dataset=target_dataset,
        model_name=FLAGS.model,
        global_step=global_step,
        total_steps=FLAGS.steps,
        ensemble_size=FLAGS.ensemble,
        moving_average=FLAGS.moving_average,
        share_most_weights=FLAGS.share_most_weights)

    # Check that this method is supposed to be trainable. If not, we're done.
    # (Basically, we just wanted to write the config file for non-trainable
    # models.)
    if not method.trainable:
        print("Method not trainable. Exiting now.")
        return

    # Checkpoints
    checkpoint = tf.train.Checkpoint(
        global_step=global_step, **method.checkpoint_variables)
    checkpoint_manager = CheckpointManager(checkpoint, model_dir, log_dir)
    checkpoint_manager.restore_latest()

    # Metrics
    has_target_domain = target_dataset is not None
    metrics = Metrics(log_dir, method, source_datasets, target_dataset,
        has_target_domain)

    # Start training
    #
    # TODO maybe eventually rewrite this in the more-standard Keras way
    # See: https://www.tensorflow.org/guide/keras/train_and_evaluate
    for i in range(int(global_step), FLAGS.steps+1):
        t = time.time()
        data_sources, data_target = method.train_step()
        global_step.assign_add(1)
        t = time.time() - t

        if FLAGS.time_training:
            print(int(global_step), t, sep=",")
            continue  # skip evaluation, checkpointing, etc. when timing

        if i%1000 == 0:
            print("step %d took %f seconds"%(int(global_step), t))
            sys.stdout.flush()  # otherwise waits till the end to flush on Kamiak

        # Metrics on training/validation data
        if FLAGS.log_train_steps != 0 and i%FLAGS.log_train_steps == 0:
            metrics.train(data_sources, data_target, global_step, t)

        # Evaluate every log_val_steps but also at the last step
        validation_accuracy_source = None
        validation_accuracy_target = None
        if (FLAGS.log_val_steps != 0 and i%FLAGS.log_val_steps == 0) \
                or i == FLAGS.steps:
            validation_accuracy_source, validation_accuracy_target \
                = metrics.test(global_step)
            print(validation_accuracy_source,validation_accuracy_target)

        # Checkpoints -- Save either if at the right model step or if we found
        # a new validation accuracy. If this is better than the previous best
        # model, we need to make a new checkpoint so we can restore from this
        # step with the best accuracy.
        if (FLAGS.model_steps != 0 and i%FLAGS.model_steps == 0) \
                or validation_accuracy_source is not None:
            checkpoint_manager.save(int(global_step-1),
                validation_accuracy_source, validation_accuracy_target)

        # Plots
        if FLAGS.log_plots_steps != 0 and i%FLAGS.log_plots_steps == 0:
            metrics.plots(global_step)

    # We're done -- used for hyperparameter tuning
    file_utils.write_finished(log_dir)
Example #37
0
 def setUp(self):
   self.metrics = Metrics()
   self.metrics.factory = self.mock()
   self.fileMetrics = self.mock()
   self.metrics.factory.expects(once()).create().will(return_value(self.fileMetrics))
Example #38
0
def train(model,
          train_loader,
          val_loader,
          num_epochs,
          optimizer,
          criterion,
          args,
          start_epoch=0,
          best_val_score=0,
          best_val_epoch=0):
    """
    This is the main training loop. It trains the model, evaluates the model and saves the metrics and predictions.
    """
    metrics_stats_list = []
    val_per_type_metric_list = []

    if args.apply_rubi:
        val_per_type_metric_list_rubi, val_per_type_metric_list_q = [], []

    lr_decay_step = 2
    lr_decay_rate = .25
    if optimizer is None:

        # lr_decay_epochs = range(10, 25, lr_decay_step)
        # gradual_warmup_steps = [0.5 * args.lr, 1.0 * args.lr, 1.5 * args.lr, 2.0 * args.lr]
        # if args.apply_rubi:
        lr_decay_epochs = range(14, 100, lr_decay_step)
        gradual_warmup_steps = [
            i * args.lr for i in torch.linspace(0.5, 2.0, 7)
        ]
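        # Gradual warm-up: the learning rate ramps from 0.5x to 2.0x of args.lr over the first 7 epochs before the decay schedule takes over.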
        print(gradual_warmup_steps)
        # else:
        #     lr_decay_epochs = range(10, 25, lr_decay_step)
        #     gradual_warmup_steps = [0.5 * args.lr, 1.0 * args.lr, 1.5 * args.lr, 2.0 * args.lr]
        optimizer = getattr(torch.optim,
                            args.optimizer)(filter(lambda p: p.requires_grad,
                                                   model.parameters()),
                                            lr=args.lr)
    else:
        gradual_warmup_steps = []
        lr_decay_epochs = range(14, 100, lr_decay_step)

    iter_num = 0
    if args.test and start_epoch == num_epochs:
        start_epoch = num_epochs - 1
    for epoch in range(start_epoch, num_epochs):
        if epoch < len(gradual_warmup_steps):
            optimizer.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
        elif epoch in lr_decay_epochs:
            optimizer.param_groups[0]['lr'] *= lr_decay_rate
        else:
            optimizer.param_groups[0]['lr'] = args.lr
        print("lr {}".format(optimizer.param_groups[0]['lr']))

        is_best = False
        train_metrics, val_metrics = Metrics(), Metrics()

        if args.apply_rubi:
            train_metrics_rubi, val_metrics_rubi = Metrics(), Metrics()
            train_metrics_q, val_metrics_q = Metrics(), Metrics()
        else:
            val_metrics_rubi, val_metrics_q = None, None

        if not args.test:
            tqdm_train_loader = tqdm(train_loader, position=0, leave=True)
            for i, (visual_features, boxes, question_features, answers,
                    question_types, question_ids,
                    question_lengths) in enumerate(tqdm_train_loader):

                tqdm_train_loader.set_description(
                    f'Loss : {train_metrics.get_loss()} | Score {train_metrics.get_score()}'
                )

                visual_features = Variable(visual_features.float())
                boxes = Variable(boxes.float())
                question_features = Variable(question_features)
                answers = Variable(answers)

                if torch.cuda.is_available():
                    visual_features = visual_features.cuda()
                    boxes = boxes.cuda()
                    question_features = question_features.cuda()
                    answers = answers.cuda()

                pred = model(visual_features, boxes, question_features,
                             answers, question_lengths)
                loss = criterion(pred, answers)['loss']
                loss.backward()
                train_metrics.update_per_batch(model, answers, loss, pred,
                                               visual_features.shape[0])
                if args.apply_rubi:
                    train_metrics_rubi.update_per_batch(
                        model,
                        answers,
                        loss,
                        pred,
                        visual_features.shape[0],
                        logits_key='logits_rubi')
                    train_metrics_q.update_per_batch(model,
                                                     answers,
                                                     loss,
                                                     pred,
                                                     visual_features.shape[0],
                                                     logits_key='logits_q')
                nn.utils.clip_grad_norm_(model.parameters(), 50)
                optimizer.step()
                optimizer.zero_grad()
                iter_num += 1
                #if i % 10 == 0:
                #train_metrics.print(epoch)
                # if args.apply_rubi:
                #     print("\n\n### logits_rubi ###")
                #     train_metrics_rubi.print(epoch)
                #     print("\n\n### logits_q ###")
                #     train_metrics_q.print(epoch)
            train_metrics.update_per_epoch()
            if args.apply_rubi:
                train_metrics_rubi.update_per_epoch()
                train_metrics_q.update_per_epoch()

        if None != val_loader:  # TODO: "val_loader is not None" was not working for some reason
            print("Starting the test ... ")

            model.eval()
            with torch.no_grad():
                val_results = evaluate_by_logits_key(model,
                                                     val_loader,
                                                     epoch,
                                                     criterion,
                                                     args,
                                                     val_metrics,
                                                     logits_key='logits')
                if args.apply_rubi:
                    val_results_rubi = evaluate_by_logits_key(
                        model,
                        val_loader,
                        epoch,
                        criterion,
                        args,
                        val_metrics_rubi,
                        logits_key='logits_rubi')
                    val_results_q = evaluate_by_logits_key(
                        model,
                        val_loader,
                        epoch,
                        criterion,
                        args,
                        val_metrics_q,
                        logits_key='logits_q')
                # eval_results = evaluate(model, val_loader, epoch, criterion, args, val_metrics, val_metrics_rubi,
                #                         val_metrics_q) # TODO: FIX, use a loop to do this

            model.train()
            if val_metrics.score > best_val_score:
                best_val_score = val_metrics.score
                best_val_epoch = epoch
                is_best = True

            save_val_metrics = not args.test or not args.test_does_not_have_answers
            if save_val_metrics:
                print("Best val score {} at epoch {}".format(
                    best_val_score, best_val_epoch))
                print(f"### Val from Logits {val_metrics.score}")
                if args.apply_rubi:
                    print(f"### Val from Logits_rubi {val_metrics_rubi.score}")
                    print(f"### Val from Logits_q {val_metrics_q.score}")
                    # print(
                    #     f"##### by logits key {val_metrics_by_logits_key.score} "
                    #     f"val_metrics_by_logits_key_rubi {val_metrics_by_logits_key_rubi.score} "
                    #     f"Logits score: {val_metrics.score} "
                    #     f"Logits_rubi score: {val_metrics_rubi.score} "
                    #     f"Logits_q score: {val_metrics_q.score} ####")

                val_per_type_metric_list.append(
                    val_results['per_type_metric'].get_json())
                if args.apply_rubi:
                    val_per_type_metric_list_rubi.append(
                        val_results_rubi['per_type_metric'].get_json())
                    val_per_type_metric_list_q.append(
                        val_results_q['per_type_metric'].get_json())

            metrics = accumulate_metrics(epoch, train_metrics, val_metrics,
                                         val_results['per_type_metric'],
                                         best_val_score, best_val_epoch,
                                         save_val_metrics)

            metrics_stats_list.append(metrics)

            # Add metrics + parameters of the model and optimizer
            metrics_n_model = save_metrics_n_model(metrics, model, optimizer,
                                                   args, is_best)
            VqaUtils.save_stats(metrics_stats_list,
                                val_per_type_metric_list,
                                val_results['all_preds'],
                                args.expt_save_dir,
                                split=args.test_split,
                                epoch=epoch)
            # if args.apply_rubi:
            #     VqaUtils.save_stats(metrics_stats_list, val_per_type_metric_list_rubi, val_results_rubi['all_preds'],
            #                         args.expt_save_dir,
            #                         split=args.test_split, epoch=epoch, suffix='rubi')
            #     VqaUtils.save_stats(metrics_stats_list, val_per_type_metric_list_q, val_results_q['all_preds'],
            #                         args.expt_save_dir,
            #                         split=args.test_split, epoch=epoch, suffix='q')

        if args.test:
            VqaUtils.save_preds(val_results['all_preds'], args.expt_save_dir,
                                args.test_split, epoch)
            print("Test completed!")
            break
Example #39
0
"""
Created on Tue Jun 16 17:57:09 2015

@author: Paco
"""

from utils import Utils
from evaluate import Evaluate
from metrics import Metrics

# Load data
u = Utils()
train_hard = u.load_matrix('data/data_train_difficile.mat')

#generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_hard['label'], 1000, 0.1)

# Calculate distance
m = Metrics()
dist = m.braycurtis_dist(train_hard['X'], pairs_idx)

# Evaluate model
e = Evaluate()
e.evaluation(pairs_label,dist)
# display results
e.display_roc()
e.hard_score()

# Evaluate test dataset and save it
test_hard = u.load_matrix('data/data_test_difficile.mat')
dist_test = m.braycurtis_dist(test_hard['X'], test_hard['pairs'])
u.save_test(dist_test,filetxt='soumission_dur.txt')   
Example #40
0
df_binary = pd.DataFrame(data_binary)
df_nominal_full = pd.DataFrame(data_nominal_full)
df_nominal_missing = pd.DataFrame(data_nominal_missing)
df_cohens = pd.DataFrame(data_cohens)
df_fleiss = pd.DataFrame(data_fleiss)

kripp_binary = Krippendorff(df_binary)
kripp_nominal_full = Krippendorff(df_nominal_full)
kripp_nominal_missing = Krippendorff(df_nominal_missing)

kripp_test = Krippendorff(df_test)

mets = Metrics(df_test)
mets_cohens = Metrics(df_cohens)
mets_fleiss = Metrics(df_fleiss)


class TestMetrics(unittest.TestCase):
    """
    Tests for Krippendorff's alpha computations from
    disagree.metrics.Krippendorff
    """
    def test_kripps_alpha_value_with_binary_data(self):
        # Test the final value of kripps alpha, from Krippendorff paper
        # Page 3
        alpha = kripp_binary.alpha(data_type="nominal")
        alpha = float("{:.3f}".format(alpha))
        self.assertTrue(alpha == 0.095)
Example #41
0
def execute():
    metrics = Metrics()
    document_vectors_list = []
    document_id = 0
    initial_centroids = []
    tempo_final = 0
    tempo_inicial = 0
    new_closests = []
    new_clusters_mpi = []
    temp_dist_mpi = 1
    initial_centroids_mpi = []
    document_mpi = []

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    comm.Barrier()

    if rank == 0:
        files = []
        fileNames = sorted(glob2.glob("longo/*.txt"))
        for filename in fileNames:
            files.append(open(filename, "r+").read())

        serviceTextMining = ServiceTextMining()
        terms = serviceTextMining.select_terms(files)
        matriz_tf = serviceTextMining.create_matriz_itf_terms(terms)
        matriz_df = serviceTextMining.create_matriz_idf_terms(terms, files)
        matriz_tf_df = serviceTextMining.create_matriz_tf_df_terms(
            matriz_tf, matriz_df)

        for line in range(len(matriz_tf_df)):
            document_vector = []
            for column in matriz_tf_df[line]:
                document_vector.append(column)

            document_vectors_list.append((document_id, document_vector))
            document_id += 1

        initial_centroids_mpi = random.sample(document_vectors_list, k=3)
    else:
        document_vectors_list = []
        initial_centroids_mpi = []

    comm.Barrier()

    initial_centroids_mpi = comm.bcast(initial_centroids_mpi, root=0)
    document_mpi = comm.scatter(document_vectors_list, root=0)

    tempo_inicial = MPI.Wtime()
    print("RUN MPI")
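    # Lloyd-style iteration: repeat until the summed Euclidean distance between the old and new centroids drops below 0.01.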
    while temp_dist_mpi > 0.01:

        best_centroid = {}
        reduce_closest_mpi = []
        closests = []
        reduce_closest = []
        new_clusters = []
        if rank == 0:
            num_workers = len(document_vectors_list) - 1
            closed_workers = 0

            while closed_workers < num_workers:
                status = MPI.Status()
                best = comm.recv(source=MPI.ANY_SOURCE,
                                 tag=MPI.ANY_TAG,
                                 status=status)
                if best['max_value'] != 0:
                    closests.append(
                        (best['best_index'], (best['best_vc_doc'], 1)))
                else:
                    closests.append(('erro', (best['best_vc_doc'], 1)))

                closed_workers += 1

            new_closests = [
                d for d in [d for d in closests if d[0] != 'erro']
                if d[0] != []
            ]

            for nc in new_closests:
                total_doc = 0
                document_sum = []
                closest = [d for d in reduce_closest if d[0] == nc[0]]

                if not closest:
                    reduce_closest.append((nc[0], (0, nc[1][0][1]), nc[1][1]))
                    continue

                for k, rc in enumerate(reduce_closest):
                    if rc[0] == nc[0]:
                        total_doc = closest[0][2] + nc[1][1]
                        document_sum = [
                            sum(x) for x in zip(nc[1][0][1], closest[0][1][1])
                        ]
                        reduce_closest[k] = (nc[0], (0, document_sum),
                                             total_doc)
                        break

            for rc in reduce_closest:
                new_clusters.append(get_new_clusters(rc))

        else:
            max_value = 0
            var = 0
            best = 0
            for c in initial_centroids_mpi:
                temp = metrics.get_cosine_distance(c[1], document_mpi[1])
                if temp != 0:
                    if temp > max_value:
                        max_value = temp
                        best = c[0]

            best_centroid = {
                'best_index': best,
                'max_value': max_value,
                'best_vc_doc': document_mpi
            }

            comm.send(best_centroid, dest=0)

        new_clusters_mpi = comm.bcast(new_clusters, root=0)

        comm.Barrier()
        if rank == 0:
            results = []
            for index_cluster in range(len(new_clusters_mpi)):
                results.append(
                    metrics.get_eculedian_distance(
                        initial_centroids_mpi[index_cluster][1],
                        new_clusters_mpi[index_cluster][1]))
            temp_dist_mpi = sum(results)

        for iK in range(len(new_clusters_mpi)):
            initial_centroids_mpi[iK] = (new_clusters_mpi[iK][0],
                                         new_clusters_mpi[iK][1])

        initial_centroids_mpi = comm.bcast(initial_centroids_mpi, root=0)
        temp_dist_mpi = comm.bcast(temp_dist_mpi, root=0)

    if rank == 0:
        tempo_final = MPI.Wtime()
        print("Tempo ", tempo_final - tempo_inicial)
    return new_closests
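# Hedged sketch (not part of the original example): one plausible shape for the
# get_new_clusters() helper used above, assuming each reduced tuple has the
# form (centroid_index, (0, summed_document_vector), document_count) and the
# new centroid is the component-wise mean of its assigned documents.
def get_new_clusters(reduced_entry):
    centroid_index, (_, summed_vector), document_count = reduced_entry
    mean_vector = [value / document_count for value in summed_vector]
    return (centroid_index, mean_vector)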
Example #42
0
    def train_one_epoch(
            args,
            model,
            train_iter,
            optimizers,
            criterion,
            eval_iter,
            vocab,
            epoch,
            metrics=Metrics(),
            loss_aggr=None,
    ):
        labels_with_high_model_score = None

        with trange(len(train_iter)) as t:
            for iter, batch in enumerate(train_iter):

                model.to(
                    args.device,
                    args.out_device,
                )
                model.train()

                batch_token_ids, label_ids, label_probs, eval_mask, _, _, orig_batch, loaded_batch = batch

                enc = None

                if (args.collect_most_popular_labels_steps is not None
                        and args.collect_most_popular_labels_steps > 0
                        and iter > 0 and
                        iter % args.collect_most_popular_labels_steps == 0):
                    model.to(args.device, args.eval_device)
                    with torch.no_grad():
                        logits_, _, _, _, _, enc = model(
                            batch_token_ids,
                            None,
                            None,
                        )  # logits: (N, T, VOCAB), y: (N, T)
                        labels_with_high_model_score = get_topk_ids_aggregated_from_seq_prediction(
                            logits_,
                            topk_from_batch=args.label_size,
                            topk_per_token=args.topk_neg_examples)
                        batch_token_ids, label_ids, label_probs, eval_mask, _, _, _, _ = EDLDataset_collate_func(
                            args=args,
                            labels_with_high_model_score=
                            labels_with_high_model_score,
                            batch=orig_batch,
                            return_labels=True,
                            vocab=vocab,
                            is_training=False,
                            loaded_batch=loaded_batch,
                        )

                # if args.label_size is not None:
                logits, y, y_hat, label_probs, sparse_params, _ = model(
                    batch_token_ids, label_ids, label_probs,
                    enc=enc)  # logits: (N, T, VOCAB), y: (N, T)
                logits = logits.view(-1)  # (N*T, VOCAB)
                label_probs = label_probs.view(-1)  # (N*T,)

                loss = criterion(logits, label_probs)

                loss.backward()

                if (iter + 1) % args.accumulate_batch_gradients == 0:
                    for optimizer in optimizers:
                        optimizer.step()
                        optimizer.zero_grad()

                if iter == 0:
                    logging.debug("Sanity check")
                    logging.debug("x: %s", batch_token_ids.cpu().numpy()[0])
                    logging.debug(
                        "tokens: %s",
                        vocab.tokenizer.convert_ids_to_tokens(
                            batch_token_ids.cpu().numpy()[0]))
                    logging.debug("y: %s", label_probs.cpu().numpy()[0])

                loss_aggr = running_mean(loss.detach().item(), loss_aggr)

                if iter > 0 and iter % args.checkpoint_eval_steps == 0:
                    metrics = Net.evaluate(
                        args=args,
                        model=model,
                        iterator=eval_iter,
                        optimizers=optimizers,
                        step=iter,
                        epoch=epoch,
                        save_checkpoint=iter % args.checkpoint_save_steps == 0,
                        sampled_evaluation=False,
                        metrics=metrics,
                        vocab=vocab,
                    )

                t.set_postfix(
                    loss=loss_aggr,
                    nr_labels=len(label_ids),
                    aggr_labels=len(labels_with_high_model_score)
                    if labels_with_high_model_score else 0,
                    last_eval=metrics.report(
                        filter={"f1", "num_proposed", "epoch", "step"}),
                )
                t.update()

        for optimizer in optimizers:
            optimizer.step()
            optimizer.zero_grad()

        return metrics
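# Hedged sketch (assumption): running_mean() used above is not shown in this
# snippet; an exponential moving average like the following would match how it
# is called (seeded with a None aggregate, updated once per batch).
def running_mean(new_value, aggregate, momentum=0.99):
    if aggregate is None:
        return new_value
    return momentum * aggregate + (1.0 - momentum) * new_value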
Example #43
0
# net = torch.load('/home/intern1/qiuzhen/Works/result_for_structure_segment/yin_U_Net_LRS_256_cv2_newdata.pkl')
# net = UNet256_kernel(4, BatchNorm=True)
# net = torch.load('/home/intern1/qiuzhen/Works/result_for_structure_segment/M_Net_LBO_256_cv2.pkl')
print('model : yin_unet_for_newdata')
# net = models.resnet34(pretrained=True)
# fc_features = net.fc.in_features
# net.fc = nn.Linear(fc_features, 2)
net.cuda()
loss = nn.CrossEntropyLoss(size_average=True).cuda()
optimizer = optim.Adam(net.parameters(), lr=0.01)  # optimization method
image_list, iterper_epo = loadImageList(path,
                                        batchsize=batchsize,
                                        flag='train')
total = len(image_list)
print('train_data_len:' + str(total))
metric = Metrics(4)
epochs = 500
max = 1.28982
for i in range(epochs):
    net.train()
    metric.reset()
    acc_list = []
    random.shuffle(image_list)
    running_loss = []
    for j in range(iterper_epo):
        if j == (iterper_epo - 1):
            iterlist = image_list[j * batchsize:]
        else:
            iterlist = image_list[j * batchsize:(j + 1) * batchsize]
        img_data, img_label = loaddata(path, iterlist)
        r_loss, correct = train(net, loss, optimizer, img_data, img_label,
Example #44
0
File: train.py Project: ioalzx/AdaS
def main(args: APNamespace):
    root_path = Path(args.root).expanduser()
    config_path = root_path / Path(args.config).expanduser()
    data_path = root_path / Path(args.data).expanduser()
    output_path = root_path / Path(args.output).expanduser()
    global checkpoint_path
    checkpoint_path = root_path / Path(args.checkpoint).expanduser()

    if not config_path.exists():
        # logging.critical(f"AdaS: Config path {config_path} does not exist")
        print(f"AdaS: Config path {config_path} does not exist")
        raise ValueError
    if not data_path.exists():
        print(f"AdaS: Data dir {data_path} does not exists, building")
        data_path.mkdir(exist_ok=True, parents=True)
    if not output_path.exists():
        print(f"AdaS: Output dir {output_path} does not exists, building")
        output_path.mkdir(exist_ok=True, parents=True)
    if not checkpoint_path.exists():
        if args.resume:
            print(f"AdaS: Cannot resume from checkpoint without specifying " +
                  "checkpoint dir")
            raise ValueError
        if checkpoint_path.is_dir():
            print(f"AdaS: Checkpoint dir {checkpoint_path} does not exists, " +
                  "building")
            checkpoint_path.mkdir(exist_ok=True, parents=True)
        else:
            print(f"AdaS: Checkpoint path {checkpoint_path} doesn't exist " +
                  "building directory to store checkpoints: .adas-checkpoint")
            checkpoint_path.cwd().mkdir(exist_ok=True, parents=True)

    with config_path.open() as f:
        config = yaml.load(f)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    global best_acc
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print("Adas: Argument Parser Options")
    print("-"*45)
    print(f"    {'config':<20}: {args.config:<20}")
    print(f"    {'data':<20}: {args.data:<20}")
    print(f"    {'output':<20}: {args.output:<20}")
    print(f"    {'checkpoint':<20}: {args.checkpoint:<20}")
    print(f"    {'resume':<20}: {args.resume:<20}")
    print("\nAdas: Train: Config")
    print(f"    {'Key':<20} {'Value':<20}")
    print("-"*45)
    for k, v in config.items():
        print(f"    {k:<20} {v:<20}")

    for trial in range(config['n_trials']):
        device
        # Data
        # logging.info("Adas: Preparing Data")
        train_loader, test_loader = get_data(
            root=data_path,
            dataset=config['dataset'],
            mini_batch_size=config['mini_batch_size'])
        global performance_statistics, net, metrics, adas
        performance_statistics = {}

        # logging.info("AdaS: Building Model")
        net = get_net(config['network'], num_classes=10 if config['dataset'] ==
                      'CIFAR10' else 100 if config['dataset'] == 'CIFAR100'
                      else 1000 if config['dataset'] == 'ImageNet' else 10)
        metrics = Metrics(list(net.parameters()),
                          p=config['p'])
        if config['lr_scheduler'] == 'AdaS':
            adas = AdaS(parameters=list(net.parameters()),
                        beta=config['beta'],
                        zeta=config['zeta'],
                        init_lr=float(config['init_lr']),
                        min_lr=float(config['min_lr']),
                        p=config['p'])

        net = net.to(device)

        global criterion
        criterion = get_loss(config['loss'])

        # TODO config
        optimizer, scheduler = get_optimizer_scheduler(
            init_lr=float(config['init_lr']),
            optim_method=config['optim_method'],
            lr_scheduler=config['lr_scheduler'])

        if device == 'cuda':
            net = torch.nn.DataParallel(net)
            cudnn.benchmark = True

        if args.resume:
            # Load checkpoint.
            print("Adas: Resuming from checkpoint...")
            if checkpoint_path.is_dir():
                checkpoint = torch.load(str(checkpoint_path / 'ckpt.pth'))
            else:
                checkpoint = torch.load(str(checkpoint_path))
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            start_epoch = checkpoint['epoch']
            if adas is not None:
                adas.historical_io_metrics = \
                    checkpoint['historical_io_metrics']

        # model_parameters = filter(lambda p: p.requires_grad,
        #                           net.parameters())
        # params = sum([np.prod(p.size()) for p in model_parameters])
        # print(params)
        epochs = range(start_epoch, start_epoch + config['max_epoch'])
        for epoch in epochs:
            start_time = time.time()
            print(f"AdaS: Epoch {epoch} Started.")
            train_loss, train_accuracy = epoch_iteration(
                train_loader, epoch, device, optimizer)
            end_time = time.time()
            if config['lr_scheduler'] == 'StepLR':
                scheduler.step()
            test_loss, test_accuracy = test_main(test_loader, epoch, device)
            total_time = time.time()
            print(
                f"AdaS: Epoch {epoch}/{epochs[-1]} Ended | " +
                "Total Time: {:.3f}s | ".format(total_time - start_time) +
                "Epoch Time: {:.3f}s | ".format(end_time - start_time) +
                "Est. Time Remaining: {:.3f}s | ".format(
                    (total_time - start_time) * (epochs[-1] - epoch)),
                "Train Loss: {:.4f}% | Train Acc. {:.4f}% | ".format(
                    train_loss,
                    train_accuracy) +
                "Test Loss: {:.4f}% | Test Acc. {:.4f}%".format(test_loss,
                                                                test_accuracy))
            df = pd.DataFrame(data=performance_statistics)
            if config['lr_scheduler'] == 'AdaS':
                xlsx_name = \
                    f"{config['optim_method']}_AdaS_trial={trial}_" +\
                    f"beta={config['beta']}_initlr={config['init_lr']}_" +\
                    f"net={config['network']}_dataset={config['dataset']}.xlsx"
            else:
                xlsx_name = \
                    f"{config['optim_method']}_{config['lr_scheduler']}_" +\
                    f"trial={trial}_initlr={config['init_lr']}_" +\
                    f"net={config['network']}_dataset={config['dataset']}.xlsx"

            df.to_excel(str(output_path / xlsx_name))
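# Hedged illustration (not from the original project): the corrected f-string
# pattern for the Excel report name above, factored into one helper. Note the
# AdaS branch above additionally embeds beta; `scheduler_tag` here would be
# "AdaS" or the configured lr_scheduler name.
def build_xlsx_name(config: dict, trial: int, scheduler_tag: str) -> str:
    return (f"{config['optim_method']}_{scheduler_tag}_trial={trial}_"
            f"initlr={config['init_lr']}_net={config['network']}_"
            f"dataset={config['dataset']}.xlsx")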
Example #45
0
    def validate(self):

        self.decoder.eval()  # eval mode (no dropout or batchnorm)
        if self.encoder is not None:
            self.encoder.eval()

        batch_time = AverageMeter()
        losses = AverageMeter()
        top5accs = AverageMeter()

        start = time.time()

        ground_truth = list(
        )  # ground_truth (true captions) for calculating BLEU-4 score
        prediction = list()  # prediction (predicted captions)

        # explicitly disable gradient calculation to avoid CUDA memory error
        # solves the issue #57
        with torch.no_grad():
            # Batches
            for i, (imgs, caps, caplens,
                    allcaps) in enumerate(self.val_loader):

                # move to device, if available
                imgs = imgs.to(self.device)
                caps = caps.to(self.device)
                caplens = caplens.to(self.device)

                # forward encoder
                if self.encoder is not None:
                    imgs = self.encoder(imgs)

                # forward decoder
                if self.caption_model == 'att2all':
                    scores, caps_sorted, decode_lengths, alphas, sort_ind = self.decoder(
                        imgs, caps, caplens)
                else:
                    scores, caps_sorted, decode_lengths, sort_ind = self.decoder(
                        imgs, caps, caplens)

                # since we decoded starting with <start>, the targets are all words after <start>, up to <end>
                targets = caps_sorted[:, 1:]

                # remove timesteps that we didn't decode at, or are pads
                # pack_padded_sequence is an easy trick to do this
                scores_copy = scores.clone()
                scores = pack_padded_sequence(scores,
                                              decode_lengths,
                                              batch_first=True)[0]
                targets = pack_padded_sequence(targets,
                                               decode_lengths,
                                               batch_first=True)[0]

                # calc loss
                loss = self.loss_function(scores, targets)

                # doubly stochastic attention regularization (in paper: show, attend and tell)
                if self.caption_model == 'att2all':
                    loss += self.tau * ((1. - alphas.sum(dim=1))**2).mean()

                # keep track of metrics
                losses.update(loss.item(), sum(decode_lengths))
                top5 = accuracy(scores, targets, 5)
                top5accs.update(top5, sum(decode_lengths))
                batch_time.update(time.time() - start)

                start = time.time()

                if i % self.print_freq == 0:
                    print(
                        'Validation: [{0}/{1}]\t'
                        'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.
                        format(i,
                               len(self.val_loader),
                               batch_time=batch_time,
                               loss=losses,
                               top5=top5accs))

                # store ground truth captions and predicted captions of each image
                # for n images, each of them has one prediction and multiple ground truths (a, b, c...):
                # prediction = [ [hyp1], [hyp2], ..., [hypn] ]
                # ground_truth = [ [ [ref1a], [ref1b], [ref1c] ], ..., [ [refna], [refnb] ] ]

                # ground truth
                allcaps = allcaps[
                    sort_ind]  # because images were sorted in the decoder
                for j in range(allcaps.shape[0]):
                    img_caps = allcaps[j].tolist()
                    img_captions = list(
                        map(
                            lambda c: [
                                w for w in c if w not in {
                                    self.word_map['<start>'], self.word_map[
                                        '<pad>']
                                }
                            ], img_caps))  # remove <start> and pads
                    ground_truth.append(img_captions)

                # prediction
                _, preds = torch.max(scores_copy, dim=2)
                preds = preds.tolist()
                temp_preds = list()
                for j, p in enumerate(preds):
                    temp_preds.append(
                        preds[j][:decode_lengths[j]])  # remove pads
                preds = temp_preds
                prediction.extend(preds)

                assert len(ground_truth) == len(prediction)

            # calc BLEU-4 and CIDEr score
            metrics = Metrics(ground_truth, prediction, self.rev_word_map)
            bleu4 = metrics.belu()[3]  # BLEU-4
            cider = metrics.cider()  # CIDEr

            print(
                '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, CIDEr - {cider}\n'
                .format(loss=losses, top5=top5accs, bleu=bleu4, cider=cider))

        return bleu4
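# Hedged cross-check (assumption): the prediction / ground_truth layout built
# above (one hypothesis per image, several reference token-id lists per image)
# is the layout nltk's corpus_bleu expects, so BLEU-4 could be sanity-checked
# against the project's Metrics.belu() like this.
from nltk.translate.bleu_score import corpus_bleu

def bleu4_sanity_check(ground_truth, prediction):
    # the default weights (0.25, 0.25, 0.25, 0.25) give cumulative BLEU-4
    return corpus_bleu(ground_truth, prediction)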
Example #46
0
def eval(**args):
    """
    Evaluate selected model 
    Args:
        seed       (Int):        Integer indicating set seed for random state
        save_dir   (String):     Top level directory to generate results folder
        model      (String):     Name of selected model 
        dataset    (String):     Name of selected dataset  
        exp        (String):     Name of experiment 
        load_type  (String):     Keyword indicator to evaluate the testing or validation set
        pretrained (Int/String): Int/String indicating loading of random, pretrained or saved weights
        
    Return:
        None
    """

    print("\n############################################################################\n")
    print("Experimental Setup: ", args)
    print("\n############################################################################\n")

    d          = datetime.datetime.today()
    date       = d.strftime('%Y%m%d-%H%M%S')
    result_dir = os.path.join(args['save_dir'], args['model'], '_'.join((args['dataset'],args['exp'],date)))
    log_dir    = os.path.join(result_dir, 'logs')
    save_dir   = os.path.join(result_dir, 'checkpoints')

    if not args['debug']:
        os.makedirs(result_dir, exist_ok=True)
        os.makedirs(log_dir,    exist_ok=True) 
        os.makedirs(save_dir,   exist_ok=True) 

        # Save copy of config file
        with open(os.path.join(result_dir, 'config.yaml'),'w') as outfile:
            yaml.dump(args, outfile, default_flow_style=False)

        # Tensorboard Element
        writer = SummaryWriter(log_dir)

    # Check if GPU is available (CUDA)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load Network
    model = create_model_object(**args).to(device)

    # Load Data
    loader = data_loader(**args, model_obj=model)

    if args['load_type'] == 'train_val':
        eval_loader = loader['valid']

    elif args['load_type'] == 'train':
        eval_loader = loader['train']

    elif args['load_type'] == 'test':
        eval_loader  = loader['test'] 

    else:
        sys.exit('load_type must be train_val, train, or test for eval, exiting')

    # END IF

    if isinstance(args['pretrained'], str):
        ckpt = load_checkpoint(args['pretrained'])
        model.load_state_dict(ckpt)

    # Training Setup
    params     = [p for p in model.parameters() if p.requires_grad]

    acc_metric = Metrics(**args, result_dir=result_dir, ndata=len(eval_loader.dataset))
    acc = 0.0

    # Setup Model To Evaluate 
    model.eval()

    with torch.no_grad():
        for step, data in enumerate(eval_loader):
            x_input     = data['data']
            annotations = data['annots']

            if isinstance(x_input, torch.Tensor):
                outputs = model(x_input.to(device))
            else:
                for i, item in enumerate(x_input):
                    if isinstance(item, torch.Tensor):
                        x_input[i] = item.to(device)
                outputs = model(*x_input)

            # END IF


            acc = acc_metric.get_accuracy(outputs, annotations)

            if step % 100 == 0:
                print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc))

    print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc))

    if not args['debug']:
        writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc)
        # Close Tensorboard Element
        writer.close()
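# Hedged sketch (assumption): a minimal top-1 accuracy of the kind
# Metrics.get_accuracy() is used for above, for classification outputs of
# shape (batch, num_classes) and integer labels of shape (batch,).
import torch

def top1_accuracy(outputs: torch.Tensor, labels: torch.Tensor) -> float:
    predictions = outputs.argmax(dim=1)
    return (predictions == labels).float().mean().item()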
Example #47
0
def image_builder(buildspec):
    FORMATTER = OutputFormatter(constants.PADDING)

    BUILDSPEC = Buildspec()
    BUILDSPEC.load(buildspec)
    IMAGES = []

    for image in BUILDSPEC["images"].items():
        ARTIFACTS = deepcopy(BUILDSPEC["context"])

        image_name = image[0]
        image_config = image[1]

        extra_build_args = {}
        labels = {}

        if image_config.get("version") is not None:
            if BUILDSPEC["version"] != image_config.get("version"):
                continue

        if image_config.get("context") is not None:
            ARTIFACTS.update(image_config["context"])

        build_context = os.getenv("BUILD_CONTEXT")
        image_tag = (tag_image_with_pr_number(image_config["tag"])
                     if build_context == "PR" else image_config["tag"])
        if not build_config.DISABLE_DATETIME_TAG or build_context != "PR":
            image_tag = tag_image_with_datetime(image_tag)
        image_repo_uri = (image_config["repository"] if build_context == "PR"
                          else modify_repository_name_for_context(
                              str(image_config["repository"]), build_context))
        base_image_uri = None
        if image_config.get("base_image_name") is not None:
            base_image_object = _find_image_object(
                IMAGES, image_config["base_image_name"])
            base_image_uri = base_image_object.ecr_url

        if image_config.get("download_artifacts") is not None:
            for artifact_name, artifact in image_config.get(
                    "download_artifacts").items():
                type = artifact["type"]
                uri = artifact["URI"]
                var = artifact["VAR_IN_DOCKERFILE"]

                try:
                    file_name = utils.download_file(uri, type).strip()
                except ValueError:
                    FORMATTER.print(
                        f"Artifact download failed: {uri} of type {type}.")

                ARTIFACTS.update({
                    f"{artifact_name}": {
                        "source":
                        f"{os.path.join(os.sep, os.path.abspath(os.getcwd()), file_name)}",
                        "target": file_name
                    }
                })

                extra_build_args[var] = file_name
                labels[var] = file_name
                labels[f"{var}_URI"] = uri

        ARTIFACTS.update({
            "dockerfile": {
                "source": image_config["docker_file"],
                "target": "Dockerfile",
            }
        })

        context = Context(ARTIFACTS, f"build/{image_name}.tar.gz",
                          image_config["root"])
        """
        Override parameters from parent in child.
        """

        info = {
            "account_id": str(BUILDSPEC["account_id"]),
            "region": str(BUILDSPEC["region"]),
            "framework": str(BUILDSPEC["framework"]),
            "version": str(BUILDSPEC["version"]),
            "root": str(image_config["root"]),
            "name": str(image_name),
            "device_type": str(image_config["device_type"]),
            "python_version": str(image_config["python_version"]),
            "image_type": str(image_config["image_type"]),
            "image_size_baseline": int(image_config["image_size_baseline"]),
            "base_image_uri": base_image_uri,
            "labels": labels,
            "extra_build_args": extra_build_args
        }

        image_object = DockerImage(
            info=info,
            dockerfile=image_config["docker_file"],
            repository=image_repo_uri,
            tag=image_tag,
            to_build=image_config["build"],
            context=context,
        )

        IMAGES.append(image_object)

    FORMATTER.banner("DLC")
    FORMATTER.title("Status")

    THREADS = {}

    # In the context of the ThreadPoolExecutor each instance of image.build submitted
    # to it is executed concurrently in a separate thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Standard images must be built before example images
        # Example images will use standard images as base
        standard_images = [
            image for image in IMAGES if "example" not in image.name.lower()
        ]
        example_images = [
            image for image in IMAGES if "example" in image.name.lower()
        ]

        for image in standard_images:
            THREADS[image.name] = executor.submit(image.build)

        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

        for image in example_images:
            THREADS[image.name] = executor.submit(image.build)

        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

        FORMATTER.title("Build Logs")

        if not os.path.isdir("logs"):
            os.makedirs("logs")

        for image in IMAGES:
            FORMATTER.title(image.name)
            FORMATTER.table(image.info.items())
            FORMATTER.separator()
            FORMATTER.print_lines(image.log)
            with open(f"logs/{image.name}", "w") as fp:
                fp.write("/n".join(image.log))
                image.summary["log"] = f"logs/{image.name}"

        FORMATTER.title("Summary")

        for image in IMAGES:
            FORMATTER.title(image.name)
            FORMATTER.table(image.summary.items())

        FORMATTER.title("Errors")
        ANY_FAIL = False
        for image in IMAGES:
            if image.build_status == constants.FAIL:
                FORMATTER.title(image.name)
                FORMATTER.print_lines(image.log[-10:])
                ANY_FAIL = True
        if ANY_FAIL:
            raise Exception("Build failed")
        else:
            FORMATTER.print("No errors")

        FORMATTER.title("Uploading Metrics")
        metrics = Metrics(
            context=constants.BUILD_CONTEXT,
            region=BUILDSPEC["region"],
            namespace=constants.METRICS_NAMESPACE,
        )
        for image in IMAGES:
            try:
                metrics.push_image_metrics(image)
            except Exception as e:
                if ANY_FAIL:
                    raise Exception(f"Build failed.{e}")
                else:
                    raise Exception(f"Build passed. {e}")

        FORMATTER.separator()

        # Set environment variables to be consumed by test jobs
        test_trigger_job = utils.get_codebuild_project_name()
        utils.set_test_env(
            IMAGES,
            BUILD_CONTEXT=os.getenv("BUILD_CONTEXT"),
            TEST_TRIGGER=test_trigger_job,
        )
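# Hedged sketch (assumption): tag_image_with_datetime() used above is not shown
# in this snippet; appending a UTC timestamp to the tag would be consistent
# with how the result is used as a Docker image tag.
from datetime import datetime

def tag_image_with_datetime(image_tag: str) -> str:
    timestamp = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    return f"{image_tag}-{timestamp}"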
Example #48
0
def train(args):
    timestamp=datetime.now().strftime('%Y%m%d%H%M')    
    # LOG #
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG, format="%(message)s")#,format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    tb_writer=None
    if args.visual:
        # make output directory if it doesn't already exist
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True)
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True)
        # create a file handler which logs even debug messages
        fh = logging.FileHandler(f"./output/{args.model}/{args.expname}/{timestamp}/logs.txt")
        logger.addHandler(fh)  # add the handler to the logger
        tb_writer = SummaryWriter(f"./output/{args.model}/{args.expname}/{timestamp}/logs/")
        # save arguments
        json.dump(vars(args), open(f'./output/{args.model}/{args.expname}/{timestamp}/args.json', 'w'))

    # Device #
    if args.gpu_id<0: 
        device = torch.device("cuda")
    else:
        device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() and args.gpu_id>-1 else "cpu")
    print(device)
    n_gpu = torch.cuda.device_count() if args.gpu_id<0 else 1
    print(f"num of gpus:{n_gpu}")
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    def save_model(model, epoch, timestamp):
        """Save model parameters to checkpoint"""
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True)
        ckpt_path=f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl'
        print(f'Saving model parameters to {ckpt_path}')
        torch.save(model.state_dict(), ckpt_path)

    def load_model(model, epoch, timestamp):
        """Load parameters from checkpoint"""
        ckpt_path=f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl'
        print(f'Loading model parameters from {ckpt_path}')
        model.load_state_dict(torch.load(ckpt_path))

    config = getattr(configs, 'config_'+args.model)()

    ###############################################################################
    # Load dataset
    ###############################################################################
    train_set=APIDataset(args.data_path+'train.desc.h5', args.data_path+'train.apiseq.h5', config['max_sent_len'])
    valid_set=APIDataset(args.data_path+'test.desc.h5', args.data_path+'test.apiseq.h5', config['max_sent_len'])
    train_loader=torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], shuffle=True, num_workers=1)
    valid_loader=torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'], shuffle=True, num_workers=1)
    print("Loaded dataset!")

    ###############################################################################
    # Define the models
    ###############################################################################
    model = getattr(models, args.model)(config) 
    if args.reload_from>=0:
        load_model(model, args.reload_from, timestamp)
    model=model.to(device)
    
    
    ###############################################################################
    # Prepare the Optimizer
    ###############################################################################
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]    
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['lr'], eps=config['adam_epsilon'])        
    scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=config['warmup_steps'], 
            num_training_steps=len(train_loader)*config['epochs']) # do not forget to update this count when the dataset changes

    ###############################################################################
    # Training
    ###############################################################################
    logger.info("Training...")
    itr_global=1
    start_epoch=1 if args.reload_from==-1 else args.reload_from+1
    for epoch in range(start_epoch, config['epochs']+1):

        epoch_start_time = time.time()
        itr_start_time = time.time()

        # shuffle (re-define) dataset between epochs

        for batch in train_loader:# loop through all batches in training dataset
            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)  
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config['clip'])
            optimizer.step()
            scheduler.step()
            model.zero_grad()

            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                log = '%s-%s|@gpu%d epo:[%d/%d] iter:%d step_time:%ds loss:%f'\
                %(args.model, args.expname, args.gpu_id, epoch, config['epochs'],itr_global, elapsed, loss)
                if args.visual:
                        tb_writer.add_scalar('loss', loss, itr_global)
                logger.info(log)

                itr_start_time = time.time()   

            if itr_global % args.valid_every == 0:
             
                model.eval()
                loss_records={}

                for batch in valid_loader:
                    batch_gpu = [tensor.to(device) for tensor in batch]
                    with torch.no_grad():
                        valid_loss = model.valid(*batch_gpu)    
                    for loss_name, loss_value in valid_loss.items():
                        v=loss_records.get(loss_name, [])
                        v.append(loss_value)
                        loss_records[loss_name]=v

                log = 'Validation '
                for loss_name, loss_values in loss_records.items():
                    log = log + loss_name + ':%.4f  '%(np.mean(loss_values))
                    if args.visual:
                        tb_writer.add_scalar(loss_name, np.mean(loss_values), itr_global)                 
                logger.info(log)    

            itr_global+=1        

            if itr_global % args.eval_every == 0:  # evaluate the model on the dev set
                model.eval()
                save_model(model, itr_global, timestamp)  # checkpoint at each evaluation step
                
                valid_loader=torch.utils.data.DataLoader(dataset=valid_set, batch_size=1, shuffle=False, num_workers=1)
                vocab_api = load_dict(args.data_path+'vocab.apiseq.json')
                vocab_desc = load_dict(args.data_path+'vocab.desc.json')
                metrics=Metrics()
                
                os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True)
                f_eval = open(f"./output/{args.model}/{args.expname}/{timestamp}/temp_results/iter{itr_global}.txt", "w")
                repeat = 1
                decode_mode = 'sample'
                recall_bleu, prec_bleu = evaluate(model, metrics, valid_loader, vocab_desc, vocab_api, repeat, decode_mode, f_eval)

                if args.visual:
                    tb_writer.add_scalar('recall_bleu', recall_bleu, itr_global)
                    tb_writer.add_scalar('prec_bleu', prec_bleu, itr_global)
                

        # end of epoch ----------------------------
        model.adjust_lr()
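# Hedged sketch (assumption): load_dict() used above simply reads a JSON
# vocabulary file (token -> id mapping) into a Python dict.
import json

def load_dict(path):
    with open(path, "r") as f:
        return json.load(f)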
Example #49
0
def test(config: Dict,
         vgpmil_model: vgpmil = None,
         rf_model: RandomForestClassifier = None,
         svm_model: SVC = None):
    print('Testing..')
    test_df = pd.read_csv(config['path_test_df'])
    print('Loaded test dataframe. Number of instances: ' + str(len(test_df)))
    features, bag_labels_per_instance, bag_names_per_instance, instance_labels = load_dataframe(
        test_df, config)
    bag_features, bag_labels, bag_names = get_bag_level_information(
        features, bag_labels_per_instance, bag_names_per_instance)

    metrics_calculator = Metrics(instance_labels, bag_labels, bag_names,
                                 bag_names_per_instance)

    if vgpmil_model is not None:
        print('Test VGPMIL')
        start = timeit.default_timer()
        instance_predictions, bag_predictions = vgpmil_model.predict(
            features, bag_names_per_instance, bag_names)
        end = timeit.default_timer()
        print('Average runtime per bag: ',
              str((end - start) / bag_predictions.size))
        metrics_calculator.calc_metrics(instance_predictions, bag_predictions,
                                        'vgpmil')
    if rf_model is not None:
        print('Test Random Forest')
        bag_predictions = rf_model.predict(bag_features)
        metrics_calculator.calc_metrics(np.array([]), bag_predictions,
                                        'random_forest')
    if svm_model is not None:
        print('Test SVM')
        bag_predictions = svm_model.predict(bag_features)
        metrics_calculator.calc_metrics(np.array([]), bag_predictions, 'svm')
    if config['use_models']['cnn'] == True:
        cnn_predictions, bag_cnn_predictions, bag_cnn_probability = load_cnn_predictions(
            test_df, config)
        metrics_calculator.calc_metrics(cnn_predictions, bag_cnn_probability,
                                        'cnn')

    metrics_calculator.write_to_file(config)
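# Hedged sketch (assumption): get_bag_level_information() used above is not
# shown; averaging instance features per bag and keeping one label and one
# name per bag would be consistent with how its outputs are consumed.
import numpy as np

def get_bag_level_information(features, bag_labels_per_instance, bag_names_per_instance):
    bag_names = np.unique(bag_names_per_instance)
    bag_features, bag_labels = [], []
    for name in bag_names:
        mask = bag_names_per_instance == name
        bag_features.append(features[mask].mean(axis=0))
        bag_labels.append(bag_labels_per_instance[mask][0])
    return np.array(bag_features), np.array(bag_labels), bag_names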
Example #50
0
def main():
    global args
    args = parse_args()
    args.input_dim, args.mem_dim = 300, 150
    args.hidden_dim, args.num_classes = 50, 5
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        print('Sparsity and weight decay are incompatible, pick one!')
        exit()
    print(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [
            os.path.join(split, 'a.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files_b = [
            os.path.join(split, 'b.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim,
                               args.mem_dim, args.hidden_dim, args.num_classes,
                               args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        print('==> Train    Loss: {}\tPearson: {}\tMSE: {}'.format(
            train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        print('==> Dev      Loss: {}\tPearson: {}\tMSE: {}'.format(
            dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        print('==> Test     Loss: {}\tPearson: {}\tMSE: {}'.format(
            test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            torch.save(
                checkpoint,
                '%s.pt' % os.path.join(args.save, args.expname + '.pth'))
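# Hedged sketch (assumption): a Pearson correlation of the kind
# Metrics.pearson() is used for above, on 1-D tensors of predicted and gold
# relatedness scores.
import torch

def pearson(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    x = predictions.float() - predictions.float().mean()
    y = labels.float() - labels.float().mean()
    return float((x * y).sum() / (x.norm(2) * y.norm(2)))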
Example #51
0
def main():
    """
    Runs a single entity resolution on data (real or synthetic) using a match function (logistic regression, decision
    tree, or random forest)
    """
    data_type = 'real'
    decision_threshold = 0.7
    train_class_balance = 0.5
    max_block_size = 1000
    cores = 2
    if data_type == 'synthetic':
        database_train = SyntheticDatabase(100, 10, 10)
        corruption = 0.1
        corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000,
                                                       database_train.database.feature_descriptor.number])
        database_train.corrupt(corruption_array)

        database_validation = SyntheticDatabase(100, 10, 10)
        corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000,
                                                       database_validation.database.feature_descriptor.number])
        database_validation.corrupt(corruption_array)

        database_test = SyntheticDatabase(10, 10, 10)
        corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000,
                                                       database_test.database.feature_descriptor.number])
        database_test.corrupt(corruption_array)
        labels_train = database_train.labels
        labels_validation = database_validation.labels
        labels_test = database_test.labels
        database_train = database_train.database
        database_validation = database_validation.database
        database_test = database_test.database
        single_block = True
    elif data_type == 'real':
        # Uncomment to use all features (annotations and LM)
        #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv')
        #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv')
        #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv')

        # Uncomment to only use annotation features
        #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv')
        #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv')
        #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv')

        # Uncomment to only use LM features
        database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv')
        database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv')
        database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv')

        labels_train = fast_strong_cluster(database_train)
        labels_validation = fast_strong_cluster(database_validation)
        labels_test = fast_strong_cluster(database_test)
        single_block = False
    else:
        raise Exception('Invalid experiment type: ' + data_type)

    entities = deepcopy(database_test)
    blocking_scheme = BlockingScheme(entities, max_block_size, single_block=single_block)

    train_seed = generate_pair_seed(database_train, labels_train, train_class_balance, require_direct_match=True, max_minor_class=5000)
    validation_seed = generate_pair_seed(database_validation, labels_validation, 0.5, require_direct_match=True, max_minor_class=5000)
    # forest_all = ForestMatchFunction(database_all_train, labels_train, train_seed, decision_threshold)
    # forest_all.test(database_all_validation, labels_validation, validation_seed)
    # tree_all = TreeMatchFunction(database_all_train, labels_train, train_seed, decision_threshold)
    # tree_all.test(database_all_validation, labels_validation, validation_seed)
    # logistic_all = LogisticMatchFunction(database_all_train, labels_train, train_seed, decision_threshold)
    # logistic_all.test(database_all_validation, labels_validation, validation_seed)

    forest_annotations = ForestMatchFunction(database_train, labels_train, train_seed, decision_threshold)
    roc = forest_annotations.test(database_validation, labels_validation, validation_seed)
    #roc.make_plot()
    #plt.show()

    # tree_annotations = TreeMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold)
    # tree_annotations.test(database_annotations_validation, labels_validation, validation_seed)
    # logistic_annotations = LogisticMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold)
    # logistic_annotations.test(database_annotations_validation, labels_validation, validation_seed)

    # forest_LM = ForestMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold)
    # forest_LM.test(database_LM_validation, labels_validation, validation_seed)
    # tree_LM = TreeMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold)
    # tree_LM.test(database_LM_validation, labels_validation, validation_seed)
    # logistic_LM = LogisticMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold)
    # logistic_LM.test(database_LM_validation, labels_validation, validation_seed)

    # forest_all.roc.write_rates('match_forest_all.csv')
    # tree_all.roc.write_rates('match_tree_all.csv')
    # logistic_all.roc.write_rates('match_logistic_all.csv')
    #
    # forest_annotations.roc.write_rates('match_forest_annotations.csv')
    # tree_annotations.roc.write_rates('match_tree_annotations.csv')
    # logistic_annotations.roc.write_rates('match_logistic_annotations.csv')
    #
    # forest_LM.roc.write_rates('match_forest_LM.csv')
    # tree_LM.roc.write_rates('match_tree_LM.csv')
    # logistic_LM.roc.write_rates('match_logistic_LM.csv')
    # ax = forest_all.roc.make_plot()
    # _ = tree_all.roc.make_plot(ax=ax)
    # _ = logistic_all.roc.make_plot(ax=ax)
    # plt.show()
    #forest_annotations.roc.make_plot()
    #plt.show()

    #entities.merge(strong_labels)

    #er = EntityResolution()
    #weak_labels = er.run(entities, match_function, blocking_scheme, cores=cores)
    weak_labels = weak_connected_components(database_test, forest_annotations, blocking_scheme)
    entities.merge(weak_labels)
    #strong_labels = fast_strong_cluster(entities)
    #entities.merge(strong_labels)

    # out = open('ER.csv', 'w')
    # out.write('phone,cluster_id\n')
    # for cluster_counter, (entity_id, entity) in enumerate(entities.records.iteritems()):
    #     phone_index = 21
    #     for phone in entity.features[phone_index]:
    #         out.write(str(phone)+','+str(cluster_counter)+'\n')
    # out.close()

    print 'Metrics using strong features as surrogate label. Entity resolution run using weak and strong features'
    metrics = Metrics(labels_test, weak_labels)
    # estimated_test_class_balance = count_pairwise_class_balance(labels_test)
    # new_metrics = NewMetrics(database_all_test, weak_labels, forest_all, estimated_test_class_balance)
    metrics.display()
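# Hedged sketch (assumption, written to run under the Python 2 style of this
# example): a pairwise precision/recall check of the kind the
# Metrics(labels_test, weak_labels) comparison above reports, where both label
# mappings map record identifiers to cluster ids.
from itertools import combinations

def pairwise_precision_recall(true_labels, predicted_labels):
    record_ids = list(true_labels.keys())
    true_pairs = set()
    predicted_pairs = set()
    for a, b in combinations(record_ids, 2):
        if true_labels[a] == true_labels[b]:
            true_pairs.add((a, b))
        if predicted_labels[a] == predicted_labels[b]:
            predicted_pairs.add((a, b))
    shared = len(true_pairs & predicted_pairs)
    precision = shared / float(len(predicted_pairs)) if predicted_pairs else 0.0
    recall = shared / float(len(true_pairs)) if true_pairs else 0.0
    return precision, recall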