Example #1
    def __init__(self, optimizer_id):
        self.map = 'sub-maxprob-thr50-2mm'  # This is the brain atlas we use for normalization (from HarvardOxford)
        self.CV = 4  # The number of random subsets to test our data on
        self.connectivity_metric = 'tangent'  # The type of functional connectivity extraction we use
        self.times_to_run = 3000  # Number of times that we randomly generate and test params before ending
        self.verbose = True  # Whether to print out model results after each iteration
        self.csv = True  # Whether to output results to a csv file or not
        self.estimator_chance = 0.5  # The chance that an estimator will be included

        self.shuffle_models = True  # Whether to shuffle the classifier order
        self.maxes = {  # Set maximum values for the random parameter generator
            'estimators': 2,
            'mlp_layers': 3,  # Maximum MLP Layers
            'mlp_nodes': 150,  # Maximum nodes in each MLP layer
            'xgb_trees': 130,  # Maximum number of XGB trees
            'rf_tress': 130,  # Maximum number of RF trees
            'early_stopping': 3  # Maximum number of early-stopping iterations
        }

        self.models_consider = {  # Which models to consider during optimization
            'rf': True,  # Include Random Forests?
            'xgb': True,  # Include XGB?
            'mlp': True,  # Include MLP?
            'svc': True,  # Include SVC?
            'logit': True,  # Include Logit?
        }

        self.classifier_atr_choices = {  # Certain choice attributes for classifiers
            'mlp': ['sgd', 'lbfgs', 'adam'],  # MLP solver choices
            'svc': ['rbf', 'linear', 'poly']  # SVC Kernel Choices
        }

        self.csvFile = 'optimizer_' + optimizer_id + '_metrics' + '.csv'
        # Set the CSV file name to include some useful information

        pickled_features, pickled_labels = check_and_get_pickled_data()  # Check and see if biomarkers are already
        # created

        try:
            if not pickled_features or not pickled_labels:  # If we don't already have the data cached locally
                masker = get_atlas_data(self.map)  # Generate a mask using the HarvardOxford atlas
                adhd_data = generate_train_data()  # Retrieve the ADHD200 data from disk
                masked_fmris = apply_masks(adhd_data.func, masker)
                features, adhd_labels = make_connectivity_biomarkers(self.connectivity_metric, adhd_data.labels,
                                                                     adhd_data, masked_fmris)
                # Calculate functional connectivity and combine phenotypic information as a feature. Returns a matrix
                # containing phenotypic information and computed functional connectivity -> features
            else:
                features, adhd_labels = pickled_features, pickled_labels  # If it is cached, retrieve it
        except ValueError:
            # Cached features are numpy arrays, whose truth value is ambiguous and raises ValueError in the
            # check above, so reaching this branch also means the cache exists and can be used directly
            features, adhd_labels = pickled_features, pickled_labels

        self.features = features
        self.labels = adhd_labels

        Helpers.write_attributes(optimizer_id, self.CV, self.times_to_run, self.estimator_chance, self.maxes,
                                 self.classifier_atr_choices, self.models_consider)
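
The cache check above relies on check_and_get_pickled_data(), which is not part of this listing. A minimal sketch of what it could look like, assuming the pickle paths written by make_connectivity_biomarkers in Example #3 ('pickles/features.pkl' and 'pickles/adhd_labels.pkl'); this is an illustration, not the project's actual implementation:

from os.path import exists
from pickle import load


def check_and_get_pickled_data():
    """Return cached (features, labels) if both pickle files exist, otherwise (None, None)."""
    feature_path, label_path = 'pickles/features.pkl', 'pickles/adhd_labels.pkl'  # assumed cache paths (see Example #3)
    if not (exists(feature_path) and exists(label_path)):
        return None, None
    with open(feature_path, 'rb') as features_file:
        features = load(features_file)
    with open(label_path, 'rb') as labels_file:
        labels = load(labels_file)
    return features, labels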
Example #2
    def run_hdiutil_command(self, *args, **kwargs):
        args = ['hdiutil'] + list(args)
        try:
            out = Helpers.run_command(*args, **kwargs)
        except Exception as e:
            # Some commands require the disk image to be mounted/unmounted; if run in the wrong state,
            # hdiutil fails with 'Resource temporarily unavailable', so toggle the mount state and retry
            if 'Resource temporarily unavailable' in str(e):
                if self.is_mounted():
                    self.detach()
                else:
                    self.attach()

            out = Helpers.run_command(*args, **kwargs)
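
Helpers.run_command is used throughout the disk-image examples but is not shown in this listing. A plausible sketch, assuming it simply shells out and raises with the command's combined output on failure (which is what the 'Resource temporarily unavailable' check above implies); illustrative only:

import subprocess


def run_command(*args, **kwargs):
    # Run the command, capture stdout and stderr together, and raise with the output on a non-zero exit
    process = subprocess.Popen(list(args), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs)
    output, _ = process.communicate()
    if process.returncode != 0:
        raise Exception(output)
    return output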
Example #3
def make_connectivity_biomarkers(kind, labels, adhd200, pooled_subjects):
    """
    This function takes the masked fMRI volumes and the corresponding phenotypic information (age, gender and
    dexterity) and turns them into a 2D array for ML classification. If no phenotypic information is available
    for a volume, we exclude it from the dataset.

    :param kind: (str) The type of functional connectivity we extract
    :param labels: (list) The truth values for the ADHD200 dataset
    :param adhd200: (ADHD200) The ADHD200 object
    :param pooled_subjects: (list) The masked fMRI volumes
    :return: (list) features, (list) labels
    """

    new_labels = []  # Labels for the fMRI volumes that have corresponding phenotypic information
    temp_features = []  # Features (phenotypic information + connectivity) for those same fMRI volumes

    conn_measure = ConnectivityMeasure(kind=kind, vectorize=True, discard_diagonal=True)  # Generate the functional
    # connectivity using the biomarker specified
    connectivity = conn_measure.fit_transform(pooled_subjects)  # Apply it to all of the masked fMRI scans

    bar = ProgressBar(max_value=len(adhd200.func))  # Instantiate a new progressbar
    ops = 0  # Set the default value of the bar to 0

    for index in range(len(adhd200.func)):
        phenotypic_information = Helpers.get_params(adhd200, adhd200.func[
            index])  # Retrieve the corresponding phenotypic information for each fMRI
        ops += 1  # Increment the bar by one
        bar.update(ops)  # Update the progressbar to the value of the variable "ops"
        if phenotypic_information is not None:  # If we found phenotypic information for that fMRI
            new_labels.append(labels[index])  # Add it to the "approved" labels list
            generated_features = np.array(
                [Helpers.conform_1d(phenotypic_information, connectivity[index].shape[0]), connectivity[index]])
            # Add the phenotypic information and the functional connectivity as a matrix. We have to
            # surround the phenotypic information by 0s to make it the same shape as the connectivity (conform 1d)
            temp_features.append(generated_features)  # add it to the temp features
        else:
            continue  # Skip that fMRI scan from the dataset

    d3_dataset = np.array(temp_features)  # Convert the 3D temp_features array to a numpy array
    nsamples, nx, ny = d3_dataset.shape  # Extract the dimensionality of the data
    d2_functional_connectivity = d3_dataset.reshape((nsamples, nx * ny))  # Convert it to 2 dimensions

    with open('pickles/features.pkl',
              'wb') as features_file:  # Cache the features so that we don't have to run this
        # function again
        dump(d2_functional_connectivity, features_file)  # Dump them to the pickle file

    with open('pickles/adhd_labels.pkl', 'wb') as labels_file:  # Cache the biomarkers so that we don't have to run this
        # function again
        dump(new_labels, labels_file)  # Dump them to the pickle file

    return d2_functional_connectivity, new_labels  # Return them
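
For context, a quick illustrative use of the returned features and labels; the LogisticRegression baseline and the cross_val_score call are assumptions for demonstration and are not the project's run_model pipeline, and the adhd200 object, adhd_labels and masked_fmris are assumed to have been prepared as in Example #1:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

features, labels = make_connectivity_biomarkers('tangent', adhd_labels, adhd200, masked_fmris)
scores = cross_val_score(LogisticRegression(max_iter=1000), features, labels, cv=4)  # simple baseline check
print('Mean CV accuracy: %.3f' % scores.mean())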
Example #4
    def size(self, size):

        size = Helpers.get_bytes(size) if isinstance(size, str) else size

        # General validation: make sure size is numeric and that enough space is available on disk
        if not Helpers.is_float(size):
            raise Exception('Invalid argument. Size must be a number')
        elif size >= Helpers.bytes_available():
            raise Exception('Invalid argument. Size is too large, not enough space.')

        # Only resize an existing image; on the first assignment, just record the value
        if self._size:
            self.run_hdiutil_command('resize', self.path, size=Helpers.hr_bytes(size))
        self._size = size
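
This reads like the body of a property setter on a disk-image class. A hypothetical sketch of how it would be wired up and used (the DiskImage name, constructor and simplified setter body are assumptions for illustration):

class DiskImage(object):
    """Hypothetical wrapper, only to show how the setter above plugs in."""

    def __init__(self, path):
        self.path = path
        self._size = None

    @property
    def size(self):
        return self._size

    @size.setter
    def size(self, size):
        # In the real class this is the validation + hdiutil resize logic shown in Example #4;
        # here we only record the value so the sketch stays self-contained
        self._size = size


image = DiskImage('/tmp/example.dmg')  # hypothetical path
image.size = 1024 * 1024  # with the real setter, a string like '1m' would also work via Helpers.get_bytes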
Example #5
    def diskutil_info(self):
        UTILITY_NAME = 'diskutil'
        self.attach()

        response = Helpers.run_command(UTILITY_NAME, 'info', self.get_mounting_point())
        non_empty_lines = [line for line in response.splitlines() if line != '']
        return {line.split(':')[0].lstrip(): line.split(':')[-1].strip() for line in non_empty_lines}
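
Illustrative use of the parsed dictionary returned above; the key names follow diskutil's own 'info' output (the same keys Example #10 maps from), and image is the hypothetical DiskImage instance sketched earlier:

info = image.diskutil_info()
print(info.get('Volume Name'))
print(info.get('File System Personality'))
print(info.get('Total Size'))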
Example #6
    def mp3(self, url):
        path = Helpers.check_platform('Music')
        self.create_path(path)
        ydl_opts = utils.ydl_options()
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(colored('Download Completed', 'green'))
Example #7
    def mp4(self, url):
        path = Helpers.check_platform('Videos')
        self.create_path(path)
        try:
            with youtube_dl.YoutubeDL({}) as ydl:
                ydl.download([url])
            print(colored('Download Completed', 'green'))
        except Exception:
            print(colored('Something went wrong, please try again', 'red'))
Example #8
    def playlist(self, url):
        playlist = pafy.get_playlist(url)
        path = Helpers.check_platform() + "/Playlists/{}".format(playlist['title'])
        playlists_dir = Helpers.check_platform('Playlists')
        if not os.path.isdir(playlists_dir):
            os.mkdir(playlists_dir)
        if not os.path.isdir(path):
            os.mkdir(path)
        os.chdir(path)
        utils.playlist_info(url)
        with youtube_dl.YoutubeDL({}) as ydl:
            ydl.download([url])
Example #9
    def run(self):

        """
        Run the model self.times_to_run times using random parameters. Export the results to a CSV file
        """

        for times in range(0, self.times_to_run):  # Loop through the number of times we have to run
            output = {  # Initialize an empty dictionary containing the results from each iteration
                'accuracies': [],
                'f1s_positive': [],
                'precisions_positive': [],
                'recalls_positive': [],
                'f1s_negative': [],
                'precisions_negative': [],
                'recalls_negative': [],
                'true_negative': [],
                'false_positive': [],
                'false_negative': [],
                'true_positive': []
            }

            random_attributes = self._random_args()  # Generate random model parameters
            layer_order = self._find_order(random_attributes)

            for cv_run in range(self.CV):
                accuracy, positive_metrics, negative_metrics, confusion_metrics = run_model(
                    self.features, self.labels,
                    random_attributes, verbose=True
                )
                #  Run the model and get metrics from that run
                output['accuracies'].append(accuracy)  # Add this iteration's metrics to the CV array
                output['f1s_positive'].append(positive_metrics['f1'])
                output['f1s_negative'].append(negative_metrics['f1'])
                output['precisions_positive'].append(positive_metrics['precision'])
                output['precisions_negative'].append(negative_metrics['precision'])
                output['recalls_negative'].append(negative_metrics['recall'])
                output['recalls_positive'].append(positive_metrics['recall'])
                output['true_negative'].append(confusion_metrics['true_negative'])
                output['false_positive'].append(confusion_metrics['false_positive'])
                output['false_negative'].append(confusion_metrics['false_negative'])
                output['true_positive'].append(confusion_metrics['true_positive'])

                print 'Ran {0} times (iteration {1})'.format(times, cv_run), random_attributes

            data = Helpers.generate_csv_data(layer_order, output['accuracies'],
                                             [output['f1s_negative'], output['f1s_positive']],
                                             [output['precisions_negative'], output['precisions_positive']],
                                             [output['recalls_negative'], output['recalls_positive']],
                                             [output['true_negative'], output['false_positive'],
                                              output['false_negative'], output['true_positive']])
            # Generate the dictionary for the CSV File

            self._csv_writer(random_attributes, data)  # Write the data to a file
Example #10
    def generate_data_model(self, image_info):
        ext = self.path.split('.')[-1]
        options = {
            'volname': 'Volume Name',
            'fs': 'File System Personality',
            'size': 'Total Size'
        }
        options = {k: image_info[v] for k, v in options.items()}
        str_size = ' '.join(options['size'].split(' ')[:2])

        options['size'] = Helpers.get_bytes(str_size)
        options['type'] = ext if ext != 'dmg' else 'UDIF'

        return options
Example #11
    def test_jss_stage(self):
        driver = self.driver
        WebDriverWait(driver, 10).until_not(lambda x: x.find_element_by_xpath(
            '//*[@style="display: block;"]').is_displayed())

        helper = Helpers()
        helper.field(self, driver.find_element(By.ID, l.search_ref),
                     time_stamp)
        driver.find_element_by_xpath(
            '//*[@id="downloadProjects-form"]//*[@data-target="list"]').click(
            )
        WebDriverWait(driver, 10).until_not(lambda el: el.find_element(
            By.XPATH, '//*[@id="tasks-table"]/tbody/tr[2]').is_displayed())
        time.sleep(2)
        project_ref_num = driver.find_element_by_xpath(
            '//*[@id="tasks-table"]/tbody/tr/td[7]/a')
        project_ref_num.click()
        time.sleep(2)
        WebDriverWait(driver, 10).until(
            lambda x: x.find_element_by_id(l.project_code).is_displayed())
        assert driver.find_element(By.ID, l.project_code).is_displayed()

        auth = Authenticate()
        auth.delete_project(time_stamp)
Example #12
    def _csv_writer(self, iteration_input, iteration_output):
        """
        Write outputs to csv file

        :param iteration_input: (dict) the parameters that were fed into the model
        :param iteration_output: (dict) the returned results from the CV run
        :return: None
        """
        if self.csv:
            is_new_file = not exists(self.csvFile)  # Check if the file exists so we know whether to write the header
            with open(self.csvFile, 'a') as csv_file:  # Open the CSV file for appending
                writer = DictWriter(csv_file,
                                    fieldnames=Helpers.fieldnames,
                                    delimiter=',')  # Initialize a CSV writer (using a dictionary)
                if is_new_file:
                    writer.writeheader()  # If the file is new, create a header
                writer.writerow(
                    Helpers.merge_two_dicts(iteration_input, iteration_output))  # Write a new merged dictionary
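
Helpers.merge_two_dicts is not shown in this listing; it presumably wraps the classic Python 2 recipe for merging two dictionaries without mutating either (assumption):

def merge_two_dicts(x, y):
    # Copy x and overlay y, so keys present in both take y's value and neither input is modified
    merged = x.copy()
    merged.update(y)
    return merged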
Example #13
def download_type():

    music_type = input(colored('Press A for Audio, V for Video, P for Playlist: > ', 'green'))
    print(colored('Processing your download request...', 'blue'))

    if music_type == "A" or music_type == "a":
        Helpers.save_url_(url)
        Audio.mp3(url)

    elif music_type == 'V' or music_type == 'v':
        Helpers.save_url_(url)
        Audio.mp4(url)

    elif music_type == 'P' or music_type == 'p':
        Helpers.save_url(url)
        Audio.playlist(url)
    else:
        main()
Example #14
def change_volname(old_name, new_name):
    Helpers.run_command('diskutil', 'rename', old_name, new_name)
Example #15
    def run_disk_util_command(self, *args, **kwargs):
        out = Helpers.run_command(*args, **kwargs)
Example #16
def main(save_path, params):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    sub2vec = params['sub2vec']
    subdict = params['subdic']
    dataset = params['data']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    sub_dim = params['sub_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']

    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_subs = sub_dim > 0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset,
                         no_training_set=False,
                         use_subs=use_subs,
                         subdict=subdict)

    print "building minibatch loaders ...", datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training,
                                                         BATCH_SIZE,
                                                         sample=1)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation,
                                                       BATCH_SIZE)

    print "building network ...", datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    W_init, embed_dim, = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    S_init, sub_dim = Helpers.load_sub_embeddings(data.dictionary[1], sub2vec)
    m = model.Model(nlayers, data.vocab_size, data.num_chars, W_init, S_init,
                    nhidden, embed_dim, dropout, train_emb, sub_dim, use_feat,
                    gating_fn)

    print "training ...", datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    num_iter = 0
    max_acc = 0.
    deltas = []

    logger = open(save_path + '/log', 'a', 0)

    if os.path.isfile('%s/best_model.p' % save_path):
        print 'loading previously saved model', datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        m.load_model('%s/best_model.p' % save_path)
        print "model loaded"
    else:
        print 'saving init model', datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        m.save_model('%s/model_init.p' % save_path)
        print 'loading init model', datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        m.load_model('%s/model_init.p' % save_path)
    for epoch in xrange(NUM_EPOCHS):
        print "epochs training ...", datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        estart = time.time()
        new_max = False
        for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_train:
            loss, tr_acc, probs = m.train(dw, dt, qw, qt, c, a, m_dw, m_qw, tt,
                                          tm, m_c, cl)

            message = "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" % (
                epoch, loss, tr_acc, time.time() - estart)
            print message
            logger.write(message + '\n')

            num_iter += 1
            if num_iter % VALIDATION_FREQ == 0:
                total_loss, total_acc, n, n_cand = 0., 0., 0, 0.

                for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_val:
                    outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm,
                                      m_c, cl)
                    loss, acc, probs = outs[:3]

                    bsize = dw.shape[0]
                    total_loss += bsize * loss
                    total_acc += bsize * acc
                    n += bsize
                print 'validate on', n, 'validation data'
                val_acc = total_acc / n
                if val_acc > max_acc:
                    max_acc = val_acc
                    m.save_model('%s/best_model.p' % save_path)
                    new_max = True
                message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f" % (
                    epoch, total_loss / n, val_acc, max_acc)
                print message
                logger.write(message + '\n')

        m.save_model('%s/model_%d.p' % (save_path, epoch))
        message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (
            epoch, tr_acc, val_acc)
        print message
        logger.write(message + '\n')

        # learning schedule
        if epoch >= 2:
            m.anneal()
        # stopping criterion
        if not new_max:
            break

    logger.close()
Example #17
    def _random_args(self, consider_mlp=True, consider_svc=True, consider_logit=True, consider_xgb=True,
                     consider_rf=True, consider_early_stopping=True):
        """
        Generate random parameters for testing

        :param consider_mlp: (bool) whether or not to include multi layer perceptron in the optimization
        :param consider_svc: (bool) whether or not to include SVC in the optimization
        :param consider_logit: (bool) whether or not to include Logistic Regression in the optimization
        :param consider_xgb: (bool) whether or not to include gradient boosting in the optimization
        :param consider_rf: (bool) whether or not to include random forests in the optimization
        :param consider_early_stopping: (bool) whether or not to randomize the number of "down" iterations required to stop a model's training
        :return: (dict) a dictionary containing the model parameters (MLP solver, MLP layers, XGB estimators, number of
            logistic regressions and SVC kernel)
        """

        mlp = Helpers.decision(probability=self.estimator_chance)  # Decide whether to include MLP
        svc = Helpers.decision(probability=self.estimator_chance)  # Decide whether to include SVC
        xgb = Helpers.decision(probability=self.estimator_chance)  # Decide whether to include XGB
        rf = Helpers.decision(probability=self.estimator_chance)  # Decide whether to include RF
        logit = choice([0, 1, 2, 3])  # Decide which logistic regression setup to use: none, group 1 only, both groups, or group 2 only
        active_classifiers = []  # Keep a count so we know how many active estimators we have

        if mlp and self.models_consider['mlp']:
            num_mlp = randint(0, self.maxes[
                'estimators'])  # How many MLP classifiers we should use in this round of testing
            mlp_layer_schema = []
            mlp_solvers = []
            for _ in range(num_mlp):  # Loop through all of the MLP classifiers we chose to consider
                temp_schema = []
                mlp_solvers.append(choice(self.classifier_atr_choices['mlp']))  # Randomly choose an MLP algorithm
                number_of_layers = randint(1, self.maxes['mlp_layers'])
                # Randomly generate the number of layers between 1 and 3
                active_classifiers.append(
                    Helpers.reversed_initial_structure['mlp'])  # Append so we keep track of active estimators
                for layer in range(number_of_layers):  # Loop through all of the layers
                    nodes_in_layer = randint(1, self.maxes['mlp_nodes'])
                    # Generate the random number of nodes in each layer (up to max)
                    temp_schema.append(nodes_in_layer)  # Add it to the array containing the MLP layer schema
                mlp_layer_schema.append(temp_schema)

        else:
            mlp_layer_schema = None
            mlp_solvers = None

        if svc and self.models_consider['svc']:
            num_svc = randint(0, self.maxes[
                'estimators'])  # Randomly generate the number of SVC classifiers we want to use
            svc_kernels = []
            for _ in range(num_svc):  # Loop through all of the SVC estimators we chose to consider
                svc_kernels.append(choice(self.classifier_atr_choices['svc']))  # Randomly choose a kernel for SVC
                active_classifiers.append(
                    Helpers.reversed_initial_structure['svc'])  # Append so we keep track of active estimators
        else:
            svc_kernels = None

        if xgb and self.models_consider['xgb']:
            num_xgb = randint(0, self.maxes['estimators'])
            xgb_estimators = []
            for _ in range(num_xgb):
                xgb_estimators.append(
                    randint(1, self.maxes['xgb_trees']))  # Random number of XGB trees, from 1 up to the configured maximum
                active_classifiers.append(
                    Helpers.reversed_initial_structure['xgb'])  # Append so we keep track of active estimators
        else:
            xgb_estimators = None

        if self.models_consider['logit']:
            if logit == 1:
                number_of_logit_regressions = randint(1, self.maxes['estimators']), 0  # Generate one group of logistic
                # regressions but leave the other blank
                for _ in range(number_of_logit_regressions[0]):
                    active_classifiers.append(
                        Helpers.reversed_initial_structure['logit1'])  # Append so we keep track of active estimators

            elif logit == 2:
                number_of_logit_regressions = randint(1, self.maxes['estimators']), randint(0, self.maxes['estimators'])
                # Generate two groups of a random number of logistic regressions
                for _ in range(number_of_logit_regressions[0]):
                    active_classifiers.append(
                        Helpers.reversed_initial_structure['logit1'])  # Append so we keep track of active estimators
                for _ in range(number_of_logit_regressions[1]):
                    active_classifiers.append(
                        Helpers.reversed_initial_structure['logit2'])  # Append so we keep track of active estimators

            elif logit == 3:
                number_of_logit_regressions = 0, randint(1, self.maxes['estimators'])  # Generate one group of logistic
                # regressions but leave the other blank
                for _ in range(number_of_logit_regressions[1]):
                    active_classifiers.append(
                        Helpers.reversed_initial_structure['logit2'])  # Append so we keep track of active estimators

            else:
                number_of_logit_regressions = 0, 0
        else:
            number_of_logit_regressions = 0, 0  # Keep the tuple defined even when logit models are not considered

        active_classifiers.append(
            Helpers.reversed_initial_structure[
                'logit3'])  # Add the constant logistic regressions to the active classifiers list

        if rf and consider_rf:
            number_of_rfs = randint(0, self.maxes['estimators'])  # Generate the number of RFs to consider
            rf_estims = []
            for _ in range(number_of_rfs):  # For each RF classifier
                active_classifiers.append(
                    Helpers.reversed_initial_structure['rf'])  # Add it to the active classifiers list
                rf_estims.append(randint(1, self.maxes['rf_tress']))  # Add the random number of estimators
        else:
            rf_estims = None

        if consider_early_stopping:
            early_stopping_iterations = randint(1, self.maxes['early_stopping'])  # Randomly select the number of layers
            # required to stop the model iteration
        else:
            early_stopping_iterations = 2

        if self.shuffle_models:
            positions = sample(active_classifiers, len(active_classifiers))
            # Shuffle the indexes of estimators in the config
        else:
            positions = active_classifiers

        final_parameters = {
            'mlp_layers': mlp_layer_schema,
            'mlp_solver': mlp_solvers,
            'svc_kernel': svc_kernels,
            'xgb_estimators': xgb_estimators,
            'logistic_regressions': number_of_logit_regressions,
            'rf_estimators': rf_estims,
            'early_stopping_iterations': early_stopping_iterations,
            'positions': positions,
        }
        print final_parameters
        return final_parameters  # Return the final dictionary as a result
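
Helpers.decision is called throughout _random_args but is not included in the listing; it is presumably a simple weighted coin flip along these lines (assumption):

from random import random


def decision(probability):
    # Return True with the given probability, e.g. probability=0.5 behaves like a fair coin flip
    return random() < probability

Example #18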
def main(load_path, params, mode='test'):

    regularizer = params['regularizer']
    rlambda = params['lambda']
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    subsample = params['subsample']
    base_model = params['model']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']

    # load settings
    shutil.copyfile('%s/config.py' % load_path, 'config.py')

    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset)
    inv_vocab = data.inv_dictionary

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE, data.dictionary)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE, data.dictionary)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = eval(base_model).Model(nlayers, data.vocab_size, data.num_chars,
                               W_init, regularizer, rlambda, nhidden,
                               embed_dim, dropout, train_emb, subsample,
                               char_dim, use_feat, data.dictionary[4])
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames, match_feat, use_char, use_char_q in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl,
                          match_feat, use_char, use_char_q)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]
                  ]  # store one attention

        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    logger = open(load_path + '/log', 'a', 0)
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print message
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pkl.dump(attns, open('%s/%s.attns' % (load_path, mode), 'w'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
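Example #19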
def main(save_path, params):

    regularizer = params['regularizer']
    rlambda = params['lambda']
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    subsample = params['subsample']
    base_model = params['model']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    train_cut = params['train_cut']
    gating_fn = params['gating_fn']

    # save settings
    shutil.copyfile('config.py','%s/config.py'%save_path)

    use_chars = char_dim>0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, use_chars=use_chars)

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training, BATCH_SIZE, data.dictionary,
            sample=train_cut, max_qry_len=85)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation, BATCH_SIZE, data.dictionary, max_qry_len=85)
    batch_loader_test = MiniBatchLoader.MiniBatchLoader(data.test, BATCH_SIZE, data.dictionary)

    print("building network ...")
    W_init, embed_dim, = Helpers.load_word2vec_embeddings(data.dictionary[0], word2vec)
    m = eval(base_model).Model(nlayers, data.vocab_size, data.num_chars, W_init, 
        regularizer, rlambda, nhidden, embed_dim, dropout, train_emb, subsample, 
            char_dim, use_feat, data.dictionary[4])

    print("training ...")
    num_iter = 0
    max_acc = 0.
    deltas = []
    test_acc = 0.

    logger = open(save_path+'/log','a',0)

    # if os.path.isfile('%s/best_model.p'%save_path):
    #     print('loading previously saved model')
    #     m.load_model('%s/best_model.p'%save_path)
    # else:
    #     print('saving init model')
    #     m.save_model('%s/model_init.p'%save_path)
    #     print('loading init model')
    #     m.load_model('%s/model_init.p'%save_path)

    for epoch in xrange(NUM_EPOCHS):
        estart = time.time()
        new_max = False

        for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames, match_feat, use_char, use_char_q in batch_loader_train:
            loss, tr_acc, probs = m.train(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl, match_feat, use_char, use_char_q)

            # message = "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" % (
            #         epoch, loss, tr_acc, time.time()-estart)
            # print message
            # logger.write(message+'\n')

            if num_iter % VALIDATION_FREQ == 0:
                total_loss, total_acc, n, n_cand = 0., 0., 0, 0.

                for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames, match_feat, use_char, use_char_q in batch_loader_val:
                    outs = m.validate(dw, dt, qw, qt, c, a, 
                            m_dw, m_qw, tt, tm, m_c, cl, match_feat, use_char, use_char_q)
                    loss, acc, probs = outs[:3]

                    bsize = dw.shape[0]
                    total_loss += bsize*loss
                    total_acc += bsize*acc
                    n += bsize

                val_acc = total_acc/n
                if val_acc > max_acc:
                    max_acc = val_acc
                    m.save_model('%s/best_model.p'%save_path)

                    temp_acc, temp_n = 0.0, 0

                    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames, match_feat, use_char, use_char_q in batch_loader_test:
                        outs = m.validate(dw, dt, qw, qt, c, a, 
                            m_dw, m_qw, tt, tm, m_c, cl, match_feat, use_char, use_char_q)
                        _, acc, _ = outs[:3]
                        bsize = dw.shape[0]
                        temp_acc += bsize * acc
                        temp_n += bsize

                    test_acc = temp_acc / temp_n

                    new_max = True
                message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f test=%.4f" % (
                    epoch, total_loss/n, val_acc, max_acc, test_acc)
                print message
                logger.write(message+'\n')

            num_iter += 1

        m.save_model('%s/model_%d.p'%(save_path,epoch))
        message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (epoch, tr_acc, val_acc)
        print message
        logger.write(message+'\n')
        
        # learning schedule
        if epoch >=2:
            m.anneal()
        # stopping criterion
        if not new_max:
            break

    logger.close()
Example #20
import argparse
import os
import sys
from pathlib import Path

import pdfplumber
from tld import is_tld
from tld.utils import update_tld_names

from utils import Helpers, Termcolors

__author__ = "DFIRSec (@pulsecode)"
__version__ = "v0.0.8"
__description__ = "Extract Indicators of Compromise (IOCs) from PDF documents."

helper = Helpers()
tc = Termcolors()

# update/sync tld names
update_tld_names()

# Base directory
parent = Path(__file__).resolve().parent


def extractor(pdf):
    size = os.path.getsize(pdf)
    large = round(size / (1024 * 1024))
    if size > 10240000:
        sys.exit(
            f"{tc.RED}[ERROR]{tc.RESET} Limit file size to 10 MB or less. Your file is {large:,} MB."
Example #21
def main(load_path, params, mode='test'):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config_test.py' % load_path)
    use_chars = char_dim > 0

    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        #dataset_path = "/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/"
        #dataset_path = "data/"
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=True)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=True)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path, no_training_set=True)
    inv_vocab = data.inv_dictionary

    assert os.path.exists(params["test_file"] if mode ==
                          "test" else params["validation_file"])

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE)
    f_to_cand = {i[-1]: i[3] for i in batch_loader_test.questions}

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers,
                       data.vocab_size,
                       data.num_chars,
                       W_init,
                       nhidden,
                       embed_dim,
                       dropout,
                       train_emb,
                       char_dim,
                       use_feat,
                       gating_fn,
                       save_attn=False)
    print("model load path")
    print('%s/best_model.p' % load_path)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    pred_ans = {}
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]
                  ]  # store one attention

        for f in range(len(fnames)):
            pred_cand = probs[f].argmax()
            pred_a_ids = f_to_cand[fnames[f]][pred_cand]
            pred_a = " ".join([inv_vocab[i] for i in pred_a_ids])
            if ent_setup == "ent-anonym" and (dataset == "clicr"
                                              or dataset == "clicr_novice"):
                relabeling_dicts = data.test_relabeling_dicts if mode == 'test' else data.val_relabeling_dicts
                pred_a = relabeling_dicts[fnames[f]][pred_a]
            pred_ans[fnames[f]] = pred_a

        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    if (params["dataset"] == "clicr" or params["dataset"] == "clicr_plain" or params["dataset"] == "clicr_novice") \
            and (mode == 'test' or mode == 'validation'):
        print("writing predictions")
        preds_data = utils.to_output_preds(pred_ans)
        preds_filepath = load_path + '/{}.preds'.format(mode)
        utils.write_preds(preds_data, file_name=preds_filepath)
        utils.external_eval(preds_filepath,
                            preds_filepath + ".scores",
                            params["test_file"]
                            if mode == "test" else params["validation_file"],
                            extended=True)
    logger = open(load_path + '/log.test', 'a')
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print(message)
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pickle.dump(attns, open('%s/%s.attns' % (load_path, mode), 'wb'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
Example #22
def main(save_path, params):

    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    out = 'out'

    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=False, use_chars=use_chars)
    word_dictionary = data.dictionary[0]
    the_index = word_dictionary['the']
    #print('the index : {}'.format(word_dictionary['the']))

    idx_to_word = dict([(v, k) for (k, v) in word_dictionary.iteritems()])
    words = [idx_to_word[i] for i in sorted(idx_to_word.keys())]

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training,
                                                         BATCH_SIZE,
                                                         sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation,
                                                       BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim, = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    #print('the embedding : {}'.format(W_init[the_index]))
    #print(W_init[0:5])

    print("running GAReader ...")

    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn, words).build_network()
    m.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE,
                                                 clipnorm=GRAD_CLIP),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy])
    #tf.enable_eager_execution(config=tf.ConfigProto(allow_soft_placement = True))
    with tf.Graph().as_default():
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            K.set_session(sess)
            #with tf.device('/gpu:0:'):
            tensorboard = TensorBoardCustom(log_dir="logs", words=words)
            modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
                'output/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
            writer = tf.summary.FileWriter("logs")

            def schedule(epoch, lr):

                if epoch >= 3:
                    return lr * 0.5
                else:
                    return lr

            lrate = LearningRateScheduler(schedule, verbose=1)

            for epoch in xrange(NUM_EPOCHS):
                for (inputs, a) in batch_loader_train:
                    [dw, qw, m_dw, m_qw, c, m_c, cl] = inputs
                    m = GAReader.Model(nlayers, data.vocab_size,
                                       data.num_chars, W_init, nhidden,
                                       embed_dim, dropout, train_emb, char_dim,
                                       use_feat, gating_fn,
                                       words).build_network()
                    m.compile(optimizer=tf.keras.optimizers.Adam(
                        lr=LEARNING_RATE, clipnorm=GRAD_CLIP),
                              loss=tf.keras.losses.categorical_crossentropy,
                              metrics=[tf.keras.metrics.categorical_accuracy])
                    #print(dw.shape)
                    #print('dw : {}'.format(dw))
                    #print('qw : {}'.format(qw))
                    #print('m_dw : {}'.format(m_dw))
                    #print('m_qw : {}'.format(m_qw))
                    #print('c : {}'.format(c))
                    #print([idx_to_word[i] for i in dw[0, :, 0].tolist()])
                    train_summary = m.train_on_batch(
                        inputs,
                        to_categorical(a, batch_loader_train.max_num_cand))
                    print(m.get_weights()[0])
                    print('epoch: {}, train loss: {}, train acc: {}'.format(
                        epoch, train_summary[0], train_summary[1]))
                    lr = tf.summary.scalar('learning_rate', LEARNING_RATE)
                    summary = tf.summary.merge_all()
                    s = sess.run(summary)
                    writer.add_summary(s)
                writer.close()
Example #23
import youtube_dl
import os
import getpass
import pafy
from termcolor import colored
from utils import Helpers

user = getpass.getuser()
utils = Helpers()


class Music:
    @staticmethod
    def create_path(path):
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)

    def mp3(self, url):
        path = Helpers.check_platform('Music')
        self.create_path(path)
        ydl_opts = utils.ydl_options()
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(colored('Download Completed', 'green'))

    def mp4(self, url):
        path = Helpers.check_platform('Videos')
        self.create_path(path)
        try:
Example #24
def main(load_path, params, mode='test'):

    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']

    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=True)
    inv_vocab = data.inv_dictionary

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = Reader.Model(nlayers,
                     data.vocab_size,
                     data.num_chars,
                     W_init,
                     nhidden,
                     embed_dim,
                     dropout,
                     train_emb,
                     char_dim,
                     use_feat,
                     gating_fn,
                     save_attn=True)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    total_loss, total_acc, n = 0., 0., 0
    result = {}
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]
                  ]  # store one attention

        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

        answer = probs.argmax(1)
        for it in range(len(fnames)):
            tid = fnames[it].split('/')[-1].split('.')[0].strip()
            result[int(tid)] = answer[it]
            print tid, answer[it]
        print('probs----', probs)
        #print('a----', a)
        print('fnames----', fnames)

    print len(result)
    with open('raw.txt', 'w') as ff:
        for i in range(1, 2501):
            ff.write(str(result[i]) + '\n')

    logger = open(load_path + '/log', 'a', 0)
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print message
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pkl.dump(attns, open('%s/%s.attns' % (load_path, mode), 'w'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
Example #25
def main(load_path, params, mode='test'):

    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['data']
    nlayers = params['nlayers']
    sub2vec = params['sub2vec']
    train_emb = params['train_emb']
    sub_dim = params['sub_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    use_subs = sub_dim > 0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=True, use_subs=use_subs)
    inv_vocab = data.inv_dictionary

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    S_init, sub_dim = Helpers.load_sub_embeddings(data.dictionary[1], sub2vec)
    m = model.Model(nlayers,
                    data.vocab_size,
                    data.num_chars,
                    W_init,
                    S_init,
                    nhidden,
                    embed_dim,
                    dropout,
                    train_emb,
                    sub_dim,
                    use_feat,
                    gating_fn,
                    save_attn=True)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]
                  ]  # store one attention
        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc
        fids += fnames
        n += bsize
        print("step" + str(n) + ",acc" + str(acc))

    logger = open(load_path + '/log', 'a', 0)
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print message
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pkl.dump(attns, open('%s/%s.attns' % (load_path, mode), 'w'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
Example #26
def main(save_path, params, mode='train'):

    word2vec = params['word2vec']
    dataset = params['dataset']

    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess_rc(params, dataset)

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoaderMention(
        params, data.training, params['batch_size'])
    batch_loader_val = MiniBatchLoader.MiniBatchLoaderMention(
        params,
        data.validation,
        params['batch_size'],
        shuffle=False,
        ensure_answer=False)
    batch_loader_test = MiniBatchLoader.MiniBatchLoaderMention(
        params,
        data.test,
        params['batch_size'],
        shuffle=False,
        ensure_answer=False)

    print("building network ...")
    W_init, embed_dim, = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GA.Model(params, W_init, embed_dim)

    print("training ...")
    num_iter = 0
    max_acc = 0.0
    min_loss = 1e5

    logger = open(save_path + '/log', 'a', 0)
    train_writer = tf.summary.FileWriter(os.path.join(save_path, 'train'))
    val_writer = tf.summary.FileWriter(os.path.join(save_path, 'val'))

    if params['reload_']:
        print('loading previously saved model')
        saves = pkl.load(open('%s/checkpoints.p' % save_path))
        m.load_model('%s/best_model.p' % save_path, saves[-1])

    # train
    if mode == 'train':
        saves = []
        for epoch in xrange(params['num_epochs']):
            estart = time.time()
            stop_flag = False

            for example in batch_loader_train:
                loss, tr_acc, probs, summary = m.train(*example[:-2])

                if num_iter % params['logging_frequency'] == 0:
                    message = (
                        "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" %
                        (epoch, loss, tr_acc, time.time() - estart))
                    print(message)
                    logger.write(message + '\n')
                    train_writer.add_summary(summary, num_iter)

                num_iter += 1
                if num_iter % params['validation_frequency'] == 0:
                    total_loss, total_acc, n = 0., 0., 0.

                    for example in batch_loader_val:
                        outs = m.validate(*example[:-2])
                        loss, acc, probs = outs[:3]

                        bsize = example[0].shape[0]
                        total_loss += bsize * loss
                        total_acc += bsize * acc
                        n += bsize

                    val_acc = total_acc / n
                    print("11111111111   ", val_acc)
                    if val_acc > max_acc:
                        max_acc = val_acc
                        save_id = num_iter
                        print("111111111111111111111111111111")
                        sv = m.save_model('%s/best_model.p' % save_path,
                                          save_id)
                        saves.append(save_id)
                        new_max = True

                    val_loss = total_loss / n
                    message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f" % (
                        epoch, val_loss, val_acc, max_acc)
                    print(message)
                    logger.write(message + '\n')

                    _add_summary(val_writer, val_loss, "loss", num_iter)
                    _add_summary(val_writer, val_acc, "accuracy", num_iter)

                    # stopping
                    if val_loss < min_loss: min_loss = val_loss
                    if params['stopping_criterion'] and (
                            val_loss - min_loss) / min_loss > 0.3:
                        stop_flag = True
                        break

                if num_iter % params["anneal_frequency"] == 0:
                    m.anneal()

            #m.save_model('%s/model_%d.p'%(save_path,epoch))
            message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (
                epoch, tr_acc, max_acc)
            print(message)
            logger.write(message + '\n')

            if stop_flag: break
        # record all saved models
        pkl.dump(saves, open('%s/checkpoints.p' % save_path, 'w'))

    # test
    mode = 'test' if mode in ['train', 'test'] else 'val'
    print("testing ...")
    try:
        saves = pkl.load(open('%s/checkpoints.p' % save_path))
        print('%s/checkpoints.p' % save_path)
    except IOError:

        def _to_num(foo):
            try:
                num = int(foo)
            except ValueError:
                return None
            return num

        saves = []
        for directory in os.listdir(save_path):
            if not os.path.isdir(os.path.join(save_path, directory)): continue
            num = _to_num(directory)
            if num is None: continue
            saves.append(num)

        saves = sorted(saves)
    print("saves111111", saves)
    if not saves:
        print("No models saved during training!")
        return
    print('loading model')
    m.load_model('%s/best_model.p' % save_path, saves[-1])

    total_loss, total_acc, n = 0., 0., 0
    answer_structure = {}
    idict = data.inv_dictionary
    for example in batch_loader_val:
        outs = m.validate(*example[:-2])
        loss, acc, probs = outs[:3]

        pred_indices = np.argmax(probs, axis=1)
        for i in range(len(example[-1])):
            cname = str(example[-1][i]).strip()
            gt_answer = example[10][i]
            answer_structure[cname] = (pred_indices[i], gt_answer, probs[i, :])

        bsize = example[0].shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        n += bsize
    test_acc = total_acc / n
    test_loss = total_loss / n
    message = "TEST loss=%.4e acc=%.4f" % (test_loss, test_acc)
    print(message)
    logger.write(message + '\n')
    pkl.dump(answer_structure,
             open(os.path.join(save_path, "test_answer_structure.p"), "w"))

    logger.close()

    # clean up
    print("Cleaning up saved models ...")
Example #27
def main(save_path, params):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']  # ent, ent-anonym, no-ent
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0

    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        data = dp.preprocess(
            #"/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/",
            data_path,
            ent_setup=ent_setup,
            no_training_set=False,
            use_chars=use_chars)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=False,
                             use_chars=use_chars)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path,
                             no_training_set=False,
                             use_chars=use_chars)

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training,
                                                         BATCH_SIZE,
                                                         sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation,
                                                       BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim, = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn)

    print("training ...")
    num_iter = 0
    max_acc = 0.
    deltas = []

    logger = open(save_path + '/log', 'a')

    if os.path.isfile('%s/best_model.p' % save_path):
        print('loading previously saved model')
        m.load_model('%s/best_model.p' % save_path)
    else:
        print('saving init model')
        m.save_model('%s/model_init.p' % save_path)
        print('loading init model')
        m.load_model('%s/model_init.p' % save_path)

    for epoch in range(NUM_EPOCHS):
        estart = time.time()
        new_max = False

        for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_train:
            loss, tr_acc, probs = m.train(dw, dt, qw, qt, c, a, m_dw, m_qw, tt,
                                          tm, m_c, cl)

            message = "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" % (
                epoch, loss, tr_acc, time.time() - estart)
            print(message)
            logger.write(message + '\n')

            num_iter += 1
            if num_iter % VALIDATION_FREQ == 0:
                total_loss, total_acc, n, n_cand = 0., 0., 0, 0.

                for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_val:
                    outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm,
                                      m_c, cl)
                    loss, acc, probs = outs[:3]

                    bsize = dw.shape[0]
                    total_loss += bsize * loss
                    total_acc += bsize * acc
                    n += bsize
                val_acc = total_acc / n
                if val_acc > max_acc:
                    max_acc = val_acc
                    m.save_model('%s/best_model.p' % save_path)
                    new_max = True
                message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f" % (
                    epoch, total_loss / n, val_acc, max_acc)
                print(message)
                logger.write(message + '\n')

        # m.save_model('%s/model_%d.p'%(save_path,epoch))
        message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (
            epoch, tr_acc, val_acc)
        print(message)
        logger.write(message + '\n')

        # learning schedule
        if epoch >= 2:
            m.anneal()
        # stopping criterion
        if not new_max:
            break

    logger.close()
Example #28
    def test_create_project(self):
        driver = self.driver
        WebDriverWait(driver, 10).until_not(lambda x: x.find_element_by_xpath(
            '//*[@style="display: block;"]').is_displayed())

        # project menu: get and click
        self.project_menu = driver.find_element_by_id(l.projectTool)
        self.assertTrue(self.project_menu.is_displayed()
                        and self.project_menu.is_enabled())
        self.project_menu.click()

        # create project click
        WebDriverWait(
            driver,
            10).until(lambda el: el.find_element_by_id(l.CREATE_PROJECT))
        driver.find_element(By.ID, l.CREATE_PROJECT).click()

        # fill mandatory fields
        helper = Helpers()
        helper.field(self, driver.find_element(By.ID, l.project_code),
                     time_stamp)
        helper.field(self, driver.find_element(By.ID, l.project_note),
                     time_stamp)
        helper.field(self, driver.find_element(By.ID, l.project_address),
                     time_stamp)
        helper.field(self, driver.find_element(By.ID, l.project_client),
                     time_stamp)
        helper.field(self, driver.find_element(By.ID, l.project_contact),
                     time_stamp)
        print(time_stamp)

        driver.find_element_by_id(l.project_add_button).click()
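
The Helpers.field method used by these Selenium tests is not part of the listing; a plausible sketch, assuming it clears the element, types the value, and lets the calling test assert that the value was accepted (illustrative only):

class Helpers(object):
    def field(self, test, element, value):
        # Clear the input, type the new value, and verify it landed using the test case's assertions
        element.clear()
        element.send_keys(value)
        test.assertEqual(element.get_attribute('value'), str(value))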