Example #1
def save_image(fig, name, in_tmp, step):
    if in_tmp:  # step should be an int if in_tmp is True
        path = config.tmp_loc + 'step/step_' + str(step) + '/'
        func.create_directory(path)
        plt.savefig(path + name)
    else:
        plt.savefig(config.parent_dir + 'data/images/' + name)
        with open(config.parent_dir + 'data/saved/' + name + '.pkl', 'wb') as f:
            pickle.dump(fig, f)
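A brief usage sketch, assuming matplotlib.pyplot is imported as plt and that the config and func modules referenced above are importable; the figure contents and file names are illustrative only:

import matplotlib.pyplot as plt

fig = plt.figure()
plt.plot([0, 1, 2], [3.0, 1.5, 4.2])

# during a run, drop the image into the per-step tmp directory
save_image(fig, 'effort_curve.png', in_tmp=True, step=3)

# at the end of a run, store the image plus a pickled copy of the figure
save_image(fig, 'effort_curve.png', in_tmp=False, step=None)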
Example #2
    def __init__(self):
        # Assign the application and resources directories
        self.application_directory = data.application_directory
        self.resources_directory = data.resources_directory
        # Create the settings file path
        functions.create_directory(data.settings_directory)
        self.settings_filename_with_path = functions.unixify_path_join(
            data.settings_directory,
            self.settings_filename
        )
        # Check if the settings file exists
        if self.check_settings_file() is None:
            # Create the settings file
            self.create_settings_file(self.empty_settings_list)
        # Load the settings from the settings file
        self.load_settings()
Example #3
def createCacheData(outputFile="cachedData.npz"):
	if outputFile.find('.') == -1:
		outputFile += ".npz"
	outputFile = os.path.join(DATA_FOLDER, outputFile)

	# Prevent overwriting an existing cache file
	assert not os.path.isfile(outputFile), "cache file already exists: " + outputFile

	create_directory(DATA_FOLDER)
	noteStateSeq = musicFolderToNoteStateSeq(TRAIN_MUSIC_FOLDER)
	wordIdxToNoteState, wordIdxToCount = loadVocabularyData(noteStateSeq)

	print("Creating data cache")
	print("-------------------")
	np.savez(outputFile,
			 noteStateSeq=noteStateSeq,
			 wordIdxToNoteState=wordIdxToNoteState,
			 wordIdxToCount=wordIdxToCount)
	print("Data cache created: {}".format(outputFile))
Example #4
def main(run_again, with_dnn, with_prompt, override, with_collect=True):
    # with_collect: whether or not to run collect.py after the run is finished

    if with_prompt:
        print("Are you sure you have checked the following variables?")
        print(" - with_collect (run.py)")
        print(" - with_dnn (run.py")
        print(" - already_trained (run.py")
        print(" - has_past (pack_data.py)")
        check = input('(y/n) ')
        if not check in ['y', 'Y']:
            raise Exception("please check your variables!")
        print()

    # start runtime
    start_prog = timeit.default_timer()

    # ensure current working directory is in src folder
    if os.getcwd()[-3:] != 'src':
        # assuming we are somewhere inside the git directory
        path = s.Popen('git rev-parse --show-toplevel',
                       shell=True,
                       stdout=s.PIPE).communicate()[0].decode("utf-8")[:-1]
        print('changing working directory from', os.getcwd(), 'to', path)
        os.chdir(path + '/src')

    print('Run Args:', sys.argv[:])
    # if user wants to pass in arguments
    if len(sys.argv) > 1 and sys.argv[1] == 'master':
        init.use_fund = sys.argv[2] == 'True'
        if len(sys.argv) == 5:
            init.switch = int(sys.argv[3])
            init.has_past = sys.argv[4] == 'True'
    else:
        if len(sys.argv) >= 5:  # config 1
            init.time_periods = int(sys.argv[1])
            init.ideas_per_time = int(sys.argv[2])
            init.N = int(sys.argv[3])
            init.time_periods_alive = int(sys.argv[4])
        if len(sys.argv) >= 8:  # config 2
            init.prop_sds = float(sys.argv[5])
            init.prop_means = float(sys.argv[6])
            init.prop_start = float(sys.argv[7])
        if len(sys.argv) == 15:  # server config
            init.true_means_lam = float(sys.argv[5])
            init.prop_sds = float(sys.argv[6])
            init.prop_means = float(sys.argv[7])
            init.prop_start = float(sys.argv[8])
            init.switch = float(sys.argv[9])
            # sys.argv[10] is empty for now
            init.all_scientists = sys.argv[11] == 'True'
            init.use_equal = sys.argv[12] == 'True'
            init.use_idea_shift = sys.argv[13] == 'True'
            init.show_step = sys.argv[14] == 'True'
        if not override:
            if run_again:
                init.switch = 2  # need bayesian stats to train neural net the first time
            else:
                init.switch = 4  # prefer neural net over bayesian stats if already trained

    # check if we are using batch runs
    if os.path.isdir('tmp_batch'):
        init.tmp_loc = 'tmp_batch/tmp_' + '_'.join(
            [str(v) for v in sys.argv[1:]]) + '/'

    # so that config file loads after init.py is set
    import config, collect
    import model as m
    import functions as func

    func.create_directory(config.parent_dir + 'data/')
    func.create_directory(config.tmp_loc)
    with open(config.tmp_loc + 'start_prog.txt', 'w') as f:
        f.write('%d' % time.time())

    config.start = timeit.default_timer()

    # default parameters for model as a dictionary
    all_params = {
        "seed": config.seed,
        "use_multiprocessing": config.use_multiprocessing,
        "use_fund": config.use_fund,
        "optimization": config.switch,
        "time_periods": config.time_periods,
        "ideas_per_time": config.ideas_per_time,
        "N": config.N,
        "use_store_model": config.use_store_model,
        "time_periods_alive": config.time_periods_alive,
        "true_means_lam": config.true_means_lam,
        "true_sds_lam": config.true_sds_lam,
        "start_effort_lam": config.start_effort_lam,
        "k_lam": config.k_lam,
        "use_multithreading": config.use_multithreading,
        "use_equal": config.use_equal,
        "use_idea_shift": config.use_idea_shift
    }

    # printing parameters into console screen
    func.f_print("\nVariables:\n", all_params)

    # write parameters to text file
    with open('../data/parameters.txt', 'w') as f:
        f.write(str(all_params))

    # initialize model object
    model = m.ScientistModel(config.seed)

    func.stop_run("time to create model object... now entering main function")
    func.gc_collect()

    for i in range(config.time_periods + 2):
        model.step()
        func.stop_run("step: " + str(i))

    func.f_print("\nTOTAL TIME TO FINISH RUNNING SIMULATION:",
                 timeit.default_timer() - start_prog, "seconds")

    if with_collect:
        s.call('python3 collect.py', shell=True)
        path = s.Popen('git rev-parse --show-toplevel',
                       shell=True,
                       stdout=s.PIPE).communicate()[0].decode("utf-8")[:-1]
        # s.call('open ../data/pages/page_agent_vars.html', shell=True)
        # s.call("/usr/bin/open -a '/Applications/Google Chrome.app' 'file://"+path+"/data/images/scatterplot_resid.png'", shell=True)  # open image with Chrome
        # s.call("/usr/bin/open -a '/Applications/Google Chrome.app' 'file://"+path+"/data/images/1-var_bar_graph_prop_idea_phase.png'", shell=True)  # open image with Chrome
        # collect.init()
    if not override:
        if with_dnn:
            s.call('python3 ../ai/neural_net.py', shell=True)
        if run_again:
            s.call('python3 run.py False False False', shell=True)
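A brief usage sketch, assuming this function lives in run.py (as the prompt messages suggest) and that the module-level imports it relies on (sys, os, time, timeit, subprocess as s, init) are in place; the argument values are illustrative, and the command-line forms are inferred from the sys.argv parsing above:

# typical direct call: run once, skip the confirmation prompt and the DNN step
main(run_again=False, with_dnn=False, with_prompt=False, override=False)

# command-line forms handled by the argv parsing above (illustrative values):
#   python3 run.py master True 2 False   # master mode: use_fund, switch, has_past
#   python3 run.py 10 20 40 8            # config 1: time_periods, ideas_per_time, N, time_periods_alive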
Example #5
def main():
    #define supported models
    allowed_models = [
        'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'
    ]

    #Set up argument parser for console input
    parser = argparse.ArgumentParser(description='Train NN')
    parser.add_argument('data_dir',
                        help='directory containing sub-folders with data')
    parser.add_argument('--save_dir',
                        help='directory for saving checkpoint',
                        default='checkpoints')
    parser.add_argument('--arch',
                        help='pre-trained model architecture',
                        default='resnet18',
                        choices=allowed_models)
    parser.add_argument('--learning_rate',
                        help='learning rate during learning',
                        type=float,
                        default=0.01)
    parser.add_argument('--dropout',
                        help='dropout during learning',
                        type=float,
                        default=0.05)
    parser.add_argument('--hidden_units',
                        help='List of number of nodes in hidden layers',
                        nargs='+',
                        type=int,
                        default=[256, 128])
    parser.add_argument('--epochs',
                        help='Number of epochs for training',
                        default=3,
                        type=int)
    parser.add_argument('--gpu', help='Enable GPU', action='store_true')

    args = parser.parse_args()

    # Describe directories relative to working directory
    data_dir = args.data_dir
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'
    save_dir = args.save_dir

    # Set variables for console input arguments
    model_arch = args.arch
    model_hidden_units = args.hidden_units
    learning_rate = args.learning_rate
    drop = args.dropout

    #Testing area
    print('Data directory: ' + data_dir)
    print('hidden units: ' + str(args.hidden_units))
    print('Save directory: ' + save_dir)
    print('Architecture: ' + args.arch)

    #create save directory if not existing
    fu.create_directory(save_dir)

    # Loading Pre-Trained model dependent on console input arch
    model = getattr(models, model_arch)(pretrained=True)

    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # Create the network, define the criterion and optimizer
    model.fc = fu.Network(model.fc.in_features, 102, model_hidden_units, drop)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)

    device = torch.device(
        'cuda' if args.gpu and torch.cuda.is_available() else 'cpu')
    print('Device is: ', device)

    epochs = args.epochs
    print_every = 50
    running_loss = 0
    steps = 0

    train_loader, test_loader, valid_loader, train_data, test_data, valid_data = load_transform.load_transform(
        data_dir, train_dir, valid_dir, test_dir)

    fu.train(device, model, epochs, criterion, optimizer, print_every,
             train_loader, test_loader, valid_loader)
    fu.save_checkpoint(model, model_arch, epochs, criterion, optimizer,
                       train_data, save_dir)

    return model, test_loader, criterion
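A hedged invocation sketch based on the argparse definitions above; train.py is a hypothetical script name and flowers a hypothetical data directory containing train/, valid/ and test/ sub-folders:

import sys

# simulate: python train.py flowers --arch resnet50 --hidden_units 512 256 \
#              --learning_rate 0.003 --epochs 5 --gpu
sys.argv = ['train.py', 'flowers', '--arch', 'resnet50',
            '--hidden_units', '512', '256',
            '--learning_rate', '0.003', '--epochs', '5', '--gpu']
model, test_loader, criterion = main()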
Example #6
def run_bigscape_hmmscan(input_dir,
                         output_folder,
                         pfam_dir,
                         bigscape_path,
                         biopython_path,
                         parallel=False):

    sys.path.append(bigscape_path)
    sys.path.append(biopython_path)

    import bigscape as bs
    import functions as f

    class bgc_data:
        def __init__(self, accession_id, description, product, records,
                     max_width, organism, taxonomy, biosynthetic_genes,
                     contig_edge):
            # These two properties come from the genbank file:
            self.accession_id = accession_id
            self.description = description
            # AntiSMASH predicted class of compound:
            self.product = product
            # number of records in the genbank file (think of multi-locus BGCs):
            self.records = records
            # length of largest record (it will be used for ArrowerSVG):
            self.max_width = int(max_width)
            # organism
            self.organism = organism
            # taxonomy as a string (of comma-separated values)
            self.taxonomy = taxonomy
            # Internal set of tags corresponding to genes that AntiSMASH marked
            # as "Kind: Biosynthetic". It is formed as
            # clusterName + "_ORF" + cds_number + ":gid:" + gene_id + ":pid:" + protein_id + ":loc:" + gene_start + ":" + gene_end + ":strand:" + {+,-}
            self.biosynthetic_genes = biosynthetic_genes
            # AntiSMASH 4+ marks BGCs that sit on the edge of a contig
            self.contig_edge = contig_edge

    f.create_directory(output_folder, "Output", False)
    bgc_fasta_folder = os.path.join(output_folder, "fasta")
    f.create_directory(bgc_fasta_folder, "BGC fastas", False)

    bs.bgc_data = bgc_data
    bs.mode = 'global'

    bgc_info = {}  # Stores, per BGC: predicted type, gbk Description,
    # number of records, width of longest record,
    # GenBank's accession, Biosynthetic Genes' ids

    min_bgc_size = 0  # Provide the minimum size of a BGC to be included in the analysis. Default is 0 base pairs
    exclude_gbk_str = ''  # If this string occurs in the gbk filename, this file will not be used for the analysis

    # genbankDict: {cluster_name:[genbank_path_to_1st_instance,[sample_1,sample_2,...]]}
    genbankDict = bs.get_gbk_files(input_dir, output_folder, bgc_fasta_folder,
                                   min_bgc_size, exclude_gbk_str, bgc_info)

    # clusters and sampleDict contain the necessary structure for all-vs-all and sample analysis
    clusters = genbankDict.keys()
    clusterNames = tuple(sorted(clusters))

    sampleDict = {}  # {sampleName:set(bgc1,bgc2,...)}
    gbk_files = []  # raw list of gbk file locations
    for (cluster, (path, clusterSample)) in genbankDict.items():
        gbk_files.append(path)
        for sample in clusterSample:
            clustersInSample = sampleDict.get(sample, set())
            clustersInSample.add(cluster)
            sampleDict[sample] = clustersInSample

    baseNames = set(clusters)

    allFastaFiles = set(glob(os.path.join(bgc_fasta_folder, "*.fasta")))
    fastaFiles = set()
    for name in baseNames:
        fastaFiles.add(os.path.join(bgc_fasta_folder, name + ".fasta"))
    fastaBases = allFastaFiles.intersection(fastaFiles)
    task_set = fastaFiles
    verbose = False

    domtable_folder = os.path.join(output_folder, "domtable")
    f.create_directory(domtable_folder, "Domtable", False)

    if parallel:
        cores = cpu_count()
        pool = Pool(cores, maxtasksperchild=1)
        for fasta_file in task_set:
            pool.apply_async(bs.runHmmScan,
                             args=(fasta_file, pfam_dir, domtable_folder,
                                   verbose))
        pool.close()
        pool.join()
    else:
        for i, fasta_file in enumerate(task_set, start=1):
            print('Processing %d/%d' % (i, len(task_set)))
            bs.runHmmScan(fasta_file, pfam_dir, domtable_folder, verbose)

    print("Processing domtable files")

    pfs_folder = os.path.join(output_folder, "pfs")
    pfd_folder = os.path.join(output_folder, "pfd")
    f.create_directory(pfs_folder, "pfs", False)
    f.create_directory(pfd_folder, "pfd", False)

    allDomtableFiles = set(glob(os.path.join(domtable_folder, "*.domtable")))
    domtableFiles = set()
    for name in baseNames:
        domtableFiles.add(os.path.join(domtable_folder, name + ".domtable"))
    domtableBases = allDomtableFiles.intersection(domtableFiles)
    alreadyDone = set()

    bs.gbk_files = gbk_files
    bs.genbankDict = genbankDict
    bs.clusters = clusters
    bs.baseNames = baseNames
    bs.sampleDict = sampleDict

    # Specify at which overlap percentage domains are considered to overlap.
    # Domain with the best score is kept (default=0.1).
    domain_overlap_cutoff = 0.1
    for domtableFile in domtableFiles - alreadyDone:
        try:
            bs.parseHmmScan(domtableFile, pfd_folder, pfs_folder,
                            domain_overlap_cutoff)
        except IndexError:
            continue
        except ValueError:
            continue

    return baseNames
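A minimal call sketch, assuming local BiG-SCAPE and Biopython checkouts and the usual module-level imports (os, sys, glob, multiprocessing) in the surrounding module; every path below is a hypothetical placeholder:

base_names = run_bigscape_hmmscan(
    input_dir='antismash_output/',      # folder with antiSMASH .gbk cluster files
    output_folder='bigscape_results/',  # fasta/, domtable/, pfs/ and pfd/ are created here
    pfam_dir='pfam/',                   # directory holding the Pfam HMM files
    bigscape_path='BiG-SCAPE/',         # local checkout providing bigscape.py and functions.py
    biopython_path='biopython/',        # Biopython location added to sys.path
    parallel=True)
print('%d BGCs processed' % len(base_names))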