def main():
    """Generate the dataset.

    For each organism in the taxonomy, pick the largest ``.ptt`` annotation
    file (and its matching ``.fna`` sequence file), feed the pair to the
    DataHandler, and save the assembled dataset to disk.
    """
    data_path = "../../../data/"

    # set up container for data
    tax = taxonomy.create_tax_four()
    #tax = taxonomy.create_tax_many()
    dh = DataHandler(tax)

    for org in tax.get_data_keys():
        org_id = dh.task_to_id[org]
        print("processing organism %s (id=%i)" % (org, org_id))

        org_path = data_path + org + "/"

        # collect annotation files for this organism
        ptt_files = [ptt for ptt in os.listdir(org_path) if ptt.endswith(".ptt")]
        # bug fix: was print("Gff files: %s", ptt_files), which printed a tuple
        # instead of the formatted message (and mislabeled .ptt files as Gff)
        print("Ptt files: %s" % ptt_files)

        ptt_file_sizes = [os.stat(org_path + ptt).st_size for ptt in ptt_files]
        print(ptt_file_sizes)

        # pick largest one (in case we have several contigs)
        largest_idx = 0
        if len(ptt_file_sizes) > 1:
            print('There are multiple .ptt files available for this organism.')
            print('I only use the biggest one.')
            largest_idx = numpy.argmax(ptt_file_sizes)
            print('Largest file is:', largest_idx)

        # determine file names (.fna is assumed to sit next to the .ptt)
        selected_ptt = org_path + ptt_files[largest_idx]
        selected_fna = selected_ptt.replace(".ptt", ".fna")

        # invoke generation procedure
        dh.add_organism(org_id, selected_fna, selected_ptt)

    # NOTE(review): removed a leftover ipdb.set_trace() debugging breakpoint here.

    # store final result
    # NOTE(review): output dir is hard-coded — consider making it a parameter
    save_path = "/tmp/"
    dh.save_to_file(save_path)
def main():
    """Build the four-taxon taxonomy and hand it to the training routine.

    NOTE(review): this is a second ``def main()`` — it shadows the
    dataset-generation ``main`` defined earlier in the file, so only this
    definition is reachable at runtime. Confirm which entry point is intended.
    """
    train(taxonomy.create_tax_four())