Example #1
import os

import numpy

import taxonomy
# Note: DataHandler is a project-specific class; its import is not shown in this example.


def main():
    """
    generate dataset
    """

    data_path = "../../../data/"

    # set up container for data
    tax = taxonomy.create_tax_four()
    #tax = taxonomy.create_tax_many()
    dh = DataHandler(tax)


    for org in tax.get_data_keys():

        org_id = dh.task_to_id[org] 
        print "processing organism %s (id=%i)" % (org, org_id)
        
        org_path = data_path + org + "/"
        save_path = org_path
        
        ptt_files = [ptt for ptt in os.listdir(org_path) if ptt.endswith(".ptt")]
        print("Gff files: %s",ptt_files)
        ptt_file_sizes = [os.stat(org_path + ptt).st_size  for ptt in ptt_files] 
    
        print ptt_file_sizes
    
        # pick largest one (in case we have several contigs)
        largest_idx = 0
        if len(ptt_file_sizes) > 1:
            print("There are multiple .ptt files available for this organism.")
            print("I only use the biggest one.")
            largest_idx = numpy.argmax(ptt_file_sizes)
            print("Index of the largest file: %i" % largest_idx)
        
        # determine file names
        selected_ptt = org_path + ptt_files[largest_idx]
        selected_fna = selected_ptt.replace(".ptt", ".fna")
       
        # invoke generation procedure
        dh.add_organism(org_id, selected_fna, selected_ptt)
    
        # debugging breakpoint from the original script; disabled so the loop runs unattended
        # import ipdb
        # ipdb.set_trace()
  
    # store final result
    save_path = "/tmp/"
    dh.save_to_file(save_path)
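
A minimal, self-contained sketch of the "pick the largest .ptt file" step from the loop above; the temporary directory and file names below are made up purely for illustration.

import os
import tempfile

import numpy

# create a throwaway directory holding dummy .ptt files of different sizes
with tempfile.TemporaryDirectory() as tmp:
    for name, size in [("contig_a.ptt", 10), ("contig_b.ptt", 2000), ("contig_c.ptt", 300)]:
        with open(os.path.join(tmp, name), "wb") as fh:
            fh.write(b"x" * size)

    # same selection logic as in main(): list .ptt files, stat their sizes,
    # and keep the index of the biggest one
    ptt_files = [f for f in os.listdir(tmp) if f.endswith(".ptt")]
    sizes = [os.stat(os.path.join(tmp, f)).st_size for f in ptt_files]
    largest = ptt_files[numpy.argmax(sizes)]
    print("largest .ptt file: %s" % largest)  # expected: contig_b.ptt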
Example #2
def main():
    
    tax = taxonomy.create_tax_four()
    
    train(tax)
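
Neither example shows how main() is invoked; in a standalone script it would typically be run via the standard entry-point guard (assumed here, not part of the source):

if __name__ == "__main__":
    main()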