Example #1
0
 def output_pickle_file(self, results_output_dir, sample_label):
     """
     Write self.events to '<results_output_dir>/pickle/<sample_label>.pickle'
     via pickle_utils.write_pickled_file and return that filename.

     The 'pickle' subdirectory (and any missing parent directories) is
     created if it does not exist yet.
     """
     print("Serializing a total of %d events by Pickle." %(len(self.events)))
     pickle_output_dir = os.path.join(results_output_dir, 'pickle')
     # makedirs also creates a missing results_output_dir. An OSError is
     # only ignored when the directory is there afterwards, e.g. because
     # it already existed or another process created it first.
     try:
         os.makedirs(pickle_output_dir)
     except OSError:
         if not os.path.isdir(pickle_output_dir):
             raise
     pickle_events_filename = os.path.join(pickle_output_dir, sample_label + '.pickle')
     pickle_utils.write_pickled_file(self.events, pickle_events_filename)
     return pickle_events_filename
Example #2
0
    def output_pickle_file(self, results_output_dir, sample_label):
        """
        Write self.events to '<results_output_dir>/pickle/<sample_label>.pickle'
        via pickle_utils.write_pickled_file and return that filename.

        The 'pickle' subdirectory (and any missing parent directories) is
        created if it does not exist yet.
        """
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("Serializing a total of %d events by Pickle." %(len(self.events)))
        pickle_output_dir = os.path.join(results_output_dir, 'pickle')
        # makedirs also creates a missing results_output_dir. An OSError is
        # only ignored when the directory is there afterwards, e.g. because
        # it already existed or another process created it first.
        try:
            os.makedirs(pickle_output_dir)
        except OSError:
            if not os.path.isdir(pickle_output_dir):
                raise
        pickle_events_filename = os.path.join(pickle_output_dir, sample_label + '.pickle')
        pickle_utils.write_pickled_file(self.events, pickle_events_filename)
        return pickle_events_filename
Example #3
0
def serialize_genes(gff_genes,
                    gff_filename,
                    output_dir,
                    compress_id=False):
    """
    Output genes into pickle files by chromosome, by gene.

    If asked, use compressed IDs (hashes) of the 'ID=' field in the GFF.

    Parameters:
      - gff_genes: mapping from gene ID to a dict with the keys
        'gene_object' (must expose a .chrom attribute) and 'hierarchy'.
      - gff_filename: GFF file the genes came from; its 'gene' records
        are copied to '<output_dir>/genes.gff'.
      - output_dir: directory receiving all output. Per-chromosome
        subdirectories are created as needed; output_dir itself is
        not created here.
      - compress_id: if True, name each gene's pickle file after
        compress_event_name(gene_id) instead of the gene ID, and store
        that value under 'compressed_id' in the pickled record.

    Files written to output_dir:
      - <chrom dir>/<gene ID or compressed ID>.pickle, one per gene
      - genes_to_filenames.shelve (gene ID -> pickle filename)
      - compressed_ids_to_genes.shelve (compressed ID -> gene ID;
        left empty when compress_id is False)
      - genes.gff (the 'gene' lines of gff_filename)
    """
    # Imported locally: only needed for the progress indicator below.
    import sys

    genes_by_chrom = defaultdict(dict)

    # Split up genes by chromosome
    for gene_id, gene_info in gff_genes.items():
        gene_obj = gene_info["gene_object"]
        gene_record = {'gene_object': gene_obj,
                       'hierarchy': gene_info["hierarchy"]}
        if compress_id:
            # Store compressed ID
            gene_record['compressed_id'] = compress_event_name(gene_id)
        genes_by_chrom[gene_obj.chrom][gene_id] = gene_record

    # Mapping from gene IDs to pickled filename
    gene_id_to_filename = {}
    # Mapping from compressed IDs (hashes) to gene IDs
    compressed_id_to_gene_id = {}

    # Serialize all the genes in each chromosome into their
    # own directory
    for chrom, chrom_genes in genes_by_chrom.items():
        if chrom.startswith("chr"):
            chrom_dir_name = chrom
        else:
            # Add chr-prefix for ease of finding directory
            # in downstream steps.
            chrom_dir_name = "chr%s" %(str(chrom))

        # Make directory for chromosome if it doesn't already exist
        chrom_dir = os.path.join(output_dir, chrom_dir_name)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        # Serialize each gene into a separate file
        for gene_id, gene_record in chrom_genes.items():
            gene_compressed_id = None
            if compress_id:
                gene_compressed_id = gene_record['compressed_id']
                pickle_basename = "%s.pickle" %(gene_compressed_id)
            else:
                pickle_basename = "%s.pickle" %(gene_id)
            gene_filename = \
                os.path.abspath(os.path.join(chrom_dir, pickle_basename))
            # Write each gene/event's pickle file
            pickle_utils.write_pickled_file({gene_id: gene_record},
                                            gene_filename)
            # Record what filename was associated with this gene ID
            gene_id_to_filename[gene_id] = gene_filename
            # Record compressed ID (hash) to gene ID
            if gene_compressed_id is not None:
                compressed_id_to_gene_id[gene_compressed_id] = gene_id

        # Progress indicator: one dot per chromosome, on a single line
        sys.stdout.write(". ")

    # Shelve the mapping from gene ids to filenames
    _shelve_mapping(os.path.join(output_dir,
                                 "genes_to_filenames.shelve"),
                    gene_id_to_filename)

    # Shelve the mapping from compressed gene ids to gene ids
    _shelve_mapping(os.path.join(output_dir,
                                 "compressed_ids_to_genes.shelve"),
                    compressed_id_to_gene_id)

    # Output a list of genes in ordinary GFF format
    genes_filename = os.path.join(output_dir, "genes.gff")
    with open(gff_filename) as gff_in:
        with open(genes_filename, "w") as gff_out:
            for line in gff_in:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # Blank or truncated lines have no record-type column
                # (the third field); skip them instead of failing.
                if len(fields) < 3:
                    continue
                if fields[2] == "gene":
                    gff_out.write(line)


def _shelve_mapping(shelved_filename, mapping):
    """
    Store every key/value pair of mapping in the shelve file
    shelved_filename, closing the shelf even if a write fails.
    """
    shelved_data = shelve.open(shelved_filename)
    try:
        for key, value in mapping.items():
            shelved_data[key] = value
    finally:
        shelved_data.close()