Exemple #1
0
 def t_key_whitespace(self):
     """Keys with problematic whitespace still parse to usable qualifiers.

     Every line of ``spaces.gff3`` after the first three must expose the
     qualifier ``foo`` with the single value ``bar``.
     """
     spaces_gff = os.path.join(self._test_dir, "spaces.gff3")
     parsed_lines = GFF.parse_simple(spaces_gff)
     for index, info in enumerate(parsed_lines):
         # The first three parsed lines (indices 0-2) are not checked.
         if index <= 2:
             continue
         assert info["quals"]["foo"] == ["bar"]
 def t_simple_parsing_nesting(self):
     """Count the lines the simplified API yields for a nested GFF file.

     ``transcripts.gff3`` is expected to produce exactly 16 parsed lines.
     """
     transcripts_gff = os.path.join(self._test_dir, "transcripts.gff3")
     # Exhaust the parser and tally one per yielded line.
     line_count = sum(1 for _ in GFF.parse_simple(transcripts_gff))
     assert line_count == 16, line_count
    def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        The work happens in three passes: load every fasta into one sequence
        dictionary, convert every GTF record into the in-memory
        ``self._transcript_index``, then attach a protein sequence to each
        transcript and copy it into the shove store.

        :param ensembl_input_gtfs: (list) paths of the GTF files to index
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: joined with ``protocol`` to form the shove store URI
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: handed to ``_create_tx_id_to_protein_id_mapping``
        """
        logger = logging.getLogger(__name__)
        index_uri = protocol + "://" + output_filename

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(index_uri, "memory://")
        logger.info("Transcript index being created: %s", index_uri)

        # Get the transcript ID to protein ID mapping
        tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

        seq_dict = {}
        for in_seq_file in ensembl_input_fastas:
            # Context manager closes the fasta even if parsing raises.
            with open(in_seq_file) as in_seq_handle:
                seq_dict.update(self._create_seq_dict(in_seq_handle))
            logger.info("Parsed fasta file: %s", in_seq_file)

        gtf_total = len(ensembl_input_gtfs)
        for file_ctr, in_file in enumerate(ensembl_input_gtfs):
            seq_dict_keys = seq_dict.keys()
            ctr = 0
            # parse_simple is given the path itself; the extra open() handle
            # that used to be created here was never read, so it is gone.
            for rec in GFF.parse_simple(in_file):

                # transcript id seems to always be a list of length 1
                if len(rec['quals']['transcript_id']) > 1:
                    logger.warning("ensembl records had more than one transcript id: %s", rec['quals']['transcript_id'])

                self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
                ctr += 1
                if (ctr % 10000) == 0:
                    logger.info("Added %s lines of gtf %s of %s (%s) into internal transcript index.", ctr, file_ctr + 1, gtf_total, in_file)
            logger.info("Finished %s lines of gtf (%s)", ctr, in_file)

        logger.info("Populating final db with internal transcript index.")
        transcript_index_keys = self._transcript_index.keys()
        transcript_total = len(transcript_index_keys)
        for i, k in enumerate(transcript_index_keys):
            transcript = self._transcript_index[k]

            # Populate the protein sequence
            transcript.set_protein_seq(self._determine_protein_seq(transcript))

            shove[k] = transcript
            if i % 10000 == 0:
                # transcript_total is non-zero whenever this loop body runs.
                logger.info("Saved %0.1f%% of transcript index to disk with protein sequence.", float(i * 100) / float(transcript_total))

        logger.info("Transcript index created %s transcripts: %s", len(shove.keys()), index_uri)
        shove.close()
Exemple #4
0
def parse_tx_in_region(tx_file, region):
    """Yield transcripts from a GFF file that have an endpoint inside a region.

    Parsing is limited to ``transcript`` features whose sequence id is
    ``region["space"]``.  A record is reported when its start or its end lies
    in the half-open interval ``[region["start"], region["end"])``.

    :param tx_file: GFF input handed to ``GFF.parse_simple``
    :param region: mapping with ``space``, ``start`` and ``end`` keys
    :return: generator of dicts with ``chr``, ``start``, ``end``, ``gene_id``,
        ``transcript_id`` and ``FPKM`` keys (first value of each qualifier)
    """
    want = ("gene_id", "transcript_id", "FPKM")
    region_start, region_end = region["start"], region["end"]
    limit_info = {"gff_id": [region["space"]], "gff_type": ["transcript"]}

    def in_region(pos):
        # Same answer as membership in set(range(start, end)) for integer
        # coordinates, without building one set entry per base of the region.
        return region_start <= pos < region_end

    for rec in GFF.parse_simple(tx_file, limit_info=limit_info):
        s, e = rec["location"]
        # NOTE(review): a transcript that starts before and ends after the
        # region has neither endpoint inside it and is skipped; confirm
        # that this is intended.
        if in_region(s) or in_region(e):
            out = {"chr": rec["rec_id"], "start": s, "end": e}
            for n in want:
                out[n] = rec["quals"][n][0]
            yield out
def parse_tx_in_region(tx_file, region):
    """Yield transcripts from a GFF file that have an endpoint inside a region.

    Parsing is limited to ``transcript`` features whose sequence id is
    ``region["space"]``.  A record is reported when its start or its end lies
    in the half-open interval ``[region["start"], region["end"])``.

    :param tx_file: GFF input handed to ``GFF.parse_simple``
    :param region: mapping with ``space``, ``start`` and ``end`` keys
    :return: generator of dicts with ``chr``, ``start``, ``end``, ``gene_id``,
        ``transcript_id`` and ``FPKM`` keys (first value of each qualifier)
    """
    want = ("gene_id", "transcript_id", "FPKM")
    region_start, region_end = region["start"], region["end"]
    limit_info = {"gff_id": [region["space"]],
                  "gff_type": ["transcript"]}

    def in_region(pos):
        # Same answer as membership in set(range(start, end)) for integer
        # coordinates, without building one set entry per base of the region.
        return region_start <= pos < region_end

    for rec in GFF.parse_simple(tx_file, limit_info=limit_info):
        s, e = rec["location"]
        # NOTE(review): a transcript that starts before and ends after the
        # region has neither endpoint inside it and is skipped; confirm
        # that this is intended.
        if in_region(s) or in_region(e):
            out = {"chr": rec["rec_id"], "start": s, "end": e}
            for n in want:
                out[n] = rec["quals"][n][0]
            yield out
    def build_ensembl_transcript_index(self,
                                       ensembl_input_gtfs,
                                       ensembl_input_fastas,
                                       output_filename,
                                       protocol="file",
                                       protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        The work happens in three passes: load every fasta into one sequence
        dictionary, convert every GTF record into the in-memory
        ``self._transcript_index``, then attach a protein sequence to each
        transcript and copy it into the shove store.

        :param ensembl_input_gtfs: (list) paths of the GTF files to index
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: joined with ``protocol`` to form the shove store URI
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: handed to ``_create_tx_id_to_protein_id_mapping``
        """
        logger = logging.getLogger(__name__)
        index_uri = protocol + "://" + output_filename

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(index_uri, "memory://")
        logger.info("Transcript index being created: %s", index_uri)

        # Get the transcript ID to protein ID mapping
        tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(
            protein_id_mapping_file)

        seq_dict = {}
        for in_seq_file in ensembl_input_fastas:
            # Context manager closes the fasta even if parsing raises.
            with open(in_seq_file) as in_seq_handle:
                seq_dict.update(self._create_seq_dict(in_seq_handle))
            logger.info("Parsed fasta file: %s", in_seq_file)

        gtf_total = len(ensembl_input_gtfs)
        for file_ctr, in_file in enumerate(ensembl_input_gtfs):
            seq_dict_keys = seq_dict.keys()
            ctr = 0
            # parse_simple is given the path itself; the extra open() handle
            # that used to be created here was never read, so it is gone.
            for rec in GFF.parse_simple(in_file):

                # transcript id seems to always be a list of length 1
                if len(rec['quals']['transcript_id']) > 1:
                    logger.warning(
                        "ensembl records had more than one transcript id: %s",
                        rec['quals']['transcript_id'])

                self._convertGFFRecordToTranscript(rec, seq_dict,
                                                   seq_dict_keys,
                                                   tx_to_protein_mapping)
                ctr += 1
                if (ctr % 10000) == 0:
                    logger.info(
                        "Added %s lines of gtf %s of %s (%s) into internal transcript index.",
                        ctr, file_ctr + 1, gtf_total, in_file)
            logger.info("Finished %s lines of gtf (%s)", ctr, in_file)

        logger.info("Populating final db with internal transcript index.")
        transcript_index_keys = self._transcript_index.keys()
        transcript_total = len(transcript_index_keys)
        for i, k in enumerate(transcript_index_keys):
            transcript = self._transcript_index[k]

            # Populate the protein sequence
            transcript.set_protein_seq(self._determine_protein_seq(transcript))

            shove[k] = transcript
            if i % 10000 == 0:
                # transcript_total is non-zero whenever this loop body runs.
                logger.info(
                    "Saved %0.1f%% of transcript index to disk with protein sequence.",
                    float(i * 100) / float(transcript_total))

        logger.info("Transcript index created %s transcripts: %s",
                    len(shove.keys()), index_uri)
        shove.close()