Example 1
import os

from BCBio import GFF

# GFFIndexedAccess and index() are assumed to be defined in the surrounding
# indexing script (not shown in this snippet).


def main(gff_file):
    gff_index = gff_file + ".index"
    if not os.path.exists(gff_index):
        print("Indexing GFF file")
        index(gff_file)
    # Bind the accessor to a new name so the index() call above still refers
    # to the module-level function rather than an unbound local.
    gff_access = GFFIndexedAccess(gff_file, keep_open=True)
    print(gff_access.seqids)
    print()
    for feature in gff_access.get_features_in_region("Chr2", 17500, 20000):
        print(feature)
    for feature in gff_access.get_features_in_region("Chr5", 500000, 502500):
        print(feature)

    exam = GFF.GFFExaminer()
    # print(exam.available_limits(gff_file))
    # print(exam.parent_child_map(gff_file))

    found = 0
    limit_info = dict(gff_type=[
        "protein", "gene", "mRNA", "exon", "CDS", "five_prime_UTR",
        "three_prime_UTR"
    ])
    for feature in gff_access.get_features_in_region("Chr1", 0, 50000, limit_info):
        found += 1
    print(found)
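
The commented-out examiner calls above rely on GFFExaminer.available_limits, which scans the file and reports, per filter key ('gff_id', 'gff_source', 'gff_type', 'gff_source_type'), how many lines match each tuple value. A minimal sketch of that inspection step, assuming "annotation.gff3" is a placeholder path:

from BCBio import GFF

examiner = GFF.GFFExaminer()
with open("annotation.gff3") as handle:  # placeholder path
    limits = examiner.available_limits(handle)

# Each entry maps a tuple to a line count, e.g. limits['gff_type'][('gene',)]
# is the number of gene lines in the file.
for gff_type, count in limits["gff_type"].items():
    print(gff_type, count)
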
Example 2

import numpy as np
from BCBio import GFF

# load_gff is assumed to be a project-specific helper that returns an open,
# file-like handle on the GFF file (it is closed again below).


def annotate_gff_from_bw(bw, gff_path, gff_source_type=[('ensembl_havana', 'gene')]):
    """
    Segment bigwig file according to gff annotation file
    :param bw: Bigwig file
    :type bw: bigWigFile
    :param gff_path: path to gff file
    :type gff_path: str
    :param gff_source_type: parameters for which the gff file is filtered
    :type gff_source_type: list(tuple(str))
    :return: Segmented data array
    """
    gff = load_gff(gff_path, rel_path='', is_abs_path=True)
    examiner = GFF.GFFExaminer()
    chrom_list = list(examiner.available_limits(gff)['gff_id'].keys())
    gff.close()

    gen_mapping = []
    gff = load_gff(gff_path, rel_path='', is_abs_path=True)
    for chrom in chrom_list:
        limit_info = dict(gff_id=chrom, gff_source_type=gff_source_type)
        for rec in GFF.parse(gff, limit_info=limit_info):
            for num, r in enumerate(rec.features):
                anno = bw.values('chr%s' % chrom[0], int(r.location.start), int(r.location.end))
                if int(r.location.strand) == -1:
                    anno = np.flip(anno)
                anno = np.nan_to_num(anno, copy=False, nan=0.)
                gen_mapping.append(anno)

    gff.close()
    return gen_mapping
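
A minimal usage sketch for the function above, assuming the bigWig is opened with pyBigWig and that "signal.bw" and "annotation.gff3" are placeholder paths (load_gff and numpy must be importable as in the example):

import pyBigWig

bw = pyBigWig.open("signal.bw")  # placeholder path
# Keep only Ensembl/Havana gene features, as in the default gff_source_type.
tracks = annotate_gff_from_bw(bw, "annotation.gff3",
                              gff_source_type=[("ensembl_havana", "gene")])
bw.close()
print("segmented %d gene features" % len(tracks))
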
Example 3
def check_gff_suitability(gff_file: str, sequences: List[SeqRecord]) -> None:
    """
        Checks that the provided GFF3 file is acceptable

        If only a single record is contained in both sequences and GFF, they
        are assumed to be the same.

        Arguments:
            gff_file: the path of the GFF file to check
            sequences: a list of SeqRecords

        Returns:
            None
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(gff_file))
        # Check if at least one GFF locus appears in sequence
        gff_ids = set([n[0] for n in gff_data['gff_id']])

        if len(gff_ids) == 1 and len(sequences) == 1:
            # If both inputs only have one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming is "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(gff_file), limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise AntismashInputError("could not parse records from GFF3 file")

            if not record.features:
                raise AntismashInputError('GFF3 record %s contains no features' % record.id)

            coord_max = max([n.location.end.real for n in record.features])
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise AntismashInputError('incompatible GFF record and sequence coordinates')

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise AntismashInputError("GFF3 record IDs don't match sequence file record IDs.")

        # Check GFF contains CDSs
        if not ('CDS',) in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise AntismashInputError("no CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set([n for key in examiner.parent_child_map(open(gff_file)) for n in key]):
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise AntismashInputError('GFF3 structure is not suitable.')

    except AssertionError as err:
        logging.error('Parsing %r failed: %s', gff_file, err)
        raise AntismashInputError(str(err)) from err
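
The "childless but not parentless" test above leans on GFFExaminer.parent_child_map, whose keys are (source, type) tuples for every feature type that has children; flattening the keys therefore reveals which types act as parents, and a CDS should never be one of them. A minimal sketch of just that check, assuming "annotation.gff3" is a placeholder path:

from BCBio import GFF

examiner = GFF.GFFExaminer()
with open("annotation.gff3") as handle:  # placeholder path
    # parent_child_map keys are (source, type) tuples of features with children.
    parent_parts = {part for key in examiner.parent_child_map(handle) for part in key}

if "CDS" in parent_parts:
    raise ValueError("CDS features must be childless but not parentless")
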
Example 4
def check_gff_suitability(options, sequences):
    if not options.gff3:
        return

    try:
        examiner = GFF.GFFExaminer()
        gff_data = examiner.available_limits(open(options.gff3))

        # Check if at least one GFF locus appears in sequence
        gff_ids = set([n[0] for n in gff_data['gff_id']])

        if len(gff_ids) == 1 and len(options.all_record_ids) == 1:
            # If both inputs only have one record, assume they are the same, but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming is "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(options.gff3), limit_info=limit_info)
            record = next(record_iter)

            coord_max = max([n.location.end.real for n in record.features])
            if coord_max > len(sequences[0]):
                logging.error(
                    'GFF3 record and sequence coordinates are not compatible.')
                raise ValueError(
                    'Incompatible GFF record and sequence coordinates')
            else:
                options.single_entries = True

        elif len(gff_ids.intersection(options.all_record_ids)) == 0:
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise ValueError(
                "GFF3 record IDs don't match sequence file record IDs.")

        else:
            options.single_entries = False

        # Check GFF contains CDSs
        if not ('CDS', ) in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise ValueError("No CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set([
                n for key in examiner.parent_child_map(open(options.gff3))
                for n in key
        ]):
            logging.error(
                'GFF3 structure is not suitable. CDS features must be childless but not parentless.'
            )
            raise ValueError('GFF3 structure is not suitable.')

    except AssertionError as e:
        logging.error('Parsing %r failed: %s', options.gff3, e)
        raise
Example 5
    def read_gff(self):
        inf = open(self.gff_path, "r")
        e = GFF.GFFExaminer()
        # tmp = e.available_limits(inf)
        # pprint.pprint(tmp)

        for r in GFF.parse(inf):
            for record in r.features:
                if len(record.sub_features) >= 1:
                    self.gene_locations[record.id] = (
                        record.location.nofuzzy_start,
                        record.location.nofuzzy_end)
        inf.close()
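
When only gene coordinates are needed, the same limit_info mechanism used in the other examples can restrict parsing to gene lines rather than walking every feature. A hedged, standalone variant of the loop above under that assumption ("annotation.gff3" is a placeholder path):

from BCBio import GFF

gene_locations = {}
with open("annotation.gff3") as handle:  # placeholder path
    # Only parse lines whose GFF type is "gene".
    for rec in GFF.parse(handle, limit_info=dict(gff_type=["gene"])):
        for feature in rec.features:
            gene_locations[feature.id] = (feature.location.nofuzzy_start,
                                          feature.location.nofuzzy_end)
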
Example 6
    def do_import(self, dirn='.'):
        in_file = self.__gff_fasta_fn
        in_handle = open(in_file)

        # In DEBUG=True mode, Django keeps list of queries and blows up memory
        # usage when doing a big import. The following line disables this
        # logging.
        connection.use_debug_cursor = False

        # First, retrieve rec names
        rec_ids = [i[0] for i in GFF.GFFExaminer().available_limits(in_handle).get('gff_id')]
        in_handle.close()

        # Then parse GFF by rec
        t0 = time.time()
        for rec_id in rec_ids:
            in_handle = open(in_file)
            limit_info = dict(gff_id=[rec_id])
            recs = [rec for rec in GFF.parse(in_handle, limit_info=limit_info)
                    if rec.id == rec_id]

            for rec in recs:
                if self.__genome.fragments.filter(name=rec.id).count() > 0:
                    print("skipping %s, already imported" % rec.id)
                    break
                else:
                    try:
                        GFFFragmentImporter(rec, dirn=dirn).parse_gff()
                    except Exception as e:
                        print(str(e))
                        raise Exception(f"{rec} failed import validation")
            in_handle.close()
        print("%s seconds to parse and validate all contigs from GFF" % (time.time() - t0))

        # Then, build and annotate fragments
        for rec_id in rec_ids:
            in_handle = open(in_file)
            limit_info = dict(gff_id=[rec_id])
            recs = [rec for rec in GFF.parse(in_handle, limit_info=limit_info)
                    if rec.id == rec_id]

            for rec in recs:
                importer = GFFFragmentImporter(rec, dirn=dirn)
                fragment = importer.do_import()
                if fragment is None:
                    raise Exception(f"{rec} failed fragment generation")
                self.__genome.genome_fragment_set.create(fragment=fragment, inherited=False)
            in_handle.close()

        # Be nice and turn debug cursor back on
        connection.use_debug_cursor = True
Example 7
        get_feature_start_end(feature) for feature in features
        if feature.type == 'CDS'
    ]
    CDS_starts, CDS_ends = zip(*CDS_positions)
    return min(CDS_starts), max(CDS_ends)


### Main function

try:
    infile, outfile = sys.argv[1:]
except ValueError:
    print(__doc__)
    sys.exit("Error: Needs exactly one input and one output file!")

examiner = GFF.GFFExaminer()

# parsing the whole GFF file at once takes a ton of memory, so split it into sets
with open(infile) as INFILE:
    GFF_limit_data = examiner.available_limits(INFILE)
    chromosomes_and_counts = dict([
        (c, n) for ((c, ), n) in GFF_limit_data['gff_id'].items()
    ])

chromosome_sets = split_into_N_sets_by_counts(chromosomes_and_counts,
                                              N_chromosome_groups)

with open(outfile, 'w') as OUTFILE:
    for chromosome_set in chromosome_sets:
        genefile_parsing_limits = {'gff_id': list(chromosome_set)}
        with open(infile) as INFILE:
Example 8
    def load_gff(self, gff, analysis_id, organism_id, landmark_type=None, re_protein=None, re_protein_capture="^(.*?)$", fasta=None, no_seq_compute=False, quiet=False, add_only=False, protein_id_attr=None):
        """
        Load features from a gff file

        :type gff: str
        :param gff: Path to the GFF file to load

        :type analysis_id: int
        :param analysis_id: Analysis ID

        :type organism_id: int
        :param organism_id: Organism ID

        :type landmark_type: str
        :param landmark_type: Type of the landmarks (will speed up loading if provided, e.g. contig, should be a term of the Sequence ontology)

        :type re_protein: str
        :param re_protein: Replacement string for the protein name using capturing groups defined by --re_protein_capture

        :type re_protein_capture: str
        :param re_protein_capture: Regular expression to capture groups in mRNA name to use in --re_protein (e.g. "^(.*?)-R([A-Z]+)$", default="^(.*?)$")

        :type protein_id_attr: str
        :param protein_id_attr: Attribute containing the protein uniquename. It is searched at the mRNA level, and if not found at CDS level.

        :type fasta: str
        :param fasta: Path to a Fasta containing sequences for some features. When creating a feature, if its sequence is in this fasta file it will be loaded. Otherwise for mRNA and polypeptides it will be computed from the genome sequence (if available), otherwise it will be left empty.

        :type no_seq_compute: bool
        :param no_seq_compute: Disable the computation of mRNA and polypeptides sequences based on genome sequence and positions.

        :type quiet: bool
        :param quiet: Hide progress information

        :type add_only: bool
        :param add_only: Use this flag if you're not updating existing features, but just adding new features to the selected analysis and organism. It speeds up loading and reduces memory usage, but may produce errors if a feature already exists.

        :rtype: dict
        :return: Dict with the number of inserted features, e.g. {'inserted': n}
        """

        if len(self.ci.analysis.get_analyses(analysis_id=analysis_id)) != 1:
            raise Exception("Could not find analysis with id '{}'".format(analysis_id))

        if len(self.ci.organism.get_organisms(organism_id=organism_id)) != 1:
            raise Exception("Could not find organism with id '{}'".format(organism_id))

        if protein_id_attr and re_protein:
            raise Exception("--protein_id_attr and --re_protein cannot be used at the same time.")

        self.cache_existing = not add_only

        # Get possible landmarks
        landmarks = self.session.query(self.model.feature.name, self.model.feature.uniquename, self.model.feature.feature_id, self.model.feature.type_id, self.model.feature.organism_id) \
            .filter_by(organism_id=organism_id)
        if landmark_type:
            # Filter by landmark type if provided (else we look for all features)
            landmark_type_id = self.ci.get_cvterm_id(landmark_type, 'sequence')
            landmarks = landmarks.filter(self.model.feature.type_id == landmark_type_id)

        self._landmark_cache = {}
        for lm in landmarks:
            if lm.name not in self._landmark_cache:
                self._landmark_cache[lm.name] = []
            if lm.feature_id not in self._landmark_cache[lm.name]:
                self._landmark_cache[lm.name].append(lm.feature_id)  # There may be multiple landmarks with the same name

            # Also look for uniquename
            if lm.uniquename not in self._landmark_cache:
                self._landmark_cache[lm.uniquename] = []
            if lm.feature_id not in self._landmark_cache[lm.uniquename]:
                self._landmark_cache[lm.uniquename].append(lm.feature_id)

        examiner = GFF.GFFExaminer()
        gff_handle = open(gff)
        gff_limits = examiner.available_limits(gff_handle)
        gff_handle.close()

        # Check that we have all the cvterms in the db
        self._blacklisted_cvterms = []
        for feat_type in gff_limits['gff_type']:
            type_to_check = feat_type[0]
            # Be tolerant for proteins (shameless hard coding)
            if type_to_check == 'protein':
                type_to_check = 'polypeptide'

            # Will raise an exception if not present + keep value in cache
            try:
                self.ci.get_cvterm_id(type_to_check, 'sequence', True)
            except chado.RecordNotFoundError:
                if type_to_check not in self._blacklisted_cvterms:
                    warn("WARNING: will skip features of unknown type: %s", type_to_check)
                    self._blacklisted_cvterms.append(type_to_check)

        # Read optional fasta file
        self._fasta_sequence_cache = {}
        if fasta:
            for record in SeqIO.parse(fasta, "fasta"):
                self._fasta_sequence_cache[record.id] = str(record.seq)

        # Check that all landmarks are there
        for seq_id in gff_limits['gff_id']:
            seq_id = seq_id[0]
            if seq_id not in self._landmark_cache:
                if landmark_type:
                    # Landmark does not exist yet, but we know how to create it
                    lm = SeqFeature(FeatureLocation(0, 1), type=landmark_type, qualifiers={'ID': [seq_id], 'Name': [seq_id]})
                    if seq_id in self._fasta_sequence_cache:
                        added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id, have_loc=False, residues=self._fasta_sequence_cache[seq_id])
                    else:
                        added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id, have_loc=False)
                    self._landmark_cache[seq_id] = [added_feat['feature_id']]
                else:
                    raise Exception("Could not find landmark named '{}', add --landmark_type to create it".format(seq_id))
            elif len(self._landmark_cache[seq_id]) > 1:
                raise Exception("Found {} landmarks with same name '{}'".format(len(self._landmark_cache[seq_id]), seq_id))

        count_ins = 0

        for rec in GFF.parse(gff):

            # Preload the landmark sequence so dependent sequences can be computed from it.
            # We compare against "??????????" because the gff parser populates rec.seq with a
            # fake sequence whose size comes from the "##sequence-region" header.
            if not no_seq_compute:
                if rec.id in self._fasta_sequence_cache:
                    rec.seq = Seq.Seq(self._fasta_sequence_cache[rec.id])
                    del self._fasta_sequence_cache[rec.id]  # Save a little memory
                elif len(rec.seq) == 0 or str(rec.seq)[0:10] == "??????????":
                    seq_res = self.session.query(self.model.feature.residues) \
                        .filter(self.model.feature.uniquename == rec.id)

                    if landmark_type:
                        seq_res = seq_res.filter(self.model.feature.type_id == landmark_type_id)

                    seq_res = seq_res.all()

                    if len(seq_res) == 1 and seq_res[0].residues:
                        rec.seq = Seq.Seq(seq_res[0].residues)

            # Set a custom attr to store the chado feature_id
            rec._chado_feature_id = self._landmark_cache[rec.id][0]
            if not quiet:
                print("Loading features on {}".format(rec.id))

            for f in rec.features:

                self._load_gff_feature_with_children(rec, f, analysis_id, organism_id, re_protein_capture, re_protein, protein_id_attr, no_seq_compute=no_seq_compute)
                count_ins += 1

                if not quiet:
                    print("Inserted feature #{}".format(count_ins))

        self._update_rel_ranks()

        self.session.commit()

        self._reset_cache()

        return {'inserted': count_ins}
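
The re_protein_capture / re_protein pair documented above is a plain regular-expression capture and replacement applied to mRNA names. The loader's internals are not reproduced here, so the following standalone sketch only illustrates how a pair of that shape behaves with Python's re module (all names are hypothetical):

import re

re_protein_capture = r"^(.*?)-R([A-Z]+)$"  # capture the gene stem and isoform letters
re_protein = r"\1-P\2"                     # rebuild a protein name from the captured groups

mrna_name = "gene0001-RA"                  # hypothetical mRNA name
print(re.sub(re_protein_capture, re_protein, mrna_name))  # -> gene0001-PA
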
Example 9
def check_gff_suitability(options, sequences) -> bool:
    """
        Checks that the provided GFF3 file is acceptable

        If only a single record is contained in both sequences and GFF, they
        are assumed to be the same.

        Returns:
            True if only a single entry is contained by both inputs and
                    their sequence coordinates match
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(options.genefinding_gff3))
        # Check if at least one GFF locus appears in sequence
        gff_ids = set([n[0] for n in gff_data['gff_id']])

        single_entries = False

        if len(gff_ids) == 1 and len(sequences) == 1:
            # If both inputs only have one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming is "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(options.genefinding_gff3),
                                    limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise ValueError("Could not parse records from GFF3 file")

            if not record.features:
                raise ValueError('GFF3 record %s contains no features' %
                                 record.id)

            coord_max = max([n.location.end.real for n in record.features])
            if coord_max > len(sequences[0]):
                logging.error(
                    'GFF3 record and sequence coordinates are not compatible.')
                raise ValueError(
                    'Incompatible GFF record and sequence coordinates')

            single_entries = True

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise ValueError(
                "GFF3 record IDs don't match sequence file record IDs.")

        # Check GFF contains CDSs
        if not ('CDS', ) in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise ValueError("No CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set([
                n for key in examiner.parent_child_map(
                    open(options.genefinding_gff3)) for n in key
        ]):
            logging.error(
                'GFF3 structure is not suitable. CDS features must be childless but not parentless.'
            )
            raise ValueError('GFF3 structure is not suitable.')

    except AssertionError as err:
        logging.error('Parsing %r failed: %s', options.genefinding_gff3, err)
        raise
    return single_entries
Example 10
def check_gff_suitability(options, sequences):
    if options.gff3:
        options.gff_ids = []
        # Some GFFs have a header, but some GFF parser functions break with it,
        # so check for a header and error out if one exists.
        try:
            with open(options.gff3) as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    else:
                        int(line.split('\t')
                            [3])  # 4th column has to be a number (start)
                        int(line.split('\t')
                            [4])  # 5th column has to be a number (end)
        except ValueError as e:
            logging.error('Parsing %r failed: %s', options.gff3, e)
            logging.error(
                'It appears %r has a header. It should be removed or commented out for proper parsing.',
                options.gff3)
            sys.exit(1)
        try:
            examiner = GFF.GFFExaminer()
            gff_data = examiner.available_limits(open(options.gff3))
            # Check if at least one GFF locus appears in sequence
            gff_ids = set([n[0] for n in gff_data['gff_id']])
            options.gff_ids = list(gff_ids)
            if len(gff_ids) == 1 and len(options.all_record_ids) == 1:
                # If both inputs only have one record, assume they are the same, but first check coordinate compatibility
                logging.info(
                    "GFF3 and sequence have only one record. Assuming is the "
                    "same as long as coordinates are compatible.")
                limit_info = dict(gff_type=['CDS'])
                for record in GFF.parse(open(options.gff3),
                                        limit_info=limit_info):
                    break
                coord_max = max([n.location.end.real for n in record.features])
                if coord_max > len(sequences[0]):
                    logging.error(
                        'GFF3 record and sequence coordinates are not compatible.'
                    )
                    sys.exit(1)
                else:
                    options.single_entries = True
            elif len(gff_ids.intersection(set(options.all_record_ids))) == 0:
                logging.error(
                    'No GFF3 record IDs match any sequence record IDs.')
                sys.exit(1)
            else:
                options.single_entries = False
            # Check GFF contains CDSs
            if not ('CDS', ) in gff_data['gff_type']:
                logging.error('GFF3 does not contain any CDS.')
                sys.exit(1)
            # Check CDS are childless but not parentless
            if 'CDS' in set([
                    n for key in examiner.parent_child_map(open(options.gff3))
                    for n in key
            ]):
                logging.error(
                    'GFF3 structure is not suitable. CDS features must be childless but not parentless.'
                )
                sys.exit(1)
        except AssertionError as e:
            logging.error('Parsing %r failed: %s', options.gff3, e)
            sys.exit(1)