def main(gff_file):
    gff_index = gff_file + ".index"
    if not os.path.exists(gff_index):
        print("Indexing GFF file")
        index(gff_file)
    # Renamed from `index` to avoid shadowing the index() function used above
    gff_access = GFFIndexedAccess(gff_file, keep_open=True)
    print(gff_access.seqids)
    print()
    for feature in gff_access.get_features_in_region("Chr2", 17500, 20000):
        print(feature)
    for feature in gff_access.get_features_in_region("Chr5", 500000, 502500):
        print(feature)
    exam = GFF.GFFExaminer()
    # print(exam.available_limits(gff_file))
    # print(exam.parent_child_map(gff_file))
    found = 0
    limit_info = dict(gff_type=[
        "protein", "gene", "mRNA", "exon", "CDS",
        "five_prime_UTR", "three_prime_UTR",
    ])
    for feature in gff_access.get_features_in_region("Chr1", 0, 50000, limit_info):
        found += 1
    print(found)
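# Usage sketch for the function above. `index` and `GFFIndexedAccess` are
# assumed to come from the indexed-access example that accompanies bcbio-gff
# (the import path below is an assumption), and the "Chr1"/"Chr2"/"Chr5"
# names assume an Arabidopsis-style GFF.
import os
import sys
from BCBio import GFF
# from gff_index_example import index, GFFIndexedAccess  # assumed module

if __name__ == "__main__":
    main(sys.argv[1])  # e.g. python gff_index_demo.py genes.gff3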
def annotate_gff_from_bw(bw, gff_path, gff_source_type=[('ensembl_havana', 'gene')]):
    """
    Segment bigwig file according to gff annotation file

    :param bw: Bigwig file
    :type bw: bigWigFile
    :param gff_path: path to gff file
    :type gff_path: str
    :param gff_source_type: parameters for which the gff file is filtered
    :type gff_source_type: list(tuple(str))
    :return: Segmented data array
    """
    gff = load_gff(gff_path, rel_path='', is_abs_path=True)
    examiner = GFF.GFFExaminer()
    chrom_list = list(examiner.available_limits(gff)['gff_id'].keys())
    gff.close()

    gen_mapping = []
    gff = load_gff(gff_path, rel_path='', is_abs_path=True)
    for chrom in chrom_list:
        limit_info = dict(gff_id=chrom, gff_source_type=gff_source_type)
        for rec in GFF.parse(gff, limit_info=limit_info):
            for num, r in enumerate(rec.features):
                # Wrap in an array so np.flip and np.nan_to_num(copy=False) work in place
                anno = np.asarray(bw.values('chr%s' % chrom[0],
                                            int(r.location.start),
                                            int(r.location.end)))
                if int(r.location.strand) == -1:
                    anno = np.flip(anno)
                anno = np.nan_to_num(anno, copy=False, nan=0.)
                gen_mapping.append(anno)
    gff.close()
    return gen_mapping
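# Call sketch: assumes pyBigWig as the bigwig backend and a local load_gff()
# helper that returns an open GFF handle (only their call sites appear above);
# the file names are hypothetical.
import numpy as np
import pyBigWig
from BCBio import GFF

bw = pyBigWig.open("coverage.bw")
segments = annotate_gff_from_bw(bw, "annotation.gff3")
print("segmented %d gene regions" % len(segments))
bw.close()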
def check_gff_suitability(gff_file: str, sequences: List[SeqRecord]) -> None:
    """ Checks that the provided GFF3 file is acceptable

        If only a single record is contained in both sequences and GFF, they
        are assumed to be the same.

        Arguments:
            gff_file: the path of the GFF file to check
            sequences: a list of SeqRecords

        Returns:
            None
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(gff_file))
        # Check if at least one GFF locus appears in sequence
        gff_ids = set(n[0] for n in gff_data['gff_id'])

        if len(gff_ids) == 1 and len(sequences) == 1:
            # If both inputs have only one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming they are "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(gff_file), limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise AntismashInputError("could not parse records from GFF3 file")

            if not record.features:
                raise AntismashInputError('GFF3 record %s contains no features' % record.id)

            coord_max = max(n.location.end.real for n in record.features)
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise AntismashInputError('incompatible GFF record and sequence coordinates')

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise AntismashInputError("GFF3 record IDs don't match sequence file record IDs.")

        # Check GFF contains CDSs
        if ('CDS',) not in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise AntismashInputError("no CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set(n for key in examiner.parent_child_map(open(gff_file)) for n in key):
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise AntismashInputError('GFF3 structure is not suitable.')
    except AssertionError as err:
        logging.error('Parsing %r failed: %s', gff_file, err)
        raise AntismashInputError(str(err)) from err
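# Call sketch: the file names are hypothetical, and AntismashInputError comes
# from the antiSMASH codebase this function belongs to.
import logging
from typing import List
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from BCBio import GFF

records = list(SeqIO.parse("genome.fasta", "fasta"))
check_gff_suitability("annotation.gff3", records)  # raises AntismashInputError on problems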
def check_gff_suitability(options, sequences):
    if not options.gff3:
        return
    try:
        examiner = GFF.GFFExaminer()
        gff_data = examiner.available_limits(open(options.gff3))
        # Check if at least one GFF locus appears in sequence
        gff_ids = set(n[0] for n in gff_data['gff_id'])
        if len(gff_ids) == 1 and len(options.all_record_ids) == 1:
            # If both inputs have only one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming they are "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])
            record_iter = GFF.parse(open(options.gff3), limit_info=limit_info)
            record = next(record_iter)
            coord_max = max(n.location.end.real for n in record.features)
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise ValueError('Incompatible GFF record and sequence coordinates')
            else:
                options.single_entries = True
        elif len(gff_ids.intersection(options.all_record_ids)) == 0:
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise ValueError("GFF3 record IDs don't match sequence file record IDs.")
        else:
            options.single_entries = False

        # Check GFF contains CDSs
        if ('CDS',) not in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise ValueError("No CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set(n for key in examiner.parent_child_map(open(options.gff3)) for n in key):
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise ValueError('GFF3 structure is not suitable.')
    except AssertionError as e:
        logging.error('Parsing %r failed: %s', options.gff3, e)
        raise
def read_gff(self):
    inf = open(self.gff_path, "r")
    e = GFF.GFFExaminer()
    # tmp = e.available_limits(inf)
    # pprint.pprint(tmp)
    for r in GFF.parse(inf):
        for record in r.features:
            if len(record.sub_features) >= 1:
                self.gene_locations[record.id] = (record.location.nofuzzy_start,
                                                  record.location.nofuzzy_end)
    inf.close()
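# Minimal harness (an assumption, not the original class): the method above
# only needs an object carrying gff_path and a gene_locations dict, so a
# small holder class suffices.
from BCBio import GFF

class GeneLocations:
    def __init__(self, gff_path):
        self.gff_path = gff_path
        self.gene_locations = {}

GeneLocations.read_gff = read_gff  # attach the function above as a method

loc = GeneLocations("annotation.gff3")  # hypothetical file
loc.read_gff()
print(len(loc.gene_locations), "genes with sub-features")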
def do_import(self, dirn='.'):
    in_file = self.__gff_fasta_fn
    in_handle = open(in_file)

    # In DEBUG=True mode, Django keeps a list of queries and blows up memory
    # usage when doing a big import. The following line disables this logging.
    connection.use_debug_cursor = False

    # First, retrieve rec names
    rec_ids = [i[0] for i in GFF.GFFExaminer().available_limits(in_handle).get('gff_id')]
    in_handle.close()

    # Then parse GFF by rec
    t0 = time.time()
    for rec_id in rec_ids:
        in_handle = open(in_file)
        limit_info = dict(gff_id=[rec_id])
        recs = [rec for rec in GFF.parse(in_handle, limit_info=limit_info) if rec.id == rec_id]
        for rec in recs:
            if self.__genome.fragments.filter(name=rec.id).count() > 0:
                print("skipping %s, already imported" % rec.id)
                break
            else:
                try:
                    GFFFragmentImporter(rec, dirn=dirn).parse_gff()
                except Exception as e:
                    print(str(e))
                    raise Exception(f"{rec} failed import validation")
        in_handle.close()
    print("%s seconds to parse and validate all contigs from GFF" % (time.time() - t0))

    # Then, build and annotate fragments
    for rec_id in rec_ids:
        in_handle = open(in_file)
        limit_info = dict(gff_id=[rec_id])
        recs = [rec for rec in GFF.parse(in_handle, limit_info=limit_info) if rec.id == rec_id]
        for rec in recs:
            importer = GFFFragmentImporter(rec, dirn=dirn)
            fragment = importer.do_import()
            if fragment is None:
                raise Exception(f"{rec} failed fragment generation")
            self.__genome.genome_fragment_set.create(fragment=fragment, inherited=False)
        in_handle.close()

    # Be nice and turn debug cursor back on
    connection.use_debug_cursor = True
# Tail of a helper (its def line is not shown in the snippet) returning the
# extreme CDS coordinates over a feature list:
    CDS_positions = [get_feature_start_end(feature)
                     for feature in features
                     if feature.type == 'CDS']
    CDS_starts, CDS_ends = zip(*CDS_positions)
    return min(CDS_starts), max(CDS_ends)


### Main function
try:
    infile, outfile = sys.argv[1:]
except ValueError:
    print(__doc__)
    sys.exit("Error: Needs exactly one input and one output file!")

examiner = GFF.GFFExaminer()
# parsing the whole GFF file at once takes a ton of memory, so split it into sets
with open(infile) as INFILE:
    GFF_limit_data = examiner.available_limits(INFILE)
chromosomes_and_counts = dict((c, n) for ((c,), n) in GFF_limit_data['gff_id'].items())
chromosome_sets = split_into_N_sets_by_counts(chromosomes_and_counts, N_chromosome_groups)

with open(outfile, 'w') as OUTFILE:
    for chromosome_set in chromosome_sets:
        genefile_parsing_limits = {'gff_id': list(chromosome_set)}
        with open(infile) as INFILE:
            ...  # snippet truncated here in the source
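# split_into_N_sets_by_counts is not shown above; a plausible greedy
# implementation (an assumption, not the original) balances the per-chromosome
# record counts across N groups so each GFF.parse pass stays small.
def split_into_N_sets_by_counts(counts, n_sets):
    """Partition a {name: count} dict into n_sets sets with roughly equal totals."""
    sets = [set() for _ in range(n_sets)]
    totals = [0] * n_sets
    # Place the largest chromosomes first, each into the currently lightest set.
    for name, count in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
        lightest = totals.index(min(totals))
        sets[lightest].add(name)
        totals[lightest] += count
    return sets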
def load_gff(self, gff, analysis_id, organism_id, landmark_type=None, re_protein=None,
             re_protein_capture="^(.*?)$", fasta=None, no_seq_compute=False, quiet=False,
             add_only=False, protein_id_attr=None):
    """
    Load features from a gff file

    :type gff: str
    :param gff: Path to the GFF file to load

    :type analysis_id: int
    :param analysis_id: Analysis ID

    :type organism_id: int
    :param organism_id: Organism ID

    :type landmark_type: str
    :param landmark_type: Type of the landmarks (will speed up loading if provided,
        e.g. contig, should be a term of the Sequence ontology)

    :type re_protein: str
    :param re_protein: Replacement string for the protein name using capturing
        groups defined by --re_protein_capture

    :type re_protein_capture: str
    :param re_protein_capture: Regular expression to capture groups in mRNA name
        to use in --re_protein (e.g. "^(.*?)-R([A-Z]+)$", default="^(.*?)$")

    :type protein_id_attr: str
    :param protein_id_attr: Attribute containing the protein uniquename. It is
        searched at the mRNA level, and if not found at CDS level.

    :type fasta: str
    :param fasta: Path to a Fasta containing sequences for some features. When
        creating a feature, if its sequence is in this fasta file it will be
        loaded. Otherwise for mRNA and polypeptides it will be computed from the
        genome sequence (if available), otherwise it will be left empty.

    :type no_seq_compute: bool
    :param no_seq_compute: Disable the computation of mRNA and polypeptides
        sequences based on genome sequence and positions.

    :type quiet: bool
    :param quiet: Hide progress information

    :type add_only: bool
    :param add_only: Use this flag if you're not updating existing features, but
        just adding new features to the selected analysis and organism. It will
        speed up loading and reduce memory usage, but might produce errors in
        case of an already existing feature.

    :rtype: None
    :return: None
    """
    if len(self.ci.analysis.get_analyses(analysis_id=analysis_id)) != 1:
        raise Exception("Could not find analysis with id '{}'".format(analysis_id))
    if len(self.ci.organism.get_organisms(organism_id=organism_id)) != 1:
        raise Exception("Could not find organism with id '{}'".format(organism_id))
    if protein_id_attr and re_protein:
        raise Exception("--protein_id_attr and --re_protein cannot be used at the same time.")

    self.cache_existing = not add_only

    # Get possible landmarks
    landmarks = self.session.query(self.model.feature.name, self.model.feature.uniquename,
                                   self.model.feature.feature_id, self.model.feature.type_id,
                                   self.model.feature.organism_id) \
        .filter_by(organism_id=organism_id)
    if landmark_type:
        # Filter by landmark type if provided (else we look for all features)
        landmark_type_id = self.ci.get_cvterm_id(landmark_type, 'sequence')
        landmarks = landmarks.filter(self.model.feature.type_id == landmark_type_id)

    self._landmark_cache = {}
    for lm in landmarks:
        if lm.name not in self._landmark_cache:
            self._landmark_cache[lm.name] = []
        if lm.feature_id not in self._landmark_cache[lm.name]:
            # There may be multiple landmarks with the same name
            self._landmark_cache[lm.name].append(lm.feature_id)

        # Also look for uniquename
        if lm.uniquename not in self._landmark_cache:
            self._landmark_cache[lm.uniquename] = []
        if lm.feature_id not in self._landmark_cache[lm.uniquename]:
            self._landmark_cache[lm.uniquename].append(lm.feature_id)

    examiner = GFF.GFFExaminer()
    gff_handle = open(gff)
    gff_limits = examiner.available_limits(gff_handle)
    gff_handle.close()

    # Check that we have all the cvterms in the db
    self._blacklisted_cvterms = []
    for feat_type in gff_limits['gff_type']:
        type_to_check = feat_type[0]
        # Be tolerant for proteins (shameless hard coding)
        if type_to_check == 'protein':
            type_to_check = 'polypeptide'

        # Will raise an exception if not present + keep value in cache
        try:
            self.ci.get_cvterm_id(type_to_check, 'sequence', True)
        except chado.RecordNotFoundError:
            if type_to_check not in self._blacklisted_cvterms:
                warn("WARNING: will skip features of unknown type: %s", type_to_check)
                self._blacklisted_cvterms.append(type_to_check)

    # Read optional fasta file
    self._fasta_sequence_cache = {}
    if fasta:
        for record in SeqIO.parse(fasta, "fasta"):
            self._fasta_sequence_cache[record.id] = str(record.seq)

    # Check that all landmarks are there
    for seq_id in gff_limits['gff_id']:
        seq_id = seq_id[0]
        if seq_id not in self._landmark_cache:
            if landmark_type:
                # Landmark does not exist yet, but we know how to create it
                lm = SeqFeature(FeatureLocation(0, 1), type=landmark_type,
                                qualifiers={'ID': [seq_id], 'Name': [seq_id]})
                if seq_id in self._fasta_sequence_cache:
                    added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id,
                                                             have_loc=False,
                                                             residues=self._fasta_sequence_cache[seq_id])
                else:
                    added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id,
                                                             have_loc=False)
                self._landmark_cache[seq_id] = [added_feat['feature_id']]
            else:
                raise Exception("Could not find landmark named '{}', add --landmark_type to create it".format(seq_id))
        elif len(self._landmark_cache[seq_id]) > 1:
            raise Exception("Found {} landmarks with same name '{}'".format(len(self._landmark_cache[seq_id]), seq_id))

    count_ins = 0
    for rec in GFF.parse(gff):
        # Preload landmark seq to compute some seqs on it.
        # We compare to ????... as the gff parser will populate rec.seq with a
        # fake sequence based on the size from the "sequence-region" header
        if not no_seq_compute:
            if rec.id in self._fasta_sequence_cache:
                rec.seq = Seq.Seq(self._fasta_sequence_cache[rec.id])
                del self._fasta_sequence_cache[rec.id]  # Save a little memory
            elif len(rec.seq) == 0 or str(rec.seq)[0:10] == "??????????":
                seq_res = self.session.query(self.model.feature.residues) \
                    .filter(self.model.feature.uniquename == rec.id)
                if landmark_type:
                    seq_res = seq_res.filter(self.model.feature.type_id == landmark_type_id)
                seq_res = seq_res.all()
                if len(seq_res) == 1 and seq_res[0].residues:
                    rec.seq = Seq.Seq(seq_res[0].residues)

        # Set a custom attr to store the chado feature_id
        rec._chado_feature_id = self._landmark_cache[rec.id][0]

        if not quiet:
            print("Loading features on {}".format(rec.id))

        for f in rec.features:
            self._load_gff_feature_with_children(rec, f, analysis_id, organism_id,
                                                 re_protein_capture, re_protein,
                                                 protein_id_attr,
                                                 no_seq_compute=no_seq_compute)
            count_ins += 1
            if not quiet:
                print("Inserted feature #{}".format(count_ins))

    self._update_rel_ranks()

    self.session.commit()

    self._reset_cache()

    return {'inserted': count_ins}
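# Call sketch: this loader matches the feature client in the python-chado
# library; the connection parameters and file names below are assumptions.
from chado import ChadoInstance

ci = ChadoInstance(dbhost="localhost", dbname="chado", dbuser="chado",
                   dbpass="chado", dbschema="public", dbport=5432)
ci.feature.load_gff(gff="annotation.gff3", analysis_id=1, organism_id=1,
                    landmark_type="contig", fasta="genome.fasta")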
def check_gff_suitability(options, sequences) -> bool:
    """ Checks that the provided GFF3 file is acceptable

        If only a single record is contained in both sequences and GFF, they
        are assumed to be the same.

        Returns:
            True if only a single entry is contained by both inputs and
            their sequence coordinates match
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(options.genefinding_gff3))
        # Check if at least one GFF locus appears in sequence
        gff_ids = set(n[0] for n in gff_data['gff_id'])

        single_entries = False

        if len(gff_ids) == 1 and len(sequences) == 1:
            # If both inputs have only one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming they are "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(options.genefinding_gff3), limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise ValueError("Could not parse records from GFF3 file")

            if not record.features:
                raise ValueError('GFF3 record %s contains no features' % record.id)

            coord_max = max(n.location.end.real for n in record.features)
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise ValueError('Incompatible GFF record and sequence coordinates')

            single_entries = True

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise ValueError("GFF3 record IDs don't match sequence file record IDs.")

        # Check GFF contains CDSs
        if ('CDS',) not in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise ValueError("No CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in set(n for key in examiner.parent_child_map(open(options.genefinding_gff3))
                        for n in key):
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise ValueError('GFF3 structure is not suitable.')
    except AssertionError as err:
        logging.error('Parsing %r failed: %s', options.genefinding_gff3, err)
        raise
    return single_entries
def check_gff_suitability(options, sequences):
    if options.gff3:
        options.gff_ids = []
        # Some GFFs have a header, but some GFF parser functions break on it,
        # so check for a header and error out if one exists.
        try:
            with open(options.gff3) as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    else:
                        int(line.split('\t')[3])  # 4th column has to be a number (start)
                        int(line.split('\t')[4])  # 5th column has to be a number (end)
        except ValueError as e:
            logging.error('Parsing %r failed: %s', options.gff3, e)
            logging.error('It appears %r has a header. It should be removed or '
                          'commented out for proper parsing.', options.gff3)
            sys.exit(1)
        try:
            examiner = GFF.GFFExaminer()
            gff_data = examiner.available_limits(open(options.gff3))
            # Check if at least one GFF locus appears in sequence
            gff_ids = set(n[0] for n in gff_data['gff_id'])
            options.gff_ids = list(gff_ids)
            if len(gff_ids) == 1 and len(options.all_record_ids) == 1:
                # If both inputs have only one record, assume they are the same,
                # but first check coordinate compatibility
                logging.info("GFF3 and sequence have only one record. Assuming they are "
                             "the same as long as coordinates are compatible.")
                limit_info = dict(gff_type=['CDS'])
                record = next(GFF.parse(open(options.gff3), limit_info=limit_info))
                coord_max = max(n.location.end.real for n in record.features)
                if coord_max > len(sequences[0]):
                    logging.error('GFF3 record and sequence coordinates are not compatible.')
                    sys.exit(1)
                else:
                    options.single_entries = True
            elif len(gff_ids.intersection(set(options.all_record_ids))) == 0:
                logging.error('No GFF3 record IDs match any sequence record IDs.')
                sys.exit(1)
            else:
                options.single_entries = False

            # Check GFF contains CDSs
            if ('CDS',) not in gff_data['gff_type']:
                logging.error('GFF3 does not contain any CDS.')
                sys.exit(1)

            # Check CDS are childless but not parentless
            if 'CDS' in set(n for key in examiner.parent_child_map(open(options.gff3))
                            for n in key):
                logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
                sys.exit(1)
        except AssertionError as e:
            logging.error('Parsing %r failed: %s', options.gff3, e)
            sys.exit(1)