def clear_all_bioinformatics_data():
    """ Delete all bioinformatics data: reset statuses, drop derived collections, clear analysis dirs.

    Returns a flask json response {'status', 'msg', 'issues'}.
    """
    # Reset the bioinformatics status on every enzyme type
    for enz in EnzymeType.objects():
        enz.bioinformatics_status = 'Idle'
        enz.save()

    # Clear per-sequence blast / alignment bookkeeping
    for seq in Sequence.objects():
        seq.blast = None
        seq.alignments_made = None
        seq.save()

    # Drop all derived bioinformatics collections
    UniRef50.drop_collection()
    SSN_record.drop_collection()
    UniRef90.drop_collection()
    Alignment.drop_collection()
    SeqSimNet.drop_collection()

    # Remove and recreate the on-disk analysis data directories
    base = str(Path(__file__).parents[3])
    analysis_data_ssn = base + '/analysis/analysis_data/ssn'
    analysis_data_aba = base + '/analysis/analysis_data/all_by_all_blast'
    # ignore_errors=True so an already-missing directory does not abort the wipe
    shutil.rmtree(analysis_data_ssn, ignore_errors=True)
    shutil.rmtree(analysis_data_aba, ignore_errors=True)
    os.mkdir(analysis_data_ssn)
    os.mkdir(analysis_data_aba)

    print('ALL BIOINFORMATICS DATA DELETED')

    result = {'status': 'success', 'msg': 'Done', 'issues': []}
    return jsonify(result=result)
def task_check_ssn_status():
    """ Periodic task: de-duplicate SSN records and queue SSN expansion jobs once upstream work is idle """

    # De-duplicate: keep only the first SSN record per enzyme type, delete any extras
    for enzyme_type in EnzymeType.objects():
        ssn_query = list(SSN_record.objects(enzyme_type=enzyme_type))
        if len(ssn_query) > 1:
            print(
                f'Warning - multiple ssn records for {enzyme_type} - deleting extras'
            )
            for i in range(1, len(ssn_query)):
                ssn_query[i].delete()

    # Only queue SSN work when every upstream queue is empty,
    # so SSN expansion never races running blasts / alignments
    if len(current_app.blast_queue.jobs) + len(
            current_app.process_blasts_queue.jobs) + len(
            current_app.alignment_queue.jobs) == 0:
        print('Checking ssn status')
        ssn_records = SSN_record.objects().select_related()

        # Incomplete SSNs whose enzyme type has finished its blasts get an expansion job queued
        for ssn_r in ssn_records:
            if ssn_r.status != 'Complete' and ssn_r.enzyme_type.bioinformatics_status == 'Complete':
                # Only worth expanding if there are UniRef50 sequences for this type
                if len(UniRef50.objects(enzyme_type=ssn_r.enzyme_type)) != 0:
                    enzyme_type = ssn_r.enzyme_type.enzyme_type
                    job_name = f"{enzyme_type}_expand_ssn"
                    current_app.alignment_queue.enqueue(
                        ssn_tasks.task_expand_ssn, enzyme_type, job_id=job_name)
                    print(f'Queued SSN job for {enzyme_type}')

        # Enzyme types with completed blasts but no SSN record at all:
        # create one, provided there are sequences to build it from
        for enz_type_obj in EnzymeType.objects():
            if enz_type_obj.bioinformatics_status == 'Complete':
                if enz_type_obj not in SSN_record.objects().distinct(
                        'enzyme_type'):
                    unirefs = UniRef50.objects(enzyme_type=enz_type_obj)
                    biocatdb_seqs = list(
                        Sequence.objects(
                            db.Q(enzyme_type=enz_type_obj.enzyme_type)
                            & db.Q(bioinformatics_ignore__ne=True)))
                    # Drop entries with empty or missing protein sequences
                    biocatdb_seqs = [
                        seq for seq in biocatdb_seqs
                        if seq.sequence != '' and seq.sequence is not None
                    ]
                    if len(unirefs) + len(biocatdb_seqs) != 0:
                        print(
                            f"No SSN for {enz_type_obj.enzyme_type}, but blasts are complete and sequences present.. creating SSN."
                        )
                        job_name = f"{enz_type_obj.enzyme_type}_expand_ssn"
                        current_app.alignment_queue.enqueue(
                            ssn_tasks.task_expand_ssn,
                            enz_type_obj.enzyme_type,
                            job_id=job_name)
    else:
        # Upstream work still pending - report queue lengths instead of queuing SSN jobs
        print(f"Length blast queue = {len(current_app.blast_queue.jobs)}")
        print(
            f"Length process blast queue = {len(current_app.process_blasts_queue.jobs)}"
        )
        print(
            f"Length alignment queue = {len(current_app.alignment_queue.jobs)}"
        )
def nodes_not_present(self, only_biocatdb=False, max_num=None):
    """ Return a list of enzymes which are not in the ssn

    :param only_biocatdb: if True, only consider biocatdb Sequence objects (skip UniRef50)
    :param max_num: optional cap on how many missing sequences are returned
    :return: list of sequence objects which need adding to the graph
    """
    t0 = time.time()

    # All sequences of this enzyme type which actually have a usable protein sequence
    sequences = Sequence.objects(
        db.Q(enzyme_type=self.enzyme_type)
        & db.Q(sequence__ne="")
        & db.Q(sequence__ne=None)
        & db.Q(sequence_unavailable__ne=True))

    if only_biocatdb is True:
        seq_objects = list(sequences)
    else:
        unirefs = UniRef50.objects(enzyme_type=self.enzyme_type_obj)
        seq_objects = list(sequences) + list(unirefs)

    # Set gives O(1) membership tests instead of scanning the node list per sequence.
    # Keep sequences missing from the graph, skipping empty / very short (<= 12 aa) sequences.
    existing_nodes = set(self.graph.nodes)
    not_in_nodes = [
        seq_obj for seq_obj in seq_objects
        if seq_obj.enzyme_name not in existing_nodes
        and seq_obj.sequence is not None
        and len(seq_obj.sequence) > 12
    ]

    # Return only up to the maximum number of sequences (slice is a no-op when shorter)
    if max_num is not None:
        not_in_nodes = not_in_nodes[0:max_num]

    t1 = time.time()
    self.log(
        f"Identified {len(not_in_nodes)} {self.enzyme_type} proteins which need adding, in {round(t1 - t0, 1)} seconds"
    )
    return not_in_nodes
def full_uniref_check(enzyme_type_obj):
    """ Verify every UniRef50 record for an enzyme type against the online cluster ids.

    Stale records (cluster id no longer matches online) are deleted and their source
    blasts reset; the enzyme type and its SSN are then flagged for update.
    """
    unirefs = UniRef50.objects(enzyme_type=enzyme_type_obj).select_related()
    if len(unirefs) != 0:
        for ur in unirefs:
            print(f'Checking {ur.enzyme_name}..')
            ref_parser = UniRef_Parser()
            ref_parser.load_xml(ur.enzyme_name)
            time.sleep(0.2)  # rate-limit requests to the UniRef service

            # Cluster id no longer matches online - delete the record and reset source blasts
            if not ref_parser.check_id_match(ur.enzyme_name):
                print(
                    f"{ur.enzyme_name} doesnt match cluster id online, deleting.."
                )
                for seq in ur.result_of_blasts_for:
                    seq.blast = None
                    seq.save()
                ur.delete()

        # Flag the SSN for update; reuse the query result rather than re-querying
        ssn_query = SSN_record.objects(enzyme_type=enzyme_type_obj)
        if len(ssn_query) != 0:
            ssn_record = ssn_query[0]
            ssn_record.status = 'Queued for update'
            ssn_record.save()

        enzyme_type_obj.bioinformatics_status = 'Queued for update'
        enzyme_type_obj.save()

    print(f"Full UniRef50 update complete for {enzyme_type_obj.enzyme_type}")
def _find_uniref_metadata(self):
    """ Build a mapping of uniref enzyme name -> metadata dict for this enzyme type.

    Large / irrelevant fields are excluded from the query before serialising to json.
    """
    unirefs = UniRef50.objects(enzyme_type=self.enzyme_type_obj).exclude(
        'id', 'enzyme_type', 'sequence', "result_of_blasts_for")
    return {uniref.enzyme_name: json.loads(uniref.to_json())
            for uniref in unirefs}
def _make_db_fasta(self):
    """ Create a fasta file containing all the sequences of an enzyme type """
    db_seqs = Sequence.objects(
        db.Q(enzyme_type=self.enzyme_type)
        & db.Q(sequence__ne="")
        & db.Q(sequence__ne=None)
        & db.Q(sequence_unavailable__ne=True))
    uniref_seqs = UniRef50.objects(db.Q(enzyme_type=self.enzyme_type_obj))

    fasta_path = f"{self.directory}/{self.enz_type_dir_name}.fasta"
    with open(fasta_path, 'w') as file:
        # One fasta entry per sequence; strip internal newlines from the sequence text
        for seq_obj in list(db_seqs) + list(uniref_seqs):
            fasta_seq = seq_obj.sequence.replace('\n', '')
            file.write(f'>{seq_obj.enzyme_name}\n')
            file.write(f"{fasta_seq}\n")
def _add_uniref(self, alignment, identifier, sequence, enzyme_type_obj, seq_seed):
    """ Save a new UniRef50 entry parsed from a blast alignment, logging any failure.

    :param alignment: blast alignment whose title header carries name / taxonomy info
    :param identifier: uniref cluster identifier for the new entry
    :param sequence: protein sequence string
    :param enzyme_type_obj: EnzymeType document the entry belongs to
    :param seq_seed: the Sequence whose blast produced this hit
    """
    header = alignment.title
    try:
        new_entry = UniRef50(
            enzyme_name=identifier,
            protein_name=self._get_name_from_header(header),
            tax=self._get_tax_from_header(header),
            tax_id=self._get_tax_id_from_header(header),
            sequence=sequence,
            enzyme_type=enzyme_type_obj,
            result_of_blasts_for=[seq_seed],
            blast_round=self.blast_round)
        new_entry.save()
    except Exception as e:
        # Best effort - one failed record should not abort the whole blast parse
        self.log(e)
def clear_empty_ssns():
    """ Delete SSN records whose enzyme type has no uniref or biocatdb sequences left.

    Returns a flask json response {'status', 'msg', 'issues'}.
    """
    ssn_records = SSN_record.objects().select_related()
    for ssn_r in ssn_records:
        enzyme_type_obj = ssn_r.enzyme_type
        # count() avoids loading every document just to test for emptiness
        num_unirefs = UniRef50.objects(enzyme_type=enzyme_type_obj).count()
        num_biocat_seqs = Sequence.objects(
            db.Q(enzyme_type=enzyme_type_obj.enzyme_type)
            & db.Q(sequence__ne="")
            & db.Q(sequence__ne=None)
            & db.Q(sequence_unavailable__ne=True)).count()

        # An SSN with no underlying sequences is orphaned - remove it
        if num_unirefs + num_biocat_seqs == 0:
            ssn_r.delete()

    result = {'status': 'success', 'msg': 'Empty SSNs removed', 'issues': []}
    return jsonify(result=result)
def remove_nonexisting_seqs(self):
    """ Remove graph nodes which no longer correspond to a sequence in the database """
    t0 = time.time()
    sequences = Sequence.objects(
        enzyme_type=self.enzyme_type).distinct('enzyme_name')
    unirefs = UniRef50.objects(
        enzyme_type=self.enzyme_type_obj).distinct('enzyme_name')
    # Set gives O(1) membership tests instead of scanning a list for every node
    protein_names = set(sequences) | set(unirefs)

    count = 0
    # Iterate a copy of the node view, since nodes are removed during iteration
    for node in list(self.graph.nodes):
        if node not in protein_names:
            self.log(f"Node: {node} not in the database - removing")
            self.graph.remove_node(node)
            count += 1

    t1 = time.time()
    self.log(
        f"Identified {count} sequences which were in SSN but not in database, in {round(t1 - t0, 1)} seconds"
    )
def load_uniref_data():
    """ Look up a uniref entry by name and enzyme type, returning cluster and
    representative-sequence details as a flask json response. """
    name = request.form['name']
    enzyme_type = request.form['enzyme_type']
    enzyme_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]

    query = UniRef50.objects(
        db.Q(enzyme_type=enzyme_type_obj) & db.Q(enzyme_name=name))
    seq = query[0]

    # 'UP...' prefixed ids are not uniprot ids - blank them out
    uniprot_id = retrieve_uniref_info.strip_uniref_name(name)
    if uniprot_id.startswith('UP'):
        uniprot_id = ""

    # Pull cluster membership details from the uniref xml
    ref_parser = retrieve_uniref_info.UniRef_Parser()
    ref_parser.load_xml(name)
    uni90, uni100, uniprot = ref_parser.get_uniref_members()
    cluster_id = ref_parser.get_cluster_name()

    # Pfam annotations are only available when a uniprot id exists
    if uniprot_id != "":
        prot_parser = retrieve_uniref_info.UniProt_Parser()
        prot_parser.load_xml(uniprot_id)
        pfams = prot_parser.get_pfams()
    else:
        pfams = []

    result = {'rep_seq_name': seq.protein_name,
              'rep_seq_organism': seq.tax,
              'rep_seq_uniprot_id': uniprot_id,
              'cluster_id': cluster_id,
              'num_uni90': len(uni90),
              'num_uni100': len(uni100),
              'num_uniprot': len(list(uniprot.keys())),
              'pfam_object': pfams}
    return jsonify(result=result)
def bioinformatics_admin_page():
    """ Render the bioinformatics admin page with per-enzyme-type status,
    sequence counts and blast progress. """
    enzyme_types = EnzymeType.objects().order_by('enzyme_type')

    biostat = {}
    ssn = {}
    enzyme_numbers = {}
    enz_type_dict = {}

    # Single pass over enzyme types instead of three separate loops,
    # reusing one Sequence query per type instead of issuing it twice
    for enz_type_obj in enzyme_types:
        enz_type = enz_type_obj.enzyme_type

        biostat[enz_type] = enz_type_obj.bioinformatics_status

        # SSN status ('None' when no SSN record exists yet)
        q = SSN_record.objects(enzyme_type=enz_type_obj)
        if len(q) != 0:
            ssn[enz_type] = q[0].status
        else:
            ssn[enz_type] = 'None'

        seqs = Sequence.objects(enzyme_type=enz_type)
        enzyme_numbers[enz_type] = {
            'biocatdb': len(seqs),
            'uniref': UniRef50.objects(enzyme_type=enz_type_obj).count()}

        # Percentage of biocatdb sequences which have been blasted (0 if none)
        num_blasted = sum(1 for seq in seqs if seq.blast is not None)
        if num_blasted != 0:
            enz_type_dict[enz_type] = round((num_blasted / len(seqs)) * 100, 0)
        else:
            enz_type_dict[enz_type] = 0

    registry = StartedJobRegistry(queue=current_app.blast_queue)
    num_jobs = registry.count

    return render_template('bioinformatics/bioinformatics_admin.html',
                           blasted_enz_types=enz_type_dict,
                           biostat=biostat,
                           ssn=ssn,
                           num_jobs=num_jobs,
                           enzyme_numbers=enzyme_numbers)
def ssn_object():
    """ Return SSN status, sequence counts and precalculated alignment-score
    options for an enzyme type as a flask json response. """
    enzyme_type = request.form['enzyme_type']
    enzyme_type_obj = EnzymeType.objects(enzyme_type=enzyme_type)[0]
    ssn_obj = SSN_record.objects(enzyme_type=enzyme_type_obj)[0]

    # Human-readable description of each precalculated alignment score
    precalc_choices = {}
    for score, clusters in ssn_obj.num_at_alignment_score.items():
        idt = ssn_obj.identity_at_alignment_score[score]
        precalc_choices[score] = (
            f"{score}, {clusters} clusters, avg identity {idt[0]} ± {idt[1]}")

    result = {'status': ssn_obj.status,
              'num_biocatdb': Sequence.objects(enzyme_type=enzyme_type).count(),
              'num_uniref': UniRef50.objects(enzyme_type=enzyme_type_obj).count(),
              'precalculated': precalc_choices}
    return jsonify(result=result)
def parse(self, output, seq_obj):
    """ Parse blast output for a sequence, saving new uniref hits and
    linking hits which already exist in the database.

    :param output: blast record whose alignments are processed
    :param seq_obj: the Sequence which was blasted
    """
    blast_record = output
    query_length = len(seq_obj.sequence)
    enzyme_type_obj = EnzymeType.objects(
        enzyme_type=seq_obj.enzyme_type)[0]

    for alignment in blast_record.alignments:
        identifier = alignment.hit_id.replace(self.identifier_head, '')

        # Skip alignments which fail the quality filters
        if not self._alignment_filters(alignment, query_length):
            continue

        existing = UniRef50.objects(
            db.Q(enzyme_name=identifier)
            & db.Q(enzyme_type=enzyme_type_obj)).select_related()

        if len(existing) == 0:
            # New hit - fetch its sequence and save it if it passes the sequence filters
            protein_sequence = self._get_sequence(identifier)
            if self._sequence_filters(protein_sequence, query_length):
                self.log(f"Adding sequence for {identifier}")
                self._add_uniref(alignment, identifier, protein_sequence,
                                 enzyme_type_obj, seq_obj)
        else:
            # Known hit - record that this sequence's blast also found it
            self._add_result_of_blasts_for(seq_obj, existing[0])
def check_random_uniref(num_to_check=25):
    """ Spot-check random UniRef50 entries per enzyme type against the online
    records, triggering a full check for any type with a mismatch.

    :param num_to_check: number of random entries sampled (with replacement) per enzyme type
    """
    for enzyme_type in EnzymeType.objects():
        unirefs = UniRef50.objects(enzyme_type=enzyme_type)
        all_match = True
        if len(unirefs) != 0:
            for i in range(num_to_check):
                rand_uniref = random.choice(unirefs)
                name = rand_uniref.enzyme_name
                ref_parser = UniRef_Parser()
                ref_parser.load_xml(name)
                time.sleep(0.2)  # rate-limit requests to the UniRef service
                if not ref_parser.check_id_match(name):
                    all_match = False
                    # One mismatch already triggers a full check - stop sampling
                    break

            if not all_match:
                print(
                    f'Identified mismatches with online uniref entries.. full uniref check for {enzyme_type.enzyme_type}'
                )
                full_uniref_check(enzyme_type)

    print('Uniref checks complete ')
def _get_sequence_object(enzyme_name):
    """ Fetch the database object for a name: names containing 'UniRef50'
    come from the UniRef50 collection, otherwise from Sequence. """
    model = UniRef50 if 'UniRef50' in enzyme_name else Sequence
    return model.objects(enzyme_name=enzyme_name)[0]
def task_check_uniref_has_blast_source():
    """ Delete uniref entries which no longer have any source blast sequences """
    print('Checking for unirefs with no blast source..')
    orphaned = UniRef50.objects(result_of_blasts_for__size=0)
    for entry in orphaned:
        print(f"Deleting {entry.enzyme_name}")
        entry.delete()