def update_sequence_info(record, seed_id, seqid_of_description,
                         aligned_sequence_of_seqid, num_aligned_columns,
                         sequence_of_seqid, sequence_header_of_seqid,
                         assume_seed_first, seed_sequence_header):
    """Update the per-seqid sequence dictionaries for one alignment record.

    record - a Biopython Bio.SeqRecord.SeqRecord; record.description is a
        string (e.g., "O28424_ARCFU/3-199 1,197")
    seqid_of_description - dictionary (reverse of seqs)
    """
    # Find the SEQ number associated with the record
    description = record.description
    seqid = seqid_of_description[description]
    # Find or create the aligned sequence record
    aligned_seq = record.seq.tostring()
    aligned_seguid = CheckSum.seguid(record.seq)
    aligned_sequence_of_seqid[seqid] = _sequence(AlignedSequence, aligned_seq,
                                                 aligned_seguid)
    # Compute the number of aligned columns, if not already known
    if num_aligned_columns == 0:
        num_aligned_columns = len(
            aligned_seq.translate(trivial_translation, dotlowercase))
    # Find or create the unaligned sequence record
    unaligned_seq = aligned_seq.translate(uppercase_translation, dotdash)
    unaligned_seguid = CheckSum.seguid(unaligned_seq)
    sequence_of_seqid[seqid] = _sequence(Sequence, unaligned_seq,
                                         unaligned_seguid)
    # Find or create the sequence_header record
    sequence_header_objects = \
        SequenceHeader.objects.filter(header__exact=description,
                                      sequence__exact=sequence_of_seqid[seqid])
    if sequence_header_objects:
        # Since the combination of header and sequence_id is constrained to be
        # unique, there can only be one
        sequence_header_of_seqid[seqid] = sequence_header_objects[0]
    else:
        # Create a new sequence_header record
        sequence_header_of_seqid[seqid] = \
            create_sequence_header(record.id, description,
                                   sequence_of_seqid[seqid])
    # Remember the seed's sequence_header the first time we see it
    if seed_sequence_header is None:
        if assume_seed_first:
            seed_sequence_header = sequence_header_of_seqid[seqid]
        elif seed_id is not None and seed_id == record.id:
            seed_sequence_header = sequence_header_of_seqid[seqid]
    return seed_sequence_header
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <uniprot_accession>" % sys.argv[0]
        sys.exit(0)
    uniprot_accession = sys.argv[1]
    if not uniprot_accession_re1.match(uniprot_accession) and not \
            uniprot_accession_re2.match(uniprot_accession):
        print "The argument must be a valid UniProt accession"
        sys.exit(1)
    try:
        response = urllib2.urlopen('http://www.uniprot.org/uniprot/%s.fasta'
                                   % uniprot_accession)
    except urllib2.HTTPError:
        print "Unable to download sequence from UniProt"
        sys.exit(1)
    record = SeqIO.parse(response, 'fasta').next()
    seguid = CheckSum.seguid(record.seq)
    sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
    if sequence_objects:
        tree_node_alignment_objects = TreeNodeAlignment.objects.filter(
            sequence_header__sequence__in=sequence_objects)
        if tree_node_alignment_objects:
            families = set([obj.tree_node.tree.family
                            for obj in tree_node_alignment_objects])
            for family in families:
                print family.get_accession()
        else:
            print "There are no families containing this sequence."
    else:
        print "This sequence is not in the PhyloFacts 3 database"
def parse_smo(work_path):
    f = open(os.path.join(work_path, "satchmo.smo"))
    alignment_offset_dict = {}
    records = set()
    current_header = ""
    current_sequence = ""
    alignmentOffset = 0
    alignmentNumBytes = 0
    withinAlignment = False
    start_of_line = f.tell()
    line = f.readline()
    while line:
        if line.rstrip() == 'alignment':
            # skip past the line with the curly brace
            f.readline()
            # the next line is the start of the alignment
            alignmentOffset = f.tell()
            withinAlignment = True
        elif line.rstrip() == '//':
            if current_sequence != '':
                seguid = CheckSum.seguid(current_sequence)
                records.add(seguid)
            alignmentNumBytes = start_of_line - alignmentOffset
            for seguid in records:
                if seguid not in alignment_offset_dict:
                    alignment_offset_dict[seguid] = {}
                alignment_offset_dict[seguid][len(records)] \
                    = (alignmentOffset, alignmentNumBytes)
            records = set()
            current_header = ""
            current_sequence = ""
            withinAlignment = False
        elif withinAlignment:
            if len(line) > 0 and line[0] == '>':
                if current_sequence != '':
                    seguid = CheckSum.seguid(current_sequence)
                    records.add(seguid)
                current_header = line[1:].rstrip()
                current_sequence = ""
            else:
                current_sequence = current_sequence + \
                    line.strip().translate(uppercase_translation, dotdash)
        start_of_line = f.tell()
        line = f.readline()
    f.close()
    return alignment_offset_dict
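# Illustrative sketch (an assumed helper, not part of the original module):
# the dictionary returned by parse_smo maps a sequence seguid to its SATCHMO
# sub-alignments, keyed by the number of sequences in each sub-alignment,
# with values of the form (byte offset, byte count) into satchmo.smo.  A
# caller can use it to pull out one alignment without re-reading the whole
# file.  work_path, seguid and num_records below are placeholders.
def read_subalignment(work_path, seguid, num_records):
    alignment_offset_dict = parse_smo(work_path)
    offset, num_bytes = alignment_offset_dict[seguid][num_records]
    f = open(os.path.join(work_path, "satchmo.smo"))
    f.seek(offset)
    alignment_text = f.read(num_bytes)
    f.close()
    return alignment_text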
def main():
    # This script takes a UniProt accession as input and finds the best GHG
    # books that contain the accession.  "Best" means the book with the most
    # diverse taxa.
    # Note: in PhyloFacts 3 there are families without trees (normally
    # because all the sequences in the family are identical, so the tree has
    # no branch lengths); this script cannot find those families.
    if len(sys.argv) < 2:
        print "usage: %s UniProt_accession outputfile" % sys.argv[0]
        sys.exit(1)
    accession = sys.argv[1]
    # Find the UniProt index for the accession
    uniprot_dat_indices = UniProtDatIndex.objects.filter(
        uniprot_accession=accession)
    if uniprot_dat_indices:
        uniprot_object = uniprot_dat_indices[0].uniprot
        tree_node_alignments = TreeNodeAlignment.objects.filter(
            sequence_header__uniprot=uniprot_object,
            tree_node__tree__family__active=True,
            tree_node__tree__family__family_type='G').exclude(
                tree_node__tree__family__status__exact='bad')
        if len(tree_node_alignments) != 0:
            families = set([tree_node_alignment.tree_node.tree.family
                            for tree_node_alignment in tree_node_alignments])
            (max_family_id, max_taxa_num) = get_all_taxon_ids(families)
            print ("%s,%s,%s" % (accession, max_family_id, max_taxa_num))
        else:
            print ("%s is not covered in the database\n" % accession)
    else:
        # The accession is not indexed; fall back to looking up the sequence
        # by its seguid.
        print ("%s is not in uniprot_dat_index, try the seguid\n" % accession)
        uniprot_accession = accession
        if uniprot_accession_re1.match(uniprot_accession) or \
                uniprot_accession_re2.match(uniprot_accession):
            try:
                response = urllib2.urlopen(
                    'http://www.uniprot.org/uniprot/%s.fasta'
                    % uniprot_accession)
            except urllib2.HTTPError:
                print ("Unable to download sequence from UniProt\n")
                sys.exit(1)
            record = SeqIO.parse(response, 'fasta').next()
            seguid = CheckSum.seguid(record.seq)
            sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
            if sequence_objects:
                tree_node_alignments = TreeNodeAlignment.objects.filter(
                    sequence_header__sequence__in=sequence_objects,
                    tree_node__tree__family__active=True,
                    tree_node__tree__family__family_type='G').exclude(
                        tree_node__tree__family__status__exact='bad')
                if len(tree_node_alignments) != 0:
                    families = set(
                        [tree_node_alignment.tree_node.tree.family
                         for tree_node_alignment in tree_node_alignments])
                    (max_family_id, max_taxa_num) = get_all_taxon_ids(families)
                    print ("%s,%s,%s" % (accession, max_family_id,
                                         max_taxa_num))
                else:
                    print ("There are no families containing %s.\n"
                           % accession)
            else:
                print ("%s is not in the PhyloFacts 3 database.\n" % accession)
        else:
            print ("The argument must be a valid UniProt accession\n")
def insertPDBPredictionsIntoDB(hmm, tree_node, basename):
    hmmsearch_filename = basename + "_vs_PDB.hmmsearch.out"
    hmmsearch_results = parse_results_of_hmmsearch_or_hmmscan.parse(
        hmmsearch_filename, 0.001, 1, 1)
    # There should only be one query here - the family HMM
    for query in hmmsearch_results.hit_result_of_name_of_query:
        for pdb_chain_id in hmmsearch_results.hit_result_of_name_of_query[
                query]:
            pdb_id, chain_id = pdb_chain_id.split('_')
            pdb_chain_objects = PDB_Chain.objects.filter(
                pdb__id__exact=pdb_id, chain_id__exact=chain_id)
            if pdb_chain_objects:
                pdb_chain = pdb_chain_objects[0]
            else:
                print "Unrecognized PDB chain %s in hmmsearch results." \
                    % pdb_chain_id,
                print "The PDB_Chain table may be out of date."
                continue
            hit_result \
                = hmmsearch_results.hit_result_of_name_of_query[query][
                    pdb_chain_id]
            for match_number in hit_result.matches:
                match_result = hit_result.matches[match_number]
                aligned_seguid = CheckSum.seguid(match_result.aligned_hit)
                aligned_sequence_objects = AlignedSequence.objects.filter(
                    seguid__exact=aligned_seguid)
                if aligned_sequence_objects:
                    aligned_sequence = aligned_sequence_objects[0]
                else:
                    # Because multiple copies of this function run
                    # simultaneously, it is possible this was just created
                    # moments ago.  So, get_or_create just in case.
                    aligned_sequence, is_created \
                        = AlignedSequence.objects.get_or_create(
                            chars=match_result.aligned_hit,
                            seguid=aligned_seguid)
                sequence_hmm = SequenceHMM.objects.create(
                    hmm=hmm,
                    sequence=pdb_chain.full_sequence,
                    aligned_sequence=aligned_sequence,
                    bit_score=match_result.bit_score,
                    e_value=match_result.i_evalue,
                    sequence_type='query',
                    hmm_start=match_result.hmm_from,
                    hmm_end=match_result.hmm_to,
                    sequence_start=match_result.seq_from,
                    sequence_end=match_result.seq_to,
                    match_type=match_result.match_type,
                    n_aligned_chars=match_result.num_aligned_chars)
                TreeNodePDB.objects.create(sequence_hmm=sequence_hmm,
                                           tree_node=tree_node,
                                           pdb_chain=pdb_chain)
def parse_tree(work_path):
    seguids = {}
    f = open(os.path.join(work_path, "input_unaligned.fasta"))
    for record in SeqIO.parse(f, "fasta"):
        id = record.id.replace(':', '_')
        seguids[id] = CheckSum.seguid(record.seq)
    f.close()
    f = open(os.path.join(work_path, "satchmo_tree.newick"))
    tree_string = f.read()
    f.close()
    tree_string = tree_string.translate(trivial_translation,
                                        string.whitespace)
    root = node()
    root.readFromTreeString(tree_string, seguids, 0)
    root.updateLeftId(1)
    return root
def main():
    if len(sys.argv) < 3:
        print "Usage: %s <seedX_id> <seedY_id>" % sys.argv[0]
        sys.exit(0)
    seedX_id = sys.argv[1]
    seedY_id = sys.argv[2]
    all_alignment_filenames = \
        matchmaker_seed_alignment_filenames(seedX_id, seedY_id)
    output_handle = open(alignment_nr_csv_file(seedX_id, seedY_id), "w")
    output_handle.write("Alignment\n")
    nr_alignments = {}
    dups_of_alignment = {}
    for alignment_filename in all_alignment_filenames:
        # 7/15: replaced the direct file reads below with functions that can
        # read compressed alignment files:
        #   f = open(alignment_filename)
        #   lines = f.readlines()
        #   if len(lines) < 4:
        #       continue
        #   ali = '*'.join([lines[1], lines[3]])
        (seed1, seq1), (seed2, seq2) = read_alignment_file(alignment_filename)
        ali = '*'.join([seq1, seq2])
        the_seguid = CheckSum.seguid(ali)
        if the_seguid in nr_alignments and nr_alignments[the_seguid] != None:
            if ali in nr_alignments[the_seguid] \
                    and nr_alignments[the_seguid][ali] != None:
                nr_alignments[the_seguid][ali].add(alignment_filename)
                dups_of_alignment[alignment_filename] = \
                    nr_alignments[the_seguid][ali]
            else:
                nr_alignments[the_seguid][ali] = set([alignment_filename])
                output_handle.write("%s\n" % alignment_filename)
                dups_of_alignment[alignment_filename] = \
                    nr_alignments[the_seguid][ali]
        else:
            nr_alignments[the_seguid] = {ali: set([alignment_filename])}
            output_handle.write("%s\n" % alignment_filename)
            dups_of_alignment[alignment_filename] = \
                nr_alignments[the_seguid][ali]
    pklfp = open(
        os.path.join(align_dir(seedX_id, seedY_id),
                     "%s_%s_alignment_dict.pkl" % (seedX_id, seedY_id)), "w")
    cPickle.dump((dups_of_alignment, nr_alignments), pklfp)
    pklfp.close()
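# Illustrative sketch (an assumed reader, not part of the original script):
# loading the pickle written above.  dups_of_alignment maps each alignment
# filename to the set of filenames carrying the same alignment, and
# nr_alignments maps an alignment seguid to {alignment_string: set(filenames)}.
def load_alignment_dict(seedX_id, seedY_id):
    path = os.path.join(align_dir(seedX_id, seedY_id),
                        "%s_%s_alignment_dict.pkl" % (seedX_id, seedY_id))
    f = open(path)
    dups_of_alignment, nr_alignments = cPickle.load(f)
    f.close()
    return dups_of_alignment, nr_alignments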
def main():
    parser = OptionParser(usage='%prog')
    (options, args) = parser.parse_args()
    f = open("/clusterfs/ohana/external/pdb_rcsb_full", "rU")
    for record in SeqIO.parse(f, "fasta"):
        fields = record.description.split()
        if len(fields) < 2 or fields[1] != 'mol:protein':
            continue
        pdb_id, chain_id = fields[0].split('_')
        pdb_objects = PDB.objects.filter(id__exact=pdb_id)
        if pdb_objects:
            pdb = pdb_objects[0]
        else:
            pdb = PDB.objects.create(id=pdb_id)
        seguid = CheckSum.seguid(record.seq)
        sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
        if sequence_objects:
            sequence = sequence_objects[0]
        else:
            sequence = Sequence.objects.create(chars=record.seq.tostring(),
                                               seguid=seguid)
        pdb_chain_objects = PDB_Chain.objects.filter(pdb__exact=pdb,
                                                     chain_id__exact=chain_id)
        if pdb_chain_objects:
            pdb_chain = pdb_chain_objects[0]
            # Update the sequence in case it has changed
            if pdb_chain.full_sequence != sequence:
                pdb_chain.full_sequence = sequence
                pdb_chain.all_residues_have_atom_records_f = None
            if len(fields) >= 4:
                pdb_chain.description = ' '.join(fields[3:])
            pdb_chain.save()
        else:
            if len(fields) >= 4:
                description = ' '.join(fields[3:])
                pdb_chain = PDB_Chain.objects.create(pdb=pdb,
                                                     chain_id=chain_id,
                                                     full_sequence=sequence,
                                                     description=description)
            else:
                pdb_chain = PDB_Chain.objects.create(pdb=pdb,
                                                     chain_id=chain_id,
                                                     full_sequence=sequence)
def get_seguids_of_ids(work_path):
    seguids = {}
    ids_of_seguid = {}
    f = open(os.path.join(work_path, "input_unaligned.fasta"))
    for record in SeqIO.parse(f, "fasta"):
        id = record.id.replace(':', '_')
        additional_id = id.translate(special_tree_char_translation, '')
        seguid = CheckSum.seguid(record.seq)
        seguids[id] = seguid
        seguids[additional_id] = seguid
        # Don't put the additional_id in ids_of_seguid, as views.py expects
        # there to be one id per leaf.
        if seguid not in ids_of_seguid:
            ids_of_seguid[seguid] = set()
        ids_of_seguid[seguid].add(id)
    f.close()
    for seguid in ids_of_seguid:
        id_list = list(ids_of_seguid[seguid])
        id_list.sort()
        ids_of_seguid[seguid] = id_list
    f = open(os.path.join(work_path, "ids_of_seguid.pkl"), "w")
    cPickle.dump(ids_of_seguid, f)
    f.close()
    return (seguids, ids_of_seguid)
def main(): if len(sys.argv) < 2: usage() sys.exit(0) family_accession = sys.argv[1] try: family_id = int(family_accession[3:]) except ValueError: usage() sys.exit(1) try: family = Family.objects.get(id=family_id) if family.status == "bad": raise Family.DoesNotExist except Family.DoesNotExist: print "No family found with accession %s" % family_accession sys.exit(1) family_dir = get_dir_of_family_accession(family_accession) seed_path = os.path.join(family_dir, "seed.fa") if not os.path.exists(seed_path): if os.path.realpath(family_dir).find('TreeFam') >= 0: os.chdir(family_dir) possible_seed_files = glob.glob("*_HUMAN*.fa") candidates = set() swissprot_desc_re = re.compile('^%s$' % swissprot_desc_pat) for file in possible_seed_files: basename = os.path.splitext(file)[0] components = basename.split('_') if len(components) < 2 or components[1] != 'HUMAN': continue if swissprot_desc_re.match(components[0]) is None and \ uniprot_accession_re1.match(components[0]) is None and \ uniprot_accession_re2.match(components[0]) is None: continue if len(components) > 2: if len(components) != 4: continue try: start = int(components[2]) except ValueError: continue try: end = int(components[3]) except ValueError: continue candidates.add(file) if len(candidates) != 1: print "Seed file for family %s missing" % family_accession sys.exit(1) seed_path = os.path.join(family_dir, list(candidates)[0]) else: print "Seed file for family %s missing" % family_accession sys.exit(1) f = open(seed_path) seed_record = SeqIO.parse(f, "fasta").next() f.close() seed_seguid = CheckSum.seguid(seed_record.seq) seed_id = seed_record.id.strip('lcl|') print "%s: FlowerPower seed id %s" % (family_accession, seed_id) seed_accession = None recognizing_regexp = None # uniprot_accession_re1 recognizes a UniProt accession only if it is the # whole string, not if it is a substring for regexp in [ re.compile(uniprot_accession_pat1), re.compile(uniprot_accession_pat2), gi_re ]: m = regexp.search(seed_id) if m: seed_accession = m.group() recognizing_regexp = regexp break if seed_accession is None: print "Could not parse accession from seed id" sys.exit(1) sequences = Sequence.objects.filter(seguid=seed_seguid) sequence_headers = SequenceHeader.objects.filter(sequence__in=sequences) possible_sequence_headers = set() for sequence_header in sequence_headers: m = recognizing_regexp.search(sequence_header.header) if m: accession = m.group() if accession == seed_accession: if len(sequence_header.header) >= 4 and \ sequence_header.header[0:4] == 'lcl|': possible_sequence_headers = set([sequence_header]) break if sequence_header.header.find('|') < 0: possible_sequence_headers = set([sequence_header]) break possible_sequence_headers.add(sequence_header) if len(possible_sequence_headers) > 1: alns = TreeNodeAlignment.objects.filter( tree_node=family.canonical_root_node(), sequence_header__in=possible_sequence_headers) possible_sequence_headers = set([aln.sequence_header for aln in alns]) print "%s: Found %d possible sequence headers" % ( family_accession, len(possible_sequence_headers)) for seqhdr in possible_sequence_headers: print "%s: possible sequence header %s" % (family_accession, seqhdr.header) if len(possible_sequence_headers) == 1: seed_sequence_header = list(possible_sequence_headers)[0] print "Assigning seed sequence header id %d to family %s" \ % (seed_sequence_header.id, family_accession) family.seed_sequence_header = seed_sequence_header family.save()
def main(): parser = OptionParser(usage='%prog [StartPos_EndPos]') parser.add_option( '--update_features', action='store_true', dest='update_features', default=True, help="Insert new entries into uniprot_feature and delete old ones.") parser.add_option( '--no_update_features', action='store_false', dest='update_features', default=True, help="Only insert entries into uniprot_feature for new uniprot records." ) (options, args) = parser.parse_args() sharding = False if len(args) >= 1: sharding = True shard_spec = args[0] positions = shard_spec.split('_') if positions < 2: parser.error( "Must specify shard as starting and ending file positions " \ + "separated by an underscore") try: start_pos = int(positions[0]) except ValueError: parser.error( "Must specify shard as starting and ending file positions " \ + "separated by an underscore") try: end_pos = int(positions[1]) except ValueError: parser.error( "Must specify shard as starting and ending file positions " \ + "separated by an underscore") num_records = 0 description_re = re.compile( '(RecName: |AltName: |SubName: ' + '|Full=|Short=|EC=|Allergen=|Biotech=|CD_antigen=|INN=|;)' + '|Includes: |Contains: |Flags: ') # Prepare regular expressions and object maps for parsing feature tables ptm_types = PostTranslationalModificationType.objects.all() ptm_re = re.compile( '(%s)' % '|'.join([ptm_type.modification for ptm_type in ptm_types])) ptm_type_object_of_modification = {} for ptm_type in ptm_types: ptm_type_object_of_modification[ptm_type.modification] = ptm_type # Example: # RP PHOSPHORYLATION [LARGE SCALE ANALYSIS] AT SER-267 # Here PHOSPHORYLATION is a post-translational modification type, which # occurs at position 267. Later there is a corresponding line in the # feature table: # FT MOD_RES 267 267 Phosphoserine. 
ptm_pos_re = re.compile(' AT [A-Z][A-Z][A-Z]-([0-9]*)') feature_keys = FeatureKey.objects.all() feature_key_object_of_key_name = {} for feature_key in feature_keys: feature_key_object_of_key_name[feature_key.key_name] = feature_key nonexperimental_qualifiers = NonExperimentalQualifier.objects.all() nonexperimental_re = re.compile('(%s)' % '|'.join( [qualifier.description for qualifier in nonexperimental_qualifiers])) nonexperimental_qualifier_object_of_description = {} for qualifier in nonexperimental_qualifiers: nonexperimental_qualifier_object_of_description[qualifier.description] \ = qualifier dbSNPrs_re = re.compile('dbSNP:rs([0-9]*)') large_scale_re = re.compile('LARGE SCALE') go_evidence_objects = GO_EvidencePriority.objects.all() go_evidence_object_of_go_evidence_code = {} for go_evidence_object in go_evidence_objects: go_evidence_object_of_go_evidence_code[go_evidence_object.evidence]\ = go_evidence_object f = open("/clusterfs/ohana/external/UniProt/to_import/uniprot.dat") pos_f = open("/clusterfs/ohana/external/UniProt/to_import/uniprot.dat") if sharding: f.seek(start_pos) current_pos = f.tell() pos_f.seek(current_pos) for record in SwissProt.parse(f): try: taxon = UniProtTaxonomy.objects.get( id__exact=record.taxonomy_id[0]) except UniProtTaxonomy.DoesNotExist: taxon = handle_missing_taxonomy(record) seguid = CheckSum.seguid(record.sequence) # Parse the description description_tokens = description_re.split(record.description) full_recommended_name = '' # Look for the first recommended name category # (before any Includes or Contains sections) for i in xrange(len(description_tokens)): if description_tokens[i] == 'RecName: ': break # Now look for the full name for j in xrange(i, len(description_tokens)): if description_tokens[j] == 'Full=': break # The full recommended name is the next token if j < len(description_tokens) - 1: full_recommended_name = description_tokens[j + 1] else: # Try looking for SubName instead, maybe this is a fragment # Look for the first subname category # (before any Includes or Contains sections) for i in xrange(len(description_tokens)): if description_tokens[i] == 'SubName: ': break # Now look for the full name for j in xrange(i, len(description_tokens)): if description_tokens[j] == 'Full=': break # The full subname is the next token if j < len(description_tokens) - 1: full_recommended_name = description_tokens[j + 1] else: print "Full recommended name not found for %s" % record.entry_name print record.description # Look for all the EC numbers ecs = set() for i in xrange(len(description_tokens)): if description_tokens[i] == 'EC=': ecs.add(description_tokens[i + 1]) # Look for precursor or fragment flags is_fragment = False is_precursor = False for i in xrange(len(description_tokens)): if description_tokens[i] == 'Flag: ': if description_tokens[i + 1][0:8] == 'Fragment': is_fragment = True elif description_tokens[i + 1] == 'Precursor': is_precursor = True # Every UniProt accession is present in the uniprot_dat_index table. # Each of them points to a record in the uniprot table. # On the other hand, a record in the uniprot table has only one # accession, the one that was the primary accession the last time we did # this update. # The primary accession may have changed since we last updated (it may now # be a secondary accession). # The identifier may also have changed. 
E.g., if the record was previously # in TrEMBL and is now in SwissProt, then its identifier may have changed # from one like Q197F8_IIV3 to one like 002R_IIV3 (i.e., the first part is # no longer the accession, but a gene name or something more informative). # So, we can't necessarily tell which was the existing record in the uniprot # table corresponding to the record we are now parsing by looking at either # its identifier or its accession. # Instead, we find the entries in the uniprot_dat_index table for each of # the accessions. # If one corresponding to the primary accession is already present, we take # the corresponding record in the UniProt table to be the canonical entry # corresponding to this UniProt record. # If none corresponding to the primary accession is present but entries in # the uniprot_dat_index for other accessions are present, we pick one of # these and make the corresponding record in the uniprot table the canonical # entry. # If no entries in the uniprot_dat_index table corresponding to any of these # accessions are present, we create a new record in the uniprot table and # make it the canonical entry. # If the entries in the uniprot_dat_index table corresponding to these # accessions point to multiple different records in the uniprot table, we # will delete the other ones at the end of this loop iteration. But first # we will update the sequence_header records that point to those entries to # point instead to the canonical entry. new_uniprot = False uniprot_ids_to_delete = set() uniprot_dat_index_of_accession = {} uniprot_dat_indices = UniProtDatIndex.objects.filter( uniprot_accession__in=record.accessions) uniprot_of_uniprot_id = {} for uniprot_dat_index in uniprot_dat_indices: uniprot_dat_index_of_accession[uniprot_dat_index.uniprot_accession] \ = uniprot_dat_index uniprot_of_uniprot_id[uniprot_dat_index.uniprot.id] \ = uniprot_dat_index.uniprot if len(uniprot_dat_index_of_accession.keys()) > 0: if record.accessions[0] in uniprot_dat_index_of_accession: uniprot = uniprot_dat_index_of_accession[ record.accessions[0]].uniprot else: an_accession = uniprot_dat_index_of_accession.keys()[0] uniprot = uniprot_dat_index_of_accession[an_accession].uniprot for accession in uniprot_dat_index_of_accession: uniprot_dat_index_of_accession[accession].uniprot = uniprot uniprot_dat_index_of_accession[ accession].file_char = current_pos uniprot_dat_index_of_accession[accession].save() missing_accessions \ = set(record.accessions) - set(uniprot_dat_index_of_accession.keys()) for accession in missing_accessions: uniprot_dat_index_of_accession[accession] \ = UniProtDatIndex.objects.create(file_char = current_pos, uniprot_accession = accession, uniprot = uniprot) uniprot_ids_to_delete = set(uniprot_of_uniprot_id.keys()) uniprot_ids_to_delete.remove(uniprot.id) # Find sequence_headers pointing to the obsolete uniprot records, and # point them at the canonical one instead sequence_headers = SequenceHeader.objects.filter( uniprot__id__in=uniprot_ids_to_delete) for sequence_header in sequence_headers: sequence_header.uniprot = uniprot sequence_header.save() else: new_uniprot = True uniprot = UniProt.objects.create( uniprot_identifier=record.entry_name, accession=record.accessions[0], taxon=taxon, de=full_recommended_name, seguid=seguid, in_swissprot_f=(record.data_class == 'Reviewed')) for accession in record.accessions: uniprot_dat_index_of_accession[accession] \ = UniProtDatIndex.objects.create(file_char = current_pos, uniprot_accession = accession, uniprot = uniprot) # Look for 
orphaned sequence headers that can be assigned to this # uniprot sequences = Sequence.objects.filter(seguid__exact=seguid) sequence_ids = [sequence.id for sequence in sequences] if sequences: sequence_headers = SequenceHeader.objects.filter( sequence__id__in=sequence_ids, uniprot__isnull=True, taxon__id__exact=taxon.id) for sequence_header in sequence_headers: sequence_header.uniprot = uniprot uniprot.uniprot_identifier = record.entry_name uniprot.accession = record.accessions[0] uniprot.uniprot_taxon = taxon uniprot.de = full_recommended_name uniprot.seguid = seguid uniprot.in_swissprot_f = (record.data_class == 'Reviewed') uniprot.description = record.description if is_fragment: uniprot.is_fragment = True if is_precursor: uniprot.is_precursor = True uniprot.save() # Update the EC associations uniprot_ec_objects = UniProtEC.objects.filter(uniprot__exact=uniprot) uniprot_ec_object_of_ec_id = {} for uniprot_ec_object in uniprot_ec_objects: uniprot_ec_object_of_ec_id[ uniprot_ec_object.ec.id] = uniprot_ec_object db_ec_ids = set(uniprot_ec_object_of_ec_id.keys()) ec_object_of_ec_id = {} for ec in ecs: class_number_str, subclass_number_str, subsubclass_number_str, \ enzyme_number_str = ec.split('.') is_preliminary = False # If EC number is similar to '-.-.-.-', then we should report # an error and continue. This is obviously dirty data and should # be reported to UniProt try: class_number = int(class_number_str) except ValueError: print "Warning: %s has invalid EC Number: '%s'" % ( uniprot.uniprot_identifier, ec) continue if subclass_number_str == '-': ec_objects = EC.objects.filter( class_number__exact=class_number, subclass_number__isnull=True, subsubclass_number__isnull=True, enzyme_number__isnull=True) else: subclass_number = int(subclass_number_str) if subsubclass_number_str == '-': ec_objects = EC.objects.filter( class_number__exact=class_number, subclass_number__exact=subclass_number, subsubclass_number__isnull=True, enzyme_number__isnull=True) else: subsubclass_number = int(subsubclass_number_str) enzyme_number_str = enzyme_number_str.strip().rstrip(';') if enzyme_number_str == '-': ec_objects = EC.objects.filter( class_number__exact=class_number, subclass_number__exact=subclass_number, subsubclass_number__exact=subsubclass_number, enzyme_number__isnull=True) else: try: enzyme_number = int(enzyme_number_str) except ValueError: print "Preliminary EC %s in %s" % ( ec, record.entry_name) print record.description is_preliminary = True enzyme_number = int(enzyme_number_str[1:]) ec_objects = EC.objects.filter( class_number__exact=class_number, subclass_number__exact=subclass_number, subsubclass_number__exact=subsubclass_number, enzyme_number__exact=enzyme_number, is_preliminary_f=is_preliminary) if ec_objects: ec_object = ec_objects[0] else: ec_object = EC.objects.create( class_number=class_number, subclass_number=subclass_number, subsubclass_number=subsubclass_number, enzyme_number=enzyme_number, is_preliminary_f=is_preliminary) ec_object_of_ec_id[ec_object.id] = ec_object uniprot_dat_ec_ids = set(ec_object_of_ec_id.keys()) for ec_id in db_ec_ids - uniprot_dat_ec_ids: uniprot_ec_object_of_ec_id[ec_id].delete() for ec_id in uniprot_dat_ec_ids - db_ec_ids: UniProtEC.objects.create(uniprot=uniprot, ec=ec_object_of_ec_id[ec_id]) # Update the keyword associations uniprot_keyword_objects = UniProtKeyword.objects.filter( uniprot__exact=uniprot) uniprot_keyword_object_of_keyword_accession = {} for uniprot_keyword_object in uniprot_keyword_objects: uniprot_keyword_object_of_keyword_accession[ \ 
uniprot_keyword_object.keyword.accession] = uniprot_keyword_object db_keyword_accessions \ = set(uniprot_keyword_object_of_keyword_accession.keys()) keyword_object_of_keyword_accession = {} for keyword in record.keywords: keyword_objects = Keyword.objects.filter(identifier__exact=keyword) if keyword_objects: keyword_object = keyword_objects[0] keyword_object_of_keyword_accession[keyword_object.accession] \ = keyword_object else: print "Unrecognized keyword %s while parsing %s." % ( keyword, record.entry_name), print "The keyword table may be out of date." uniprot_dat_keyword_accessions \ = set(keyword_object_of_keyword_accession.keys()) for accession in db_keyword_accessions - uniprot_dat_keyword_accessions: uniprot_keyword_object_of_keyword_accession[accession].delete() for accession in uniprot_dat_keyword_accessions - db_keyword_accessions: UniProtKeyword.objects.create( uniprot=uniprot, keyword=keyword_object_of_keyword_accession[accession]) # Update the organelle associations uniprot_organelle_objects = UniProtOrganelle.objects.filter( uniprot__exact=uniprot) uniprot_organelle_object_of_organelle_id = {} for uniprot_organelle_object in uniprot_organelle_objects: uniprot_organelle_object_of_organelle_id[ \ uniprot_organelle_object.organelle.id] = uniprot_organelle_object db_organelle_ids = set(uniprot_organelle_object_of_organelle_id.keys()) organelle_object_of_organelle_id = {} for organelle in record.organelle.rstrip('.').split(','): if len(organelle) == 0: continue fields = organelle.split('; ') if len(fields) > 1: # This had better be a plastid if fields[0] == 'Plastid': organelle_objects = Organelle.objects.filter( description__exact=fields[0], plastid_type__exact=fields[1]) else: print "Unrecognized organelle %s in %s" % ( organelle, record.entry_name) else: organelle_objects = Organelle.objects.filter( description__exact=fields[0]) if organelle_objects: organelle_object = organelle_objects[0] else: field = fields[0].strip() # This had better be a plasmid if len(field) >= 9 and field[0:7] == 'Plasmid': organelle_object = Organelle.objects.create( description=field, plasmid_name=field[8:]) elif len(field) >= 13 and field[4:11] == 'Plasmid': organelle_object = Organelle.objects.create( description=field[4:], plasmid_name=field[12:]) else: print "Unrecognized organelle %s in %s" % ( organelle, record.entry_name) continue organelle_object_of_organelle_id[ organelle_object.id] = organelle_object uniprot_dat_organelle_ids = set( organelle_object_of_organelle_id.keys()) for organelle_id in db_organelle_ids - uniprot_dat_organelle_ids: uniprot_organelle_object_of_organelle_id[organelle_id].delete() for organelle_id in uniprot_dat_organelle_ids - db_organelle_ids: UniProtOrganelle.objects.create( uniprot=uniprot, organelle=organelle_object_of_organelle_id[organelle_id]) # Update the host organism associations uniprot_host_objects = UniProtHostOrganism.objects.filter( uniprot__exact=uniprot) uniprot_host_object_of_host_id = {} for uniprot_host_object in uniprot_host_objects: uniprot_host_object_of_host_id[uniprot_host_object.host_organism.id] \ = uniprot_host_object db_host_ids = set(uniprot_host_object_of_host_id.keys()) host_object_of_host_id = {} host_ids = [ int(host_spec.split(';')[0]) for host_spec in record.host_organism ] host_objects = UniProtTaxonomy.objects.filter(id__in=host_ids) for host_object in host_objects: host_object_of_host_id[host_object.id] = host_object uniprot_dat_host_ids = set(host_object_of_host_id.keys()) for host_id in set(host_ids) - uniprot_dat_host_ids: print 
"Unknown host taxonomy id %d when parsing %s;" \ % (host_id, record.entry_name), print "UniProtTaxonomy table may be out of date." for host_id in db_host_ids - uniprot_dat_host_ids: uniprot_host_object_of_host_id[host_id].delete() for host_id in uniprot_dat_host_ids - db_host_ids: UniProtHostOrganism.objects.create( uniprot=uniprot, host_organism=host_object_of_host_id[host_id]) # Update the literature references uniprot_literature_objects = UniProtLiterature.objects.filter( uniprot__exact=uniprot) uniprot_literature_object_of_title = {} for uniprot_literature_object in uniprot_literature_objects: uniprot_literature_object_of_title[uniprot_literature_object.title] \ = uniprot_literature_object db_titles = set(uniprot_literature_object_of_title.keys()) uniprot_dat_ref_of_title = {} for ref in record.references: titles = ref.title.split(';') for title in titles: uniprot_dat_ref_of_title[title.strip('"')] = ref uniprot_dat_titles = set(uniprot_dat_ref_of_title.keys()) for title in db_titles - uniprot_dat_titles: uniprot_literature_object_of_title[title].delete() for title in uniprot_dat_titles: if title in db_titles: uniprot_literature_object = uniprot_literature_object_of_title[ title] else: uniprot_literature_object \ = UniProtLiterature.objects.create(uniprot = uniprot, title = title) ref = uniprot_dat_ref_of_title[title] uniprot_literature_object.authors = ref.authors positional_info = ' '.join(ref.positions) m = large_scale_re.search(positional_info) if m: uniprot_literature_object.is_large_scale_f = True for db_name, db_reference in ref.references: if db_name == 'MEDLINE': uniprot_literature_object.medline_ui = db_reference elif db_name == 'PubMed': uniprot_literature_object.pmid = db_reference elif db_name == 'DOI': uniprot_literature_object.doi = db_reference elif db_name == 'AGRICOLA': uniprot_literature_object.agricola = db_reference uniprot_literature_object.save() # Find cross references to other databases geneids = set() go_evidence_of_go_accession = {} pfam_accessions = set() extent_of_pdb_chain_ids = {} for reference in record.cross_references: if reference[0] == 'GeneID': geneids.add(reference[1]) elif reference[0] == 'GO': if len(reference) >= 4: go_evidence_of_go_accession[reference[1]] \ = reference[3].split(':')[0] elif reference[0] == 'Pfam': pfam_accessions.add(reference[1]) elif reference[0] == 'PDB': pdb_id = reference[1].lower() chain_ids = reference[4].split('=')[0].split('/') pdb_from_residue = None pdb_to_residue = None fields = reference[4].split('=') if len(fields) > 1: try: pdb_from_residue, pdb_to_residue \ = [int(x) for x in fields[1].split('-')] except IndexError: pdb_from_residue = None pdb_to_residue = None except ValueError: pdb_from_residue = None pdb_to_residue = None for chain_id in chain_ids: pdb_chain_id = pdb_id + chain_id extent_of_pdb_chain_ids[pdb_chain_id] \ = (pdb_from_residue, pdb_to_residue) # Update the GeneID associations uniprot_geneid_objects = UniProtGeneID.objects.filter( uniprot__exact=uniprot) uniprot_geneid_object_of_geneid = {} for uniprot_geneid_object in uniprot_geneid_objects: uniprot_geneid_object_of_geneid[uniprot_geneid_object.geneid] \ = uniprot_geneid_object db_geneids = set(uniprot_geneid_object_of_geneid.keys()) for geneid in db_geneids - geneids: uniprot_geneid_object_of_geneid[geneid].delete() for geneid in geneids - db_geneids: UniProtGeneID.objects.create(uniprot=uniprot, geneid=geneid) # Update the GO associations uniprot_go_objects = UniProtGO.objects.filter(uniprot__exact=uniprot) 
uniprot_go_object_of_go_term_accession = {} for uniprot_go_object in uniprot_go_objects: uniprot_go_object_of_go_term_accession[uniprot_go_object.go_term.acc] \ = uniprot_go_object db_go_term_accessions = set( uniprot_go_object_of_go_term_accession.keys()) go_term_objects = GO_Term.objects.filter( acc__in=go_evidence_of_go_accession.keys()) go_term_object_of_go_term_accession = {} for go_term_object in go_term_objects: go_term_object_of_go_term_accession[go_term_object.acc] \ = go_term_object uniprot_dat_go_term_accessions \ = set(go_term_object_of_go_term_accession.keys()) for go_accession in set(go_evidence_of_go_accession.keys()) - \ uniprot_dat_go_term_accessions: print "Unrecognized GO accession %s while parsing %s" \ % (go_accession, record.entry_name), print "GO term table may be out of date." for go_evidence_code in set(go_evidence_of_go_accession.values()) - \ set(go_evidence_object_of_go_evidence_code.keys()): print "Unrecognized GO evidence code %s while parsing %s" \ % (go_evidence_code, record.entry_name), print "GO evidence_priority table may be out of date." for go_term_accession in db_go_term_accessions \ - uniprot_dat_go_term_accessions: uniprot_go_object_of_go_term_accession[go_term_accession].delete() for go_term_accession in uniprot_dat_go_term_accessions: go_evidence_code = go_evidence_of_go_accession[go_term_accession] if go_evidence_code in go_evidence_object_of_go_evidence_code: go_evidence_object = \ go_evidence_object_of_go_evidence_code[go_evidence_code] if go_term_accession in db_go_term_accessions: uniprot_go \ = uniprot_go_object_of_go_term_accession[go_term_accession] if uniprot_go.go_evidence.evidence != go_evidence_code: uniprot_go.go_evidence = go_evidence_object uniprot_go.save() else: UniProtGO.objects.create( go_term=go_term_object_of_go_term_accession[ go_term_accession], go_evidence=go_evidence_object, uniprot=uniprot) # Update the Pfam associations uniprot_pfam_objects = UniProtPfam.objects.filter( uniprot__exact=uniprot) uniprot_pfam_object_of_pfam_accession = {} for uniprot_pfam_object in uniprot_pfam_objects: uniprot_pfam_object_of_pfam_accession[ uniprot_pfam_object.pfam.accession] = uniprot_pfam_object db_pfam_accessions = set(uniprot_pfam_object_of_pfam_accession.keys()) pfam_object_of_pfam_accession = {} for pfam_accession in pfam_accessions: pfam_objects = Pfam.objects.filter( accession__exact=pfam_accession).order_by( 'overall_pfam_version').reverse() if pfam_objects: pfam_object = pfam_objects[0] pfam_object_of_pfam_accession[ pfam_object.accession] = pfam_object else: print "Unknown Pfam accession %s encountered when parsing %s" \ % (pfam_accession, record.entry_name), print "Pfam table may be out of date" uniprot_dat_pfam_accessions = set(pfam_object_of_pfam_accession.keys()) for pfam_accession in db_pfam_accessions - uniprot_dat_pfam_accessions: uniprot_pfam_object_of_pfam_accession[pfam_accession].delete() for pfam_accession in uniprot_dat_pfam_accessions - db_pfam_accessions: UniProtPfam.objects.create( uniprot=uniprot, pfam=pfam_object_of_pfam_accession[pfam_accession]) # Update the PDB associations uniprot_pdb_chain_objects = UniProtPDB_Chain.objects.filter( uniprot__exact=uniprot) uniprot_pdb_chain_object_of_pdb_chain_id = {} for uniprot_pdb_chain_object in uniprot_pdb_chain_objects: pdb_chain_id = uniprot_pdb_chain_object.pdb_chain.pdb.id + \ uniprot_pdb_chain_object.pdb_chain.chain_id uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id] \ = uniprot_pdb_chain_object db_pdb_chain_ids = 
set(uniprot_pdb_chain_object_of_pdb_chain_id.keys()) pdb_chain_object_of_pdb_chain_id = {} for pdb_chain_id in extent_of_pdb_chain_ids.keys(): pdb_chain_objects = PDB_Chain.objects.filter( pdb__id__exact=pdb_chain_id[0:4], chain_id__exact=pdb_chain_id[4:]) if pdb_chain_objects: pdb_chain_object = pdb_chain_objects[0] pdb_chain_object_of_pdb_chain_id[ pdb_chain_id] = pdb_chain_object else: print "Unknown PDB chain %s encountered when parsing %s" \ % (pdb_chain_id, record.entry_name), print "The PDB_Chain table may be out of date." uniprot_dat_pdb_chain_ids = set( pdb_chain_object_of_pdb_chain_id.keys()) for pdb_chain_id in db_pdb_chain_ids - uniprot_dat_pdb_chain_ids: uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id].delete() for pdb_chain_id in uniprot_dat_pdb_chain_ids: pdb_from_residue, pdb_to_residue = extent_of_pdb_chain_ids[ pdb_chain_id] if pdb_from_residue: if pdb_chain_id in db_pdb_chain_ids: uniprot_pdb_chain \ = uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id] uniprot_pdb_chain.from_residue = pdb_from_residue uniprot_pdb_chain.to_residue = pdb_to_residue uniprot_pdb_chain.save() else: UniProtPDB_Chain.objects.create( uniprot=uniprot, pdb_chain=pdb_chain_object_of_pdb_chain_id[ pdb_chain_id], from_residue=pdb_from_residue, to_residue=pdb_to_residue) else: UniProtPDB_Chain.objects.create( uniprot=uniprot, pdb_chain=pdb_chain_object_of_pdb_chain_id[pdb_chain_id]) # Update the feature table (position-specific information) if new_uniprot or options.update_features: # Unfortunately, there is no part of a feature table entry that is # guaranteed to persist from one release of UniProt of the next (except # the FTid, but that's not always present). Therefore there's no way to # easily determine that a feature in the uniprot.dat line is the same or # nearly the same as one that's already in the database. So, we update # the features by simply inserting all the entries anew and deleting the # old ones (without trying to check if they were the same or modify the # old ones). So, we get the old feature entries first so we can delete # them later. uniprot_feature_objects = UniProtFeature.objects.filter( uniprot__exact=uniprot) # Instantiate the queryset by turning it into a list. Otherwise, it # won't be instantiated until we get to the bottom and loop over these to # delete them, at which point it would delete *all* of them (including # the ones we just created). uniprot_feature_object_list = list(uniprot_feature_objects) # The fields from_residue_is_uncertain, to_residue_is_uncertain, # extends_n_terminally, and extends_c_terminally derive from the FT line # in the uniprot.dat line, according to the UniProt KnowledgeBase user # manual: # When a feature is known to extend beyond the position that is given in # the feature table, the endpoint specification will be preceded by '<' # for features which continue to the left end (N-terminal direction) or # by '>' for features which continue to the right end (C- terminal # direction); Unknown endpoints are denoted by '?'. Uncertain endpoints # are denoted by a '?' before the position, e.g. '?42'. 
for feature in record.features: key_name, from_residue_spec, to_residue_spec, description, \ ftid = feature if key_name in feature_key_object_of_key_name: feature_key = feature_key_object_of_key_name[key_name] # Check for nonexperimental qualifier nonexperimental_qualifier = None m = nonexperimental_re.search(description) if m: nonexperimental_qualifier \ = nonexperimental_qualifier_object_of_description[m.group(0)] created_object = False if key_name == 'VARIANT': # Check for dbSNP:rsaccession_number dbSNP_rs_accession = None m = dbSNPrs_re.search(description) if m: dbSNP_rs_accession = int(m.group(1)) if nonexperimental_qualifier: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, dbsnp_rs_number=dbSNP_rs_accession, nonexperimental_qualifier= nonexperimental_qualifier) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid, dbsnp_rs_number=dbSNP_rs_accession, nonexperimental_qualifier= nonexperimental_qualifier) else: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, dbsnp_rs_number=dbSNP_rs_accession) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid, dbsnp_rs_number=dbSNP_rs_accession) created_object = True elif key_name == 'MOD_RES' and from_residue_spec == to_residue_spec: # Look for the post-translational modification type found_ptm = False for reference in record.references: if found_ptm: break positional_info = ' '.join(reference.positions) ptm_tokens = ptm_re.split(positional_info) if len(ptm_tokens) > 1: for i in range((len(ptm_tokens) - 1) / 2): modification = ptm_tokens[2 * i + 1] string_with_position = ptm_tokens[2 * i + 2] match_position = ptm_pos_re.search( string_with_position) if match_position: position = int(match_position.group(1)) if position == from_residue: # Success! 
found_ptm = True ptm_type = ptm_type_object_of_modification[ modification] break if found_ptm: if nonexperimental_qualifier: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, posttranslational_modification_type= ptm_type, nonexperimental_qualifier= nonexperimental_qualifier) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid, posttranslational_modification_type= ptm_type, nonexperimental_qualifier= nonexperimental_qualifier) else: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, posttranslational_modification_type= ptm_type) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid, posttranslational_modification_type= ptm_type) created_object = True if not created_object: if nonexperimental_qualifier: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, nonexperimental_qualifier= nonexperimental_qualifier) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid, nonexperimental_qualifier= nonexperimental_qualifier) else: if ftid == '': feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description) else: feature_obj = UniProtFeature.objects.create( uniprot=uniprot, feature_key=feature_key, description=description, feature_identifier=ftid) else: print "Unrecognized feature key %s while parsing %s" \ % (key_name, record.entry_name) # Parse the from_residue and to_residue and update the appropriate # fields in the object. try: from_residue = int(from_residue_spec) feature_obj.from_residue = from_residue feature_obj.save() except ValueError: if from_residue_spec != '?': for i in range(len(from_residue_spec)): if from_residue_spec[i] == '<': feature_obj.extends_n_terminally = True elif from_residue_spec[i] == '>': # We don't expect this to happen, but anyway... feature_obj.extends_c_terminally = True elif from_residue_spec[i] == '?': feature_obj.from_residue_is_uncertain = True else: break feature_obj.from_residue = int(from_residue_spec[i:]) feature_obj.save() try: to_residue = int(to_residue_spec) feature_obj.to_residue = to_residue feature_obj.save() except ValueError: if to_residue_spec != '?': for i in range(len(to_residue_spec)): if to_residue_spec[i] == '<': # We don't expect this to happen, but anyway... feature_obj.extends_n_terminally = True elif to_residue_spec[i] == '>': feature_obj.extends_c_terminally = True elif to_residue_spec[i] == '?': feature_obj.to_residue_is_uncertain = True else: break feature_obj.to_residue = int(to_residue_spec[i:]) feature_obj.save() # Delete the old feature entries. for uniprot_feature_object in uniprot_feature_object_list: uniprot_feature_object.delete() for id in uniprot_ids_to_delete: uniprot_of_uniprot_id[id].delete() # The SwissProt parser may have eaten many characters from the next record # by this point, buffering them away until we ask for the next record. So # if we set current_pos from f.tell() now, we will get the wrong answer. # Instead, we will update current_pos by reading lines from pos_f, without # parsing them--we're just looking for the record separator. 
line = pos_f.readline() while len(line) >= 2 and line[0:2] != '//': line = pos_f.readline() current_pos = pos_f.tell() num_records += 1 if sharding: if current_pos > end_pos: break
go = Ontology(snakemake.input["go_obo"])
go_annotations = open(snakemake.output["go_annotations"], 'w')
uniparcdb = snakemake.input["uniparcdb"]

conn = sqlite3.connect(goa_path)
cursor = conn.cursor()
sqlatt = f'attach database "{uniparcdb}" as uniparc;'
cursor.execute(sqlatt,)

# 1. retrieve uniprot accession from exact match (hash)
# 2. retrieve GO annotations
for record in SeqIO.parse(faa_path, "fasta"):
    checksum = CheckSum.seguid(record.seq)
    sqlq = 'select * from uniparc.uniparc_accession where sequence_hash="%s"' \
           % checksum
    uniparc_id = cursor.execute(sqlq,).fetchall()[0][0]
    print("uid", uniparc_id)
    sqlq2 = 'select distinct accession from uniparc_cross_references t1 ' \
            ' inner join crossref_databases t2 on t1.db_id=t2.db_id ' \
            ' where t1.uniparc_id=%s and db_name in ' \
            '("UniProtKB/Swiss-Prot", "UniProtKB/TrEMBL");' % uniparc_id
    print(sqlq2)
    uniprotkb_acc_list = [i[0].split(".")[0]
                          for i in cursor.execute(sqlq2,).fetchall()]
    print("hits:", uniprotkb_acc_list)
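# Hypothetical variant of the lookup above (not part of the original script):
# the same two queries written with sqlite3 parameter binding, which avoids
# quoting problems if a checksum ever contains unexpected characters.  The
# table and column names are taken from the queries above; the assumption
# that the first column of uniparc_accession is the uniparc id follows the
# original's use of fetchall()[0][0].
def lookup_uniprotkb_accessions(cursor, checksum):
    row = cursor.execute(
        'select * from uniparc.uniparc_accession where sequence_hash = ?',
        (checksum,)).fetchone()
    if row is None:
        return []
    uniparc_id = row[0]
    rows = cursor.execute(
        'select distinct accession from uniparc_cross_references t1 '
        'inner join crossref_databases t2 on t1.db_id = t2.db_id '
        'where t1.uniparc_id = ? and db_name in '
        '("UniProtKB/Swiss-Prot", "UniProtKB/TrEMBL")',
        (uniparc_id,)).fetchall()
    return [r[0].split(".")[0] for r in rows]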
def main(): basepath = '/clusterfs/ohana/bpg/Hpylori26695/GHGs' os.chdir(basepath) f = open('Helicobacter_pylori_26695') input_records = SeqIO.to_dict(SeqIO.parse(f, "fasta")) f.close() set_cover_dict = {} seed_dict = {} f = open("clusters") lines = f.readlines() f.close() for line in lines: if line.split()[0] == 'Cluster': starting_cluster = True else: if starting_cluster: seed_id = line.split()[0] set_cover_dict[seed_id] = seed_id seed_dict[seed_id] = set([seed_id]) starting_cluster = False else: seq_id = line.split()[0] set_cover_dict[seq_id] = seed_id seed_dict[seed_id].add(seq_id) uppercase_translation = string.maketrans(string.lowercase, string.uppercase) dotdash='.-' seed_paths = glob.glob('seeds*/*') for seed_path in seed_paths: seed_dir, seed = os.path.split(seed_path) sought_seguids = [(id, CheckSum.seguid(input_records[id].seq)) for id in seed_dict[('lcl|%s' % seed)]] alignment_paths = glob.glob('%s/bpg*.a2m' % seed_path) if alignment_paths: family_id = int(os.path.splitext(os.path.split(alignment_paths[0])[1])[0][3:]) family = Family.objects.get(id = family_id) root = family.canonical_root_node() tree_node_objects = TreeNode.objects.filter(tree = family.canonical_tree, sequence_header__isnull = False).order_by('left_id') sequence_headers = [node.sequence_header for node in tree_node_objects] tree_node_alignment_objs = \ TreeNodeAlignment.objects.filter(tree_node = root) alignment_of_sequence_header = {} for obj in tree_node_alignment_objs: alignment_of_sequence_header[obj.sequence_header] = obj sequence_headers_of_seguid = {} for obj in tree_node_alignment_objs: seguid = obj.sequence_header.sequence.seguid if seguid not in sequence_headers_of_seguid: sequence_headers_of_seguid[seguid] = set() sequence_headers_of_seguid[seguid].add(obj.sequence_header) for id, seguid in sought_seguids: if seguid in sequence_headers_of_seguid: uniprot_identifiers = list(set( [sequence_header.uniprot.uniprot_identifier for sequence_header in sequence_headers_of_seguid[seguid] if sequence_header.uniprot])) if uniprot_identifiers: print '%s,"%s"' % (id, ','.join(uniprot_identifiers)) else: the_sequence_header = list(sequence_headers_of_seguid[seguid])[0] seq0 = alignment_of_sequence_header[ the_sequence_header].aligned_sequence.chars max_percent_id = 0.0 closest_uniprot = None for obj in tree_node_alignment_objs: if obj.sequence_header.uniprot: seq1 = obj.aligned_sequence.chars pwid = BPGPWID.pairwise_identity_KS_1(seq0, seq1) if pwid > max_percent_id: max_percent_id = pwid closest_uniprot = obj.sequence_header.uniprot print '%s,"%s(%0.3f)"' % (id, closest_uniprot.uniprot_identifier, max_percent_id) else: print "No exact match for %s" % id else: alignment_path = '%s/final.a2m' % seed_path if os.path.exists(alignment_path): uniprot_identifiers_of_seguid = {} f = open(alignment_path) flowerpower_alignment = SeqIO.to_dict(SeqIO.parse(f, "fasta")) f.close() for id in flowerpower_alignment: if id[0:3] == 'tr|' or id[0:3] == 'sp|': unaligned_seq = flowerpower_alignment[ id].seq.tostring().translate(uppercase_translation, dotdash) seguid = CheckSum.seguid(unaligned_seq) if seguid not in uniprot_identifiers_of_seguid: uniprot_identifiers_of_seguid[seguid] = set() uniprot_identifiers_of_seguid[seguid].add(id.split('|')[2]) for id, seguid in sought_seguids: if seguid in uniprot_identifiers_of_seguid: print '%s,"%s"' % (id, ','.join(list(uniprot_identifiers_of_seguid[seguid]))) else: print "No exact match for %s" % id else: print "Seed %s has not been FlowerPowered" % seed
def buildFamily(alignmentfilename, njtree, njbootstrap, mltree, sciphy, astats, fasttree): """main routine for buildFamily, runs the pipeline...""" starttime = time.time() print 'reading input alignment %s...' % alignmentfilename, sys.stdout.flush() # get base name based on name of alignment file # this will be used to create names for tree files, etc basename = os.path.splitext(alignmentfilename)[0] # read input alignment handle = open(alignmentfilename, 'r') alignmentrecords = list(SeqIO.parse(handle, 'fasta')) handle.close() # check alignment for duplicate entries num_duplicates = 0 sequences_of_seguid_of_id = {} for record in alignmentrecords: id = record.id description = record.description seq = record.seq.tostring() seguid = CheckSum.seguid(seq) if id not in sequences_of_seguid_of_id: sequences_of_seguid_of_id[id] = {} if seguid not in sequences_of_seguid_of_id[id]: sequences_of_seguid_of_id[id][seguid] = {} if seq in sequences_of_seguid_of_id[id][seguid]: num_duplicates += 1 else: sequences_of_seguid_of_id[id][seguid][seq] = description # de-dup the input alignment if num_duplicates > 0: print "Found %d duplicates in %s" % (num_duplicates, alignmentfilename) oldalignmentfilename = basename + '_with_dups.afa' print "Renaming %s to %s" % (alignmentfilename, oldalignmentfilename) os.system("mv %s %s" % (alignmentfilename, oldalignmentfilename)) print "Writing de-dupped alignment to %s" % alignmentfilename f = open(alignmentfilename, "w") for id in sequences_of_seguid_of_id: for seguid in sequences_of_seguid_of_id[id]: for seq in sequences_of_seguid_of_id[id][seguid]: f.write(">%s\n" % sequences_of_seguid_of_id[id][seguid][seq]) f.write("%s\n" % seq) f.close() # create alignment for treebuilding # also create an ID mapping from internal to fasta identifiers idmap = {} treealignment = {} alignmentdict = {} id = 1 for record in alignmentrecords: myid = 'SEQ%d' % id id += 1 alignmentdict[myid] = record.seq.tostring() idmap[myid] = record.description treealignment[myid] = '' for c in record.seq.tostring(): if c == '-' or c.isupper(): treealignment[myid] += c # print ID map file (pickle) idmapfname = basename + '.idmap' handle = open(idmapfname, 'w') cPickle.dump(idmap, handle) handle.close() endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # print alignment file for NJ and build NJ tree if njtree: ntaxa = len(alignmentrecords) if ntaxa >= 4: starttime = time.time() print 'inferring tree by neighbor joining...', sys.stdout.flush() inferNJTree(basename, treealignment, njbootstrap) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) else: print 'too few sequences (%d) to build neighbor joining tree, skipping.' % (ntaxa) # print alignment for ML tree and build ML tree if mltree: ntaxa = len(alignmentrecords) if ntaxa >= 4: starttime = time.time() print 'inferring tree by maximum likelihood...', sys.stdout.flush() inferMLTree(basename, treealignment, fasttree) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) else: print 'too few sequences (%d) to build maximum likelihood tree, skipping.' % (ntaxa) # create family-level hmm starttime = time.time() print 'creating general hidden Markov model...', sys.stdout.flush() createHMM(basename, alignmentfilename) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # get PFam domains starttime = time.time() print 'inferring PFam domains...', sys.stdout.flush() inferPFam(basename) endtime = time.time() print 'done. 
%s' % getTimeStr(starttime, endtime) # get transmembrane and signal peptide predictions starttime = time.time() print 'inferring transmembrane domains and signal peptides...', sys.stdout.flush() inferTransmembrane(basename) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # score PDB starttime = time.time() print 'retrieving homologous PDB structures...', sys.stdout.flush() inferPDB(basename) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # run SCI-PHY to infer subfamilies if sciphy: starttime = time.time() print 'inferring subfamilies...', sys.stdout.flush() inferSubfamilies(basename, alignmentdict) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # compute alignment conservation starttime = time.time() print 'calculating alignment conservation...', sys.stdout.flush() computeAlignmentConservation(basename, alignmentfilename) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # gotta run astats to get bulks of info about who's long, who's short, # and who's dating whom starttime = time.time() print 'calculating alignment statistics...', sys.stdout.flush() getAlignmentStatistics(basename, alignmentfilename, astats) endtime = time.time() print 'done. %s' % getTimeStr(starttime, endtime) # record build date print 'recording build date...', sys.stdout.flush() datefname = basename + '.build_date' handle = open(datefname, 'w') print >>handle, date.today() handle.close() print 'done.'
def get_seguid(self, sequence):
    sequence_handle = StringIO.StringIO(sequence)
    record = SeqIO.parse(sequence_handle, 'fasta').next()
    sequence_handle.close()
    seguid = CheckSum.seguid(record.seq)
    return seguid
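# Quick illustration of the checksum convention these scripts rely on: dot and
# dash characters are stripped and lowercase letters are uppercased before the
# SEGUID is computed, so an aligned row and its unaligned sequence share the
# same key. The example sequence here is made up for illustration only.
import string
from Bio.SeqUtils import CheckSum

aligned = 'MK.TA--yiAKQ'
unaligned = aligned.translate(string.maketrans(string.lowercase,
                                               string.uppercase), '.-')
print CheckSum.seguid(unaligned)  # same key regardless of gaps or case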
def results(request, work_path, response_dict):
    # Load the cached alignment offsets, or recompute them from satchmo.smo
    pickle_path = os.path.join(work_path, 'alignment_offset_of_left_id.pkl')
    if os.path.exists(pickle_path):
        f = open(pickle_path)
        alignment_offset_of_left_id = cPickle.load(f)
        f.close()
    else:
        alignment_offset_of_left_id = find_alignment_offset_of_left_id(work_path)
    pickle_path = os.path.join(work_path, 'ids_of_seguid.pkl')
    if os.path.exists(pickle_path):
        f = open(pickle_path)
        ids_of_seguid = cPickle.load(f)
        f.close()
    else:
        seguids, ids_of_seguid = get_seguids_of_ids(work_path)

    left_id = 1
    if 'left_id' in request.GET:
        try:
            left_id = int(request.GET['left_id'].strip())
            if left_id < 1:
                left_id = 1
        except ValueError:
            left_id = 1

    # Read either the requested subalignment (by byte offset) or the full
    # SATCHMO alignment
    alignments = []
    if left_id != 1:
        if left_id in alignment_offset_of_left_id:
            offset, num_bytes = alignment_offset_of_left_id[left_id]
            f = open(os.path.join(work_path, "satchmo.smo"))
            f.seek(offset)
            fake_f = StringIO.StringIO(f.read(num_bytes))
            f.close()
            alignments = list(AlignIO.parse(fake_f, "fasta"))
            fake_f.close()
        else:
            left_id = 1
    else:
        f = open(os.path.join(work_path, 'satchmo_alignment.fasta'))
        alignments = list(AlignIO.parse(f, "fasta"))
        f.close()

    alignment_blocks = []
    if len(alignments) > 0:
        alignment = alignments[0]
        alignment_length = 0
        aligned_column_indices = set()
        alignment_seqs = {}
        first_pass = True
        i = 0
        k = 0
        prev_seguid = ''
        uppercase_translation = string.maketrans(string.lowercase,
                                                 string.uppercase)
        dotdash = '.-'
        # Map each alignment row back to its original identifier via SEGUID
        for row in alignment:
            seq = row.seq.tostring()
            if first_pass:
                alignment_length = len(row.seq)
                for j in range(len(seq)):
                    if seq[j] == '-' or seq[j].isupper():
                        aligned_column_indices.add(j)
                first_pass = False
            alignment_seqs[i] = seq
            unaligned_seq = seq.translate(uppercase_translation, dotdash)
            seguid = CheckSum.seguid(unaligned_seq)
            if seguid in ids_of_seguid and len(ids_of_seguid[seguid]) >= 1:
                if seguid == prev_seguid:
                    if k < len(ids_of_seguid[seguid]) - 1:
                        k += 1
                else:
                    k = 0
                row.id = ids_of_seguid[seguid][k]
            prev_seguid = seguid
            i += 1

        # Score each aligned column by its average pairwise BLOSUM62 score and
        # record its most frequent (conserved) residue
        column_conserved_residue = {}
        column_score = {}
        class_of_column = {}
        for j in aligned_column_indices:
            freq_of_residue = {}
            highest_frequency = 0
            most_frequent_residue = ''
            for i in alignment_seqs.keys():
                residue = alignment_seqs[i][j]
                if residue == '-':
                    continue
                if residue not in freq_of_residue:
                    freq_of_residue[residue] = 0
                freq_of_residue[residue] += 1
                if freq_of_residue[residue] > highest_frequency:
                    highest_frequency = freq_of_residue[residue]
                    most_frequent_residue = residue
            column_conserved_residue[j] = most_frequent_residue
            num_pairs = 0
            sum_of_scores = 0.0
            for i0 in range(len(alignment_seqs)):
                residue0 = alignment_seqs[i0][j]
                if residue0 != '-':
                    for i1 in range(i0):
                        residue1 = alignment_seqs[i1][j]
                        if residue1 != '-':
                            score = blosum62_of_residues(alignment_seqs[i0][j],
                                                         alignment_seqs[i1][j])
                            sum_of_scores += score
                            num_pairs += 1
            if num_pairs > 0:
                column_score[j] = sum_of_scores / num_pairs
                if column_score[j] >= 3:
                    class_of_column[j] = 'align_high'
                elif column_score[j] >= 1.5:
                    class_of_column[j] = 'align_moderate'
                elif column_score[j] >= 0.5:
                    class_of_column[j] = 'align_low'

        # Wrap the alignment into fixed-width display blocks for the template
        num_blocks = alignment_length / wrapwidth
        useless_re = re.compile('^[\.-]*$')
        if alignment_length % wrapwidth > 0:
            num_blocks += 1
        back_count = [0 for i in range(len(alignment))]
        for i in range(num_blocks):
            block = []
            for row_no, row in enumerate(alignment):
                seq = row.seq.tostring()
                seq_piece = seq[(i * wrapwidth):((i + 1) * wrapwidth)]
                if useless_re.match(seq_piece):
                    continue
                alignment_row = {}
                alignment_row['id'] = row.id
                alignment_row['seq'] = []
                alignment_row['start'] = back_count[row_no]
                alignment_row['stop'] = back_count[row_no] \
                    + len(seq_piece.replace('.', '').replace('-', ''))
                back_count[row_no] = alignment_row['stop']
                for j in xrange(i * wrapwidth, (i + 1) * wrapwidth):
                    if j < len(seq):
                        residue = seq[j]
                        spec = {}
                        spec['residue'] = residue
                        spec['class'] = ''
                        if j in aligned_column_indices and residue != '-' and \
                                j in class_of_column:
                            if blosum62_of_residues(
                                    residue, column_conserved_residue[j]) \
                                    >= column_score[j]:
                                spec['class'] = class_of_column[j]
                        alignment_row['seq'] = alignment_row['seq'] + [spec]
                    else:
                        alignment_row['seq'].append(dict(
                            (('residue', ' '), ('class', None))))
                block = block + [alignment_row]
            alignment_blocks = alignment_blocks + [block]

    return render_to_response('satchmo/results.html',
        dict(response_dict,
             relative_path=os.path.basename(work_path).replace('satchmo', '', 1),
             js=js,
             alignment_blocks=alignment_blocks,
             left_id=left_id,
             left_ids_with_alignments=alignment_offset_of_left_id.keys(),
             ))
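# The column-scoring code in results() calls blosum62_of_residues(), which is
# defined elsewhere. A minimal sketch of such a helper, built on the BLOSUM62
# matrix shipped with older Biopython releases, might look like the following;
# returning 0 for unknown residue pairs is an assumption.
from Bio.SubsMat.MatrixInfo import blosum62

def blosum62_of_residues(residue0, residue1):
    """Return the BLOSUM62 substitution score for a pair of residues."""
    key = (residue0.upper(), residue1.upper())
    if key in blosum62:
        return blosum62[key]
    # the matrix stores each unordered pair only once, so try the swapped key
    return blosum62.get((key[1], key[0]), 0)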
print IUPACData.ambiguous_dna_complement  # dictionary of complements
# and a lot more
from Bio.Data import CodonTable
print CodonTable.generic_by_id[2]

# SeqUtils: several functions to deal with DNA and protein sequences.

# DNA utils
import Bio.SeqUtils as SeqUtils
print SeqUtils.GC('gacgatcggtattcgtag')  # GC content
from Bio.SeqUtils import MeltingTemp
print MeltingTemp.Tm_staluc('tgcagtacgtatcgt')  # DNA/RNA melting temperature

# checksum functions: a short alphanumeric signature of a file or sequence,
# usually written in the description of a sequence.
# gcg is a simple, weak, widely used checksum (crc32 and crc64 are stronger)
from Bio.SeqUtils import CheckSum
myseq = 'acaagatgccattgtcccccggcctcctgctgctgct'
print CheckSum.gcg(myseq)
print CheckSum.crc32(myseq)
print CheckSum.crc64(myseq)
print CheckSum.seguid(myseq)

# Protein utils
from Bio.SeqUtils import ProtParam
myprot = ProtParam.ProteinAnalysis('MLTNK')
print myprot.count_amino_acids()
print myprot.get_amino_acids_percent()
print myprot.molecular_weight()
print myprot.aromaticity()
print myprot.instability_index()
print myprot.flexibility()
print myprot.isoelectric_point()
print myprot.secondary_structure_fraction()
def insertFamilyIntoDB(alignment_path,
                       assume_seed_first=False,
                       seed_id=None,
                       build_database_source="UniProt",
                       gathering_method="FlowerPower",
                       private=False,
                       family_specific_evalue_criterion=None,
                       family_specific_sw_method=None,
                       notes=None,
                       build_alignment_notes_id=0,
                       family_type_id='C'):
    workdir, alignment_filename = os.path.split(alignment_path)
    os.chdir(workdir)

    # Read in the alignment
    f = open(alignment_filename)
    alignments = AlignIO.parse(f, 'fasta')
    # AlignIO.parse returns an iterator of alignments. We only want one
    # alignment, so we take the first one (there shouldn't be any more
    # for our inputs anyhow)
    for alignment in alignments:
        break
    basename = os.path.splitext(alignment_filename)[0]
    msg = 'This file should have been created by buildFamily.py.'

    # Look for the ID mapping from SEQ num identifiers to fasta headers
    idmap_filename = basename + '.idmap'
    if not os.path.exists(idmap_filename):
        print "File %s not found. %s" % (idmap_filename, msg)
        return 1
    f = open(idmap_filename)
    idmap = cPickle.load(f)
    f.close()

    # Reverse the ID mapping
    seqid_of_description = {}
    for seqid in idmap:
        seqid_of_description[idmap[seqid]] = seqid
    # Assert assumptions from buildFamily. The idmap is a mapping between
    # both unique SEQs and unique headers
    assert (len(seqid_of_description.keys()) == len(idmap.keys()))

    sequence_of_seqid = {}
    aligned_sequence_of_seqid = {}
    sequence_header_of_seqid = {}
    seed_sequence_header = None
    num_aligned_columns = 0

    # For each sequence in the alignment, make sure records exist in the
    # sequence, sequence_header, and aligned_sequence tables
    for record in alignment:
        seed_sequence_header = update_sequence_info(
            record, seed_id, seqid_of_description, aligned_sequence_of_seqid,
            num_aligned_columns, sequence_of_seqid, sequence_header_of_seqid,
            assume_seed_first, seed_sequence_header)

    canonical_tree_method = ""
    root_of_method = {}
    if len(sequence_header_of_seqid) < 4:
        # There won't be an actual tree, so make a fake one
        root_of_method['trivial'] = node()
        for seqid in sequence_header_of_seqid.keys():
            child = node(seqid=seqid)
            child.branch_length = 1.0
            root_of_method['trivial'].addChild(child)
        canonical_tree_method = "trivial"
        ml_tree_filename = None
        nj_tree_filename = None
    else:
        nj_tree_filename = basename + ".nj.rooted.tre"
        ml_tree_filename = basename + ".fasttree.ml.rooted.tre"
        # TODO: We should also check that the tree files are nonempty. If both
        # are empty we should create a trivial tree.
        if os.path.exists(ml_tree_filename):
            root_of_method['ml'] = node()
            f = open(ml_tree_filename)
            treeString = f.read()
            f.close()
            treeString = treeString.translate(trivial_translation,
                                              string.whitespace)
            root_of_method['ml'].readFromTreeString(treeString, 0)
            canonical_tree_method = "ml"
        if os.path.exists(nj_tree_filename):
            root_of_method['nj'] = node()
            f = open(nj_tree_filename)
            treeString = f.read()
            f.close()
            treeString = treeString.translate(trivial_translation,
                                              string.whitespace)
            root_of_method['nj'].readFromTreeString(treeString, 0)
            if canonical_tree_method == "":
                canonical_tree_method = "nj"
        if canonical_tree_method == "":
            print "No tree file found. %s" % msg
            return 1

    # Try to read the build date from a file, otherwise assume it is today
    build_date_filename = basename + ".build_date"
    build_date = datetime.date.today()
    if os.path.exists(build_date_filename):
        try:
            f = open(build_date_filename)
            year, month, day = [
                int(field) for field in f.read().strip().split('-')
            ]
            f.close()
            build_date = datetime.date(year, month, day)
        except ValueError:
            pass

    # Try to get the build_alignment_notes, if applicable
    if build_alignment_notes_id > 0:
        build_alignment_notes = BuildAlignmentNotes.objects.get(
            id__exact=build_alignment_notes_id)

    # At this point we have the minimum information necessary, namely an
    # alignment and a tree, so we can go ahead and create a family
    # TODO: We should set the status to "bad" here, and then update the status
    # to "draft" at the very end.
    family = Family.objects.create(
        build_database_source=build_database_source,
        build_date=build_date,
        status="draft",  # all families start out as draft
        private=private,
        gathering_method=gathering_method,
        family_type_id=family_type_id,
        partition="B",  # all families start in B partition
    )
    # Now we have created a new family accession
    family_accession = 'bpg%07d' % family.id
    print family_accession

    # Create the appropriate directories and symbolic links
    pfacts_base_dir = '/clusterfs/ohana/bpg/pfacts'
    dir1 = os.path.join(pfacts_base_dir, family_accession[0:4])
    dir2 = os.path.join(dir1, family_accession[0:7])
    dir3 = os.path.join(dir2, family_accession)
    if not os.path.exists(dir1):
        os.mkdir(dir1)
    if not os.path.exists(dir2):
        os.mkdir(dir2)
    if not os.path.exists(dir3):
        os.chdir(dir2)
        os.symlink(workdir, family_accession)
        os.chdir(workdir)

    # Create symbolic links to files
    os.symlink(alignment_filename, family_accession + '.a2m')
    os.symlink(idmap_filename, family_accession + '.idmap')
    if ml_tree_filename is not None and os.path.exists(ml_tree_filename):
        replace_seqids_by_seq_header_ids(ml_tree_filename,
                                         family_accession + '.ml',
                                         sequence_header_of_seqid)
    if nj_tree_filename is not None and os.path.exists(nj_tree_filename):
        replace_seqids_by_seq_header_ids(nj_tree_filename,
                                         family_accession + '.nj',
                                         sequence_header_of_seqid)

    # Link in the build_alignment_notes
    if build_alignment_notes_id > 0:
        family.build_alignment_notes = build_alignment_notes
        family.save()

    # Create the tree objects
    tree_of_method = {}
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            tree_of_method[method] = Tree.objects.create(family=family,
                                                         method=method,
                                                         is_rsd_rooted=False)
    canonical_tree = tree_of_method[canonical_tree_method]

    # Link the canonical tree to the family
    family.canonical_tree = canonical_tree
    family.save()

    # Link the seed sequence header to the family
    if seed_sequence_header:
        family.seed_sequence_header = seed_sequence_header
        family.save()

    # Do the modified pre-order tree traversal to find the leftIds and rightIds
    # of each of the nodes
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_of_method[method].updateLeftId(1, 0)

    # Create tree_node objects for each tree
    # The leaf nodes will be linked to sequence_header objects
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_of_method[method].createTreeNodeObjects(
                tree_of_method[method], sequence_header_of_seqid)

    # Link the family alignment to the root of each tree
    # It appears redundant to link the family alignment multiple times, but
    # this is not the case. If we subsequently run SATCHMO-JS, we will create a
    # new tree for this family with method 'satchmo-js', and the alignment
    # linked to the root of that tree will be a different alignment, namely the
    # one output by SATCHMO-JS. We will also link the SATCHMO-JS subalignments
    # to the internal nodes of that tree, and we may make that tree the
    # canonical tree.
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_node = root_of_method[method].tree_node
            for seqid in aligned_sequence_of_seqid:
                TreeNodeAlignment.objects.create(
                    tree_node=root_node,
                    aligned_sequence=aligned_sequence_of_seqid[seqid],
                    sequence_header=sequence_header_of_seqid[seqid])

    # Now link some family data to the root of the canonical tree
    # TODO: Write a procedure for changing the canonical tree. This procedure
    # must link all this family data to the root of the new canonical tree.
    canonical_root_node = root_of_method[canonical_tree_method].tree_node

    # Link the family HMMs
    sam_hmm_filename = basename + '.mod'
    if os.path.exists(sam_hmm_filename):
        os.symlink(sam_hmm_filename, family_accession + '.mod')
        sam_hmm = HMM.objects.create(length=num_aligned_columns,
                                     hmm_type='SAM',
                                     method='w0.5',
                                     tree_node=canonical_root_node)
    hmmer_hmm_filename = basename + '.hmm'
    if os.path.exists(hmmer_hmm_filename):
        # Rewrite the HMMER hmm so its name is the family id
        inf = open(hmmer_hmm_filename)
        hmm_lines = inf.readlines()
        inf.close()
        outf = open((family_accession + '.hmm'), "w")
        for line in hmm_lines:
            if line[0:4] == 'NAME':
                outf.write("NAME %s\n" % family_accession)
            else:
                outf.write(line)
        outf.close()
        hmmer_hmm = HMM.objects.create(length=num_aligned_columns,
                                       hmm_type='HMMER3',
                                       method='hmmbuild',
                                       tree_node=canonical_root_node)

    # Link the family consensus sequence
    consensus_sequence_filename = basename + '.con.fa'
    if os.path.exists(consensus_sequence_filename):
        os.symlink(consensus_sequence_filename, family_accession + '.con.fa')
        f = open(consensus_sequence_filename)
        record = list(SeqIO.parse(f, "fasta"))[0]
        f.close()
        consensus_seguid = CheckSum.seguid(record.seq)
        # It's not inconceivable that the consensus sequence is already in the
        # sequence table. Find the sequence record for this consensus sequence,
        # or create it if it isn't there
        consensus_sequence = _sequence(Sequence, record.seq.tostring(),
                                       consensus_seguid)
        # Link the consensus sequence to the family hmm
        hmmer_hmm_consensus = HMM_Consensus.objects.create(
            hmm=hmmer_hmm, sequence=consensus_sequence)
        # Link the consensus sequence to the canonical tree root.
        # It appears redundant that the sequence record is linked here, since
        # the sequence record is already linked to the hmm_consensus record.
        # The reason for linking the sequence record directly to the
        # tree_node_consensus record is that the consensus sequence might not
        # come from an HMM; it might be derived directly from the alignment
        # instead. In that case there would have been no hmm_consensus record
        # linked to the tree_node_consensus record. But there must always be a
        # sequence record linked to every tree_node_consensus record.
        canonical_root_consensus = TreeNodeConsensus.objects.create(
            tree_node=canonical_root_node,
            sequence=consensus_sequence,
            method='hmm',
            hmm_consensus=hmmer_hmm_consensus)

    # Insert the alignment conservation
    os.symlink(basename + '.alignmentconservation.csv',
               family_accession + '.alignmentconservation.csv')
    insertAlignmentConservation(family_accession, canonical_root_node)

    # Link the PFAM domains
    insertPFAMPredictionsIntoDB(consensus_sequence, basename)

    # Link the signal peptide and transmembrane prediction
    phobius_filename = basename + '.phobius'
    if os.path.exists(phobius_filename):
        insertPhobiusPredictionsIntoDB(canonical_root_node, phobius_filename)

    # Link the homologous PDB structures
    insertPDBPredictionsIntoDB(hmmer_hmm, canonical_root_node, basename)

    if os.path.exists(build_date_filename):
        os.symlink(build_date_filename, family_accession + '.build_date')
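# insertFamilyIntoDB (and the consensus handling above) calls a _sequence()
# helper that is not shown in this file. A plausible get-or-create sketch is
# given below; the 'chars' field name and the create() call are assumptions
# about the Django models, not the actual implementation.
def _sequence(model_class, chars, seguid):
    """Find an existing row with this SEGUID, or create one (assumed helper)."""
    existing = model_class.objects.filter(seguid__exact=seguid)
    if existing:
        # the seguid is a content hash, so any match is the same sequence
        return existing[0]
    return model_class.objects.create(chars=chars, seguid=seguid)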
def main():
    if len(sys.argv) < 3:
        print "usage: %s <taxonomy_id> <input_fasta>" % sys.argv[0]
        sys.exit(1)
    taxon_id = sys.argv[1]
    input_file = sys.argv[2]
    logging.basicConfig()
    logger = setup_logger(taxon_id)
    files = glob.glob(
        "/clusterfs/ohana/external/genomes/QuestForOrthologs/Release5/%s_*.fasta"
        % taxon_id)
    if len(files) == 0:
        files = glob.glob(
            "/clusterfs/ohana/bpg/coverage/redundant/pfam/after_17GHG/ID/QFO/%s_*.fasta"
            % taxon_id)
    if len(files) == 0:
        # no QfO proteome file found for this taxon, fall back to the file
        # given on the command line
        fh = open(input_file, 'r')
    else:
        fh = open(files[0], 'r')
    lines = fh.readlines()
    fh.close()

    # Collect the accessions from the FASTA headers
    accessions = set()
    for line in lines:
        if line.strip():
            if line[0] == ">":
                fields = line.split()
                try:
                    accession = fields[0].split(":")[1]
                except:
                    accession = fields[0].split("|")[1]
                accessions.add(accession)

    o = open('%s.coverage' % taxon_id, 'w')
    num_covered = GHG_num_covered = GHG_over2_covered = Pfam_num_covered = 0
    for accession in accessions:
        # First search the uniprot accession in the UniProt_Dat_Index table
        uniprot_dat_indices = UniProtDatIndex.objects.filter(
            uniprot_accession=accession)
        if uniprot_dat_indices:
            uniprot_object = uniprot_dat_indices[0].uniprot
            if TreeNodeAlignment.objects.filter(
                    sequence_header__uniprot=uniprot_object,
                    tree_node__tree__family__active=True).exclude(
                        tree_node__tree__family__status__exact='bad'):
                num_covered += 1
                TNA = TreeNodeAlignment.objects.filter(
                    sequence_header__uniprot=uniprot_object,
                    tree_node__tree__family__active=True,
                    tree_node__tree__family__family_type='G').exclude(
                        tree_node__tree__family__status__exact='bad')
                if len(TNA) > 0:
                    GHG_num_covered += 1
                    GHG_over2_covered += get_family_size(TNA)
                if TreeNodeAlignment.objects.filter(
                        sequence_header__uniprot=uniprot_object,
                        tree_node__tree__family__active=True,
                        tree_node__tree__family__family_type='C').exclude(
                            tree_node__tree__family__status__exact='bad'):
                    Pfam_num_covered += 1
            else:
                log(logger, "%s is not covered in the database\n" % accession)
        # if the accession is not in the uniprot_dat_index table, use seguid to
        # find identical sequences
        else:
            log(logger,
                "%s is not in uniprot_dat_index, try the seguid\n" % accession)
            uniprot_accession = accession
            if uniprot_accession_re1.match(uniprot_accession) or \
                    uniprot_accession_re2.match(uniprot_accession):
                fasta_file = '%s.fasta' % uniprot_accession
                cmd = 'wget http://www.uniprot.org/uniprot/%s' % fasta_file
                try:
                    os.system(cmd)
                except:
                    log(logger, "Unable to download sequence from UniProt\n")
                response = open(fasta_file, 'r')
                record = SeqIO.parse(response, 'fasta').next()
                response.close()
                os.remove(fasta_file)
                seguid = CheckSum.seguid(record.seq)
                sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
                if sequence_objects:
                    if TreeNodeAlignment.objects.filter(
                            sequence_header__sequence__in=sequence_objects,
                            tree_node__tree__family__active=True).exclude(
                                tree_node__tree__family__status__exact='bad'):
                        num_covered += 1
                        TNA = TreeNodeAlignment.objects.filter(
                            sequence_header__sequence__in=sequence_objects,
                            tree_node__tree__family__active=True,
                            tree_node__tree__family__family_type='G').exclude(
                                tree_node__tree__family__status__exact='bad')
                        if len(TNA) > 0:
                            GHG_num_covered += 1
                            GHG_over2_covered += get_family_size(TNA)
                        if TreeNodeAlignment.objects.filter(
                                sequence_header__sequence__in=sequence_objects,
                                tree_node__tree__family__active=True,
                                tree_node__tree__family__family_type='C'
                                ).exclude(
                                    tree_node__tree__family__status__exact='bad'):
                            Pfam_num_covered += 1
                    else:
                        o.write("There are no families containing %s.\n"
                                % accession)
                else:
                    log(logger,
                        "%s is not in the PhyloFacts 3 database.\n" % accession)
            else:
                print "The argument must be a valid UniProt accession\n"

    # Report the coverage summary
    taxon = UniProtTaxonomy.objects.get(id=taxon_id)
    print "Coverage for %s" % taxon.scientific_name
    print "Source: EBI Reference Proteome (http://www.ebi.ac.uk/reference_proteomes/)"
    lineage = taxon.lineage()
    print "Taxonomy: %s" % "/".join([str(item) for item in lineage])
    print "Number of genes in genome = %d" % len(accessions)
    print "Number of genes covered = %d" % num_covered
    coverage_GHG = float(GHG_num_covered) / len(accessions)
    coverage_GHG_over2 = float(GHG_over2_covered) / len(accessions)
    coverage_Pfam = float(Pfam_num_covered) / len(accessions)
    coverage_any = float(num_covered) / len(accessions)
    log(logger, "Coverage = %g" % coverage_any)
    o.write('taxon ID,scientific name,# of proteins,# covered,% covered,'
            '# covered by GHG,% covered by GHG,'
            '# covered by GHG of size>=3,% covered by GHG of size>=3,'
            '# covered by Pfam,% covered by Pfam\n')
    o.write('%s,%s,%d,%d,%3.1f,%d,%3.1f,%d,%3.1f,%d,%3.1f\n' %
            (taxon_id, taxon.scientific_name, len(accessions), num_covered,
             coverage_any * 100, GHG_num_covered, coverage_GHG * 100,
             GHG_over2_covered, coverage_GHG_over2 * 100, Pfam_num_covered,
             coverage_Pfam * 100))
    o.close()
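# The coverage script above calls setup_logger() and log(), which are defined
# elsewhere. Minimal sketches consistent with how they are used here (a
# per-taxon file logger plus a thin wrapper) might look like the following;
# the log file name is an assumption.
import logging

def setup_logger(taxon_id):
    """Create a logger that writes to a per-taxon log file (assumed helper)."""
    logger = logging.getLogger('coverage_%s' % taxon_id)
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler('%s.coverage.log' % taxon_id))
    return logger

def log(logger, message):
    # messages in the script already end in '\n', so strip before logging
    logger.info(message.rstrip('\n'))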