def update_sequence_info(record, seed_id, seqid_of_description,
                         aligned_sequence_of_seqid, num_aligned_columns,
                         sequence_of_seqid, sequence_header_of_seqid,
                         assume_seed_first, seed_sequence_header):
    """
  record=biopython Bio.SeqRecord.SeqRecord. 
  record.description is string (e.g., O28424_ARCFU/3-199 1,197)
  seqid_of_description - dictionary (reverse of seqs)

  """

    # Find the SEQ number associated with the record
    description = record.description
    seqid = seqid_of_description[description]

    # Find or create the aligned sequence record
    aligned_seq = record.seq.tostring()
    aligned_seguid = CheckSum.seguid(record.seq)

    aligned_sequence_of_seqid[seqid] = _sequence(AlignedSequence, aligned_seq,
                                                 aligned_seguid)

    # Compute the number of aligned columns, if not already known
    if num_aligned_columns == 0:
        num_aligned_columns = len(
            aligned_seq.translate(trivial_translation, dotlowercase))

    # Find or create the unaligned sequence record
    unaligned_seq = aligned_seq.translate(uppercase_translation, dotdash)
    unaligned_seguid = CheckSum.seguid(unaligned_seq)
    sequence_of_seqid[seqid] = _sequence(Sequence, unaligned_seq,
                                         unaligned_seguid)

    # Find or create the sequence_header record
    sequence_header_objects = \
        SequenceHeader.objects.filter(header__exact = description,
                                      sequence__exact = sequence_of_seqid[seqid])
    if sequence_header_objects:
        # Since the combination of header and sequence_id is constrained to be
        # unique, there can only be one
        sequence_header_of_seqid[seqid] = sequence_header_objects[0]
    else:
        # Create a new sequence_header record
        sequence_header_of_seqid[seqid] = \
            create_sequence_header(record.id, description,
                                   sequence_of_seqid[seqid])
    if seed_sequence_header is None:
        if assume_seed_first:
            seed_sequence_header = sequence_header_of_seqid[seqid]
        elif seed_id is not None and seed_id == record.id:
            seed_sequence_header = sequence_header_of_seqid[seqid]
    return seed_sequence_header
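A minimal driver sketch (not from the original source; the file name, the helper
name load_alignment_info, and the assumption that SEQ numbers are assigned in
record order are all hypothetical) showing how the lookup tables passed to
update_sequence_info() might be built and filled for each record of an aligned
FASTA file:

def load_alignment_info(alignment_path, seed_id=None):
    records = list(SeqIO.parse(open(alignment_path), "fasta"))
    # seqid_of_description maps each record's description to its SEQ number
    seqid_of_description = dict(
        (record.description, i) for i, record in enumerate(records))
    aligned_sequence_of_seqid = {}
    sequence_of_seqid = {}
    sequence_header_of_seqid = {}
    seed_sequence_header = None
    for record in records:
        seed_sequence_header = update_sequence_info(
            record, seed_id, seqid_of_description, aligned_sequence_of_seqid,
            0, sequence_of_seqid, sequence_header_of_seqid,
            seed_id is None, seed_sequence_header)
    return (aligned_sequence_of_seqid, sequence_of_seqid,
            sequence_header_of_seqid, seed_sequence_header)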
Example #2
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <uniprot_accession>" % sys.argv[0]
        sys.exit(0)
    uniprot_accession = sys.argv[1]
    if not uniprot_accession_re1.match(uniprot_accession) and not \
        uniprot_accession_re2.match(uniprot_accession):
        print "The argument must be a valid UniProt accession"
        sys.exit(1)
    try:
        response = urllib2.urlopen('http://www.uniprot.org/uniprot/%s.fasta' %
                                   uniprot_accession)
    except urllib2.HTTPError:
        print "Unable to download sequence from UniProt"
        sys.exit(1)

    record = SeqIO.parse(response, 'fasta').next()
    seguid = CheckSum.seguid(record.seq)
    sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
    if sequence_objects:
        tree_node_alignment_objects = TreeNodeAlignment.objects.filter(
            sequence_header__sequence__in=sequence_objects)
        if tree_node_alignment_objects:
            families = set([
                obj.tree_node.tree.family
                for obj in tree_node_alignment_objects
            ])
            for family in families:
                print family.get_accession()
        else:
            print "There are no families containing this sequence."
    else:
        print "This sequence is not in the PhyloFacts 3 database"
Example #3
    def __init__(self, file, fastaRecord):
        super(SequenceStat, self).__init__()
        self.file = file
        self.length = len(fastaRecord.seq)
        self.description = fastaRecord.description
        self.gc = SeqUtils.GC(fastaRecord.seq)
        self.crc32 = CheckSum.crc32(fastaRecord.seq)
def parse_smo(work_path):
    f = open(os.path.join(work_path, "satchmo.smo"))
    alignment_offset_dict = {}
    records = set()
    current_header = ""
    current_sequence = ""
    alignmentOffset = 0
    alignmentNumBytes = 0
    withinAlignment = False
    start_of_line = f.tell()
    line = f.readline()
    while line:
        if line.rstrip() == 'alignment':
            # skip past the line with the curly brace
            f.readline()
            # the next line is the start of the alignment
            alignmentOffset = f.tell()
            withinAlignment = True
        elif line.rstrip() == '//':
            if current_sequence != '':
                seguid = CheckSum.seguid(current_sequence)
                records.add(seguid)
            alignmentNumBytes = start_of_line - alignmentOffset
            for seguid in records:
                if seguid not in alignment_offset_dict:
                    alignment_offset_dict[seguid] = {}
                alignment_offset_dict[seguid][len(records)] \
                    = (alignmentOffset, alignmentNumBytes)
            records = set()
            current_header = ""
            current_sequence = ""
            withinAlignment = False
        elif withinAlignment:
            if len(line) > 0 and line[0] == '>':
                if current_sequence != '':
                    seguid = CheckSum.seguid(current_sequence)
                    records.add(seguid)
                current_header = line[1:].rstrip()
                current_sequence = ""
            else:
                current_sequence = current_sequence + \
                    line.strip().translate(uppercase_translation, dotdash)
        start_of_line = f.tell()
        line = f.readline()
    f.close()
    return alignment_offset_dict
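A usage sketch (the work_path, seguid, and sub-alignment size are supplied by
the caller and are placeholders here) showing how the returned offsets can be
used to read a single alignment block back out of satchmo.smo, much as the
results() view later in this listing does with its own offset pickle:

def read_alignment_block(work_path, seguid, num_records):
    alignment_offset_dict = parse_smo(work_path)
    offset, num_bytes = alignment_offset_dict[seguid][num_records]
    f = open(os.path.join(work_path, "satchmo.smo"))
    f.seek(offset)
    alignment_text = f.read(num_bytes)  # FASTA text of that alignment block
    f.close()
    return alignment_text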
def main():
    """Take a UniProt accession and find the best GHG books containing it.

    "Best" book means the one covering the most diverse set of taxa.
    Note: in PhyloFacts3 there are families without trees (normally because
    all the sequences in the family are identical, so the tree would have no
    branch lengths); this script cannot find those families.
    """
    if len(sys.argv) < 2:
        print "usage: %s UniProt_accession outputfile" % sys.argv[0]
        sys.exit(1)

    accession = sys.argv[1]
    # find the UniProt index for the accession
    uniprot_dat_indices = UniProtDatIndex.objects.filter(
        uniprot_accession=accession)
    if uniprot_dat_indices:
        uniprot_object = uniprot_dat_indices[0].uniprot
        TreeNodeAlignments = TreeNodeAlignment.objects.filter(
            sequence_header__uniprot=uniprot_object,
            tree_node__tree__family__active=True,
            tree_node__tree__family__family_type='G').exclude(
                tree_node__tree__family__status__exact='bad')
        if TreeNodeAlignments:
            families = set([tree_node_alignment.tree_node.tree.family
                            for tree_node_alignment in TreeNodeAlignments])
            (max_family_id, max_taxa_num) = get_all_taxon_ids(families)
            print ("%s,%s,%s" % (accession, max_family_id, max_taxa_num))
        else:
            print ("%s is not covered in the database\n" % accession)
    else:
        # find the seguid from UniProt
        print ("%s is not in uniprot_dat_index, try the seguid\n" % accession)
        uniprot_accession = accession
        if uniprot_accession_re1.match(uniprot_accession) or \
                uniprot_accession_re2.match(uniprot_accession):
            try:
                response = urllib2.urlopen('http://www.uniprot.org/uniprot/%s.fasta'
                                           % uniprot_accession)
            except urllib2.HTTPError:
                print ("Unable to download sequence from UniProt\n")
                sys.exit(1)
            record = SeqIO.parse(response, 'fasta').next()
            seguid = CheckSum.seguid(record.seq)
            sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
            if sequence_objects:
                TreeNodeAlignments = TreeNodeAlignment.objects.filter(
                    sequence_header__sequence__in=sequence_objects,
                    tree_node__tree__family__active=True,
                    tree_node__tree__family__family_type='G').exclude(
                        tree_node__tree__family__status__exact='bad')
                if TreeNodeAlignments:
                    families = set([tree_node_alignment.tree_node.tree.family
                                    for tree_node_alignment in TreeNodeAlignments])
                    (max_family_id, max_taxa_num) = get_all_taxon_ids(families)
                    print ("%s,%s,%s" % (accession, max_family_id, max_taxa_num))
                else:
                    print ("There are no families containing %s.\n" % accession)
            else:
                print ("%s is not in the PhyloFacts 3 database.\n" % accession)
        else:
            print ("The argument must be a valid UniProt accession\n")
def insertPDBPredictionsIntoDB(hmm, tree_node, basename):
    hmmsearch_filename = basename + "_vs_PDB.hmmsearch.out"
    hmmsearch_results = parse_results_of_hmmsearch_or_hmmscan.parse(
        hmmsearch_filename, 0.001, 1, 1)
    # There should only be one query here - the family HMM
    for query in hmmsearch_results.hit_result_of_name_of_query:
        for pdb_chain_id in hmmsearch_results.hit_result_of_name_of_query[
                query]:
            pdb_id, chain_id = pdb_chain_id.split('_')
            pdb_chain_objects = PDB_Chain.objects.filter(
                pdb__id__exact=pdb_id, chain_id__exact=chain_id)
            if pdb_chain_objects:
                pdb_chain = pdb_chain_objects[0]
            else:
                print "Unrecognized PDB chain %s in hmmsearch results." \
                  % pdb_chain_id,
                print "The PDB_Chain table may be out of date."
                continue
            hit_result \
                = hmmsearch_results.hit_result_of_name_of_query[query][pdb_chain_id]
            for match_number in hit_result.matches:
                match_result = hit_result.matches[match_number]
                aligned_seguid = CheckSum.seguid(match_result.aligned_hit)
                aligned_sequence_objects = AlignedSequence.objects.filter(
                    seguid__exact=aligned_seguid)
                if aligned_sequence_objects:
                    aligned_sequence = aligned_sequence_objects[0]
                else:
                    # Because multiple copies of this script may run
                    # simultaneously, it is possible this was just created
                    # moments ago.  So, use get_or_create just in case.
                    aligned_sequence, is_created  \
                        = AlignedSequence.objects.get_or_create(
                                                      chars = match_result.aligned_hit,
                                                      seguid = aligned_seguid)
                sequence_hmm = SequenceHMM.objects.create(
                    hmm=hmm,
                    sequence=pdb_chain.full_sequence,
                    aligned_sequence=aligned_sequence,
                    bit_score=match_result.bit_score,
                    e_value=match_result.i_evalue,
                    sequence_type='query',
                    hmm_start=match_result.hmm_from,
                    hmm_end=match_result.hmm_to,
                    sequence_start=match_result.seq_from,
                    sequence_end=match_result.seq_to,
                    match_type=match_result.match_type,
                    n_aligned_chars=match_result.num_aligned_chars)
                TreeNodePDB.objects.create(sequence_hmm=sequence_hmm,
                                           tree_node=tree_node,
                                           pdb_chain=pdb_chain)
Example #8
def get_nr_sequences(fasta_file, genomes_list):
    locus2genome = {}
    for fasta in genomes_list:
        genome = os.path.basename(fasta).split('.')[0]
        for seq in SeqIO.parse(fasta, "fasta"):
            locus2genome[seq.name] = genome
    nr_fasta = open('nr.faa', 'w')
    nr_mapping = open('nr_mapping.tab', 'w')

    hsh_checksum_list = {}

    records = SeqIO.parse(fasta_file, "fasta")
    updated_records = []

    for record in records:

        # NOTE: case matters for crc64; it may be necessary to force all
        # entries to a single case (upper or lower) to ensure consistency.
        checksum = CheckSum.crc64(record.seq)
        nr_mapping.write("%s\t%s\t%s\n" % (record.id,
                                          checksum,
                                          locus2genome[record.id]))
        if checksum not in hsh_checksum_list:
            hsh_checksum_list[checksum] = [record]
            record.id = checksum
            record.name = ""
            updated_records.append(record)
        else:
            # NOTE: having the same hash does not mean the sequences are
            # identical: since the hash space is smaller than the sequence
            # space, collisions are unavoidable (though improbable), and
            # records with the same hash must be compared
            # https://www.uniprot.org/help/uniparc (sequence comparison)
            #
            # hsh_checksum_list[checksum] is the list of records sharing this
            # checksum but potentially having different sequences -> compare
            # them; Python does so by comparing the sequences as strings,
            # assuming a similar alphabet
            lst_records = hsh_checksum_list[checksum]
            sequence = record.seq
            has_identical = False
            for prev_record in lst_records:
                if prev_record.seq == sequence:
                    has_identical = True
                    break
            if not has_identical:
                lst_records.append(record)
                record.id = checksum + "-" + len(lst_records)
                record.name = ""
                updated_records.append(record)

    SeqIO.write(updated_records, nr_fasta, "fasta")
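A small sketch illustrating the two caveats noted above (the sequences here are
made up): CheckSum.crc64 is computed over the characters as given, so case
changes the checksum, and two different sequences could in principle share a
checksum, which is why records with equal checksums are compared directly.

from Bio.SeqUtils import CheckSum

print(CheckSum.crc64("MKVLAA"))   # checksum of the upper-case form
print(CheckSum.crc64("mkvlaa"))   # different checksum for the lower-case form
# get_nr_sequences() guards against collisions by comparing the record.seq
# values of records that hash to the same checksum before treating them as
# identical.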
Example #9
def parse_tree(work_path):
    seguids = {}
    f = open(os.path.join(work_path, "input_unaligned.fasta"))
    for record in SeqIO.parse(f, "fasta"):
        id = record.id.replace(':', '_')
        seguids[id] = CheckSum.seguid(record.seq)
    f.close()
    f = open(os.path.join(work_path, "satchmo_tree.newick"))
    tree_string = f.read()
    f.close()
    tree_string = tree_string.translate(trivial_translation, string.whitespace)
    root = node()
    root.readFromTreeString(tree_string, seguids, 0)
    root.updateLeftId(1)
    return root
Example #10
def main():
    if len(sys.argv) < 3:
        print "Usage: %s <seedX_id> <seedY_id>" % sys.argv[0]
        sys.exit(0)
    seedX_id = sys.argv[1]
    seedY_id = sys.argv[2]
    all_alignment_filenames = \
       matchmaker_seed_alignment_filenames(seedX_id, seedY_id)
    output_handle = open(alignment_nr_csv_file(seedX_id, seedY_id), "w")
    output_handle.write("Alignment\n")
    nr_alignments = {}
    dups_of_alignment = {}
    for alignment_filename in all_alignment_filenames:
        """
    #7/15: replaced below with functions that can read compressed alignment files
    f = open(alignment_filename)
    lines = f.readlines()
    if len(lines) < 4:
      continue
    ali = '*'.join([lines[1],lines[3]])
    """
        (seed1, seq1), (seed2, seq2) = read_alignment_file(alignment_filename)
        ali = '*'.join([seq1, seq2])
        the_seguid = CheckSum.seguid(ali)
        if the_seguid in nr_alignments and nr_alignments[the_seguid] is not None:
            if ali in nr_alignments[the_seguid] \
                    and nr_alignments[the_seguid][ali] is not None:
                nr_alignments[the_seguid][ali].add(alignment_filename)
                dups_of_alignment[alignment_filename] = nr_alignments[
                    the_seguid][ali]
            else:
                nr_alignments[the_seguid][ali] = set([alignment_filename])
                output_handle.write("%s\n" % alignment_filename)
                dups_of_alignment[alignment_filename] = nr_alignments[
                    the_seguid][ali]
        else:
            nr_alignments[the_seguid] = {ali: set([alignment_filename])}
            output_handle.write("%s\n" % alignment_filename)
            dups_of_alignment[alignment_filename] = nr_alignments[the_seguid][
                ali]

    pklfp = open(
        os.path.join(align_dir(seedX_id, seedY_id),
                     "%s_%s_alignment_dict.pkl" % (seedX_id, seedY_id)), "w")
    cPickle.dump((dups_of_alignment, nr_alignments), pklfp)
    pklfp.close()
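A sketch of reading the pickle written above back in (the seed ids and the
alignment file name are placeholders; align_dir is the same helper used above)
to find which other alignment files carry an identical alignment:

import cPickle
import os

pklfp = open(os.path.join(align_dir("seedX", "seedY"),
                          "seedX_seedY_alignment_dict.pkl"))
dups_of_alignment, nr_alignments = cPickle.load(pklfp)
pklfp.close()
# each alignment file maps to the set of filenames with the identical alignment
print(dups_of_alignment["some_alignment_file"])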
Example #11
def main():
    parser = OptionParser(usage='%prog')
    (options, args) = parser.parse_args()

    f = open("/clusterfs/ohana/external/pdb_rcsb_full", "rU")
    for record in SeqIO.parse(f, "fasta"):
        fields = record.description.split()
        if len(fields) < 2 or fields[1] != 'mol:protein':
            continue
        pdb_id, chain_id = fields[0].split('_')
        pdb_objects = PDB.objects.filter(id__exact=pdb_id)
        if pdb_objects:
            pdb = pdb_objects[0]
        else:
            pdb = PDB.objects.create(id=pdb_id)
        seguid = CheckSum.seguid(record.seq)
        sequence_objects = Sequence.objects.filter(seguid__exact=seguid)
        if sequence_objects:
            sequence = sequence_objects[0]
        else:
            sequence = Sequence.objects.create(chars=record.seq.tostring(),
                                               seguid=seguid)
        pdb_chain_objects = PDB_Chain.objects.filter(pdb__exact=pdb,
                                                     chain_id__exact=chain_id)
        if pdb_chain_objects:
            pdb_chain = pdb_chain_objects[0]
            # Update the sequence in case it has changed
            if pdb_chain.full_sequence != sequence:
                pdb_chain.full_sequence = sequence
                pdb_chain.all_residues_have_atom_records_f = None
            if len(fields) >= 4:
                pdb_chain.description = ' '.join(fields[3:])
            pdb_chain.save()
        else:
            if len(fields) >= 4:
                description = ' '.join(fields[3:])
                pdb_chain = PDB_Chain.objects.create(pdb=pdb,
                                                     chain_id=chain_id,
                                                     full_sequence=sequence,
                                                     description=description)
            else:
                pdb_chain = PDB_Chain.objects.create(pdb=pdb,
                                                     chain_id=chain_id,
                                                     full_sequence=sequence)
def get_seguids_of_ids(work_path):
    seguids = {}
    ids_of_seguid = {}
    f = open(os.path.join(work_path, "input_unaligned.fasta"))
    for record in SeqIO.parse(f, "fasta"):
        id = record.id.replace(':', '_')
        additional_id = id.translate(special_tree_char_translation, '')
        seguid = CheckSum.seguid(record.seq)
        seguids[id] = seguid
        seguids[additional_id] = seguid
        # Don't put the additional_id in ids_of_seguid, as views.py expects there
        # to be one id per leaf.
        if seguid not in ids_of_seguid:
            ids_of_seguid[seguid] = set()
        ids_of_seguid[seguid].add(id)
    f.close()
    for seguid in ids_of_seguid:
        id_list = list(ids_of_seguid[seguid])
        id_list.sort()
        ids_of_seguid[seguid] = id_list
    f = open(os.path.join(work_path, "ids_of_seguid.pkl"), "w")
    cPickle.dump(ids_of_seguid, f)
    f.close()
    return (seguids, ids_of_seguid)
Example #13
def results(request, work_path, response_dict):
    pickle_path = os.path.join(work_path,
                              'alignment_offset_of_left_id.pkl')
    if os.path.exists(pickle_path):
      f = open(pickle_path)
      alignment_offset_of_left_id = cPickle.load(f)
      f.close()
    else:
      alignment_offset_of_left_id = find_alignment_offset_of_left_id(work_path)
    pickle_path = os.path.join(work_path,
                                'ids_of_seguid.pkl')
    if os.path.exists(pickle_path):
      f = open(pickle_path)
      ids_of_seguid = cPickle.load(f)
      f.close()
    else:
      seguids, ids_of_seguid = get_seguids_of_ids(work_path)
    left_id = 1
    if 'left_id' in request.GET:
      try:
        left_id = int(request.GET['left_id'].strip())
        if left_id < 1:
          left_id = 1
      except ValueError:
        left_id = 1
    alignments = []
    if left_id != 1:
      if left_id in alignment_offset_of_left_id:
        offset, num_bytes = alignment_offset_of_left_id[left_id]
        f = open(os.path.join(work_path, "satchmo.smo"))
        f.seek(offset)
        fake_f = StringIO.StringIO(f.read(num_bytes))
        f.close()
        alignments = list(AlignIO.parse(fake_f, "fasta"))
        fake_f.close()
      else:
        left_id = 1
    else:
      f = open(os.path.join(work_path, 'satchmo_alignment.fasta'))
      alignments = list(AlignIO.parse(f, "fasta"))
      f.close()
    alignment_blocks = []
    if len(alignments) > 0:
      alignment = alignments[0]
      alignment_length = 0
      aligned_column_indices = set()
      alignment_seqs = {}
      first_pass = True
      i = 0
      k = 0
      prev_seguid = ''
      uppercase_translation = string.maketrans(string.lowercase, 
                                                string.uppercase)
      dotdash = '.-'
      for row in alignment:
        seq = row.seq.tostring()
        if first_pass:
          alignment_length = len(row.seq)
          for j in range(len(seq)):
            if seq[j] == '-' or seq[j].isupper():
              aligned_column_indices.add(j)
          first_pass = False
        alignment_seqs[i] = seq
        unaligned_seq = seq.translate(uppercase_translation, dotdash)
        seguid = CheckSum.seguid(unaligned_seq)
        if seguid in ids_of_seguid and len(ids_of_seguid[seguid]) >= 1:
          if seguid == prev_seguid:
            if k < len(ids_of_seguid[seguid]) - 1:
              k += 1
          else:
            k = 0
          row.id = ids_of_seguid[seguid][k]
          prev_seguid = seguid
        i += 1
      column_conserved_residue = {}
      column_score = {}
      class_of_column = {}
      for j in aligned_column_indices:
        freq_of_residue = {}
        highest_frequency = 0
        most_frequent_residue = ''
        for i in alignment_seqs.keys():
          residue = alignment_seqs[i][j]
          if residue == '-':
            continue
          if residue not in freq_of_residue:
            freq_of_residue[residue] = 0
          freq_of_residue[residue] += 1
          if freq_of_residue[residue] > highest_frequency:
            highest_frequency = freq_of_residue[residue]
            most_frequent_residue = residue
        column_conserved_residue[j] = most_frequent_residue
        num_pairs = 0
        sum_of_scores = 0.0
        for i0 in range(len(alignment_seqs)):
          residue0 = alignment_seqs[i0][j]
          if residue0 != '-':
            for i1 in range(i0):
              residue1 = alignment_seqs[i1][j]
              if residue1 != '-':
                score = blosum62_of_residues(alignment_seqs[i0][j],
                                              alignment_seqs[i1][j])
                sum_of_scores += score
                num_pairs += 1
        if num_pairs > 0:
          column_score[j] = sum_of_scores / num_pairs
          if column_score[j] >= 3:
            class_of_column[j] = 'align_high'
          elif column_score[j] >= 1.5:
            class_of_column[j] = 'align_moderate'
          elif column_score[j] >= 0.5:
            class_of_column[j] = 'align_low'
      
      num_blocks = alignment_length / wrapwidth
      useless_re = re.compile('^[\.-]*$')
      if alignment_length % wrapwidth > 0:
        num_blocks += 1
      back_count = [0 for i in range(len(alignment))]
      for i in range(num_blocks):
        block = []
        for row_no,row in enumerate(alignment):
          seq = row.seq.tostring()
          seq_piece = seq[(i * wrapwidth):((i + 1) * wrapwidth)]
          if useless_re.match(seq_piece):
            continue
          alignment_row = {}
          alignment_row['id'] = row.id
          alignment_row['seq'] = []
          alignment_row['start'] = back_count[row_no]
          alignment_row['stop'] = back_count[row_no] \
            + len(seq_piece.replace('.','').replace('-',''))
          back_count[row_no] = alignment_row['stop']

          for j in xrange(i*wrapwidth,(i+1)*wrapwidth):
            if j < len(seq):
              residue = seq[j]
              spec = {}
              spec['residue'] = residue
              spec['class'] = ''
              if j in aligned_column_indices and residue != '-' and \
                  j in class_of_column:
                if blosum62_of_residues(residue, column_conserved_residue[j]) \
                    >= column_score[j]:
                  spec['class'] = class_of_column[j]
              alignment_row['seq'] = alignment_row['seq'] + [spec]
            else:
              alignment_row['seq'].append({'residue': ' ', 'class': None})

          block = block + [alignment_row]
        alignment_blocks = alignment_blocks + [block]
    return render_to_response('satchmo/results.html', dict(response_dict,
        relative_path=os.path.basename(work_path).replace('satchmo', '', 1),
        js=js,
        alignment_blocks=alignment_blocks,
        left_id=left_id,
        left_ids_with_alignments = alignment_offset_of_left_id.keys(),
    ))
Example #14
    def get_seguid(self, sequence):
        sequence_handle = StringIO.StringIO(sequence)
        record = SeqIO.parse(sequence_handle, 'fasta').next()
        sequence_handle.close()
        seguid = CheckSum.seguid(record.seq)
        return seguid
Example #15
def main():
    if len(sys.argv) < 2:
        usage()
        sys.exit(0)

    family_accession = sys.argv[1]
    try:
        family_id = int(family_accession[3:])
    except ValueError:
        usage()
        sys.exit(1)

    try:
        family = Family.objects.get(id=family_id)
        if family.status == "bad":
            raise Family.DoesNotExist
    except Family.DoesNotExist:
        print "No family found with accession %s" % family_accession
        sys.exit(1)

    family_dir = get_dir_of_family_accession(family_accession)
    seed_path = os.path.join(family_dir, "seed.fa")
    if not os.path.exists(seed_path):
        if os.path.realpath(family_dir).find('TreeFam') >= 0:
            os.chdir(family_dir)
            possible_seed_files = glob.glob("*_HUMAN*.fa")
            candidates = set()
            swissprot_desc_re = re.compile('^%s$' % swissprot_desc_pat)
            for file in possible_seed_files:
                basename = os.path.splitext(file)[0]
                components = basename.split('_')
                if len(components) < 2 or components[1] != 'HUMAN':
                    continue
                if swissprot_desc_re.match(components[0]) is None and \
                    uniprot_accession_re1.match(components[0]) is None and \
                    uniprot_accession_re2.match(components[0]) is None:
                    continue
                if len(components) > 2:
                    if len(components) != 4:
                        continue
                    try:
                        start = int(components[2])
                    except ValueError:
                        continue
                    try:
                        end = int(components[3])
                    except ValueError:
                        continue
                candidates.add(file)
            if len(candidates) != 1:
                print "Seed file for family %s missing" % family_accession
                sys.exit(1)
            seed_path = os.path.join(family_dir, list(candidates)[0])
        else:
            print "Seed file for family %s missing" % family_accession
            sys.exit(1)

    f = open(seed_path)
    seed_record = SeqIO.parse(f, "fasta").next()
    f.close()
    seed_seguid = CheckSum.seguid(seed_record.seq)

    seed_id = re.sub(r'^lcl\|', '', seed_record.id)
    print "%s: FlowerPower seed id %s" % (family_accession, seed_id)
    seed_accession = None
    recognizing_regexp = None
    # uniprot_accession_re1 recognizes a UniProt accession only if it is the
    # whole string, not if it is a substring
    for regexp in [
            re.compile(uniprot_accession_pat1),
            re.compile(uniprot_accession_pat2), gi_re
    ]:
        m = regexp.search(seed_id)
        if m:
            seed_accession = m.group()
            recognizing_regexp = regexp
            break
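    # Illustration (not from the original source): uniprot_accession_re1 is
    # anchored to the whole string, so it would not match a seed id like
    # "O28424_ARCFU"; the unanchored re.compile(uniprot_accession_pat1) used in
    # the loop above does find the embedded accession via search().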

    if seed_accession is None:
        print "Could not parse accession from seed id"
        sys.exit(1)

    sequences = Sequence.objects.filter(seguid=seed_seguid)
    sequence_headers = SequenceHeader.objects.filter(sequence__in=sequences)
    possible_sequence_headers = set()
    for sequence_header in sequence_headers:
        m = recognizing_regexp.search(sequence_header.header)
        if m:
            accession = m.group()
            if accession == seed_accession:
                if len(sequence_header.header) >= 4 and \
                    sequence_header.header[0:4] == 'lcl|':
                    possible_sequence_headers = set([sequence_header])
                    break
                if sequence_header.header.find('|') < 0:
                    possible_sequence_headers = set([sequence_header])
                    break
                possible_sequence_headers.add(sequence_header)
    if len(possible_sequence_headers) > 1:
        alns = TreeNodeAlignment.objects.filter(
            tree_node=family.canonical_root_node(),
            sequence_header__in=possible_sequence_headers)
        possible_sequence_headers = set([aln.sequence_header for aln in alns])

    print "%s: Found %d possible sequence headers" % (
        family_accession, len(possible_sequence_headers))
    for seqhdr in possible_sequence_headers:
        print "%s: possible sequence header %s" % (family_accession,
                                                   seqhdr.header)

    if len(possible_sequence_headers) == 1:
        seed_sequence_header = list(possible_sequence_headers)[0]
        print "Assigning seed sequence header id %d to family %s" \
            % (seed_sequence_header.id, family_accession)
        family.seed_sequence_header = seed_sequence_header
        family.save()
Example #16
go = Ontology(snakemake.input["go_obo"])
go_annotations = open(snakemake.output["go_annotations"], 'w')
uniparcdb = snakemake.input["uniparcdb"]

conn = sqlite3.connect(goa_path)
cursor = conn.cursor()

sqlatt = f'attach database "{uniparcdb}" as uniparc;'
cursor.execute(sqlatt,)

# 1. retrieve uniprot accession from exact match (hash)
# 2. retrieve GO annotations

for record in SeqIO.parse(faa_path, "fasta"):
    
    checksum = CheckSum.seguid(record.seq)
    

      
    sqlq = 'select * from uniparc.uniparc_accession where sequence_hash="%s"' % checksum
    
    uniparc_id = cursor.execute(sqlq,).fetchall()[0][0]
    
    print("uid", uniparc_id)
    
    sqlq2 = 'select distinct accession from uniparc_cross_references t1 ' \
      ' inner join crossref_databases t2 on t1.db_id=t2.db_id ' \
      ' where t1.uniparc_id=%s and db_name in ("UniProtKB/Swiss-Prot", "UniProtKB/TrEMBL");' % uniparc_id
    print(sqlq2)
    uniprotkb_acc_list = [i[0].split(".")[0] for i in cursor.execute(sqlq2,).fetchall()]
    print("hits:", uniprotkb_acc_list)
Example #17
def main():
    parser = OptionParser(usage='%prog [StartPos_EndPos]')
    parser.add_option(
        '--update_features',
        action='store_true',
        dest='update_features',
        default=True,
        help="Insert new entries into uniprot_feature and delete old ones.")
    parser.add_option(
        '--no_update_features',
        action='store_false',
        dest='update_features',
        default=True,
        help="Only insert entries into uniprot_feature for new uniprot records."
    )
    (options, args) = parser.parse_args()
    sharding = False
    if len(args) >= 1:
        sharding = True
        shard_spec = args[0]
        positions = shard_spec.split('_')
        if len(positions) < 2:
            parser.error(
              "Must specify shard as starting and ending file positions " \
              + "separated by an underscore")
        try:
            start_pos = int(positions[0])
        except ValueError:
            parser.error(
              "Must specify shard as starting and ending file positions " \
              + "separated by an underscore")
        try:
            end_pos = int(positions[1])
        except ValueError:
            parser.error(
              "Must specify shard as starting and ending file positions " \
              + "separated by an underscore")
    num_records = 0
    description_re = re.compile(
        '(RecName: |AltName: |SubName: ' +
        '|Full=|Short=|EC=|Allergen=|Biotech=|CD_antigen=|INN=|;' +
        '|Includes: |Contains: |Flags: )')

    # Prepare regular expressions and object maps for parsing feature tables
    ptm_types = PostTranslationalModificationType.objects.all()
    ptm_re = re.compile(
        '(%s)' % '|'.join([ptm_type.modification for ptm_type in ptm_types]))
    ptm_type_object_of_modification = {}
    for ptm_type in ptm_types:
        ptm_type_object_of_modification[ptm_type.modification] = ptm_type
    # Example:
    # RP   PHOSPHORYLATION [LARGE SCALE ANALYSIS] AT SER-267
    # Here PHOSPHORYLATION is a post-translational modification type, which
    # occurs at position 267.  Later there is a corresponding line in the
    # feature table:
    # FT   MOD_RES     267    267       Phosphoserine.
    ptm_pos_re = re.compile(' AT [A-Z][A-Z][A-Z]-([0-9]*)')
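    # Illustration (not from the original source): applied to the RP line
    # above, ptm_pos_re.search(' AT SER-267').group(1) returns '267', the
    # modified position.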
    feature_keys = FeatureKey.objects.all()
    feature_key_object_of_key_name = {}
    for feature_key in feature_keys:
        feature_key_object_of_key_name[feature_key.key_name] = feature_key
    nonexperimental_qualifiers = NonExperimentalQualifier.objects.all()
    nonexperimental_re = re.compile('(%s)' % '|'.join(
        [qualifier.description for qualifier in nonexperimental_qualifiers]))
    nonexperimental_qualifier_object_of_description = {}
    for qualifier in nonexperimental_qualifiers:
        nonexperimental_qualifier_object_of_description[qualifier.description] \
            = qualifier
    dbSNPrs_re = re.compile('dbSNP:rs([0-9]*)')
    large_scale_re = re.compile('LARGE SCALE')

    go_evidence_objects = GO_EvidencePriority.objects.all()
    go_evidence_object_of_go_evidence_code = {}
    for go_evidence_object in go_evidence_objects:
        go_evidence_object_of_go_evidence_code[go_evidence_object.evidence]\
            = go_evidence_object

    f = open("/clusterfs/ohana/external/UniProt/to_import/uniprot.dat")
    pos_f = open("/clusterfs/ohana/external/UniProt/to_import/uniprot.dat")
    if sharding:
        f.seek(start_pos)
    current_pos = f.tell()
    pos_f.seek(current_pos)
    for record in SwissProt.parse(f):
        try:
            taxon = UniProtTaxonomy.objects.get(
                id__exact=record.taxonomy_id[0])
        except UniProtTaxonomy.DoesNotExist:
            taxon = handle_missing_taxonomy(record)
        seguid = CheckSum.seguid(record.sequence)

        # Parse the description
        description_tokens = description_re.split(record.description)
        full_recommended_name = ''
        # Look for the first recommended name category
        # (before any Includes or Contains sections)
        for i in xrange(len(description_tokens)):
            if description_tokens[i] == 'RecName: ':
                break
        # Now look for the full name
        for j in xrange(i, len(description_tokens)):
            if description_tokens[j] == 'Full=':
                break
        # The full recommended name is the next token
        if j < len(description_tokens) - 1:
            full_recommended_name = description_tokens[j + 1]
        else:
            # Try looking for SubName instead, maybe this is a fragment
            # Look for the first subname category
            # (before any Includes or Contains sections)
            for i in xrange(len(description_tokens)):
                if description_tokens[i] == 'SubName: ':
                    break
            # Now look for the full name
            for j in xrange(i, len(description_tokens)):
                if description_tokens[j] == 'Full=':
                    break
            # The full subname is the next token
            if j < len(description_tokens) - 1:
                full_recommended_name = description_tokens[j + 1]
            else:
                print "Full recommended name not found for %s" % record.entry_name
                print record.description
        # Look for all the EC numbers
        ecs = set()
        for i in xrange(len(description_tokens)):
            if description_tokens[i] == 'EC=':
                ecs.add(description_tokens[i + 1])
        # Look for precursor or fragment flags
        is_fragment = False
        is_precursor = False
        for i in xrange(len(description_tokens)):
            if description_tokens[i] == 'Flags: ':
                if description_tokens[i + 1][0:8] == 'Fragment':
                    is_fragment = True
                elif description_tokens[i + 1] == 'Precursor':
                    is_precursor = True

        # Every UniProt accession is present in the uniprot_dat_index table.
        # Each of them points to a record in the uniprot table.
        # On the other hand, a record in the uniprot table has only one
        # accession, the one that was the primary accession the last time we did
        # this update.
        # The primary accession may have changed since we last updated (it may now
        # be a secondary accession).
        # The identifier may also have changed.  E.g., if the record was previously
        # in TrEMBL and is now in SwissProt, then its identifier may have changed
        # from one like Q197F8_IIV3 to one like 002R_IIV3 (i.e., the first part is
        # no longer the accession, but a gene name or something more informative).
        # So, we can't necessarily tell which was the existing record in the uniprot
        # table corresponding to the record we are now parsing by looking at either
        # its identifier or its accession.
        # Instead, we find the entries in the uniprot_dat_index table for each of
        # the accessions.
        # If one corresponding to the primary accession is already present, we take
        # the corresponding record in the UniProt table to be the canonical entry
        # corresponding to this UniProt record.
        # If none corresponding to the primary accession is present but entries in
        # the uniprot_dat_index for other accessions are present, we pick one of
        # these and make the corresponding record in the uniprot table the canonical
        # entry.
        # If no entries in the uniprot_dat_index table corresponding to any of these
        # accessions are present, we create a new record in the uniprot table and
        # make it the canonical entry.
        # If the entries in the uniprot_dat_index table corresponding to these
        # accessions point to multiple different records in the uniprot table, we
        # will delete the other ones at the end of this loop iteration.  But first
        # we will update the sequence_header records that point to those entries to
        # point instead to the canonical entry.
        new_uniprot = False
        uniprot_ids_to_delete = set()
        uniprot_dat_index_of_accession = {}
        uniprot_dat_indices = UniProtDatIndex.objects.filter(
            uniprot_accession__in=record.accessions)
        uniprot_of_uniprot_id = {}
        for uniprot_dat_index in uniprot_dat_indices:
            uniprot_dat_index_of_accession[uniprot_dat_index.uniprot_accession] \
                = uniprot_dat_index
            uniprot_of_uniprot_id[uniprot_dat_index.uniprot.id] \
                = uniprot_dat_index.uniprot
        if len(uniprot_dat_index_of_accession.keys()) > 0:
            if record.accessions[0] in uniprot_dat_index_of_accession:
                uniprot = uniprot_dat_index_of_accession[
                    record.accessions[0]].uniprot
            else:
                an_accession = uniprot_dat_index_of_accession.keys()[0]
                uniprot = uniprot_dat_index_of_accession[an_accession].uniprot
            for accession in uniprot_dat_index_of_accession:
                uniprot_dat_index_of_accession[accession].uniprot = uniprot
                uniprot_dat_index_of_accession[
                    accession].file_char = current_pos
                uniprot_dat_index_of_accession[accession].save()
            missing_accessions \
              = set(record.accessions) - set(uniprot_dat_index_of_accession.keys())
            for accession in missing_accessions:
                uniprot_dat_index_of_accession[accession] \
                    = UniProtDatIndex.objects.create(file_char = current_pos,
                                              uniprot_accession = accession,
                                              uniprot = uniprot)
            uniprot_ids_to_delete = set(uniprot_of_uniprot_id.keys())
            uniprot_ids_to_delete.remove(uniprot.id)
            # Find sequence_headers pointing to the obsolete uniprot records, and
            # point them at the canonical one instead
            sequence_headers = SequenceHeader.objects.filter(
                uniprot__id__in=uniprot_ids_to_delete)
            for sequence_header in sequence_headers:
                sequence_header.uniprot = uniprot
                sequence_header.save()
        else:
            new_uniprot = True
            uniprot = UniProt.objects.create(
                uniprot_identifier=record.entry_name,
                accession=record.accessions[0],
                taxon=taxon,
                de=full_recommended_name,
                seguid=seguid,
                in_swissprot_f=(record.data_class == 'Reviewed'))
            for accession in record.accessions:
                uniprot_dat_index_of_accession[accession] \
                    = UniProtDatIndex.objects.create(file_char = current_pos,
                                              uniprot_accession = accession,
                                              uniprot = uniprot)
            # Look for orphaned sequence headers that can be assigned to this
            # uniprot
            sequences = Sequence.objects.filter(seguid__exact=seguid)
            sequence_ids = [sequence.id for sequence in sequences]
            if sequences:
                sequence_headers = SequenceHeader.objects.filter(
                    sequence__id__in=sequence_ids,
                    uniprot__isnull=True,
                    taxon__id__exact=taxon.id)
                for sequence_header in sequence_headers:
                    sequence_header.uniprot = uniprot
                    sequence_header.save()

        uniprot.uniprot_identifier = record.entry_name
        uniprot.accession = record.accessions[0]
        uniprot.uniprot_taxon = taxon
        uniprot.de = full_recommended_name
        uniprot.seguid = seguid
        uniprot.in_swissprot_f = (record.data_class == 'Reviewed')
        uniprot.description = record.description
        if is_fragment:
            uniprot.is_fragment = True
        if is_precursor:
            uniprot.is_precursor = True
        uniprot.save()

        # Update the EC associations
        uniprot_ec_objects = UniProtEC.objects.filter(uniprot__exact=uniprot)
        uniprot_ec_object_of_ec_id = {}
        for uniprot_ec_object in uniprot_ec_objects:
            uniprot_ec_object_of_ec_id[
                uniprot_ec_object.ec.id] = uniprot_ec_object
        db_ec_ids = set(uniprot_ec_object_of_ec_id.keys())
        ec_object_of_ec_id = {}
        for ec in ecs:
            class_number_str, subclass_number_str, subsubclass_number_str, \
                enzyme_number_str = ec.split('.')
            is_preliminary = False

            # If EC number is similar to '-.-.-.-', then we should report
            # an error and continue. This is obviously dirty data and should
            # be reported to UniProt
            try:
                class_number = int(class_number_str)
            except ValueError:
                print "Warning: %s has invalid EC Number: '%s'" % (
                    uniprot.uniprot_identifier, ec)
                continue

            if subclass_number_str == '-':
                ec_objects = EC.objects.filter(
                    class_number__exact=class_number,
                    subclass_number__isnull=True,
                    subsubclass_number__isnull=True,
                    enzyme_number__isnull=True)
            else:
                subclass_number = int(subclass_number_str)
                if subsubclass_number_str == '-':
                    ec_objects = EC.objects.filter(
                        class_number__exact=class_number,
                        subclass_number__exact=subclass_number,
                        subsubclass_number__isnull=True,
                        enzyme_number__isnull=True)
                else:
                    subsubclass_number = int(subsubclass_number_str)
                    enzyme_number_str = enzyme_number_str.strip().rstrip(';')
                    if enzyme_number_str == '-':
                        ec_objects = EC.objects.filter(
                            class_number__exact=class_number,
                            subclass_number__exact=subclass_number,
                            subsubclass_number__exact=subsubclass_number,
                            enzyme_number__isnull=True)
                    else:
                        try:
                            enzyme_number = int(enzyme_number_str)
                        except ValueError:
                            print "Preliminary EC %s in %s" % (
                                ec, record.entry_name)
                            print record.description
                            is_preliminary = True
                            enzyme_number = int(enzyme_number_str[1:])
                        ec_objects = EC.objects.filter(
                            class_number__exact=class_number,
                            subclass_number__exact=subclass_number,
                            subsubclass_number__exact=subsubclass_number,
                            enzyme_number__exact=enzyme_number,
                            is_preliminary_f=is_preliminary)
            if ec_objects:
                ec_object = ec_objects[0]
            else:
                ec_object = EC.objects.create(
                    class_number=class_number,
                    subclass_number=subclass_number,
                    subsubclass_number=subsubclass_number,
                    enzyme_number=enzyme_number,
                    is_preliminary_f=is_preliminary)
            ec_object_of_ec_id[ec_object.id] = ec_object
        uniprot_dat_ec_ids = set(ec_object_of_ec_id.keys())
        for ec_id in db_ec_ids - uniprot_dat_ec_ids:
            uniprot_ec_object_of_ec_id[ec_id].delete()
        for ec_id in uniprot_dat_ec_ids - db_ec_ids:
            UniProtEC.objects.create(uniprot=uniprot,
                                     ec=ec_object_of_ec_id[ec_id])

        # Update the keyword associations
        uniprot_keyword_objects = UniProtKeyword.objects.filter(
            uniprot__exact=uniprot)
        uniprot_keyword_object_of_keyword_accession = {}
        for uniprot_keyword_object in uniprot_keyword_objects:
            uniprot_keyword_object_of_keyword_accession[ \
                  uniprot_keyword_object.keyword.accession] = uniprot_keyword_object
        db_keyword_accessions \
            = set(uniprot_keyword_object_of_keyword_accession.keys())
        keyword_object_of_keyword_accession = {}
        for keyword in record.keywords:
            keyword_objects = Keyword.objects.filter(identifier__exact=keyword)
            if keyword_objects:
                keyword_object = keyword_objects[0]
                keyword_object_of_keyword_accession[keyword_object.accession] \
                    = keyword_object
            else:
                print "Unrecognized keyword %s while parsing %s." % (
                    keyword, record.entry_name),
                print "The keyword table may be out of date."
        uniprot_dat_keyword_accessions \
            = set(keyword_object_of_keyword_accession.keys())
        for accession in db_keyword_accessions - uniprot_dat_keyword_accessions:
            uniprot_keyword_object_of_keyword_accession[accession].delete()
        for accession in uniprot_dat_keyword_accessions - db_keyword_accessions:
            UniProtKeyword.objects.create(
                uniprot=uniprot,
                keyword=keyword_object_of_keyword_accession[accession])

        # Update the organelle associations
        uniprot_organelle_objects = UniProtOrganelle.objects.filter(
            uniprot__exact=uniprot)
        uniprot_organelle_object_of_organelle_id = {}
        for uniprot_organelle_object in uniprot_organelle_objects:
            uniprot_organelle_object_of_organelle_id[ \
                uniprot_organelle_object.organelle.id] = uniprot_organelle_object
        db_organelle_ids = set(uniprot_organelle_object_of_organelle_id.keys())
        organelle_object_of_organelle_id = {}
        for organelle in record.organelle.rstrip('.').split(','):
            if len(organelle) == 0:
                continue
            fields = organelle.split('; ')
            if len(fields) > 1:
                # This had better be a plastid
                if fields[0] == 'Plastid':
                    organelle_objects = Organelle.objects.filter(
                        description__exact=fields[0],
                        plastid_type__exact=fields[1])
                else:
                    print "Unrecognized organelle %s in %s" % (
                        organelle, record.entry_name)
            else:
                organelle_objects = Organelle.objects.filter(
                    description__exact=fields[0])
            if organelle_objects:
                organelle_object = organelle_objects[0]
            else:
                field = fields[0].strip()
                # This had better be a plasmid
                if len(field) >= 9 and field[0:7] == 'Plasmid':
                    organelle_object = Organelle.objects.create(
                        description=field, plasmid_name=field[8:])
                elif len(field) >= 13 and field[4:11] == 'Plasmid':
                    organelle_object = Organelle.objects.create(
                        description=field[4:], plasmid_name=field[12:])
                else:
                    print "Unrecognized organelle %s in %s" % (
                        organelle, record.entry_name)
                    continue
            organelle_object_of_organelle_id[
                organelle_object.id] = organelle_object
        uniprot_dat_organelle_ids = set(
            organelle_object_of_organelle_id.keys())
        for organelle_id in db_organelle_ids - uniprot_dat_organelle_ids:
            uniprot_organelle_object_of_organelle_id[organelle_id].delete()
        for organelle_id in uniprot_dat_organelle_ids - db_organelle_ids:
            UniProtOrganelle.objects.create(
                uniprot=uniprot,
                organelle=organelle_object_of_organelle_id[organelle_id])

        # Update the host organism associations
        uniprot_host_objects = UniProtHostOrganism.objects.filter(
            uniprot__exact=uniprot)
        uniprot_host_object_of_host_id = {}
        for uniprot_host_object in uniprot_host_objects:
            uniprot_host_object_of_host_id[uniprot_host_object.host_organism.id] \
                = uniprot_host_object
        db_host_ids = set(uniprot_host_object_of_host_id.keys())
        host_object_of_host_id = {}
        host_ids = [
            int(host_spec.split(';')[0]) for host_spec in record.host_organism
        ]
        host_objects = UniProtTaxonomy.objects.filter(id__in=host_ids)
        for host_object in host_objects:
            host_object_of_host_id[host_object.id] = host_object
        uniprot_dat_host_ids = set(host_object_of_host_id.keys())
        for host_id in set(host_ids) - uniprot_dat_host_ids:
            print "Unknown host taxonomy id %d when parsing %s;" \
                % (host_id, record.entry_name),
            print "UniProtTaxonomy table may be out of date."
        for host_id in db_host_ids - uniprot_dat_host_ids:
            uniprot_host_object_of_host_id[host_id].delete()
        for host_id in uniprot_dat_host_ids - db_host_ids:
            UniProtHostOrganism.objects.create(
                uniprot=uniprot, host_organism=host_object_of_host_id[host_id])

        # Update the literature references
        uniprot_literature_objects = UniProtLiterature.objects.filter(
            uniprot__exact=uniprot)
        uniprot_literature_object_of_title = {}
        for uniprot_literature_object in uniprot_literature_objects:
            uniprot_literature_object_of_title[uniprot_literature_object.title] \
                = uniprot_literature_object
        db_titles = set(uniprot_literature_object_of_title.keys())
        uniprot_dat_ref_of_title = {}
        for ref in record.references:
            titles = ref.title.split(';')
            for title in titles:
                uniprot_dat_ref_of_title[title.strip('"')] = ref
        uniprot_dat_titles = set(uniprot_dat_ref_of_title.keys())
        for title in db_titles - uniprot_dat_titles:
            uniprot_literature_object_of_title[title].delete()
        for title in uniprot_dat_titles:
            if title in db_titles:
                uniprot_literature_object = uniprot_literature_object_of_title[
                    title]
            else:
                uniprot_literature_object \
                  = UniProtLiterature.objects.create(uniprot = uniprot, title = title)
            ref = uniprot_dat_ref_of_title[title]
            uniprot_literature_object.authors = ref.authors
            positional_info = ' '.join(ref.positions)
            m = large_scale_re.search(positional_info)
            if m:
                uniprot_literature_object.is_large_scale_f = True
            for db_name, db_reference in ref.references:
                if db_name == 'MEDLINE':
                    uniprot_literature_object.medline_ui = db_reference
                elif db_name == 'PubMed':
                    uniprot_literature_object.pmid = db_reference
                elif db_name == 'DOI':
                    uniprot_literature_object.doi = db_reference
                elif db_name == 'AGRICOLA':
                    uniprot_literature_object.agricola = db_reference
            uniprot_literature_object.save()

        # Find cross references to other databases
        geneids = set()
        go_evidence_of_go_accession = {}
        pfam_accessions = set()
        extent_of_pdb_chain_ids = {}
        for reference in record.cross_references:
            if reference[0] == 'GeneID':
                geneids.add(reference[1])
            elif reference[0] == 'GO':
                if len(reference) >= 4:
                    go_evidence_of_go_accession[reference[1]] \
                        = reference[3].split(':')[0]
            elif reference[0] == 'Pfam':
                pfam_accessions.add(reference[1])
            elif reference[0] == 'PDB':
                pdb_id = reference[1].lower()
                chain_ids = reference[4].split('=')[0].split('/')
                pdb_from_residue = None
                pdb_to_residue = None
                fields = reference[4].split('=')
                if len(fields) > 1:
                    try:
                        pdb_from_residue, pdb_to_residue \
                            = [int(x) for x in fields[1].split('-')]
                    except IndexError:
                        pdb_from_residue = None
                        pdb_to_residue = None
                    except ValueError:
                        pdb_from_residue = None
                        pdb_to_residue = None
                for chain_id in chain_ids:
                    pdb_chain_id = pdb_id + chain_id
                    extent_of_pdb_chain_ids[pdb_chain_id] \
                        = (pdb_from_residue, pdb_to_residue)

        # Update the GeneID associations
        uniprot_geneid_objects = UniProtGeneID.objects.filter(
            uniprot__exact=uniprot)
        uniprot_geneid_object_of_geneid = {}
        for uniprot_geneid_object in uniprot_geneid_objects:
            uniprot_geneid_object_of_geneid[uniprot_geneid_object.geneid] \
                = uniprot_geneid_object
        db_geneids = set(uniprot_geneid_object_of_geneid.keys())
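        # Synchronize with the .dat record: delete GeneID links that are no
        # longer present, then create links for newly listed GeneIDs.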
        for geneid in db_geneids - geneids:
            uniprot_geneid_object_of_geneid[geneid].delete()
        for geneid in geneids - db_geneids:
            UniProtGeneID.objects.create(uniprot=uniprot, geneid=geneid)

        # Update the GO associations
        uniprot_go_objects = UniProtGO.objects.filter(uniprot__exact=uniprot)
        uniprot_go_object_of_go_term_accession = {}
        for uniprot_go_object in uniprot_go_objects:
            uniprot_go_object_of_go_term_accession[uniprot_go_object.go_term.acc] \
                = uniprot_go_object
        db_go_term_accessions = set(
            uniprot_go_object_of_go_term_accession.keys())
        go_term_objects = GO_Term.objects.filter(
            acc__in=go_evidence_of_go_accession.keys())
        go_term_object_of_go_term_accession = {}
        for go_term_object in go_term_objects:
            go_term_object_of_go_term_accession[go_term_object.acc] \
                = go_term_object
        uniprot_dat_go_term_accessions \
            = set(go_term_object_of_go_term_accession.keys())
        for go_accession in set(go_evidence_of_go_accession.keys()) - \
                              uniprot_dat_go_term_accessions:
            print "Unrecognized GO accession %s while parsing %s" \
                % (go_accession, record.entry_name),
            print "GO term table may be out of date."
        for go_evidence_code in set(go_evidence_of_go_accession.values()) - \
                          set(go_evidence_object_of_go_evidence_code.keys()):
            print "Unrecognized GO evidence code %s while parsing %s" \
                % (go_evidence_code, record.entry_name),
            print "GO evidence_priority table may be out of date."
        for go_term_accession in db_go_term_accessions \
                                  - uniprot_dat_go_term_accessions:
            uniprot_go_object_of_go_term_accession[go_term_accession].delete()
        for go_term_accession in uniprot_dat_go_term_accessions:
            go_evidence_code = go_evidence_of_go_accession[go_term_accession]
            if go_evidence_code in go_evidence_object_of_go_evidence_code:
                go_evidence_object = \
                    go_evidence_object_of_go_evidence_code[go_evidence_code]
                if go_term_accession in db_go_term_accessions:
                    uniprot_go \
                        = uniprot_go_object_of_go_term_accession[go_term_accession]
                    if uniprot_go.go_evidence.evidence != go_evidence_code:
                        uniprot_go.go_evidence = go_evidence_object
                        uniprot_go.save()
                else:
                    UniProtGO.objects.create(
                        go_term=go_term_object_of_go_term_accession[
                            go_term_accession],
                        go_evidence=go_evidence_object,
                        uniprot=uniprot)

        # Update the Pfam associations
        uniprot_pfam_objects = UniProtPfam.objects.filter(
            uniprot__exact=uniprot)
        uniprot_pfam_object_of_pfam_accession = {}
        for uniprot_pfam_object in uniprot_pfam_objects:
            uniprot_pfam_object_of_pfam_accession[
                uniprot_pfam_object.pfam.accession] = uniprot_pfam_object
        db_pfam_accessions = set(uniprot_pfam_object_of_pfam_accession.keys())
        pfam_object_of_pfam_accession = {}
        for pfam_accession in pfam_accessions:
            pfam_objects = Pfam.objects.filter(
                accession__exact=pfam_accession).order_by(
                    'overall_pfam_version').reverse()
            if pfam_objects:
                pfam_object = pfam_objects[0]
                pfam_object_of_pfam_accession[
                    pfam_object.accession] = pfam_object
            else:
                print "Unknown Pfam accession %s encountered when parsing %s" \
                    % (pfam_accession, record.entry_name),
                print "Pfam table may be out of date"
        uniprot_dat_pfam_accessions = set(pfam_object_of_pfam_accession.keys())
        for pfam_accession in db_pfam_accessions - uniprot_dat_pfam_accessions:
            uniprot_pfam_object_of_pfam_accession[pfam_accession].delete()
        for pfam_accession in uniprot_dat_pfam_accessions - db_pfam_accessions:
            UniProtPfam.objects.create(
                uniprot=uniprot,
                pfam=pfam_object_of_pfam_accession[pfam_accession])

        # Update the PDB associations
        uniprot_pdb_chain_objects = UniProtPDB_Chain.objects.filter(
            uniprot__exact=uniprot)
        uniprot_pdb_chain_object_of_pdb_chain_id = {}
        for uniprot_pdb_chain_object in uniprot_pdb_chain_objects:
            pdb_chain_id = uniprot_pdb_chain_object.pdb_chain.pdb.id + \
                            uniprot_pdb_chain_object.pdb_chain.chain_id
            uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id] \
                = uniprot_pdb_chain_object
        db_pdb_chain_ids = set(uniprot_pdb_chain_object_of_pdb_chain_id.keys())
        pdb_chain_object_of_pdb_chain_id = {}
        for pdb_chain_id in extent_of_pdb_chain_ids.keys():
            pdb_chain_objects = PDB_Chain.objects.filter(
                pdb__id__exact=pdb_chain_id[0:4],
                chain_id__exact=pdb_chain_id[4:])
            if pdb_chain_objects:
                pdb_chain_object = pdb_chain_objects[0]
                pdb_chain_object_of_pdb_chain_id[
                    pdb_chain_id] = pdb_chain_object
            else:
                print "Unknown PDB chain %s encountered when parsing %s" \
                    % (pdb_chain_id, record.entry_name),
                print "The PDB_Chain table may be out of date."
        uniprot_dat_pdb_chain_ids = set(
            pdb_chain_object_of_pdb_chain_id.keys())
        for pdb_chain_id in db_pdb_chain_ids - uniprot_dat_pdb_chain_ids:
            uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id].delete()
        for pdb_chain_id in uniprot_dat_pdb_chain_ids:
            pdb_from_residue, pdb_to_residue = extent_of_pdb_chain_ids[
                pdb_chain_id]
            if pdb_from_residue:
                if pdb_chain_id in db_pdb_chain_ids:
                    uniprot_pdb_chain \
                        = uniprot_pdb_chain_object_of_pdb_chain_id[pdb_chain_id]
                    uniprot_pdb_chain.from_residue = pdb_from_residue
                    uniprot_pdb_chain.to_residue = pdb_to_residue
                    uniprot_pdb_chain.save()
                else:
                    UniProtPDB_Chain.objects.create(
                        uniprot=uniprot,
                        pdb_chain=pdb_chain_object_of_pdb_chain_id[
                            pdb_chain_id],
                        from_residue=pdb_from_residue,
                        to_residue=pdb_to_residue)
            else:
                UniProtPDB_Chain.objects.create(
                    uniprot=uniprot,
                    pdb_chain=pdb_chain_object_of_pdb_chain_id[pdb_chain_id])

        # Update the feature table (position-specific information)
        if new_uniprot or options.update_features:
            # Unfortunately, there is no part of a feature table entry that is
            # guaranteed to persist from one release of UniProt to the next (except
            # the FTid, but that is not always present).  Therefore there is no easy
            # way to determine whether a feature in the uniprot.dat record is the
            # same as, or nearly the same as, one that is already in the database.
            # So we update the features by inserting all the entries anew and then
            # deleting the old ones (without trying to check whether they were the
            # same or to modify the old ones).  To do that, we fetch the old feature
            # entries first so we can delete them later.
            uniprot_feature_objects = UniProtFeature.objects.filter(
                uniprot__exact=uniprot)
            # Instantiate the queryset by turning it into a list.  Otherwise, it
            # won't be instantiated until we get to the bottom and loop over these to
            # delete them, at which point it would delete *all* of them (including
            # the ones we just created).
            uniprot_feature_object_list = list(uniprot_feature_objects)

            # The fields from_residue_is_uncertain, to_residue_is_uncertain,
            # extends_n_terminally, and extends_c_terminally derive from the FT lines
            # in the uniprot.dat file, according to the UniProt KnowledgeBase user
            # manual:

            # When a feature is known to extend beyond the position that is given in
            # the feature table, the endpoint specification will be preceded by '<'
            # for features which continue to the left end (N-terminal direction) or
            # by '>' for features which continue to the right end (C-terminal
            # direction); unknown endpoints are denoted by '?'.  Uncertain endpoints
            # are denoted by a '?' before the position, e.g. '?42'.
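            # For example (illustrative values): a from_residue_spec of '<5'
            # sets extends_n_terminally and from_residue=5; a to_residue_spec
            # of '>200' sets extends_c_terminally and to_residue=200; '?42'
            # sets the corresponding *_is_uncertain flag and the residue to
            # 42; a bare '?' leaves the residue unset.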

            for feature in record.features:
                key_name, from_residue_spec, to_residue_spec, description, \
                    ftid = feature
                if key_name in feature_key_object_of_key_name:
                    feature_key = feature_key_object_of_key_name[key_name]
                    # Check for nonexperimental qualifier
                    nonexperimental_qualifier = None
                    m = nonexperimental_re.search(description)
                    if m:
                        nonexperimental_qualifier \
                            = nonexperimental_qualifier_object_of_description[m.group(0)]
                    created_object = False
                    if key_name == 'VARIANT':
                        # Check for a dbSNP rs accession number in the
                        # description (e.g. "dbSNP:rs12345")
                        dbSNP_rs_accession = None
                        m = dbSNPrs_re.search(description)
                        if m:
                            dbSNP_rs_accession = int(m.group(1))
                            kwargs = dict(
                                uniprot=uniprot,
                                feature_key=feature_key,
                                description=description,
                                dbsnp_rs_number=dbSNP_rs_accession)
                            if ftid != '':
                                kwargs['feature_identifier'] = ftid
                            if nonexperimental_qualifier:
                                kwargs['nonexperimental_qualifier'] \
                                    = nonexperimental_qualifier
                            feature_obj = UniProtFeature.objects.create(**kwargs)
                            created_object = True
                    elif key_name == 'MOD_RES' and from_residue_spec == to_residue_spec:
                        # Look for the post-translational modification type.
                        # Parse the modified residue position here; from_residue
                        # itself is not assigned until the endpoint parsing
                        # further below, so it cannot be used yet.
                        try:
                            modified_residue = int(from_residue_spec)
                        except ValueError:
                            modified_residue = None
                        found_ptm = False
                        for reference in record.references:
                            if found_ptm:
                                break
                            positional_info = ' '.join(reference.positions)
                            ptm_tokens = ptm_re.split(positional_info)
                            if len(ptm_tokens) > 1:
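                                # ptm_re presumably contains a capturing
                                # group, so re.split returns alternating
                                # [text, match, text, match, ...] tokens;
                                # each match is a modification name and the
                                # following text contains its position.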
                                for i in range((len(ptm_tokens) - 1) / 2):
                                    modification = ptm_tokens[2 * i + 1]
                                    string_with_position = ptm_tokens[2 * i +
                                                                      2]
                                    match_position = ptm_pos_re.search(
                                        string_with_position)
                                    if match_position:
                                        position = int(match_position.group(1))
                                        if position == modified_residue:
                                            # Success!
                                            found_ptm = True
                                            ptm_type = ptm_type_object_of_modification[
                                                modification]
                                            break
                        if found_ptm:
                            kwargs = dict(
                                uniprot=uniprot,
                                feature_key=feature_key,
                                description=description,
                                posttranslational_modification_type=ptm_type)
                            if ftid != '':
                                kwargs['feature_identifier'] = ftid
                            if nonexperimental_qualifier:
                                kwargs['nonexperimental_qualifier'] \
                                    = nonexperimental_qualifier
                            feature_obj = UniProtFeature.objects.create(**kwargs)
                            created_object = True
                    if not created_object:
                        kwargs = dict(
                            uniprot=uniprot,
                            feature_key=feature_key,
                            description=description)
                        if ftid != '':
                            kwargs['feature_identifier'] = ftid
                        if nonexperimental_qualifier:
                            kwargs['nonexperimental_qualifier'] \
                                = nonexperimental_qualifier
                        feature_obj = UniProtFeature.objects.create(**kwargs)
                else:
                    print "Unrecognized feature key %s while parsing %s" \
                        % (key_name, record.entry_name)
                    # No feature object was created, so there is nothing to
                    # attach residue information to; skip to the next feature.
                    continue
                # Parse the from_residue and to_residue and update the appropriate
                # fields in the object.
                try:
                    from_residue = int(from_residue_spec)
                    feature_obj.from_residue = from_residue
                    feature_obj.save()
                except ValueError:
                    if from_residue_spec != '?':
                        for i in range(len(from_residue_spec)):
                            if from_residue_spec[i] == '<':
                                feature_obj.extends_n_terminally = True
                            elif from_residue_spec[i] == '>':
                                # We don't expect this to happen, but anyway...
                                feature_obj.extends_c_terminally = True
                            elif from_residue_spec[i] == '?':
                                feature_obj.from_residue_is_uncertain = True
                            else:
                                break
                        feature_obj.from_residue = int(from_residue_spec[i:])
                        feature_obj.save()
                try:
                    to_residue = int(to_residue_spec)
                    feature_obj.to_residue = to_residue
                    feature_obj.save()
                except ValueError:
                    if to_residue_spec != '?':
                        for i in range(len(to_residue_spec)):
                            if to_residue_spec[i] == '<':
                                # We don't expect this to happen, but anyway...
                                feature_obj.extends_n_terminally = True
                            elif to_residue_spec[i] == '>':
                                feature_obj.extends_c_terminally = True
                            elif to_residue_spec[i] == '?':
                                feature_obj.to_residue_is_uncertain = True
                            else:
                                break
                        feature_obj.to_residue = int(to_residue_spec[i:])
                        feature_obj.save()
            # Delete the old feature entries.
            for uniprot_feature_object in uniprot_feature_object_list:
                uniprot_feature_object.delete()

        for id in uniprot_ids_to_delete:
            uniprot_of_uniprot_id[id].delete()
        # The SwissProt parser may have eaten many characters from the next record
        # by this point, buffering them away until we ask for the next record.  So
        # if we set current_pos from f.tell() now, we will get the wrong answer.
        # Instead, we will update current_pos by reading lines from pos_f, without
        # parsing them--we're just looking for the record separator.
        line = pos_f.readline()
        while len(line) >= 2 and line[0:2] != '//':
            line = pos_f.readline()
        current_pos = pos_f.tell()
        num_records += 1
        if sharding:
            if current_pos > end_pos:
                break
def main():
  basepath = '/clusterfs/ohana/bpg/Hpylori26695/GHGs'
  os.chdir(basepath)
  f = open('Helicobacter_pylori_26695')
  input_records = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
  f.close()
  set_cover_dict = {}
  seed_dict = {}
  f = open("clusters")
  lines = f.readlines()
  f.close()
  for line in lines:
    if line.split()[0] == 'Cluster':
      starting_cluster = True
    else:
      if starting_cluster:
        seed_id = line.split()[0]
        set_cover_dict[seed_id] = seed_id
        seed_dict[seed_id] = set([seed_id])
        starting_cluster = False
      else:
        seq_id = line.split()[0]
        set_cover_dict[seq_id] = seed_id
        seed_dict[seed_id].add(seq_id)
  uppercase_translation = string.maketrans(string.lowercase, string.uppercase)
  dotdash='.-'
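  # In Python 2, translate(uppercase_translation, dotdash) upper-cases the
  # sequence and deletes the '.' and '-' gap characters, i.e. it recovers
  # the unaligned sequence from an aligned one.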
  seed_paths = glob.glob('seeds*/*')
  for seed_path in seed_paths:
    seed_dir, seed = os.path.split(seed_path)
    sought_seguids = [(id, CheckSum.seguid(input_records[id].seq))
                      for id in
                      seed_dict[('lcl|%s' % seed)]]
    alignment_paths = glob.glob('%s/bpg*.a2m' % seed_path)
    if alignment_paths:
      family_id = int(os.path.splitext(os.path.split(alignment_paths[0])[1])[0][3:])
      family = Family.objects.get(id = family_id)
      root = family.canonical_root_node()
      tree_node_objects = TreeNode.objects.filter(tree = family.canonical_tree,
                            sequence_header__isnull = False).order_by('left_id')
      sequence_headers = [node.sequence_header for node in tree_node_objects]
      tree_node_alignment_objs = \
        TreeNodeAlignment.objects.filter(tree_node = root)
      alignment_of_sequence_header = {}
      for obj in tree_node_alignment_objs:
        alignment_of_sequence_header[obj.sequence_header] = obj
      sequence_headers_of_seguid = {}
      for obj in tree_node_alignment_objs:
        seguid = obj.sequence_header.sequence.seguid
        if seguid not in sequence_headers_of_seguid:
          sequence_headers_of_seguid[seguid] = set()
        sequence_headers_of_seguid[seguid].add(obj.sequence_header)
      for id, seguid in sought_seguids:
        if seguid in sequence_headers_of_seguid:
          uniprot_identifiers = list(set(
                    [sequence_header.uniprot.uniprot_identifier for
                    sequence_header in sequence_headers_of_seguid[seguid]
                    if sequence_header.uniprot]))
          if uniprot_identifiers:
            print '%s,"%s"' % (id, ','.join(uniprot_identifiers))
          else:
            the_sequence_header = list(sequence_headers_of_seguid[seguid])[0]
            seq0 = alignment_of_sequence_header[
                                    the_sequence_header].aligned_sequence.chars
            max_percent_id = 0.0
            closest_uniprot = None
            for obj in tree_node_alignment_objs:
              if obj.sequence_header.uniprot:
                seq1 = obj.aligned_sequence.chars
                pwid = BPGPWID.pairwise_identity_KS_1(seq0, seq1)
                if pwid > max_percent_id:
                  max_percent_id = pwid
                  closest_uniprot = obj.sequence_header.uniprot
            print '%s,"%s(%0.3f)"' % (id, closest_uniprot.uniprot_identifier,
                                      max_percent_id)
        else:
          print "No exact match for %s" % id
    else:
      alignment_path = '%s/final.a2m' % seed_path
      if os.path.exists(alignment_path):
        uniprot_identifiers_of_seguid = {}
        f = open(alignment_path)
        flowerpower_alignment = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
        f.close()
        for id in flowerpower_alignment:
          if id[0:3] == 'tr|' or id[0:3] == 'sp|':
            unaligned_seq = flowerpower_alignment[
                id].seq.tostring().translate(uppercase_translation, dotdash)
            seguid = CheckSum.seguid(unaligned_seq)
            if seguid not in uniprot_identifiers_of_seguid:
              uniprot_identifiers_of_seguid[seguid] = set()
            uniprot_identifiers_of_seguid[seguid].add(id.split('|')[2])
        for id, seguid in sought_seguids:
          if seguid in uniprot_identifiers_of_seguid:
            print '%s,"%s"' % (id,
                ','.join(list(uniprot_identifiers_of_seguid[seguid])))
          else:
            print "No exact match for %s" % id
        
      else:
        print "Seed %s has not been FlowerPowered" % seed
Example #19
def buildFamily(alignmentfilename, njtree, njbootstrap, mltree, sciphy, astats,
                  fasttree):
    """main routine for buildFamily, runs the pipeline..."""

    starttime = time.time()
    print 'reading input alignment %s...' % alignmentfilename,
    sys.stdout.flush()
    
    # get base name based on name of alignment file
    # this will be used to create names for tree files, etc
    basename = os.path.splitext(alignmentfilename)[0]

    # read input alignment
    handle = open(alignmentfilename, 'r')
    alignmentrecords = list(SeqIO.parse(handle, 'fasta'))
    handle.close()

    # check alignment for duplicate entries
    num_duplicates = 0
    sequences_of_seguid_of_id = {}
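    # Nested dict: sequences_of_seguid_of_id[id][seguid][seq] = description.
    # Two records count as duplicates only if they share the id, the seguid,
    # and the exact sequence string.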
    for record in alignmentrecords:
      id = record.id
      description = record.description
      seq = record.seq.tostring()
      seguid = CheckSum.seguid(seq)
      if id not in sequences_of_seguid_of_id:
        sequences_of_seguid_of_id[id] = {}
      if seguid not in sequences_of_seguid_of_id[id]:
        sequences_of_seguid_of_id[id][seguid] = {}
      if seq in sequences_of_seguid_of_id[id][seguid]:
        num_duplicates += 1
      else:
        sequences_of_seguid_of_id[id][seguid][seq] = description
    # de-dup the input alignment
    if num_duplicates > 0:
      print "Found %d duplicates in %s" % (num_duplicates, alignmentfilename)
      oldalignmentfilename = basename + '_with_dups.afa'
      print "Renaming %s to %s" % (alignmentfilename, oldalignmentfilename)
      os.system("mv %s %s" % (alignmentfilename, oldalignmentfilename))
      print "Writing de-dupped alignment to %s" % alignmentfilename
      f = open(alignmentfilename, "w")
      for id in sequences_of_seguid_of_id:
        for seguid in sequences_of_seguid_of_id[id]:
          for seq in sequences_of_seguid_of_id[id][seguid]:
            f.write(">%s\n" % sequences_of_seguid_of_id[id][seguid][seq])
            f.write("%s\n" % seq)
      f.close()

    # create alignment for treebuilding
    # also create an ID mapping from internal to fasta identifiers
    idmap = {}
    treealignment = {}
    alignmentdict = {}
    id = 1

    for record in alignmentrecords:
        myid = 'SEQ%d' % id
        id += 1
    
        alignmentdict[myid] = record.seq.tostring()

        idmap[myid] = record.description        

        treealignment[myid] = ''

        for c in record.seq.tostring():
            if c == '-' or c.isupper():
                treealignment[myid] += c
    
    # print ID map file (pickle)
    idmapfname = basename + '.idmap'
    handle = open(idmapfname, 'w')
    cPickle.dump(idmap, handle)
    handle.close()

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # print alignment file for NJ and build NJ tree
    if njtree:
        ntaxa = len(alignmentrecords)

        if ntaxa >= 4:
            
            starttime = time.time()
            print 'inferring tree by neighbor joining...',
            sys.stdout.flush()

            inferNJTree(basename, treealignment, njbootstrap)

            endtime = time.time()
            print 'done. %s' % getTimeStr(starttime, endtime)

        else:
            print 'too few sequences (%d) to build neighbor joining tree, skipping.' % (ntaxa)

    # print alignment for ML tree and build ML tree
    if mltree:
        ntaxa = len(alignmentrecords)

        if ntaxa >= 4:

            starttime = time.time()
            print 'inferring tree by maximum likelihood...',
            sys.stdout.flush()

            inferMLTree(basename, treealignment, fasttree)
    
            endtime = time.time()
            print 'done. %s' % getTimeStr(starttime, endtime)

        else:
            print 'too few sequences (%d) to build maximum likelihood tree, skipping.' % (ntaxa)

    # create family-level hmm

    starttime = time.time()
    print 'creating general hidden Markov model...',
    sys.stdout.flush()

    createHMM(basename, alignmentfilename)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # get PFam domains

    starttime = time.time()
    print 'inferring PFam domains...',
    sys.stdout.flush()

    inferPFam(basename)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # get transmembrane and signal peptide predictions

    starttime = time.time()
    print 'inferring transmembrane domains and signal peptides...',
    sys.stdout.flush()

    inferTransmembrane(basename)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # score PDB

    starttime = time.time()
    print 'retrieving homologous PDB structures...',
    sys.stdout.flush()

    inferPDB(basename)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # run SCI-PHY to infer subfamilies
    if sciphy:

      starttime = time.time()
      print 'inferring subfamilies...',
      sys.stdout.flush()

      inferSubfamilies(basename, alignmentdict)

      endtime = time.time()
      print 'done. %s' % getTimeStr(starttime, endtime)

    # compute alignment conservation
    starttime = time.time()
    print 'calculating alignment conservation...',
    sys.stdout.flush() 

    computeAlignmentConservation(basename, alignmentfilename)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # Run astats to get bulk info about who's long, who's short,
    # and who's dating whom.

    starttime = time.time()
    print 'calculating alignment statistics...',
    sys.stdout.flush()

    getAlignmentStatistics(basename, alignmentfilename, astats)

    endtime = time.time()
    print 'done. %s' % getTimeStr(starttime, endtime)

    # record build date

    print 'recording build date...',
    sys.stdout.flush()
    
    datefname = basename + '.build_date'
    handle = open(datefname, 'w')
    print >>handle, date.today()
    handle.close()

    print 'done.'
def main():
    if len(sys.argv) < 3:
        print "usage: %s <taxonomy_id> <input_file>" % sys.argv[0]
        sys.exit(1)

    taxon_id = sys.argv[1]
    input_file = sys.argv[2]

    logging.basicConfig()
    logger = setup_logger(taxon_id)
    files = glob.glob(
        "/clusterfs/ohana/external/genomes/QuestForOrthologs/Release5/%s_*.fasta"
        % taxon_id)

    if len(files) == 0:
        files = glob.glob(
            "/clusterfs/ohana/bpg/coverage/redundant/pfam/after_17GHG/ID/QFO/%s_*.fasta"
            % taxon_id)

    if files:
        fh = open(files[0], 'r')
    else:
        # Neither proteome file was found; fall back to the input file given
        # on the command line.
        fh = open(input_file, 'r')
    lines = fh.readlines()
    fh.close()

    accessions = set()
    for line in lines:
        if line.strip():
            if line[0] == ">":
                fields = line.split()
                try:
                    accession = fields[0].split(":")[1]
                except IndexError:
                    accession = fields[0].split("|")[1]
                accessions.add(accession)

    o = open('%s.coverage' % taxon_id, 'w')
    num_covered = GHG_num_covered = GHG_over2_covered = Pfam_num_covered = 0
    for accession in accessions:
        # First search the uniprot accession in the UniProt_Dat_Index table
        uniprot_dat_indices = UniProtDatIndex.objects.filter(
            uniprot_accession=accession)
        if uniprot_dat_indices:
            uniprot_object = uniprot_dat_indices[0].uniprot
            if TreeNodeAlignment.objects.filter(
                    sequence_header__uniprot=uniprot_object,
                    tree_node__tree__family__active=True).exclude(
                        tree_node__tree__family__status__exact='bad'):
                num_covered += 1
                TNA = TreeNodeAlignment.objects.filter(
                    sequence_header__uniprot=uniprot_object,
                    tree_node__tree__family__active=True,
                    tree_node__tree__family__family_type='G').exclude(
                        tree_node__tree__family__status__exact='bad')
                if len(TNA) > 0:
                    GHG_num_covered += 1
                    GHG_over2_covered += get_family_size(TNA)
                if TreeNodeAlignment.objects.filter(
                        sequence_header__uniprot=uniprot_object,
                        tree_node__tree__family__active=True,
                        tree_node__tree__family__family_type='C').exclude(
                            tree_node__tree__family__status__exact='bad'):
                    Pfam_num_covered += 1
            else:
                log(logger, "%s is not covered in the database\n" % accession)
        # if the accession is not in the uniprot_dat_index table, use seguid to
        # find identical sequences
        else:
            log(logger,
                "%s is not in uniprot_dat_index, try the seguid\n" % accession)
            uniprot_accession = accession
            if uniprot_accession_re1.match(uniprot_accession) or \
              uniprot_accession_re2.match(uniprot_accession):
                fasta_file = '%s.fasta' % uniprot_accession
                cmd = 'wget http://www.uniprot.org/uniprot/%s' % fasta_file
                if os.system(cmd) != 0:
                    log(logger, "Unable to download sequence from UniProt\n")
                response = open(fasta_file, 'r')
                record = SeqIO.parse(response, 'fasta').next()
                response.close()
                os.remove(fasta_file)
                seguid = CheckSum.seguid(record.seq)
                sequence_objects = Sequence.objects.filter(
                    seguid__exact=seguid)
                if sequence_objects:
                    if TreeNodeAlignment.objects.filter(
                            sequence_header__sequence__in=sequence_objects,
                            tree_node__tree__family__active=True).exclude(
                                tree_node__tree__family__status__exact='bad'):
                        num_covered += 1
                        TNA = TreeNodeAlignment.objects.filter(
                            sequence_header__sequence__in=sequence_objects,
                            tree_node__tree__family__active=True,
                            tree_node__tree__family__family_type='G').exclude(
                                tree_node__tree__family__status__exact='bad')
                        if len(TNA) > 0:
                            GHG_num_covered += 1
                            GHG_over2_covered += get_family_size(TNA)
                        if TreeNodeAlignment.objects.filter(
                                sequence_header__sequence__in=sequence_objects,
                                tree_node__tree__family__active=True,
                                tree_node__tree__family__family_type='C'
                        ).exclude(
                                tree_node__tree__family__status__exact='bad'):
                            Pfam_num_covered += 1
                    else:
                        o.write("There are no families containing %s.\n" %
                                accession)
                else:
                    log(
                        logger, "%s is not in the PhyloFacts 3 database.\n" %
                        accession)
            else:
                print "The argument must be a valid UniProt accession\n"

    taxon = UniProtTaxonomy.objects.get(id=taxon_id)
    print "Coverage for %s" % taxon.scientific_name

    print "Source: EBI Reference Proteome (http://www.ebi.ac.uk/reference_proteomes/)"

    lineage = taxon.lineage()
    print "Taxonomy: %s" % "/".join([str(item) for item in lineage])

    print "Number of genes in genome = %d" % len(accessions)

    print "Number of genes covered = %d" % num_covered

    coverage_GHG = float(GHG_num_covered) / len(accessions)
    coverage_GHG_over2 = float(GHG_over2_covered) / len(accessions)
    coverage_Pfam = float(Pfam_num_covered) / len(accessions)
    coverage_any = float(num_covered) / len(accessions)

    log(logger, "Coverage = %g" % coverage_any)
    o.write(
        'taxon ID,scientific name,# of proteins,# covered,% covered,'
        '# covered by GHG,% covered by GHG,# covered by GHG of size>=3,'
        '% covered by GHG of size >=3,# covered by Pfam,% covered by Pfam\n')
    o.write('%s,%s,%d,%d,%3.1f,%d,%3.1f,%d,%3.1f,%d,%3.1f\n' %
            (taxon_id, taxon.scientific_name, len(accessions), num_covered,
             coverage_any * 100, GHG_num_covered, coverage_GHG * 100,
             GHG_over2_covered, coverage_GHG_over2 * 100, Pfam_num_covered,
             coverage_Pfam * 100))
    o.close()
def insertFamilyIntoDB(alignment_path,
                       assume_seed_first=False,
                       seed_id=None,
                       build_database_source="UniProt",
                       gathering_method="FlowerPower",
                       private=False,
                       family_specific_evalue_criterion=None,
                       family_specific_sw_method=None,
                       notes=None,
                       build_alignment_notes_id=0,
                       family_type_id='C'):

    workdir, alignment_filename = os.path.split(alignment_path)
    os.chdir(workdir)

    # Read in the alignment
    f = open(alignment_filename)
    alignments = AlignIO.parse(f, 'fasta')

    # AlignIO.parse returns an iterator over alignments. We only want one
    # alignment, so we take the first one (there shouldn't be any more
    # for our inputs anyhow)
    for alignment in alignments:
        break

    basename = os.path.splitext(alignment_filename)[0]
    msg = 'This file should have been created by buildFamily.py.'

    # Look for the ID mapping from SEQ num identifiers to fasta headers
    idmap_filename = basename + '.idmap'
    if not os.path.exists(idmap_filename):
        print "File %s not found. %s" % (idmap_filename, msg)
        return 1

    f = open(idmap_filename)
    idmap = cPickle.load(f)
    f.close()

    # Reverse the ID mapping
    seqid_of_description = {}
    for seqid in idmap:
        seqid_of_description[idmap[seqid]] = seqid

    # Assert an assumption from buildFamily: the idmap is a one-to-one mapping
    # between unique SEQ identifiers and unique headers
    assert (len(seqid_of_description.keys()) == len(idmap.keys()))

    sequence_of_seqid = {}
    aligned_sequence_of_seqid = {}
    sequence_header_of_seqid = {}
    seed_sequence_header = None
    num_aligned_columns = 0
    # For each sequence in the alignment, make sure records exist in the
    # sequence, sequence_header, and aligned_sequence tables
    for record in alignment:
        seed_sequence_header = update_sequence_info(
            record, seed_id, seqid_of_description, aligned_sequence_of_seqid,
            num_aligned_columns, sequence_of_seqid, sequence_header_of_seqid,
            assume_seed_first, seed_sequence_header)

    canonical_tree_method = ""
    root_of_method = {}
    if len(sequence_header_of_seqid) < 4:
        # There won't be an actual tree, so make a fake one
        root_of_method['trivial'] = node()
        for seqid in sequence_header_of_seqid.keys():
            child = node(seqid=seqid)
            child.branch_length = 1.0
            root_of_method['trivial'].addChild(child)
        canonical_tree_method = "trivial"
        ml_tree_filename = None
        nj_tree_filename = None
    else:
        nj_tree_filename = basename + ".nj.rooted.tre"
        ml_tree_filename = basename + ".fasttree.ml.rooted.tre"
        # TODO: We should also check that the tree files are nonempty.  If both are
        # empty we should create a trivial tree.
        if os.path.exists(ml_tree_filename):
            root_of_method['ml'] = node()
            f = open(ml_tree_filename)
            treeString = f.read()
            f.close()
            treeString = treeString.translate(trivial_translation,
                                              string.whitespace)
            root_of_method['ml'].readFromTreeString(treeString, 0)
            canonical_tree_method = "ml"
        if os.path.exists(nj_tree_filename):
            root_of_method['nj'] = node()
            f = open(nj_tree_filename)
            treeString = f.read()
            f.close()
            treeString = treeString.translate(trivial_translation,
                                              string.whitespace)
            root_of_method['nj'].readFromTreeString(treeString, 0)
            if canonical_tree_method == "":
                canonical_tree_method = "nj"

    if canonical_tree_method == "":
        print "No tree file found. %s" % msg
        return 1

    # Try to read the build date from a file, otherwise assume it is today
    build_date_filename = basename + ".build_date"
    build_date = datetime.date.today()
    if os.path.exists(build_date_filename):
        try:
            f = open(build_date_filename)
            year, month, day = [
                int(field) for field in f.read().strip().split('-')
            ]
            f.close()
            build_date = datetime.date(year, month, day)
        except ValueError:
            pass

    # Try to get the build_alignment_notes, if applicable
    if build_alignment_notes_id > 0:
        build_alignment_notes = BuildAlignmentNotes.objects.get(
            id__exact=build_alignment_notes_id)

    # At this point we have the minimum information necessary, namely an
    # alignment and a tree, so we can go ahead and create a family

    # TODO: We should set the status to "bad" here, and then update the status to
    # "draft" at the very end.
    family = Family.objects.create(
        build_database_source=build_database_source,
        build_date=build_date,
        status="draft",  # all families start out as draft
        private=private,
        gathering_method=gathering_method,
        family_type_id=family_type_id,
        partition="B",  # all families start in B partition
    )

    # Now we have created a new family accession
    family_accession = 'bpg%07d' % family.id
    print family_accession

    # Create the appropriate directories and symbolic links
    pfacts_base_dir = '/clusterfs/ohana/bpg/pfacts'
    dir1 = os.path.join(pfacts_base_dir, family_accession[0:4])
    dir2 = os.path.join(dir1, family_accession[0:7])
    dir3 = os.path.join(dir2, family_accession)
    if not os.path.exists(dir1):
        os.mkdir(dir1)
    if not os.path.exists(dir2):
        os.mkdir(dir2)
    if not os.path.exists(dir3):
        os.chdir(dir2)
        os.symlink(workdir, family_accession)
        os.chdir(workdir)

    # Create symbolic links to files
    os.symlink(alignment_filename, family_accession + '.a2m')
    os.symlink(idmap_filename, family_accession + '.idmap')
    if ml_tree_filename is not None and os.path.exists(ml_tree_filename):
        replace_seqids_by_seq_header_ids(ml_tree_filename,
                                         family_accession + '.ml',
                                         sequence_header_of_seqid)
    if nj_tree_filename is not None and os.path.exists(nj_tree_filename):
        replace_seqids_by_seq_header_ids(nj_tree_filename,
                                         family_accession + '.nj',
                                         sequence_header_of_seqid)

    # Link in the build_alignment_notes
    if build_alignment_notes_id > 0:
        family.build_alignment_notes = build_alignment_notes
        family.save()

    # Create the tree objects
    tree_of_method = {}
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            tree_of_method[method] = Tree.objects.create(family=family,
                                                         method=method,
                                                         is_rsd_rooted=False)

    canonical_tree = tree_of_method[canonical_tree_method]

    # Link the canonical tree to the family
    family.canonical_tree = canonical_tree
    family.save()

    # Link the seed sequence header to the family
    if seed_sequence_header:
        family.seed_sequence_header = seed_sequence_header
        family.save()

    # Do the modified pre-order tree traversal to find the leftIds and rightIds
    # of each of the nodes
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_of_method[method].updateLeftId(1, 0)

    # Create tree_node objects for each tree
    # The leaf nodes will be linked to sequence_header objects
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_of_method[method].createTreeNodeObjects(
                tree_of_method[method], sequence_header_of_seqid)

    # Link the family alignment to the root of each tree
    # It appears redundant to link the family alignment multiple times, but this
    # is not the case.  If we subsequently run SATCHMO-JS, we will create a new
    # tree for this family with method 'satchmo-js', and the alignment linked to
    # the root of that tree will be a different alignment, namely the one output
    # by SATCHMO-JS.  We will also link the SATCHMO-JS subalignments to the
    # internal nodes of that tree, and we may make that tree the canonical tree.
    for method in ['trivial', 'nj', 'ml']:
        if method in root_of_method:
            root_node = root_of_method[method].tree_node
            for seqid in aligned_sequence_of_seqid:
                TreeNodeAlignment.objects.create(
                    tree_node=root_node,
                    aligned_sequence=aligned_sequence_of_seqid[seqid],
                    sequence_header=sequence_header_of_seqid[seqid])

    # Now link some family data to the root of the canonical tree
    # TODO: Write a procedure for changing the canonical tree.  This procedure
    # must link all this family data to the root of the new canonical tree.
    canonical_root_node = root_of_method[canonical_tree_method].tree_node

    # Link the family HMMs
    sam_hmm_filename = basename + '.mod'
    if os.path.exists(sam_hmm_filename):
        os.symlink(sam_hmm_filename, family_accession + '.mod')
        sam_hmm = HMM.objects.create(length=num_aligned_columns,
                                     hmm_type='SAM',
                                     method='w0.5',
                                     tree_node=canonical_root_node)

    hmmer_hmm_filename = basename + '.hmm'
    if os.path.exists(hmmer_hmm_filename):
        # Rewrite the HMMER hmm so its name is the family id
        inf = open(hmmer_hmm_filename)
        hmm_lines = inf.readlines()
        inf.close()
        outf = open((family_accession + '.hmm'), "w")
        for line in hmm_lines:
            if line[0:4] == 'NAME':
                outf.write("NAME  %s\n" % family_accession)
            else:
                outf.write(line)
        outf.close()
        hmmer_hmm = HMM.objects.create(length=num_aligned_columns,
                                       hmm_type='HMMER3',
                                       method='hmmbuild',
                                       tree_node=canonical_root_node)

    # Link the family consensus sequence
    consensus_sequence_filename = basename + '.con.fa'
    if os.path.exists(consensus_sequence_filename):
        os.symlink(consensus_sequence_filename, family_accession + '.con.fa')
        f = open(consensus_sequence_filename)
        record = list(SeqIO.parse(f, "fasta"))[0]
        f.close()
        consensus_seguid = CheckSum.seguid(record.seq)
        # It's not inconceivable that the consensus sequence is already in the
        # sequence table.  Find the sequence record for this consensus sequence, or
        # create it if it isn't there

        consensus_sequence = _sequence(Sequence, record.seq.tostring(),
                                       consensus_seguid)

        # Link the consensus sequence to the family hmm
        hmmer_hmm_consensus = HMM_Consensus.objects.create(
            hmm=hmmer_hmm, sequence=consensus_sequence)
        # Link the consensus sequence to the canonical tree root.
        # It appears redundant that the sequence record is linked here, since the
        # sequence record is already linked to the hmm_consensus record.  The
        # reason for linking the sequence record directly to the
        # tree_node_consensus record is that the consensus sequence might not come
        # from an HMM; it might be derived directly from the alignment instead.  In
        # that case there would have been no hmm_consensus record linked to the
        # tree_node_consensus record.  But there must always be a sequence record
        # linked to every tree_node_consensus record.
        canonical_root_consensus = TreeNodeConsensus.objects.create(
            tree_node=canonical_root_node,
            sequence=consensus_sequence,
            method='hmm',
            hmm_consensus=hmmer_hmm_consensus)

    # Insert the alignment conservation
    os.symlink(basename + '.alignmentconservation.csv',
               family_accession + '.alignmentconservation.csv')
    insertAlignmentConservation(family_accession, canonical_root_node)

    # Link the PFAM domains
    insertPFAMPredictionsIntoDB(consensus_sequence, basename)

    # Link the signal peptide and transmembrane prediction
    phobius_filename = basename + '.phobius'
    if os.path.exists(phobius_filename):
        insertPhobiusPredictionsIntoDB(canonical_root_node, phobius_filename)

    # Link the homologous PDB structures
    insertPDBPredictionsIntoDB(hmmer_hmm, canonical_root_node, basename)

    if os.path.exists(build_date_filename):
        os.symlink(build_date_filename, family_accession + '.build_date')
Example #22
from Bio.Data import IUPACData
print IUPACData.ambiguous_dna_complement  #dictionary of complements
#and a lot more
from Bio.Data import CodonTable
print CodonTable.generic_by_id[2]

#SeqUtils. Several functions to deal with DNA and protein sequences.
#DNA utils
import Bio.SeqUtils as SeqUtils
print SeqUtils.GC('gacgatcggtattcgtag')	#GC content
from Bio.SeqUtils import MeltingTemp
print MeltingTemp.Tm_staluc('tgcagtacgtatcgt')	#DNA/RNA melting temperature
#checksum functions: a short alphanumeric signature of a file or sequence,
#usually written in the sequence description
#gcg is an easy, weak, widely used checksum (crc32 and crc64 are better)
from Bio.SeqUtils import CheckSum
myseq='acaagatgccattgtcccccggcctcctgctgctgct'
print CheckSum.gcg(myseq)
print CheckSum.crc32(myseq)
print CheckSum.crc64(myseq)
print CheckSum.seguid(myseq)
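#seguid above is (roughly) the base64-encoded SHA-1 digest of the upper-cased
#sequence with the trailing '=' padding stripped; an illustrative sketch of
#the same computation using only the standard library:
import hashlib, base64
print base64.b64encode(hashlib.sha1(myseq.upper()).digest()).rstrip('=')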
#Protein utils
from Bio.SeqUtils import ProtParam
myprot=ProtParam.ProteinAnalysis('MLTNK')
print myprot.count_amino_acids()
print myprot.get_amino_acids_percent()
print myprot.molecular_weight()
print myprot.aromaticity()
print myprot.instability_index()
print myprot.flexibility()
print myprot.isoelectric_point()
print myprot.secondary_structure_fraction()