Example #1
0
def blast2(subject, query):
  """
  Align query against subject with blastn and return the HSPs found.

  Both sequences are passed through clean_sequence, written to temporary
  FASTA files, and blastn is run with XML output (outfmt=5).

  Returns a list of dictionaries, one per HSP, with keys query_start,
  query_end, subject_start, subject_end, evalue, query, match and subject.

  Raises Exception if the blast command exits with a non-zero status. All
  temporary files are removed whether or not the call succeeds.
  """

  subject = clean_sequence(subject)
  query = clean_sequence(query)

  # every file we create is recorded here so the finally clause can remove
  # each of them independently of the others
  tmp_files = []

  try:
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
      subject_file = f.name
      tmp_files.append(subject_file)
      f.write(">Subject\n%s\n" % (subject,))

    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
      query_file = f.name
      tmp_files.append(query_file)
      f.write(">Query\n%s\n" % (query,))

    outfile = "%s.out.xml" % (query_file,)
    tmp_files.append(outfile)

    blast_cl = NcbiblastnCommandline(query=query_file, subject=subject_file,
                                     evalue=0.1, word_size=6,
                                     # these params were tested to allow gaps in
                                     # alignments. i.e. large number of bps
                                     # misaligned or gapped.
                                     gapextend=4, gapopen=0, reward=2,
                                     outfmt=5, out=outfile)
    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    r = subprocess.call(cl.split(" "))
    if r != 0:
      raise Exception("Blast failed: %s" % (cl,))

    res = []
    with open(outfile, "r") as f:
      blast_record = NCBIXML.read(f)
      for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
          res.append({ "query_start": hsp.query_start,
                       "query_end": hsp.query_end,
                       "subject_start": hsp.sbjct_start,
                       "subject_end": hsp.sbjct_end,
                       "evalue": hsp.expect,
                       "query": hsp.query,
                       "match": hsp.match,
                       "subject": hsp.sbjct, })
    return res

  finally:
    # best-effort cleanup: a file that was never created (e.g. the output file
    # when blast failed) must not prevent removal of the remaining ones
    for fn in tmp_files:
      try:
        os.unlink(fn)
      except OSError:
        pass
Example #2
0
  def save(self, *args, **kwargs):
    """
    Clean self.sequence strictly against the DNA or protein alphabet
    (chosen via self.is_dna()), then delegate to the parent class save.
    """
    from hippo import clean_sequence
    from Bio.Alphabet import IUPAC

    alphabet = IUPAC.unambiguous_dna if self.is_dna() else IUPAC.protein
    cleaned = clean_sequence(self.sequence, strict=True, alphabet=alphabet)
    self.sequence = cleaned
    return super(Feature,self).save(*args, **kwargs)
Example #3
0
  def make_feature(dbname, feature):
    """
    Format a feature as a FASTA record for the database named dbname.

    The sequence is cleaned strictly against the DNA or protein alphabet
    (chosen via feature.is_dna()) with exception=False. Returns the FASTA
    text, or None when clean_sequence returns None.
    """
    from hippo import clean_sequence, Blast_Accession
    from Bio.Alphabet import IUPAC

    alphabet = IUPAC.unambiguous_dna if feature.is_dna() else IUPAC.protein
    cleaned = clean_sequence(feature.sequence, strict=True, alphabet=alphabet, exception=False)
    if cleaned is None:
      return None

    accession = Blast_Accession.make(type=feature.type.type,
                                     feature_id=feature.id,
                                     feature_length=len(cleaned))
    return ">gnl|%s|%s %s\n%s\n" % (dbname, accession, feature.name, cleaned)
Example #4
0
def _blast2(params, is_ajax):
    """
    Post query and subject sequences, returns alignment of the two using blastn.
    Expects: query, subject
    Optional: jsonp (callback name; the JSON is wrapped in a call to it and
    served as text/javascript)
    Response: JSON encoding of whatever features.blast2 returns, or an empty
    JSON list if either query or subject is missing.
    """

    if 'subject' not in params or 'query' not in params:
        res = []
    else:
        subject = clean_sequence(params['subject'])
        query = clean_sequence(params['query'])
        res = features.blast2(subject, query)

    j = json.JSONEncoder().encode(res)

    # content_type is used instead of the older mimetype keyword, which newer
    # Django releases no longer accept
    if 'jsonp' in params:
        # NOTE(review): the callback name is echoed back without validation;
        # consider restricting it to a safe identifier pattern.
        j = params['jsonp']+'('+j+')'
        http_res = HttpResponse(j, content_type="text/javascript", status=httplib.OK)
    elif is_ajax:
        http_res = HttpResponse(j, content_type="application/json", status=httplib.OK)
    else:
        # for non-AJAX requests the default content type is kept: with
        # "application/json" browsers force the user to download a file, and
        # for debugging we want to be able to see the JSON in the browser.
        http_res = HttpResponse(j, status=httplib.OK)

    # allow cross origin API calls
    http_res['Access-Control-Allow-Origin'] = '*'
    http_res['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    http_res['Access-Control-Max-Age'] = 1000
    return http_res
Example #5
0
  def __build_db(self, dna_or_protein, features):
    """
    Build a BLAST database from the features whose dna_or_protein attribute
    equals the given value.

    features: iterable of features to consider, or None to use
    self.features.all(). Nothing is done when features is None and
    self.features is empty.

    Each matching feature is cleaned strictly against the DNA or protein
    alphabet; features for which clean_sequence returns None are skipped. If
    at least one sequence was written, makeblastdb is run on the temporary
    FASTA file. The temporary file is removed even when makeblastdb fails
    (subprocess.check_output raises CalledProcessError on non-zero exit).
    """
    import os, tempfile, subprocess
    from Bio.Alphabet import IUPAC
    from hippo import clean_sequence, Blast_Accession

    if features is None and self.features.count() == 0:
      return

    if features is None:
      features = self.features.all()

    is_dna = False
    nadded = 0
    infile = None

    try:
      with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        infile = f.name

        for feature in features:
          if feature.dna_or_protein != dna_or_protein:
            continue

          # note: is_dna ends up reflecting the last matching feature
          is_dna = bool(feature.is_dna())
          alphabet = IUPAC.unambiguous_dna if is_dna else IUPAC.protein

          data = clean_sequence(feature.sequence, strict=True, alphabet=alphabet, exception=False)
          if data is None:
            continue

          f.write(">gnl|%s|%s %s\n%s\n" % (
                  self.name,
                  Blast_Accession.make(type=feature.type.type, feature_id=feature.id, feature_length=len(data)),
                  feature.name, data))
          nadded += 1

      if nadded > 0:
        outfn = self.dna_db_name() if is_dna else self.protein_db_name()
        dbtype = self.dna_db_type() if is_dna else self.protein_db_type()

        # pass an argument list rather than splitting a formatted string, so
        # that names or paths containing spaces stay single arguments
        cmd = ["%s/makeblastdb" % (settings.NCBI_BIN_DIR,),
               "-in", "%s" % (infile,),
               "-out", "%s" % (outfn,),
               "-title", "%s" % (self.name,),
               "-dbtype", "%s" % (dbtype,),
               "-parse_seqids",
               "-input_type", "fasta"]

        r = subprocess.check_output(cmd)
        if 'Adding sequences from FASTA' not in r:
          print(r)

    finally:
      if infile is not None:
        try:
          os.unlink(infile)
        except OSError:
          pass
Example #6
0
def find_restriction_sites(sequence, circular=True):
  """
  Search the cleaned sequence with MyEnzymes and return a list of
  Restriction_Site objects, one per recognized cut.

  When circular is True the sequence is doubled before searching; sites whose
  start falls beyond the original sequence length are dropped and end / cut
  positions are wrapped back onto the original length.

  Raises Exception if an enzyme's elucidated pattern has no '^' marker, or if
  the reported cut matches the enzyme site on neither strand.
  """
  input_seq = clean_sequence(sequence)
  seq_len = len(input_seq)
  input2 = Seq(input_seq+input_seq) if circular is True else Seq(input_seq)

  hits = MyEnzymes.search(input2)
  cutter_list = []

  for enzyme in hits:
    for cut in hits[enzyme]:
      cut_after = cut-1
      if cut_after <= 0:
        cut_after += len(input2)

      # elucidate() marks the cut position with '^'; drop the '_' markers
      pattern = enzyme.elucidate().replace('_', '')
      cut_off = pattern.find('^')
      if cut_off < 0:
        raise Exception('Cannot find cut site for %s (%s)' % (enzyme, pattern))

      site = enzyme.site.lower()

      # first try fwd
      start = cut-cut_off-1
      end = start+enzyme.size-1
      forward = str(input2[start:end+1]).lower() == site

      if not forward:
        # then try the reverse complement
        end = cut+cut_off+1
        start = end-enzyme.size+1
        if str(input2[start:end+1].reverse_complement()).lower() != site:
          raise Exception('Cannot find reported cut site %s %s %s %s' % (enzyme, cut, cut_off, pattern))

      # sites starting in the second copy of a doubled sequence are ignored
      if start >= seq_len:
        continue

      end = end % seq_len
      cut_after = cut_after % seq_len
      cutter_list.append(Restriction_Site(enzyme, start+1, end+1, forward, cut_after))

  return cutter_list
Example #7
0
def blast(sequence, dbobj, input_type='dna', protein=False,
          identity_threshold=0.85, evalue_threshold=0.1, feature_threshold=None, circular=True):
  """
  Blast sequence against specified feature database. If input type is 'dna',
  using blastn if protein=False (default), or blastx if protein=True. If input
  type is 'protein', using tblastn if protein=False, or blastp if protein=True.

  identity_threshold: only return results with identity rate greater than this
  threshold. Can be None. Default is 0.85.

  evalue_threshold: only return results with evalue smaller than this
  threshold. Default is 0.1.

  feature_threshold: only return results that span at least this amount of a
  feature. Can be None (default). E.g. if set to 0.99, only results spanning an
  entire feature are returned.

  circular: if True (default), the query is doubled before blasting; hits that
  start in the second copy are ignored and hit ends beyond the original length
  are wrapped around.

  Returns a list of Aligned_Feature objects. Returns an empty list if the blast
  command exits with a non-zero status. Temporary files are removed in all
  cases.
  """

  cleaned = clean_sequence(sequence)
  seq_len = len(cleaned)
  blast_input = cleaned+cleaned if circular is True else cleaned

  # every file we create is recorded here so the finally clause can remove
  # each of them independently of the others
  tmp_files = []

  try:
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as query_fh:
      infile = query_fh.name
      tmp_files.append(infile)
      query_fh.write(">Query\n%s\n" % (blast_input,))

    outfile = "%s.out.xml" % (infile,)
    tmp_files.append(outfile)

    # the four blast flavors differ only in program, database and word size
    if protein:
      commandline = NcbiblastxCommandline if input_type == 'dna' else NcbiblastpCommandline
      db_name = dbobj.protein_db_name()
      word_size = 3
    else:
      commandline = NcbiblastnCommandline if input_type == 'dna' else NcbitblastnCommandline
      db_name = dbobj.dna_db_name()
      word_size = 6

    blast_cl = commandline(query=infile, db="%s" % (db_name,), soft_masking=True,
                           evalue=evalue_threshold, word_size=word_size, outfmt=5,
                           out=outfile, max_target_seqs=500)

    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    r = subprocess.call(cl.split(" "))
    if r != 0:
      # blast can fail if blastdb is not there, which can happen if there were
      # no sequences available to build a db
      print("Blast failed: %s" % (cl,))
      return []

    with open(outfile, "r") as xml_fh:
      blast_record = NCBIXML.read(xml_fh)

  finally:
    # best-effort cleanup: a file that was never created (e.g. the output file
    # when blast failed) must not prevent removal of the remaining ones
    for fn in tmp_files:
      try:
        os.unlink(fn)
      except OSError:
        pass

  feature_list = []
  for alignment in blast_record.alignments:
    accession = Blast_Accession(alignment.accession)
    for hsp in alignment.hsps:

      # since we doubled up the input, ignore hits starting after the input
      if hsp.query_start > seq_len:
        continue

      # check identity threshold
      if identity_threshold is not None and \
         1.0*hsp.identities/len(hsp.sbjct) < identity_threshold:
        continue

      # subject coordinates are reported in hit direction; normalize them
      hit_start = min(hsp.sbjct_start, hsp.sbjct_end)
      hit_end = max(hsp.sbjct_start, hsp.sbjct_end)

      # check feature threshold
      if feature_threshold is not None and \
         1.0*(1+abs(hit_end-hit_start))/accession.feature_length < feature_threshold:
        continue

      start = hsp.query_start
      end = hsp.query_end
      if end > seq_len:
        # hit runs into the second copy of the input: wrap around
        end = end % seq_len

      # partial hits get the covered range appended to their label
      label = alignment.hit_def
      if hit_start != 1 or hit_end != accession.feature_length:
        label = '%s (%s-%s/%s)' % (label, hit_start, hit_end, accession.feature_length)

      aligned = Aligned_Feature(alignment.hit_def, label,
                                start, end, hsp.sbjct_start, hsp.sbjct_end,
                                accession.type,
                                hsp.query, hsp.match, hsp.sbjct,
                                hsp.expect, hsp.identities)
      setattr(aligned, 'feature_id', accession.feature_id)
      feature_list.append(aligned)

  # remove truncated features across circular boundary
  filtered = []
  for feat in feature_list:
    # a hit starting at position 1 is trumped by another hit with the same
    # ending and feature that crosses the circular boundary (start > end)
    trumped = feat.query_start == 1 and any(
      other.query_start != feat.query_start and
      other.query_end == feat.query_end and
      other.feature_id == feat.feature_id and
      other.query_start > other.query_end
      for other in feature_list)
    if not trumped:
      filtered.append(feat)

  return filtered
Example #8
0
def _post(params, is_ajax):
    """
    Post a sequence and run the sequence through blast and orf detection.
    Expects: db and sequence
    Optional: input ('protein', anything else is handled as DNA; default
    'dna'), gbonly, blastonly, identity_threshold, feature_threshold,
    circular, jsonp
    Response: JSON list of [sequence length, features sorted by query_start,
    cleaned sequence]

    If the sequence starts with 'LOCUS' it is parsed as genbank; when parsing
    fails the sequence is treated as empty.
    """

    from hippo.models import Feature_Database

    truthy = ('1', 'true', 'True')

    is_gb = False
    db_name = params['db'].strip()
    db = Feature_Database.objects.get(name=db_name)

    sequence = params['sequence']
    gb_features = []

    # parse genbank
    if sequence.strip().startswith('LOCUS'):
        is_gb = True
        try:
            sequence, gb_features = gb.parse_genbank(sequence.lstrip())
        except Exception:
            sequence = ""
            gb_features = []

    # clean sequence
    input_type = params['input'] if 'input' in params else 'dna'
    if input_type == 'protein':
        sequence = clean_sequence(sequence, alphabet=IUPAC.protein)
    else:
        sequence = clean_sequence(sequence)

    feature_list = list(gb_features)
    gbonly = 'gbonly' in params and params['gbonly'] in truthy
    blastonly = 'blastonly' in params and params['blastonly'] in truthy

    # genbank input with gbonly set skips all feature detection
    if not is_gb or gbonly is False:
        args = {}
        if 'identity_threshold' in params:
            args['identity_threshold'] = float(params['identity_threshold'])
        if 'feature_threshold' in params:
            args['feature_threshold'] = float(params['feature_threshold'])

        circular = True
        if 'circular' in params and str(params['circular']).strip().lower() in ('false', '0'):
            circular = False

        # feature detection
        feature_list += features.blast(sequence, db, input_type=input_type, protein=False, circular=circular, **args)
        feature_list += features.blast(sequence, db, input_type=input_type, protein=True, circular=circular, **args)

        if input_type == 'dna' and blastonly is False:
            # restriction site search
            feature_list += features.find_restriction_sites(sequence, circular=circular)
            # ORFs and tags
            orf_list, tag_list = orfs.detect_orfs_and_tags(sequence, circular=circular)
            feature_list += orf_list
            feature_list += tag_list

    res = [x.to_dict() for x in feature_list]

    # now sort everything by start; key= gives the same stable ordering as
    # the old cmp= form and is also accepted by newer Python versions
    res.sort(key=lambda x: int(x['query_start']))

    res = [len(sequence),res,sequence]
    j = json.JSONEncoder().encode(res)

    # content_type is used instead of the older mimetype keyword, which newer
    # Django releases no longer accept
    if 'jsonp' in params:
        # NOTE(review): the callback name is echoed back without validation;
        # consider restricting it to a safe identifier pattern.
        j = params['jsonp']+'('+j+')'
        http_res = HttpResponse(j, content_type="text/javascript", status=httplib.OK)
    elif is_ajax:
        http_res = HttpResponse(j, content_type="application/json", status=httplib.OK)
    else:
        # for non-AJAX requests the default content type is kept: with
        # "application/json" browsers force the user to download a file, and
        # for debugging we want to be able to see the JSON in the browser.
        http_res = HttpResponse(j, status=httplib.OK)

    # allow cross origin API calls
    http_res['Access-Control-Allow-Origin'] = '*'
    http_res['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    http_res['Access-Control-Max-Age'] = 1000

    return http_res