Python clean_sequence Examples, hippo.clean_sequence Python Examples

Example #1

0

Show file

File: features.py Project: benjiec/giraffe

def blast2(subject, query):
  """
  Align two sequences against each other with blastn.

  Both sequences are passed through clean_sequence, written to temporary
  FASTA files and compared by running the blastn command line (built with
  NcbiblastnCommandline) from settings.NCBI_BIN_DIR. The XML report is parsed
  with NCBIXML.

  Returns a list of dictionaries, one per HSP, with the keys query_start,
  query_end, subject_start, subject_end, evalue, query, match and subject.

  Raises Exception if the blast command exits with a non-zero status. All
  temporary files are removed whether or not the run succeeds.
  """
  subject = clean_sequence(subject)
  query = clean_sequence(query)

  # each temp path is recorded as soon as it is known, so the finally clause
  # can remove every file independently of the others
  tmp_paths = []
  try:
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
      subject_file = f.name
      tmp_paths.append(subject_file)
      f.write(">Subject\n%s\n" % (subject,))

    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
      query_file = f.name
      tmp_paths.append(query_file)
      f.write(">Query\n%s\n" % (query,))

    outfile = "%s.out.xml" % (query_file,)
    tmp_paths.append(outfile)
    blast_cl = NcbiblastnCommandline(query=query_file, subject=subject_file,
                                     evalue=0.1, word_size=6,
                                     # these params were tested to allow gaps in
                                     # alignments. i.e. large number of bps
                                     # misaligned or gapped.
                                     gapextend=4, gapopen=0, reward=2,
                                     outfmt=5, out=outfile)
    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    r = subprocess.call(cl.split(" "))
    if r != 0:
      raise Exception("Blast failed: %s" % (cl,))

    res = []
    with open(outfile, "r") as f:
      blast_record = NCBIXML.read(f)
      for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
          res.append({ "query_start": hsp.query_start,
                       "query_end": hsp.query_end,
                       "subject_start": hsp.sbjct_start,
                       "subject_end": hsp.sbjct_end,
                       "evalue": hsp.expect,
                       "query": hsp.query,
                       "match": hsp.match,
                       "subject": hsp.sbjct, })
    return res

  finally:
    for path in tmp_paths:
      try:
        os.unlink(path)
      except OSError:
        # e.g. blast failed before creating its output file; a missing file
        # must not stop the remaining files from being removed
        pass

Example #2

0

Show file

File: models.py Project: benjiec/giraffe

  def save(self, *args, **kwargs):
    """
    Run the sequence through clean_sequence, then save via the parent class.

    The alphabet handed to clean_sequence is IUPAC.unambiguous_dna when
    self.is_dna() is true and IUPAC.protein otherwise. Positional and keyword
    arguments are forwarded unchanged to the parent save(), whose return value
    is returned.
    """
    from hippo import clean_sequence
    from Bio.Alphabet import IUPAC

    alphabet = IUPAC.unambiguous_dna if self.is_dna() else IUPAC.protein
    self.sequence = clean_sequence(self.sequence, strict=True, alphabet=alphabet)
    return super(Feature,self).save(*args, **kwargs)

Example #3

0

Show file

File: models.py Project: UndeadBlow/giraffe

  def save(self, *args, **kwargs):
    """
    Clean self.sequence with the alphabet matching the feature, then delegate
    to the parent class save().

    DNA features (self.is_dna() true) are cleaned against
    IUPAC.unambiguous_dna, everything else against IUPAC.protein. All
    arguments are passed through to the parent implementation.
    """
    from hippo import clean_sequence
    from Bio.Alphabet import IUPAC

    # pick the alphabet first so the cleaning call reads as a single step
    alphabet = IUPAC.unambiguous_dna if self.is_dna() else IUPAC.protein
    cleaned = clean_sequence(self.sequence, strict=True, alphabet=alphabet)
    self.sequence = cleaned
    return super(Feature,self).save(*args, **kwargs)

Example #4

0

Show file

File: views.py Project: UndeadBlow/giraffe

def _blast2(params, is_ajax):
    """
    Align a posted query sequence against a posted subject sequence.

    Expects: query, subject
    Response: the JSON-encoded result of features.blast2, or an empty JSON
    list when either parameter is missing. If a 'jsonp' parameter is present
    the JSON is wrapped in a call to that name and served as text/javascript.
    """

    if 'subject' in params and 'query' in params:
        subject = clean_sequence(params['subject'])
        query = clean_sequence(params['query'])
        res = features.blast2(subject, query)
    else:
        res = []

    payload = json.JSONEncoder().encode(res)

    if 'jsonp' in params:
        payload = params['jsonp'] + '(' + payload + ')'
        response = HttpResponse(payload,
                                mimetype="text/javascript",
                                status=httplib.OK)
    elif is_ajax:
        response = HttpResponse(payload,
                                mimetype="application/json",
                                status=httplib.OK)
    else:
        # technically we should be returning "application/json", but in that
        # case browsers force user to download into a file, and for debugging
        # we want to be able to see the JSON list in browser. looks like most
        # browsers will handle JSON sent back as text/html anyways.
        response = HttpResponse(payload, status=httplib.OK)

    # allow cross origin API calls
    response['Access-Control-Allow-Origin'] = '*'
    response['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    response['Access-Control-Max-Age'] = 1000
    return response

Example #5

0

Show file

File: features.py Project: UndeadBlow/giraffe

def find_restriction_sites(sequence, circular=True):
    """
    Locate restriction enzyme sites in a sequence.

    The cleaned sequence is searched with MyEnzymes.search. When circular is
    True the sequence is doubled before searching. Every reported cut is
    matched against the enzyme's site, first on the forward strand and then
    on the reverse complement.

    Returns a list of Restriction_Site objects.

    Raises Exception if an enzyme pattern contains no '^' cut marker, or if a
    reported cut cannot be matched on either strand.
    """
    input_seq = clean_sequence(sequence)
    searched = Seq(input_seq + input_seq if circular is True else input_seq)
    hits = MyEnzymes.search(searched)

    cutter_list = []
    for enzyme in hits:
        for cut in hits[enzyme]:
            cut_after = cut - 1
            if cut_after <= 0:
                cut_after += len(searched)

            pattern = re.sub(r'_', '', enzyme.elucidate())
            cut_off = pattern.find('^')
            if cut_off < 0:
                raise Exception('Cannot find cut site for %s (%s)' %
                                (enzyme, pattern))

            # forward strand first
            forward = True
            start = cut - cut_off - 1
            end = start + enzyme.size - 1
            if str(searched[start:end + 1]).lower() != enzyme.site.lower():
                # no forward match: try the reverse complement instead
                forward = False
                end = cut + cut_off + 1
                start = end - enzyme.size + 1
                candidate = searched[start:end + 1].reverse_complement()
                if str(candidate).lower() != enzyme.site.lower():
                    raise Exception(
                        'Cannot find reported cut site %s %s %s %s' %
                        (enzyme, cut, cut_off, pattern))

            # only sites starting within the original (undoubled) sequence
            # are reported
            if start >= len(input_seq):
                continue

            end = end % len(input_seq)
            cut_after = cut_after % len(input_seq)
            cutter_list.append(
                Restriction_Site(enzyme, start + 1, end + 1, forward,
                                 cut_after))

    return cutter_list

Example #6

0

Show file

File: models.py Project: benjiec/giraffe

  def make_feature(dbname, feature):
    """
    Format one feature as a FASTA record for a BLAST database.

    The feature sequence is cleaned with clean_sequence (strict, with
    exception=False) against IUPAC.unambiguous_dna for DNA features and
    IUPAC.protein otherwise.

    Returns the record as a string of the form
    ">gnl|<dbname>|<accession> <feature name>\n<sequence>\n", or None when
    clean_sequence returns None.
    """
    from hippo import clean_sequence, Blast_Accession
    from Bio.Alphabet import IUPAC

    alphabet = IUPAC.unambiguous_dna if feature.is_dna() else IUPAC.protein
    data = clean_sequence(feature.sequence, strict=True, alphabet=alphabet, exception=False)
    if data is None:
      return None

    accession = Blast_Accession.make(type=feature.type.type,
                                     feature_id=feature.id,
                                     feature_length=len(data))
    return ">gnl|%s|%s %s\n%s\n" % (dbname, accession, feature.name, data)

Example #7

0

Show file

File: views.py Project: benjiec/giraffe

def _blast2(params, is_ajax):
    """
    Post query and subject sequences, returns alignment of the two using blastn.

    Expects: query, subject
    Response: JSON encoding of what features.blast2 returns; an empty JSON
    list if either sequence is missing. A 'jsonp' parameter, when given, is
    used as the name of a wrapping callback.
    """

    have_both = 'subject' in params and 'query' in params
    if have_both:
        subject = clean_sequence(params['subject'])
        query = clean_sequence(params['query'])
        res = features.blast2(subject, query)
    else:
        res = []

    body = json.JSONEncoder().encode(res)

    if 'jsonp' in params:
        body = params['jsonp']+'('+body+')'
        reply = HttpResponse(body,mimetype="text/javascript",status=httplib.OK)
    elif is_ajax:
        reply = HttpResponse(body,mimetype="application/json",status=httplib.OK)
    else:
        # technically we should be returning "application/json", but in that
        # case browsers force user to download into a file, and for debugging
        # we want to be able to see the JSON list in browser. looks like most
        # browsers will handle JSON sent back as text/html anyways.
        reply = HttpResponse(body,status=httplib.OK)

    # allow cross origin API calls
    reply['Access-Control-Allow-Origin'] = '*'
    reply['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    reply['Access-Control-Max-Age'] = 1000
    return reply

Example #8

0

Show file

File: models.py Project: UndeadBlow/giraffe

  def __build_db(self, dna_or_protein, features):
    """
    Build a BLAST database from the features of one kind.

    dna_or_protein: only features whose dna_or_protein attribute equals this
    value are written to the database.

    features: iterable of features to use; if None, self.features.all() is
    used. Nothing is done when features is None and self.features is empty.

    Matching features are cleaned with clean_sequence and written as FASTA
    records to a temporary file; features for which clean_sequence returns
    None are skipped. If at least one record was written, makeblastdb from
    settings.NCBI_BIN_DIR is run on that file. The temporary file is removed
    even if writing it or running makeblastdb raises.
    """
    import os
    import subprocess
    import tempfile
    from Bio.Alphabet import IUPAC
    from hippo import clean_sequence, Blast_Accession

    if features is None and self.features.count() == 0:
      return

    if features is None:
      features = self.features.all()

    is_dna = False
    infile = None
    nadded = 0

    try:
      with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        infile = f.name

        for feature in features:
          if feature.dna_or_protein != dna_or_protein:
            continue

          if feature.is_dna():
            is_dna = True
            alphabet = IUPAC.unambiguous_dna
          else:
            is_dna = False
            alphabet = IUPAC.protein

          data = clean_sequence(feature.sequence, strict=True, alphabet=alphabet, exception=False)
          if data is not None:
            f.write(">gnl|%s|%s %s\n%s\n" % (
                    self.name,
                    Blast_Accession.make(type=feature.type.type, feature_id=feature.id, feature_length=len(data)),
                    feature.name, data))
            nadded += 1

      if nadded > 0:
        # is_dna reflects the last matching feature seen in the loop above
        outfn = self.dna_db_name() if is_dna else self.protein_db_name()
        dbtype = self.dna_db_type() if is_dna else self.protein_db_type()

        # build the argument list directly instead of splitting a formatted
        # string on spaces, so names and paths containing spaces stay intact
        cmd = ["%s/makeblastdb" % (settings.NCBI_BIN_DIR,),
               "-in", "%s" % (infile,),
               "-out", "%s" % (outfn,),
               "-title", "%s" % (self.name,),
               "-dbtype", "%s" % (dbtype,),
               "-parse_seqids",
               "-input_type", "fasta"]

        r = subprocess.check_output(cmd)
        if 'Adding sequences from FASTA' not in r:
          print(r)

    finally:
      if infile is not None:
        try:
          os.unlink(infile)
        except OSError:
          # cleanup is best effort; do not mask an error raised above
          pass

Example #9

0

Show file

File: features.py Project: benjiec/giraffe

def find_restriction_sites(sequence, circular=True):
  """
  Find restriction enzyme sites in a sequence.

  The sequence is cleaned with clean_sequence and searched with
  MyEnzymes.search; if circular is True it is doubled before the search.
  Each cut reported by the search is checked against the enzyme site on the
  forward strand, then on the reverse complement.

  Returns a list of Restriction_Site objects.

  Raises Exception if the enzyme pattern has no '^' marker, or if neither
  strand matches the enzyme site at a reported cut.
  """
  input_seq = clean_sequence(sequence)
  searched = Seq(input_seq+input_seq if circular is True else input_seq)
  found = MyEnzymes.search(searched)

  cutter_list = []
  for enzyme in found:
    for cut in found[enzyme]:
      cut_after = cut-1
      if cut_after <= 0:
        cut_after += len(searched)

      pattern = re.sub(r'_', '', enzyme.elucidate())
      cut_off = pattern.find('^')
      if cut_off < 0:
        raise Exception('Cannot find cut site for %s (%s)' % (enzyme, pattern))

      # first try fwd
      is_forward = True
      start = cut-cut_off-1
      end = start+enzyme.size-1
      if str(searched[start:end+1]).lower() != enzyme.site.lower():
        # fwd did not match, try the reverse complement
        is_forward = False
        end = cut+cut_off+1
        start = end-enzyme.size+1
        rc = searched[start:end+1].reverse_complement()
        if str(rc).lower() != enzyme.site.lower():
          raise Exception('Cannot find reported cut site %s %s %s %s' % (enzyme, cut, cut_off, pattern))

      # only sites starting within the original (undoubled) sequence are kept
      if start >= len(input_seq):
        continue

      end = end % len(input_seq)
      cut_after = cut_after % len(input_seq)
      cutter_list.append(Restriction_Site(enzyme, start+1, end+1, is_forward, cut_after))

  return cutter_list

Example #10

0

Show file

File: views.py Project: UndeadBlow/giraffe

def _post(params, is_ajax):
    """
    Post a sequence and run the sequence through blast and orf detection.

    Expects: db and sequence. Optional: input ('dna' by default, or
    'protein'), gbonly, blastonly, identity_threshold, feature_threshold,
    circular, jsonp.

    Response: JSON list of the form [sequence length, features, sequence],
    where features is the list of feature dictionaries sorted by
    query_start. If 'jsonp' is given the JSON is wrapped in a call to that
    name and served as text/javascript.

    Raises KeyError if 'db' or 'sequence' is missing from params.
    """

    from hippo.models import Feature_Database

    is_gb = False
    db_name = params['db'].strip()
    db = Feature_Database.objects.get(name=db_name)

    sequence = params['sequence']
    gb_features = []

    # parse genbank
    if sequence.strip().startswith('LOCUS'):
        is_gb = True
        try:
            sequence, gb_features = gb.parse_genbank(sequence.lstrip())
        except Exception:
            # deliberate best effort: an unparseable record is handled as an
            # empty sequence with no features
            sequence = ""
            gb_features = []

    # clean sequence
    input_type = params['input'] if 'input' in params else 'dna'
    if input_type == 'protein':
        sequence = clean_sequence(sequence, alphabet=IUPAC.protein)
    else:
        sequence = clean_sequence(sequence)

    feature_list = gb_features
    truthy = ['1', 'true', 'True']
    gbonly = 'gbonly' in params and params['gbonly'] in truthy
    blastonly = 'blastonly' in params and params['blastonly'] in truthy

    if not is_gb or gbonly is False:
        args = {}
        if 'identity_threshold' in params:
            args['identity_threshold'] = float(params['identity_threshold'])
        if 'feature_threshold' in params:
            args['feature_threshold'] = float(params['feature_threshold'])

        # the value is stringified before the comparison, so only string
        # forms need to be listed
        circular = True
        if 'circular' in params and \
           str(params['circular']).strip().lower() in ['false', '0']:
            circular = False

        # feature detection
        feature_list += features.blast(sequence,
                                       db,
                                       input_type=input_type,
                                       protein=False,
                                       circular=circular,
                                       **args)
        feature_list += features.blast(sequence,
                                       db,
                                       input_type=input_type,
                                       protein=True,
                                       circular=circular,
                                       **args)

        if input_type == 'dna' and blastonly is False:
            # restriction site search
            feature_list += features.find_restriction_sites(sequence,
                                                            circular=circular)
            # ORFs and tags
            orf_list, tag_list = orfs.detect_orfs_and_tags(sequence,
                                                           circular=circular)
            feature_list += orf_list
            feature_list += tag_list

    res = [x.to_dict() for x in feature_list]

    # now sort everything by start; key= gives the same stable ordering as the
    # former cmp= comparison and also works where cmp= is unavailable
    res.sort(key=lambda item: int(item['query_start']))

    res = [len(sequence), res, sequence]
    j = json.JSONEncoder().encode(res)

    if 'jsonp' in params:
        j = params['jsonp'] + '(' + j + ')'
        http_res = HttpResponse(j,
                                mimetype="text/javascript",
                                status=httplib.OK)
    elif is_ajax:
        http_res = HttpResponse(j,
                                mimetype="application/json",
                                status=httplib.OK)
    else:
        # technically we should be returning "application/json", but in that
        # case browsers force user to download into a file, and for debugging
        # we want to be able to see the JSON list in browser. looks like most
        # browsers will handle JSON sent back as text/html anyways.
        http_res = HttpResponse(j, status=httplib.OK)

    # allow cross origin API calls
    http_res['Access-Control-Allow-Origin'] = '*'
    http_res['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    http_res['Access-Control-Max-Age'] = 1000

    return http_res

Example #11

0

Show file

File: features.py Project: benjiec/giraffe

def blast(sequence, dbobj, input_type='dna', protein=False,
          identity_threshold=0.85, evalue_threshold=0.1, feature_threshold=None, circular=True):
  """
  Blast sequence against specified feature database. If input type is 'dna',
  using blastn if protein=False (default), or blastx if protein=True. If input
  type is 'protein', using tblastn if protein=False, or blastp if protein=True.

  identity_threshold: only return results with identity rate greater than this
  threshold. Can be None. Default is 0.85.

  evalue_threshold: only return results with evalue smaller than this
  threshold. Default is 0.1.

  feature_threshold: only return results that span at least this amount of a
  feature. Can be None (default). E.g. if set to 0.99, only results spanning an
  entire feature are returned.

  circular: if True (default), the cleaned sequence is doubled before
  blasting. Hits starting after the end of the original sequence are ignored
  and hit ends beyond it are wrapped back with a modulo.

  Returns a list of Aligned_Feature objects, each with an added feature_id
  attribute. Returns an empty list if the blast command exits with a non-zero
  status. The temporary query and output files are removed in every case.
  """

  feature_list = []
  # named query_seq rather than input so the builtin is not shadowed
  query_seq = clean_sequence(sequence)
  query_len = len(query_seq)
  if circular is True:
    blast_input = query_seq+query_seq
  else:
    blast_input = query_seq

  # each temp path is recorded as soon as it is known, so the finally clause
  # can remove every file independently of the others
  tmp_paths = []
  try:
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
      infile = f.name
      tmp_paths.append(infile)
      f.write(">Query\n%s\n" % (blast_input,))

    outfile = "%s.out.xml" % (infile,)
    tmp_paths.append(outfile)

    # the four program variants differ only in command class, database and
    # word size
    if protein:
      db_name = dbobj.protein_db_name()
      word_size = 3
      commandline = NcbiblastxCommandline if input_type == 'dna' else NcbiblastpCommandline
    else:
      db_name = dbobj.dna_db_name()
      word_size = 6
      commandline = NcbiblastnCommandline if input_type == 'dna' else NcbitblastnCommandline

    blast_cl = commandline(query=infile, db="%s" % (db_name,), soft_masking=True,
                           evalue=evalue_threshold, word_size=word_size, outfmt=5, out=outfile,
                           max_target_seqs=500)

    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    r = subprocess.call(cl.split(" "))
    if r != 0:
      # blast can fail if blastdb is not there, which can happen if there were no
      # sequences available to build a db
      print("Blast failed: %s" % (cl,))
      return []

    with open(outfile, "r") as report:
      blast_record = NCBIXML.read(report)
      for alignment in blast_record.alignments:
        accession = Blast_Accession(alignment.accession)
        for hsp in alignment.hsps:

          # since we doubled up the input, ignore hits starting after the input
          if hsp.query_start > query_len:
            continue

          # check identity threshold
          if identity_threshold is not None and \
             1.0*hsp.identities/len(hsp.sbjct) < identity_threshold:
            continue

          # subject coordinates may be reported in either order
          hit_start = min(hsp.sbjct_start, hsp.sbjct_end)
          hit_end = max(hsp.sbjct_start, hsp.sbjct_end)

          # check feature threshold
          if feature_threshold is not None and \
             1.0*(1+abs(hit_end-hit_start))/accession.feature_length < feature_threshold:
            continue

          start = hsp.query_start
          end = hsp.query_end
          if end > query_len:
            end = end % query_len

          feature = alignment.hit_def

          # annotate the name when the hit covers only part of the feature
          if hit_start != 1 or hit_end != accession.feature_length:
            feature = '%s (%s-%s/%s)' % (feature, hit_start, hit_end, accession.feature_length)

          aligned = Aligned_Feature(alignment.hit_def, feature,
                                    start, end, hsp.sbjct_start, hsp.sbjct_end,
                                    accession.type,
                                    hsp.query, hsp.match, hsp.sbjct,
                                    hsp.expect, hsp.identities)
          setattr(aligned, 'feature_id', accession.feature_id)
          feature_list.append(aligned)

  finally:
    for path in tmp_paths:
      try:
        os.unlink(path)
      except OSError:
        # e.g. blast failed before creating its output file; a missing file
        # must not stop the remaining files from being removed
        pass

  # remove truncated features across circular boundary
  filtered = []
  for feat in feature_list:
    trumped = False
    if feat.query_start == 1:
      # see if this feature is trumped by another one: same ending and
      # feature, but the other one is across circular boundary (start > end)
      trumped = any(other.query_start != feat.query_start and
                    other.query_end == feat.query_end and
                    other.feature_id == feat.feature_id and
                    other.query_start > other.query_end
                    for other in feature_list)
    if not trumped:
      filtered.append(feat)

  return filtered

Example #12

0

Show file

File: views.py Project: benjiec/giraffe

def _post(params, is_ajax):
    """
    Post a sequence and run the sequence through blast and orf detection.

    Expects: db and sequence. Optional: input ('dna' by default, or
    'protein'), gbonly, blastonly, identity_threshold, feature_threshold,
    circular, jsonp.

    Response: JSON list of the form [sequence length, features, sequence],
    where features is the list of feature dictionaries sorted by
    query_start. If 'jsonp' is given the JSON is wrapped in a call to that
    name and served as text/javascript.

    Raises KeyError if 'db' or 'sequence' is missing from params.
    """

    from hippo.models import Feature_Database

    is_gb = False
    db_name = params['db'].strip()
    db = Feature_Database.objects.get(name=db_name)

    sequence = params['sequence']
    gb_features = []

    # parse genbank
    if sequence.strip().startswith('LOCUS'):
        is_gb = True
        try:
            sequence, gb_features = gb.parse_genbank(sequence.lstrip())
        except Exception:
            # deliberate best effort: an unparseable record is handled as an
            # empty sequence with no features
            sequence = ""
            gb_features = []

    # clean sequence
    input_type = params['input'] if 'input' in params else 'dna'
    if input_type == 'protein':
        sequence = clean_sequence(sequence, alphabet=IUPAC.protein)
    else:
        sequence = clean_sequence(sequence)

    feature_list = gb_features
    truthy = ['1', 'true', 'True']
    gbonly = 'gbonly' in params and params['gbonly'] in truthy
    blastonly = 'blastonly' in params and params['blastonly'] in truthy

    if not is_gb or gbonly is False:
        args = {}
        if 'identity_threshold' in params:
            args['identity_threshold'] = float(params['identity_threshold'])
        if 'feature_threshold' in params:
            args['feature_threshold'] = float(params['feature_threshold'])

        # the value is stringified before the comparison, so only string
        # forms need to be listed
        circular = True
        if 'circular' in params and str(params['circular']).strip().lower() in ['false', '0']:
            circular = False

        # feature detection
        feature_list += features.blast(sequence, db, input_type=input_type, protein=False, circular=circular, **args)
        feature_list += features.blast(sequence, db, input_type=input_type, protein=True, circular=circular, **args)

        if input_type == 'dna' and blastonly is False:
            # restriction site search
            feature_list += features.find_restriction_sites(sequence, circular=circular)
            # ORFs and tags
            orf_list, tag_list = orfs.detect_orfs_and_tags(sequence, circular=circular)
            feature_list += orf_list
            feature_list += tag_list

    res = [x.to_dict() for x in feature_list]

    # now sort everything by start; key= gives the same stable ordering as the
    # former cmp= comparison and also works where cmp= is unavailable
    res.sort(key=lambda item: int(item['query_start']))

    res = [len(sequence),res,sequence]
    j = json.JSONEncoder().encode(res)

    if 'jsonp' in params:
        j = params['jsonp']+'('+j+')'
        http_res = HttpResponse(j,mimetype="text/javascript",status=httplib.OK)
    elif is_ajax:
        http_res = HttpResponse(j,mimetype="application/json",status=httplib.OK)
    else:
        # technically we should be returning "application/json", but in that
        # case browsers force user to download into a file, and for debugging
        # we want to be able to see the JSON list in browser. looks like most
        # browsers will handle JSON sent back as text/html anyways.
        http_res = HttpResponse(j,status=httplib.OK)

    # allow cross origin API calls
    http_res['Access-Control-Allow-Origin'] = '*'
    http_res['Access-Control-Allow-Methods'] = 'POST, GET, OPTIONS'
    http_res['Access-Control-Max-Age'] = 1000

    return http_res

Example #13

0

Show file

File: features.py Project: UndeadBlow/giraffe

def blast2(subject, query):
    """
    Align two sequences against each other with blastn.

    Both sequences are passed through clean_sequence, written to temporary
    FASTA files and compared by running the blastn command line (built with
    NcbiblastnCommandline) from settings.NCBI_BIN_DIR. The XML report is
    parsed with NCBIXML.

    Returns a list of dictionaries, one per HSP, with the keys query_start,
    query_end, subject_start, subject_end, evalue, query, match and subject.

    Raises Exception if the blast command exits with a non-zero status. All
    temporary files are removed whether or not the run succeeds.
    """
    subject = clean_sequence(subject)
    query = clean_sequence(query)

    # each temp path is recorded as soon as it is known, so the finally
    # clause can remove every file independently of the others
    tmp_paths = []
    try:
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            subject_file = f.name
            tmp_paths.append(subject_file)
            f.write(">Subject\n%s\n" % (subject, ))

        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            query_file = f.name
            tmp_paths.append(query_file)
            f.write(">Query\n%s\n" % (query, ))

        outfile = "%s.out.xml" % (query_file, )
        tmp_paths.append(outfile)
        blast_cl = NcbiblastnCommandline(
            query=query_file,
            subject=subject_file,
            evalue=0.001,
            word_size=6,
            # these params were tested to allow gaps in
            # alignments. i.e. large number of bps
            # misaligned or gapped.
            gapextend=4,
            gapopen=0,
            reward=2,
            outfmt=5,
            out=outfile)
        cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
        r = subprocess.call(cl.split(" "))
        if r != 0:
            raise Exception("Blast failed: %s" % (cl, ))

        res = []
        with open(outfile, "r") as f:
            blast_record = NCBIXML.read(f)
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    res.append({
                        "query_start": hsp.query_start,
                        "query_end": hsp.query_end,
                        "subject_start": hsp.sbjct_start,
                        "subject_end": hsp.sbjct_end,
                        "evalue": hsp.expect,
                        "query": hsp.query,
                        "match": hsp.match,
                        "subject": hsp.sbjct,
                    })
        return res

    finally:
        for path in tmp_paths:
            try:
                os.unlink(path)
            except OSError:
                # e.g. blast failed before creating its output file; a
                # missing file must not stop the other files being removed
                pass

Example #14

0

Show file

File: features.py Project: UndeadBlow/giraffe

def blast(sequence,
          dbobj,
          input_type='dna',
          protein=False,
          identity_threshold=0.85,
          evalue_threshold=0.0001,
          feature_threshold=None,
          circular=True):
    """
    Blast sequence against specified feature database. If input type is 'dna',
    using blastn if protein=False (default), or blastx if protein=True. If
    input type is 'protein', using tblastn if protein=False, or blastp if
    protein=True.

    identity_threshold: only return results with identity rate greater than
    this threshold. Can be None. Default is 0.85.

    evalue_threshold: only return results with evalue smaller than this
    threshold. Default is 0.0001.

    feature_threshold: only return results that span at least this amount of
    a feature. Can be None (default). E.g. if set to 0.99, only results
    spanning an entire feature are returned.

    circular: if True (default), the cleaned sequence is doubled before
    blasting. Hits starting after the end of the original sequence are
    ignored and hit ends beyond it are wrapped back with a modulo.

    Returns a list of Aligned_Feature objects, each with an added feature_id
    attribute. Returns an empty list if the blast command exits with a
    non-zero status. The temporary query and output files are removed in
    every case.
    """

    feature_list = []
    # named query_seq rather than input so the builtin is not shadowed
    query_seq = clean_sequence(sequence)
    query_len = len(query_seq)
    if circular is True:
        blast_input = query_seq + query_seq
    else:
        blast_input = query_seq

    # each temp path is recorded as soon as it is known, so the finally
    # clause can remove every file independently of the others
    tmp_paths = []
    try:
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            infile = f.name
            tmp_paths.append(infile)
            f.write(">Query\n%s\n" % (blast_input, ))

        outfile = "%s.out.xml" % (infile, )
        tmp_paths.append(outfile)

        # the four program variants differ only in command class, database
        # and word size
        if protein:
            db_name = dbobj.protein_db_name()
            word_size = 3
            if input_type == 'dna':
                commandline = NcbiblastxCommandline
            else:
                commandline = NcbiblastpCommandline
        else:
            db_name = dbobj.dna_db_name()
            word_size = 6
            if input_type == 'dna':
                commandline = NcbiblastnCommandline
            else:
                commandline = NcbitblastnCommandline

        blast_cl = commandline(query=infile,
                               db="%s" % (db_name, ),
                               soft_masking=True,
                               evalue=evalue_threshold,
                               word_size=word_size,
                               outfmt=5,
                               out=outfile,
                               max_target_seqs=500)

        cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
        r = subprocess.call(cl.split(" "))
        if r != 0:
            # blast can fail if blastdb is not there, which can happen if
            # there were no sequences available to build a db
            print("Blast failed: %s" % (cl, ))
            return []

        with open(outfile, "r") as report:
            blast_record = NCBIXML.read(report)
            for alignment in blast_record.alignments:
                accession = Blast_Accession(alignment.accession)
                for hsp in alignment.hsps:

                    # since we doubled up the input, ignore hits starting
                    # after the input
                    if hsp.query_start > query_len:
                        continue

                    # check identity threshold
                    if identity_threshold is not None and \
                       1.0*hsp.identities/len(hsp.sbjct) < identity_threshold:
                        continue

                    # subject coordinates may be reported in either order
                    hit_start = min(hsp.sbjct_start, hsp.sbjct_end)
                    hit_end = max(hsp.sbjct_start, hsp.sbjct_end)

                    # check feature threshold
                    if feature_threshold is not None and \
                       1.0*(1+abs(hit_end-hit_start))/accession.feature_length < feature_threshold:
                        continue

                    start = hsp.query_start
                    end = hsp.query_end
                    if end > query_len:
                        end = end % query_len

                    feature = alignment.hit_def

                    # annotate the name when the hit covers only part of the
                    # feature
                    if hit_start != 1 or hit_end != accession.feature_length:
                        feature = '%s (%s-%s/%s)' % (feature, hit_start,
                                                     hit_end,
                                                     accession.feature_length)

                    aligned = Aligned_Feature(alignment.hit_def, feature,
                                              start, end, hsp.sbjct_start,
                                              hsp.sbjct_end, accession.type,
                                              hsp.query, hsp.match, hsp.sbjct,
                                              hsp.expect, hsp.identities)
                    setattr(aligned, 'feature_id', accession.feature_id)
                    feature_list.append(aligned)

    finally:
        for path in tmp_paths:
            try:
                os.unlink(path)
            except OSError:
                # e.g. blast failed before creating its output file; a
                # missing file must not stop the other files being removed
                pass

    # remove truncated features across circular boundary
    filtered = []
    for feat in feature_list:
        trumped = False
        if feat.query_start == 1:
            # see if this feature is trumped by another one: same ending and
            # feature, but the other one is across circular boundary
            # (start > end)
            trumped = any(other.query_start != feat.query_start and
                          other.query_end == feat.query_end and
                          other.feature_id == feat.feature_id and
                          other.query_start > other.query_end
                          for other in feature_list)
        if not trumped:
            filtered.append(feat)

    return filtered