예제 #1
0
 def debug_off(self) :
   """ replace self.debug with a freshly created DebugNop instance """
   nop_debugger = DebugNop()
   self.debug = nop_debugger
예제 #2
0
 def debug_off(self):
     """Replace self.debug with a freshly created DebugNop instance."""
     nop_debugger = DebugNop()
     self.debug = nop_debugger
예제 #3
0
class Compiler :
  MAYBE = 'maybe'
  
  def __init__(self, n = None, debug = False) :
    """ set up namespaces, the parser, the translation registries and debugging

    @arg n an existing namespaces object to reuse.  When it is falsy a new
      Namespaces() is created instead
    @arg debug is accepted but never read by this constructor; self.debug
      always starts out as a Debug() instance
    """
    self.n = n if n else Namespaces()

    # prefixes bound on the namespaces object, in this order
    prefixes = (
      ('query',   '<http://dwiel.net/axpress/query/0.1/>'),
      ('var',     '<http://dwiel.net/axpress/var/0.1/>'),
      ('tvar',    '<http://dwiel.net/axpress/translation/var/0.1/>'),
      ('bnode',   '<http://dwiel.net/simplesparql/bnode/0.1/>'),
      ('lit_var', '<http://dwiel.net/express/lit_var/0.1/>'),
    )
    for prefix, uri in prefixes :
      self.n.bind(prefix, uri)

    self.parser = Parser(self.n)

    # registered translations: in registration order, by name and by id
    self.translations = []
    self.translations_by_name = {}
    self.translations_by_id = {}

    # counters behind next_num() and next_translation_id()
    self._next_num = 0
    self._next_translation_id = 0

    self.match_strings_both_ways = False

    # self._debug keeps the real debugger so debug_on() can restore it after
    # debug_off() has replaced self.debug
    debugger = Debug()
    self._debug = debugger
    self.debug = debugger

    self.show_dead_ends = True

  def next_num(self) :
    """ for generating unique lit_var name """
    self._next_num += 1
    return self._next_num
    
  def debug_on(self) :
    self.debug = self._debug
      
  def debug_off(self) :
    """ replace self.debug with a freshly created DebugNop instance.
    self._debug is left untouched so debug_on() can undo this """
    nop_debugger = DebugNop()
    self.debug = nop_debugger

  def rule(self, name, input, output, fn=None, input_function=None, **kwargs) :
    """ shorthand for the full length register_translation

    Packs the arguments into a translation dict ('name', 'input', 'output',
    'function', 'input_function'), lets any extra keyword arguments add to or
    override those entries, and registers the result.

    @arg name must be a string (checked with an assert)
    """
    assert isinstance(name, basestring)
    options = dict(
      name = name,
      input = input,
      output = output,
      function = fn,
      input_function = input_function,
    )
    options.update(kwargs)
    # register_translation looks two frames up the call stack to find the
    # registering file, so it has to be called directly from here
    self.register_translation(options)

  def register_translation(self, translation) :
    """ validate, normalize and store a translation

    The translation dict is modified in place:
      * 'function' defaults to the identity function unless a
        'multi_function' is present
      * an 'input_function' of None is removed
      * 'step_size' defaults to 1
      * 'input' and 'output' are replaced by their parsed form
      * 'output_vars', 'constant_vars', 'in_lit_vars', 'filename' and 'id'
        are added
    It is then appended to self.translations and indexed in
    self.translations_by_name and self.translations_by_id.

    @arg translation dict with at least the keys 'input', 'output' and 'name'
    @raises Exception if any of the required keys is missing
    """
    import inspect

    # make sure all of the required keys are present
    required = ['input', 'output', 'name']
    missing = [key for key in required if key not in translation]
    if missing :
      raise Exception('translation is missing keys: %s' % prettyquery(missing))

    if translation.get('function') is None :
      if 'multi_function' not in translation :
        translation['function'] = lambda x:x

    if 'input_function' in translation and translation['input_function'] is None :
      del translation['input_function']

    translation['step_size'] = translation.get('step_size', 1)

    # parse any string expressions
    translation['input'] = self.parser.parse_query(translation['input'], reset_bnodes=False)
    translation['output'] = self.parser.parse_query(translation['output'], reset_bnodes=False)

    # figure out which variables are in both the input and output of the
    # translation
    invars = find_vars(translation['input'], find_string_vars = True)
    outvars = find_vars(translation['output'], find_string_vars = True)
    outvars = outvars.union(set(translation.get('add_output_vars', [])))
    translation['output_vars'] = outvars
    translation['constant_vars'] = list(
      invars.intersection(outvars)
    )

    translation['in_lit_vars'] = find_vars(
      translation['input'], is_lit_var, find_string_vars = True
    )

    # record the file of the frame two levels up (the caller of rule() when
    # registration goes through rule()).  When there is no such frame, e.g.
    # this method was called directly from module top level, fall back to the
    # direct caller instead of failing on None.f_code
    filename = None
    frame = inspect.currentframe()
    try :
      caller = frame.f_back if frame is not None else None
      if caller is not None :
        if caller.f_back is not None :
          caller = caller.f_back
        filename = caller.f_code.co_filename
    finally :
      # do not keep the current frame alive through its own local variable
      del frame

    translation['filename'] = filename
    translation['id'] = self.next_translation_id()
    self.translations.append(translation)
    self.translations_by_name[translation['name']] = translation
    self.translations_by_id[  translation['id'  ]] = translation
    
  #def hash_triple(self, triple) :
    #return Triple(triple)
  
  #def hash_triples(self, triples) :
    #return map(self.hash_triple, triples)
  
  def next_translation_id(self) :
    self._next_translation_id += 1
    return self._next_translation_id
  
  def translation_can_follow(self, this, next) :
    """ a translation can follow if any of the output triples of this match
    with any of the input triples from next """
    for triple in this['output'] :
      for ntriple in next['input'] :
        if self.triples_match(ntriple, triple) :
          return True
    
    return False
  
  def compile_translations(self) :
    self.match_strings_both_ways = True
    self.translation_matrix = {}
    for id, translation in self.translations_by_id.iteritems() :
      self.translation_matrix[id] = [
        t for t in self.translations if self.translation_can_follow(translation, t)
      ]
      #p('t', translation['name'], [t['name'] for t in self.translation_matrix[id] ])
    #print 'avg', sum(len(ts) for ts in self.translation_matrix.values())/float(len(self.translation_matrix))
    #print [len(ts) for ts in self.translation_matrix.values()]
    #print [t['name'] for t in self.translations if len(t['input']) != 1]
    self.match_strings_both_ways = False
  
  
  #############################################################################
  # MATCHING
  
  def find_matches(self, value, qvalue) :
    """ hand value and qvalue to StringMatch.match and return its result
    unchanged """
    match_result = StringMatch.match(value, qvalue)
    return match_result
    
  def string_matches(self, value, qvalue) :
    # boolean version of find_matches
    # note that [] denotes a successful basic string match, just without any
    # variables to match against
    return self.find_matches(value, qvalue) not in [None, False]
  
  def values_match(self, value, qvalue) :
    """
    decide whether `value` is compatible with `qvalue`

    triples_match calls this once per position, with value taken from its
    `triple` argument and qvalue from its `qtriple` argument.

    @returns a truthy value on a match and a falsy one otherwise.  Note that
      when none of the branches below applies and value != qvalue the
      function falls off the end and implicitly returns None, not False.
    @raises Exception if value satisfies is_any_var but none of is_var,
      is_meta_var, is_lit_var or is_out_lit_var
    """
    #self.debug.p('values_match', value, qvalue)
    if is_any_var(value) :
      # value is some kind of variable: dispatch on which kind.  The order of
      # these tests matters if a value can satisfy more than one predicate
      if is_var(value) :
        # a value passing is_var matches any qvalue
        return True
      elif is_meta_var(value) :
        if is_any_var(qvalue) :
          # is_any_var(qvalue) is already known to hold here, so this is
          # effectively `not is_lit_var(qvalue)`
          return is_any_var(qvalue) and not is_lit_var(qvalue)
        else :
          return False
      elif is_lit_var(value) :
        if is_any_var(qvalue) :
          # `not is_any_var(qvalue)` is always False in this branch, so the
          # outcome is decided by is_lit_var(qvalue) alone
          return is_lit_var(qvalue) or not is_any_var(qvalue)
        else :
          # qvalue is not a variable at all
          return True
      elif is_out_lit_var(value) :
        # not often ... probably only in the if matches(q,v) or (v,q) ...
        if is_lit_var(qvalue) :
          return True
        elif is_any_var(qvalue) :
          return False
        else :
          return True
      else :
        raise Exception('shouldnt get here')
    elif is_out_lit_var(qvalue) :
      # value is not a variable, qvalue is an out_lit_var
      return True
    elif isinstance(value, list) :
      # a list matches if any one of its elements matches qvalue
      ret = any(imap(lambda v : self.values_match(v, qvalue), value))
      return ret
    
    if isstr(value) and isstr(qvalue) :
      # match_strings_both_ways also accepts qvalue matching value; it is
      # switched on by compile_translations while it builds the matrix
      if self.match_strings_both_ways :
        return self.string_matches(value, qvalue) or self.string_matches(qvalue, value)
      else :
        return self.string_matches(value, qvalue)
    
    if value == qvalue :
      return True
    # implicit `return None` when the values differ
  
  _triples_hash = {}
  def triples_match(self, triple, qtriple) :
    #assert isinstance(triple, Triple) and isinstance(qtriple, Triple)
    key = triple.hash + qtriple.hash
    if key in self._triples_hash :
      return self._triples_hash[key]
    
    for tv, qv in izip(triple, qtriple) :
      if not self.values_match(tv, qv) :
        self._triples_hash[key] = False
        return False
    self._triples_hash[key] = True
    return True
  
  def find_triple_match(self, triple, query) :
    for qtriple in query :
      if self.triples_match(triple, qtriple) :
        return True
    return False
  
  def partial_match_exists(self, pattern, reqd_triples) :
    # check that one of the reqd_triples match part of the query
    for triple in pattern :
      if self.find_triple_match(triple, reqd_triples) :
        return True
    return False
    
  #############################################################################
  # BINDINGS
  def mul_bindings_set(self, bs1, bs2) :
    new_bs = []
    for b1 in bs1 :
      for b2 in bs2 :
        #b2 = copy.copy(b2)
        for name, value in b1.iteritems() :
          b2[name] = value
        new_bs.append(b2)
    return new_bs
  
  def get_binding(self, triple, ftriple) :
    """
    build the bindings produced by matching triple against ftriple, one
    position at a time

    @arg triple the pattern side of the match
    @arg ftriple the triple it is matched against
    @returns a list of Bindings.  The list is empty when two positions
      conflict, when non-variable values differ, or when nothing was bound
      at all
    Reads self.prefer_litvars, which is assigned by bind_vars.
    """
    bindings = [Bindings()]
    for i, (t, q) in enumerate(izip(triple, ftriple)) :
      if is_any_var(t) and self.values_match(t, q):
        # two lit_vars in the front of a triple don't match
        if i == 0 and is_lit_var(t) and is_lit_var(q) :
          return []
        # if the same var is trying to be bound to two different values, 
        # not a valid binding
        # NOTE(review): the membership test uses `t` while the lookup and the
        # stored keys use `t.name` - confirm whether `t.name in bindings[0]`
        # was intended.  Only bindings[0] is inspected here.
        if t in bindings[0] and bindings[0][t.name] != q :
          return []
        bindings = self.mul_bindings_set(bindings, [Bindings({t.name : q})])
      elif (is_lit_var(t) or is_var(t)) and is_var(q) :
        # prefer_litvars is set in bind_vars.  In some cases we never want to 
        # bind a litvar to a var because it means we would be losing 
        # information
        if self.prefer_litvars and is_lit_var(t) :
          # bind in the opposite direction: the var of ftriple maps to the
          # litvar of triple
          assert q.name not in bindings[0]
          bindings = self.mul_bindings_set(bindings, [Bindings({q.name : t})])
        else :
          assert t.name not in bindings[0]
          bindings = self.mul_bindings_set(bindings, [Bindings({t.name : q})])
      elif isstr(t) and isstr(q) :
        # BUG: if there is more than one way to match the string with the 
        # pattern this will only return the first
        # I believe that this bug is fixed in 0647dad1 and nearby commits
        ret = self.find_matches(str(t), str(q))
          
        # None, False and [] all mean there is nothing to bind from this
        # string pair; the position is then simply skipped
        if ret not in [None, False, []] :
          # only the names of the first match are checked against bindings[0]
          for name, value in ret[0].iteritems() :
            assert unicode(name) not in bindings[0]
          
          bindings = self.mul_bindings_set(bindings, map(Bindings, ret))
      elif isinstance(t, list) :
        # NOTE: copy and paste from above ...
        # every element of the list is tried against q in turn
        for ti in t :
          ret = self.find_matches(str(ti), str(q))
          if ret not in [None, False, []] :
            for name, value in ret[0].iteritems() :
              assert unicode(name) not in bindings[0]
            
            bindings = self.mul_bindings_set(bindings, map(Bindings, ret))
      elif t != q :
        return []
      elif is_lit_var(t) and is_out_lit_var(q) :
        # NOTE(review): this branch is only reached when t == q, because the
        # `t != q` test above comes first - confirm that is intended
        bindings = self.mul_bindings_set(bindings, [Bindings({t.name : q})])
    # a single, still empty Bindings means that nothing was bound
    if len(bindings) == 1 and not bindings[0] :
      return []
    return bindings
  
  def find_bindings_for_triple(self, triple, facts, reqd_facts) :
    ret_bindings = []
    for ftriple in facts :
      bindings = self.get_binding(triple, ftriple)
      if not reqd_facts or ftriple in reqd_facts :
        for binding in bindings :
          binding.matches_reqd_fact = True
      if bindings :
        for binding in bindings :
          if binding in ret_bindings :
            for b in bindings :
              if b == binding :
                b.matches_reqd_fact = b.matches_reqd_fact or binding.matches_reqd_fact
          else :
            ret_bindings.append(binding)
    
    return ret_bindings
  
  def merge_bindings(self, a, b) :
    """
    unify the two binding dictionaries a and b

    @returns False if some key is in both a and b with different values and
      the conflict cannot be resolved.  Otherwise a new Bindings holding the
      entries of both.  A conflict is resolved when one side is an
      out_lit_var and the other side is not a variable at all; the
      non-variable value is kept.
      matches_reqd_fact of the result is taken from a, or else from b (only
      looked at when they are Bindings objects)
    """
    # WARNING: this should probably return the new binding so that
    # is_out_lit_var never clobbers not is_any_var
    merged = Bindings()
    for key, a_value in a.iteritems() :
      conflict = key in b and b[key] != a_value
      if not conflict :
        merged[key] = a_value
      elif is_out_lit_var(b[key]) and not is_any_var(a_value) :
        merged[key] = a_value
      elif not is_any_var(b[key]) and is_out_lit_var(a_value) :
        merged[key] = b[key]
      else :
        return False

    # add bindings whose keys are in b, but not a
    for key, b_value in b.iteritems() :
      if key not in merged :
        merged[key] = b_value

    # if a or b are Bindings objects, merge their matches_reqd_fact values
    a_flag = isinstance(a, Bindings) and a.matches_reqd_fact
    merged.matches_reqd_fact = (
      a_flag or (isinstance(b, Bindings) and b.matches_reqd_fact)
    )

    return merged
  
  def merge_bindings_sets(self, a, b) :
    # see if any of the next_bindings fit with the existing bindings
    new_bindings = []
    for bbinding in b :
      for abinding in a :
        new_binding = self.merge_bindings(abinding, bbinding)
        if new_binding != False :
          # WARNING: this isn't going to copy the values of the bindings!!!
          if new_binding not in new_bindings :
            new_bindings.append(new_binding)
        elif new_binding == self.MAYBE :
          # WARNING: this isn't going to copy the values of the bindings!!!
          new_binding.possible = True
          if new_binding not in new_bindings :
            new_bindings.append(new_binding)
          matches = self.MAYBE
        else :
          pass # binding conflicted
    
    return new_bindings

  def bind_vars(self, translation, facts, reqd_facts, initial_bindings = {}, prefer_litvars = False) :
    """
    find the bindings under which the triples of translation match facts

    @arg translation is a list of triples (the input part of the translation)
    @arg facts is a list of triples (the currently known facts)
    @arg reqd_facts is a list of triples of which one must be used in the
        binding.  Passing False switches to partial matching (used for output
        unification and query merging): triples are then allowed to stay
        unbound
    @arg initial_bindings is a set of bindings which should be the starting 
        point and included in all returned bindings.  This is used by output
        unification to ensure that some of the input bindings are preserved
    @arg prefer_litvars is stored on self.prefer_litvars, where get_binding
        reads it
    @returns False if no binding could be found, otherwise a list of bindings
      for var to value
    """
    
    # NOTE: this is instance state, not a local; it stays set after the call
    self.prefer_litvars = prefer_litvars
    
    # loop through each triple.  find possible bindings for each triple.  If 
    # this triple's bindings conflict with previous triple's bindings then 
    # return false.  It does not bind, unless we are only looking for partial
    # matches like for output unification.  In that case however, I think that
    # technically, there should be a split there so that we return both sets
    # of possible bindings rather than just the first one.
    # WARNING: likely bug,  See above comment
    # check that all of the translation inputs match part of the query

    # cheap rejection: every triple needs at least one matching fact
    if reqd_facts != False:
      for triple in translation :
        if not self.find_triple_match(triple, facts) :
          return False
    
    bindings = [Bindings()]
    for ttriple in translation :
      possible_bindings = self.find_bindings_for_triple(ttriple, facts, reqd_facts)
      
      new_bindings = self.merge_bindings_sets(bindings, possible_bindings)
      
      if len(new_bindings) > 0 :
        bindings = new_bindings
      else :
        # in output unification reqd_facts will be False - in that
        # case, we don't care if every triple binds
        if reqd_facts != False :
          return False
    
    # merge with initial_bindings after collecting bindings from the facts.  It
    # is more complex to start with the initial_bindings than to end with them
    # (the shared default {} is falsy and is not modified in this function)
    if initial_bindings :
      bindings = self.merge_bindings_sets(bindings, [initial_bindings])
    
    # get a set of all vars used in the translation
    vars = find_vars(translation, find_string_vars = True)

    # if there are no vars, this does still match, but there are no bindings
    if len(vars) == 0 :
      return bindings

    # keep only the bindings which contain bindings for all of the vars and 
    # match a reqd_triple.  In output unification reqd_facts is False and we 
    # only need partial bindings so this step isn't necessary
    if reqd_facts != False:
      bindings = [binding for binding in bindings
                  if len(binding) == len(vars) and binding.matches_reqd_fact]
    
    # if there are no bindings, failed to find a match
    if len(bindings) == 0 :
      return False
    
    return bindings

  #############################################################################
  # testtranslation and next_steps

  def testtranslation(self, translation, query, reqd_triples) :
    """
    @returns matches, bindings
      matches is True if the translation is guaranteed to match the query.  It 
        is self.MAYBE if the translation might match the query and False if it
        is guaranteed to not match the query.
      bindings is the set of bindings which allow this translation to match the
        query
    """
    # HEURISTIC
    # make sure that the translation's input matches part of the reqd_triples
    # otherwise, not a new path
    # right now, this is also a required step because this is the only way that
    # testtranslation will return False
    
    if reqd_triples and not self.partial_match_exists(translation['input'], reqd_triples) :
      return False, None
    ret = self.bind_vars(
      translation['input'], query, reqd_triples
    )
    # if a partial match did exist, but no bindings could be found, then this 
    # was a partial match
    if ret == False :
      # look for any triples that matched.  If there are any, than a partial 
      # match has been found.  Partial matches are recorded because we might
      # want to merge this partial match with another to get a full match
      matched_triples = set()
      for i, triple in enumerate(translation['input']) :
        if self.find_triple_match(triple, query) :
          matched_triples.add(i)

      return "partial", matched_triples
    else :
      return True, ret
  
  # return all triples which have at least one var in vars
  def find_specific_var_triples(self, query, vars) :
    """ the triples of query (in order) which hold at least one value that
    is an out_lit_var whose name is in vars """
    def mentions_wanted_var(triple) :
      # every value of the triple is examined, as in a plain map()
      hits = [is_out_lit_var(x) and x.name in vars for x in triple]
      return any(hits)

    return [triple for triple in query if mentions_wanted_var(triple)]

  def next_steps(self, orig_query, lineage, reqd_triples) :
    """
    generator over the steps which can follow the current search state

    @arg orig_query the original query in triples set form
    @arg lineage the lineage of steps already followed
    @arg reqd_triples passed on to testtranslation, and also used to add
      translations to the queue by comparing the middle value of triples
    @yields one dict per usable combination of translation, input bindings
      and output bindings, with the keys 'input_bindings', 'output_bindings',
      'translation', 'new_triples' and 'new_query', plus 'new_lineage' when
      the step came out of merging with a previously stored partial match

    Reads self.translation_matrix (built by compile_translations) as well as
    self.depth and self.partials.
    NOTE(review): self.depth and self.partials are not assigned in __init__;
    presumably they are set elsewhere before this is called - confirm.
    """
    #self.debug.p('orig_query', orig_query)
    # NOTE(review): guaranteed_steps is never read again in this function
    guaranteed_steps = []
    
    # the translation_queue is a list of translations that will be searched.  
    # TODO if there is a lineage, translation_queue should be a 
    # combination of the tq from the end of both merged paths
    if lineage and 'new_lineage' not in lineage[-1] :
      # NOTE(review): this is the list object stored in translation_matrix,
      # not a copy, so the append in the loop below also grows the stored
      # entry - confirm that this is intended
      translation_queue = self.translation_matrix[lineage[-1]['translation'].get('id')]
    else :
      translation_queue = list(self.translations)
    # NOTE setting translation_queue to all translations all the time causes
    # errors.  This is because we can go in all kinds of weird directions ...
    
    # NOTE: this is a definite hack.  The problem is that the 
    # translation_matrix can't account for situations where a variable gets
    # turned into a litvar.  That variable can be in many triples and all of
    # those triples must be used as initial pruning instead of just the output
    # triples.
    # WARNING: Here, I am only using the middle value of the triple as a test.
    # It will fail when the above situation happens and variables are used in
    # the 2nd position (property) of a triple.
    for triple in reqd_triples :
      for trans in self.translations :
        if triple[1] in [t[1] for t in trans['input']] :
          if trans not in translation_queue :
            translation_queue.append(trans)
      
    # HEURISTIC: stop DFS search at self.depth
    if lineage :
      # depth is the sum of the step sizes of all steps followed so far
      lineage_depth = sum(s['translation']['step_size'] for s in lineage)
      self.debug.p('lineage_depth', lineage_depth)
      if lineage_depth >= self.depth :
        translation_queue = []
    
      # OPTIMIZATION: skip this translation if it is the inverse of the last
      # translation
      # WARNING: not 100% sure this is always going to work, but it does for
      # now ...
      def test_for_inverse(translation) :
        # False (drop it) when the last translation names this translation
        # as its 'inverse_function'
        inverse_function = lineage[-1]['translation'].get('inverse_function') 
        if inverse_function :
          if inverse_function == translation['name'] :
            return False
        return True
      translation_queue = filter(test_for_inverse, translation_queue)
    
    # show the list of translations that show up in the queue
    #self.debug.p('tq', [t['name'] for t in translation_queue])
    
    def merge_partial(translation, matched_triples) :
      # this function gets called if this translation was partially matched
      # It is a generator yielding (new_query, translation, bindings_set,
      # past_lineage) for every merge with a stored partial match that makes
      # the translation match completely
      
      # NOTE(review): found_merge is assigned but never read
      found_merge = False
      
      # all of the search paths that have partially matched this translation
      past_partials = self.partials[translation['id']]
      # NOTE: use heuristics to pick which past_partial to try first:
      #     *** keep stats about p of tip of branch1 combining with tip of 
      #         branch2 to fulfil this translation
      #     * comparing how recently the two paths diverged might correlate
      #     * prefer combined_bindings_set with more litvars
      # TODO: n-way merges instead of just 2-way merges ... ouch!
      #      just found a case that needs this 
      #self.debug.p('past_partials', len(past_partials))
      for past_lineage, past_query, past_matched_triples in past_partials :
        # make sure that the triples that these two partials at least cover
        # all input triples
        if len(past_matched_triples.union(matched_triples)) == len(translation['input']) :
          # OPTIMIZATION make sure that past_query isn't a subset of orig_query
          # NOTE: might be faster to compare lineages instead of queries
          if all(triple in orig_query for triple in past_query) :
            continue
          
          # merge past_query and orig_query:
          # as we merge past_query and orig_query, if a litvar and a var bind,
          # keep the litvar, it has a known computation, where the var does not
          # NOTE: there may be problems binding litvars to other litvars.  The
          # two could have different values, but we won't know until runtime.
          # NOTE(review): bind_vars can return False, which the loop below
          # cannot iterate over - confirm that this cannot happen when
          # reqd_facts is False
          merged_bindings_set = self.bind_vars(
            orig_query, past_query, False, {}, prefer_litvars = True
          )
          
          # all the ways orig_query and past_query can be merged
          for merged_bindings in merged_bindings_set :
            # see if any variables are mapped to twice ... this may be a big hack
            if len(merged_bindings.values()) != len(set(merged_bindings.values())) :
              continue
            
            new_query, new_triples = sub_var_bindings_track_changes(
              orig_query + past_query, merged_bindings
            )
            
            new_query = remove_duplicate_triples(new_query)
            
            # test to see if this new merged query has enough information to
            # trigger this translation
            ret, more = self.testtranslation(translation, new_query, new_triples)
            if ret == True :
              found_merge = True
              self.debug.open_block('merge for ' + translation['name'])
              self.debug.p('orig_query', orig_query)
              self.debug.p('past_query', past_query)
              self.debug.p('merged_bindings_set', merged_bindings_set)
              yield new_query, translation, more, past_lineage
              # runs when the consumer asks for the next item
              self.debug.close_block()
      
      # add this instance to past partials
      # (this is a generator, so this line only runs once the caller has
      # consumed all of the merges above)
      #p('storing partial', translation['name'], matched_triples)
      self.partials[translation['id']].append((lineage, orig_query, matched_triples))
    
    def test_and_merge() :
      """ test each translation against the current query.  If there is a 
      partial match, also yield all possible merges """
      for translation in translation_queue :
        ret, more = self.testtranslation(translation, orig_query, reqd_triples)
        if ret == "partial" :
          # in this case more is a list of the triples that matched
          for x in merge_partial(translation, more) :
            yield x
        elif ret == False :
          continue 
        else :
          # in this case more is a bindings_set
          yield orig_query, translation, more, False
    
    # main loop
    for query, translation, bindings_set, new_lineage in test_and_merge() :
      # the 2nd value from testtranslation is bindings_set if we've gotten here
      
      # we've found a match, now we just need to find the bindings.  This is
      # the step where we unify the new information (generated by output 
      # triples) with existing information.
      for bindings in bindings_set :
        # input_bindings map from translation space to query space
        input_bindings = bindings
        # output_bindings map from translation space to query space
        output_bindings = {}
        
        # every lit var of the translation's input has to be bound to
        # something which is not a plain var, otherwise skip these bindings
        input_bindings_vars = [var for (var, binding) in input_bindings.iteritems() if not is_var(binding)]
        missing_vars = translation['in_lit_vars'] - set(input_bindings_vars)
        if len(missing_vars) :
          continue
        
        # initial_bindings are the bindings that we already know from the 
        # input unification that must also hold true for output unification
        # some of the initial_binding vars don't appear in the output triples
        # so we can get rid of them
        output_triples = translation['output']
        initial_bindings = dict(
          (unicode(name), bindings[name]) for name in translation['constant_vars']
            if name in bindings and 
              name in translation['output_vars']
        )
        
        # used in a couple places later on
        output_lit_vars = find_vars(translation['output'], is_lit_var).union(
          set(translation.get('add_output_vars', [])))
        
        # if the translation has an input_function, run it here to see if these
        # input_bindings pass the test
        if 'input_function' in translation :
          if not translation['input_function'](input_bindings) :
            continue
        
        # unify output_triples with query
        if not translation['output'] :
          # if there is not output, simply replace input vars with litvars
          # since after the translation is applied they will have values
          # NOTE(review): 'add_output_vars' is indexed directly here while
          # other places use .get with a default - a translation with an
          # empty output and no 'add_output_vars' would raise KeyError
          output_bindings_set = [
            {unicode(name) : LitVar(input_bindings[name].name)}
            for name in translation['add_output_vars']]
        else :
          output_bindings_set = self.bind_vars(output_triples, query, False, initial_bindings = initial_bindings)
        # if no unification is found, just use the initial_bindings
        if output_bindings_set == False :
          output_bindings_set = [initial_bindings]
        
        for output_bindings in output_bindings_set :
          # if var is a lit var in the output_triples, then its output bindings
          # must bind it to a new variable since it will be computed and set by
          # the translation function and may not have the same value any more
          # WARNING: I think this means that bind_vars might not do the 
          # right thing if it thinks that it can bind whatever it wants to 
          # lit_vars.  lit_vars for example shouldn't bind to literal values
          # this might also have to do with a schema, some things can be bound
          # again (a.is), whereas some can not (u.inches)
          
          # if get_bindings found variable to variable matches, we will need
          # to alter the triples in the existing query (not just add triples)
          # unified_bindings maps old query variables to new query variables
          unified_bindings = {}
          for var in output_lit_vars :
            # a fresh, uniquely numbered lit var for this output variable
            new_lit_var = LitVar(var+'_out_'+str(self.next_num()))
            if var in output_bindings :
              if is_any_var(output_bindings[var]) :
                if not is_out_lit_var(output_bindings[var]) :
                  unified_bindings[output_bindings[var].name] = new_lit_var
            # only replace output_bindings with a lit var if
            # output_bindings isn't already bound to a literal value,
            # like a string or an int
            if var not in output_bindings or is_any_var(output_bindings[var]) :
              output_bindings[var] = new_lit_var
          
          # make sure all vanilla vars have unique names
          for var in find_vars(translation['output'], is_var) :
            if var not in output_bindings :
              output_bindings[var] = Var(var+'_'+str(self.next_num()))

          # generate the new query by adding the output triples with 
          # output bindings substituted in
          new_triples = sub_var_bindings(translation['output'], output_bindings)

          new_query, new_query_new_triples = sub_var_bindings_track_changes(query, unified_bindings)

          new_query.extend(new_triples)
          new_triples.extend(new_query_new_triples)
          
          # remove output_bindings which are not constant_vars or lit_vars (in
          # the translation's output triples.  An example of an instance when
          # a variable would be in output_bindings that we would remove here is
          # when an output_triple has normal variables which are not used in 
          # the input, but also aren't bound to anything by the translation fn.
          # we want to know if that variable binds to anything for creating 
          # new_triples above, but as far as the evaluator is concerned, it has
          # no value and thus no output binding
          output_bindings = dict(
            (var, binding) for var, binding in output_bindings.iteritems()
            if var in output_lit_vars or
              var in translation['constant_vars']
          )
          
          new_query = remove_duplicate_triples(new_query)
          
          #self.debug.p('new_triples', new_triples)
          #self.debug.p('new_query', new_query)
          #self.debug.p('input_bindings', input_bindings)
          #self.debug.p('output_bindings', output_bindings)
          
          step = {
            'input_bindings' : input_bindings,
            'output_bindings' : output_bindings,
            'translation' : translation,
            'new_triples' : new_triples,
            'new_query' : new_query,
          }
          # new_lineage is False for direct matches and the lineage of the
          # stored partial match for merged ones
          if new_lineage :
            step['new_lineage'] = new_lineage
          yield step
  
  #############################################################################
  # find solution
  
  def find_solution_values_match(self, tv, qv) :
    """
    does the pattern (value) in tv match the value of qv?

    @returns a dict {tv : qv} when the match also produces a binding (tv is
      an out_lit_var or an out_var), otherwise a plain truth value
    """
    # anything that is not a variable has to be equal to qv
    if not is_any_var(tv) :
      return tv == qv

    if is_out_lit_var(tv) :
      # if the pattern is an out_lit_var, qv must be a lit_var or a literal
      if is_lit_var(qv) or not is_any_var(qv) :
        return {tv : qv}
      return False

    if is_out_var(tv) :
      # not sure if this is really right ...
      if is_any_var(qv) and tv.name == qv.name :
        return {tv : qv}
      return False

    if is_out_lit_var(qv) :
      # This happens when a query is looking for a literal variable
      # and a translation is willing to provide a variable, just not
      # a literal one.  (see lastfm similar artist output variable
      # similar_artist) and a query wanting it to be literal
      return False

    if is_lit_var(tv) and is_lit_var(qv) :
      return True

    # any other pair of variables matches when the names agree
    if is_any_var(qv) :
      return tv.name == qv.name

    return False
  
  def find_solution_triples_match(self, triple, qtriple) :
    """
    does the pattern in triple match the qtriple?

    @returns False as soon as one position does not match (see
      find_solution_values_match).  Otherwise the bindings collected from all
      positions, or True when the match produced no bindings
    """
    collected = {}
    for pair in izip(triple, qtriple) :
      outcome = self.find_solution_values_match(*pair)
      if not outcome :
        return False
      if isinstance(outcome, dict) :
        collected.update(outcome)

    if collected :
      return collected
    return True
  
  def find_solution_triple(self, triple, facts) :
    """
    look for the first triple in facts which the pattern in triple matches
    @arg triple is the pattern triple
    @arg facts is an iterable of triples to search
    @returns (bindings, ftriple) for the first matching fact, where bindings
      is whatever find_solution_triples_match returned for it (a dict of
      bindings, or True if the match bound nothing).  Returns (False, None)
      if no fact matches
    """
    for ftriple in facts :
      bindings = self.find_solution_triples_match(triple, ftriple)
      # find_solution_triples_match already substitutes True for an empty set
      # of bindings, so any truthy result can be handed back unchanged
      if bindings :
        return bindings, ftriple
    return False, None
    
  def find_solution(self, var_triples, facts) :
    """
    check that every triple in var_triples has a match in facts
    @arg var_triples is the set of triples which need to be bound for a
      solution to exist
    @arg facts is the set of triples to look for matches in
    @returns False if any triple of var_triples has no match in facts,
      otherwise the dict of all collected bindings, or True if every triple
      matched without binding anything
    """
    bindings = {}
    for triple in var_triples :
      new_bindings, ftriple = self.find_solution_triple(triple, facts)
      if not new_bindings :
        return False
      # a match which binds nothing is reported as True rather than as a dict,
      # and dict.update(True) would raise a TypeError
      if isinstance(new_bindings, dict) :
        bindings.update(new_bindings)
    return bindings or True
  
  def find_partial_solution(self, var_triples, facts) :
    """
    collect the bindings of every triple in var_triples which has a match in
    facts.  Triples without a match are skipped rather than failing the whole
    lookup
    @arg var_triples is the set of pattern triples to look up
    @arg facts is the set of triples to look for matches in
    @returns a dict from variable name to bound value
    """
    found = {}
    for triple in var_triples :
      new_bindings, ftriple = self.find_solution_triple(triple, facts)
      # no match at all, or a match which bound nothing (reported as True)
      if not new_bindings or new_bindings == True :
        continue
      found.update(new_bindings)
    
    # key the bindings by variable name instead of by the variable itself.  If
    # the bound value is a variable, rebuild it as a plain Var with the same
    # name.  At one point just the variable name was used, but sometimes the
    # compiler can actually find hard values for the bindings (no evaluation
    # required), so other values are kept untouched
    by_name = {}
    for var, value in found.iteritems() :
      if is_any_var(value) :
        value = Var(value.name)
      by_name[var.name] = value
    return by_name
  
  def found_solution(self, new_query) :
    """
    decide whether new_query already holds a value for every variable in
    self.reqd_bound_vars
    @arg new_query is the query as it stands after the latest step
    @returns a dict from Var(name) to bound value for the first set of
      bindings which covers exactly self.reqd_bound_vars, otherwise False
    """
    # NOTE: it is quite possible that the output unification step has enough
    # information to know if a solution has been found too, which could make
    # this step unecessary.
    
    def is_var_or_lit_var(x) :
      return is_var(x) or is_lit_var(x)
    
    def is_result(value) :
      # a variable bound to another variable is not a result, only lit vars
      # and hard values are
      return is_lit_var(value) or not is_any_var(value)
    
    # the triples which contain the variables we are looking to bind
    var_triples = self.find_specific_var_triples(new_query, self.reqd_bound_vars)
    
    initial_bindings = {}
    for var in find_vars(var_triples, is_var_or_lit_var) :
      initial_bindings[var] = Var(var)
    
    # see if the triples which contain the variables can bind to any of the
    # other triples in the query
    bindings_set = self.bind_vars(var_triples, new_query, False, initial_bindings = initial_bindings)
    if bindings_set == False :
      return False
    
    for bindings in bindings_set :
      resolved = set(k for k, v in bindings.iteritems() if is_result(v))
      if resolved == set(self.reqd_bound_vars) :
        # WARNING: it is possible that multiple bindings will be valid in
        # which case we should return a set of solutions rather than a
        # solution
        return dict((Var(name), v) for name, v in bindings.iteritems())
    
    return False

  #############################################################################
  # SEARCH
  
  def remove_steps_already_taken(self, steps, lineage) :
    """
    lazily filter steps, dropping every step which already appears in lineage
    @arg steps is an iterable of candidate steps
    @arg lineage is the list of steps taken so far
    @returns a generator over the steps which are not in lineage.  Two steps
      are the same step when they use the translation with the same id and
      have equal input_bindings
    """
    for candidate in steps :
      for taken in lineage :
        # the bindings only need comparing when the translation is the same
        if not (candidate['translation']['id'] == taken['translation']['id']) :
          continue
        if candidate['input_bindings'] == taken['input_bindings'] :
          # we've already made this translation once before, skip it
          break
      else :
        yield candidate
  
  def log_root(fn) :
    """
    decorator which wraps the root call of fn in a 'search' debug block
    @arg fn is the method to wrap.  Its first argument must be an object with
      a debug attribute providing open_block and close_block
    @returns the wrapped method.  The debug block is only opened when the
      call passes a truthy root *keyword* argument, a positional root is not
      detected
    """
    import functools
    
    @functools.wraps(fn)
    def log_root_wrapper(self, *args, **kwargs) :
      if not kwargs.get('root') :
        return fn(self, *args, **kwargs)
      
      self.debug.open_block('search')
      try :
        return fn(self, *args, **kwargs)
      finally :
        # close the block even if fn raises so that the debug output does not
        # end up with an unbalanced block
        self.debug.close_block()
    return log_root_wrapper
  
  @log_root
  def search(self, query, new_triples, lineage = [], root = False) :
    """
    depth first search for a list of steps which leads from query to a
    solution.
    follow guaranteed translations and add possible translations to the 
      possible_stack
    this is somewhat of an evaluator ...
    @arg query is the query to start from
    @arg new_triples is a set of triples which are new as of the previous 
      translation.  This next translation must take them into account.  If they
      are not needed, then an earlier step could have gotten there already and
      the most recent step was unnecessary
    @arg lineage is a list of the steps we've taken to get here.  It is never
      modified, each candidate step works on a copy of it
    @arg root is not read in this body, it is only inspected by the log_root
      decorator (and only when passed as a keyword argument)
    @return the lineage (list of steps) of the first path for which
      found_solution succeeds.  Falls off the end (returning None) when no
      candidate step leads to a solution
    """
    
    self.debug.p('query', query)
    
    # find the possible next steps
    steps = self.next_steps(query, lineage, new_triples)
    
    # remove any steps we've already taken
    steps = self.remove_steps_already_taken(steps, lineage)
    
    # remove_steps_already_taken is a generator, so it has to be materialized
    # before it can be tested for emptiness
    if self.show_dead_ends :
      steps = list(steps)
      if not steps :
        # NOTE(review): these go through the bare p() helper rather than
        # self.debug.p, so presumably they print even when debug is off -
        # confirm
        p('dead_end', query)
        p('lineage', [step['translation']['name'] for step in lineage])
        p()
    
    #steps = list(steps)
    #self.debug.open_block('steps')
    #self.debug.p(steps)
    #self.debug.close_block()
    
    # look through all steps recursively to see if they result in a 
    # solution and should be added to the compile_node, the finished 'program'
    for step in steps :
      # one debug block per candidate step.  Every path through the loop body
      # below closes it again exactly once
      # NOTE(review): hash is called with two arguments here, which the builtin
      # hash does not accept - presumably a project helper shadows it, confirm
      self.debug.open_block((step['translation']['name'] or '<unnamed>') + ' ' + color(hash(step['input_bindings'], step['output_bindings'])) + ' ' + prettyquery(step['input_bindings']) + str(time.time() - self.start_time))
      
      # add this step to the lineage, but before that, add any new steps that
      # were injected by the step itself (in the case of a merged path)
      new_lineage = copy.copy(lineage)
      if 'new_lineage' in step :
        for s in step['new_lineage'] :
          if s not in lineage :
            new_lineage.append(s)
      new_lineage += [step]
      
      # if the new information at this point is enough to fulfil the query, done
      # otherwise, recursively continue searching.
      # found_solution is filled with the bindings which bind out_lit_vars from 
      # the query to literal values (strings, numbers, uris, etc)
      # TODO: found_solution might be able to return enough information to 
      # completely remove the partial solution step at the end of compilation
      found_solution = self.found_solution(step['new_query'])
      if found_solution :
        self.debug.p('last_step', step)
        self.debug.p('input', step['translation']['input'])
        self.debug.p('output', step['translation']['output'])
        self.debug.close_block()
        return new_lineage
      else :
        # recur.  root is left at its default so that only the outermost call
        # is wrapped in the 'search' debug block
        ret = self.search(step['new_query'], step['new_triples'], new_lineage)
        self.debug.close_block()
        # the first path which reaches a solution wins, later steps are not
        # tried
        if ret :
          return ret
  
  #############################################################################
  # compile
  
  def make_vars_out_vars(self, query, reqd_bound_vars) :
    """
    mark the variables of query which have to be bound as output variables.
    Every lit var whose name is in reqd_bound_vars is replaced by an OutLitVar
    of the same name, every other variable whose name is in reqd_bound_vars
    is replaced by an OutVar of the same name
    @arg query is the list of triples to change.  The triples are modified in
      place
    @arg reqd_bound_vars is the collection of variable names to mark
    """
    for triple in query :
      for position, value in enumerate(triple) :
        if is_lit_var(value) and value.name in reqd_bound_vars :
          replacement = OutLitVar
        elif is_any_var(value) and value.name in reqd_bound_vars :
          replacement = OutVar
        else :
          continue
        triple[position] = replacement(value.name)
  
  def extract_query_modifiers(self, query) :
    """
    split the modifier triples out of query.  The only modifier recognized is
    a triple whose first value is self.n.query.query and whose second value is
    self.n.query.limit, its third value is the limit
    @arg query is a list of triples
    @returns (new_query, modifiers) where new_query is the list of all triples
      which are not modifiers, in their original order, and modifiers is a
      dict which holds the limit as an int under 'limit' if one was given.  If
      several limits are given, the last one wins
    """
    modifiers = {}
    new_query = []
    for triple in query :
      if triple[0] == self.n.query.query and triple[1] == self.n.query.limit :
        modifiers['limit'] = int(triple[2])
      else :
        new_query.append(triple)
    
    return new_query, modifiers
  
  def compile(self, query, reqd_bound_vars, input = [], output = []) :
    """
    compile query into a list of translation steps which end up binding the
    variables named in reqd_bound_vars
    @arg query is either a string with one triple per line, or anything else
      which self.parser.parse accepts
    @arg reqd_bound_vars is the collection of names of the variables which
      must be bound by the compiled steps
    @arg input is not used
    @arg output is not used
    @returns False if the search found no path within the depth limit,
      otherwise a dict with the keys 'combinations' (the steps found),
      'modifiers' (see extract_query_modifiers) and 'solution_bindings_set'
    @raises Exception if find_specific_var_triples finds no triple in the
      query for reqd_bound_vars
    """
    self.debug.reset()
    self.start_time = time.time()
    
    if isinstance(query, basestring) :
      query = [line.strip() for line in query.split('\n')]
      # drop blank lines.  This has to be a truth test and not an identity
      # test against "", which depends on string interning and never filters
      # anything out of a unicode query
      query = [line for line in query if line]
    query = self.parser.parse(query)
    
    query, modifiers = self.extract_query_modifiers(query)
    
    # TODO: change axpress to parse _vars as outlitvars in the first place
    # this replaces all litvars with outlitvars in query
    # that said, this isn't a costly function in the grand scheme of things
    # replaces all vars in reqd_bound_vars not already litvars with outvars ...
    self.make_vars_out_vars(query, reqd_bound_vars)
    
    #p('query',query)
    
    self.reqd_bound_vars = reqd_bound_vars
    var_triples = self.find_specific_var_triples(query, reqd_bound_vars)
    if var_triples == [] :
      raise Exception("Waring, required bound triples were provided, but not found in the query")
    
    # an iterative deepening search: retry with a larger depth limit until a
    # path is found or the limit reaches max_depth
    self.depth = 6
    steps = None
    max_depth = 12
    while not steps and self.depth < max_depth:
      self.debug.p("depth: %d" % self.depth)
      #self.show_dead_end = self.show_dead_ends and self.depth == max_depth - 1
      # note that this switches dead end reporting off on the instance, it is
      # not switched back on afterwards
      self.show_dead_ends = False
      self.partials = defaultdict(list)
      steps = self.search(query, query, lineage = [], root = True)
      self.depth += 1
    
    # if there were no paths through the search space we are done here
    if not steps :
      return False

    #p('steps', steps)
    
    """
    at one point, steps was allowed to return many paths through the 
    translation space and the rest of this code would make sure that the 
    interleaving paths didn't wind up causing translations to be run twice
    or run when they were not necessary, etc.  With DFS, this is no longer an
    issue, and we have moved away from attempting to run every guaranteed path
    and instead run just one of them, or the first few.  I've discovered that 
    finding all paths is much more difficult because there are many ways which
    translations can be combined into infite loops that are hard to detect
    """

    solution_bindings_set = {}
    for step in steps :
      # bindings to plain vars are dropped from the compiled steps
      step['input_bindings'] = dict([(var, binding) for (var, binding) in step['input_bindings'].iteritems() if not is_var(binding)])
      
      step['output_bindings'] = dict([(var, binding) for (var, binding) in step['output_bindings'].iteritems() if not is_var(binding)])
      
      # figure out if any parts of the output of this step satisfy part of 
      # the solution
      var_triples = self.find_specific_var_triples(step['new_query'], self.reqd_bound_vars)
      partial_bindings = self.find_partial_solution(
        var_triples, step['new_query']
      )

      # keep track of which variables will end up holding the solution
      solution_bindings_set.update(partial_bindings)
      
      # get rid of extra stuff in steps
      del step['new_query']
      del step['new_triples']
      if 'new_lineage' in step :
        del step['new_lineage']

    # a single combination: the last step, depending on all steps before it
    ret = {
      'combinations' : [[{
        'depends' : steps[:-1],
        'step' : steps[-1]
      }]],
      'modifiers' : modifiers,
      'solution_bindings_set' : [solution_bindings_set],
    }
    #p('ret', ret)
    return ret
예제 #4
0
class Compiler:
    MAYBE = 'maybe'

    def __init__(self, n=None, debug=False):
        """Set up namespaces, the parser and empty translation registries.

        @arg n is the Namespaces object to use; a new one is created when it
            is missing or falsy
        @arg debug is accepted but not read
        """
        self.n = n if n else Namespaces()

        # prefixes used by queries and translations
        prefixes = (
            ('query', '<http://dwiel.net/axpress/query/0.1/>'),
            ('var', '<http://dwiel.net/axpress/var/0.1/>'),
            ('tvar', '<http://dwiel.net/axpress/translation/var/0.1/>'),
            ('bnode', '<http://dwiel.net/simplesparql/bnode/0.1/>'),
            ('lit_var', '<http://dwiel.net/express/lit_var/0.1/>'),
        )
        for prefix, uri in prefixes:
            self.n.bind(prefix, uri)

        self.parser = Parser(self.n)

        # registered translations, also indexed by name and by id
        self.translations = []
        self.translations_by_name = {}
        self.translations_by_id = {}

        # counters behind next_num and next_translation_id
        self._next_num = 0
        self._next_translation_id = 0

        self.match_strings_both_ways = False

        # _debug keeps the real debugger so that debug_on can restore it
        debugger = Debug()
        self._debug = debugger
        self.debug = debugger

        self.show_dead_ends = True

    def next_num(self):
        """ for generating unique lit_var name """
        self._next_num += 1
        return self._next_num

    def debug_on(self):
        self.debug = self._debug

    def debug_off(self):
        """Switch debug output off by installing a fresh DebugNop."""
        silent = DebugNop()
        self.debug = silent

    def rule(self, name, input, output, fn=None, input_function=None,
             **kwargs):
        """Shorthand for the full length register_translation.

        @arg name is the name of the translation, it must be a string
        @arg input is stored under the 'input' key
        @arg output is stored under the 'output' key
        @arg fn is stored under the 'function' key
        @arg input_function is stored under the 'input_function' key
        @arg kwargs are any further translation options; they override the
            options above when the keys collide
        """
        assert isinstance(name, basestring)
        options = dict(
            name=name,
            input=input,
            output=output,
            function=fn,
            input_function=input_function,
        )
        options.update(kwargs)
        self.register_translation(options)

    def register_translation(self, translation):
        """Validate, normalize and register a translation.

        The translation dict is filled in in place (parsed 'input' and
        'output', 'step_size', 'output_vars', 'constant_vars', 'in_lit_vars',
        'filename' and 'id') and then added to self.translations,
        self.translations_by_name and self.translations_by_id.

        @arg translation is a dict which must have the keys 'input', 'output'
            and 'name'
        @raises Exception if one of the required keys is missing
        """
        import inspect

        # make sure all of the required keys are present
        required = ['input', 'output', 'name']
        missing = [key for key in required if key not in translation]
        if missing:
            raise Exception('translation is missing keys: %s' %
                            prettyquery(missing))

        # without a function (or a multi_function) fall back to the identity
        if 'function' not in translation or translation['function'] is None:
            if 'multi_function' not in translation:
                translation['function'] = lambda x: x

        # an explicit None input_function is the same as not providing one
        if ('input_function' in translation
                and translation['input_function'] is None):
            del translation['input_function']

        translation['step_size'] = translation.get('step_size', 1)

        # parse any string expressions
        translation['input'] = self.parser.parse_query(translation['input'],
                                                       reset_bnodes=False)
        translation['output'] = self.parser.parse_query(translation['output'],
                                                        reset_bnodes=False)

        # figure out which variables are in both the input and output of the
        # translation
        invars = find_vars(translation['input'], find_string_vars=True)
        outvars = find_vars(translation['output'], find_string_vars=True)
        outvars = outvars.union(set(translation.get('add_output_vars', [])))
        translation['output_vars'] = outvars
        translation['constant_vars'] = list(invars.intersection(outvars))

        translation['in_lit_vars'] = find_vars(translation['input'],
                                               is_lit_var,
                                               find_string_vars=True)

        # record the file of the frame two levels up (the caller of rule()
        # when registering through rule()).  Fall back to the direct caller,
        # and finally to None, when that frame does not exist instead of
        # failing on None.f_code
        frame = inspect.currentframe()
        caller = frame.f_back if frame is not None else None
        outer = caller.f_back if caller is not None else None
        origin = outer if outer is not None else caller
        if origin is not None:
            filename = origin.f_code.co_filename
        else:
            filename = None

        translation['filename'] = filename
        translation['id'] = self.next_translation_id()
        self.translations.append(translation)
        self.translations_by_name[translation['name']] = translation
        self.translations_by_id[translation['id']] = translation

    #def hash_triple(self, triple) :
    #return Triple(triple)

    #def hash_triples(self, triples) :
    #return map(self.hash_triple, triples)

    def next_translation_id(self):
        self._next_translation_id += 1
        return self._next_translation_id

    def translation_can_follow(self, this, next):
        """Tell whether the translation next can be applied after this.

        That is the case if any of the output triples of this matches any of
        the input triples of next.

        @arg this is the translation which runs first
        @arg next is the translation which might follow
        @returns True if a matching pair of triples exists, otherwise False
        """
        return any(
            self.triples_match(wanted, produced)
            for produced in this['output']
            for wanted in next['input'])

    def compile_translations(self):
        self.match_strings_both_ways = True
        self.translation_matrix = {}
        for id, translation in self.translations_by_id.iteritems():
            self.translation_matrix[id] = [
                t for t in self.translations
                if self.translation_can_follow(translation, t)
            ]
            #p('t', translation['name'], [t['name'] for t in self.translation_matrix[id] ])
        #print 'avg', sum(len(ts) for ts in self.translation_matrix.values())/float(len(self.translation_matrix))
        #print [len(ts) for ts in self.translation_matrix.values()]
        #print [t['name'] for t in self.translations if len(t['input']) != 1]
        self.match_strings_both_ways = False

    #############################################################################
    # MATCHING

    def find_matches(self, value, qvalue):
        """Return whatever StringMatch.match yields for value and qvalue."""
        matches = StringMatch.match(value, qvalue)
        return matches

    def string_matches(self, value, qvalue):
        """Boolean version of find_matches.

        Note that [] denotes a successful basic string match, just without
        any variables to match against, so only None and False count as a
        failure.
        """
        outcome = self.find_matches(value, qvalue)
        failed = outcome in [None, False]
        return not failed

    def values_match(self, value, qvalue):
        """Test one value of a translation triple against one query value.

        @arg value is the value from the translation; it may be a variable,
            a list of alternatives, a string pattern or any other value
        @arg qvalue is the value from the query
        @returns a truth value.  Note that plain values which are not equal
            fall off the end, so the result is None rather than False
        @raises Exception if value is a variable of an unexpected kind
        """
        if is_any_var(value):
            # a plain var matches anything
            if is_var(value):
                return True

            # a meta var only matches variables which are not lit vars
            if is_meta_var(value):
                if not is_any_var(qvalue):
                    return False
                return is_any_var(qvalue) and not is_lit_var(qvalue)

            # a lit var matches lit vars and anything which isn't a variable
            if is_lit_var(value):
                if not is_any_var(qvalue):
                    return True
                return is_lit_var(qvalue) or not is_any_var(qvalue)

            if is_out_lit_var(value):
                # not often ... probably only in the if matches(q,v) or
                # (v,q) ...
                if is_lit_var(qvalue):
                    return True
                return not is_any_var(qvalue)

            raise Exception('shouldnt get here')

        if is_out_lit_var(qvalue):
            return True

        # a list holds alternatives, one matching alternative is enough
        if isinstance(value, list):
            return any(self.values_match(v, qvalue) for v in value)

        if isstr(value) and isstr(qvalue):
            forward = self.string_matches(value, qvalue)
            if self.match_strings_both_ways:
                return forward or self.string_matches(qvalue, value)
            return forward

        if value == qvalue:
            return True

    _triples_hash = {}

    def triples_match(self, triple, qtriple):
        #assert isinstance(triple, Triple) and isinstance(qtriple, Triple)
        key = triple.hash + qtriple.hash
        if key in self._triples_hash:
            return self._triples_hash[key]

        for tv, qv in izip(triple, qtriple):
            if not self.values_match(tv, qv):
                self._triples_hash[key] = False
                return False
        self._triples_hash[key] = True
        return True

    def find_triple_match(self, triple, query):
        """Tell whether triple matches at least one triple of query.

        @returns True if triples_match succeeds for some triple of query,
            otherwise False
        """
        return any(self.triples_match(triple, qtriple) for qtriple in query)

    def partial_match_exists(self, pattern, reqd_triples):
        # check that one of the reqd_triples match part of the query
        for triple in pattern:
            if self.find_triple_match(triple, reqd_triples):
                return True
        return False

    #############################################################################
    # BINDINGS
    def mul_bindings_set(self, bs1, bs2):
        """Combine every binding of bs1 with every binding of bs2.

        @arg bs1 is a list of bindings
        @arg bs2 is a list of bindings
        @returns a new list with one merged binding per (b1, b2) pair.  Each
            merged binding starts as a copy of b2 and is then updated with
            the entries of b1, so b1 wins when both bind the same name.  The
            bindings in bs1 and bs2 are left untouched
        """
        import copy

        new_bs = []
        for b1 in bs1:
            for b2 in bs2:
                # work on a copy: updating b2 itself would put the very same
                # object into the result once per b1, each one overwriting
                # the values of the one before
                merged = copy.copy(b2)
                for name, value in b1.iteritems():
                    merged[name] = value
                new_bs.append(merged)
        return new_bs

    def get_binding(self, triple, ftriple):
        """Find the bindings under which triple matches the fact ftriple.

        The two triples are walked position by position.  Every position
        either adds to the bindings collected so far (through
        mul_bindings_set), leaves them alone, or rejects the whole fact.

        @arg triple is the pattern triple (t is one of its values)
        @arg ftriple is the fact triple (q is one of its values)
        @returns a list of Bindings.  The list is empty if the triples do not
            match, and also if they match without binding anything
        """
        bindings = [Bindings()]
        for i, (t, q) in enumerate(izip(triple, ftriple)):
            if is_any_var(t) and self.values_match(t, q):
                # two lit_vars in the front of a triple don't match
                if i == 0 and is_lit_var(t) and is_lit_var(q):
                    return []
                # if the same var is trying to be bound to two different values,
                # not a valid binding
                # NOTE(review): the membership test uses the variable t itself
                # while the lookup and every insertion use t.name - confirm
                # that Bindings supports both, otherwise this check never fires
                if t in bindings[0] and bindings[0][t.name] != q:
                    return []
                bindings = self.mul_bindings_set(bindings,
                                                 [Bindings({t.name: q})])
            elif (is_lit_var(t) or is_var(t)) and is_var(q):
                # prefer_litvars is set in bind_vars.  In some cases we never want to
                # bind a litvar to a var because it means we would be loosing
                # information
                if self.prefer_litvars and is_lit_var(t):
                    # bind in the opposite direction: the fact's var is bound
                    # to the pattern's lit var
                    assert q.name not in bindings[0]
                    bindings = self.mul_bindings_set(bindings,
                                                     [Bindings({q.name: t})])
                else:
                    assert t.name not in bindings[0]
                    bindings = self.mul_bindings_set(bindings,
                                                     [Bindings({t.name: q})])
            elif isstr(t) and isstr(q):
                # BUG: if there is more than one way to match the string with the
                # pattern this will only return the first
                # I believe that this bug is fixed in 0647dad1 and nearby commits
                ret = self.find_matches(str(t), str(q))

                # a failed match (None, False) and a match without variables
                # ([]) both leave the bindings as they are
                if ret not in [None, False, []]:
                    for name, value in ret[0].iteritems():
                        assert unicode(name) not in bindings[0]

                    bindings = self.mul_bindings_set(bindings,
                                                     map(Bindings, ret))
            elif isinstance(t, list):
                # NOTE: copy and paste from above ...
                # t is a list of alternative patterns, each one is tried
                # against q
                for ti in t:
                    ret = self.find_matches(str(ti), str(q))
                    if ret not in [None, False, []]:
                        for name, value in ret[0].iteritems():
                            assert unicode(name) not in bindings[0]

                        bindings = self.mul_bindings_set(
                            bindings, map(Bindings, ret))
            elif t != q:
                # anything else has to be equal
                return []
            elif is_lit_var(t) and is_out_lit_var(q):
                # NOTE(review): this branch is only reached when t != q was
                # false, which looks unlikely for a lit var and an out lit
                # var - possibly dead code, confirm
                bindings = self.mul_bindings_set(bindings,
                                                 [Bindings({t.name: q})])
        # nothing was bound at all: report that the same way as no match
        if len(bindings) == 1 and not bindings[0]:
            return []
        return bindings

    def find_bindings_for_triple(self, triple, facts, reqd_facts):
        """Collect the distinct bindings of triple against every fact.

        @arg triple is the pattern triple
        @arg facts is the list of fact triples to bind against
        @arg reqd_facts is the list of facts of which one must be used.  If
            it is empty or False every fact counts as required
        @returns a list of bindings without duplicates.  A binding has
            matches_reqd_fact set if any of the facts which produced it is a
            required fact
        """
        ret_bindings = []
        for ftriple in facts:
            bindings = self.get_binding(triple, ftriple)
            if not reqd_facts or ftriple in reqd_facts:
                for binding in bindings:
                    binding.matches_reqd_fact = True

            for binding in bindings:
                for known in ret_bindings:
                    if known is binding or known == binding:
                        # the same binding was already found through another
                        # fact: carry the flag over to the entry which is
                        # actually returned, the duplicate itself is dropped
                        known.matches_reqd_fact = (
                            known.matches_reqd_fact
                            or binding.matches_reqd_fact)
                        break
                else:
                    ret_bindings.append(binding)

        return ret_bindings

    def merge_bindings(self, a, b):
        """Merge the bindings a and b.  Used in unification.

        A key which is bound to different values in a and b is a conflict,
        unless one of the two values is an out lit var and the other is not
        a variable at all; in that case the non variable value is kept.

        @arg a is a dictionary of bindings
        @arg b is a dictionary of bindings
        @returns False on a conflict, otherwise a new Bindings object which
            holds the entries of both a and b
        """
        # WARNING: this should probably return the new binding so that
        # is_out_lit_var never clobbers not is_any_var
        merged = Bindings()
        for key, a_value in a.iteritems():
            if key not in b or not (b[key] != a_value):
                merged[key] = a_value
                continue

            b_value = b[key]
            if is_out_lit_var(b_value) and not is_any_var(a_value):
                merged[key] = a_value
            elif not is_any_var(b_value) and is_out_lit_var(a_value):
                merged[key] = b_value
            else:
                return False

        # add bindings whose keys are in b, but not a
        for key, b_value in b.iteritems():
            if key not in merged:
                merged[key] = b_value

        # if a or b are Bindings objects, merge their matches_reqd_fact values
        a_flag = isinstance(a, Bindings) and a.matches_reqd_fact
        merged.matches_reqd_fact = (
            a_flag or (isinstance(b, Bindings) and b.matches_reqd_fact))

        return merged

    def merge_bindings_sets(self, a, b):
        """Merge every binding of a with every binding of b.

        @arg a is a list of bindings
        @arg b is a list of bindings
        @returns the list of the distinct merged bindings of all pairs which
            did not conflict (see merge_bindings).  The list is empty if a or
            b is empty or if every pair conflicted
        """
        # see if any of the next_bindings fit with the existing bindings
        new_bindings = []
        for bbinding in b:
            for abinding in a:
                new_binding = self.merge_bindings(abinding, bbinding)
                # merge_bindings returns False when the bindings conflicted
                # WARNING: this isn't going to copy the values of the bindings!!!
                if new_binding != False and new_binding not in new_bindings:
                    new_bindings.append(new_binding)

        return new_bindings

    def bind_vars(self, translation, facts, reqd_facts, initial_bindings={},
                  prefer_litvars=False):
        """Bind the variables of translation against facts.

        @arg translation is a list of triples (the input part of the
            translation)
        @arg facts is a list of triples (the currently known facts)
        @arg reqd_facts is a list of triples of which one must be used in the
            binding, or False for output unification, where partial bindings
            are good enough
        @arg initial_bindings is a set of bindings which is merged into all
            returned bindings.  This is used by output unification to ensure
            that some of the input bindings are preserved
        @arg prefer_litvars is stored on self for get_binding to read
        @returns False if no binding was found, otherwise the list of
            bindings of var to value
        """
        self.prefer_litvars = prefer_litvars

        # in output unification reqd_facts is False - in that case we don't
        # care if every triple binds
        full_match = reqd_facts != False

        # loop through each triple.  find possible bindings for each triple.
        # If this triple's bindings conflict with previous triple's bindings
        # then return false.  It does not bind, unless we are only looking for
        # partial matches like for output unification.  In that case however,
        # I think that technically, there should be a split there so that we
        # return both sets of possible bindings rather than just the first one.
        # WARNING: likely bug,  See above comment

        # check that all of the translation inputs match part of the query
        if full_match and not all(
                self.find_triple_match(triple, facts)
                for triple in translation):
            return False

        bindings = [Bindings()]
        for ttriple in translation:
            candidates = self.find_bindings_for_triple(
                ttriple, facts, reqd_facts)
            merged = self.merge_bindings_sets(bindings, candidates)

            if len(merged) > 0:
                bindings = merged
            elif full_match:
                return False

        # merge with initial_bindings after collecting bindings from the
        # facts.  It is more complex to start with the initial_bindings than
        # to end with them
        if initial_bindings:
            bindings = self.merge_bindings_sets(bindings, [initial_bindings])

        # get a set of all vars used in the translation
        translation_vars = find_vars(translation, find_string_vars=True)

        # if there are no vars, this does still match, but there are no
        # bindings
        if len(translation_vars) == 0:
            return bindings

        # keep only the bindings which contain bindings for all of the vars
        # and match a reqd_triple
        if full_match:
            bindings = [
                candidate for candidate in bindings
                if len(candidate) == len(translation_vars)
                and candidate.matches_reqd_fact
            ]

        # if there are no bindings, failed to find a match
        if len(bindings) == 0:
            return False

        return bindings

    #############################################################################
    # testtranslation and next_steps

    def testtranslation(self, translation, query, reqd_triples):
        """
    @returns matches, bindings
      matches is True if the translation is guaranteed to match the query.  It 
        is self.MAYBE if the translation might match the query and False if it
        is guaranteed to not match the query.
      bindings is the set of bindings which allow this translation to match the
        query
    """
        # HEURISTIC
        # make sure that the translation's input matches part of the reqd_triples
        # otherwise, not a new path
        # right now, this is also a required step because this is the only way that
        # testtranslation will return False

        if reqd_triples and not self.partial_match_exists(
                translation['input'], reqd_triples):
            return False, None
        ret = self.bind_vars(translation['input'], query, reqd_triples)
        # if a partial match did exist, but no bindings could be found, then this
        # was a partial match
        if ret == False:
            # look for any triples that matched.  If there are any, than a partial
            # match has been found.  Partial matches are recorded because we might
            # want to merge this partial match with another to get a full match
            matched_triples = set()
            for i, triple in enumerate(translation['input']):
                if self.find_triple_match(triple, query):
                    matched_triples.add(i)

            return "partial", matched_triples
        else:
            return True, ret

    # return all triples which have at least one var in vars
    def find_specific_var_triples(self, query, vars):
        """Select the triples of query which mention one of vars.

        @arg query is a list of triples
        @arg vars is the collection of variable names to look for
        @returns the list of the triples which hold at least one out lit var
            whose name is in vars, in their original order
        """
        def mentions_wanted_var(triple):
            for x in triple:
                if is_out_lit_var(x) and x.name in vars:
                    return True
            return False

        return [triple for triple in query if mentions_wanted_var(triple)]

    def next_steps(self, orig_query, lineage, reqd_triples):
        """
        Generate the candidate steps which can follow orig_query.

        This is a generator.  Every translation in the candidate queue is
        tested against orig_query; for every usable combination of input and
        output bindings one step dict is yielded with the keys
        'input_bindings', 'output_bindings', 'translation', 'new_triples' and
        'new_query' (plus 'new_lineage' when the step comes from merging two
        partially matched search paths).

        Side effects: partial matches are appended to self.partials and
        self.next_num() is called to generate unique variable names.

        @arg orig_query the original query in triples set form
        @arg lineage the lineage (list of steps) already followed
        @arg reqd_triples triples handed to testtranslation as the triples a
          translation's input must partially match; their middle values are
          also used to widen the translation queue
        @returns an iterator over step dicts.  Steps which were already taken
          are not filtered out by this method.
        """
        #self.debug.p('orig_query', orig_query)
        # NOTE(review): guaranteed_steps is assigned but never read in this method
        guaranteed_steps = []

        # the translation_queue is a list of translations that will be searched.
        # TODO if there is a lineage, translation_queue should be a
        # combination of the tq from the end of both merged paths
        # NOTE(review): in the first branch translation_queue is the object
        # stored in self.translation_matrix (no copy is made), so the append
        # further down presumably also changes the matrix entry - confirm
        # that this is intended
        if lineage and 'new_lineage' not in lineage[-1]:
            translation_queue = self.translation_matrix[
                lineage[-1]['translation'].get('id')]
        else:
            translation_queue = list(self.translations)
        # NOTE setting translation_queue to all translations all the time causes
        # errors.  This is because we can go in all kinds of weird directions ...

        # NOTE: this is a definite hack.  The problem is that the
        # translation_matrix can't account for situations where a variable gets
        # turned into a litvar.  That variable can be in many triples and all of
        # those triples must be used as initial pruning instead of just the output
        # triples.
        # WARNING: Here, I am only using the middle value of the triple as a test.
        # It will fail when the above situation happens and variables are used in
        # the 2nd position (property) of a triple.
        for triple in reqd_triples:
            for trans in self.translations:
                if triple[1] in [t[1] for t in trans['input']]:
                    if trans not in translation_queue:
                        translation_queue.append(trans)

        # HEURISTIC: stop DFS search at self.depth
        if lineage:
            # the depth of a lineage is the sum of the step_size of its
            # translations, not the number of steps
            lineage_depth = sum(s['translation']['step_size'] for s in lineage)
            self.debug.p('lineage_depth', lineage_depth)
            if lineage_depth >= self.depth:
                translation_queue = []

            # OPTIMIZATION: skip this translation if it is the inverse of the last
            # translation
            # WARNING: not 100% sure this is always going to work, but it does for
            # now ...
            def test_for_inverse(translation):
                # False (drop the translation) when its name equals the
                # 'inverse_function' entry of the last translation taken
                inverse_function = lineage[-1]['translation'].get(
                    'inverse_function')
                if inverse_function:
                    if inverse_function == translation['name']:
                        return False
                return True

            translation_queue = filter(test_for_inverse, translation_queue)

        # show the list of translations that show up in the queue
        #self.debug.p('tq', [t['name'] for t in translation_queue])

        def merge_partial(translation, matched_triples):
            # this function gets called if this translation was partially matched.
            # It is a generator yielding (new_query, translation, bindings_set,
            # past_lineage) for every merge with an earlier partial match which
            # lets the translation match completely.

            # NOTE(review): found_merge is set below but never read
            found_merge = False

            # all of the search paths that have partially matched this translation
            past_partials = self.partials[translation['id']]
            # NOTE: use heuristics to pick which past_partial to try first:
            #     *** keep stats about p of tip of branch1 combining with tip of
            #         branch2 to fulfil this translation
            #     * comparing how recently the two paths diverged might correlate
            #     * prefer combined_bindings_set with more litvars
            # TODO: n-way merges instead of just 2-way merges ... ouch!
            #      just found a case that needs this
            #self.debug.p('past_partials', len(past_partials))
            for past_lineage, past_query, past_matched_triples in past_partials:
                # make sure that the triples matched by these two partials
                # together cover at least all input triples
                if len(past_matched_triples.union(matched_triples)) == len(
                        translation['input']):
                    # OPTIMIZATION make sure that past_query isn't a subset of orig_query
                    # NOTE: might be faster to compare lineages instead of queries
                    if all(triple in orig_query for triple in past_query):
                        continue

                    # merge past_query and orig_query:
                    # as we merge past_query and orig_query, if a litvar and a var bind,
                    # keep the litvar, it has a known computation, where the var does not
                    # NOTE: there may be problems binding litvars to other litvars.  The
                    # two could have different values, but we won't know until runtime.
                    # NOTE(review): elsewhere in this file a False result of
                    # bind_vars is handled explicitly; here the result is
                    # iterated directly - confirm bind_vars cannot return
                    # False for this call
                    merged_bindings_set = self.bind_vars(orig_query,
                                                         past_query,
                                                         False, {},
                                                         prefer_litvars=True)

                    # all the ways orig_query and past_query can be merged
                    for merged_bindings in merged_bindings_set:
                        # see if any variables are mapped to twice ... this may be a big hack
                        if len(merged_bindings.values()) != len(
                                set(merged_bindings.values())):
                            continue

                        new_query, new_triples = sub_var_bindings_track_changes(
                            orig_query + past_query, merged_bindings)

                        new_query = remove_duplicate_triples(new_query)

                        # test to see if this new merged query has enough information to
                        # trigger this translation
                        ret, more = self.testtranslation(
                            translation, new_query, new_triples)
                        if ret == True:
                            found_merge = True
                            # the debug block stays open while the consumer
                            # processes the yielded merge and is closed when
                            # this generator is resumed
                            self.debug.open_block('merge for ' +
                                                  translation['name'])
                            self.debug.p('orig_query', orig_query)
                            self.debug.p('past_query', past_query)
                            self.debug.p('merged_bindings_set',
                                         merged_bindings_set)
                            yield new_query, translation, more, past_lineage
                            self.debug.close_block()

            # add this instance to past partials.  This statement follows the
            # loop of a generator, so it only runs once the consumer has
            # iterated past the last merge.
            #p('storing partial', translation['name'], matched_triples)
            self.partials[translation['id']].append(
                (lineage, orig_query, matched_triples))

        def test_and_merge():
            """
            Test each translation in the queue against the current query.

            Yields (query, translation, bindings_set, new_lineage) tuples.  A
            full match yields orig_query with new_lineage set to False; a
            partial match yields every merge found by merge_partial.
            """
            for translation in translation_queue:
                ret, more = self.testtranslation(translation, orig_query,
                                                 reqd_triples)
                if ret == "partial":
                    # in this case more is the set of indices of the input
                    # triples that matched
                    for x in merge_partial(translation, more):
                        yield x
                elif ret == False:
                    continue
                else:
                    # in this case more is a bindings_set
                    yield orig_query, translation, more, False

        # main loop
        for query, translation, bindings_set, new_lineage in test_and_merge():
            # the 2nd value from testtranslation is bindings_set if we've gotten here

            # we've found a match, now we just need to find the bindings.  This is
            # the step where we unify the new information (generated by output
            # triples) with existing information.
            for bindings in bindings_set:
                # input_bindings map from translation space to query space
                input_bindings = bindings
                # output_bindings map from translation space to query space
                output_bindings = {}

                # every in_lit_var of the translation must be bound to
                # something other than a plain var, otherwise these bindings
                # are skipped
                input_bindings_vars = [
                    var for (var, binding) in input_bindings.iteritems()
                    if not is_var(binding)
                ]
                missing_vars = translation['in_lit_vars'] - set(
                    input_bindings_vars)
                if len(missing_vars):
                    continue

                # initial_bindings are the bindings that we already know from the
                # input unification that must also hold true for output unification
                # some of the initial_binding vars don't appear in the output triples
                # so we can get rid of them
                output_triples = translation['output']
                initial_bindings = dict(
                    (unicode(name), bindings[name])
                    for name in translation['constant_vars']
                    if name in bindings and name in translation['output_vars'])

                # used in a couple places later on
                output_lit_vars = find_vars(
                    translation['output'], is_lit_var).union(
                        set(translation.get('add_output_vars', [])))

                # if the translation has an input_function, run it here to see if these
                # input_bindings pass the test
                # NOTE(review): only the presence of the key is checked, so an
                # 'input_function' entry which is None would fail when called -
                # verify that such entries never reach this point
                if 'input_function' in translation:
                    if not translation['input_function'](input_bindings):
                        continue

                # unify output_triples with query
                if not translation['output']:
                    # if there is no output, simply replace input vars with litvars
                    # since after the translation is applied they will have values
                    # NOTE(review): 'add_output_vars' is indexed directly here
                    # but read with .get() above - confirm the key always
                    # exists when the output is empty
                    output_bindings_set = [{
                        unicode(name):
                        LitVar(input_bindings[name].name)
                    } for name in translation['add_output_vars']]
                else:
                    output_bindings_set = self.bind_vars(
                        output_triples,
                        query,
                        False,
                        initial_bindings=initial_bindings)
                # if no unification is found, just use the initial_bindings
                if output_bindings_set == False:
                    output_bindings_set = [initial_bindings]

                for output_bindings in output_bindings_set:
                    # if var is a lit var in the output_triples, then its output bindings
                    # must bind it to a new variable since it will be computed and set by
                    # the translation function and may not have the same value any more
                    # WARNING: I think this means that bind_vars might not do the
                    # right thing if it thinks that it can bind whatever it wants to
                    # lit_vars.  lit_vars for example shouldn't bind to literal values
                    # this might also have to do with a schema, some things can be bound
                    # again (a.is), whereas some can not (u.inches)

                    # if get_bindings found variable to variable matches, we will need
                    # to alter the triples in the existing query (not just add triples)
                    # unified_bindings maps old query variables to new query variables
                    unified_bindings = {}
                    for var in output_lit_vars:
                        new_lit_var = LitVar(var + '_out_' +
                                             str(self.next_num()))
                        if var in output_bindings:
                            if is_any_var(output_bindings[var]):
                                if not is_out_lit_var(output_bindings[var]):
                                    unified_bindings[output_bindings[var].
                                                     name] = new_lit_var
                        # only replace output_bindings with a lit var if
                        # output_bindings isn't already bound to a literal value,
                        # like a string or an int
                        if var not in output_bindings or is_any_var(
                                output_bindings[var]):
                            output_bindings[var] = new_lit_var

                    # make sure all vanilla vars have unique names
                    for var in find_vars(translation['output'], is_var):
                        if var not in output_bindings:
                            output_bindings[var] = Var(var + '_' +
                                                       str(self.next_num()))

                    # generate the new query by adding the output triples with
                    # output bindings substituted in
                    new_triples = sub_var_bindings(translation['output'],
                                                   output_bindings)

                    new_query, new_query_new_triples = sub_var_bindings_track_changes(
                        query, unified_bindings)

                    # new_query: rewritten query plus the new output triples
                    # new_triples: output triples plus the triples reported by
                    # sub_var_bindings_track_changes
                    new_query.extend(new_triples)
                    new_triples.extend(new_query_new_triples)

                    # remove output_bindings which are not constant_vars or lit_vars (in
                    # the translation's output triples.  An example of an instance when
                    # a variable would be in output_bindings that we would remove here is
                    # when an output_triple has normal variables which are not used in
                    # the input, but also aren't bound to anything by the translation fn.
                    # we want to know if that variable binds to anything for creating
                    # new_triples above, but as far as the evaluator is concerned, it has
                    # no value and thus no output binding
                    output_bindings = dict(
                        (var, binding)
                        for var, binding in output_bindings.iteritems()
                        if var in output_lit_vars
                        or var in translation['constant_vars'])

                    new_query = remove_duplicate_triples(new_query)

                    #self.debug.p('new_triples', new_triples)
                    #self.debug.p('new_query', new_query)
                    #self.debug.p('input_bindings', input_bindings)
                    #self.debug.p('output_bindings', output_bindings)

                    step = {
                        'input_bindings': input_bindings,
                        'output_bindings': output_bindings,
                        'translation': translation,
                        'new_triples': new_triples,
                        'new_query': new_query,
                    }
                    # new_lineage is False for a direct match and the lineage
                    # of the merged-in path for a merged partial match
                    if new_lineage:
                        step['new_lineage'] = new_lineage
                    yield step

    #############################################################################
    # find solution

    def find_solution_values_match(self, tv, qv):
        """
        Decide whether the pattern value tv matches the query value qv.

        @arg tv is a value taken from the pattern triple
        @arg qv is the value it is compared against
        @returns {tv: qv} when an out var or out lit var pattern matches,
          otherwise a true or false value without bindings
        """
        # a pattern which is not a variable only matches an equal value
        if not is_any_var(tv):
            return tv == qv

        if is_out_lit_var(tv):
            # an out_lit_var pattern accepts a lit_var or a literal, but no
            # other kind of variable
            if is_lit_var(qv) or not is_any_var(qv):
                return {tv: qv}
            return False

        if is_out_var(tv):
            # not sure if this is really right ...
            if is_any_var(qv) and tv.name == qv.name:
                return {tv: qv}
            return False

        if is_out_lit_var(qv):
            # This happens when a query is looking for a literal variable
            # and a translation is willing to provide a variable, just not
            # a literal one.  (see lastfm similar artist output variable
            # similar_artist) and a query wanting it to be literal
            return False

        if is_lit_var(tv) and is_lit_var(qv):
            return True

        if is_any_var(qv):
            # two remaining variables match when they share a name
            return tv.name == qv.name

        return False

    def find_solution_triples_match(self, triple, qtriple):
        """
        Match the pattern triple against qtriple value by value.

        @arg triple is the pattern
        @arg qtriple is the triple to compare it with
        @returns False as soon as one pair of values does not match, the
          collected bindings if any value produced bindings, True otherwise
        """
        collected = {}
        for pattern_value, query_value in izip(triple, qtriple):
            outcome = self.find_solution_values_match(pattern_value,
                                                      query_value)
            if not outcome:
                return False
            # a dict carries bindings; a bare True carries none
            if isinstance(outcome, dict):
                collected.update(outcome)

        if collected:
            return collected
        return True

    def find_solution_triple(self, triple, facts):
        """
        Look for the first triple in facts which matches the pattern triple.

        @arg triple is the pattern to look for
        @arg facts is an iterable of triples to search
        @returns (match, fact) for the first fact for which
          find_solution_triples_match returns a true value, where match is
          that value (the bindings, or True when nothing was bound);
          (False, None) if no fact matches
        """
        for ftriple in facts:
            bindings = self.find_solution_triples_match(triple, ftriple)
            # any true result is a match; it is passed through unchanged
            if bindings:
                return bindings, ftriple
        return False, None

    def find_solution(self, var_triples, facts):
        """
        Check that every triple of var_triples has a match in facts.

        @arg var_triples is the set of triples which all need a match for a
          solution to exist
        @arg facts is the collection of triples to find the matches in
        @returns False if any triple has no match, otherwise the merged
          bindings of all matches, or True if no match produced bindings
        """
        bindings = {}
        for triple in var_triples:
            new_bindings, _ = self.find_solution_triple(triple, facts)
            if not new_bindings:
                return False
            # find_solution_triple reports a match without bindings as a bare
            # True, which must not be merged into the bindings dict
            if new_bindings is not True:
                bindings.update(new_bindings)
        return bindings or True

    def find_partial_solution(self, var_triples, facts):
        """
        Collect the bindings of every triple of var_triples which has a match
        in facts; triples without a match are ignored.

        @arg var_triples is the set of pattern triples
        @arg facts is the collection of triples to find matches in
        @returns a dict mapping variable names to the values they were bound
          to, where bound variables are converted to plain Vars
        """
        found = {}
        for triple in var_triples:
            match, _ = self.find_solution_triple(triple, facts)
            # skip misses (false values) and matches without bindings (True)
            if not match or match == True:
                continue
            found.update(match)

        # make bindings just to the variable name not the full URI (if the
        # value of the binding is a variable, make sure it is in the n.var
        # namespace).  At one point, just the variable name was used, but
        # sometimes the compiler can actually find hard values for the
        # bindings (no evaluation required) and so we must use a full uri
        def as_plain_var(value):
            return Var(value.name) if is_any_var(value) else value

        by_name = {}
        for var, value in found.iteritems():
            by_name[var.name] = as_plain_var(value)
        return by_name

    def found_solution(self, new_query):
        """
        Test whether new_query binds every variable in self.reqd_bound_vars.

        @arg new_query is the query to look for a solution in
        @returns a dict mapping Var(name) to the bound value for the first
          set of bindings covering exactly self.reqd_bound_vars, or False
        """
        # NOTE: it is quite possible that the output unification step has
        # enough information to know if a solution has been found too, which
        # could make this step unnecessary.

        # the triples which contain the variables we are looking to bind
        var_triples = self.find_specific_var_triples(new_query,
                                                     self.reqd_bound_vars)

        def is_var_or_lit_var(value):
            return is_var(value) or is_lit_var(value)

        # every var and lit var of those triples starts out bound to a Var of
        # the same name
        initial_bindings = {}
        for var in find_vars(var_triples, is_var_or_lit_var):
            initial_bindings[var] = Var(var)

        # see if the triples which contain the variables can bind to any of
        # the other triples in the query
        candidates = self.bind_vars(var_triples,
                                    new_query,
                                    False,
                                    initial_bindings=initial_bindings)
        if candidates == False:
            return False

        required = set(self.reqd_bound_vars)
        for candidate in candidates:
            # only bindings to a lit var or to a value count.  Sometimes a
            # variable will be bound to another variable, but that is not a
            # result
            resolved = set(
                name for name, value in candidate.iteritems()
                if is_lit_var(value) or not is_any_var(value))

            if resolved == required:
                # WARNING: it is possible that multiple bindings will be valid
                # in which case we should return a set of solutions rather
                # than a solution
                return dict(
                    (Var(name), value)
                    for name, value in candidate.iteritems())

        return False

    #############################################################################
    # SEARCH

    def remove_steps_already_taken(self, steps, lineage):
        """
        Yield the steps which do not already appear in lineage.

        @arg steps is an iterable of candidate steps
        @arg lineage is the list of steps taken so far
        @returns a generator over the steps whose translation id and input
          bindings differ from those of every step in lineage
        """
        def same_step(candidate, taken):
            # two steps are equal when both the translation id and the input
            # bindings are equal
            if candidate['translation']['id'] != taken['translation']['id']:
                return False
            return candidate['input_bindings'] == taken['input_bindings']

        for candidate in steps:
            if not any(same_step(candidate, taken) for taken in lineage):
                yield candidate

    def log_root(fn):
        """
        Decorator which wraps a call in a 'search' debug block when the call
        is made with a true `root` keyword argument.

        Only a `root` passed by keyword is recognised; every other call is
        forwarded to fn untouched.

        @arg fn is the method to wrap
        @returns the wrapping function
        """
        import functools

        @functools.wraps(fn)
        def log_root_wrapper(self, *args, **kwargs):
            if not kwargs.get('root'):
                return fn(self, *args, **kwargs)

            self.debug.open_block('search')
            try:
                return fn(self, *args, **kwargs)
            finally:
                # close the block even if fn raises, so that the debug output
                # stays balanced
                self.debug.close_block()

        return log_root_wrapper

    @log_root
    def search(self, query, new_triples, lineage=None, root=False):
        """
        Depth first search for a sequence of steps which solves the query.

        The possible next steps are generated, the steps already present in
        the lineage are dropped, and each remaining step is followed
        recursively until found_solution accepts a step's new query.

        @arg query is the query to start from
        @arg new_triples is a set of triples which are new as of the previous
          translation.  This next translation must take them into account.  If
          they are not needed, then an earlier step could have gotten there
          already and the most recent step was unnecessary
        @arg lineage is a list of the steps we've taken to get here
        @arg root is not read here; it is consumed by the log_root decorator
        @return the lineage (list of steps) leading to the first solution
          found, or None if no step leads to a solution
        """
        # a fresh list per call instead of one default list shared by all calls
        if lineage is None:
            lineage = []

        self.debug.p('query', query)

        # find the possible next steps
        steps = self.next_steps(query, lineage, new_triples)

        # remove any steps we've already taken
        steps = self.remove_steps_already_taken(steps, lineage)

        if self.show_dead_ends:
            # the steps have to be materialized to find out if there are any
            steps = list(steps)
            if not steps:
                p('dead_end', query)
                p('lineage', [step['translation']['name'] for step in lineage])
                p()

        #steps = list(steps)
        #self.debug.open_block('steps')
        #self.debug.p(steps)
        #self.debug.close_block()

        # look through all steps recursively to see if they result in a
        # solution and should be added to the compile_node, the finished 'program'
        for step in steps:
            # NOTE(review): hash is called with two arguments here, so it is
            # presumably a project helper rather than the builtin - confirm
            self.debug.open_block(
                (step['translation']['name'] or '<unnamed>') + ' ' +
                color(hash(step['input_bindings'], step['output_bindings'])) +
                ' ' + prettyquery(step['input_bindings']) +
                str(time.time() - self.start_time))

            # add this step to the lineage, but before that, add any new steps that
            # were injected by the step itself (in the case of a merged path)
            new_lineage = copy.copy(lineage)
            if 'new_lineage' in step:
                for s in step['new_lineage']:
                    if s not in lineage:
                        new_lineage.append(s)
            new_lineage += [step]

            # if the new information at this point is enough to fulfil the query, done
            # otherwise, recursively continue searching.
            # found_solution is filled with the bindings which bind out_lit_vars from
            # the query to literal values (strings, numbers, uris, etc)
            # TODO: found_solution might be able to return enough information to
            # completely remove the partial solution step at the end of compilation
            found_solution = self.found_solution(step['new_query'])
            if found_solution:
                self.debug.p('last_step', step)
                self.debug.p('input', step['translation']['input'])
                self.debug.p('output', step['translation']['output'])
                self.debug.close_block()
                return new_lineage
            else:
                # recur
                ret = self.search(step['new_query'], step['new_triples'],
                                  new_lineage)
                self.debug.close_block()
                if ret:
                    return ret

        # none of the steps led to a solution
        return None

    #############################################################################
    # compile

    def make_vars_out_vars(self, query, reqd_bound_vars):
        """
        Replace, in place, every variable of query whose name is in
        reqd_bound_vars with an out variable of the same name: lit vars become
        OutLitVars, any other variable becomes an OutVar.

        @arg query is the query to change; its triples are modified in place
        @arg reqd_bound_vars is a container of variable names
        """
        for triple in query:
            for position, value in enumerate(triple):
                # pick the replacement type first; lit vars take precedence
                if is_lit_var(value):
                    make_out = OutLitVar
                elif is_any_var(value):
                    make_out = OutVar
                else:
                    continue

                if value.name in reqd_bound_vars:
                    triple[position] = make_out(value.name)

    def extract_query_modifiers(self, query):
        """
        Split the query modifier triples off from the rest of the query.

        The only modifier recognised is a triple whose first value equals
        self.n.query.query and whose second value equals self.n.query.limit;
        its third value is converted with int() and stored under 'limit'.

        @arg query is an iterable of triples
        @returns (new_query, modifiers) where new_query is the list of triples
          which are not modifiers and modifiers is a dict
        """
        modifiers = {}
        new_query = []
        for triple in query:
            if (triple[0] == self.n.query.query
                    and triple[1] == self.n.query.limit):
                modifiers['limit'] = int(triple[2])
            else:
                new_query.append(triple)

        return new_query, modifiers

    def compile(self, query, reqd_bound_vars, input=[], output=[]):
        """
        Compile query into a sequence of translation steps which bind
        reqd_bound_vars.

        The search is an iterative deepening search: self.depth starts at 6
        and is increased by one after every unsuccessful search while it is
        below 12.

        @arg query is either a string with one statement per line or a value
          which self.parser.parse accepts directly
        @arg reqd_bound_vars is the collection of names of the variables
          which must be bound by the compiled steps
        @arg input is not used by this method
        @arg output is not used by this method
        @returns False if no path through the search space was found,
          otherwise a dict with the keys 'combinations', 'modifiers' and
          'solution_bindings_set'
        @raises Exception if no triple of the query mentions one of
          reqd_bound_vars
        """
        self.debug.reset()
        self.start_time = time.time()

        if isinstance(query, basestring):
            # split into stripped lines and drop the empty ones.  Emptiness is
            # tested by value; an identity test against "" only works when
            # the interpreter happens to share the empty string object.
            query = [line.strip() for line in query.split('\n')]
            query = [line for line in query if line]
        query = self.parser.parse(query)

        query, modifiers = self.extract_query_modifiers(query)

        # TODO: change axpress to parse _vars as outlitvars in the first place
        # this replaces all litvars with outlitvars in query
        # that said, this isn't a costly function in the grand scheme of things
        # replaces all vars in reqd_bound_vars not already litvars with outvars ...
        self.make_vars_out_vars(query, reqd_bound_vars)

        #p('query',query)

        self.reqd_bound_vars = reqd_bound_vars
        var_triples = self.find_specific_var_triples(query, reqd_bound_vars)
        if not var_triples:
            raise Exception(
                "Waring, required bound triples were provided, but not found in the query"
            )

        # an iterative deepening search
        self.depth = 6
        steps = None
        max_depth = 12
        while not steps and self.depth < max_depth:
            self.debug.p("depth: %d" % self.depth)
            #self.show_dead_end = self.show_dead_ends and self.depth == max_depth - 1
            self.show_dead_ends = False
            # partial matches are only valid within one search
            self.partials = defaultdict(list)
            steps = self.search(query, query, lineage=[], root=True)
            self.depth += 1

        # if there were no paths through the search space we are done here
        if not steps:
            return False

        #p('steps', steps)
        # at one point, steps was allowed to return many paths through the
        # translation space and the rest of this code would make sure that the
        # interleaving paths didn't wind up causing translations to be run
        # twice or run when they were not necessary, etc.  With DFS, this is no
        # longer an issue, and we have moved away from attempting to run every
        # guaranteed path and instead run just one of them, or the first few.
        # I've discovered that finding all paths is much more difficult because
        # there are many ways which translations can be combined into infinite
        # loops that are hard to detect

        solution_bindings_set = {}
        for step in steps:
            # bindings to plain vars are dropped from both binding dicts
            step['input_bindings'] = dict(
                (var, binding)
                for (var, binding) in step['input_bindings'].iteritems()
                if not is_var(binding))

            step['output_bindings'] = dict(
                (var, binding)
                for (var, binding) in step['output_bindings'].iteritems()
                if not is_var(binding))

            # figure out if any parts of the output of this step satisfy part of
            # the solution
            var_triples = self.find_specific_var_triples(
                step['new_query'], self.reqd_bound_vars)
            partial_bindings = self.find_partial_solution(
                var_triples, step['new_query'])

            # keep track of which variables will end up holding the solution
            solution_bindings_set.update(partial_bindings)

            # get rid of extra stuff in steps
            del step['new_query']
            del step['new_triples']
            if 'new_lineage' in step:
                del step['new_lineage']

        # the last step is the one to run; all earlier steps are what it
        # depends on
        ret = {
            'combinations': [[{
                'depends': steps[:-1],
                'step': steps[-1]
            }]],
            'modifiers': modifiers,
            'solution_bindings_set': [solution_bindings_set],
        }
        #p('ret', ret)
        return ret