def __add_select_and_aggregate(self, select, groupby, where, window, tree):
    """
    Attach the SELECT/WHERE/GROUP BY descriptors to ``tree`` and return it.

    select, groupby, and where are lists of unparsed fields in those
    respective clauses.  A ``groupby`` equal to ``['']`` means no GROUP BY
    clause was given; otherwise its second element is the list of grouping
    fields.  ``window`` is the WINDOW parameter, or None when absent.

    When the SELECT clause contains at least one aggregate, ``tree`` is
    wrapped in an ``operators.GroupBy`` node before the select descriptor
    is assigned to it.

    Raises QueryException if an aggregate is used without a WINDOW, or if a
    visible SELECT field is neither an aggregate nor a GROUP BY field.
    The parsing and verification helpers may raise it as well.
    """
    tuple_descriptor = TupleDescriptor()
    fields_to_verify = []
    all_fields = chain(select, where)
    if groupby != ['']:
        groupby = groupby[1:][0]
        all_fields = chain(all_fields, groupby)
    # NOTE(review): kept unconditional so a bare [''] GROUP BY becomes an
    # empty list before it is iterated below -- confirm against history.
    # `chain` is presumably itertools.chain (lazy), so this in-place removal
    # is seen by the loop that follows.
    self.__remove_all(groupby, QueryTokens.EMPTY_STRING)

    # Pass 1: parse every field against the builtin Twitter descriptor and
    # collect the names whose definition has to be confirmed afterwards.
    for field in all_fields:
        (field_descriptors, verify) = self.__parse_field(
            field, self.twitter_td, True, False)
        fields_to_verify.extend(verify)
        tuple_descriptor.add_descriptor_list(field_descriptors)
    for field in fields_to_verify:
        self.__verify_and_fix_field(field, tuple_descriptor)

    # At this point tuple_descriptor holds fields/aliases that are known to
    # be correct (we would have gotten an exception otherwise).  Build the
    # select and group descriptors from it.
    select_descriptor = TupleDescriptor()
    group_descriptor = TupleDescriptor()
    aggregates = []
    for field in select:
        (field_descriptors, verify) = self.__parse_field(
            field, tuple_descriptor, True, True)
        select_descriptor.add_descriptor_list(field_descriptors)
        if field_descriptors[0].field_type == FieldType.AGGREGATE:
            aggregates.append(field_descriptors[0])

    # Add WHERE clause fields as invisible attributes.
    for field in where:
        (field_descriptors, verify) = self.__parse_field(
            field, tuple_descriptor, True, False)
        select_descriptor.add_descriptor_list(field_descriptors)

    if aggregates:
        if window is None:
            raise QueryException(
                "Aggregate expression provided with no WINDOW parameter")
        for field in groupby:
            (field_descriptors, verify) = self.__parse_field(
                field, tuple_descriptor, True, True)
            group_descriptor.add_descriptor_list(field_descriptors)
        # Every visible, non-aggregate SELECT field must also be grouped on.
        for alias in select_descriptor.aliases:
            select_field = select_descriptor.get_descriptor(alias)
            group_field = group_descriptor.get_descriptor(alias)
            if group_field is None and \
               select_field.field_type != FieldType.AGGREGATE and \
               select_field.visible:
                raise QueryException(
                    "'%s' appears in the SELECT but is neither an aggregate nor a GROUP BY field" % (alias))
        tree = operators.GroupBy(tree, group_descriptor, aggregates, window)

    tree.assign_descriptor(select_descriptor)
    return tree
def __get_source(self, parsed):
    """
    Map the first source named in ``parsed`` to a StatusSource constant.

    Only ``parsed.sources[0]`` is inspected.  An exact match on
    QueryTokens.TWITTER selects the filter source; a name starting with
    QueryTokens.TWITTER_SAMPLE selects the sample source.

    Raises QueryException for any other source name.
    """
    name = parsed.sources[0]
    if name == QueryTokens.TWITTER:
        return StatusSource.TWITTER_FILTER
    if name.startswith(QueryTokens.TWITTER_SAMPLE):
        return StatusSource.TWITTER_SAMPLE
    raise QueryException('Unknown query source: %s' % (name))
def build(self, query_str):
    """
    Takes a Unicode string query_str, and outputs a query tree

    The query is parsed with ``self.parser.parseString``; a ParseException
    from the parser is re-raised as a QueryException wrapping it.

    NOTE(review): only the parsing step is visible here; the steps that
    turn the parse result into the query tree are not shown -- confirm the
    rest of the method against the full file.
    """
    try:
        parsed = self.parser.parseString(query_str)
    except ParseException as e:
        # "except X as e" is accepted by Python 2.6+ and Python 3, unlike
        # the older comma form.
        raise QueryException(e)
def __parse_rval(self, val, allow_null):
    """
    Translate a right-hand-side token into the value to compare against.

    Any value other than QueryTokens.NULL_TOKEN is returned unchanged.  The
    NULL token becomes None when ``allow_null`` is true.

    Raises QueryException when the NULL token appears and ``allow_null`` is
    false.
    """
    is_null_token = (val == QueryTokens.NULL_TOKEN)
    if not is_null_token:
        return val
    if not allow_null:
        raise QueryException(
            "NULL appears in clause where it should not.")
    return None
def __get_handler(self, parsed):
    """
    Build the status handler selected by the query's INTO clause.

    ``parsed.into.asList()`` is expected to be either ``['']`` (no INTO
    clause) or a list whose second element names the destination kind.

    Returns a PrintStatusHandler when there is no INTO clause or the
    destination is STDOUT, and a DbInsertStatusHandler for a three-element
    INTO ... TABLE clause (the third element is passed as its second
    argument).  The leading arguments 1 and 1000 are presumably batch
    sizes -- TODO confirm against the handler classes.

    Raises DbException for INTO ... STREAM (not yet supported) and
    QueryException for any other shape, including lists too short to carry
    a destination kind.
    """
    into = parsed.into.asList()
    if into == ['']:
        return PrintStatusHandler(1)

    # Guard the index: a malformed one-element list should be reported as
    # an invalid clause, not surface as an IndexError.
    destination = into[1] if len(into) > 1 else None
    if destination == QueryTokens.STDOUT:
        return PrintStatusHandler(1)
    if len(into) == 3:
        if destination == QueryTokens.TABLE:
            return DbInsertStatusHandler(1000, into[2])
        if destination == QueryTokens.STREAM:
            raise DbException(
                "Putting results into a STREAM is not yet supported")
    raise QueryException("Invalid INTO clause")
def run_built_query(self, query_built, async): self.build_stream() self.query = query_built self.query.handler.set_tuple_descriptor(self.query.get_tuple_descriptor()) if self.query.source == StatusSource.TWITTER_FILTER: no_filter_exception = QueryException("You haven't specified any filters that can query Twitter. Perhaps you want to query TWITTER_SAMPLE?") try: (follow_ids, track_words) = self.query.query_tree.filter_params() if (follow_ids == None) and (track_words == [None]): raise no_filter_exception self.stream.filter(follow_ids, track_words, async) except NotImplementedError: raise no_filter_exception elif self.query.source == StatusSource.TWITTER_SAMPLE: self.stream.sample(None, async)
def __getattr__(self, attr):
    """
    Compute the value of field ``attr`` on first access and cache it.

    The field is looked up in the tuple descriptor:
      * FUNCTION fields call ``field_descriptor.function`` with the raw
        data followed by the values of the underlying fields (each fetched
        through ``getattr`` on this object);
      * LITERAL fields return the stored literal value;
      * any other field is read from the raw data under its first
        underlying field name.

    A ``str`` result for a field whose return type is STRING is converted
    with ``unicode``.  The result is stored with ``setattr`` so that, with
    default attribute storage, later accesses do not reach this method.

    Raises AttributeError when the descriptor has no entry for ``attr``
    (assumes ``get_descriptor`` returns None for unknown names, as it does
    elsewhere in this file), and QueryException when the field is known but
    its underlying value is missing from the raw data.
    """
    field_descriptor = self.__tuple_descriptor.get_descriptor(attr)
    if field_descriptor is None:
        # Previously this surfaced as an AttributeError about NoneType;
        # keep the exception type (hasattr/getattr defaults rely on it)
        # but name the attribute that is actually missing.
        raise AttributeError("Attribute not defined: %s" % (attr))
    result = None
    if field_descriptor.field_type == FieldType.FUNCTION:
        uf = field_descriptor.underlying_fields
        func = field_descriptor.function
        args = [getattr(self, field) for field in uf]
        args.insert(0, self.__data)
        result = func(*args)
    elif field_descriptor.field_type == FieldType.LITERAL:
        result = field_descriptor.literal_value
    elif field_descriptor.underlying_fields[0] in self.__data:
        result = self.__data[field_descriptor.underlying_fields[0]]
    else:
        raise QueryException("Attribute not defined: %s" % (attr))
    if (field_descriptor.return_type == ReturnType.STRING) and isinstance(
            result, str):
        result = unicode(result)
    setattr(self, attr, result)
    return result
def add_descriptor(self, descriptor):
    """
    Register ``descriptor`` under its alias, merging with any stored entry.

    A new alias is appended to ``self.aliases`` and the descriptor is
    stored.  For an alias that is already present:
      * if both the stored and the new descriptor are defined (not
        UNDEFINED) and they differ, a QueryException is raised;
      * an UNDEFINED newcomer never replaces the stored descriptor;
      * the stored entry ends up visible if either descriptor was visible.

    The descriptor object itself is stored (no copy is made), and its
    ``visible`` flag may be updated in place.
    """
    alias = descriptor.alias
    visible = descriptor.visible
    replace_stored = True

    if alias not in self.descriptors:
        self.aliases.append(alias)
    else:
        stored = self.descriptors[alias]
        conflicting = (
            (stored.field_type != FieldType.UNDEFINED) and
            (descriptor.field_type != FieldType.UNDEFINED) and
            (stored != descriptor))
        if conflicting:
            raise QueryException(
                "The alias '%s' appears more than once in your query" % (alias))
        # If one of the descriptors is visible, the stored one must be too.
        visible = stored.visible or descriptor.visible
        replace_stored = not (descriptor.field_type == FieldType.UNDEFINED)

    if replace_stored:
        self.descriptors[alias] = descriptor
    self.descriptors[alias].visible = visible
def __verify_and_fix_field(self, field, tuple_descriptor):
    """
    Resolve ``field`` to a fully defined descriptor and return it.

    An UNDEFINED descriptor names, in ``underlying_fields[0]``, the field
    or alias it refers to.  That reference is followed until a defined
    descriptor is reached; its underlying_fields, field_type, return_type,
    aggregate_factory, func_factory and function are then copied onto every
    UNDEFINED descriptor met along the way.

    Returns the (now fixed) descriptor registered for ``field``.

    Raises QueryException when a name on the path has no descriptor, refers
    to itself, or is part of a circular chain of aliases (which previously
    recursed without bound).
    """
    pending = []          # UNDEFINED descriptors on the path, outermost first
    visited = set()       # names already followed, for cycle detection
    name = field
    while True:
        descriptor = tuple_descriptor.get_descriptor(name)
        if descriptor is None:
            raise QueryException(
                "Field '%s' is neither a builtin field nor an alias" % (name))
        if not (descriptor.field_type == FieldType.UNDEFINED):
            break
        target = descriptor.underlying_fields[0]
        if name == target:
            raise QueryException(
                "Field '%s' is neither a builtin field nor an alias" % (name))
        visited.add(name)
        if target in visited:
            raise QueryException(
                "Alias '%s' is part of a circular alias definition" % (name))
        pending.append(descriptor)
        name = target

    resolved = descriptor
    for unresolved in pending:
        unresolved.underlying_fields = resolved.underlying_fields
        unresolved.field_type = resolved.field_type
        unresolved.return_type = resolved.return_type
        unresolved.aggregate_factory = resolved.aggregate_factory
        unresolved.func_factory = resolved.func_factory
        unresolved.function = resolved.function

    if pending:
        return pending[0]
    return resolved
def get_function(self, alias):
    """
    Return the function information registered under ``alias``.

    Raises QueryException if nothing has been registered under that alias.
    """
    if alias in self.__functions:
        return self.__functions[alias]
    raise QueryException("'%s' is not a registered function" % (alias))
def register(self, alias, function_information):
    """
    Store ``function_information`` under ``alias``.

    Raises QueryException if the alias is already taken; an existing
    registration is never overwritten.
    """
    already_registered = alias in self.__functions
    if already_registered:
        raise QueryException("'%s' has already been registered" % (alias))
    self.__functions[alias] = function_information
def __parse_field(self, field, tuple_descriptor, alias_on_complex_types,
                  make_visible):
    """
    Returns a tuple containing (field_descriptors, fieldnames_to_verify)

    The first field in field_descriptors is the one requested to be parsed
    by this function call.  If the field turns out to be an aggregate or a
    user-defined function call, then field_descriptors will contain the
    parsed descriptors of its arguments as well, with their visible flag
    set to False.

    fieldnames_to_verify is a list of field names that should be verified
    in order to ensure that at some point their alias is defined in an AS
    clause.

    ``alias_on_complex_types`` makes an AS clause mandatory for functions
    and aggregates; when it is false an unnamed operator name is generated
    instead.  ``make_visible`` becomes the ``visible`` flag of the
    requested field's descriptor.

    Raises QueryException for an empty or unrecognised field, for a
    function/aggregate lacking a required alias, and for a name that is
    neither an aggregate nor a registered function.
    """
    alias = None
    field_type = None
    return_type = None
    underlying_fields = None
    aggregate_factory = None
    literal_value = None
    func_factory = None
    fields_to_verify = []
    parsed_fds = []
    field_backup = list(field)
    self.__clean_list(field)

    # parse aliases if they exist
    if (len(field) >= 4) and (field[-2] == QueryTokens.AS):
        alias = field[-1]
        field = field[:-2]

    # An empty field has no leading token; route it to the "empty field"
    # error below instead of failing with an IndexError on field[0].
    kind = field[0] if len(field) > 0 else None

    if (kind == QueryTokens.STRING_LITERAL) or \
       (kind == QueryTokens.INTEGER_LITERAL) or \
       (kind == QueryTokens.FLOAT_LITERAL):
        # Literals always receive a generated name, even if AS was given.
        alias = self.unnamed_operator_name()
        underlying_fields = []
        field_type = FieldType.LITERAL
        literal_value = field[1]
        if kind == QueryTokens.STRING_LITERAL:
            return_type = ReturnType.STRING
        elif kind == QueryTokens.INTEGER_LITERAL:
            return_type = ReturnType.INTEGER
            literal_value = int(literal_value)
        elif kind == QueryTokens.FLOAT_LITERAL:
            return_type = ReturnType.FLOAT
            literal_value = float(literal_value)
    elif kind == QueryTokens.COLUMN_NAME:  # field or alias
        if alias is None:
            alias = field[1]
        field_descriptor = tuple_descriptor.get_descriptor(field[1])
        if field_descriptor is None:
            # underlying field not yet defined.  mark to check later
            field_type = FieldType.UNDEFINED
            underlying_fields = [field[1]]
            # check alias and underlying once this process is done to
            # find yet-undefined fields
            fields_to_verify.append(field[1])
            fields_to_verify.append(alias)
        else:
            # field found, copy information
            field_type = field_descriptor.field_type
            return_type = field_descriptor.return_type
            underlying_fields = field_descriptor.underlying_fields
            aggregate_factory = field_descriptor.aggregate_factory
            func_factory = field_descriptor.func_factory
    elif kind == QueryTokens.FUNCTION_OR_AGGREGATE:  # function or aggregate
        if alias is None:
            if alias_on_complex_types:
                raise QueryException(
                    "Must specify alias (AS clause) for '%s'" % (field[1]))
            else:
                alias = self.unnamed_operator_name()
        underlying_field_list = field[2:]
        underlying_fields = []
        for underlying in underlying_field_list:
            # Arguments are parsed recursively and hidden from the output.
            (parsed_fd_list, parsed_verify) = self.__parse_field(
                underlying, tuple_descriptor, False, False)
            for parsed_fd in parsed_fd_list:
                parsed_fd.visible = False
            fields_to_verify.extend(parsed_verify)
            parsed_fds.extend(parsed_fd_list)
            underlying_fields.append(parsed_fd_list[0].alias)
        aggregate_factory = get_aggregate_factory(field[1])
        if aggregate_factory is not None:  # found an aggregate function
            field_type = FieldType.AGGREGATE
            return_type = ReturnType.FLOAT
        else:
            function_information = self.function_registry.get_function(
                field[1])
            if function_information is not None:
                field_type = FieldType.FUNCTION
                func_factory = function_information.func_factory
                return_type = function_information.return_type
            else:
                raise QueryException(
                    "'%s' is neither an aggregate or a registered function" % (field[1]))
    else:
        raise QueryException("Empty field clause found: %s" %
                             ("".join(field_backup)))

    fd = FieldDescriptor(alias, underlying_fields, field_type, return_type,
                         aggregate_factory, func_factory, literal_value)
    fd.visible = make_visible
    # The requested field always comes first, ahead of its arguments.
    parsed_fds.insert(0, fd)
    return (parsed_fds, fields_to_verify)