def generateSQL_DNF(self, separatorSubs=None):
    """Build the DNF SQL for this independent-project node.

    The child's query is wrapped in a ``select`` that aggregates ``pUse``
    with ``ior(COALESCE(pUse,0))``. The result is grouped by one ``c<i>``
    column per entry of ``separatorSubs`` and, when the child reports a
    generic constant, by that constant as well.

    Args:
        separatorSubs: list of ``(replacementValue, separator)`` pairs handed
            down by the enclosing nodes, or ``None`` for "none". The list is
            not modified; the child receives a new list extended with this
            node's own ``(self.replacementVal, self.separator)`` pair.

    Returns:
        The SQL text of the aggregate query.

    Side effects:
        Sets ``self.genericConstantStr`` when the child has a generic
        constant, and advances the shared ``algorithm`` counters.
    """
    if separatorSubs is None:
        separatorSubs = []

    # The value was never used (self.replacementVal is what gets passed on),
    # but the call is kept so the shared attribute counter advances as before.
    algorithm.attCounter()

    groupBy = ["c%d" % i for (i, _) in separatorSubs]
    selectString = ', '.join(groupBy + ['ior(COALESCE(pUse,0))'])

    # Build a new list for the child instead of appending to the caller's.
    childSubs = list(separatorSubs) + [(self.replacementVal, self.separator)]
    childSQL = self.child.generateSQL_DNF(childSubs)

    # The previous implementation formatted a throw-away query here, which
    # consumed one alias number. Keep consuming it so the aliases in the
    # generated SQL stay numbered exactly as before.
    algorithm.counter()

    if self.child.hasGenericConstant():
        genericConstantStr = self.child.getGenericConstantStr()
        self.genericConstantStr = genericConstantStr
        # The generic constant is selected and grouped on, but it is not part
        # of selectString (which was built before this append).
        groupBy.append(genericConstantStr)
        groupByString = 'group by ' + ', '.join(groupBy)
        return "\n -- independent project \n select %s, %s as pUse from (%s) as q%d %s " % (
            genericConstantStr, selectString, childSQL, algorithm.counter(), groupByString)

    groupByString = 'group by ' + ', '.join(groupBy) if groupBy else ''
    return "\n -- independent project \n select %s as pUse from (%s) as q%d %s " % (
        selectString, childSQL, algorithm.counter(), groupByString)
def generateSQL_DNF(self, separatorSubs=None):
    """Build the DNF SQL for this independent-project node (text-valued pUse).

    The child's query is wrapped in a ``select`` that aggregates ``pUse``
    with ``l_ior(COALESCE(cast(pUse as text),'0'))``, grouped by one
    ``c<i>`` column per entry of ``separatorSubs``.

    * If the child reports a generic constant, that constant is additionally
      selected and grouped on.
    * Otherwise the aggregate is wrapped once more: the inner query also
      returns ``count(*) as ct`` and the outer query concatenates a
      ``<lam>*(<dom_sep> - ct)`` term onto ``pUse`` as text.

    Args:
        separatorSubs: list of ``(replacementValue, separator)`` pairs handed
            down by the enclosing nodes, or ``None`` for "none". The list is
            not modified; the child receives a new list extended with this
            node's own ``(self.replacementVal, self.separator)`` pair.

    Returns:
        The SQL text of the aggregate query.

    Side effects:
        Sets ``self.genericConstantStr`` when the child has a generic
        constant, and advances the shared ``algorithm`` counters.
    """
    if separatorSubs is None:
        separatorSubs = []

    # The value was never used (self.replacementVal is what gets passed on),
    # but the call is kept so the shared attribute counter advances as before.
    algorithm.attCounter()

    groupBy = ["c%d" % i for (i, _) in separatorSubs]
    groupByString = 'group by ' + ', '.join(groupBy) if groupBy else ''
    selectString = ', '.join(
        groupBy + ["l_ior(COALESCE(cast(pUse as text),'0'))"])

    # Build a new list for the child instead of appending to the caller's.
    childSubs = list(separatorSubs) + [(self.replacementVal, self.separator)]
    childSQL = self.child.generateSQL_DNF(childSubs)

    # The previous implementation formatted a throw-away query here, which
    # consumed one alias number. Keep consuming it so the aliases in the
    # generated SQL stay numbered exactly as before.
    algorithm.counter()

    if self.child.hasGenericConstant():
        genericConstantStr = self.child.getGenericConstantStr()
        self.genericConstantStr = genericConstantStr
        groupBy.append(genericConstantStr)
        groupByString = 'group by ' + ', '.join(groupBy)
        return "\n select %s, %s as pUse from (%s\n) as q%d \n %s \n" % (
            genericConstantStr, selectString, childSQL, algorithm.counter(),
            groupByString)

    # self.separator[0] is what the old code read back through
    # separatorSubs[-1][1][0] after appending this node's pair.
    add_line = ','.join(groupBy + [
        " concat(cast(pUse as text), ' + <%s>*(<%s_%s> - ', ct, ')') as pUse"
        % (self.child.lam, algorithm.dom, self.separator[0])
    ])
    # The two counter() calls are evaluated left to right: the inner alias
    # gets the smaller number, the outer alias the next one.
    return "\n select %s from (\n select %s as pUse, count(*) as ct from (%s\n) as q%d \n %s \n) as q%d \n" % (
        add_line, selectString, childSQL, algorithm.counter(), groupByString,
        algorithm.counter())
def generateSQL_DNF(self, separatorSubs=None):
    """Build the DNF SQL for this independent-join node.

    Every child query becomes a derived table ``q<ident>``. The tables are
    chained with ``INNER JOIN`` and the resulting ``pUse`` is the product of
    the children's ``pUse`` columns.

    Join conditions for a table are collected against every smaller alias
    number:

    * equality of the generic-constant columns, when both sides have one;
    * equality of ``c<i>`` for every separator substitution for which
      ``self.separatorInRelation1And2`` answers true.

    When no condition applies, the join uses ``ON TRUE``.

    Args:
        separatorSubs: list of ``(replacementValue, separator)`` pairs handed
            down by the enclosing nodes, or ``None`` for "none". Each child
            receives its own copy.

    Returns:
        The SQL text of the join query.

    Side effects:
        Sets ``self.genericConstantStr`` and appends to
        ``self.genericConstantStrs`` for children with a generic constant;
        advances the shared ``algorithm`` counter.
    """
    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAttributes = []
    counterIdentToRelations = {}
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0

    for (index, child) in enumerate(self.children):
        subquerySQL = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            self.genericConstantStrs.append(
                "q%d.%s" % (ident, genericConstantStr))
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((subquerySQL, ident))
        counterIdentToRelations[ident] = self.subqueries[index].getRelations()

    if self.hasGenericConstant():
        selectAttributes.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    subqueries = []
    # Track "first table" explicitly. The old test `if previousIdent:` treated
    # alias number 0 as "no previous table" and dropped the JOIN keyword.
    isFirst = True
    for (sql, ident) in results:
        newSubquery = "(%s) as q%d" % (sql, ident)
        if isFirst:
            subqueries.append(newSubquery)
            isFirst = False
            continue

        conditions = []
        identHasGeneric = ident in counterIdentToGenericConstantStr
        if identHasGeneric or separatorSubs:
            # range(ident) also covers alias numbers that do not belong to
            # this node; the membership test and separatorInRelation1And2
            # are relied on to filter those out.
            # NOTE(review): the separator conditions are assumed to apply to
            # every earlier alias, not only to those with a generic constant
            # -- confirm against the original layout.
            for prevIdent in range(ident):
                if identHasGeneric and prevIdent in counterIdentToGenericConstantStr:
                    conditions.append("q%d.%s = q%d.%s" % (
                        ident, counterIdentToGenericConstantStr[ident],
                        prevIdent, counterIdentToGenericConstantStr[prevIdent]))
                conditions += [
                    "q%d.c%d = q%d.c%d" % (prevIdent, sep, ident, sep)
                    for (sep, _) in separatorSubs
                    if self.separatorInRelation1And2(
                        sep, prevIdent, ident, counterIdentToRelations)
                ]

        # An empty condition list used to yield a bare "ON ", which is not
        # valid SQL; fall back to the same "ON TRUE" used when there is
        # nothing to join on.
        subqueries.append("INNER JOIN %s ON %s" % (
            newSubquery, " and ".join(conditions) or "TRUE"))

    for (separatorReplacement, _) in separatorSubs:
        # Pick the first child whose relations mention this substitution.
        # -1 is left in place when no child does.
        termIdentWithThisSubstitution = -1
        for ident in counters:
            if any(separatorReplacement in rel.getSeparatorReplacementValues()
                   for rel in counterIdentToRelations[ident]):
                termIdentWithThisSubstitution = ident
                break
        selectAttributes.append(
            "q%d.c%d" % (termIdentWithThisSubstitution, separatorReplacement))

    pString = "*".join(["q%d.pUse" % ident for ident in counters])
    attString = ", ".join(selectAttributes)
    if attString:
        selectString = "%s, %s" % (attString, pString)
    else:
        selectString = pString
    return "\n -- independent join \n select %s as pUse from %s" % (
        selectString, " ".join(subqueries))
def generateSQL_CNF(self, params):
    """Build the CNF SQL for this independent-join node.

    Each child query becomes a ``WITH`` entry ``q<n>``. Children that lack
    some of the generic identifiers used by their siblings are cross-joined
    with table ``A`` (column ``v0``) once per missing identifier, so that
    every entry exposes all generic identifiers. The entries are then joined:
    children flagged ``trueOnMissing`` first with ``INNER JOIN``, the others
    with ``LEFT OUTER`` (or ``FULL OUTER`` when no ``trueOnMissing`` child
    exists). The combined ``pUse`` is ``1 - prod(1 - pUse_i)``, expressed in
    plain or log space depending on ``params``.

    Args:
        params: mapping read for ``"useLog"`` and, when that is true,
            ``"useNull"``. It is passed unchanged to every child.

    Returns:
        The SQL text (``WITH ... select ... from ...``).

    Raises:
        Exception: if no child uses all of ``self.usedSeparatorVars``, or if
            no such child also carries every generic identifier.

    Side effects:
        Replaces ``self.genericIdentifiers`` with the union of the children's
        identifiers, sets ``self.trueOnMissing`` to ``True`` when any child
        has it set, and advances the shared ``algorithm`` counter.
    """
    tableAliases = []
    subquerySQLByAlias = {}
    genericIdsByAlias = {}
    aliasIsTrueOnMissing = {}
    aliasesTrueOnMissing = set()
    aliasesFalseOnMissing = set()
    aliasesUsingAllSeparators = set()
    variablesByAlias = {}
    aliasesByVariable = {}
    self.genericIdentifiers = set()

    # assign each child a table alias and fetch its SQL code, then build the
    # maps that say which identifiers/separator vars are used by each child
    for child in self.children:
        # the alias number is taken before the child generates its own SQL
        tableAlias = "q%d" % algorithm.counter()
        subquerySQLByAlias[tableAlias] = child.generateSQL_CNF(params)
        tableAliases.append(tableAlias)

        aliasIsTrueOnMissing[tableAlias] = child.trueOnMissing
        if child.trueOnMissing:
            # we are in an or operator, so if any child is true on missing,
            # missing tuples are true
            self.trueOnMissing = True
            aliasesTrueOnMissing.add(tableAlias)
        else:
            aliasesFalseOnMissing.add(tableAlias)

        childGenericIdentifiers = child.genericIdentifiers.copy()
        genericIdsByAlias[tableAlias] = childGenericIdentifiers
        self.genericIdentifiers.update(childGenericIdentifiers)
        for genericIdentifier in childGenericIdentifiers:
            aliasesByVariable.setdefault(genericIdentifier, set()).add(tableAlias)

        usedSeparators = set()
        usesAllSeparators = True
        for usedSeparatorVariable in self.usedSeparatorVars:
            if child.usesSeparator(usedSeparatorVariable):
                formattedSeparator = self.formatSeparatorVariable(
                    usedSeparatorVariable)
                usedSeparators.add(formattedSeparator)
                aliasesByVariable.setdefault(
                    formattedSeparator, set()).add(tableAlias)
            else:
                usesAllSeparators = False
        if usesAllSeparators:
            aliasesUsingAllSeparators.add(tableAlias)
        variablesByAlias[tableAlias] = usedSeparators.union(
            childGenericIdentifiers)

    if not aliasesUsingAllSeparators:
        raise Exception("No subquery containing all separators!")

    missingGenericIdsByAlias = {}
    hasCompleteAlias = False
    for tableAlias in tableAliases:
        missing = self.genericIdentifiers.difference(
            genericIdsByAlias[tableAlias])
        missingGenericIdsByAlias[tableAlias] = missing
        if not missing and tableAlias in aliasesUsingAllSeparators:
            hasCompleteAlias = True
    if not hasCompleteAlias:
        # message corrected: the check is about separators, not subqueries
        raise Exception(
            "No subquery containing all separators and generic identifiers!")

    # use the child subquery sql, except if there are missing generic
    # identifiers: then we need to cross product the child relation with the
    # active domain to fill these in - we are only "pretending" to project
    # out these generic values, so we can't just ignore them as constant as
    # when we have a regular separator variable
    withClauses = []
    for tableAlias in tableAliases:
        subquerySQL = subquerySQLByAlias[tableAlias]
        missing = missingGenericIdsByAlias[tableAlias]
        if not missing:
            withClauses.append("%s as (%s)" % (tableAlias, subquerySQL))
            continue

        rawAlias = "%s_missing_generic_vars" % tableAlias
        withClauses.append("%s as (%s)" % (rawAlias, subquerySQL))
        extraDomainSelectVars = []
        extraDomainTables = []
        for (index, missingGenericVariable) in enumerate(missing, 1):
            extraDomainAlias = "A_%d" % index
            extraDomainSelectVars.append(
                "%s.v0 as %s" % (extraDomainAlias, missingGenericVariable))
            extraDomainTables.append("A %s" % extraDomainAlias)
            # now this generic identifier is in the table alias
            variablesByAlias[tableAlias].add(missingGenericVariable)
            aliasesByVariable[missingGenericVariable].add(tableAlias)
        withClauses.append("%s as (select *, %s from %s, %s)" % (
            tableAlias, ", ".join(extraDomainSelectVars),
            rawAlias, ", ".join(extraDomainTables)))

    joinSubqueries = []
    previousAliases = []

    # these can be done via an inner join, as any one missing => true
    for tableAlias in aliasesTrueOnMissing:
        if previousAliases:
            joinConditions = []
            for variable in variablesByAlias[tableAlias]:
                # one equality against the first earlier alias is enough
                for previousAlias in previousAliases:
                    if variable in variablesByAlias[previousAlias]:
                        joinConditions.append("%s.%s = %s.%s" % (
                            tableAlias, variable, previousAlias, variable))
                        break
            joinSubqueries.append(" INNER JOIN %s ON %s" % (
                tableAlias, " AND ".join(joinConditions) or "TRUE"))
        else:
            joinSubqueries.append(tableAlias)
        previousAliases.append(tableAlias)

    joinType = "LEFT OUTER" if previousAliases else "FULL OUTER"

    # these require an outer join, as any missing do not imply true
    for tableAlias in aliasesFalseOnMissing:
        # possible there were no tables true on missing, so we may still
        # start here
        if previousAliases:
            joinConditions = []
            for variable in variablesByAlias[tableAlias]:
                # if this variable is in a true on missing table, we only
                # need to join with that; otherwise we COALESCE() over all
                # earlier outer-joined tables, as any of them may be NULL
                trueOnMissingJoinAlias = None
                falseOnMissingJoinAliases = []
                for previousAlias in previousAliases:
                    if variable not in variablesByAlias[previousAlias]:
                        continue
                    if aliasIsTrueOnMissing[previousAlias]:
                        trueOnMissingJoinAlias = previousAlias
                        break
                    falseOnMissingJoinAliases.append(previousAlias)
                if trueOnMissingJoinAlias:
                    joinConditions.append("%s.%s = %s.%s" % (
                        tableAlias, variable, trueOnMissingJoinAlias, variable))
                elif falseOnMissingJoinAliases:
                    joinConditions.append("%s.%s = COALESCE(%s)" % (
                        tableAlias, variable,
                        ", ".join(["%s.%s" % (alias, variable)
                                   for alias in falseOnMissingJoinAliases])))
            joinSubqueries.append(" %s JOIN %s ON %s" % (
                joinType, tableAlias, " AND ".join(joinConditions) or "TRUE"))
        else:
            joinSubqueries.append(tableAlias)
        previousAliases.append(tableAlias)

    selectClause = []
    for variable in set(self.formattedUsedSeparators).union(
            self.genericIdentifiers):
        # if the variable to select is in at least one true on missing
        # table, we just arbitrarily select one (as they all must be
        # non-null and equal in any result row)
        trueOnMissingTableAlias = None
        falseOnMissingTableAliases = []
        for tableAlias in aliasesByVariable[variable]:
            if aliasIsTrueOnMissing[tableAlias]:
                trueOnMissingTableAlias = tableAlias
                break
            falseOnMissingTableAliases.append(tableAlias)
        if trueOnMissingTableAlias:
            selectClause.append("%s.%s" % (trueOnMissingTableAlias, variable))
        else:
            selectClause.append("COALESCE(%s) as %s" % (
                ", ".join(["%s.%s" % (alias, variable)
                           for alias in falseOnMissingTableAliases]),
                variable))

    useLog = params["useLog"]
    # "useNull" is only consulted in log mode, as before
    useNull = params["useNull"] if useLog else False

    pSelect = []
    for tableAlias in tableAliases:
        if not useLog:
            pSelect.append("COALESCE(1-%s.pUse, 1)" % tableAlias)
        elif useNull:
            pSelect.append("COALESCE(1-exp(%s.pUse), 1)" % tableAlias)
        else:
            pSelect.append(
                "COALESCE(CASE WHEN %s.pUse != '-Infinity' THEN 1-exp(%s.pUse) ELSE 1 END, 1)"
                % (tableAlias, tableAlias))
    pUseString = "1 - ( %s )" % (" * ".join(pSelect))

    if not useLog:
        selectClause.append("%s as pUse" % pUseString)
    elif useNull:
        selectClause.append(
            "CASE WHEN %s > 0 THEN ln(%s) ELSE NULL END AS pUse"
            % (pUseString, pUseString))
    else:
        selectClause.append(
            "CASE WHEN %s > 0 THEN ln(%s) ELSE '-Infinity' END AS pUse"
            % (pUseString, pUseString))

    fromString = "".join(joinSubqueries)
    selectString = ", ".join(selectClause)
    withString = ",\n".join(withClauses)
    return "\n -- independent join \nWITH %s\nselect %s from %s" % (
        withString, selectString, fromString)
def generateSQL_DNF(self, separatorSubs=None):
    """Build the DNF SQL for this independent-join node (symbolic pUse).

    Every child query becomes a derived table ``q<ident>``. The tables are
    chained with ``FULL OUTER JOIN``, and the resulting ``pUse`` nests
    ``l1prod_n(...)`` calls over ``COALESCE(q<ident>.pUse, <lam>)`` terms,
    one per child, where ``lam`` is read from the child.

    Join conditions for a table are collected against every smaller alias
    number:

    * equality of the generic-constant columns, when both sides have one;
    * equality of ``c<i>`` for every separator substitution for which
      ``self.separatorInRelation1And2`` answers true.

    When no condition applies, the join uses ``ON TRUE``.

    Args:
        separatorSubs: list of ``(replacementValue, separator)`` pairs handed
            down by the enclosing nodes, or ``None`` for "none". Each child
            receives its own copy.

    Returns:
        The SQL text of the join query.

    Raises:
        TypeError: from ``reduce`` when the node has no children.

    Side effects:
        Sets ``self.genericConstantStr`` and appends to
        ``self.genericConstantStrs`` for children with a generic constant;
        advances the shared ``algorithm`` counter.
    """
    # reduce is a builtin only on Python 2; this import works on both.
    from functools import reduce

    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAttributes = []
    counterIdentToRelations = {}
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0

    for (index, child) in enumerate(self.children):
        subquerySQL = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            self.genericConstantStrs.append(
                "q%d.%s" % (ident, genericConstantStr))
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((subquerySQL, ident))
        counterIdentToRelations[ident] = self.subqueries[index].getRelations()

    if self.hasGenericConstant():
        selectAttributes.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    subqueries = []
    # Track "first table" explicitly. The old test `if previousIdent:` treated
    # alias number 0 as "no previous table" and dropped the JOIN keyword.
    isFirst = True
    for (sql, ident) in results:
        newSubquery = "(%s\n) as q%d \n" % (sql, ident)
        if isFirst:
            subqueries.append(newSubquery)
            isFirst = False
            continue

        conditions = []
        identHasGeneric = ident in counterIdentToGenericConstantStr
        if identHasGeneric or separatorSubs:
            # range(ident) also covers alias numbers that do not belong to
            # this node; the membership test and separatorInRelation1And2
            # are relied on to filter those out.
            # NOTE(review): the separator conditions are assumed to apply to
            # every earlier alias, not only to those with a generic constant
            # -- confirm against the original layout.
            for prevIdent in range(ident):
                if identHasGeneric and prevIdent in counterIdentToGenericConstantStr:
                    conditions.append("q%d.%s = q%d.%s" % (
                        ident, counterIdentToGenericConstantStr[ident],
                        prevIdent, counterIdentToGenericConstantStr[prevIdent]))
                conditions += [
                    "q%d.c%d = q%d.c%d" % (prevIdent, sep, ident, sep)
                    for (sep, _) in separatorSubs
                    if self.separatorInRelation1And2(
                        sep, prevIdent, ident, counterIdentToRelations)
                ]

        # An empty condition list used to yield a bare "ON ", which is not
        # valid SQL; fall back to the same "ON TRUE" used when there is
        # nothing to join on.
        subqueries.append("FULL OUTER JOIN %s ON %s" % (
            newSubquery, " and ".join(conditions) or "TRUE"))

    for (separatorReplacement, _) in separatorSubs:
        # Pick the first child whose relations mention this substitution.
        # -1 is left in place when no child does.
        termIdentWithThisSubstitution = -1
        for ident in counters:
            if any(separatorReplacement in rel.getSeparatorReplacementValues()
                   for rel in counterIdentToRelations[ident]):
                termIdentWithThisSubstitution = ident
                break
        selectAttributes.append(
            "q%d.c%d" % (termIdentWithThisSubstitution, separatorReplacement))

    # A side missing from the full outer join contributes the child's <lam>
    # placeholder instead of a NULL pUse.
    vlist = [
        "COALESCE(q%d.pUse, <%s>)" % (ident, lam)
        for ident, lam in zip(counters, [c.lam for c in self.children])
    ]
    pString = reduce(lambda x, y: "l1prod_n(" + x + "," + y + ")", vlist)

    attString = ', '.join(selectAttributes)
    if attString:
        selectString = '%s, %s' % (attString, pString)
    else:
        selectString = pString
    return "\n select %s as pUse from %s" % (selectString, " ".join(subqueries))
def generateSQL_CNF(self, params):
    """Build the CNF SQL for this independent-join node.

    Each child query becomes a ``WITH`` entry ``q<n>``. Children that lack
    some of the generic identifiers used by their siblings are cross-joined
    with table ``A`` (column ``v0``) once per missing identifier, so that
    every entry exposes all generic identifiers. The entries are then joined:
    children flagged ``trueOnMissing`` first with ``INNER JOIN``, the others
    with ``LEFT OUTER`` (or ``FULL OUTER`` when no ``trueOnMissing`` child
    exists). The combined ``pUse`` is ``1 - prod(1 - pUse_i)``, expressed in
    plain or log space depending on ``params``.

    Args:
        params: mapping read for ``'useLog'`` and, when that is true,
            ``'useNull'``. It is passed unchanged to every child.

    Returns:
        The SQL text (``WITH ... select ... from ...``).

    Raises:
        Exception: if no child uses all of ``self.usedSeparatorVars``, or if
            no such child also carries every generic identifier.

    Side effects:
        Replaces ``self.genericIdentifiers`` with the union of the children's
        identifiers, sets ``self.trueOnMissing`` to ``True`` when any child
        has it set, and advances the shared ``algorithm`` counter.
    """
    tableAliases = []
    subquerySQLByAlias = {}
    genericIdsByAlias = {}
    aliasIsTrueOnMissing = {}
    aliasesTrueOnMissing = set()
    aliasesFalseOnMissing = set()
    aliasesUsingAllSeparators = set()
    variablesByAlias = {}
    aliasesByVariable = {}
    self.genericIdentifiers = set()

    # assign each child a table alias and fetch its SQL code, then build the
    # maps that say which identifiers/separator vars are used by each child
    for child in self.children:
        # the alias number is taken before the child generates its own SQL
        tableAlias = "q%d" % algorithm.counter()
        subquerySQLByAlias[tableAlias] = child.generateSQL_CNF(params)
        tableAliases.append(tableAlias)

        aliasIsTrueOnMissing[tableAlias] = child.trueOnMissing
        if child.trueOnMissing:
            # we are in an or operator, so if any child is true on missing,
            # missing tuples are true
            self.trueOnMissing = True
            aliasesTrueOnMissing.add(tableAlias)
        else:
            aliasesFalseOnMissing.add(tableAlias)

        childGenericIdentifiers = child.genericIdentifiers.copy()
        genericIdsByAlias[tableAlias] = childGenericIdentifiers
        self.genericIdentifiers.update(childGenericIdentifiers)
        for genericIdentifier in childGenericIdentifiers:
            aliasesByVariable.setdefault(genericIdentifier, set()).add(tableAlias)

        usedSeparators = set()
        usesAllSeparators = True
        for usedSeparatorVariable in self.usedSeparatorVars:
            if child.usesSeparator(usedSeparatorVariable):
                formattedSeparator = self.formatSeparatorVariable(
                    usedSeparatorVariable)
                usedSeparators.add(formattedSeparator)
                aliasesByVariable.setdefault(
                    formattedSeparator, set()).add(tableAlias)
            else:
                usesAllSeparators = False
        if usesAllSeparators:
            aliasesUsingAllSeparators.add(tableAlias)
        variablesByAlias[tableAlias] = usedSeparators.union(
            childGenericIdentifiers)

    if not aliasesUsingAllSeparators:
        raise Exception("No subquery containing all separators!")

    missingGenericIdsByAlias = {}
    hasCompleteAlias = False
    for tableAlias in tableAliases:
        missing = self.genericIdentifiers.difference(
            genericIdsByAlias[tableAlias])
        missingGenericIdsByAlias[tableAlias] = missing
        if not missing and tableAlias in aliasesUsingAllSeparators:
            hasCompleteAlias = True
    if not hasCompleteAlias:
        # message corrected: the check is about separators, not subqueries
        raise Exception(
            "No subquery containing all separators and generic identifiers!")

    # use the child subquery sql, except if there are missing generic
    # identifiers: then we need to cross product the child relation with the
    # active domain to fill these in - we are only "pretending" to project
    # out these generic values, so we can't just ignore them as constant as
    # when we have a regular separator variable
    withClauses = []
    for tableAlias in tableAliases:
        subquerySQL = subquerySQLByAlias[tableAlias]
        missing = missingGenericIdsByAlias[tableAlias]
        if not missing:
            withClauses.append("%s as (%s)" % (tableAlias, subquerySQL))
            continue

        rawAlias = "%s_missing_generic_vars" % tableAlias
        withClauses.append("%s as (%s)" % (rawAlias, subquerySQL))
        extraDomainSelectVars = []
        extraDomainTables = []
        for (index, missingGenericVariable) in enumerate(missing, 1):
            extraDomainAlias = "A_%d" % index
            extraDomainSelectVars.append(
                "%s.v0 as %s" % (extraDomainAlias, missingGenericVariable))
            extraDomainTables.append("A %s" % extraDomainAlias)
            # now this generic identifier is in the table alias
            variablesByAlias[tableAlias].add(missingGenericVariable)
            aliasesByVariable[missingGenericVariable].add(tableAlias)
        withClauses.append("%s as (select *, %s from %s, %s)" % (
            tableAlias, ", ".join(extraDomainSelectVars),
            rawAlias, ", ".join(extraDomainTables)))

    joinSubqueries = []
    previousAliases = []

    # these can be done via an inner join, as any one missing => true
    for tableAlias in aliasesTrueOnMissing:
        if previousAliases:
            joinConditions = []
            for variable in variablesByAlias[tableAlias]:
                # one equality against the first earlier alias is enough
                for previousAlias in previousAliases:
                    if variable in variablesByAlias[previousAlias]:
                        joinConditions.append("%s.%s = %s.%s" % (
                            tableAlias, variable, previousAlias, variable))
                        break
            joinSubqueries.append(" INNER JOIN %s ON %s" % (
                tableAlias, " AND ".join(joinConditions) or "TRUE"))
        else:
            joinSubqueries.append(tableAlias)
        previousAliases.append(tableAlias)

    joinType = "LEFT OUTER" if previousAliases else "FULL OUTER"

    # these require an outer join, as any missing do not imply true
    for tableAlias in aliasesFalseOnMissing:
        # possible there were no tables true on missing, so we may still
        # start here
        if previousAliases:
            joinConditions = []
            for variable in variablesByAlias[tableAlias]:
                # if this variable is in a true on missing table, we only
                # need to join with that; otherwise we COALESCE() over all
                # earlier outer-joined tables, as any of them may be NULL
                trueOnMissingJoinAlias = None
                falseOnMissingJoinAliases = []
                for previousAlias in previousAliases:
                    if variable not in variablesByAlias[previousAlias]:
                        continue
                    if aliasIsTrueOnMissing[previousAlias]:
                        trueOnMissingJoinAlias = previousAlias
                        break
                    falseOnMissingJoinAliases.append(previousAlias)
                if trueOnMissingJoinAlias:
                    joinConditions.append("%s.%s = %s.%s" % (
                        tableAlias, variable, trueOnMissingJoinAlias, variable))
                elif falseOnMissingJoinAliases:
                    joinConditions.append("%s.%s = COALESCE(%s)" % (
                        tableAlias, variable,
                        ", ".join(["%s.%s" % (alias, variable)
                                   for alias in falseOnMissingJoinAliases])))
            joinSubqueries.append(" %s JOIN %s ON %s" % (
                joinType, tableAlias, " AND ".join(joinConditions) or "TRUE"))
        else:
            joinSubqueries.append(tableAlias)
        previousAliases.append(tableAlias)

    selectClause = []
    for variable in set(self.formattedUsedSeparators).union(
            self.genericIdentifiers):
        # if the variable to select is in at least one true on missing
        # table, we just arbitrarily select one (as they all must be
        # non-null and equal in any result row)
        trueOnMissingTableAlias = None
        falseOnMissingTableAliases = []
        for tableAlias in aliasesByVariable[variable]:
            if aliasIsTrueOnMissing[tableAlias]:
                trueOnMissingTableAlias = tableAlias
                break
            falseOnMissingTableAliases.append(tableAlias)
        if trueOnMissingTableAlias:
            selectClause.append("%s.%s" % (trueOnMissingTableAlias, variable))
        else:
            selectClause.append("COALESCE(%s) as %s" % (
                ", ".join(["%s.%s" % (alias, variable)
                           for alias in falseOnMissingTableAliases]),
                variable))

    useLog = params['useLog']
    # 'useNull' is only consulted in log mode, as before
    useNull = params['useNull'] if useLog else False

    pSelect = []
    for tableAlias in tableAliases:
        if not useLog:
            pSelect.append("COALESCE(1-%s.pUse, 1)" % tableAlias)
        elif useNull:
            pSelect.append("COALESCE(1-exp(%s.pUse), 1)" % tableAlias)
        else:
            pSelect.append(
                "COALESCE(CASE WHEN %s.pUse != '-Infinity' THEN 1-exp(%s.pUse) ELSE 1 END, 1)"
                % (tableAlias, tableAlias))
    pUseString = "1 - ( %s )" % (' * '.join(pSelect))

    if not useLog:
        selectClause.append("%s as pUse" % pUseString)
    elif useNull:
        selectClause.append(
            "CASE WHEN %s > 0 THEN ln(%s) ELSE NULL END AS pUse"
            % (pUseString, pUseString))
    else:
        selectClause.append(
            "CASE WHEN %s > 0 THEN ln(%s) ELSE '-Infinity' END AS pUse"
            % (pUseString, pUseString))

    fromString = "".join(joinSubqueries)
    selectString = ", ".join(selectClause)
    withString = ",\n".join(withClauses)
    return "\nWITH %s\nselect %s from %s" % (
        withString, selectString, fromString)
def generateSQL_DNF(self, separatorSubs=None):
    """Build the DNF SQL for this inclusion/exclusion node.

    Every child query becomes a derived table ``q<ident>``. With separator
    substitutions the tables are chained with ``FULL OUTER JOIN`` on the
    ``c<i>`` columns of the immediately preceding table; without them they
    are listed comma-separated. Children that both carry a generic constant
    are tied together in a ``where`` clause. The resulting ``pUse`` is the
    sum of ``-1 * coeff * pUse`` over the children, with ``coeff`` taken from
    ``self.coeffs`` by child position.

    Args:
        separatorSubs: list of ``(replacementValue, separator)`` pairs handed
            down by the enclosing nodes, or ``None`` for "none". Each child
            receives its own copy.

    Returns:
        The SQL text of the combined query.

    Side effects:
        Sets ``self.genericConstantStr`` for children with a generic
        constant; advances the shared ``algorithm`` counter.
    """
    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAtts = []
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0

    for child in self.children:
        childSQL = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((childSQL, ident))

    if self.hasGenericConstant():
        selectAtts.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    # every pair of children (smaller alias number first) that both expose a
    # generic constant must agree on it
    joinConditions = [
        "q%d.%s = q%d.%s" % (j1, counterIdentToGenericConstantStr[j1],
                             j2, counterIdentToGenericConstantStr[j2])
        for (j1, j2) in itertools.product(counters, counters)
        if j1 < j2
        and j1 in counterIdentToGenericConstantStr
        and j2 in counterIdentToGenericConstantStr
    ]
    if joinConditions:
        joinCondition = "where %s" % ' and '.join(joinConditions)
    else:
        joinCondition = ""

    subqueries = []
    # None (not False) marks "no previous table". The old truthiness test
    # mistook alias number 0 for "no previous table" and dropped the
    # FULL OUTER JOIN for the table that followed it.
    previousIdent = None
    for (childSQL, ident) in results:
        newSubquery = "(%s) as q%d" % (childSQL, ident)
        if previousIdent is not None and separatorSubs:
            condition = "ON %s" % " and ".join(
                ["q%d.c%d = q%d.c%d" % (previousIdent, sep, ident, sep)
                 for (sep, _) in separatorSubs])
            subqueries.append(
                "FULL OUTER JOIN %s %s" % (newSubquery, condition))
        else:
            subqueries.append(newSubquery)
        previousIdent = ident

    # joined tables are space-separated, a plain table list needs commas
    if separatorSubs:
        subqueryString = " ".join(subqueries)
    else:
        subqueryString = ", ".join(subqueries)

    pString = ' + '.join(["( -1 * %d * q%d.pUse)" % (self.coeffs[position], ident)
                          for position, ident in enumerate(counters)])

    # after a full outer join any side may be NULL, so take the first
    # non-NULL copy of each separator column
    for (sep, _) in separatorSubs:
        selectAtts.append("COALESCE(%s) as c%d" % (
            ", ".join(["q%d.c%d" % (ident, sep) for ident in counters]), sep))

    attString = ', '.join(selectAtts)
    if attString:
        selectString = '%s, %s as pUse' % (attString, pString)
    else:
        selectString = '%s as pUse' % (pString)
    return "\n -- inclusion/exclusion \n select %s from %s %s" % (
        selectString, subqueryString, joinCondition)
def generateSQL_CNF(self, params):
    """Build the CNF query that aggregates the child's ``pUse`` per group.

    The child's query is wrapped under a fresh ``q<n>`` alias and its
    ``pUse`` column is aggregated with ``SUM`` when ``params['useLog']`` is
    truthy and with ``prod_double`` otherwise.  Generic identifiers and the
    ``sep_var_*`` columns of ``self.usedSeparatorVars`` are selected and
    grouped on.

    Side effects: copies ``trueOnMissing`` and ``genericIdentifiers`` from
    the child onto ``self`` (and may add one more generic identifier).

    :param params: settings mapping; ``missingTuples`` and ``useLog`` are
        always read, ``useNull`` and ``domainSize`` only on some paths.
    :returns: the SQL text.
    """
    # The returned value is not used in this method; the call is kept because
    # it may advance shared state -- TODO confirm.
    algorithm.attCounter()

    childSQL = self.child.generateSQL_CNF(params)
    self.trueOnMissing = self.child.trueOnMissing
    self.genericIdentifiers = self.child.genericIdentifiers.copy()
    subqueryAlias = 'q%d' % algorithm.counter()

    # this steps replaces a universally (\forall) quantified variable
    # with a product - if some tuples can be missing, we need to count
    # and invalidate any products with too few terms (missing terms =
    # false terms)
    missingTuples = params['missingTuples']
    if missingTuples:
        # for webkb
        effectiveDomainSize = self.effectiveDomainSize or params['domainSize']
        if self.isInequalityVar(self.separator):
            effectiveDomainSize = effectiveDomainSize - 1

    # take care of generic constants, which have been projected out but
    # must still be "bubbled up" to the previous level
    selectAttributes = list(self.genericIdentifiers)
    groupByAttributes = list(selectAttributes)

    joinClause = ''
    if self.isGenericInequalityVar(self.separator):
        genericConstantIdentifier = "sep_var_%s" % str(self.replacementVal)
        selectAttributes.append('A.v0 as %s' % genericConstantIdentifier)
        joinClause = ', A WHERE A.v0 != %s.%s' % (
            subqueryAlias, genericConstantIdentifier)
        groupByAttributes.append('A.v0')
        self.genericIdentifiers.add(genericConstantIdentifier)

    for storedReplacementVal in self.usedSeparatorVars:
        selectAttributes.append("sep_var_%s" % str(storedReplacementVal))
        groupByAttributes.append("sep_var_%s" % str(storedReplacementVal))

    groupByClause = (
        ('group by ' + ', '.join(groupByAttributes))
        if groupByAttributes else '')

    # Pick the aggregate expression for pUse.
    havingClause = ''
    if params['useLog']:
        if self.trueOnMissing:
            if params['useNull']:
                pUseColumn = ('CASE WHEN COUNT(*) = COUNT(pUse) '
                              'THEN SUM(pUse) ELSE NULL END AS pUse')
                # if the result is empty and trueOnMissing=True, we should
                # return empty set (true)
                havingClause = ' HAVING COUNT(*) > 0'
            else:
                pUseColumn = 'SUM(pUse) AS pUse'
        elif params['useNull']:
            if missingTuples:
                pUseColumn = (
                    'CASE WHEN COUNT(*) = COUNT(pUse) and COUNT(*) = %d '
                    'THEN SUM(pUse) ELSE NULL END AS pUse'
                    % effectiveDomainSize)
            else:
                pUseColumn = ('CASE WHEN COUNT(*) = COUNT(pUse) '
                              'THEN SUM(pUse) ELSE NULL END AS pUse')
        elif missingTuples:
            pUseColumn = (
                "CASE WHEN COUNT(*) = %d THEN SUM(pUse) "
                "ELSE '-Infinity' END AS pUse" % effectiveDomainSize)
        else:
            pUseColumn = 'SUM(pUse) AS pUse'
    elif not self.trueOnMissing and missingTuples:
        pUseColumn = (
            "CASE WHEN COUNT(*) = %d THEN prod_double(pUse) "
            "ELSE 0 END AS pUse" % effectiveDomainSize)
    else:
        pUseColumn = 'prod_double(pUse) AS pUse'
    selectAttributes.append(pUseColumn)

    selectClause = ', '.join(selectAttributes)
    return "\n select %s from (%s) as %s%s %s %s" % (
        selectClause, childSQL, subqueryAlias, joinClause,
        groupByClause, havingClause)
def generateSQL_DNF(self, separatorSubs=None):
    """Build the independent-union query over this node's children.

    Each child contributes its own DNF query (it is handed a copy of
    ``separatorSubs``) under the alias ``q<ident>``, where ``ident`` is drawn
    from ``algorithm.counter()``.  The children are chained with
    ``FULL OUTER JOIN`` and the emitted ``pUse`` column is
    ``1-`` the ``*``-joined list of ``COALESCE(1-q<ident>.pUse,1)`` terms.

    Adjacent children are joined on the ``c<id>`` columns of the separators
    both of them use (per ``child.usesSeparator``).  When they share none,
    the join condition is ``ON true``; the previous code emitted a bare
    ``ON `` there, which is not valid SQL.

    Side effect: ``self.genericConstantStr`` is overwritten by every child
    that reports a generic constant, so the last such child wins.

    :param separatorSubs: sequence of ``(id, value)`` pairs; only the ids are
        read here.  Defaults to no separators.
    :returns: the SQL text.
    """
    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAtts = []
    identToTermSubs = {}  # ident -> ids of the separators that child uses
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0
    for child in self.children:
        sql = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((sql, ident))
        identToTermSubs[ident] = {
            subId for (subId, _) in separatorSubs
            if child.usesSeparator(subId)
        }

    if self.hasGenericConstant():
        selectAtts.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    # Every pair of children that both expose a generic constant is equated.
    joinConditions = [
        "q%d.%s = q%d.%s" % (j1, counterIdentToGenericConstantStr[j1],
                             j2, counterIdentToGenericConstantStr[j2])
        for (j1, j2) in itertools.product(counters, counters)
        if j1 < j2
        and j1 in counterIdentToGenericConstantStr
        and j2 in counterIdentToGenericConstantStr
    ]
    if joinConditions:
        joinCondition = "where %s" % " and ".join(joinConditions)
    else:
        joinCondition = ""

    subqueries = []
    previousIdent = False
    for (sql, ident) in results:
        newSubquery = "(%s) as q%d" % (sql, ident)
        if previousIdent:
            shared = [
                "q%d.c%d = q%d.c%d" % (previousIdent, i, ident, i)
                for (i, x) in separatorSubs
                if i in identToTermSubs[previousIdent]
                and i in identToTermSubs[ident]
            ]
            # Nothing to match on (no separators at all, or none used by
            # both sides): join unconditionally instead of emitting "ON ".
            if shared:
                condition = "ON %s" % " and ".join(shared)
            else:
                condition = "ON true"
            subqueries.append(
                "FULL OUTER JOIN %s %s" % (newSubquery, condition))
        else:
            subqueries.append(newSubquery)
        previousIdent = ident
    subqueryString = " ".join(subqueries)

    pString = "*".join(["COALESCE(1-q%d.pUse,1)" % i for i in counters])

    for (i, x) in separatorSubs:
        # NOTE(review): previousIdent still holds the last child's ident
        # here, so a separator column is only emitted when the last child
        # uses it.  This mirrors the existing behaviour; confirm that it is
        # intended rather than a leftover from the join loop above.
        attsToCoalesce = ", ".join([
            "q%d.c%d" % (ident, i)
            for ident in counters
            if i in identToTermSubs[previousIdent]
            and i in identToTermSubs[ident]
        ])
        if attsToCoalesce:
            selectAtts.append("COALESCE(%s) as c%d" % (attsToCoalesce, i))

    attString = ", ".join(selectAtts)
    if attString:
        selectString = "%s, 1-%s as pUse" % (attString, pString)
    else:
        selectString = "1-%s as pUse" % (pString)

    sql = "\n -- independent union \n select %s from %s %s" % (
        selectString, subqueryString, joinCondition)
    return sql
def generateSQL_CNF(self, params):
    """Build the independent-union CNF query over this node's children.

    Each child's SQL becomes a CTE named ``q<id>`` (``id`` drawn from
    ``algorithm.counter()``).  Every CTE is projected onto one shared,
    fixed-order column list: the formatted used separators, then the generic
    identifiers, then ``pUse`` and ``trueOnMissing``.  A column the child
    does not provide is filled from the ``v0`` column of an aliased copy of
    table ``A``.  The projections are combined with ``UNION ALL`` and
    aggregated per group by an ``iunion_*`` aggregate picked from
    ``params['useLog']`` / ``params['useNull']``.

    Compared with the previous version, the FROM list of each projection is
    built by joining the table names, so a projection that needs no fill-in
    table no longer ends in a dangling ``, ``.  Locals that were computed
    but never read have been dropped.

    Side effects: sets ``self.genericIdentifiers`` to the union of the
    children's identifiers and ``self.trueOnMissing`` to ``False`` as soon as
    one child is not true on missing (``True`` otherwise).

    :param params: settings mapping, also handed to every child;
        ``useLog`` is read here, and ``useNull`` when ``useLog`` is truthy.
    :returns: the SQL text.
    """
    tableAliases = []
    aliasToSubquerySQL = {}
    aliasToGenericIdentifiers = {}
    aliasToUsedSeparators = {}
    aliasIsTrueOnMissing = {}
    aliasesUsingAllSeparators = set()

    self.genericIdentifiers = set()
    # if any child is not true on missing, this gets set to False
    self.trueOnMissing = True

    # assign each child a table alias and fetch its SQL code, then record
    # which identifiers/separator vars are used by each child
    for child in self.children:
        currentSubqueryID = algorithm.counter()
        subquerySQL = child.generateSQL_CNF(params)
        tableAlias = "q%d" % currentSubqueryID
        tableAliases.append(tableAlias)
        aliasToSubquerySQL[tableAlias] = subquerySQL

        aliasIsTrueOnMissing[tableAlias] = child.trueOnMissing
        if not child.trueOnMissing:
            self.trueOnMissing = False

        childGenericIdentifiers = child.genericIdentifiers.copy()
        aliasToGenericIdentifiers[tableAlias] = childGenericIdentifiers
        self.genericIdentifiers.update(childGenericIdentifiers)

        usedSeparators = set()
        usesAllSeparators = True
        for usedSeparatorVariable in self.usedSeparatorVars:
            if child.usesSeparator(usedSeparatorVariable):
                usedSeparators.add(
                    self.formatSeparatorVariable(usedSeparatorVariable))
            else:
                usesAllSeparators = False
        aliasToUsedSeparators[tableAlias] = usedSeparators
        if usesAllSeparators:
            aliasesUsingAllSeparators.add(tableAlias)

    # Work out, per alias, which shared columns must be filled in.  An alias
    # that has every generic identifier and uses every separator variable
    # needs none.
    formattedSeparators = set(self.formattedUsedSeparators)
    aliasToMissingVariables = {}
    for tableAlias in tableAliases:
        missingGenericIdentifiers = self.genericIdentifiers.difference(
            aliasToGenericIdentifiers[tableAlias])
        if (not missingGenericIdentifiers
                and tableAlias in aliasesUsingAllSeparators):
            aliasToMissingVariables[tableAlias] = set()
        else:
            aliasToMissingVariables[tableAlias] = (
                missingGenericIdentifiers.union(
                    formattedSeparators.difference(
                        aliasToUsedSeparators[tableAlias])))

    # need a fixed order for selected attributes, before building the union
    # queries
    orderedAttributes = list(self.formattedUsedSeparators)
    orderedAttributes.extend(self.genericIdentifiers)
    orderedAttributes.append("pUse")
    orderedAttributes.append("trueOnMissing")

    selectVariables = formattedSeparators.union(self.genericIdentifiers)

    unionSubqueries = []
    for tableAlias in tableAliases:
        missingSelectVariables = aliasToMissingVariables[tableAlias]
        selectAttributeMap = {}
        fromTables = [tableAlias]
        index = 0
        for attribute in selectVariables:
            if attribute in missingSelectVariables:
                domainTable = "A%d" % index
                fromTables.append("A %s" % domainTable)
                selectAttributeMap[attribute] = "%s.v0 as %s" % (
                    domainTable, attribute)
                index += 1
            else:
                selectAttributeMap[attribute] = "%s.%s" % (
                    tableAlias, attribute)
        selectAttributeMap["pUse"] = "%s.pUse" % tableAlias
        selectAttributeMap["trueOnMissing"] = "%s as trueOnMissing" % str(
            aliasIsTrueOnMissing[tableAlias])
        selectAttributeString = self.getOrderedSelectString(
            orderedAttributes, selectAttributeMap)
        # Joining the list avoids "FROM qN, " when no fill-in table is needed.
        unionSubqueries.append("SELECT %s FROM %s" % (
            selectAttributeString, ", ".join(fromTables)))

    selectAttributeMap = {
        attribute: "%s" % (attribute) for attribute in selectVariables}
    numberOfChildrenFalseOnMissing = len(self.children) - sum(
        aliasIsTrueOnMissing.values())
    if params["useLog"]:
        if params["useNull"]:
            aggregate = ("iunion_log_null_%d_false_on_missing"
                         "(pUse, trueOnMissing) as pUse")
        else:
            aggregate = ("iunion_log_neginf_%d_false_on_missing"
                         "(pUse, trueOnMissing) as pUse")
    else:
        aggregate = "iunion_%d_false_on_missing(pUse, trueOnMissing) as pUse"
    selectAttributeMap["pUse"] = aggregate % numberOfChildrenFalseOnMissing
    selectClause = self.getOrderedSelectString(
        orderedAttributes, selectAttributeMap)

    groupByAttributeMap = {
        attribute: "%s" % (attribute) for attribute in selectVariables}
    if groupByAttributeMap:
        groupByClause = "GROUP BY %s" % self.getOrderedSelectString(
            orderedAttributes, groupByAttributeMap)
    else:
        groupByClause = ""

    withClause = ",\n".join(
        "%s as (%s)" % (tableAlias, aliasToSubquerySQL[tableAlias])
        for tableAlias in tableAliases)
    unionClause = " UNION ALL ".join(unionSubqueries)
    unionClauseAlias = "q%d" % algorithm.counter()

    joinSQL = "\n -- independent union \n WITH %s select %s from (%s) %s %s" % (
        withClause,
        selectClause,
        unionClause,
        unionClauseAlias,
        groupByClause,
    )
    return joinSQL
def generateSQL_DNF(self, separatorSubs=None):
    """Build the independent-union query over this node's children.

    Each child contributes its own DNF query (it is handed a copy of
    ``separatorSubs``) under the alias ``q<ident>``, where ``ident`` is drawn
    from ``algorithm.counter()``.  The children are chained with
    ``FULL OUTER JOIN`` and the emitted ``pUse`` column is the ``+``-joined
    list of ``COALESCE(q<ident>.pUse,<lam>)`` terms, where ``lam`` is the
    matching child's ``lam`` attribute written between angle brackets.

    Adjacent children are joined on the ``c<id>`` columns of the separators
    both of them use (per ``child.usesSeparator``).  When they share none,
    the join condition is ``ON true``; the previous code emitted a bare
    ``ON `` there, which is not valid SQL.

    Side effect: ``self.genericConstantStr`` is overwritten by every child
    that reports a generic constant, so the last such child wins.

    :param separatorSubs: sequence of ``(id, value)`` pairs; only the ids are
        read here.  Defaults to no separators.
    :returns: the SQL text.
    """
    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAtts = []
    identToTermSubs = {}  # ident -> ids of the separators that child uses
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0
    for child in self.children:
        sql = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((sql, ident))
        identToTermSubs[ident] = {
            subId for (subId, _) in separatorSubs
            if child.usesSeparator(subId)
        }

    if self.hasGenericConstant():
        selectAtts.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    # Every pair of children that both expose a generic constant is equated.
    joinConditions = [
        "q%d.%s = q%d.%s" % (j1, counterIdentToGenericConstantStr[j1],
                             j2, counterIdentToGenericConstantStr[j2])
        for (j1, j2) in itertools.product(counters, counters)
        if j1 < j2
        and j1 in counterIdentToGenericConstantStr
        and j2 in counterIdentToGenericConstantStr
    ]
    if joinConditions:
        joinCondition = "where %s" % ' and '.join(joinConditions)
    else:
        joinCondition = ""

    subqueries = []
    previousIdent = False
    for (sql, ident) in results:
        newSubquery = "(%s) as q%d \n" % (sql, ident)
        if previousIdent:
            shared = [
                "q%d.c%d = q%d.c%d" % (previousIdent, i, ident, i)
                for (i, x) in separatorSubs
                if i in identToTermSubs[previousIdent]
                and i in identToTermSubs[ident]
            ]
            # Nothing to match on (no separators at all, or none used by
            # both sides): join unconditionally instead of emitting "ON ".
            if shared:
                condition = "ON %s" % " and ".join(shared)
            else:
                condition = "ON true"
            subqueries.append(
                "FULL OUTER JOIN %s %s" % (newSubquery, condition))
        else:
            subqueries.append(newSubquery)
        previousIdent = ident
    subqueryString = " ".join(subqueries)

    # pString = '*'.join(["COALESCE(1-q%d.pUse,1)" % i for i in counters])
    pString = '+'.join([
        "COALESCE(q%d.pUse,<%s>)" % (i, l)
        for i, l in zip(counters, [c.lam for c in self.children])
    ])

    for (i, x) in separatorSubs:
        # NOTE(review): previousIdent still holds the last child's ident
        # here, so a separator column is only emitted when the last child
        # uses it.  This mirrors the existing behaviour; confirm that it is
        # intended rather than a leftover from the join loop above.
        attsToCoalesce = ", ".join([
            "q%d.c%d" % (ident, i)
            for ident in counters
            if i in identToTermSubs[previousIdent]
            and i in identToTermSubs[ident]
        ])
        if attsToCoalesce:
            selectAtts.append("COALESCE(%s) as c%d" % (attsToCoalesce, i))

    attString = ', '.join(selectAtts)
    if attString:
        selectString = '%s, %s as pUse' % (attString, pString)
    else:
        selectString = '%s as pUse' % (pString)

    sql = "\n select %s from %s %s" % (
        selectString, subqueryString, joinCondition)
    return sql
def generateSQL_CNF(self, params):
    """Build the independent-union CNF query over this node's children.

    Each child's SQL becomes a CTE named ``q<id>`` (``id`` drawn from
    ``algorithm.counter()``).  Every CTE is projected onto one shared,
    fixed-order column list: the formatted used separators, then the generic
    identifiers, then ``pUse`` and ``trueOnMissing``.  A column the child
    does not provide is filled from the ``v0`` column of an aliased copy of
    table ``A``.  The projections are combined with ``UNION ALL`` and
    aggregated per group by an ``iunion_*`` aggregate picked from
    ``params['useLog']`` / ``params['useNull']``.

    Compared with the previous version, the FROM list of each projection is
    built by joining the table names, so a projection that needs no fill-in
    table no longer ends in a dangling ``, ``.  Locals that were computed
    but never read have been dropped.

    Side effects: sets ``self.genericIdentifiers`` to the union of the
    children's identifiers and ``self.trueOnMissing`` to ``False`` as soon as
    one child is not true on missing (``True`` otherwise).

    :param params: settings mapping, also handed to every child;
        ``useLog`` is read here, and ``useNull`` when ``useLog`` is truthy.
    :returns: the SQL text.
    """
    tableAliases = []
    aliasToSubquerySQL = {}
    aliasToGenericIdentifiers = {}
    aliasToUsedSeparators = {}
    aliasIsTrueOnMissing = {}
    aliasesUsingAllSeparators = set()

    self.genericIdentifiers = set()
    # if any child is not true on missing, this gets set to False
    self.trueOnMissing = True

    # assign each child a table alias and fetch its SQL code, then record
    # which identifiers/separator vars are used by each child
    for child in self.children:
        currentSubqueryID = algorithm.counter()
        subquerySQL = child.generateSQL_CNF(params)
        tableAlias = "q%d" % currentSubqueryID
        tableAliases.append(tableAlias)
        aliasToSubquerySQL[tableAlias] = subquerySQL

        aliasIsTrueOnMissing[tableAlias] = child.trueOnMissing
        if not child.trueOnMissing:
            self.trueOnMissing = False

        childGenericIdentifiers = child.genericIdentifiers.copy()
        aliasToGenericIdentifiers[tableAlias] = childGenericIdentifiers
        self.genericIdentifiers.update(childGenericIdentifiers)

        usedSeparators = set()
        usesAllSeparators = True
        for usedSeparatorVariable in self.usedSeparatorVars:
            if child.usesSeparator(usedSeparatorVariable):
                usedSeparators.add(
                    self.formatSeparatorVariable(usedSeparatorVariable))
            else:
                usesAllSeparators = False
        aliasToUsedSeparators[tableAlias] = usedSeparators
        if usesAllSeparators:
            aliasesUsingAllSeparators.add(tableAlias)

    # Work out, per alias, which shared columns must be filled in.  An alias
    # that has every generic identifier and uses every separator variable
    # needs none.
    formattedSeparators = set(self.formattedUsedSeparators)
    aliasToMissingVariables = {}
    for tableAlias in tableAliases:
        missingGenericIdentifiers = self.genericIdentifiers.difference(
            aliasToGenericIdentifiers[tableAlias])
        if (not missingGenericIdentifiers
                and tableAlias in aliasesUsingAllSeparators):
            aliasToMissingVariables[tableAlias] = set()
        else:
            aliasToMissingVariables[tableAlias] = (
                missingGenericIdentifiers.union(
                    formattedSeparators.difference(
                        aliasToUsedSeparators[tableAlias])))

    # need a fixed order for selected attributes, before building the union
    # queries
    orderedAttributes = list(self.formattedUsedSeparators)
    orderedAttributes.extend(self.genericIdentifiers)
    orderedAttributes.append("pUse")
    orderedAttributes.append("trueOnMissing")

    selectVariables = formattedSeparators.union(self.genericIdentifiers)

    unionSubqueries = []
    for tableAlias in tableAliases:
        missingSelectVariables = aliasToMissingVariables[tableAlias]
        selectAttributeMap = {}
        fromTables = [tableAlias]
        index = 0
        for attribute in selectVariables:
            if attribute in missingSelectVariables:
                domainTable = "A%d" % index
                fromTables.append("A %s" % domainTable)
                selectAttributeMap[attribute] = "%s.v0 as %s" % (
                    domainTable, attribute)
                index += 1
            else:
                selectAttributeMap[attribute] = "%s.%s" % (
                    tableAlias, attribute)
        selectAttributeMap["pUse"] = "%s.pUse" % tableAlias
        selectAttributeMap["trueOnMissing"] = "%s as trueOnMissing" % str(
            aliasIsTrueOnMissing[tableAlias])
        selectAttributeString = self.getOrderedSelectString(
            orderedAttributes, selectAttributeMap)
        # Joining the list avoids "FROM qN, " when no fill-in table is needed.
        unionSubqueries.append("SELECT %s FROM %s" % (
            selectAttributeString, ", ".join(fromTables)))

    selectAttributeMap = {
        attribute: "%s" % (attribute) for attribute in selectVariables}
    numberOfChildrenFalseOnMissing = len(self.children) - sum(
        aliasIsTrueOnMissing.values())
    if params['useLog']:
        if params['useNull']:
            aggregate = ("iunion_log_null_%d_false_on_missing"
                         "(pUse, trueOnMissing) as pUse")
        else:
            aggregate = ("iunion_log_neginf_%d_false_on_missing"
                         "(pUse, trueOnMissing) as pUse")
    else:
        aggregate = "iunion_%d_false_on_missing(pUse, trueOnMissing) as pUse"
    selectAttributeMap["pUse"] = aggregate % numberOfChildrenFalseOnMissing
    selectClause = self.getOrderedSelectString(
        orderedAttributes, selectAttributeMap)

    groupByAttributeMap = {
        attribute: "%s" % (attribute) for attribute in selectVariables}
    if groupByAttributeMap:
        groupByClause = "GROUP BY %s" % self.getOrderedSelectString(
            orderedAttributes, groupByAttributeMap)
    else:
        groupByClause = ""

    withClause = ",\n".join(
        "%s as (%s)" % (tableAlias, aliasToSubquerySQL[tableAlias])
        for tableAlias in tableAliases)
    unionClause = " UNION ALL ".join(unionSubqueries)
    unionClauseAlias = "q%d" % algorithm.counter()

    joinSQL = "\n WITH %s select %s from (\n %s \n) %s \n %s" % (
        withClause, selectClause, unionClause, unionClauseAlias,
        groupByClause)
    return joinSQL
def generateSQL_DNF(self, separatorSubs=None):
    """Build the inclusion/exclusion query using nested SQL function calls.

    Each child contributes its own DNF query (it is handed a copy of
    ``separatorSubs``) under the alias ``q<ident>``, where ``ident`` is drawn
    from ``algorithm.counter()``.  The ``pUse`` column is a nest of
    ``l1sum_n(acc,q<ident>.pUse)`` calls (coefficient ``-1``) and
    ``l1diff_n(acc,q<ident>.pUse)`` calls (coefficient ``1``), folded over
    the children sorted by descending coefficient and seeded with the
    literal ``0``.  With no children the expression is just that seed.

    Side effect: ``self.genericConstantStr`` is overwritten by every child
    that reports a generic constant, so the last such child wins.

    :param separatorSubs: sequence of ``(id, value)`` pairs; only the ids are
        read here, to name the ``c<id>`` columns.  Defaults to no separators.
    :returns: the SQL text.
    :raises ValueError: if a coefficient in ``self.coeffs`` is neither ``1``
        nor ``-1``.  The previous recursive helper silently returned
        ``None`` in that case, which ended up as ``None as pUse`` in the SQL
        or as a ``TypeError`` during string concatenation.
    """
    if separatorSubs is None:
        separatorSubs = []

    results = []
    counters = []
    selectAtts = []
    counterIdentToGenericConstantStr = {}
    genericConstantStrIdent = 0
    for child in self.children:
        sql = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            genericConstantStr = child.getGenericConstantStr()
            # doesn't matter which one, just pick arbitrarily
            self.genericConstantStr = genericConstantStr
            counterIdentToGenericConstantStr[ident] = genericConstantStr
            genericConstantStrIdent = ident
        counters.append(ident)
        results.append((sql, ident))

    if self.hasGenericConstant():
        selectAtts.append(
            "q%d.%s" % (genericConstantStrIdent, self.genericConstantStr))

    # Every pair of children that both expose a generic constant is equated.
    joinConditions = [
        "q%d.%s = q%d.%s" % (j1, counterIdentToGenericConstantStr[j1],
                             j2, counterIdentToGenericConstantStr[j2])
        for (j1, j2) in itertools.product(counters, counters)
        if j1 < j2
        and j1 in counterIdentToGenericConstantStr
        and j2 in counterIdentToGenericConstantStr
    ]
    if joinConditions:
        joinCondition = "where %s" % ' and '.join(joinConditions)
    else:
        joinCondition = ""

    subqueries = []
    previousIdent = False
    for (sql, ident) in results:
        newSubquery = "(%s\n) as q%d \n" % (sql, ident)
        if previousIdent and separatorSubs:
            condition = "ON %s" % " and ".join([
                "q%d.c%d = q%d.c%d" % (previousIdent, i, ident, i)
                for (i, x) in separatorSubs
            ])
            subqueries.append(
                "FULL OUTER JOIN %s %s" % (newSubquery, condition))
        else:
            subqueries.append(newSubquery)
        previousIdent = ident
    # Joined children are space separated; without separators the children
    # form a plain comma-separated FROM list.
    if separatorSubs:
        subqueryString = " ".join(subqueries)
    else:
        subqueryString = ", ".join(subqueries)

    red_list = [(self.coeffs[ind], "q%d.pUse" % i)
                for ind, i in enumerate(counters)]
    red_list = sorted(red_list, key=lambda x: x[0], reverse=True)

    # Fold the terms into nested calls; the first term in sorted order ends
    # up innermost, wrapped around the seed 0.
    pString = "0"
    for (coeff, column) in red_list:
        if coeff == -1:
            funcName = "l1sum_n"
        elif coeff == 1:
            funcName = "l1diff_n"
        else:
            raise ValueError(
                "unsupported inclusion/exclusion coefficient %r for %s "
                "(only 1 and -1 can be expressed)" % (coeff, column))
        pString = "%s(%s,%s)" % (funcName, pString, column)

    for (i, x) in separatorSubs:
        selectAtts.append(
            "COALESCE(%s) as c%d"
            % (", ".join(["q%d.c%d" % (ident, i) for ident in counters]), i))

    attString = ', '.join(selectAtts)
    if attString:
        selectString = '%s, %s as pUse' % (attString, pString)
    else:
        selectString = '%s as pUse' % (pString)

    sql = "\n select %s from %s %s" % (selectString, subqueryString,
                                       joinCondition)
    return sql
def generateSQL_CNF(self, params):
    """Build the CNF query that aggregates the child's ``pUse`` per group.

    The child's query is wrapped under a fresh ``q<n>`` alias and its
    ``pUse`` column is aggregated with ``SUM`` when ``params['useLog']`` is
    truthy and with ``prod_double`` otherwise.  Generic identifiers and the
    ``sep_var_*`` columns of ``self.usedSeparatorVars`` are selected and
    grouped on.

    Side effects: copies ``trueOnMissing`` and ``genericIdentifiers`` from
    the child onto ``self`` (and may add one more generic identifier).

    :param params: settings mapping; ``missingTuples`` and ``useLog`` are
        always read, ``useNull`` and ``domainSize`` only on some paths.
    :returns: the SQL text.
    """
    # The returned value is not used in this method; the call is kept because
    # it may advance shared state -- TODO confirm.
    algorithm.attCounter()

    childSQL = self.child.generateSQL_CNF(params)
    self.trueOnMissing = self.child.trueOnMissing
    self.genericIdentifiers = self.child.genericIdentifiers.copy()
    subqueryAlias = 'q%d' % algorithm.counter()

    # this steps replaces a universally (\forall) quantified variable
    # with a product - if some tuples can be missing, we need to count
    # and invalidate any products with too few terms (missing terms =
    # false terms)
    missingTuples = params['missingTuples']
    if missingTuples:
        # for webkb
        effectiveDomainSize = self.effectiveDomainSize or params['domainSize']
        if self.isInequalityVar(self.separator):
            effectiveDomainSize = effectiveDomainSize - 1

    # take care of generic constants, which have been projected out but
    # must still be "bubbled up" to the previous level
    selectAttributes = list(self.genericIdentifiers)
    groupByAttributes = list(selectAttributes)

    joinClause = ''
    if self.isGenericInequalityVar(self.separator):
        genericConstantIdentifier = "sep_var_%s" % str(self.replacementVal)
        selectAttributes.append('A.v0 as %s' % genericConstantIdentifier)
        joinClause = ', A WHERE A.v0 != %s.%s' % (
            subqueryAlias, genericConstantIdentifier)
        groupByAttributes.append('A.v0')
        self.genericIdentifiers.add(genericConstantIdentifier)

    for storedReplacementVal in self.usedSeparatorVars:
        selectAttributes.append("sep_var_%s" % str(storedReplacementVal))
        groupByAttributes.append("sep_var_%s" % str(storedReplacementVal))

    groupByClause = (
        ('group by ' + ', '.join(groupByAttributes))
        if groupByAttributes else '')

    # Pick the aggregate expression for pUse.
    havingClause = ''
    if params['useLog']:
        if self.trueOnMissing:
            if params['useNull']:
                pUseColumn = ('CASE WHEN COUNT(*) = COUNT(pUse) '
                              'THEN SUM(pUse) ELSE NULL END AS pUse')
                # if the result is empty and trueOnMissing=True, we should
                # return empty set (true)
                havingClause = ' HAVING COUNT(*) > 0'
            else:
                pUseColumn = 'SUM(pUse) AS pUse'
        elif params['useNull']:
            if missingTuples:
                pUseColumn = (
                    'CASE WHEN COUNT(*) = COUNT(pUse) and COUNT(*) = %d '
                    'THEN SUM(pUse) ELSE NULL END AS pUse'
                    % effectiveDomainSize)
            else:
                pUseColumn = ('CASE WHEN COUNT(*) = COUNT(pUse) '
                              'THEN SUM(pUse) ELSE NULL END AS pUse')
        elif missingTuples:
            pUseColumn = (
                "CASE WHEN COUNT(*) = %d THEN SUM(pUse) "
                "ELSE '-Infinity' END AS pUse" % effectiveDomainSize)
        else:
            pUseColumn = 'SUM(pUse) AS pUse'
    elif not self.trueOnMissing and missingTuples:
        pUseColumn = (
            "CASE WHEN COUNT(*) = %d THEN prod_double(pUse) "
            "ELSE 0 END AS pUse" % effectiveDomainSize)
    else:
        pUseColumn = 'prod_double(pUse) AS pUse'
    selectAttributes.append(pUseColumn)

    selectClause = ', '.join(selectAttributes)
    return "\n -- independent project \n select %s from (%s) as %s%s %s %s" % (
        selectClause, childSQL, subqueryAlias, joinClause,
        groupByClause, havingClause)
def generateSQL_DNF(self, separatorSubs=None):
    """Build the inclusion/exclusion query over this node's children.

    Each child contributes its own DNF query (it is handed a copy of
    ``separatorSubs``) under the alias ``q<ident>``, where ``ident`` is drawn
    from ``algorithm.counter()``.  The emitted ``pUse`` column is the
    ``+``-joined list of ``( -1 * coeff * q<ident>.pUse)`` terms, pairing
    ``self.coeffs`` with the children by position.

    Side effect: ``self.genericConstantStr`` is overwritten by every child
    that reports a generic constant, so the last such child wins.

    :param separatorSubs: sequence of ``(id, value)`` pairs; only the ids are
        read here, to name the ``c<id>`` columns.  Defaults to no separators.
    :returns: the SQL text.
    """
    if separatorSubs is None:
        separatorSubs = []

    aliasedChildren = []   # (sql, ident) per child, in child order
    constantOfIdent = {}   # ident -> generic constant column of that child
    constantIdent = 0
    for child in self.children:
        childSQL = child.generateSQL_DNF(separatorSubs[:])
        ident = algorithm.counter()
        if child.hasGenericConstant():
            # doesn't matter which child's constant is kept, just pick one
            constantStr = child.getGenericConstantStr()
            self.genericConstantStr = constantStr
            constantOfIdent[ident] = constantStr
            constantIdent = ident
        aliasedChildren.append((childSQL, ident))
    idents = [ident for (_, ident) in aliasedChildren]

    selectAtts = []
    if self.hasGenericConstant():
        selectAtts.append("q%d.%s" % (constantIdent, self.genericConstantStr))

    # Every pair of children that both expose a generic constant is equated.
    equalities = []
    for (first, second) in itertools.product(idents, idents):
        if not first < second:
            continue
        if first in constantOfIdent and second in constantOfIdent:
            equalities.append("q%d.%s = q%d.%s" % (
                first, constantOfIdent[first],
                second, constantOfIdent[second]))
    whereClause = ("where %s" % ' and '.join(equalities)) if equalities else ""

    if separatorSubs:
        # Chain the children with FULL OUTER JOINs on all separator columns.
        pieces = []
        previous = False
        for (childSQL, ident) in aliasedChildren:
            aliased = "(%s) as q%d" % (childSQL, ident)
            if not previous:
                pieces.append(aliased)
            else:
                onClause = "ON %s" % " and ".join(
                    "q%d.c%d = q%d.c%d" % (previous, subId, ident, subId)
                    for (subId, _) in separatorSubs)
                pieces.append("FULL OUTER JOIN %s %s" % (aliased, onClause))
            previous = ident
        fromClause = " ".join(pieces)
    else:
        # No separator columns: a plain comma-separated FROM list.
        fromClause = ", ".join(
            "(%s) as q%d" % pair for pair in aliasedChildren)

    terms = []
    for (position, ident) in enumerate(idents):
        terms.append("( -1 * %d * q%d.pUse)" % (self.coeffs[position], ident))
    pUseExpr = ' + '.join(terms)

    for (subId, _) in separatorSubs:
        columns = ", ".join("q%d.c%d" % (ident, subId) for ident in idents)
        selectAtts.append("COALESCE(%s) as c%d" % (columns, subId))

    attString = ', '.join(selectAtts)
    if attString:
        selectString = '%s, %s as pUse' % (attString, pUseExpr)
    else:
        selectString = '%s as pUse' % (pUseExpr)

    return "\n -- inclusion/exclusion \n select %s from %s %s" % (
        selectString, fromClause, whereClause)