def correlation_answer(query, table_path): #Answer questions about correlation option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') attributes = taskMap.get('correlation')[0].get('attributes') Data = [] attributes_data = [] for attribute in attributes: attr = [] attr.append(attribute) attr = attr + list(table[attribute]) attributes_data.append(attr) Data.append(list(table[attribute])) attributes_data = list(combinations(attributes_data, 2)) pearsonr = [] for i in attributes_data: object = [] object.append(i[0][0]) object.append(i[1][0]) pearsonr.append([object, stats.pearsonr(i[0][1:], i[1][1:])[0]]) min_index = 0 max_index = 0 min, max = 1, 0 for i in range(len(pearsonr)): if pearsonr[i][1] > max: max = pearsonr[i][1] max_index = i elif pearsonr[i][1] < min: min = pearsonr[i][1] min_index = i answer = 'the most relevant attribute pair : ' + pearsonr[max_index][0][ 0] + ' and ' + pearsonr[max_index][0][ 1] + '\n' + 'the least relevant attribute pair : ' + pearsonr[ min_index][0][0] + ' and ' + pearsonr[min_index][0][1] option = find_correlation_render(attributes, Data, pearsonr, query, table_path, answer) return {"option": option, "attributes": attributes, "task": "correlation"}
def parse(query, table_path): parse_instance = Parse(data_url=table_path) respon = parse_instance.analyze_query(query) taskMap = respon.get('taskMap') option = {} if "proportion" in taskMap.keys(): option = proportion_answer(query, table_path) elif "distribution" in taskMap.keys(): option = distribution_answer(query, table_path) elif "derived_value" in taskMap.keys(): option = derived_value_answer(query, table_path) elif "difference" in taskMap.keys(): option = difference_answer(query, table_path) elif "rank" in taskMap.keys(): option = rank_answer(query, table_path) elif "trend" in taskMap.keys(): option = trend_answer(query, table_path) elif "correlation" in taskMap.keys(): option = correlation_answer(query, table_path) elif "find_value" in taskMap.keys(): option = find_value_answer(query, table_path) elif "find_extremum" in taskMap.keys(): option = find_extremum_answer(query, table_path) return option
def init(): global parse_instance global TTM #Initialize the task transition matrix TTM = [[0.02, 0.16, 0.13, 0.12, 0.11, 0.14, 0.11, 0.07, 0.17], [0.16, 0.12, 0.18, 0.07, 0.12, 0.1, 0.1, 0.12, 0.13], [0.07, 0.17, 0.14, 0.12, 0.11, 0.13, 0.09, 0.08, 0.09], [0.15, 0.13, 0.11, 0.04, 0.12, 0.13, 0.07, 0.08, 0.17], [0.14, 0.12, 0.17, 0.09, 0.08, 0.11, 0.1, 0.07, 0.12], [0.16, 0.12, 0.18, 0.07, 0.12, 0.1, 0.1, 0.12, 0.13], [0.12, 0.11, 0.17, 0.13, 0.13, 0.11, 0.04, 0.06, 0.13], [0.11, 0.13, 0.14, 0.15, 0.13, 0.12, 0.07, 0.05, 0.1], [0.12, 0.14, 0.08, 0.12, 0.14, 0.15, 0.09, 0.07, 0.09]] if parse_instance is not None: return jsonify({"message": "parse already initialized"}) # dependency_parser_config = {'name': 'corenlp','model': os.path.join("assets","jars","stanford-parser-4.0.0-models.jar"),'parser': os.path.join("assets","jars","stanford-parser.jar")} parse_instance = Parse(verbose=True) return jsonify({"message": "NL4DV Initialized"})
def difference_answer(query,table_path): #Answer questions about difference option={} table=pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') difference_attributes=taskMap.get('difference')[0].get('attributes') if len(taskMap) >= 2: x_label = "" Filter = taskMap.get('filter') F = "" for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains(values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype(table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[ (table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[difference_attributes[0]]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype(table[attribute]): Label.append(attribute) if len(Label)!=0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) DATA=[] X=[] for attribute in difference_attributes: DATA.append([attribute,list(table[attribute])]) X.append(x_label) X.append(x) answer="" if len(DATA)==2: z_count=0 f_count=0 difference=[] for i in range(len(DATA[0][1])): difference.append(abs(DATA[0][1][i]-DATA[1][1][i])) if (DATA[0][1][i]-DATA[1][1][i])>=0: z_count+=1 else: f_count+=1 flag="" if z_count>=f_count: flag+="bigger" else: flag+="smaller" answer+=F+", the max difference between "+DATA[0][0]+" and "+DATA[1][0]+" is "+str(max(difference))+"\n"+"\n"+" the min difference between " +DATA[0][0]+" and "+DATA[1][0]+" is "+str(min(difference))+"\n"+"\n"+"In the all, "+DATA[0][0]+" is "+flag+" than "+DATA[1][0] elif len(DATA)>2: C=list(combinations(DATA,2)) MAX=0 MIN=99999999 MAX_pair=[] MIN_pair=[] for i in C: for j in range(len(i[0][1])): if abs(i[0][1][j]-i[1][1][j])>=MAX: MAX_pair = [] MAX=abs(i[0][1][j]-i[1][1][j]) MAX_pair.append(i[0][0]) MAX_pair.append(i[1][0]) elif abs(i[0][1][j]-i[1][1][j])<=MIN: MIN_pair=[] MIN=abs(i[0][1][j]-i[1][1][j]) MIN_pair.append(i[0][0]) MIN_pair.append(i[1][0]) answer += F+", the max difference is between " + MAX_pair[0] + " and " + MAX_pair[1] + ",the value is " + str( MAX) + "\n"+"\n"+"the min difference is between " + MIN_pair[0] + " and " + MIN_pair[1] + ", the value is " + str( MIN) option = find_difference_render(DATA, X, query, table_path,answer) elif len(taskMap)==1: DATA=[] for attribute in difference_attributes: DATA.append([attribute,list(table[attribute])]) x_attributes = [] table_attributes = table.columns for attribute in table_attributes: if attribute.lower() in utils.time: x_attributes.append(attribute) elif is_string_dtype(table[attribute]): x_attributes.append(attribute) X = [] for attribute in x_attributes: X.append([attribute,list(table[attribute])]) answer = "" if len(DATA) == 2: z_count = 0 f_count = 0 difference = [] for i in range(len(DATA[0][1])): difference.append(abs(DATA[0][1][i] - DATA[1][1][i])) if (DATA[0][1][i] - DATA[1][1][i]) >= 0: z_count += 1 else: f_count += 1 flag = "" if z_count >= f_count: flag += "bigger" else: flag += "smaller" answer +="Max difference is " + str(max(difference)) + ", min difference is " + str( min(difference)) + "\n"+"\n"+" In the all, " + DATA[0][0] + " is " + flag + " than " + DATA[1][0]+"." elif len(DATA) > 2: C = list(combinations(DATA, 2)) MAX = 0 MIN = 99999999 MAX_pair = [] MIN_pair = [] for i in C: for j in range(len(i[0][1])): if abs(i[0][1][j] - i[1][1][j]) >= MAX: MAX_pair = [] MAX = abs(i[0][1][j] - i[1][1][j]) MAX_pair.append(i[0][0]) MAX_pair.append(i[1][0]) elif abs(i[0][1][j] - i[1][1][j]) <= MIN: MIN_pair = [] MIN = abs(i[0][1][j] - i[1][1][j]) MIN_pair.append(i[0][0]) MIN_pair.append(i[1][0]) answer += "The max difference is between " + MAX_pair[0] + " and " + MAX_pair[1] + ",the value is " + str( MAX) + "\n"+"\n"+" the min difference is between " + MIN_pair[0] + " and " + MIN_pair[1] + ", the value is " + str( MIN) if len(X) == 0: z=[] z.append("Serial") t=[] for i in range(len(DATA[0][1])): t.append(i + 1) z.append(t) X.append(z) option=find_difference_render(DATA,X,query,table_path,answer) else: option=find_difference_render(DATA,X[0],query,table_path,answer) return {"option":option,"attributes":difference_attributes,"task":"difference"}
def distribution_answer(query, table_path): #Answer questions about distribution option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') y_label = taskMap.get('distribution')[0].get('attributes') replace = [] if len(taskMap) >= 2: x_label = "" Filter = taskMap.get('filter') F = "" for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] replace.append(filter_attributes) if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase == "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[y_label[0]]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) Y = [] answer = "" answer += "Under the filter conditions: " + F + "\n" for attribute in y_label: if attribute != x_label: data = list(table[attribute]) Max = max(list(table[attribute])) Min = min(list(table[attribute])) trend = "" if (data[len(data) - 1] - data[0]) >= 0: trend += "increased" else: trend += "decreased" answer += " In " + attribute + ": " + "the max is " + str( Max) + ", " + "the min is " + str(Min) # +", "+"the overall trend is "+trend Y.append([attribute, list(table[attribute])]) option = find_distribution_render(x, Y, x_label, query, table_path, answer) for i in y_label: if i in replace: y_label.remove(i) elif len(list(taskMap.keys())) == 1: Y = [] answer = "" for attribute in y_label: data = list(table[attribute]) Max = max(list(table[attribute])) Min = min(list(table[attribute])) trend = "" if (data[len(data) - 1] - data[0]) >= 0: trend += "increased" else: trend += "decreased" answer += " In " + attribute + ": " + "the max is " + str( Max) + ", " + "the min is " + str(Min) # + "\n" + "the overall trend is " + trend Y.append([attribute, list(table[attribute])]) x_attributes = [] table_attributes = table.columns for attribute in table_attributes: if attribute.lower() in utils.time: x_attributes.append(attribute) elif is_string_dtype( table[attribute]) or attribute.lower() in utils.time: x_attributes.append(attribute) X = [] for attribute in x_attributes: X.append([attribute, list(table[attribute])]) if len(X) == 0: for i in range(len(Y)): X.append(i + 1) option = find_distribution_render(X, Y, "Serial", query, table_path, answer) else: option = find_distribution_render(X[0][1], Y, X[0][0], query, table_path, answer) return {"option": option, "attributes": y_label, "task": "distribution"}
def proportion_answer(query, table_path): #answer questions about proportion option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') proportion_attribute = taskMap.get('proportion')[0].get('attributes')[0] proportion_main_attribute = "" if len(taskMap) >= 2: x_label = "" Filter = taskMap.get('filter') p_filter = [] F = "" for filter in Filter: flag = "true" for column in table.columns: if filter.get('values') == column.lower(): p_filter.append(filter) flag = "flase" filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]) and flag == "true": table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]) and flag == "true": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]) and flag == "true": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes] ) and filter_Phrase != "over" and flag == "true": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase == "over" and flag == "true": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]) and flag == "true": table[filter_attributes] = table[filter_attributes].astype(int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes proportion_main_attribute = p_filter[0].get('attributes')[0] x = [] y = list(table[proportion_attribute]) z = list(table[p_filter[0].get('attributes')[0]]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) proportion = [] for i in range(len(y)): proportion.append(round(y[i] / z[i], 2)) answer = "" if len(proportion) == 1: answer += "The proportion of " + proportion_attribute + " in " + p_filter[ 0].get('attributes')[0] + " " + F + " is " + str(proportion[0]) elif len(proportion) > 1: Max = max(proportion) Min = min(proportion) trend = "" if (proportion[len(proportion) - 1] - proportion[0]) >= 0: trend += "increased" else: trend += "decreased" answer += "In the proportion of " + proportion_attribute + " in " + p_filter[ 0].get('attributes')[0] + " " + F + ", MAx is " + str( Max) + ", Min is " + str(Min) option = find_proportion_render(x, y, z, proportion_attribute, p_filter[0].get('attributes')[0], query, table_path, answer) elif len(filter) == 1: proportion_main_attribute = taskMap.get('filter')[0].get( 'attributes')[0] x = [] y = list(table[proportion_attribute]) z = list(table[proportion_main_attribute]) Label = [] x_label = "" for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) proportion = [] for i in range(len(y)): proportion.append(round(y[i] / z[i]), 2) answer = "" Max = max(proportion) Min = min(proportion) trend = "" if (proportion(len(proportion) - 1) - proportion[0]) >= 0: trend += "increased" else: trend += "decreased" answer += "In the proportion of " + proportion_attribute + " in " + proportion_main_attribute + "\n" + "the Max is " + str( Max) + ", the Min is " + str( Min) + "\n" + " the overall trend is " + trend option = find_proportion_render(x, y, z, proportion_attribute, proportion_main_attribute, query, table_path, answer) return { "option": option, "attributes": [proportion_attribute, proportion_main_attribute], "task": "proportion" }
def trend_answer(query, table_path): #Answer questions about trend option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') y_label = taskMap.get('trend')[0].get('attributes')[0] if len(taskMap) >= 2: x_label = "" F = "" Filter = taskMap.get('filter') for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[y_label]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) y_up = [] y_down = [] up_count = 0 down_count = 0 for i in range(len(y)): if i == 0: y_up.append('-') y_down.append('-') else: if (y[i] - y[i - 1]) > 0: y_up.append(round(y[i] - y[i - 1], 2)) y_down.append('-') up_count += 1 else: y_up.append('-') y_down.append(round(y[i - 1] - y[i], 2)) down_count += 1 flag = '' if (y[len(y) - 1] - y[0]) >= 0: flag = "increase" else: flag = "decrease" Y_up = [] Y_down = [] for i in y_up: if i == '-': Y_up.append(0) else: Y_up.append(i) for i in y_down: if i == '-': Y_down.append(0) else: Y_down.append(i) max_index = np.argmax(Y_up, axis=0) min_index = np.argmax(Y_down, axis=0) z = [] for i in range(len(y)): z.append(int(y[i] / 20)) answer = ' the count of ' + y_label + " increase is " + str( up_count ) + ', ' + ' the count of ' + y_label + " decrease is " + str( down_count ) + "\n" + "the overall trend of " + y_label + " is " + flag + " " + F option = TREND_render(x, y, z, y_up, y_down, max_index, min_index, x_label, y_label, query, answer) elif len(taskMap) == 1: # only query 1 attribute trend without a label_attribute # e.g."What is the trend of GDP?" # the trend attribute queried : GDP # label_attribute : None trend_attribute = taskMap.get('trend')[0].get('attributes')[0] y = list(table[trend_attribute]) x = [] x_label = "" Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) y_up = [] y_down = [] up_count = 0 down_count = 0 for i in range(len(y)): if i == 0: y_up.append('-') y_down.append('-') else: if (y[i] - y[i - 1]) > 0: y_up.append(round(y[i] - y[i - 1], 2)) y_down.append('-') up_count += 1 else: y_up.append('-') y_down.append(round(y[i - 1] - y[i], 2)) down_count += 1 flag = '' if (y[len(y) - 1] - y[0]) >= 0: flag = "increase" else: flag = "decrease" Y_up = [] Y_down = [] for i in y_up: if i == '-': Y_up.append(0) else: Y_up.append(i) for i in y_down: if i == '-': Y_down.append(0) else: Y_down.append(i) max_index = np.argmax(Y_up, axis=0) min_index = np.argmax(Y_down, axis=0) z = [] for i in range(len(y)): z.append(int(y[i] / 20)) answer = ' the count of ' + trend_attribute + " increase is " + str( up_count ) + ', ' + ' the count of ' + trend_attribute + " decrease is " + str( down_count ) + "\n" + "\n" + " the overall trend of " + trend_attribute + " is " + flag option = TREND_render(x, y, z, y_up, y_down, max_index, min_index, x_label, trend_attribute, query, answer) return {"option": option, "attributes": [y_label], "task": "trend"}
def rank_answer(query, table_path): #Answer questions about rank option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') y_label = taskMap.get('rank')[0].get('attributes')[0] if len(taskMap) >= 2: x_label = "" Filter = taskMap.get('filter') F = "" for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[y_label]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) y_rank = np.array(y) index = np.argsort(y_rank) X = [] Y = [] for i in range(len(index)): if x_label != "Serial": X.append(x[index[i]]) else: X.append(i + 1) Y.append(y[index[i]]) answer = "" answer += "In " + y_label + ": " + "the max value is " + str( Y[0]) + ", the min value is " + str(Y[len(Y) - 1]) + " " + F option = find_rank_render(X, Y, query, table_path, answer) elif len(taskMap) == 1: rank_attribute = taskMap.get('rank')[0].get('attributes')[0] rank_queryPhrase = taskMap.get('rank')[0].get('queryPhrase') y = list(table[rank_attribute]) y_rank = np.array(y) index = np.argsort(y_rank) Y = [] for i in range(len(index)): Y.append(y[index[i]]) x_attributes = [] table_attributes = table.columns for attribute in table_attributes: if attribute.lower() in utils.time: x_attributes.append(attribute) elif is_string_dtype( table[attribute]) or attribute.lower() in utils.time: x_attributes.append(attribute) X = [] for attribute in x_attributes: X.append(list(table[attribute])) if len(X) == 0: for i in range(len(Y)): X.append(i + 1) answer = "" answer += "In " + y_label + ": " + "the max value is " + str( Y[0]) + ", the min value is " + str(Y[len(Y) - 1]) option = find_rank_render(X, Y, query, table_path, answer) else: answer = "" answer += "In " + y_label + ": " + "the max value is " + str( Y[0]) + ", the min value is " + str(Y[len(Y) - 1]) option = find_rank_render(X[0], Y, query, table_path, answer) return {"option": option, "attributes": [y_label], "task": "rank"}
def find_extremum_answer(query, table_path): #Answer questions about find_extremum option = {} table = pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') attributes = taskMap.get('find_extremum')[0].get('attributes') operator = taskMap.get('find_extremum')[0].get('operator') queryPhrase = taskMap.get('find_extremum')[0].get('queryPhrase') attributeMap = respon.get('attributeMap') Attributes = list(attributeMap.keys()) if len(attributes) == 2: # in query “Which year has the largest gdp?” # y=gdp # x=year if len(taskMap) == 1: y_label = attributes[0] x_label = attributes[1] y = list(table[attributes[0]]) x = list(table[attributes[1]]) min_index = y.index(min(y)) max_index = y.index(max(y)) answer = "" if operator == "MAX": answer += str( x[max_index] ) + " has the " + queryPhrase + " " + y_label.lower( ) + ": " + str(max(y)) else: answer += str( x[min_index] ) + " has the " + queryPhrase + " " + y_label.lower( ) + ": " + str(min(y)) option = find_extremum_render(x, y, x_label, y_label, query, operator, answer) else: x_label = "" F = "" Filter = taskMap.get('filter') for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]): table[filter_attributes] = table[ filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[attributes[0]]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) min_index = y.index(min(y)) max_index = y.index(max(y)) answer = "" if operator == "MAX": answer += str( x[max_index] ) + " has the " + queryPhrase + " " + attributes[0].lower( ) + " " + F + ": " + str(max(y)) else: answer += str( x[min_index] ) + " has the " + queryPhrase + " " + attributes[0].lower( ) + " " + F + ": " + str(min(y)) option = find_extremum_render(x, y, x_label, attributes[0], query, operator, answer) elif len(attributes) == 1: #have 2 explicit attributes if len(taskMap) >= 2: x_label = "" F = "" Filter = taskMap.get('filter') for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype( table[filter_attributes]): table[filter_attributes] = table[ filter_attributes].str.lower() table = table[table[filter_attributes].str.contains( values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype( table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype( int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype( table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype( int) table = table[(table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[ 1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[attributes[0]]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) answer = "" if operator == "MAX": answer += "The " + queryPhrase + " " + attributes[0].lower( ) + " " + F + " is " + str(max(y)) else: answer += "The " + queryPhrase + " " + attributes[0].lower( ) + " " + F + " is " + str(min(y)) option = find_extremum_render(x, y, x_label, attributes[0], query, operator, answer) else: Label = [] x_label = "" x = [] y = list(table[attributes[0]]) for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype( table[attribute]): Label.append(attribute) if len(Label) != 0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) answer = "" if operator == "MAX": answer += "The " + queryPhrase + " " + attributes[0].lower( ) + " is " + str(max(y)) else: answer += "The " + queryPhrase + " " + attributes[0].lower( ) + " is " + str(min(y)) option = find_extremum_render(x, y, x_label, attributes[0], query, operator, answer) return { "option": option, "attributes": [attributes[0]], "task": "find_extremum" }
def derived_value_answer(query,table_path): #Answer questions about derived_value option={} table=pd.read_csv(table_path) Parse_instance = Parse(data_url=table_path) respon = Parse_instance.analyze_query(query) taskMap = respon.get('taskMap') y_label = taskMap.get('derived_value')[0].get('attributes')[0] if len(taskMap) >= 2: x_label = "" Filter = taskMap.get('filter') derived_operator = taskMap.get('derived_value')[0].get('operator') F = "" for filter in Filter: filter_Phrase = filter.get("queryPhrase") filter_opertaor = filter.get("operator") values = filter.get("values") filter_attributes = filter.get("attributes")[0] if filter_opertaor == "EQ" and is_string_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].str.lower() table = table[table[filter_attributes].str.contains(values.lower())] F = F + filter_Phrase + " " + values + " " + filter_attributes elif filter_opertaor == "EQ" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] == int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "LT" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] <= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_opertaor == "GT" and is_numeric_dtype(table[filter_attributes]) and filter_Phrase != "over": table[filter_attributes] = table[filter_attributes].astype(int) table = table[table[filter_attributes] >= int(values)] F = F + filter_Phrase + " " + values + " " + filter_attributes x_label = filter_attributes elif filter_Phrase != "over": x_label = filter_attributes elif filter_attributes == "RANGE" and is_numeric_dtype(table[filter_attributes]): table[filter_attributes] = table[filter_attributes].astype(int) table = table[ (table[filter_attributes] <= int(values[0])) & (table[filter_attributes] >= int(values[1]))] F = F + filter_Phrase + " " + values[0] + " and " + values[1] + " " + filter_attributes x_label = filter_attributes x = [] y = list(table[y_label]) if len(x_label) == 0: Label = [] for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype(table[attribute]): Label.append(attribute) if len(Label)!=0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) else: x = list(table[x_label]) answer="" if derived_operator == 'AVG': flag = "mean" answer+="The mean of "+y_label+" is "+str(round(sum(y)/len(y),2))+" "+F option = find_aggregation_render(x, y, flag, query, table_path,answer,y_label) elif derived_operator == 'SUM': flag = "sum" answer+="The sum of "+y_label+" is "+str(round(sum(y),2))+" "+F+"\n"+"and in "+y_label+", "+"the Max is "+str(max(y))+', '+"the Min is "+str(min(y)) option = find_aggregation_render(x, y, flag, query, table_path,answer,y_label) elif len(list(taskMap.keys()))==1: derived_value_attribute = taskMap.get('derived_value')[0].get('attributes')[0] derived_operator = taskMap.get('derived_value')[0].get('operator') derived_queryPhrase = taskMap.get('derived_value')[0].get('queryPhrase') y = list(table[derived_value_attribute]) Label = [] x=[] x_label="" answer="" for attribute in table.columns: if attribute.lower() in utils.time: Label.append(attribute) for attribute in table.columns: if attribute.lower() not in utils.time and is_string_dtype(table[attribute]): Label.append(attribute) if len(Label)!=0: x_label = Label[0] x = list(table[x_label]) else: x_label = "Serial" for i in range(len(y)): x.append(i + 1) if derived_operator == 'AVG': flag = "mean" answer += "The mean of " + y_label + " is " + str(round(sum(y) / len(y), 2)) option=find_aggregation_render(x, y, flag, query, table_path,answer,y_label) elif derived_operator == 'SUM': flag = "sum" answer += "The mean of " + y_label + " is " + str(round(sum(y) / len(y), 2))+"\n"+"and in "+y_label+", "+"the Max is "+str(max(y))+', '+"the Min is "+str(min(y)) option=find_aggregation_render(x, y, flag, query, table_path,answer,y_label) return {"option":option,"attributes":[y_label],"task":"derived_value"}