def _build_query_result_lists(self):
    """Read every query-result file into per-file lists of line ids.

    For each file object in ``self.query_output_file_objects``:

    * the first line is taken as the source column path and appended to
      ``self.source_file_paths`` (trailing newline stripped);
    * every remaining line is passed through ``resolve_type`` and the
      result appended to ``self.line_id_sets[<index of the file>]``.

    Each file object is closed once it has been consumed, including when
    reading or type resolution raises, so a handle is never left open
    for a file that has been touched.
    """
    for file_obj_index, f_obj in enumerate(self.query_output_file_objects):
        try:
            # The first line of each query result is the source column
            # path; it is needed later to fetch the joined rows.
            self.source_file_paths.append(f_obj.readline().rstrip("\n"))

            for line in f_obj:
                # Expected to resolve to an int line id.
                line_id = resolve_type(line, VALID_TYPES)
                self.line_id_sets[file_obj_index].append(line_id)
        finally:
            # All useful information has been extracted (or reading
            # failed); either way this handle is no longer needed.
            f_obj.close()
def main(query_output_file_directory, new_column_output_file_directory):
    """Join query results on line id and write the joined rows per column.

    Every file found under ``query_output_file_directory`` (recursively) is
    read as one query result: its first line is the path of the source
    column file and each following line is a line id, converted with
    ``resolve_type``.

    A row is part of the join only when its line id appears in *every*
    query result. For each distinct source column, the joined rows are
    copied, in file order, into a file of the same base name inside
    ``new_column_output_file_directory``.

    :param query_output_file_directory: directory tree holding the query
        result files.
    :param new_column_output_file_directory: existing directory that
        receives one output file per referenced source column.
    """
    source_file_paths = []
    line_id_sets = []

    # Read one query result at a time so that only a single handle is open
    # at any moment and it is always released, even on error.
    for root, _dirs, files in os.walk(query_output_file_directory):
        for f_name in files:
            with io.open(os.path.join(root, f_name)) as f_obj:
                # The first line of each query result is the source column
                # path; it is needed to fetch the joined rows.
                source_file_paths.append(f_obj.readline().rstrip("\n"))
                # Remaining lines are line ids (expected to resolve to
                # int). A set gives O(1) membership tests for the join.
                line_id_sets.append(
                    set(resolve_type(line, VALID_TYPES) for line in f_obj)
                )

    # Only unique column paths matter; once the rows are joined it is
    # irrelevant which query result pointed at which column.
    referenced_column_paths = set(source_file_paths)

    # Base the join on the smallest result and intersect it with all the
    # others, so only ids present in every result survive.
    if line_id_sets:
        ordered_sets = sorted(line_id_sets, key=len)
        joined_lines = ordered_sets[0].intersection(*ordered_sets[1:])
    else:
        joined_lines = set()
    ids_wanted = len(joined_lines)

    # Write out each joined line to the respective new column.
    # NOTE: columns sharing a base name map to the same output file.
    for column in referenced_column_paths:
        column_name = os.path.basename(column)
        out_path = os.path.join(new_column_output_file_directory, column_name)

        with io.open(column, "rb") as in_column_obj:
            with io.open(out_path, "wb") as out_column_obj:
                # Number of joined ids found so far in this column; once
                # it reaches ids_wanted nothing further can match.
                ids_hit = 0
                for line_counter, line in enumerate(in_column_obj):
                    if ids_hit == ids_wanted:
                        break
                    if line_counter in joined_lines:
                        ids_hit += 1
                        out_column_obj.write(line)