def get_headers_with_cols(cols_to_include):
    """Return the headers whose positional index is in ``cols_to_include``.

    The headers are read from ``gd.get_headers()`` and kept in their
    original order; ``cols_to_include`` is a container of integer positions.
    """
    return [
        name
        for position, name in enumerate(gd.get_headers())
        if position in cols_to_include
    ]
def change_to_list_of_lists(list_of_dicts):
    """Turn a list of row dicts into a list of row lists.

    Each returned row holds that row's values ordered like
    ``gd.get_headers()``. Values are fetched one column at a time through
    ``gd.get_data_slice`` and distributed over the rows by index.
    """
    headers = gd.get_headers()
    row_count = len(list_of_dicts)
    rows = [[] for _ in range(row_count)]
    for header in headers:
        column = gd.get_data_slice(header, list_of_dicts)
        for row_idx in range(row_count):
            rows[row_idx].append(column[row_idx])
    return rows
def normalize_data():
    """Return ``[std, mean]`` for every column of the data set.

    The result has one two-element list per header, in header order, with
    the standard deviation first and the mean second. The data itself is
    not modified or returned; only these statistics are.
    """
    rows = gd.get_data_list_of_dicts()
    stats = []
    for header in gd.get_headers():
        values = convert_colFloat(gd.get_data_slice(header, rows))
        stats.append([np.std(values), np.mean(values)])
    return stats
def cleaned_headers():
    """Return the data headers with spaces and ``( ) / ^`` characters removed.

    Headers are read from ``gd.get_headers()``. Each cleaned header is also
    printed as ``"entry: <header>"``, as the original debug output did.

    Returns:
        A list of cleaned header strings, in the original header order.
    """
    stripped_chars = (" ", "(", ")", "/", "^")
    new_headers = []
    for entry in gd.get_headers():
        cleaned = "".join(ch for ch in entry if ch not in stripped_chars)
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3, unlike the bare print statement.
        print("entry: " + cleaned)
        new_headers.append(cleaned)
    return new_headers
def remove_dup_cols():
    """Collect pairs of distinct columns whose correlation is ~1.

    Every pair of columns among all but the last 15 headers is compared via
    ``find_r`` after missing values are filled by ``replace_missing``. A pair
    is recorded as ``[y, x, r[0], headers[x], headers[y]]`` when
    ``0.99 <= r <= 1.002``, the two indices differ, and ``exist_same_pair``
    reports the pair is not already recorded.

    NOTE(review): the visible code builds ``cols_2remove`` but never returns
    or otherwise uses it, so the function returns ``None`` -- confirm intent.
    NOTE(review): the reason for skipping the last 15 headers is not visible
    here -- confirm.
    """
    rows = gd.get_data_list_of_dicts()
    headers = gd.get_headers()
    cols_2remove = []
    limit = len(headers) - 15
    for x in range(limit):
        first = replace_missing(gd.get_data_slice(headers[x], rows))
        for y in range(limit):
            second = replace_missing(gd.get_data_slice(headers[y], rows))
            r = find_r(first, second)
            already_listed = exist_same_pair(cols_2remove, x, y)
            if r >= .99 and r <= 1.002 and x != y and not already_listed:
                # r is indexed, so find_r presumably returns a sequence/array.
                cols_2remove.append([y, x, r[0], headers[x], headers[y]])
def clean_headers(headers):
    """Return ``headers`` with every space character removed from each entry.

    Each cleaned header is printed as ``"entry: <header>"`` and the complete
    cleaned list is printed once before returning, as the original did.

    Args:
        headers: Iterable of header strings.

    Returns:
        A new list of the headers with spaces stripped, in the same order.
    """
    # The original also loaded gd.get_headers() into an unused local; that
    # needless data load has been dropped.
    new_headers = []
    for entry in headers:
        cleaned = "".join(ch for ch in entry if ch != " ")
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("entry: " + cleaned)
        new_headers.append(cleaned)
    print(new_headers)
    return new_headers
def create_rows_list_of_lists_without_cols(list_of_dicts, cols_to_exclude):
    """Build row lists from ``list_of_dicts``, skipping excluded columns.

    Columns are taken in ``gd.get_headers()`` order; a column is skipped when
    its positional index is in ``cols_to_exclude``. For each kept column the
    result of ``average_available_values`` on the raw column is passed to
    ``gd.get_data_slice_replace`` as the replacement value, and the resulting
    values are appended to the matching rows.

    Returns:
        A list with one list per input row.
    """
    headers = gd.get_headers()
    rows = [[] for _ in range(len(list_of_dicts))]
    for position, header in enumerate(headers):
        if position in cols_to_exclude:
            continue
        raw_column = gd.get_data_slice(header, list_of_dicts)
        fill_value = average_available_values(raw_column)
        filled_column = gd.get_data_slice_replace(header, list_of_dicts, fill_value)
        for row_idx, value in enumerate(filled_column):
            rows[row_idx].append(value)
    return rows
import get_data as gd

# Script: write "noError.csv" containing only the columns whose header,
# split on commas, has neither a " Error" part nor an empty part.
headers_clean = gd.get_headers()
list_of_dicts = gd.get_data_list_of_dicts()

# Select the headers to keep.
new_headers = []
for h in headers_clean:
    parts = h.split(",")
    if " Error" in parts or "" in parts:
        continue
    new_headers.append(",".join(parts))

# Project every row onto the kept headers.
final = [
    {header: entry[header] for header in new_headers}
    for entry in list_of_dicts
]

filename = "noError.csv"
headers = new_headers
gd.write_data_dicts(filename, headers, final)
import get_data as gd

# Script: write "justTranscripts.csv" containing only the rows whose
# "Transcript" field is not the empty string; all headers are kept.
list_of_dicts = gd.get_data_list_of_dicts()
final = [entry for entry in list_of_dicts if entry["Transcript"] != ""]

filename = "justTranscripts.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, final)
import get_data as gd import get_data2 as gd2 import get_data3 as gd3 list_of_dicts = gd.get_data_list_of_dicts() full = gd2.get_data_list_of_dicts() full_headers = gd2.get_headers() headers = gd.get_headers() headers_income = gd3.get_headers() codes = {} full_clean = [] final_headers = [] for h in headers: h = h.split(" - ") code = h[0] try: name = h[1] codes[code] = name except: print h for h2 in headers_income: h2 = h2.split(" - ") code = h2[0] try: if not "Error" in h2[1]: name = h2[1] codes[code] = name except:
import get_data as gd

# Script: drop every column whose comma-split header contains a " Error"
# part or an empty part, then write the remaining columns to "noError.csv".
headers_clean = gd.get_headers()
list_of_dicts = gd.get_data_list_of_dicts()

new_headers = []
for h in headers_clean:
    pieces = h.split(",")
    keep = " Error" not in pieces and "" not in pieces
    if keep:
        new_headers.append(",".join(pieces))

final = []
for entry in list_of_dicts:
    final.append({header: entry[header] for header in new_headers})

filename = "noError.csv"
headers = new_headers
gd.write_data_dicts(filename, headers, final)