def download(url, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Reshape selected value columns of a remote CSV into long form and save them.

    ``reqFields`` is upper-cased.  Its first two entries are used as the year
    and month; the remaining entries name the CSV columns to extract.  When the
    third entry is ``'ALL'``, every CSV column from position 8 onwards is used.

    For each extracted column the CSV columns named ``col[0]`` .. ``col[4]`` are
    repeated, the column name is split on ``'_'`` into ``col[5]`` and
    ``col[6]``, and the cell values are stored under ``col[7]``.  ``col[8]`` and
    ``col[9]`` are filled with the year and month.  The result is passed to
    ``dsave.save`` together with ``outPath``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when a requested column is not present in the CSV.

    NOTE(review): ``logfile`` and ``errfile`` are not parameters of this
    variant; they are assumed to exist at module level -- confirm.
    """
    reqReq = [field.upper() for field in reqFields]
    iYear = reqReq[0]
    iMonth = reqReq[1]
    wants_all = reqReq[2] == 'ALL'

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    raw_data = {name: [] for name in col}

    # operate this csv file
    logfile.write(str(now.now()) + ' csv file loading\n')
    print('csv file loading------')
    df = pd.read_csv(socket, dtype='unicode')
    cList = df.columns.tolist()
    # 'ALL' selects every column from position 8 onwards.
    reqs = cList[8:] if wants_all else reqReq[2:]

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    # These five columns are identical for every requested field, so read once.
    shared = [df.loc[:, col[n]].tolist() for n in range(5)]
    for req in reqs:
        if req not in cList:
            errfile.write(
                str(now.now()) + " Requested data " + str(req)
                + " don't match the csv file. Please check the file at: "
                + str(url) + " . End progress\n"
            )
            logfile.write(str(now.now()) + ' error and end progress\n')
            # The source is a CSV; the message used to say "excel file".
            sys.exit(
                "Requested data " + str(req)
                + " don't match the csv file. Please check the file at: " + url
            )
        valueList = df.loc[:, req].tolist()
        # extend() grows in place instead of copying the whole list per field.
        for n, values in enumerate(shared):
            raw_data[col[n]].extend(values)
        name_parts = req.split('_')
        raw_data[col[5]].extend([name_parts[0]] * len(valueList))
        raw_data[col[6]].extend([name_parts[1]] * len(valueList))
        raw_data[col[7]].extend(valueList)

    total = len(raw_data[col[0]])
    raw_data[col[8]] = [iYear] * total
    raw_data[col[9]] = [iMonth] * total
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract per-year values from one sheet of a remote Excel workbook.

    For every entry ``y`` of ``reqFields`` the header row is expected to hold
    the text ``"19 in " + y[2:]`` in exactly four cells; the column of the
    fourth occurrence is the one read.  Rows after the header whose first cell
    matches ``E`` followed by eight digits contribute one record per year:
    first cell -> ``col[0]``, third cell -> ``col[1]``, year -> ``col[2]``,
    value -> ``col[3]``.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` when no row provides a column for every year.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    yearReq = reqFields

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    workbook = pd.ExcelFile(socket)
    df = workbook.parse(sheet)

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    n_rows, n_cols = df.shape
    for row in range(n_rows):
        yearCol = []
        for year in yearReq:
            label = "19 in " + year[2:]
            hits = [c for c in range(n_cols) if df.iloc[row, c] == label]
            if hits:
                restartIndex = row + 1
            if len(hits) == 4:
                yearCol.append(hits[3])
        if len(yearCol) == len(yearReq):
            break

    if len(yearCol) != len(yearReq):
        requested = str(yearReq).strip('[]')
        errfile.write(
            str(now.now()) + " Requested data " + requested
            + " don't match the excel file. Please check the file at: "
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(
            "Requested data " + requested
            + " don't match the excel file. Please check the file at: " + url
        )

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for row in range(restartIndex, n_rows):
        if not re.match(r'E\d{8}$', str(df.iloc[row, 0])):
            continue
        # yearCol and yearReq have equal length here, so pair them directly.
        for year, column in zip(yearReq, yearCol):
            raw_data[col[0]].append(df.iloc[row, 0])
            raw_data[col[1]].append(df.iloc[row, 2])
            raw_data[col[2]].append(year)
            raw_data[col[3]].append(df.iloc[row, column])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, outPath, keyCol, digitCheckCol, noDigitRemoveFields, logfile, errfile):
    """Load the CSV behind ``url`` and pass it unchanged to ``dsave.save``.

    The CSV is read with every column as text, and the full list of its
    column names is used as the column selection for saving to ``outPath``.
    ``logfile`` receives a progress line; ``errfile`` is forwarded to
    ``openurl.openurl``.
    """
    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # load this csv file
    stamp = str(now.now())
    logfile.write(stamp + ' csv file loading\n')
    print('csv file loading------')
    frame = pd.read_csv(socket, dtype='unicode')
    columns = frame.columns.tolist()

    # save csv file
    dsave.save(
        frame,
        columns,
        keyCol,
        digitCheckCol,
        noDigitRemoveFields,
        outPath,
        logfile,
    )
def download(url, reqInfo, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Flatten a CSV whose rows come in (code row, value row) pairs.

    ``reqInfo[0:3]`` supply the year, month and sex written to ``col[2]``,
    ``col[3]`` and ``col[4]``.  The first column whose first-row cell matches
    ``E`` followed by eight digits marks where the paired data starts.  Rows
    are then consumed two at a time: the non-null cells of the first row (from
    that column on) go to ``col[5]``, those of the second row to ``col[6]``,
    and the first two cells of the first row are repeated into ``col[0]`` and
    ``col[1]``.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when no first-row cell looks like an E-code.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    NOTE(review): the two rows of a pair are ``dropna``-ed independently, so
    they are presumed to have nulls in the same positions -- verify.
    """
    iYear, iMonth, iSex = reqInfo[0], reqInfo[1], reqInfo[2]

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    raw_data = {name: [] for name in col}

    # operate this csv file
    logfile.write(str(now.now()) + ' csv file loading\n')
    print('csv file loading------')
    df = pd.read_csv(socket, dtype='unicode')

    # Find the first column holding an E-code.  The previous test
    # ``k == df.shape[1]`` could never be true after the loop, so a missing
    # E-code went unreported; ``for``/``else`` makes the failure reachable.
    for k in range(df.shape[1]):
        if re.match(r'E\d{8}$', str(df.iloc[0, k])):
            break
    else:
        errfile.write(
            str(now.now()) + " Cannot find ecode in row " + str(2)
            + ". Please check the file at: " + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(
            "Cannot find ecode in row " + str(2)
            + ". Please check the file at: " + url
        )

    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for i in range(0, df.shape[0], 2):
        # NOTE(review): str() of a missing cell is 'nan' (truthy), so this
        # guard only skips literal empty strings -- confirm the intent.
        if str(df.iloc[i, 0]):
            eList = df.iloc[i, k:].dropna().tolist()
            raw_data[col[5]].extend(eList)
            raw_data[col[6]].extend(df.iloc[i + 1, k:].dropna().tolist())
            raw_data[col[0]].extend([df.iloc[i, 0]] * len(eList))
            raw_data[col[1]].extend([df.iloc[i, 1]] * len(eList))

    total = len(raw_data[col[0]])
    raw_data[col[2]] = [iYear] * total
    raw_data[col[3]] = [iMonth] * total
    raw_data[col[4]] = [iSex] * total
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, sheet, reqFields, outPath, keyCol, digitCheckCol, noDigitRemoveFields):
    """Save the columns named in ``reqFields`` from one sheet of a remote workbook.

    The workbook is opened through ``openurl.openurl``, ``sheet`` is parsed,
    and the selection ``df.loc[:, reqFields]`` is passed to ``dsave.save``
    with ``outPath`` as the destination.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    wanted = reqFields

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + " excel file loading\n")
    print("excel file loading------")
    workbook = pd.ExcelFile(socket)
    frame = workbook.parse(sheet)

    # data reading
    logfile.write(str(now.now()) + " data reading\n")
    print("data reading------")
    selection = frame.loc[:, wanted]

    # save csv file
    dsave.save(
        selection,
        wanted,
        keyCol,
        digitCheckCol,
        noDigitRemoveFields,
        outPath,
        logfile,
    )
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract 'Percentage of persistent absentees (4)' for one school category.

    ``reqFields`` must contain exactly one entry.  The sheet is searched for a
    row in which exactly one cell contains that text; below it, a row holding
    the cell ``"Percentage of persistent absentees (4)"`` at or to the right of
    that column is located.  Every later row whose second cell matches ``E``
    followed by eight digits yields a record: second cell -> ``col[0]``,
    fourth cell -> ``col[1]``, year -> ``col[2]``, value -> ``col[3]``.  The
    year is the text before the first comma in cell (row 2, column 0) of the
    parsed sheet.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when the request has the wrong length or either header cannot be found.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    schoolReq = reqFields
    requested = str(schoolReq).strip("[]")

    def abort(message):
        # Record the failure in both logs, then stop with the same text.
        errfile.write(str(now.now()) + " " + message + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + " error and end progress\n")
        sys.exit(message + url)

    if len(schoolReq) != 1:
        abort(
            "Requested data " + requested
            + " don't match the excel file. This code is only for extracting data"
            " from filed 'State-funded primary, secondary and special schools (5)'"
            " with 'Percentage of persistent absentees (4)'."
            " Please check the file at: "
        )

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + " excel file loading\n")
    print("excel file loading------")
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)
    iYear = df.iloc[2, 0].split(",")[0]

    # indicator checking
    logfile.write(str(now.now()) + " indicator checking\n")
    print("indicator checking------")
    for i in range(df.shape[0]):
        numCol = []
        for wanted in schoolReq:
            for j in range(df.shape[1]):
                if str(wanted) in str(df.iloc[i, j]):
                    numCol.append(j)
                    restartIndex = i + 1
        if len(numCol) == len(schoolReq):
            break
    if len(numCol) != len(schoolReq):
        abort(
            "Requested data " + requested
            + " don't match the excel file. Please check the file at: "
        )

    # Search below the category header for the sub-header, looking only
    # between each category column and the next one (or the sheet edge).
    k_asked = "Percentage of persistent absentees (4)"
    bounds = numCol + [df.shape[1]]
    # Initialised so a header on the last row reports the intended error
    # instead of failing on an unbound name.
    kk = []
    for i in range(restartIndex, df.shape[0]):
        kk = []
        for start, stop in zip(bounds, bounds[1:]):
            for j in range(start, stop):
                if df.iloc[i, j] == k_asked:
                    kk.append(j)
                    restartIndex = i + 1
                    break
        if len(kk) == len(schoolReq):
            break
    if len(kk) != len(schoolReq):
        # This path used to exit without writing to errfile/logfile, unlike
        # the other failure paths in this function.
        abort(
            "Requested data " + requested
            + " in the field 'Percentage of persistent absentees (4)'"
            " don't match the excel file. Please check the file at: "
        )

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + " data reading\n")
    print("data reading------")
    for i in range(restartIndex, df.shape[0]):
        if not re.match(r"E\d{8}$", str(df.iloc[i, 1])):
            continue
        for k in kk:
            raw_data[col[0]].append(df.iloc[i, 1])
            raw_data[col[1]].append(df.iloc[i, 3])
            raw_data[col[2]].append(iYear)
            raw_data[col[3]].append(df.iloc[i, k])
    logfile.write(str(now.now()) + " data reading end\n")
    print("data reading end------")

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Flatten every sheet of an Excel workbook that has 'Aged x' columns.

    The workbook is opened directly from ``url``.  In each sheet the first
    cell that splits into exactly two words, one of them ``'Aged'``, marks the
    start of the age headers; the headers run from there to the
    second-to-last column.  Each later row yields one record per age header:
    the first three cells -> ``col[0]`` .. ``col[2]``, the fourth cell ->
    ``col[4]``, the second word of the header -> ``col[5]``, the value ->
    ``col[6]`` and the sheet name -> ``col[3]``.  ``col[7]`` holds the first
    four characters of the last URL path segment and ``col[8]`` the constant
    ``"HCC_SAPF_2015"``.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when a sheet has no such header cell.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    pDate = url.split('/')[-1][:4]

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(url)

    raw_data = {name: [] for name in col}

    for sheet in xd.sheet_names:
        df = xd.parse(sheet)
        logfile.write(str(now.now()) + ' for sheet ' + str(sheet) + '------\n')
        logfile.write(str(now.now()) + ' indicator checking\n')
        print('for sheet ' + str(sheet) + ' ------')
        print('indicator checking------')

        # indicator checking
        n_rows, n_cols = df.shape
        header = None
        for r in range(n_rows):
            for c in range(n_cols):
                words = str(df.iloc[r][c]).split()
                if 'Aged' in words and len(words) == 2:
                    header = (r, c)
                    break
            if header is not None:
                break
        if header is None:
            errfile.write(
                str(now.now()) + " The sheet " + str(sheet)
                + " has not required fields, such as 'Aged 10-14'."
                " Please check the file at: " + str(url) + " . End progress\n"
            )
            logfile.write(str(now.now()) + ' error and end progress\n')
            sys.exit(
                "The sheet " + str(sheet)
                + " has not not required fields, such as 'Aged 10-14'."
                " Please check the file at: " + url
            )
        header_row, first_age = header
        ageReq = df.iloc[header_row][first_age:-1].tolist()
        restartIndex = header_row + 1

        # data reading
        logfile.write(str(now.now()) + ' data reading\n')
        print('data reading------')
        per_row = len(ageReq)
        for r in range(restartIndex, n_rows):
            if not str(df.iloc[r][0]):
                continue
            for label in ageReq:
                raw_data[col[5]].append(label.split()[1])
            raw_data[col[0]].extend([df.iloc[r][0]] * per_row)
            raw_data[col[1]].extend([df.iloc[r][1]] * per_row)
            raw_data[col[2]].extend([df.iloc[r][2]] * per_row)
            raw_data[col[4]].extend([df.iloc[r][3]] * per_row)
            raw_data[col[6]].extend(df.iloc[r][first_age:-1].tolist())
        # One sheet-name entry per (data row, age header) pair.
        raw_data[col[3]].extend([sheet] * per_row * (n_rows - restartIndex))

    total = len(raw_data[col[0]])
    raw_data[col[7]] = [pDate] * total
    raw_data[col[8]] = ["HCC_SAPF_2015"] * total
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract 'All Apprenticeships' figures by year and age band from one sheet.

    A header row is located in which the entries of ``reqFields`` appear (as
    substrings) in as many cells as there are entries.  Below it, a row with
    an ``"All Apprenticeships"`` cell inside each year's column span is
    located.  For every later row whose second cell is non-null and not
    ``"Total"``, and whose value cell is non-null, two records are written per
    year: the found column is stored as ``"Under 19"`` and the next column as
    ``"19-24"``.  Records go to ``col[0]`` (second cell), ``col[1]`` (year),
    ``col[2]`` (age band) and ``col[3]`` (value), then to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when either header cannot be matched.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    yearReq = reqFields

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    df = pd.ExcelFile(socket).parse(sheet)
    n_rows, n_cols = df.shape

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    for row in range(n_rows):
        yearCol = []
        for year in yearReq:
            found = [c for c in range(n_cols) if str(year) in str(df.iloc[row, c])]
            if found:
                restartIndex = row + 1
            yearCol.extend(found)
        if len(yearCol) == len(yearReq):
            break
    if len(yearCol) != len(yearReq):
        requested = str(yearReq).strip('[]')
        errfile.write(
            str(now.now()) + " Requested data " + requested
            + " don't match the excel file. Please check the file at: "
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(
            "Requested data " + requested
            + " don't match the excel file. Please check the file at: " + url
        )

    # Each year's span runs from its column to the next year's column, the
    # last one ending at the sheet edge.
    edges = yearCol + [n_cols]
    for row in range(restartIndex, n_rows):
        kk = []
        for left, right in zip(edges, edges[1:]):
            for c in range(left, right):
                if df.iloc[row, c] == "All Apprenticeships":
                    kk.append(c)
                    restartIndex = row + 1
                    break
        if len(kk) == len(yearReq):
            break
    if len(kk) != len(yearReq):
        requested = str(yearReq).strip('[]')
        errfile.write(
            str(now.now()) + " Requested data " + requested
            + " in the field 'All Apprenticeships' don't match the excel file."
            " Please check the file at: " + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(
            "Requested data " + requested
            + " in the field 'All Apprenticeships' don't match the excel file."
            " Please check the file at: " + url
        )

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    age_bands = ["Under 19", "19-24"]
    for row in range(restartIndex, n_rows):
        for position, k in enumerate(kk):
            if (
                pd.notnull(df.iloc[row, 1])
                and pd.notnull(df.iloc[row, k])
                and df.iloc[row, 1] != "Total"
            ):
                for offset, band in enumerate(age_bands):
                    raw_data[col[0]].append(df.iloc[row, 1])
                    raw_data[col[1]].append(yearReq[position])
                    raw_data[col[2]].append(band)
                    raw_data[col[3]].append(df.iloc[row, k + offset])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Collect 'All Apprenticeships' values per requested year and age band.

    Steps, all on the parsed ``sheet`` of the workbook behind ``url``:

    1. find a row where the ``reqFields`` entries occur (substring match) in
       exactly ``len(reqFields)`` cells; those cells give the year columns;
    2. below it, find a row with an ``"All Apprenticeships"`` cell between
       each year column and the next (the last span ends at the sheet edge);
    3. for each later row, skip it per year when the second cell or the value
       cell is null or the second cell equals ``"Total"``; otherwise emit the
       found column as ``"Under 19"`` and the following column as ``"19-24"``.

    Output columns: ``col[0]`` second cell, ``col[1]`` year, ``col[2]`` age
    band, ``col[3]`` value.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when step 1 or step 2 fails.

    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    yearReq = reqFields
    dName = outPath

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)
    row_count = df.shape[0]
    col_count = df.shape[1]

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    for r in range(row_count):
        yearCol = []
        for wanted in yearReq:
            needle = str(wanted)
            for c in range(col_count):
                if needle in str(df.iloc[r, c]):
                    yearCol.append(c)
                    restartIndex = r + 1
        if len(yearCol) == len(yearReq):
            break

    if len(yearCol) != len(yearReq):
        shown = str(yearReq).strip('[]')
        tail = " don't match the excel file. Please check the file at: "
        errfile.write(
            str(now.now()) + " Requested data " + shown + tail
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit("Requested data " + shown + tail + url)

    # Column spans: [year column, next year column), last span to sheet edge.
    spans = list(zip(yearCol, yearCol[1:] + [col_count]))
    k_asked = "All Apprenticeships"
    for r in range(restartIndex, row_count):
        kk = []
        for first, end in spans:
            c = first
            while c < end:
                if df.iloc[r, c] == k_asked:
                    kk.append(c)
                    restartIndex = r + 1
                    break
                c += 1
        if len(kk) == len(yearReq):
            break

    if len(kk) != len(yearReq):
        shown = str(yearReq).strip('[]')
        tail = (
            " in the field 'All Apprenticeships' don't match the excel file."
            " Please check the file at: "
        )
        errfile.write(
            str(now.now()) + " Requested data " + shown + tail
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit("Requested data " + shown + tail + url)

    raw_data = {}
    for name in col:
        raw_data[name] = []

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for r in range(restartIndex, row_count):
        # kk and yearReq have equal length here, so pair them directly.
        for year, k in zip(yearReq, kk):
            if (
                pd.isnull(df.iloc[r, 1])
                or pd.isnull(df.iloc[r, k])
                or df.iloc[r, 1] == "Total"
            ):
                continue
            shift = 0
            for band in ("Under 19", "19-24"):
                raw_data[col[0]].append(df.iloc[r, 1])
                raw_data[col[1]].append(year)
                raw_data[col[2]].append(band)
                raw_data[col[3]].append(df.iloc[r, k + shift])
                shift += 1
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract a single named column (e.g. 'e1b1a') with year and quarter.

    ``reqFields`` must contain exactly one entry.  The year and quarter come
    from the URL: the text after the last ``'_'`` and before the first
    following ``'.'`` is split into its first four characters (year) and the
    rest, which is converted to an integer and divided by three (quarter).
    The sheet is searched for a row in which exactly one cell equals the
    requested entry.  Every later row whose first index element matches ``E``
    followed by eight digits yields a record: index elements 0 and 1 ->
    ``col[0]`` and ``col[1]``, year -> ``col[2]``, quarter -> ``col[3]``,
    value -> ``col[4]``.  The result is handed to ``dsave.save``.

    Exits through ``sys.exit`` (after writing to ``errfile`` and ``logfile``)
    when the request has the wrong length or the header cannot be found.

    NOTE(review): indexing ``df.index[i]`` with ``[0]`` / ``[1]`` presumes the
    parsed sheet has a multi-level index -- verify against real files.
    NOTE(review): ``logfile`` and ``errfile`` are assumed to be module-level
    objects -- confirm.
    """
    homeReq = reqFields
    if len(homeReq) != 1:
        shown = str(homeReq).strip('[]')
        tail = (
            " don't match the excel file. This code is only for extracting data"
            " from filed 'e1b1a'. Please check the file at: "
        )
        errfile.write(
            str(now.now()) + " Requested data " + shown + tail
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit("Requested data " + shown + tail + url)

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    workbook = pd.ExcelFile(socket)
    df = workbook.parse(sheet)

    # find year and quarter
    iYQ = url.split('_')[-1].split('.')[0]
    iYear = iYQ[:4]
    iQuarter = str(int(int(iYQ[4:]) / 3))

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    n_rows, n_cols = df.shape
    for r in range(n_rows):
        numCol = []
        for wanted in homeReq:
            matches = [c for c in range(n_cols) if df.iloc[r][c] == wanted]
            if matches:
                restartIndex = r + 1
            numCol.extend(matches)
        if len(numCol) == len(homeReq):
            break
    if len(numCol) != len(homeReq):
        shown = str(homeReq).strip('[]')
        tail = " don't match the excel file. Please check the file at: "
        errfile.write(
            str(now.now()) + " Requested data " + shown + tail
            + str(url) + " . End progress\n"
        )
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit("Requested data " + shown + tail + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for r in range(restartIndex, n_rows):
        for k in numCol:
            if not re.match(r'E\d{8}$', str(df.index[r][0])):
                continue
            raw_data[col[0]].append(df.index[r][0])
            raw_data[col[1]].append(df.index[r][1])
            raw_data[col[2]].append(iYear)
            raw_data[col[3]].append(iQuarter)
            raw_data[col[4]].append(df.iloc[r][k])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, outPath, logfile)