def download(url, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Reshape a wide CSV into long form and pass it to ``dsave.save``.

    ``reqFields`` is upper-cased. Its first two items are stored as the year
    and month values; the remaining items name the CSV columns to extract.
    When the third item is ``'ALL'``, every CSV column from the ninth onwards
    is extracted instead. Each extracted column name is split on ``'_'`` and
    its first two pieces are stored next to the column's values.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        reqFields: ``[year, month, field, ...]`` or ``[year, month, 'ALL']``.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names. Indices 0-4 are also read from the CSV;
            5 and 6 receive the two name pieces, 7 the values, 8 the year and
            9 the month.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If a requested column is not present in the CSV.
    """
    reqReq = [x.upper() for x in reqFields]
    dName = outPath

    iYear = reqReq[0]
    iMonth = reqReq[1]
    wantAll = reqReq[2] == 'ALL'

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    raw_data = {name: [] for name in col}

    # operate this csv file
    logfile.write(str(now.now()) + ' csv file loading\n')
    print('csv file loading------')
    df = pd.read_csv(socket, dtype='unicode')
    cList = df.columns.tolist()

    reqs = cList[8:] if wantAll else reqReq[2:]

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')

    # The first five output columns are copied from the CSV once per
    # extracted field, so read them a single time up front.
    idLists = [df.loc[:, col[n]].tolist() for n in range(5)]

    for req in reqs:
        if req not in cList:
            # Same wording for the error file and the exit message; the source
            # is a CSV, so both say "csv file".
            detail = "Requested data " + str(req) + " don't match the csv file. Please check the file at: "
            errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
            logfile.write(str(now.now()) + ' error and end progress\n')
            sys.exit(detail + url)

        valueList = df.loc[:, req].tolist()
        pieces = req.split('_')

        # extend() grows the lists in place instead of copying them per field.
        for n in range(5):
            raw_data[col[n]].extend(idLists[n])
        raw_data[col[5]].extend([pieces[0]] * len(valueList))
        raw_data[col[6]].extend([pieces[1]] * len(valueList))
        raw_data[col[7]].extend(valueList)

    rowCount = len(raw_data[col[0]])
    raw_data[col[8]] = [iYear] * rowCount
    raw_data[col[9]] = [iMonth] * rowCount
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract one value per requested year from an Excel sheet and save it.

    For every requested year string ``k`` the header text searched for is
    ``"19 in " + k[2:]``. The header row is the first row in which that text
    occurs exactly four times for every requested year; the column of the
    fourth occurrence is the one read. Below the header, only rows whose first
    cell looks like ``E`` followed by eight digits are kept.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: Requested year strings.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 receives cell 0 of the row, 1 receives
            cell 2, 2 the requested year and 3 the value.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If no row provides a column for every requested year.
    """
    yearReq = reqFields
    dName = outPath

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    # Bound up front so an empty sheet or an empty request cannot leave them
    # undefined after the loop.
    yearCol = []
    restartIndex = 0
    for i in range(df.shape[0]):
        yearCol = []
        for k in yearReq:
            k_asked = "19 in " + k[2:]
            hits = [j for j in range(df.shape[1]) if df.iloc[i, j] == k_asked]
            if hits:
                restartIndex = i + 1
            if len(hits) == 4:
                yearCol.append(hits[3])

        if len(yearCol) == len(yearReq):
            break

    if len(yearCol) != len(yearReq):
        detail = "Requested data " + str(yearReq).strip(
            '[]') + " don't match the excel file. Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for i in range(restartIndex, df.shape[0]):
        if re.match(r'E\d{8}$', str(df.iloc[i, 0])):
            # One output record per requested year, paired with its column.
            for year, valueCol in zip(yearReq, yearCol):
                raw_data[col[0]].append(df.iloc[i, 0])
                raw_data[col[1]].append(df.iloc[i, 2])
                raw_data[col[2]].append(year)
                raw_data[col[3]].append(df.iloc[i, valueCol])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
# Example #3
# 0
def download(url, outPath, keyCol, digitCheckCol, noDigitRemoveFields, logfile, errfile):
    """Load a CSV and pass the whole table, with all its columns, to ``dsave.save``.

    Args:
        url: Location handed to ``openurl.openurl``.
        outPath: Forwarded to ``dsave.save``.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.
        logfile: Writable object that receives progress lines.
        errfile: Passed to ``openurl.openurl`` together with ``logfile``.
    """
    # open url
    stream = openurl.openurl(url, logfile, errfile)

    # load this csv file; every cell is read as text
    stamp = str(now.now())
    logfile.write(stamp + ' csv file loading\n')
    print('csv file loading------')
    frame = pd.read_csv(stream, dtype='unicode')

    # save csv file
    dsave.save(
        frame,
        frame.columns.tolist(),
        keyCol,
        digitCheckCol,
        noDigitRemoveFields,
        outPath,
        logfile,
    )
def download(url, reqInfo, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Flatten a CSV whose rows come in pairs and pass it to ``dsave.save``.

    The first data row is scanned for the first cell that looks like ``E``
    followed by eight digits; that column marks where the paired values start.
    Rows are then consumed two at a time: the non-empty cells of the first row
    (from that column on) go to ``col[5]`` and those of the following row go
    to ``col[6]``.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        reqInfo: Sequence whose first three items are stored in ``col[2]``,
            ``col[3]`` and ``col[4]`` (named year, month and sex in the code).
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 and 1 receive the first two cells of each
            pair's first row.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If no cell of the first data row matches the code pattern.
    """
    dName = outPath

    iYear = reqInfo[0]
    iMonth = reqInfo[1]
    iSex = reqInfo[2]

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    raw_data = {name: [] for name in col}

    # operate this csv file
    logfile.write(str(now.now()) + ' csv file loading\n')
    print('csv file loading------')
    df = pd.read_csv(socket, dtype='unicode')

    # Locate the first code column. An explicit "not found" marker is needed:
    # a loop index left over from range() can never equal df.shape[1].
    firstCode = None
    for k in range(df.shape[1]):
        if re.match(r'E\d{8}$', str(df.iloc[0, k])):
            firstCode = k
            break

    if firstCode is None:
        detail = "Cannot find ecode in row " + str(2) + ". Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for i in range(0, df.shape[0], 2):
        # NOTE(review): str() of a missing cell is 'nan', which is truthy, so
        # this only skips rows whose first cell is an empty string - confirm.
        if str(df.iloc[i, 0]):
            eList = df.iloc[i, firstCode:].dropna().tolist()
            raw_data[col[5]].extend(eList)
            raw_data[col[6]].extend(df.iloc[i + 1, firstCode:].dropna().tolist())
            raw_data[col[0]].extend([df.iloc[i, 0]] * len(eList))
            raw_data[col[1]].extend([df.iloc[i, 1]] * len(eList))

    rowCount = len(raw_data[col[0]])
    raw_data[col[2]] = [iYear] * rowCount
    raw_data[col[3]] = [iMonth] * rowCount
    raw_data[col[4]] = [iSex] * rowCount
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, sheet, reqFields, outPath, keyCol, digitCheckCol, noDigitRemoveFields):
    """Select the requested columns of one Excel sheet and pass them to ``dsave.save``.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: Column labels to select; also passed on as the column list.
        outPath: Forwarded to ``dsave.save``.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.
    """
    # open url
    stream = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + " excel file loading\n")
    print("excel file loading------")
    workbook = pd.ExcelFile(stream)
    table = workbook.parse(sheet)

    # data reading
    logfile.write(str(now.now()) + " data reading\n")
    print("data reading------")
    selected = table.loc[:, reqFields]

    # save csv file
    dsave.save(
        selected,
        reqFields,
        keyCol,
        digitCheckCol,
        noDigitRemoveFields,
        outPath,
        logfile,
    )
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract the 'Percentage of persistent absentees (4)' column for one school group.

    Exactly one school-group heading must be requested. The sheet is searched
    for the first row in which that heading text occurs (substring match) in
    exactly one cell. Below it, from that column to the end of the sheet, the
    first row containing the exact sub-heading
    ``"Percentage of persistent absentees (4)"`` fixes the value column. Data
    rows are those whose second cell looks like ``E`` followed by eight digits.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: List holding the single heading text to look for.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 receives cell 1 of the row, 1 receives
            cell 3, 2 the year text and 3 the value.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If more or fewer than one heading is requested, or the
            heading or the sub-heading cannot be located.
    """
    schoolReq = reqFields
    requested = str(schoolReq).strip("[]")

    if len(schoolReq) != 1:
        detail = (
            "Requested data "
            + requested
            + " don't match the excel file. This code is only for extracting data from filed 'State-funded primary, secondary and special schools (5)' with 'Percentage of persistent absentees (4)'. Please check the file at: "
        )
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + " error and end progress\n")
        sys.exit(detail + url)

    dName = outPath

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + " excel file loading\n")
    print("excel file loading------")
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)

    # The year is the text before the first comma in the cell at row 2, column 0.
    iYear = (df.iloc[2, 0].split(","))[0]

    # indicator checking
    logfile.write(str(now.now()) + " indicator checking\n")
    print("indicator checking------")
    # Bound up front so an empty sheet cannot leave them undefined.
    numCol = []
    restartIndex = 0
    for i in range(df.shape[0]):
        numCol = []
        for k in schoolReq:
            for j in range(df.shape[1]):
                if str(k) in str(df.iloc[i, j]):
                    numCol.append(j)
                    restartIndex = i + 1

        if len(numCol) == len(schoolReq):
            break

    if len(numCol) != len(schoolReq):
        detail = (
            "Requested data "
            + requested
            + " don't match the excel file. Please check the file at: "
        )
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + " error and end progress\n")
        sys.exit(detail + url)

    # Each heading's columns run up to the next heading, or to the sheet edge.
    bounds = numCol + [df.shape[1]]
    subHeading = "Percentage of persistent absentees (4)"

    # Bound up front: the loop below does not run when the heading sits on the
    # last row of the sheet.
    kk = []
    for i in range(restartIndex, df.shape[0]):
        kk = []
        for lo, hi in zip(bounds, bounds[1:]):
            for j in range(lo, hi):
                if df.iloc[i, j] == subHeading:
                    kk.append(j)
                    restartIndex = i + 1
                    break

        if len(kk) == len(schoolReq):
            break

    if len(kk) != len(schoolReq):
        detail = (
            "Requested data "
            + requested
            + " in the field 'Percentage of persistent absentees (4)' don't match the excel file. Please check the file at: "
        )
        # Record this failure in the error and log files like the other exits.
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + " error and end progress\n")
        sys.exit(detail + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + " data reading\n")
    print("data reading------")
    for i in range(restartIndex, df.shape[0]):
        # The code test does not depend on the value column, so do it once per row.
        if not re.match(r"E\d{8}$", str(df.iloc[i, 1])):
            continue
        for k in kk:
            raw_data[col[0]].append(df.iloc[i, 1])
            raw_data[col[1]].append(df.iloc[i, 3])
            raw_data[col[2]].append(iYear)
            raw_data[col[3]].append(df.iloc[i, k])
    logfile.write(str(now.now()) + " data reading end\n")
    print("data reading end------")

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Flatten every sheet of an Excel workbook into one long table and save it.

    In each sheet the header row is the first row containing a cell made of
    exactly two whitespace-separated words, one of which is ``'Aged'``. The
    cells from that one up to, but excluding, the last column are the age
    headings. Every following row yields one output record per age heading.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Workbook location, opened directly by ``pandas.ExcelFile``. The
            first four characters of its last ``/``-separated part are stored
            in ``col[7]``.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names. 0, 1, 2 and 4 receive the first four cells of
            the row, 3 the sheet name, 5 the second word of the age heading,
            6 the value, 7 the date prefix and 8 the constant
            ``"HCC_SAPF_2015"``.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If a sheet has no ``'Aged ...'`` header cell.
    """
    dName = outPath

    pDate = url.split('/')[-1][:4]

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(url)
    sheets = xd.sheet_names

    raw_data = {name: [] for name in col}

    for sheet in sheets:
        df = xd.parse(sheet)

        logfile.write(str(now.now()) + ' for sheet ' + str(sheet) + '------\n')
        logfile.write(str(now.now()) + ' indicator checking\n')
        print('for sheet ' + str(sheet) + ' ------')
        print('indicator checking------')

        # indicator checking: remember the header position explicitly instead
        # of relying on loop variables surviving the search.
        headerRow = None
        firstAgeCol = None
        for i in range(df.shape[0]):
            for j in range(df.shape[1]):
                words = str(df.iloc[i, j]).split()
                if 'Aged' in words and len(words) == 2:
                    headerRow, firstAgeCol = i, j
                    break

            if headerRow is not None:
                break

        if headerRow is None:
            detail = "The sheet " + str(sheet) + " has not required fields, such as 'Aged 10-14'. Please check the file at: "
            errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
            logfile.write(str(now.now()) + ' error and end progress\n')
            sys.exit(detail + url)

        ageReq = df.iloc[headerRow, firstAgeCol:-1].tolist()
        restartIndex = headerRow + 1
        nAges = len(ageReq)

        # data reading
        logfile.write(str(now.now()) + ' data reading\n')
        print('data reading------')
        for i in range(restartIndex, df.shape[0]):
            # NOTE(review): str() of a missing cell is 'nan', which is truthy,
            # so this only skips rows whose first cell is an empty string.
            if str(df.iloc[i, 0]):
                raw_data[col[5]].extend(k.split()[1] for k in ageReq)

                raw_data[col[0]].extend([df.iloc[i, 0]] * nAges)
                raw_data[col[1]].extend([df.iloc[i, 1]] * nAges)
                raw_data[col[2]].extend([df.iloc[i, 2]] * nAges)
                raw_data[col[4]].extend([df.iloc[i, 3]] * nAges)
                raw_data[col[6]].extend(df.iloc[i, firstAgeCol:-1].tolist())
                # Added per kept row so this column always has the same length
                # as the others, even if a row is skipped.
                raw_data[col[3]].extend([sheet] * nAges)

    rowCount = len(raw_data[col[0]])
    raw_data[col[7]] = [pDate] * rowCount
    raw_data[col[8]] = ["HCC_SAPF_2015"] * rowCount
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract 'All Apprenticeships' figures per requested year and age band.

    The sheet is searched for the first row in which the requested year texts
    occur (substring match) in exactly as many cells as there are requested
    years. Each year's columns run up to the next matched column, or to the
    sheet edge. Below that row, the first row containing the exact text
    ``"All Apprenticeships"`` once inside every year's span fixes one column
    per year. For each data row, that column is stored as ``"Under 19"`` and
    the column to its right as ``"19-24"``.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: Requested year texts.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 receives cell 1 of the row, 1 the
            requested year, 2 the age band label and 3 the value.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If the year row or the 'All Apprenticeships' row cannot be
            located.
    """
    yearReq = reqFields
    dName = outPath
    requested = str(yearReq).strip('[]')

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    # Bound up front so an empty sheet or an empty request cannot leave them
    # undefined after the loop.
    yearCol = []
    restartIndex = 0
    for i in range(df.shape[0]):
        yearCol = []
        for k in yearReq:
            for j in range(df.shape[1]):
                if str(k) in str(df.iloc[i, j]):
                    yearCol.append(j)
                    restartIndex = i + 1

        if len(yearCol) == len(yearReq):
            break

    if len(yearCol) != len(yearReq):
        detail = "Requested data " + requested + " don't match the excel file. Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    # Span boundaries: every year column plus the sheet's right edge.
    bounds = yearCol + [df.shape[1]]
    k_asked = "All Apprenticeships"

    # Bound up front: the loop below does not run when the year row is the
    # last row of the sheet.
    kk = []
    for i in range(restartIndex, df.shape[0]):
        kk = []
        for lo, hi in zip(bounds, bounds[1:]):
            for j in range(lo, hi):
                if df.iloc[i, j] == k_asked:
                    kk.append(j)
                    restartIndex = i + 1
                    break

        if len(kk) == len(yearReq):
            break

    if len(kk) != len(yearReq):
        detail = "Requested data " + requested + " in the field 'All Apprenticeships' don't match the excel file. Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    ageBands = ["Under 19", "19-24"]
    for i in range(restartIndex, df.shape[0]):
        for ii, k in enumerate(kk):
            if (pd.notnull(df.iloc[i, 1])) and (pd.notnull(df.iloc[i, k])) and (df.iloc[i, 1] != "Total"):
                # The band's position is also its column offset from k.
                for ij, jj in enumerate(ageBands):
                    raw_data[col[0]].append(df.iloc[i, 1])
                    raw_data[col[1]].append(yearReq[ii])
                    raw_data[col[2]].append(jj)
                    raw_data[col[3]].append(df.iloc[i, k + ij])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol,
             noDigitRemoveFields):
    """Save 'All Apprenticeships' values for each requested year and age band.

    Step 1 finds the first row where the requested year texts appear, as
    substrings, in exactly ``len(reqFields)`` cells. Step 2 looks below that
    row for the first row holding the exact text ``"All Apprenticeships"``
    once between each matched year column and the next (the last span ends at
    the sheet edge). Step 3 reads every later row whose second cell is present
    and is not ``"Total"``: the located column is recorded under
    ``"Under 19"`` and its right-hand neighbour under ``"19-24"``.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also used in messages.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: Requested year texts.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 receives cell 1 of the row, 1 the
            requested year, 2 the age band label and 3 the value.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If either header row cannot be located.
    """
    yearReq = reqFields
    dName = outPath

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)
    nRows, nCols = df.shape[0], df.shape[1]

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    # Defaults cover the cases where the search loop never runs (empty sheet)
    # or never finds a match (empty request).
    yearCol = []
    restartIndex = 0
    for i in range(nRows):
        yearCol = []
        for k in yearReq:
            wanted = str(k)
            for j in range(nCols):
                if wanted in str(df.iloc[i, j]):
                    yearCol.append(j)
                    restartIndex = i + 1

        if len(yearCol) == len(yearReq):
            break

    if len(yearCol) != len(yearReq):
        errfile.write(
            str(now.now()) + " Requested data " + str(yearReq).strip('[]') +
            " don't match the excel file. Please check the file at: " +
            str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit("Requested data " + str(yearReq).strip('[]') +
                 " don't match the excel file. Please check the file at: " +
                 url)

    # Right edge of each year's span: the next year column, or the sheet edge.
    spanEnds = yearCol[1:] + [nCols]

    # Default covers the case where the year row is the sheet's last row and
    # the loop below therefore never runs.
    kk = []
    for i in range(restartIndex, nRows):
        kk = []
        for start, stop in zip(yearCol, spanEnds):
            for j in range(start, stop):
                if df.iloc[i, j] == "All Apprenticeships":
                    kk.append(j)
                    restartIndex = i + 1
                    break

        if len(kk) == len(yearReq):
            break

    if len(kk) != len(yearReq):
        errfile.write(
            str(now.now()) + " Requested data " + str(yearReq).strip('[]') +
            " in the field 'All Apprenticeships' don't match the excel file. Please check the file at: "
            + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(
            "Requested data " + str(yearReq).strip('[]') +
            " in the field 'All Apprenticeships' don't match the excel file. Please check the file at: "
            + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for i in range(restartIndex, nRows):
        label = df.iloc[i, 1]
        # kk holds one column per requested year, in request order.
        for year, k in zip(yearReq, kk):
            if (pd.notnull(label)) and (pd.notnull(
                    df.iloc[i, k])) and (label != "Total"):
                for offset, band in enumerate(["Under 19", "19-24"]):
                    raw_data[col[0]].append(label)
                    raw_data[col[1]].append(year)
                    raw_data[col[2]].append(band)
                    raw_data[col[3]].append(df.iloc[i, k + offset])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields,
               dName, logfile)
def download(url, sheet, reqFields, outPath, col, keyCol, digitCheckCol, noDigitRemoveFields):
    """Extract one named column from a quarterly Excel sheet and save it.

    Exactly one field must be requested. The header row is the first row with
    exactly one cell equal to that field. The year and quarter come from the
    URL: the text between the last ``'_'`` and the following ``'.'`` is split
    into its first four characters (year) and the rest, which is divided by
    three to give the quarter.

    ``logfile`` and ``errfile`` are not parameters of this function; they are
    looked up as module-level names.

    Args:
        url: Location handed to ``openurl.openurl``; also parsed for the year
            and quarter.
        sheet: Sheet passed to ``ExcelFile.parse``.
        reqFields: List holding the single field name to look for.
        outPath: Forwarded to ``dsave.save``.
        col: Output column names; 0 and 1 receive the first two parts of the
            row's index entry, 2 the year, 3 the quarter and 4 the value.
        keyCol: Forwarded unchanged to ``dsave.save``.
        digitCheckCol: Forwarded unchanged to ``dsave.save``.
        noDigitRemoveFields: Forwarded unchanged to ``dsave.save``.

    Raises:
        SystemExit: If more or fewer than one field is requested, or the field
            cannot be located in the sheet.
    """
    homeReq = reqFields
    requested = str(homeReq).strip('[]')

    if len(homeReq) != 1:
        detail = "Requested data " + requested + " don't match the excel file. This code is only for extracting data from filed 'e1b1a'. Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    dName = outPath

    # open url
    socket = openurl.openurl(url, logfile, errfile)

    # operate this excel file
    logfile.write(str(now.now()) + ' excel file loading\n')
    print('excel file loading------')
    xd = pd.ExcelFile(socket)
    df = xd.parse(sheet)

    # find year and quarter
    iYQ = url.split('_')[-1].split('.')[0]
    iYear = iYQ[:4]
    iQuarter = str(int(int(iYQ[4:]) / 3))

    # indicator checking
    logfile.write(str(now.now()) + ' indicator checking\n')
    print('indicator checking------')
    # Bound up front so an empty sheet cannot leave them undefined.
    numCol = []
    restartIndex = 0
    for i in range(df.shape[0]):
        numCol = []
        for k in homeReq:
            for j in range(df.shape[1]):
                # Two-axis iloc is always positional, whatever the column labels.
                if df.iloc[i, j] == k:
                    numCol.append(j)
                    restartIndex = i + 1

        if len(numCol) == len(homeReq):
            break

    if len(numCol) != len(homeReq):
        detail = "Requested data " + requested + " don't match the excel file. Please check the file at: "
        errfile.write(str(now.now()) + " " + detail + str(url) + " . End progress\n")
        logfile.write(str(now.now()) + ' error and end progress\n')
        sys.exit(detail + url)

    raw_data = {name: [] for name in col}

    # data reading
    logfile.write(str(now.now()) + ' data reading\n')
    print('data reading------')
    for i in range(restartIndex, df.shape[0]):
        # NOTE(review): indexing the index entry with [0] and [1] presumes each
        # entry is a (code, name)-style pair, e.g. a MultiIndex - confirm.
        rowKey = df.index[i]
        # The code test does not depend on the value column, so do it once per row.
        if not re.match(r'E\d{8}$', str(rowKey[0])):
            continue
        for k in numCol:
            raw_data[col[0]].append(rowKey[0])
            raw_data[col[1]].append(rowKey[1])
            raw_data[col[2]].append(iYear)
            raw_data[col[3]].append(iQuarter)
            raw_data[col[4]].append(df.iloc[i, k])
    logfile.write(str(now.now()) + ' data reading end\n')
    print('data reading end------')

    # save csv file
    dsave.save(raw_data, col, keyCol, digitCheckCol, noDigitRemoveFields, dName, logfile)