コード例 #1
0
def load_excel(filename, sheet_name, truncate_sheet):
    startrow = None
    writer = ExcelWriter(filename, engine='openpyxl')

    try:
        # try to open an existing workbook
        writer.book = load_workbook(filename)
        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row

        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)

        # copy existing sheets
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass
    return (startrow, writer)
コード例 #2
0
    def to_excel(self, write_mode='a', highest_score=True):

        out_put_path = self.output_file.with_suffix('.xlsx')

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms,
                                                 highest_score=highest_score)

        df = DataFrame(dict_data_list, columns=columns)

        if write_mode == 'a' and out_put_path.exists():

            writer = ExcelWriter(out_put_path, engine='openpyxl')
            # try to open an existing workbook
            writer.book = load_workbook(out_put_path)
            # copy existing sheets
            writer.sheets = dict(
                (ws.title, ws) for ws in writer.book.worksheets)
            # read existing file
            reader = read_excel(out_put_path)
            # write out the new sheet
            df.to_excel(writer,
                        index=False,
                        header=False,
                        startrow=len(reader) + 1)

            writer.close()
        else:

            df.to_excel(self.output_file.with_suffix('.xlsx'),
                        index=False,
                        engine='openpyxl')

        self.write_settings(self.output_file, self.gcms)
コード例 #3
0
ファイル: 2DHist.py プロジェクト: C09/cjp
def writeExcelData(x,excelfile,sheetname,startrow,startcol):
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    df=DataFrame(x)
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl') 
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    df.to_excel(writer, sheet_name=sheetname,startrow=startrow-1, startcol=startcol-1, header=False, index=False)
    writer.save()
    writer.close()
コード例 #4
0
def appendExcel(data_frame, file_name,sc=0,use_index=False):
	#Define and load excel file
	writefilestr = file_name
	book = load_workbook(writefilestr)
	writer = ExcelWriter(writefilestr, engine='openpyxl')
	writer.book = book
	#Get list of sheets in excel file
	writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
	#Write and save data to excel file
	data_frame.to_excel(writer,'Sheet1',index=use_index,startrow=0, startcol=sc,header=None)
	writer.save()
	writer.close()
コード例 #5
0
def save_cut(cut_df, i, name):
    """Cutting full file into parts according to the lengh"""

    book = load_workbook(template_short_path)
    writer = ExcelWriter(output_path + u'Реестр' + name +
                         u'{} часть_'.format(i + 1) +
                         '{}'.format(cut_df.shape[0]) + u' записей.xlsx',
                         engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    cut_df.to_excel(writer, u'Прил 2', startrow=26, header=False, index=False)
    writer.save()
コード例 #6
0
ファイル: views.py プロジェクト: lukegre/vindta_reCAlk
def write_log_to_excel(log, xls_filename, sheet_name):
    from openpyxl import load_workbook
    from pandas import ExcelWriter, DataFrame

    df = DataFrame(["'" + l for l in log.strip().splitlines()])

    book = load_workbook(xls_filename)
    writer = ExcelWriter(xls_filename, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    df.to_excel(writer, sheet_name, index=False)
    writer.save()
コード例 #7
0
def pandas_dbexcel(viewname, con):

    book = load_workbook(dbtbltype + '.xlsx')
    writer = ExcelWriter(dbtbltype + '.xlsx', engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    df = pd.read_sql_query("SELECT * from " + viewname, con)

    #verify that result of SQL query is stored in the dataframe
    #print(df)

    df.to_excel(writer, sheet_name=viewname, encoding='utf8')

    writer.save()
コード例 #8
0
def pd_html_excel():

    book = load_workbook('tables.xlsx')
    writer = ExcelWriter('tables.xlsx', engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    for x in range(0, len(df)):

        df[x].to_excel(writer, sheet_name="table_" + str(x))

    writer.save()

    file_2.write(
        "Success !!! Please check tables.xlsx for extracted tables from the webpage."
        + '\n' + '\n')
コード例 #9
0
def to_excel_ExcelWriter( df,
                          file,
                          rowNames          = False,
                          masterFile        = '',
                          promptIfLocked    = False,
                          xlsxEngine        = 'openpyxl', #xlsxwriter
                        ):
    writer = ExcelWriter(file, engine=xlsxEngine)
    DataFrame().to_excel(writer, list(df.keys())[0])
   
    #ensure the file is not locked.
    try:
        #create the file even before we try to add any data to ensure it isn't locked
        writer.save()
    except:
        counter = 0
        file_updated = False
        #if the file is locked/RO, sleep to give user time to close it. After 10 failures, wait for user input.
        while( not file_updated ):
            try:
                writer.save()
                file_updated = True
            except:
                print('  >> Waiting on ' + os.path.split(file)[1] + ' for 15 seconds' )
                counter = counter + 1
                if( counter >= 10 ):
                    pdb.set_trace()
                sleep(15)
 
    if(masterFile != ''):
        # book = xlrd.open_workbook(masterFile)
       
        #create an archived copy of the master
        book = openpyxl.load_workbook(masterFile)
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
 
    for sheetName, this_df in df.items():
        if( not len( this_df ) ):
            print( '    ++ No data for sheet ' + sheetName + '. Moving on.')
            continue
 
        this_df.to_excel(writer, sheetName, index = rowNames)
    writer.save()
    return(1)
コード例 #10
0
def save_all(all_df):
    """Concating all into one file"""

    book = load_workbook(template_all_path)
    writer = ExcelWriter(
        output_path +
        u'Реестр_полный_ {} записей.xlsx'.format(all_df.shape[0]),
        engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    try:
        del all_df['uid_first']
        del all_df['uid']
        del all_df['true_index']
    except:
        pass
    all_df.to_excel(writer, u'Прил 2', startrow=26, header=False, index=False)
    writer.save()
コード例 #11
0
def append_df_to_excel(
        df: DataFrame, filename: str, sheet_name='Sheet1',
        **to_excel_kwargs
        ) -> None:
    file_path = path.join(environ['USERPROFILE'], 'Desktop', filename)
    if not path.exists(file_path):
        df.to_excel(
            file_path,
            sheet_name=sheet_name,
            **to_excel_kwargs)
        return None
    writer = ExcelWriter(file_path, engine='openpyxl', mode='a')
    writer.book = load_workbook(file_path)
    start_row = writer.book[sheet_name].max_row
    writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    df.to_excel(writer, sheet_name, startrow=start_row,
                header=False, **to_excel_kwargs)
    writer.save()
    writer.close()
    return None
コード例 #12
0
ファイル: RunMe.py プロジェクト: sunifeb12/MicroArrayPipeline
def ControlTreatedUnique(item, expgroup):
    controlfile = Controls[item][0]
    pathc = 'DEG_Identification/' + item + '/' + controlfile
    for expfile in expgroup:
        print 'Finding Common Probes for File : ' + expfile
        pathe = 'DEG_Identification/' + item + '/' + expfile
        exprobes = pd.read_excel(open(pathe, 'rb'),
                                 sheetname='ABS_CALL',
                                 index=False)['Probename']
        controlprobes = pd.read_excel(open(pathc, 'rb'),
                                      sheetname='ABS_CALL',
                                      index=False)['Probename']
        commonprobes = list(set(exprobes) & set(controlprobes))
        expdata = pd.read_excel(open(pathe, 'rb'),
                                sheetname='ABS_CALL',
                                index=False)
        commondf = expdata[expdata['Probename'].isin(commonprobes)]
        writernw = ExcelWriter(pathe, engine='openpyxl')
        book = load_workbook(pathe)
        writernw.book = book
        writernw.sheets = dict((ws.title, ws) for ws in book.worksheets)
        tme = expfile.split('_')[1].split('.')[0]
        commondf.to_excel(writernw, 'COMMN_' + tme, index=False)
        writernw.save()
コード例 #13
0
def main():
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Final Project/Final_Project_Data_and_Template_Original.xlsx"

    sheetname = "Training Data"
    # df = readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, endcol=17)
    # print df
    # print df[0]
    # print df[-1]

    X = np.array(readExcel(excelfile,
                           sheetname=sheetname,
                           startrow=2,
                           endrow=3169,
                           endcol=20),
                 dtype=np.float64)
    print X

    Xa = np.insert(X, 0, 1., axis=1)
    print Xa

    Tw = np.array(readExcel(excelfile,
                            sheetname=sheetname,
                            startrow=2,
                            endrow=3169,
                            startcol=21,
                            endcol=21),
                  dtype=np.string_)
    print Tw

    T = np.array([1 if Tw[i][0] == "male" else -1 for i in range(len(Tw))],
                 dtype=np.int32)
    print T

    Xapi = np.linalg.pinv(Xa)
    print Xapi

    W = np.dot(Xapi, T)
    print W

    T_validating = np.dot(Xa, W)
    print T_validating
    T_validating = np.array(
        [1 if T_validating[i] > 0 else -1 for i in range(len(T_validating))],
        dtype=np.int32)
    print T_validating

    bccm = np.zeros((2, 2), dtype=np.int32)
    for i in range(len(T)):
        row = 1 if T[i] == 1 else 0
        col = 1 if T_validating[i] == 1 else 0
        # print row, col
        bccm[row, col] += 1
    print bccm

    metrics = [
        np.divide(bccm[0, 0] + bccm[1, 1],
                  bccm[0, 0] + bccm[0, 1] + bccm[1, 0] + bccm[1, 1],
                  dtype=np.float64),
        np.divide(bccm[1, 1], bccm[1, 0] + bccm[1, 1], dtype=np.float64),
        np.divide(bccm[0, 0], bccm[0, 0] + bccm[0, 1], dtype=np.float64),
        np.divide(bccm[1, 1], bccm[0, 1] + bccm[1, 1], dtype=np.float64)
    ]
    print metrics

    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook

    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Final Project/Final_Project_Data_and_Template.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    sheetname = "Classifiers"

    # print W
    df = DataFrame(W)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=4,
                startcol=0,
                header=False,
                index=False)

    sheetname = "Performance"

    # print bccm
    df = DataFrame(bccm)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=9,
                startcol=2,
                header=False,
                index=False)

    # print metrics
    df = DataFrame(metrics)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=7,
                startcol=6,
                header=False,
                index=False)

    writer.save()
    writer.close()
コード例 #14
0
ファイル: ass5.py プロジェクト: cjworld/UCSC-Extension
def main():
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 5/Assignment_5_Data_and_Template_Original.xlsx"

    sheetname = "Data"
    xs = np.array(readExcel(excelfile,
                            sheetname=sheetname,
                            startrow=2,
                            endrow=951,
                            endcol=2),
                  dtype=np.float64)
    # print data
    # print data[0]
    # print data[-1]

    dims = xs.shape
    n = dims[0]
    d = dims[1]
    # print d
    c = 3

    # initial random guesses for parameters
    np.random.seed(0)
    scales = np.array([np.divide(1, c, dtype=np.float64)] * c)
    # print scales
    maxs = np.amax(xs, axis=0)
    # print maxs
    mins = np.amin(xs, axis=0)
    # print mins
    mus = np.zeros((c, d))
    for i in range(c):
        for j in range(d):
            mus[i, j] = (maxs[j] - mins[j]) * np.random.random() + mins[j]
            # print j, maxs[j], mins[j], mus[i, j]
    # print mus
    vs = np.var(xs, axis=0)
    # print vars
    sigmas = np.zeros((c, d, d))
    for i in range(c):
        for j in range(d):
            sigmas[i, j, j] = vs[j]
    # print sigmas

    ps, scales, mus, sigmas = em_gmm_orig(xs,
                                          scales,
                                          mus,
                                          sigmas,
                                          iterations=10000)

    print scales
    amounts = np.round(scales * n)
    print amounts

    print mus
    idx_male = np.argmax(mus[:, 0])
    idx_children = np.argmin(mus[:, 0])
    idx_female = 3 - idx_male - idx_children
    print idx_male
    print idx_female
    print idx_children
    labelnames = ["F", "F", "F"]
    labelnames[idx_male] = "M"
    labelnames[idx_female] = "F"
    labelnames[idx_children] = "C"
    print labelnames

    print ps
    label_indices = np.argmax(ps, axis=1)
    print label_indices
    labels = np.array([labelnames[label_idx] for label_idx in label_indices])
    print labels
    confs = np.array(
        [ps[idx, label_idx] for idx, label_idx in enumerate(label_indices)])
    print confs
    totals = np.array(
        [amounts[idx_male], amounts[idx_female], amounts[idx_children]])
    print totals

    # In the below plots the white dots represent the observed heights.

    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook

    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 5/Assignment_5_Data_and_Template.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    sheetname = "Results"

    # print labels
    df = DataFrame(labels)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=1,
                startcol=0,
                header=False,
                index=False)

    # print confs
    df = DataFrame(confs)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=1,
                startcol=1,
                header=False,
                index=False)

    # print totals
    df = DataFrame(totals)
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=1,
                startcol=5,
                header=False,
                index=False)

    writer.save()
    writer.close()
コード例 #15
0
from tkinter import *
from pandas import DataFrame
from pandas import ExcelWriter
import datetime
import openpyxl
from openpyxl import load_workbook
root = Tk()
root.title("Travel Transport Employee Clock")
root.geometry("600x600")
writer = ExcelWriter(r'test.xlsx', engine='openpyxl')
book = load_workbook(r'test.xlsx')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}
check1 = False
check2 = False
check3 = False
inTime1 = 0
outTime1 = 0
inTime2 = 0
outTime2 = 0
inTime3 = 0
outTime3 = 0
#Functions
def enterval():
    global check1
    global check2
    global check3
    global inTime1
    global outTime1
    global inTime2
    global outTime2
コード例 #16
0
def export_excel(data, metadata, tables, config):

    """

    Export full data and results summary tables to Excel template

    Note on template file:
        Ideally, pivot tables would read full data (i.e. un-rounded) and band with a grouping of 0.5-99.5 at 1dB inc.
        Upon testing, it was found that openpyxl.load_workbook is unable to read grouped fields in pivot tables
        Solution will be to apply banding within Python function if possible
        If not possible, solution will be to include extra columns, rounded to 0 d.p.

    """

    file_out = config["output"][0] + ".xlsm"
    config_out = file_out.replace(".xlsm", "_config.txt")

    # Create summary
    flags = data.filter(regex='^Flag_').columns.to_list()
    s1 = Series(
        index=[
            'Number of Time Samples',
            'Number of Recorded Metrics',
            'Number of User-defined Time Periods',
        ],
        data=[
            len(data),
            len(data.columns) - len(flags),
            len(flags),
        ]
    )

    s2 = Series(
        index=['Time Period ' + str(f + 1) for f in range(len(flags))],
        data=flags
    ).str.replace('Flag_', '')

    s3 = Series(
        index=[
            '',
            'Monitor Metadata:'
        ],
        data=[
            None,
            None
        ]
    )

    summary = concat([s1, s2, s3, metadata])

    # Re-order columns
    cols = ['Address', 'Duration']
    cols += data.filter(regex='_Main$').columns.to_list()
    cols += data.filter(regex='Hz$').columns.to_list()
    cols += flags
    for c in data.columns:
        if c not in cols:
            cols += [c]
    data_out = data[cols].copy()

    # Replace flags' start times with booleans
    for f in flags:
        data_out_tmp = data_out[f].dropna()
        if data_out_tmp.empty:
            data_out[f] = False
        else:
            t_start = data_out_tmp[0]
            data_out[f] = data_out[f].replace({t_start: True, None: False})

    # Collate tables

    for i, t in enumerate(tables):
        if i not in list(range(12))[0::4]:
            t.index = len(t) * ['']

    table_main = concat([
        tables[0],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[1],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[2],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[3]
    ])

    table_leq_spec = concat([
        tables[4],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[5],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[6],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[7]
    ])

    table_lmax_spec = concat([
        tables[8],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[9],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[10],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[11],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[12]
    ])

    # Read template
    file_template = "OutputExcelTemplate_v09_20200922.xlsm"
    wb = load_workbook(file_template, keep_vba=True)

    # Write full data
    ExcelFormatter.header_style = None
    writer = ExcelWriter(
        file_out,
        engine='openpyxl',
        datetime_format='dd/mm/yyyy hh:mm:ss',
        date_format='dd/mm/yyyy'
    )
    writer.book = wb
    writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)

    print("Exporting to Excel...")

    # Write data
    summary.to_excel(writer, "Summary", header=False)
    data_out.to_excel(writer, "Full_Data")
    table_main.to_excel(writer, "Summary_Tables")
    table_leq_spec.to_excel(writer, "Leq_Spectral_Tables")
    table_lmax_spec.to_excel(writer, "Lmax_Spectral_Tables")

    return writer, config_out
コード例 #17
0
os.system('chmod -R 777 input')

#Converting the splitted PDF into list
pdflist = sorted(os.listdir('input'))

#Create Excel Workbook
workbook = xlsxwriter.Workbook('tables.xlsx')
worksheet = workbook.add_worksheet()
workbook.close()
os.system('chmod -R 777 tables.xlsx')

#Load excel Workbook using openpyxl
book = load_workbook('tables.xlsx')
writer = ExcelWriter('tables.xlsx', engine='openpyxl') 	
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

#Iterating through the PDF list
for i in range(0,len(pdflist)):

    pdfn = pdflist[i].replace("pg_","").replace(".pdf","")

    #extracting table from PDF
    tables = camelot.read_pdf(os.path.join('input',pdflist[i]))

    #check if pdf contains table to be extracted
    if "1" in str(tables):
        print ("table found in " + pdflist[i])
        #converting a pdf into a DataFrame
        tables[0].df.to_excel(writer,sheet_name="table_" + str(pdfn),index=False,header=False)
	
コード例 #18
0
def summarize_reg(gene_set, n_data_matrix):
    """
	The SUMMARIZE_REG operation summarizes all the data analysis results, by collecting them in convenient tables that exported locally in Excel files.

	:param gene_set: the set of genes of interest to summarize
	:param n_data_matrix: number identifying the data matrix to summarize (only 2,3 and 5 values are permitted)
	
	Example::
	
		import genereg as gr
		gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=2)
		gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=3)
		gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=5)		
	"""

    # Check input parameters
    if n_data_matrix not in [2, 3, 5]:
        raise ValueError('Data Matrix ERROR! Possible values: {2,3,5}')

    # Define the model to summarize
    model = str(n_data_matrix)

    # Define the previous model to check
    if model == '3':
        previous_model = str(int(model) - 1)
    elif model == '5':
        previous_model = str(int(model) - 2)

    # Import the dictionary of genes of interest with their candidate regulatory genes
    dict_RegulGenes = pickle.load(
        open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Import the list of genes of interest and extract in a list the Gene Symbols of all the genes belonging to the current gene set
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    SYMs_current_pathway = []
    for index, row in EntrezConversion_df.iterrows():
        sym = row['GENE_SYMBOL']
        path = row['GENE_SET']
        if path == gene_set:
            SYMs_current_pathway.append(sym)

    if (model == '3') or (model == '5'):
        # Create a list containing the Gene Symbols of the regulatory genes of the genes in the current gene set
        current_regulatory_genes = []
        for key, value in dict_RegulGenes.items():
            if key in SYMs_current_pathway:
                for gene in value:
                    if gene not in current_regulatory_genes:
                        current_regulatory_genes.append(gene)

    if (model == '5'):
        # Create a list containing the Gene Symbols of genes in the other gene sets
        SYMs_other_pathways = []
        for index, row in EntrezConversion_df.iterrows():
            sym = row['GENE_SYMBOL']
            path = row['GENE_SET']
            if not (path == gene_set):
                SYMs_other_pathways.append(sym)

        # Create a list containing the Gene Symbols of the regulatory genes of the genes in the other gene sets
        regulatory_genes_other = []
        for key, value in dict_RegulGenes.items():
            if key not in SYMs_current_pathway:
                for gene in value:
                    if gene not in regulatory_genes_other:
                        regulatory_genes_other.append(gene)

    # Create a dataframe to store final summary results of feature selection and linear regression for each gene of interest
    if model == '2':
        lr_summary_df = pd.DataFrame(index=SYMs_current_pathway,
                                     columns=[
                                         'Inital N° Features',
                                         'Discarded Features',
                                         'N° Features Selected', 'R2', 'Adj.R2'
                                     ])
    else:
        lr_summary_df = pd.DataFrame(
            index=SYMs_current_pathway,
            columns=[
                'Inital N° Features', 'N° New Features w.r.t. Previous Model',
                'Discarded Features', 'Features Available for Selection',
                'N° Features Selected', 'R2', 'Adj.R2'
            ])

    for current_gene in SYMs_current_pathway:

        # Import the current and, if present, the previous model of the current gene
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          current_gene,
                                          'ENTREZ_GENE_ID'].iloc[0]
        model_gene_df = pd.read_excel(
            './4_Data_Matrix_Construction/Model' + model + '/Gene_' + gene_ID +
            '_[' + current_gene + ']' + '_(' + gene_set + ')-Model_v' + model +
            '.xlsx',
            sheetname='Sheet1',
            header=0)
        if not (model == '2'):
            previous_model_df = pd.read_excel(
                './4_Data_Matrix_Construction/Model' + previous_model +
                '/Gene_' + gene_ID + '_[' + current_gene + ']' + '_(' +
                gene_set + ')-Model_v' + previous_model + '.xlsx',
                sheetname='Sheet1',
                header=0)

        # Extract the list of new features, added to the current model, w.r.t. the previous one
        if not (model == '2'):
            current_model_col_names = set(list(model_gene_df.columns.values))
            previous_model_col_names = set(
                list(previous_model_df.columns.values))
            new_features = list(current_model_col_names -
                                previous_model_col_names)
            lr_summary_df.set_value(current_gene,
                                    'N° New Features w.r.t. Previous Model',
                                    len(new_features))

        # Import the feature selection and linear regression summary tables
        feature_sel_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                       '/FeatureSelection/M' + model +
                                       '/Feature_Selection_SUMMARY.xlsx',
                                       sheetname='Sheet1',
                                       header=0)
        lin_reg_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                   '/LinearRegression/M' + model +
                                   '/Linear_Regression_R2_SCORES.xlsx',
                                   sheetname='Sheet1',
                                   header=0)

        # Extract and store the results in the summary dataframe
        n_features = feature_sel_df.get_value(current_gene,
                                              'TOT Inital N° Features')
        n_feat_discarded = feature_sel_df.get_value(current_gene,
                                                    'Discarded Features')
        if not (model == '2'):
            n_features_available = feature_sel_df.get_value(
                current_gene, 'Features Available for Selection')
        n_feat_selected = feature_sel_df.get_value(current_gene,
                                                   'N° Features Selected')
        lin_reg_r2_adj = lin_reg_df.get_value(current_gene, 'Adj.R2')
        lin_reg_r2 = lin_reg_df.get_value(current_gene, 'R2')
        lr_summary_df.set_value(current_gene, 'Inital N° Features', n_features)
        lr_summary_df.set_value(current_gene, 'Discarded Features',
                                n_feat_discarded)
        if not (model == '2'):
            lr_summary_df.set_value(current_gene,
                                    'Features Available for Selection',
                                    n_features_available)
        lr_summary_df.set_value(current_gene, 'N° Features Selected',
                                n_feat_selected)
        lr_summary_df.set_value(current_gene, 'Adj.R2', lin_reg_r2_adj)
        lr_summary_df.set_value(current_gene, 'R2', lin_reg_r2)

    # Export the summary dataframe in an Excel file
    lr_summary_df = lr_summary_df.sort_values(by=['Adj.R2'], ascending=[False])
    filename = './5_Data_Analysis/' + gene_set + '/Feature_Selection_and_Linear_Regression.xlsx'
    writer = ExcelWriter(filename, engine='openpyxl')
    try:
        writer.book = load_workbook(filename)
        writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    except IOError:
        # if the file does not exist yet, I will create it
        pass
    lr_summary_df.to_excel(writer, 'M' + model)
    writer.save()

    # Extract relevant features for each gene of the current gene set and store them in a summary table and define a dataframe to summarize the features selected for each model gene
    features_summary_df = pd.DataFrame(index=SYMs_current_pathway)

    for current_gene in SYMs_current_pathway:

        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          current_gene,
                                          'ENTREZ_GENE_ID'].iloc[0]

        # Import the regression coefficients
        coeff_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/Coefficients/Coefficients_(M' + model + ')-Gene_' + gene_ID +
            '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)

        # Import the confidence intervals
        ci_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/ConfidenceIntervals/Confidence_Intervals_(M' + model +
            ')-Gene_' + gene_ID + '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)

        # Import the correlation matrix
        corr_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/CorrelationMatrix/Correlation_Matrix_(M' + model + ')-Gene_' +
            gene_ID + '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)

        # Select the relevant features on the basis of the confidence intervals (i.e. if the confidence interval does not contain 0, then the feature is significant for the model)
        relevant_features = []
        for index, row in ci_df.iterrows():
            s = row['Significant Feature?']
            if s == 'YES':
                relevant_features.append(index)

        # Create a dataframe to store the results and fill it with requested information
        relevant_features_df = pd.DataFrame(
            index=relevant_features,
            columns=[
                'Regression Coefficient', 'Feature Description',
                'Correlation with EXPRESSION (' + current_gene + ')'
            ])
        for index, row in coeff_df.iterrows():
            gene = row['feature']
            if gene in relevant_features:
                coeff = row['coefficient']
                relevant_features_df.set_value(gene, 'Regression Coefficient',
                                               coeff)
        for index, row in corr_df.iterrows():
            if index in relevant_features:
                corr_with_target = row['EXPRESSION (' + current_gene + ')']
                relevant_features_df.set_value(
                    index,
                    'Correlation with EXPRESSION (' + current_gene + ')',
                    corr_with_target)

        # Add the features descriptions
        if model == '2':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)

        elif model == '3':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif not (f in dict_RegulGenes[current_gene]) and (
                        f in current_regulatory_genes):
                    descr = 'Candidate regulatory gene of the genes in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)

        elif model == '5':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif not (f in dict_RegulGenes[current_gene]) and (
                        f in current_regulatory_genes):
                    descr = 'Candidate regulatory gene of the genes in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in SYMs_other_pathways:
                    df_temp = EntrezConversion_df.loc[
                        EntrezConversion_df['GENE_SYMBOL'] == f].copy()
                    f_pathways = (df_temp.GENE_SET.unique()).tolist()
                    descr = 'Gene of the gene sets: ' + (', '.join(f_pathways))
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in regulatory_genes_other:
                    regulated_genes_other = []
                    for key, value in dict_RegulGenes.items():
                        if key in SYMs_other_pathways:
                            if f in value:
                                regulated_genes_other.append(key)
                    df_temp = EntrezConversion_df.loc[EntrezConversion_df[
                        'GENE_SYMBOL'].isin(regulated_genes_other)].copy()
                    f_pathways = (df_temp.GENE_SET.unique()).tolist()
                    descr = 'Candidate regulatory gene of the gene sets: ' + (
                        ', '.join(f_pathways))
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)

        # Export the dataframe in an Excel file
        relevant_features_df = relevant_features_df.sort_values(
            by=['Regression Coefficient'], ascending=[False])
        filename = './5_Data_Analysis/' + gene_set + '/Relevant_Features-Gene_' + gene_ID + '_[' + current_gene + '].xlsx'
        writer = ExcelWriter(filename, engine='openpyxl')
        try:
            writer.book = load_workbook(filename)
            writer.sheets = dict(
                (ws.title, ws) for ws in writer.book.worksheets)
        except IOError:
            # if the file does not exist yet, I will create it
            pass
        relevant_features_df.to_excel(writer, 'M' + model)
        writer.save()

        relevance_order = 0
        for index, row in relevant_features_df.iterrows():
            relevance_order = relevance_order + 1
            str_order = str(relevance_order)
            features_summary_df.set_value(current_gene, index, str_order)

    # Export the summary dataframe in an Excel file
    filename = './5_Data_Analysis/' + gene_set + '/Order_of_Features_Selected.xlsx'
    writer = ExcelWriter(filename, engine='openpyxl')
    try:
        writer.book = load_workbook(filename)
        writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    except IOError:
        # if the file does not exist yet, I will create it
        pass
    features_summary_df.to_excel(writer, 'M' + model)
    writer.save()
コード例 #19
0
def best_genes(gene_set):
    """
	The BEST_GENES operation collects the target genes with the best linear fit (Adjusted R2 >= 0.6) in the three regression models, storing them locally in a single Excel file.

	:param gene_set: the set of genes of interest to summarize
	
	Example::
	
		import genereg as gr
		gr.SummaryResults.best_genes(gene_set='DNA_REPAIR')		
	"""

    # Define the models to summarize
    models = ['2', '3', '5']

    # Import the list of genes of interest and extract in a list the Gene Symbols of all the genes belonging to the current gene set
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    SYMs_current_pathway = []
    for index, row in EntrezConversion_df.iterrows():
        sym = row['GENE_SYMBOL']
        path = row['GENE_SET']
        if path == gene_set:
            SYMs_current_pathway.append(sym)

    for model in models:

        # Import the summary table cointaining the value of the R2 for each model and for each gene of interest in the current gene set
        # and extract the list of "good" genes, the ones that have R2 >= 0.6 in the current model
        summary_r2_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                      '/R2_and_Adj.R2_Scores.xlsx',
                                      sheetname='Sheet1',
                                      header=0)
        summary_r2_df = summary_r2_df.sort_values(
            by=['Adj.R2 (M' + model + ')'], ascending=[False])

        good_genes = []
        for index, row in summary_r2_df.iterrows():
            current_model_r2 = row['Adj.R2 (M' + model + ')']
            if current_model_r2 >= 0.6:
                good_genes.append(index)

        # Create a dataframe to the store the results, indexed by the "good" genes and a progressive number for each significant feature extracted during the regression process
        num_features = []
        for i in list(range(1, 20)):
            num_features.append(i)

        # Cartesian product to generate tuples for multi-indexing
        import itertools
        tuples = []
        for i in itertools.product(good_genes, num_features):
            tuples.append(i)

        # Set the multiple indexes to be used in the dataframe
        index = pd.MultiIndex.from_tuples(tuples, names=['GENE', '#'])

        # Create the dataframe and initialize the empty cells as empty strings
        final_summary_df = pd.DataFrame('',
                                        index=index,
                                        columns=[
                                            'Significant Feature', 'Adj.R2',
                                            'Regression Coefficient',
                                            'Feature Description'
                                        ])

        # Fill the dictionary

        # Adjusted R2
        for current_gene in good_genes:
            r2 = summary_r2_df.get_value(current_gene,
                                         'Adj.R2 (M' + model + ')')
            final_summary_df.loc[(current_gene, 1), 'Adj.R2'] = r2

        # Features
        for current_gene in good_genes:

            # Import the table containing the significant features extracted for the current gene
            gene_ID = EntrezConversion_df.loc[
                EntrezConversion_df['GENE_SYMBOL'] == current_gene,
                'ENTREZ_GENE_ID'].iloc[0]
            features_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                        '/Relevant_Features-Gene_' + gene_ID +
                                        '_[' + current_gene + '].xlsx',
                                        sheetname='M' + model,
                                        header=0)

            feature_counter = 1
            for index, row in features_df.iterrows():
                coeff = row['Regression Coefficient']
                descr = row['Feature Description']
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Significant Feature'] = index
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Regression Coefficient'] = coeff
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Feature Description'] = descr
                feature_counter = feature_counter + 1

        # Remove the empty rows in the dataframe
        for index, row in final_summary_df.iterrows():
            feat = row['Significant Feature']
            coeff = row['Regression Coefficient']
            descr = row['Feature Description']
            if (feat == '') & (coeff == '') & (descr == ''):
                final_summary_df.drop(index, inplace=True)

        # Export the summary dataframe in an Excel file
        filename = './5_Data_Analysis/' + gene_set + '/Best_Genes.xlsx'
        writer = ExcelWriter(filename, engine='openpyxl')
        try:
            writer.book = load_workbook(filename)
            writer.sheets = dict(
                (ws.title, ws) for ws in writer.book.worksheets)
        except IOError:
            # if the file does not exist yet, I will create it
            pass
        final_summary_df.to_excel(writer, 'M' + model)
        writer.save()
コード例 #20
0
    def append_df_to_excel(self,
                           df,
                           sheet_name='Sheet1',
                           startrow=None,
                           truncate_sheet=False,
                           **to_excel_kwargs):
        '''
        Append a DataFrame [df] to existing Excel file [filename]
        into [sheet_name] Sheet.
        If [filename] doesn't exist, then this function will create it.

        Parameters:
          filename : File path or existing ExcelWriter
                     (Example: '/path/to/file.xlsx')
          df : dataframe to save to workbook
          sheet_name : Name of sheet which will contain DataFrame.
                       (default: 'Sheet1')
          startrow : upper left cell row to dump data frame.
                     Per default (startrow=None) calculate the last row
                     in the existing DF and write to the next row...
          truncate_sheet : truncate (remove and recreate) [sheet_name]
                           before writing DataFrame to Excel file
          to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                            [can be dictionary]

        Returns: None
        '''
        from openpyxl import load_workbook

        # ignore [engine] parameter if it was passed
        if 'engine' in to_excel_kwargs:
            to_excel_kwargs.pop('engine')

        #writer = pd.ExcelWriter(self.file_name, engine='openpyxl')
        writer = ExcelWriter(self.file_name, engine='openpyxl')

        try:
            # try to open an existing workbook
            writer.book = load_workbook(self.file_name)

            # get the last row in the existing Excel sheet
            # if it was not specified explicitly
            if startrow is None and sheet_name in writer.book.sheetnames:
                startrow = writer.book[sheet_name].max_row

            # truncate sheet
            if truncate_sheet and sheet_name in writer.book.sheetnames:
                # index of [sheet_name] sheet
                idx = writer.book.sheetnames.index(sheet_name)
                # remove [sheet_name]
                writer.book.remove(writer.book.worksheets[idx])
                # create an empty sheet [sheet_name] using old index
                writer.book.create_sheet(sheet_name, idx)

            # copy existing sheets
            writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
        except FileNotFoundError:
            # file does not exist yet, we will create it
            pass

        if startrow is None:
            startrow = 0

        # write out the new sheet
        df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)

        # save the workbook
        writer.save()
コード例 #21
0
def CreateFileExcel(pathin,pathout):
    #filename = "Config_Setting.csv"

    pathconf = PathSteel(dir_path =pathin,
                     FileName = "Config_Setting.csv")\
                    .refpath()

    #fullpath = os.path.join(pathin,filename)

    # get path store data to handling
    Right_Genneral_All_path = PathFromFileNameAndDirpath(dir_path =pathin,
                                                         filename ="Right_Genneral_All.csv"
                                                         )
                                    
    Left_Genneral_All_path = PathFromFileNameAndDirpath(dir_path =pathin,
                                                         filename ="Left_Genneral_All.csv"
                                                         )

    # key value 
    keyvalue = ["ValueGeneral", 
                "Columnmove",
                "LocationCellForMoveColumn",
                "GenneralColumnNotChange",
                "GeneralConcernRaffter",
                "Genneral_Select",
                "LocationOfRowLeft",
                "LocationOfRowRight",
                "ExcelCellForMoveColumnRight",
                "LocationOfPurlin",
                "startrow",
                ]
                
    #location in conf
    locvalue = [12,16,17,19,20,21,23,24,25,26,30]

    credict_c = credict(KeyValues = keyvalue, 
                        LocConf = locvalue,
                        Config_Setting_Path = pathconf)
    credict_list = credict_c.Dictfromkeyandvalueconf()

    # create arr from keyvalue  
    valgen = credict_list.get("ValueGeneral", "")
    colmv = credict_list.get("Columnmove", "")
    locmvcol = credict_list.get("LocationCellForMoveColumn", "")
    gencolnotchg = credict_list.get("GenneralColumnNotChange", "")
    genconraf = credict_list.get("GeneralConcernRaffter", "")
    gensel = credict_list.get("Genneral_Select", "")
    locrowleft = credict_list.get("LocationOfRowLeft", "")
    locrowright = credict_list.get("LocationOfRowRight", "")
    excemvcolright = credict_list.get("ExcelCellForMoveColumnRight", "")
    locpur = credict_list.get("LocationOfPurlin", "")
    strow = credict_list.get("startrow", "")

    # data template  
    if IsRunningInPyinstallerBundle():
        #NameFile = ExtractFileNameFromPath(DataExcel)
        DataExcel = resource_path_is_from_pyinstall_and_dev(FileName = 'DataALL - Template.xlsx',
                                                         Subfolder="Data",
                                                         Is_Directory_Path_To_SubFolder= True,
                                                         dir_path=sys._MEIPASS)

    else:
        # get file excel from template excel (full path)
        DataExcel = PathSteel(modulename =templatexc,
                             FileName ='DataALL - Template.xlsx')\
                            .getpathmodule()
            
        """
        DataExcel = os.path.join((os.path.dirname(templatexc.__file__)),
                                'DataALL - Template.xlsx') 
        """

    book = load_workbook(DataExcel)
    writer = ExcelWriter(DataExcel,
                        engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    for path in [Left_Genneral_All_path,
                Right_Genneral_All_path]:
            
            df1 = pd.read_csv(path, delimiter=',',
                                    index_col = 0)
            dfCount = df1.shape
            df1.to_csv(path)
            #write Left to Excel 
            dfValueGeneral = pd.read_csv(path, 
                                        delimiter=',',
                                        usecols  = valgen,
                                        nrows= 1)

            dfValueGeneral.to_excel(writer,'General Member',
                                    index=False,header=True ,
                                    startcol=0,startrow \
                                    = strow[0])

            worksheet = writer.sheets['General Member']

            # create frame to excel 
            excellframe = toexcel(worksheet = worksheet,
                                            path = path,
                                            path_conf =pathconf,
                                            lpath=Left_Genneral_All_path,
                                            rpath = Right_Genneral_All_path)

            if path == Left_Genneral_All_path:
                usecolsArr = locrowleft
                LocationMoveColumn = locmvcol
                #write path to excel 
                excellframe.wrivaltoexc()
            else:
                usecolsArr = locrowright
                LocationMoveColumn = excemvcolright
                #write path to excel 
                excellframe.wrivaltoexc()
            #Write genneral to excel
            DfChangegenneral = pd.read_csv(path,
                                         delimiter=',',
                                         usecols = gencolnotchg,
                                         nrows= 1 
                                         )

            DfChangegenneral.to_excel(writer,
                                    'General Member',
                                    index=False,
                                    header=True,
                                    startcol=0,
                                    startrow= int(usecolsArr[1])
                                    )

            #write Genneral Concern Raffter
            DfChangegenneral = pd.read_csv(path,
                                             delimiter=',',
                                             usecols = genconraf 
                                             )

            DfChangegenneral.to_excel(writer,
                                    'General Member',
                                    index=False,
                                    header=True ,
                                    startcol=0,
                                    startrow=int(usecolsArr[2])
                                    )

            #write genneral selected 
            DfChangegenneral = pd.read_csv(path,
                                     delimiter=',',
                                     usecols = gensel,
                                     nrows= 1)
            DfChangegenneral.to_excel(writer,
                                    'General Member',
                                    index=False,
                                    header=True ,
                                    startcol=0,
                                    startrow= int(usecolsArr[0])
                                    )

            #write purlin roof 
            DfChangegenneral = pd.read_csv(path,
                                         delimiter=',',
                                         usecols = locpur,
                                         nrows= 1)

            DfChangegenneral.to_excel(writer,
                                    'General Member',
                                    index=False,
                                    header=True ,
                                    startcol=0,
                                    startrow= int(usecolsArr[3]))
            worksheet = writer.sheets['General Member']

            #Wirte move Column to excel
            excellframe.writemovecol(LocationMoveColumn,
                                                colmv)
    # create full path from dirpath 
    path = PathSteel(dir_path =pathout,
                     FileName ='new_big_file.xlsx')\
                    .refpath()
    #path = os.path.join(pathout,'new_big_file.xlsx') 
    book.save(path) 
コード例 #22
0
ファイル: helper.py プロジェクト: SRchary/flask
def generate_ferc_intr_excel(mongo, start_date, end_date, dataframes):

    source_file = conf.FERC_TEMPLATE_FILE
    book = openpyxl.load_workbook(source_file)

    start_row_index = 23
    sheet_name = conf.FERC_INTR_FILE_NAME
    sheet_name = "{}_{}.xlsx".format(sheet_name, end_date.year)
    file_path = os.path.join(conf.GRC_FILE_DIRS[0], sheet_name)
    writer = ExcelWriter(file_path,
                         engine='openpyxl',
                         options={'strings_to_numbers': True})
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    oh_conductor_df = dataframes.get("oh_conductor_df")
    ug_conductor_df = dataframes.get("ug_conductor_df")

    oh_kw_df = dataframes.get("oh_kw_df")
    ug_kw_df = dataframes.get("ug_kw_df")

    conductor_df_columns = [
        "", "TLINE_NM", "DESIGNATION_B", "RATEDKV", "NOMINAL_VOLTAGE",
        "STRUC_TYPE", "CIRCUIT_MI", "", "NUMBER_CIR", "CONDUCTOR SEGMENT"
    ]

    oh_conductor_df[""] = ""
    oh_conductor_df = oh_conductor_df.apply(lambda x: set_designation(x),
                                            axis=1)
    print(oh_conductor_df)
    oh_conductor_df_new = oh_conductor_df[conductor_df_columns]
    oh_conductor_df_new.to_excel(writer,
                                 sheet_name="Page 422-423",
                                 startrow=start_row_index,
                                 startcol=0,
                                 header=False,
                                 index=False)

    start_row_index = start_row_index + oh_conductor_df.shape[0] + 1
    ug_conductor_df[""] = ""
    ug_conductor_df = ug_conductor_df.apply(lambda x: set_designation(x),
                                            axis=1)
    ug_conductor_df_new = ug_conductor_df[conductor_df_columns]
    ug_conductor_df_new.to_excel(writer,
                                 sheet_name="Page 422-423",
                                 startrow=start_row_index,
                                 startcol=0,
                                 header=False,
                                 index=False)
    start_row_index = start_row_index + ug_conductor_df.shape[0] + 5
    oh_kw_df[""] = ""
    oh_kw_df = oh_kw_df[["", "", "", "RATEDKV", "", "", "SUM_Mi"]]
    oh_kw_df["RATEDKV"] = oh_kw_df["RATEDKV"].apply(lambda x: str(x) + "KV"
                                                    if x else "")
    oh_kw_df["SUM_Mi"] = oh_kw_df["SUM_Mi"].apply(lambda x: "{:.2f}".format(x)
                                                  if x else x)
    oh_kw_df.to_excel(writer,
                      sheet_name="Page 422-423",
                      startrow=start_row_index,
                      startcol=0,
                      header=False,
                      index=False)
    old_start_row_index = start_row_index

    start_row_index = start_row_index + oh_kw_df.shape[0] + 2
    ug_kw_df[""] = ""
    ug_kw_df["RATEDKV"] = ug_kw_df["RATEDKV"].apply(lambda x: str(x) + "KV"
                                                    if x else "")
    ug_kw_df["SUM_Mi"] = ug_kw_df["SUM_Mi"].apply(lambda x: "{:.2f}".format(x)
                                                  if x else x)
    ug_kw_df = ug_kw_df[["", "", "", "RATEDKV", "", "", "SUM_Mi"]]
    ug_kw_df.to_excel(writer,
                      sheet_name="Page 422-423",
                      startrow=start_row_index,
                      startcol=0,
                      header=False,
                      index=False)

    max_rows = start_row_index + ug_kw_df.shape[0] + 2
    formatter.format_ferc_report(writer.sheets["Page 422-423"],
                                 old_start_row_index, start_row_index,
                                 max_rows, end_date.year)
    writer.save()
    for ffname in conf.GRC_FILE_DIRS[1:]:
        dest_path = os.path.join(ffname, sheet_name)
        shutil.copy(file_path, dest_path)
    return sheet_name
コード例 #23
0
def main():
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 4/Assignment_4_Data_and_Template_Original.xlsx"

    sheetname = "Training Data"
    # df = readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, endcol=17)
    # print df
    # print df[0]
    # print df[-1]

    X = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, endcol=15), dtype=np.int32)
    # print X

    Xa = np.insert(X, 0, 1, axis=1)
    # print Xa

    T = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, startcol=16, endcol=16), dtype=np.int32)
    # print T

    T2n = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, startcol=17, endcol=17), dtype=np.int32)
    # print T2n

    mapping = [
        [1, -1, -1, -1, -1, -1],
        [-1, 1, -1, -1, -1, -1],
        [-1, -1, 1, -1, -1, -1],
        [-1, -1, -1, 1, -1, -1],
        [-1, -1, -1, -1, 1, -1],
        [-1, -1, -1, -1, -1, 1]
    ]
    T2 = np.array([mapping[row[0]] for row in T2n], dtype=np.int32)
    # print T2

    Xapi = np.linalg.pinv(Xa)
    # print Xapi

    W = np.dot(Xapi, T)
    # print W

    W2 = np.dot(Xapi, T2)
    # print W2

    sheetname = "To be classified"
    Xt = np.array(readExcel(excelfile, sheetname=sheetname, startrow=5, endrow=54, endcol=15), dtype=np.int32)
    # print Xt
    # print Xt[0]

    Xta = np.insert(Xt, 0, 1, axis=1)
    # print Xta

    T_testing = np.dot(Xta, W)
    # print T_testing
    T_testing = np.array([1 if T_testing[i][0] > 0 else -1 for i in range(len(T_testing))], dtype=np.int32)
    # print T_testing

    T2_testing = np.dot(Xta, W2)
    # print T2_testing
    T2_testing = np.argmax(T2_testing, axis=1)
    # print T2_testing

    T_validating = np.dot(Xa, W)
    # print T_validating
    T_validating = np.array([1 if T_validating[i][0] > 0 else -1 for i in range(len(T_validating))], dtype=np.int32)
    # print T_validating

    T2_validating = np.dot(Xa, W2)
    # print T2_validating
    T2_validating = np.argmax(T2_validating, axis=1)
    # print T2_validating

    bccm = np.zeros((2, 2), dtype=np.int32)
    for i in range(len(T)):
        row = 1 if T[i, 0] == 1 else 0
        col = 1 if T_validating[i] == 1 else 0
        # print row, col
        bccm[row, col] += 1
    print bccm

    metrics = [
        np.divide(bccm[0, 0] + bccm[1, 1], bccm[0, 0] + bccm[0, 1] + bccm[1, 0] + bccm[1, 1], dtype=np.float64),
        np.divide(bccm[1, 1], bccm[1, 0] + bccm[1, 1], dtype=np.float64),
        np.divide(bccm[0, 0], bccm[0, 0] + bccm[0, 1], dtype=np.float64),
        np.divide(bccm[1, 1], bccm[0, 1] + bccm[1, 1], dtype=np.float64)
    ]
    print metrics

    mcccm = np.zeros((6, 6), dtype=np.int32)
    for i in range(len(T2n)):
        row = T2n[i, 0]
        col = T2_validating[i]
        # print row, col
        mcccm[row, col] += 1
    print mcccm

    sumup = np.sum(mcccm, axis=0)
    # print sumup

    ppv = np.zeros(6)
    for i in range(6):
        ppv[i] = np.divide(mcccm[i, i], sumup[i], dtype=np.float64)
    print ppv

    ppvmetrics = [[max(ppv), np.argmax(ppv)], [min(ppv), np.argmin(ppv)]]
    print ppvmetrics


    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook

    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 4/Assignment_4_Data_and_Template_Updated.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    sheetname = "Classifiers"

    # print W
    df = DataFrame(W)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=0, header=False, index=False)

    # print W2
    df = DataFrame(W2)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=4, header=False, index=False)

    sheetname = "To be classified"

    # print Tt
    df = DataFrame(T_testing)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=15, header=False, index=False)

    # print Tt2
    df = DataFrame(T2_testing)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=16, header=False, index=False)

    sheetname = "Performance"

    # print bccm
    df = DataFrame(bccm)
    df.to_excel(writer, sheet_name=sheetname, startrow=9, startcol=2, header=False, index=False)

    # print metrics
    df = DataFrame(metrics)
    df.to_excel(writer, sheet_name=sheetname, startrow=7, startcol=6, header=False, index=False)

    # print mcccm
    df = DataFrame(mcccm)
    df.to_excel(writer, sheet_name=sheetname, startrow=18, startcol=2, header=False, index=False)

    # print ppvmetrics
    df = DataFrame(ppvmetrics)
    df.to_excel(writer, sheet_name=sheetname, startrow=19, startcol=11, header=False, index=False)

    writer.save()
    writer.close()
コード例 #24
0
def main():
    selected_digits = [6, 9]

    X, T = load_mnist("training", selected_digits)
    X = np.array(X, np.float64)
    T = np.array(T, np.float64)
    # print type(X)
    # print type(T)
    # print X.shape
    # print T.shape

    # Xn = np.array([[72., 101., 94.], [50., 96., 70.], [14., 79., 10.], [8., 70., 1.]], np.float64)
    U, V, P = XZCVPR(X, T, selected_digits)

    n = [0, 0]
    n[0] = len(T[T == selected_digits[0]])
    n[1] = len(T[T == selected_digits[1]])
    # B = int(np.ceil(np.log2(n[0] + n[1]))) + 1
    B = 32
    # print B

    Prec = P[:, 0:2]
    # print Prec[:, 0]
    # print Prec[:, 1]

    histograms, maxs, mins = build_2d_histograms(Prec, T, B, selected_digits)

    mus, Sigmas = build_2d_beysian(Prec, T, selected_digits)

    Xt, Tt = load_mnist("testing", selected_digits)
    Xt = np.array(Xt, np.float64)
    Tt = np.array(Tt, np.float64)
    # print Tt

    # Get principal component matrix of Xt
    Pt = PCA(Xt, U, V)
    # print Pt

    # Get redecued principal component matrix of Pt
    Ptrec = Pt[:, 0:2]
    # print Ptrec[0]
    # print Ptrec[1]

    Tth = np.zeros((len(Tt), 2))
    Ttb = np.zeros((len(Tt), 2))
    for i in range(len(Tt)):
        ccps_by_histograms = get_ccps_by_2d_histograms(B, histograms, maxs,
                                                       mins, n, Ptrec[i])
        # print ccps_by_histograms
        pps_by_histograms = posterior_probability(ccps_by_histograms, n)
        # print pps_by_histograms
        if pps_by_histograms[0] == pps_by_histograms[1]:
            Tth[i] = (-1., 0.)
        else:
            if pps_by_histograms[0] > pps_by_histograms[1]:
                Tth[i] = (str(selected_digits[0]), pps_by_histograms[0])
            else:
                Tth[i] = (str(selected_digits[1]), pps_by_histograms[1])

        ccps_by_beysian = get_ccps_by_2d_beysian(Ptrec[i], mus, Sigmas)
        # print ccps_by_beysian
        pps_by_beysian = posterior_probability(ccps_by_beysian, n)
        # print pps_by_beysian
        if pps_by_beysian[0] == pps_by_beysian[1]:
            Ttb[i] = ("Undecidable", 0.)
        else:
            if pps_by_beysian[0] > pps_by_beysian[1]:
                Ttb[i] = (str(selected_digits[0]), pps_by_beysian[0])
            else:
                Ttb[i] = (str(selected_digits[1]), pps_by_beysian[1])
    # print Tth
    # print Ttb
    # print len(Tt[Tt == Tth])
    # print len(Tt[Tt == Ttb])

    idxp = 2
    idxn = 1
    # vector_to_img(Xt[idxp], Xt[idxn])

    xp = Xt[idxp]
    xn = Xt[idxn]
    # print xp
    # print xn

    zp = xp - U
    zn = xn - U
    # print zp
    # print zn

    Pp = np.dot(zp, V.T)
    Pn = np.dot(zn, V.T)
    # print Pp
    # print Pn

    Precp = Pp[0:2]
    Precn = Pn[0:2]
    # print Precp
    # print Precn

    Rp = np.dot(Precp, V[0:2, :])
    Rn = np.dot(Precn, V[0:2, :])
    # print Rp
    # print Rn

    Xrecp = Rp + U
    Xrecn = Rn + U
    # print Xrecp
    # print Xrecn

    Thp = Tth[idxp]
    Tbp = Ttb[idxp]
    # print Thp
    # print Tbp

    Thn = Tth[idxn]
    Tbn = Ttb[idxn]
    # print Thn
    # print Tbn

    ch = 0
    cb = 0
    for i in range(len(Tt)):
        # print Tth[i][0], Tt[i][0], Tth[i][0] == Tt[i][0]
        if Tth[i][0] == Tt[i][0]:
            ch += 1
        # print Ttb[i][0], Tt[i][0], Ttb[i][0] == Tt[i][0]
        if Ttb[i][0] == Tt[i][0]:
            cb += 1
    # print len(Tt), ch, cb

    accuracyh = np.divide(ch, len(Tt), dtype=np.float64)
    accuracyb = np.divide(cb, len(Tt), dtype=np.float64)
    # print accuracyh
    # print accuracyb

    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook

    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 3/Assignment_3_Submission_Template_Copy.xlsx"
    sheetname = "Results"

    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    # print [U]
    df = DataFrame([U])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=1,
                startcol=1,
                header=False,
                index=False)

    # print [V[0, :]]
    df = DataFrame([V[0, :]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=2,
                startcol=1,
                header=False,
                index=False)

    # print [V[1, :]]
    df = DataFrame([V[1, :]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=3,
                startcol=1,
                header=False,
                index=False)

    # print [n[0]]
    df = DataFrame([n[0]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=5,
                startcol=1,
                header=False,
                index=False)

    # print [n[1]]
    df = DataFrame([n[1]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=6,
                startcol=1,
                header=False,
                index=False)

    # print [mus[0]]
    df = DataFrame([mus[0]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=8,
                startcol=1,
                header=False,
                index=False)

    # print [mus[1]]
    df = DataFrame([mus[1]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=9,
                startcol=1,
                header=False,
                index=False)

    # print Sigmas[0]
    df = DataFrame(Sigmas[0])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=11,
                startcol=1,
                header=False,
                index=False)

    # print Sigmas[1]
    df = DataFrame(Sigmas[1])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=13,
                startcol=1,
                header=False,
                index=False)

    # print maxs[0], mins[0]
    df = DataFrame([[maxs[0], mins[0]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=16,
                startcol=1,
                header=False,
                index=False)

    # print maxs[1], mins[1]
    df = DataFrame([[maxs[1], mins[1]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=17,
                startcol=1,
                header=False,
                index=False)

    # print histograms[0]
    # print histograms[0][0, :]
    df = DataFrame(histograms[0])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=19,
                startcol=1,
                header=False,
                index=False)

    # print histograms[1]
    # print histograms[1][0, :]
    df = DataFrame(histograms[1])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=52,
                startcol=1,
                header=False,
                index=False)

    # print xp
    df = DataFrame([xp])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=87,
                startcol=1,
                header=False,
                index=False)
    # print zp
    df = DataFrame([zp])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=88,
                startcol=1,
                header=False,
                index=False)
    # print Precp
    df = DataFrame([Precp])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=89,
                startcol=1,
                header=False,
                index=False)
    # print Rp
    df = DataFrame([Rp])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=90,
                startcol=1,
                header=False,
                index=False)
    # print Xrecp
    df = DataFrame([Xrecp])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=91,
                startcol=1,
                header=False,
                index=False)

    # print xn
    df = DataFrame([xn])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=93,
                startcol=1,
                header=False,
                index=False)
    # print zn
    df = DataFrame([zn])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=94,
                startcol=1,
                header=False,
                index=False)
    # print Precn
    df = DataFrame([Precn])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=95,
                startcol=1,
                header=False,
                index=False)
    # print Rn
    df = DataFrame([Rn])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=96,
                startcol=1,
                header=False,
                index=False)
    # print Xrecn
    df = DataFrame([Xrecn])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=97,
                startcol=1,
                header=False,
                index=False)

    # print selected_digits[0]
    df = DataFrame([selected_digits[0]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=101,
                startcol=1,
                header=False,
                index=False)
    # print Thp[0], Thp[1]
    df = DataFrame([[Thp[0], Thp[1]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=102,
                startcol=1,
                header=False,
                index=False)
    # print Tbp[0], Tbp[1]
    df = DataFrame([[Tbp[0], Tbp[1]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=103,
                startcol=1,
                header=False,
                index=False)

    # print selected_digits[1]
    df = DataFrame([selected_digits[1]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=105,
                startcol=1,
                header=False,
                index=False)
    # print Thn[0], Thn[1]
    df = DataFrame([[Thn[0], Thn[1]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=106,
                startcol=1,
                header=False,
                index=False)
    # print Tbn[0], Tbn[1]
    df = DataFrame([[Tbn[0], Tbn[1]]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=107,
                startcol=1,
                header=False,
                index=False)

    # print accuracyh
    df = DataFrame([[accuracyh]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=110,
                startcol=1,
                header=False,
                index=False)
    # print accuracyb
    df = DataFrame([[accuracyb]])
    df.to_excel(writer,
                sheet_name=sheetname,
                startrow=111,
                startcol=1,
                header=False,
                index=False)

    writer.save()
    writer.close()