def DFtoExcel(df, FolderName, FileName):
    write_df = df.loc[:, ["FileName", "hyperlink", "Sheet Name"]]

    # Path Cell_Search_By_Key
    MainFolder = "C:\\Cell_Search_By_Key"
    FolderPath = os.path.join(MainFolder, FolderName)
    if not os.path.exists(FolderPath):
        os.makedirs(FolderPath)
    os.chdir(FolderPath)
    ExcelName = "%s.xlsx" % FileName
    writer = ExcelWriter(ExcelName)
    write_df.to_excel(writer, "Result", index=False)
    writer.save()
    # turn path into hyperlink
    Excel_Path = os.path.join(FolderPath, ExcelName)
    wb = Workbook(Excel_Path)
    # wb = Workbook.caller()
    checkArr = Range("B2").vertical.value
    i = 2
    for check in checkArr:

        RangeName = "B%d" % (i)
        displayRange = "A%d" % (i)
        address = Range(RangeName).value
        display_name = Range(displayRange).value
        i += 1
        try:
            Range(RangeName).add_hyperlink(address, text_to_display=address)
        except:
            pass
    wb.save()
    wb.close()
    return "FINISH"
Example #2
    def generate_report(title, description):
        """Generate Excel  1997 file from query.

        :param title: Query title.
        :param description: Query description.
        :return: Response with Excel 97 attachment.
        """
        df = load_data_frame(request)

        # Limit the columns to the maximum allowed in Excel 97.
        max_length = 255
        index_len = len(df.index.names)

        lim_df = df.drop(df.columns[max_length - index_len - 1:len(df.columns) - 1], axis=1)

        extension = 'xls'
        engine = 'xlwt'
        encoding = 'utf-8'
        content_type = 'application/vnd.ms-excel'
        # Add content and return response
        f = NamedTemporaryFile(suffix=extension)
        ew = ExcelWriter(f.name, engine=engine, encoding=encoding)

        #print lim_df.to_string()
        #print f.name

        lim_df.to_excel(ew)
        ew.save()


        #shutil.copyfile(f.name, 'manuel.xls')

        show_legend = request.REQUEST.get('show_legend', '')
        table_description = request.REQUEST.get('table_description', '')

        add_header_and_footer(f.name, title, description, show_legend, table_description)

        title = title.strip().encode("UTF-8").replace(" ", '_')

        if len(title) > max_length_filename:
            title = title[:max_length_filename]

        filename = '%s.%s' % (title, extension)

        # Setup response
        data = f.read()

        response = HttpResponse(data)
        response["Content-Type"] = content_type
        response["Content-status_code"] = 200
        response['Content-Transfer-Encoding'] = 'binary'
        response['Content-Disposition'] = 'attachment; filename="%s"' % filename
        return response
Example #3
    def dataIO(self, args):
        """
        Write the collected data to an Excel workbook (report, contents with internal links, and raw data sheets) for possible later extension.
        """
        writer = ExcelWriter("{}.xlsx".format(args.logFile), engine='xlsxwriter')
        reportDf = pd.DataFrame()
        reportDf.to_excel(writer, sheet_name="Reports")
        contentDf = pd.DataFrame()
        contentDf.to_excel(writer, sheet_name="Contents")
        contentSheet = writer.sheets["Contents"]
        contentSheet.write_string(xl_rowcol_to_cell(self.sheetLinkRow, 0),
                                  "link list for all choices and sub refines")
        self.sheetLinkRow += 1

        for dfname in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']:
            if dfname in self._rawdf.keys():
                print("--save raw data for {}".format(dfname))
                self._rawdf[dfname].to_excel(writer, "{}".format(dfname))
                link_format = writer.book.add_format({'color': 'blue', 'underline': 1})
                contentSheet.write_url(xl_rowcol_to_cell(self.sheetLinkRow, 0), "internal:{}!A1".format(dfname),
                                       link_format, dfname)
                self.sheetLinkRow += 1
            if dfname in self._rawdf.keys() and dfname in ['2', '3', '5', '6', '8', '9', '11', '12']:
                self.refine(args, writer, dfname)

        # Close the Pandas Excel writer and output the Excel file.
        writer.save()
Example #4
def data_total( DocName, HistoryPath, SavePath ):
    
    files = os.listdir(HistoryPath)
    
    TotalData = pd.DataFrame()
    
    for file in files:    
        historyfile = os.path.join(HistoryPath, file)
        try:
            HistoryBook = pd.ExcelFile(historyfile)
            HistorySheet = HistoryBook.parse('Sheet1', skiprows = 0, index = None)
            
            TotalData = TotalData.append(HistorySheet)
        
        except IOError:
            print "Cannot read " + str(historyfile)
    
    TotalData.dropna(subset = ['ProductID'], inplace = True)
    TotalData.drop_duplicates(inplace = True)    
    
    filename = DocName + '.xlsx'
    filename = os.path.join(SavePath, filename)    
    
    writer = ExcelWriter(filename)
    TotalData.to_excel(writer, 'Sheet1', index = False )   
    writer.save()
    
    TotalData.to_csv(os.path.join(SavePath, DocName + '.txt'),sep=';',index=False, encoding = 'utf-8')
    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Save the table as an Excel (.xls) or CSV file.
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index= False, header= True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index = False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, "aggregates", index= False, header = True)
        except Exception as e:
            raise Exception("Aggregates: Error saving file", str(e))
def build_aggregates():

    writer = None
    years = range(2006,2010)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        simu.set_survey()
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer' : inflator})
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        if writer is None:
            writer = ExcelWriter(str(fname_all))
        agg.aggr_frame.to_excel(writer, yr, index= False, header= True, float_format="%.2f")
        print agg.aggr_frame.to_string()
        del simu
        del agg
        import gc
        gc.collect()


    writer.save()
Example #7
def main():
  parser = argparse.ArgumentParser(description = 'Fantasy Data Visualization')
  parser.add_argument('players', metavar='PLAYER', \
                      type=int, nargs='*', help='ids of players to display')
  parser.add_argument('-d', '--display', type=int, \
                      choices=[10,25,50], default=10, help='number of rows to display')
  parser.add_argument('-e', '--excel', dest='excel', \
                      action='store_true', default=False, help='to excel')
  args = parser.parse_args()

  show = int(args.display) # number of stats to show
  stats = pd.DataFrame.from_csv('.cache/res_avg.csv')
  
  # write all stats to excel file
  if (args.excel):
    writer = ExcelWriter('.cache/res_avg.xlsx')
    stats.to_excel(writer, 'Sheet1')
    writer.save()
  
  # display plot
  if len(args.players) > 0:
    plot(stats=stats, players=args.players)

  # print short summary
  print stats.sort_values(by=['avg_2015'], ascending=[False]).head(show)
def diag_aggregates():

    years = ['2006', '2007', '2008', '2009']

    df_final = None
    for yr in years:
        xls = ExcelFile(fname_all)
        df = xls.parse(yr, hindex_col= True)

        cols = [u"Mesure",
                u"Dépense \n(millions d'€)",
                u"Bénéficiaires \n(milliers)",
                u"Dépenses \nréelles \n(millions d'€)",
                u"Bénéficiaires \nréels \n(milliers)",
                u"Diff. relative \nDépenses",
                u"Diff. relative \nBénéficiaires"]
        selected_cols = [u"Mesure", u"Diff. relative \nDépenses", u"Diff. relative \nBénéficiaires"]
        df = df[selected_cols]
        df['year'] = yr
        df['num'] = range(len(df.index))
        df = df.set_index(['num', u'Mesure', 'year'])
        if df_final is None:
            df_final = df
        else:

            df_final = df_final.append(df, ignore_index=False)

#    DataFrame.groupby()
    df_final = df_final.sortlevel(0)
    print str(fname_all)[:-5]+'_diag.xlsx'
    writer = ExcelWriter(str(fname_all)[:-5]+'_diag.xlsx')
    df_final.to_excel(writer, sheet_name="diagnostics", float_format="%.2f")
    writer.save()
def create_output(regression_dist_dict, closest_curve_dict, reactor_name, name_add):
    '''Converts the dictionaries into dataframes formatted for saving as
    an Excel file. The total results go on the first sheet and the closest curves on the second.'''

    #creates a dataframe by looping through the dict and appending the df's together.
    count = 0
    print regression_dist_dict
    for key in regression_dist_dict:
        if count == 0:
            total_results = pd.DataFrame(regression_dist_dict[key], index=[key]*len(regression_dist_dict[key]), columns=['reactor', 'enrichment', 'distance'])
            closest_results = pd.DataFrame([closest_curve_dict[key]], index=[key], columns=['reactor', 'enrichment', 'distance'])
            count += 1
        else:
            total_results = total_results.append(pd.DataFrame(regression_dist_dict[key], index=[key]*len(regression_dist_dict[key]), columns=['reactor', 'enrichment', 'distance']))
            closest_results = closest_results.append(pd.DataFrame([closest_curve_dict[key]], index=[key], columns=['reactor', 'enrichment', 'distance']))

    print 'total_results', total_results
    print 'closest_results', closest_results

    file_name = 'data/%s_regression_results_%s.xlsx' % ('_'.join(map(str, reactor_name)), name_add)

    writer = ExcelWriter(file_name)

    total_results.to_excel(writer, sheet_name='Sheet1')
    closest_results.to_excel(writer, sheet_name='Sheet2')
    writer.save()
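
The loop above grows `total_results` and `closest_results` with repeated `DataFrame.append` calls. A minimal sketch of the same assembly using `pd.concat` (the helper name is hypothetical; `append` still works in the pandas versions these snippets target, but `concat` avoids the per-iteration copies):

import pandas as pd

def build_results(regression_dist_dict, closest_curve_dict):
    cols = ['reactor', 'enrichment', 'distance']
    # One DataFrame per key, concatenated in a single pass.
    total_results = pd.concat(
        pd.DataFrame(val, index=[key] * len(val), columns=cols)
        for key, val in regression_dist_dict.items())
    closest_results = pd.concat(
        pd.DataFrame([val], index=[key], columns=cols)
        for key, val in closest_curve_dict.items())
    return total_results, closest_results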
Example #10
class Excel(object):

    goal_time = 0.2
    params = ['openpyxl', 'xlsxwriter', 'xlwt']
    param_names = ['engine']

    def setup(self, engine):
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)

    def time_read_excel(self, engine):
        read_excel(self.bio_read)

    def time_write_excel(self, engine):
        self.df.to_excel(self.writer_write, sheet_name='Sheet1')
        self.writer_write.save()
def save_xls_name(list_dfs, xls_path, sheet_name):
    '''save function that takes a list as input to name sheets.'''

    #remove non-ASCII characters from dataframes before saving
    for df in list_dfs:
        df.index = remove_non_ascii(df.index)
        for col in df.columns:
            df[col] = remove_non_ascii(df[col])

    #save the df's to an excel file
    writer = ExcelWriter(xls_path)
    for n, df in enumerate(list_dfs):
        df.to_excel(writer, sheet_name[n])
    writer.save()


def remove_non_ascii(col):
    '''Remove non-ASCII characters so values can be written to Excel.'''
    new_index = []
    for name in col:
        try:
            for letter in name:
                if ord(letter) > 128:
                    name = name.replace(letter, '')
        except:
            pass
        new_index.append(name)
    return new_index
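
For comparison, a roughly equivalent Python 3 sketch of `remove_non_ascii` that leans on `str.encode` with `errors='ignore'` instead of filtering characters one by one (the function name is hypothetical):

def remove_non_ascii_v2(col):
    '''Python 3 sketch: strip non-ASCII characters via encode/decode.'''
    new_index = []
    for name in col:
        try:
            name = name.encode('ascii', 'ignore').decode('ascii')
        except AttributeError:
            pass  # non-string values (e.g. numbers) are left untouched
        new_index.append(name)
    return new_index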
def AddSeqComp(mypath):
    """ Loads TestLogAll.h5 from the specified path, then calls 
    MeasurementGroupTools.AddSeqComp to recalculate seq components using FFT  

    Input:  Directory of the measurment campaign, e.g.: "aLabView2"
    Output: Results1.h5, Results1.pdf in the data subdirs.
    """
    from pandas import HDFStore, ExcelWriter
    import MeasurementGroupTools as mgt

    h5logs = HDFStore(mypath + "\\" + 'TestLogsAll.h5')
    TestLog = h5logs['TestLogsAll']

    dirs = TestLog[u'DirName'].unique()
    for dname in dirs:
        mysubdirpath = mypath + "\\" + dname
        print "Processing: " + dname
        mgt.AddSeqComp(mysubdirpath, TestLog, dname)

    h5logs.put('TestLogsAll',TestLog)
    h5logs.close()

    writer = ExcelWriter(mypath + "\\" + 'TestLogsAll.xlsx')
    TestLog.to_excel(writer,'TestLogsAll') # the second argument defines sheet name
    writer.save()

    return
    def build_and_send_email(self, data, options):
        date = timezone.now().date().strftime('%Y_%m_%d')

        if 'recipients' in options:
            print 'yes'
            recipients = options['recipients']
        else:
            print 'no'
            recipients = settings.DEFAULT_WEEKLY_RECIPIENTS

        print 'recipients:', recipients

        message = EmailMessage(subject='Kikar Hamedina, Weekly Report: %s' % date,
                               body='Kikar Hamedina, Weekly Report: %s.' % date,
                               to=recipients)
        w = ExcelWriter('Weekly_report_%s.xlsx' % date)

        for datum in data:
            # csvfile = StringIO.StringIO()
            pd.DataFrame.from_dict(datum['content']).to_excel(w, sheet_name=datum['name'])

        w.save()
        w.close()
        # f = open(w.path, 'r', encoding='utf-8')
        message.attach_file(w.path)
        message.send()
Example #14
	def saveDialog(self):
		'''Saves the project as an .xls file.'''
		title									= 'Save project as...'
		fileName,f								= QFileDialog.getSaveFileName(self,title,self.path)
		writer									= ExcelWriter(fileName+'.xls')
		for marker in self.markers:
			marker.table.to_excel(writer,marker.name)
		writer.save()
def writeToExcel(fileName=''):
	print "Writing to Excel File : "+fileName
	data = {'CVE ID Number': cveIDNumber, 'Summary Text': summaryText, 'Publish Date': publishDate, 'Software Type': softwareType, 'Vendor': vendor,'Product':product,'Version':version,'CVSS Score':cvssScore,'Confidentiality Impact':confidentialityImpact,'Integrity Impact':integrityImpact,'Availibility Impact':availibilityImpact,'Access Complexity':accessComplexity,'Authentication':authentication,'Gained Access':gainedAccess,'Vulnerability Type':vulnType}
	df = pd.DataFrame(data,columns=['CVE ID Number','Publish Date', 'Software Type','Vendor','Product','Version','CVSS Score','Confidentiality Impact','Integrity Impact','Availibility Impact','Access Complexity','Authentication','Gained Access','Vulnerability Type','Summary Text'])
	writer = ExcelWriter(fileName)
	df.to_excel(writer,'CVE Details',index=False)
	writer.save()
	print "Completed."
def save_xlsx(list_dfs, xlsx_path):
    writer = ExcelWriter(xlsx_path)
    for n, df in enumerate(list_dfs):
        df.to_excel(writer, '%s' %n)
        print('Saving %s' %n)
    writer.save()
    print('Finished writing to file')
    return None
def to_mem_excel(dataframe, sheet_name='WorkSheet'):
    iobuffer = BytesIO()
    writer = ExcelWriter(iobuffer, engine='xlwt')
    dataframe.to_excel(writer, sheet_name=sheet_name)
    writer.save()
    iobuffer.flush()
    iobuffer.seek(0)
    return iobuffer.getvalue()
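
A short, hypothetical usage sketch for `to_mem_excel`: since the function uses the `xlwt` engine, the returned bytes are an Excel 97 (.xls) payload that can be written to disk or attached to an HTTP response without any temporary file:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
payload = to_mem_excel(df, sheet_name='Data')
with open('report.xls', 'wb') as fh:   # .xls extension to match the xlwt engine
    fh.write(payload)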
Example #18
def corpus_to_excel(corpus_path, excel_path):
    '''NB! Make sure to use .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    writer = ExcelWriter(excel_path)
    for key in corpus:
        corpus[key].to_excel(writer, sheet_name=key)
    writer.save()
    corpus.close()
Example #19
def extract_SHT1x_data_day_by_day(SHT1x_dataframe, days_list):
    # the 'with' statement doesn't work here
    today = date.today()
    writer = ExcelWriter('static/data/SHT1x.xlsx')
    for day in days_list:
        if day <= today:
            day_SHT1x = SHT1x_dataframe[str(day)]
            day_SHT1x.to_excel(writer, sheet_name=str(day))
    writer.save()
Example #20
def save_peaks_excel(peakOnlyHdf5,xlsxFile):
    dsets = h5py.File(peakOnlyHdf5,'r')
    writer = ExcelWriter(xlsxFile)
    for _key in dsets.keys():
        dset = dsets[_key]
        _df = pd.DataFrame(list(dset))
        _df.to_excel(writer,_key,header=False, index=False)
        print(_key+'sheet is created')
    writer.save()
    writer.close()
Example #21
def slmode(sheet, size):
	writer = ExcelWriter("sw_mode_" + str(size) + "t_" + sheet + ".xlsx")
	columnas = dfs[str(sheet)].columns # store columns names
	length = len(dfs[str(sheet)].columns)
	new_df = pd.DataFrame(dfs[str(sheet)].iloc[:,0])
	for i in range(1,length-(size-1)):
		for j in range(0,(size)):
			new_df[str(columnas[j+i])] = dfs[str(sheet)].iloc[:,j+i]
		new_df.to_excel(writer,"set_" + str(i), index=False)
		new_df = pd.DataFrame(dfs[str(sheet)].iloc[:,0])
	writer.save()
Example #22
def export_to_xls(df, path, format_excel=None, engine='xlsxwriter', send=False):
    writer = ExcelWriter(path,
                         engine=engine,
                         datetime_format='hh:mm:ss mmm d yyyy',
                         date_format='mmmm dd yyyy')
    df.to_excel(writer)
    writer.save()
    if format_excel: format_excel(path)
    if send:
        send_file_by_email(path)
    else:
        return download_file(path)
Example #23
def extract_thermo_data_day_by_day(thermo_dataframe, days_list):
    # the 'with' statement doesn't work here
    # replace doesn't work properly
    #thermo_dataframe_sustituted = thermo_dataframe.replace({'0': 'OFF', '1': 'ON'})
    #print thermo_dataframe_sustituted
    today = date.today()
    writer = ExcelWriter('static/data/thermo.xlsx')
    for day in days_list:
        if day <= today:
            day_thermo = thermo_dataframe[str(day)]
            day_thermo.to_excel(writer, sheet_name=str(day))
    writer.save()
 def save_xls(self, dframe):  # write the data to a sheet (named self.name) in the Excel file named after the industry
     xls_path = os.path.join(current_folder, '筛选后股票的财务报表', self.hangye)
     if os.path.exists(xls_path):  # the Excel file already exists
         book = load_workbook(xls_path)
         writer = pd.ExcelWriter(xls_path, engine='openpyxl')
         writer.book = book
         writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
         dframe.to_excel(writer, self.name)
         writer.save()
     else:  # the file does not exist yet
         writer = ExcelWriter(xls_path)
         dframe.to_excel(writer, self.name)
         writer.save()
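
Newer pandas (0.24+) can express the same "append a sheet to an existing workbook" logic directly through `ExcelWriter(..., mode='a')` with the openpyxl engine; a minimal sketch under that assumption (helper name and arguments are illustrative):

import os
import pandas as pd

def save_sheet(dframe, xls_path, sheet_name):
    # Append to the workbook if it already exists, otherwise create it.
    mode = 'a' if os.path.exists(xls_path) else 'w'
    with pd.ExcelWriter(xls_path, engine='openpyxl', mode=mode) as writer:
        dframe.to_excel(writer, sheet_name=sheet_name)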
def GetPrices():
    """ Goes to the URL, Reads the CSV download link, and creates the CSV DataFrame"""
    url = "http://fundresearch.fidelity.com/mutual-funds/fidelity-funds-daily-pricing-yields/download"
    CSV_Import = urllib.request.urlopen(url).read() 
    CSV = pd.read_csv(url, skiprows=3) 
    
    """ Creates CSV File to be opened in Excel. 
    This can be removed if you don't need Excel and you can just use CSV as the DataFrame """ 
    File = 'DailyPrices'
    writer = ExcelWriter(str(File) + '.xlsx')
    CSV.to_excel(writer, 'DailyReport', index = False)
    writer.close() 
    os.startfile(File + '.xlsx') 
def to_excel():
    DR = data_recording.DataRecorder(db_name="PRIVATE/result.sqlite")
    sql = "Select * from rep"
    DR.con.row_factory = sqlite3.Row
    cursor = DR.con.execute(sql)

    rows = cursor.fetchall()
    DF = pd.DataFrame(rows, columns=[item[0] for item in cursor.description])

    # note: the path must already exist.
    writer = ExcelWriter(conf_file.EXPORT_REP+'/'+'fact_excel.xlsx')
    DF.to_excel(writer, sheet_name='data_fact')
    
    writer.save()
    print("Le fichier a été sauvé dans {}".format(conf_file.EXPORT_REP+'/'+'fact_excel.xlsx'))
Example #27
    def to_excel(self, filename='myfile.xlsx'):
        """Export informations to a excel file

        Kargs:
            filename: string
                Name of the excel file ex: filename='myfile.xlsx'
        """
        writer = ExcelWriter(filename)
        self.clfinfo.to_excel(writer,'Classifier')
        self.statinfo.to_excel(writer,'Statistics')
        try:
            self.featinfo.to_excel(writer,'Features')
        except:
            warn('Information about features has been ignored. Run fit()')
        writer.save()
def save_data(Working_Directory, Result_Directory, name_file, Duration_ON, Duration_OFF, Num_pixels_ON, Num_pixels_OFF):
    ## Excel data
    #Save duration 
    Duration = list()
    Stimulus_Type = list()
    Matched_Pixels = list()
    Stimulus_Index = list()
    count=0
    for ii in xrange(size(Duration_ON,0)):
        Duration.append(mean(Duration_ON[ii,:]))
        Matched_Pixels.append(Num_pixels_ON[ii,:])
        Stimulus_Type.append(str(count+1)+'ON')
        Stimulus_Index.append(count)
        count=count+1
    for ii in xrange(size(Duration_OFF,0)):
        Duration.append(mean(Duration_OFF[ii,:]))
        Matched_Pixels.append(Num_pixels_OFF[ii,:])
        Stimulus_Type.append(str(count+1)+'OFF')   
        Stimulus_Index.append(count)
        count=count+1
    
    ## For fish 23, change OFF to ON and save
#    Stimulus_Type[2] = '3ON'
        
    #Save matched_pixels 
    Name_stimulus = get_list_of_stimulus_name(Working_Directory)
    Label_plane, Label_stimulus = label_stimulus(Name_stimulus,Stimulus_Type)
    Stim_type_all = repeat(Stimulus_Type, size(Matched_Pixels,1))
    Matched_Pixels_all = reshape(Matched_Pixels, (size(Matched_Pixels)))
    Name_stimulus_all = tile(Name_stimulus, size(Matched_Pixels,0))
    # Some data frames
    df1 = DataFrame({'Stimulus_Type':Stimulus_Type,'TDuration':Duration}) #Only duration
    df2 = DataFrame(index=Stimulus_Index, columns=Name_stimulus) # pixels to concatenate with duration
    df3 = DataFrame(index=Stimulus_Type, columns=Name_stimulus) #pixels standalone
    df4 = DataFrame({'Stimulus_Type':Stim_type_all, 'Pixels':Matched_Pixels_all,\
    'Label_plane':Label_plane, 'Label_stimulus':Label_stimulus, 'Original_Stim':Name_stimulus_all}) #label pixels with stimulus and z plane
    df4["Stimulus"] = df4.Label_stimulus.map(Label_Odor_reverse)
    
    for ii in xrange(0,size(Stimulus_Index)):
        df2.ix[ii] = Matched_Pixels[ii]
        df3.ix[ii] = Matched_Pixels[ii]
    df = concat([df1,df2], join='inner', axis=1)
    #Save to excel
    writer = ExcelWriter(Result_Directory+ filesep+'Classified_Results'+filesep+name_file+ '.xlsx', engine='xlsxwriter')
    df.to_excel(writer, sheet_name='sheet1')
    writer.close()
    
    return df, df1, df3, df4
def networkset_2_spreadsheet(ntwkset, file_name=None, file_type= 'excel', 
    *args, **kwargs):
    '''
    Write a NetworkSet object to a spreadsheet, for your boss    
    
    Write the s-parameters of each network in the networkset to a
    spreadsheet. If the `excel` file_type is used, then each network
    is written to its own sheet, with the sheetname taken from the
    network `name` attribute.
    This function makes use of the pandas module, which in turn makes
    use of the xlrd module. These are imported during this function.
    
    Notes
    ------
    The frequency unit used in the spreadsheet is taken from 
    `ntwk.frequency.unit`
    
    Parameters
    -----------
    ntwkset :  :class:`~skrf.networkSet.NetworkSet` object
        the network set to write 
    file_name : str, None
        the file_name to write. if None, ntwkset.name is used. 
    file_type : ['csv','excel','html']
        the type of file to write. See pandas.DataFrame.to_??? functions.
    form : 'db','ma','ri'
        format to write data, 
        * db = db, deg
        * ma = mag, deg
        * ri = real, imag
    \*args, \*\*kwargs :
        passed to pandas.DataFrame.to_??? functions.
        
        
    See Also
    ---------
    network_2_spreadsheet : writes a spreadsheet for a single network
    '''
    from pandas import DataFrame, Series, ExcelWriter # delayed because it's not a requirement
    if ntwkset.name is None and file_name is None:
        raise(ValueError('Either ntwkset must have name or give a file_name'))
    
    if file_type == 'excel':
        writer = ExcelWriter(file_name)
        [network_2_spreadsheet(k, writer, sheet_name =k.name, *args, **kwargs) for k in ntwkset]
        writer.save()
    else:
        [network_2_spreadsheet(k,*args, **kwargs) for k in ntwkset]
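
A hypothetical usage sketch for `networkset_2_spreadsheet`, assuming a scikit-rf `NetworkSet` built from Touchstone files whose `Network` objects carry `name` attributes (the file names here are placeholders):

import skrf as rf

ntwk_set = rf.NetworkSet([rf.Network('dut_a.s2p'), rf.Network('dut_b.s2p')])
networkset_2_spreadsheet(ntwk_set, file_name='dut_sparams.xlsx', file_type='excel')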
Example #30
    def setup(self, engine):
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from netCDF4 import Dataset

ifile = 'merra2_omega_kenya_400_timeseries.nc'
f = Dataset(ifile, mode='r')
field = f.variables['wap'][:, :]
#field=field*86400
mname = 'omega'
print field
#print str(field).replace('[','').replace(']',' ')
#field=str(field).replace('[',' ').replace(']]',',')
field = (" ".join(str(i) for i in field))
field = str(field).replace('[', ' ').replace(']]]', ',')

df = pd.DataFrame({mname: [field]})
writer = ExcelWriter('kenya_omega_monthly_means.xlsx')
df.to_excel(writer, 'Sheet1', index=True)
writer.save()
Example #32
#Build DataFrame from dict
df = pd.DataFrame.from_dict(data, orient='index', dtype=None)
#df = pd.read_csv("Data.csv")
s = df.describe()
print s

#CSV File
filename = 'Data.csv'
df.to_csv(filename, index=True, encoding='utf-8')
#print df

#XLS File

filename1 = 'Data.xlsx'
writer = ExcelWriter(filename1)
df.to_excel(writer, 'Sheet1')
writer.save()

# JSON File
filename2 = 'Data.json'
df.to_json(filename2, orient="index")

s = df.describe()
print s

#
# files = [f for f in os.listdir('./pokemon_5378')]
# for f in files:
#         print f
Example #33
import pandas as objPandas
from pandas import ExcelWriter

archivo = objPandas.DataFrame({
    'matricula': [12345, 1235],
    'Nombre': ['Joseph', 'pedro'],
    'Apellido': ['Mendez', 'lopez']
})

archivo = archivo[['matricula', 'Nombre', 'Apellido']]

rut = ExcelWriter(r'C:\Users\Joseph\Desktop\excell\utm.xlsx')

archivo.to_excel(rut, 'Hoja de datos', index=False)

rut.save()
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content
    #print(html_doc)
    soup = BeautifulSoup(html_doc, 'html.parser')
    # get all content at tag a
    pids = soup.findAll('a')
    result = []
    for pid in pids:
        p = pid.attrs
        if ('target' in p and p['target'] == '_blank' and 'data-click' in p):
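            # '百度快照' is the "Baidu snapshot" link text; skip those cached-copy entries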
            if ('百度快照' not in pid.contents[0]):
                slist = []
                for x in pid.contents:
                    slist.append(str(x))
                s = ''.join(slist)
                s = s.replace('\n',
                              '').replace(' ',
                                          '').replace('<em>',
                                                      '').replace('</em>', '')
                result.append(s)

    # save to excel
    d = {'titles': result[:100]}
    df = pd.DataFrame(data=d)
    from pandas import ExcelWriter

    writer = ExcelWriter('hot_news.xlsx')
    df.to_excel(writer, 'Sheet1')
    writer.save()
        # Condition 6: Current Price is at least 30% above 52 week low
        condition_6 = currentClose >= (1.3 * low_of_52week)

        # Condition 7: Current Price is within 25% of 52 week high
        condition_7 = currentClose >= (.75 * high_of_52week)

        # If all conditions above are true, add stock to exportList
        if (condition_1 and condition_2 and condition_3 and condition_4
                and condition_5 and condition_6 and condition_7):
            exportList = exportList.append(
                {
                    'Stock': stock,
                    "RS_Rating": RS_Rating,
                    "50 Day MA": moving_average_50,
                    "150 Day Ma": moving_average_150,
                    "200 Day MA": moving_average_200,
                    "52 Week Low": low_of_52week,
                    "52 week High": high_of_52week
                },
                ignore_index=True)
            print(stock + " made the Minervini requirements")
    except Exception as e:
        print(e)
        print(f"Could not gather data on {stock}")

exportList = exportList.sort_values(by='RS_Rating', ascending=False)
print('\n', exportList)
writer = ExcelWriter("ScreenOutput.xlsx")
exportList.to_excel(writer, "Sheet1")
writer.save()
Example #36
def save_xls(df_list, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(df_list):
            df.to_excel(writer, 'sheet%s' % n)
        writer.save()
        print("Saved")
Example #37
def extract_expression(tumor, platform, gencode_version):
    """
	The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene Expression - InterestGenes.xlsx' and 'Gene Expression - RegulatoryGenes.xlsx'.

	:param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
	:param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
	:param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
	:return: two Pandas dataframes

	Example::
	
		import genereg as gr
		expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22)
	"""

    # Check input parameters
    tcga_tumors = [
        "Acute Myeloid Leukemia", "Adrenocortical Carcinoma",
        "Bladder Urothelial Carcinoma", "Brain Lower Grade Glioma",
        "Breast Invasive Carcinoma",
        "Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma",
        "Cholangiocarcinoma", "Colon Adenocarcinoma", "Esophageal Carcinoma",
        "Glioblastoma Multiforme", "Head and Neck Squamous Cell Carcinoma",
        "Kidney Chromophobe", "Kidney Renal Clear Cell Carcinoma",
        "Kidney Renal Papillary Cell Carcinoma",
        "Liver Hepatocellular Carcinoma", "Lung Adenocarcinoma",
        "Lung Squamous Cell Carcinoma",
        "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma", "Mesothelioma",
        "Ovarian Serous Cystadenocarcinoma", "Pancreatic Adenocarcinoma",
        "Pheochromocytoma and Paraganglioma", "Prostate Adenocarcinoma",
        "Rectum Adenocarcinoma", "Sarcoma", "Skin Cutaneous Melanoma",
        "Stomach Adenocarcinoma", "Testicular Germ Cell Tumors", "Thymoma",
        "Thyroid Carcinoma", "Uterine Carcinosarcoma",
        "Uterine Corpus Endometrial Carcinoma", "Uveal Melanoma"
    ]
    if tumor not in tcga_tumors:
        raise ValueError(
            'PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '
            + (', '.join(tcga_tumors)))

    if platform not in [27, 450]:
        raise ValueError(
            'PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450'
        )

    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    # Load the list of genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Create a list containing the Gene Symbols of the genes of interest
    genesSYM_of_interest = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in genesSYM_of_interest:
            genesSYM_of_interest.append(sym)

    # Import the dictionary of genes of interest with their candidate regulatory genes
    dict_RegulGenes = pickle.load(
        open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Import the gene-TFs mapping dataframe
    Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes_Mapping.xlsx',
                               sheetname='Sheet1',
                               header=0,
                               converters={
                                   'ENTREZ_GENE_ID': str,
                                   'HGNC_ID': str
                               })

    # Create a list containing the Gene Symbols of the regulatory genes of genes of interest
    regulatory_genesSYM = []
    for key, value in dict_RegulGenes.items():
        for gene in value:
            if gene not in regulatory_genesSYM:
                regulatory_genesSYM.append(gene)

    # Extract the list of distinct Gene Symbols mapped in the mapping table
    mapped_gene_SYMs = []
    for index, row in Mapping_df.iterrows():
        sym = row['GENE_SYMBOL']
        if sym not in mapped_gene_SYMs:
            mapped_gene_SYMs.append(sym)

    # Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Identify the sequencing platform to be used
    if platform == 27:
        seq_platform = 'Illumina Human Methylation 27'
    elif platform == 450:
        seq_platform = 'Illumina Human Methylation 450'

    # Extract all the samples for the current tumor and platform
    all_methyl = methylation_dataset.meta_select(
        (methylation_dataset['manually_curated__cases__disease_type'] == tumor)
        & (methylation_dataset['manually_curated__platform'] == seq_platform)
        & ((methylation_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (methylation_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (methylation_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select(
        (expression_dataset['manually_curated__cases__disease_type'] == tumor)
        & ((expression_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (expression_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (expression_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Gene Expression:
    expr_0 = all_expr.reg_project(field_list=[
        'ensembl_gene_id', 'entrez_gene_id', 'gene_symbol', 'fpkm'
    ])
    expr = expr_0.meta_select(
        semiJoinDataset=all_methyl,
        semiJoinMeta=['biospecimen__bio__bcr_sample_barcode'])

    # Materialize the results into a GDataframe
    expr_Gdf = expr.materialize('./(MaterializeResults)')

    # The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata.
    # Get the two pandas dataframes:
    expr_df_regs = expr_Gdf.regs
    expr_df_meta = expr_Gdf.meta
    n_regs = len(expr_df_regs)
    n_samples = len(expr_df_meta)

    # Rename 'chr', 'start', and 'stop' columns header
    expr_df_regs.rename(columns={
        'chr': 'chrom',
        'start': 'left',
        'stop': 'right'
    },
                        inplace=True)
    # Change index into progressive integer numbers and store the name of the sample in another column
    expr_df_regs['sample_id'] = expr_df_regs.index
    expr_df_regs.index = range(n_regs)

    # Convert unknown values (NaN) to empty strings
    expr_df_regs = expr_df_regs.fillna('')

    # Convert all the metadata values into strings, since they're encoded as lists in Python
    col_names = []
    for name, values in expr_df_meta.iteritems():
        col_names.append(name)
    for index, row in expr_df_meta.iterrows():
        for c in col_names:
            list_val = row[c]  # it's encoded as a list
            str_val = ''.join(
                list_val)  # convert the value stored as a list in a string
            expr_df_meta.set_value(index, c, str_val)

    # Since we have to extract the expression values for each distinct sample barcode (aliquot), we create a list containing these distinct identifiers
    expr_sample_barcodes_all = []
    for index, row in expr_df_meta.iterrows():
        barcode = row['biospecimen__bio__bcr_sample_barcode']
        if barcode not in expr_sample_barcodes_all:  # get distinct values
            expr_sample_barcodes_all.append(barcode)

    # Check which are repeated aliquots, if present
    all_aliqouts = []
    for index, row in expr_df_meta.iterrows():
        barcode = row['biospecimen__bio__bcr_sample_barcode']
        all_aliqouts.append(barcode)
    multiple_aliquots = [
        item for item, count in collections.Counter(all_aliqouts).items()
        if count > 1
    ]

    samples_to_remove = []
    expr_sample_barcodes = []
    if len(multiple_aliquots) != 0:
        # Among the repeated aliquots, keep only the most recent ones (of 2013)
        for index, row in expr_df_meta.iterrows():
            year = row['biospecimen__bio__year_of_shipment']
            barcode = row['biospecimen__bio__bcr_sample_barcode']
            if (barcode in multiple_aliquots) and year == '2011':
                expr_df_meta.drop(index, inplace=True)
                samples_to_remove.append(index)

        # Import the list of aliquots in the methylation dataset
        text_file = open('./3_TCGA_Data/Common_Aliquots.txt', 'r')
        aliquots = text_file.read().split('\n')
        aliquots.remove('')
        text_file.close()

        # Extract the new list of distinct TCGA Aliquots to extract
        for index, row in expr_df_meta.iterrows():
            barcode = row['biospecimen__bio__bcr_sample_barcode']
            if barcode in aliquots:
                if barcode not in expr_sample_barcodes:
                    expr_sample_barcodes.append(barcode)
            else:
                expr_df_meta.drop(index, inplace=True)
                samples_to_remove.append(index)

        # Remove regions that corresponded to eliminated repeated aliquots
        expr_df_regs = expr_df_regs.loc[~(
            expr_df_regs['sample_id'].isin(samples_to_remove))].copy()

    else:
        expr_sample_barcodes = expr_sample_barcodes_all

    # Export the metadata dataframe setting the TCGA aliquots as indexes.
    Metadata_df = expr_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index('biospecimen__bio__bcr_sample_barcode', inplace=True)
    writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/EXPR_(Metadata).xlsx')
    Metadata_df.to_excel(writer, 'Sheet1')
    writer.save()

    # Extract from the expression dataset all the regions that belong to genes of interest
    expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(
        genesSYM_of_interest)].copy()
    # Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest
    expr_df_regs_regulatory = expr_df_regs.loc[
        expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy()

    # Gene expression values for each gene of interest:

    # Create a dictionary for storing all the gene expression values for each gene of interest and for each TCGA aliquot
    from collections import defaultdict
    dict_expr_interest = defaultdict(dict)

    for key, value in dict_expr_interest.items():
        value = defaultdict(list)

    # The main dictionary has the Gene Symbols of the genes of interest as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
    # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot.
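    # For illustration only (hypothetical gene symbols, aliquot barcodes and fpkm values),
    # the populated structure looks like:
    #   dict_expr_interest = {'GENE_A': {'TCGA-XX-XXXX-01A': [12.345678],
    #                                    'TCGA-YY-YYYY-01A': []},
    #                         'GENE_B': {...}}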

    # Set the Gene Symbol as keys of the main dictionary
    for name in genesSYM_of_interest:
        dict_expr_interest[name] = {}

    # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
    for sample in expr_sample_barcodes:
        for k, v in dict_expr_interest.items():
            v[sample] = []

    # Set the values by appending the expression values for each gene of interest: these expression values (fpkm) can be found in the 'expr_df_regs_interest' dataframe
    for index, row in expr_df_regs_interest.iterrows(
    ):  # iterating along the whole dataframe
        sym = row['gene_symbol']  # get the Gene Symbol of the gene
        fpkm = row['fpkm']  # get the gene expression value
        sample = row['sample_id']  # get the name of the sample
        # get the aliquot corresponding to current sample
        aliq = expr_df_meta.get_value(sample,
                                      'biospecimen__bio__bcr_sample_barcode')
        # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers,
        dict_expr_interest[sym][aliq].append(round(float(fpkm), 6))

    # Convert the nested dictionary also into a dataframe

    # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct genes of interest
    expr_interest_df1 = pd.DataFrame(index=expr_sample_barcodes,
                                     columns=[genesSYM_of_interest])

    # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we need them later
    expr_interest_df2 = pd.DataFrame(
        index=expr_sample_barcodes,
        columns=['Sample_ID', 'Tumor', 'Patient_ID'])

    # Create the final dataframe
    expr_interest_df = expr_interest_df1.join(expr_interest_df2)

    # Fill the previously created dataframe with the correct gene expression values, for each gene of interest and for each TCGA aliquot
    for gene_sym, dict_value in dict_expr_interest.items():
        for tcga_aliq, exp_list in dict_value.items():
            if (len(exp_list) != 0):
                fpkm = exp_list[0]
                # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
                expr_interest_df.set_value(tcga_aliq, gene_sym, round(fpkm, 6))

    # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
    for index, row in expr_df_meta.iterrows():
        aliquot = row['biospecimen__bio__bcr_sample_barcode']
        tumor_tag = row['clinical__admin__disease_code']
        patient_id = row['clinical__shared__patient_id']
        expr_interest_df.set_value(aliquot, 'Sample_ID', index)
        expr_interest_df.set_value(aliquot, 'Tumor', tumor_tag)
        expr_interest_df.set_value(aliquot, 'Patient_ID', patient_id)

    # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each gene of interest
    additional_index = ['ENTREZ_GENE_ID']
    expr_interest_df0_1 = pd.DataFrame(index=additional_index,
                                       columns=[genesSYM_of_interest])
    expr_interest_df0_2 = pd.DataFrame(
        index=additional_index, columns=['Sample_ID', 'Tumor', 'Patient_ID'])
    expr_interest_df0 = expr_interest_df0_1.join(expr_interest_df0_2)

    frames = [expr_interest_df0, expr_interest_df]
    expr_interest_df = pd.concat(frames)

    # Add for each Gene Symbol of our genes of interest the corresponding Entrez Gene ID in the first row of the dataframe
    for i, r in EntrezConversion_df.iterrows():
        entrez_id = r['ENTREZ_GENE_ID']
        gene_name = r['GENE_SYMBOL']
        expr_interest_df.set_value('ENTREZ_GENE_ID', gene_name, entrez_id)

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    expr_interest_df.set_value('ENTREZ_GENE_ID', 'Sample_ID', "")
    expr_interest_df.set_value('ENTREZ_GENE_ID', 'Tumor', "")
    expr_interest_df.set_value('ENTREZ_GENE_ID', 'Patient_ID', "")

    # Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot
    writer = ExcelWriter(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-InterestGenes.xlsx')
    expr_interest_df.to_excel(writer, 'Sheet1')
    writer.save()

    # Gene expression values for each candidate regulatory gene of the genes of interest:

    # Create a dictionary for storing all the gene expression values for each candidate regulatory gene and for each TCGA aliquot
    from collections import defaultdict
    dict_expr_regulatory = defaultdict(dict)

    for key, value in dict_expr_regulatory.items():
        value = defaultdict(list)

    # The main dictionary has the Gene Symbols of the candidate regulatory genes as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
    # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot.

    # Set the Gene Symbols as keys of the main dictionary
    for name in regulatory_genesSYM:
        dict_expr_regulatory[name] = {}

    # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
    for sample in expr_sample_barcodes:
        for k, v in dict_expr_regulatory.items():
            v[sample] = []

    # Set the values by appending the expression values for each candidate regulatory gene: these expression values (fpkm) can be found in the "expr_df_regs_regulatory" dataframe
    for index, row in expr_df_regs_regulatory.iterrows(
    ):  # iterating along the whole dataframe
        sym = row['gene_symbol']  # get the Gene Symbol of the gene
        ens_id = row['ensembl_gene_id']  # get the Ensembl Gene ID
        fpkm = row['fpkm']  # get the gene expression value
        sample = row['sample_id']  # get the name of the sample
        # get the aliquot corresponding to current sample
        aliq = expr_df_meta.get_value(sample,
                                      'biospecimen__bio__bcr_sample_barcode')
        # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers
        if (gencode_version == 22):
            if (ens_id not in [
                    'ENSG00000277726.3', 'ENSG00000275895.3',
                    'ENSGR0000214717.8'
            ]):
                dict_expr_regulatory[sym][aliq].append(round(float(fpkm), 6))
        else:
            dict_expr_regulatory[sym][aliq].append(round(float(fpkm), 6))

    # Convert the nested dictionary also into a dataframe

    # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct candidate regulatory genes
    expr_regulatory_df1 = pd.DataFrame(index=expr_sample_barcodes,
                                       columns=[regulatory_genesSYM])

    # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we need them later
    expr_regulatory_df2 = pd.DataFrame(
        index=expr_sample_barcodes,
        columns=['Sample_ID', 'Tumor', 'Patient_ID'])

    # Create the final dataframe
    expr_regulatory_df = expr_regulatory_df1.join(expr_regulatory_df2)

    # Fill the previously created dataframe with the correct gene expression values, for each candidate regulatory gene and for each TCGA aliquot
    for gene_sym, dict_value in dict_expr_regulatory.items():
        for tcga_aliq, exp_list in dict_value.items():
            if (len(exp_list) != 0):
                fpkm = exp_list[0]
                # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
                expr_regulatory_df.set_value(tcga_aliq, gene_sym,
                                             round(fpkm, 6))

    # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
    for index, row in expr_df_meta.iterrows():
        aliquot = row['biospecimen__bio__bcr_sample_barcode']
        tumor_tag = row['clinical__admin__disease_code']
        patient_id = row['clinical__shared__patient_id']
        expr_regulatory_df.set_value(aliquot, 'Sample_ID', index)
        expr_regulatory_df.set_value(aliquot, 'Tumor', tumor_tag)
        expr_regulatory_df.set_value(aliquot, 'Patient_ID', patient_id)

    # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each candidate regulatory gene
    additional_index = ['ENTREZ_GENE_ID']
    expr_regulatory_df0_1 = pd.DataFrame(index=additional_index,
                                         columns=[regulatory_genesSYM])
    expr_regulatory_df0_2 = pd.DataFrame(
        index=additional_index, columns=['Sample_ID', 'Tumor', 'Patient_ID'])
    expr_regulatory_df0 = expr_regulatory_df0_1.join(expr_regulatory_df0_2)

    frames = [expr_regulatory_df0, expr_regulatory_df]
    expr_regulatory_df = pd.concat(frames)

    # Add for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID in the first row of the dataframe
    for i in regulatory_genesSYM:
        if i == 'PTRF':
            entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == 'CAVIN1',
                                       'ENTREZ_GENE_ID'].iloc[0]
        else:
            entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == i,
                                       'ENTREZ_GENE_ID'].iloc[0]
        expr_regulatory_df.set_value('ENTREZ_GENE_ID', i, entrez_id)

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    expr_regulatory_df.set_value('ENTREZ_GENE_ID', 'Sample_ID', "")
    expr_regulatory_df.set_value('ENTREZ_GENE_ID', 'Tumor', "")
    expr_regulatory_df.set_value('ENTREZ_GENE_ID', 'Patient_ID', "")

    # Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot
    writer = ExcelWriter(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-RegulatoryGenes.xlsx')
    expr_regulatory_df.to_excel(writer, 'Sheet1')
    writer.save()

    return expr_interest_df, expr_regulatory_df
Example #38
 def single_acct_analysis(self, accid, label=None, save=False, savepath=''):
     '''
     Analyze a single account.
     parameters:
         accid: account id number; it is converted into a regular expression by this method, although a regular expression may also be passed directly.
         label: defaults to self.accid_col if None is passed to this parameter.
     '''
     accid_item = self.trans_accid_regex(accid)
     acct_data = self.getAcct(accid_item,
                              accid_label=label,
                              over_write=False,
                              pure=False,
                              side='all')
     # acct_data=acct_data.set_index('glid',inplace=True)
     acct_sum = acct_data[self.drcrdesc].sum(axis=0)
     print('---start analyzing %s---' % str(accid))
     print(get_time_str())
     print('---Account Data---')
     print('account data shape:', acct_data.shape)
     print('account sum:', acct_sum)
     print('theAcct %s:\n' % str(accid), acct_data)
     if acct_sum[0] != 0:
         dr_acct_data = self.getAcct(accid_item,
                                     accid_label=label,
                                     over_write=False,
                                     pure=False,
                                     side='dr')
         # dr_acct_data=dr_acct_data.set_index('glid',inplace=True)
         print('---Debit Data---')
         print('debit data shape:', dr_acct_data.shape)
         print('debit_side %s:\n' % str(accid), dr_acct_data)
         # print(dr_acct_data)
     else:
         dr_acct_data = None
         pass
     if acct_sum[1] != 0:
         cr_acct_data = self.getAcct(accid_item,
                                     accid_label=label,
                                     over_write=False,
                                     pure=False,
                                     side='cr')
         # cr_acct_data=cr_acct_data.set_index('glid',inplace=True)
         print('---Credit Data---')
         print('credit data shape:', cr_acct_data.shape)
         print('credit_side %s:\n' % str(accid), cr_acct_data)
         # print(cr_acct_data)
     else:
         cr_acct_data = None
         pass
     if save == True:
         import os
         # from autk import get_time_str
         from openpyxl import Workbook, load_workbook
         from pandas import ExcelWriter
         if savepath == '':
             savename = ''.join(
                 ['theAcct',
                  str(accid), '-',
                  get_time_str(), '.xlsx'])
             savepath = os.path.join(os.path.abspath(os.curdir), savename)
             wb = Workbook()
             wb.save(savepath)
             print('new workbook created at current directory.')
         elif os.path.isdir(savepath):
             savename = ''.join(
                 ['theAcct',
                  str(accid), '-',
                  get_time_str(), '.xlsx'])
             savepath = os.path.join(os.path.abspath(savepath), savename)
             wb = Workbook()
             wb.save(savepath)
             print('new workbook created at %s' % savepath)
         elif os.path.isfile(savepath):
             wb = load_workbook(savepath)
             print('workbook loaded at %s' % savepath)
         else:
             print('unexpected: savepath does not exist; creating a new workbook.')
             wb = Workbook()
             wb.save(savepath)
         wter = ExcelWriter(savepath, engine='openpyxl')
         wter.book = wb
         acct_data.to_excel(wter, sheet_name=''.join(['acct_', str(accid)]))
         if dr_acct_data is not None:
             dr_acct_data.to_excel(wter,
                                   sheet_name=''.join(['dr_',
                                                       str(accid)]))
             wter.save()
         else:
             pass
         if cr_acct_data is not None:
             cr_acct_data.to_excel(wter,
                                   sheet_name=''.join(['cr_',
                                                       str(accid)]))
             wter.save()
         else:
             pass
         wter.save()
         wb.close()
         print('%s data saved.' % str(accid))
     else:
         print('analysis result not saved.')
     print('---end %s analysis---' % str(accid))
     print(get_time_str())
     return [acct_data, dr_acct_data, cr_acct_data]
Example #39
def write_xlsx(df, name_file):
    writer = ExcelWriter(f'{name_file}.xlsx')
    df.to_excel(writer, 'Sheet1')
    writer.save()
    return 'FILE SAVED'
Example #40
        if (line.strip().find(word) != -1):
            col08.append(line[len(word) + 1:len(word) + 12])

        word = 'peakpower='
        if (line.strip().find(word) != -1):
            col09.append(line[len(word) + 1:len(word) + 12])

panda = pd.DataFrame(list(
    zip(col01, col02, col03, col04, col05, col06, col07, col08, col09)),
                     columns=[
                         'tpdf', 'tpdr', 'tpd', 'tcdf', 'tcd', 'tr', 'tf',
                         'avgpower', 'peakpower'
                     ])
panda_sliced = panda.drop([0])

writer = ExcelWriter('../nor2_Pre-Sim.xlsx')
panda_sliced.to_excel(writer)  #can add sheets here if needed
writer.save()
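
# Optional follow-up (an assumption, not from the source): the sliced substrings above
# are stored as text, so coercing them to numbers may help before any analysis:
panda_numeric = panda_sliced.apply(pd.to_numeric, errors='coerce')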

os.chdir('/content/drive/My Drive/vlsi/Hspice/nor2/PLS/output')

col01 = []
col02 = []
col03 = []
col04 = []
col05 = []
col06 = []
col07 = []
col08 = []
col09 = []
Exemple #41
0
i = 0
truetrue = 0
for i in range(2):
    truetrue += cmp[i][i]

i = 0
j = 0
falsefalse = 0
for i in range(2):
    for j in range(2):
        if i != j:
            falsefalse += cmp[i][j]

print('true=', np.round(truetrue, 2))
print('false=', np.round(falsefalse, 2))

sns.heatmap(cm, annot=True, cmap='Reds')
cmp = cmp / 100

fig, (ax1) = plt.subplots(1, sharey=True, figsize=(10, 10))
fig.suptitle('confusion matrix percentage')
sns.heatmap(cmp, annot=False, fmt='.2%', cmap='Blues')

# Export results to excel
df = pd.DataFrame(y_pred)
dfa = pd.DataFrame(y_pred_all)
writer = ExcelWriter('test2.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
dfa.to_excel(writer, 'Sheet2', index=False)
writer.save()
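
# Side note (an assumption, not from the source): if cm/cmp come from
# sklearn.metrics.confusion_matrix, the row-normalised matrix plotted above can also be
# obtained directly, e.g.:
#
#     from sklearn.metrics import confusion_matrix
#     cmp = confusion_matrix(y_true, y_pred, normalize='true')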
Exemple #42
0
    if len(corArray) % 30 == 0:
        threshholdUpdate()
        thres_FB_UP = threshUpdate[-1]

        thres_FB = thres_FB_UP % 0.1






    if currentTime - tWinHead > durWin and iTail > iHead:
        correlationAverage()
        print("thresh",thres_FB)
        if avgCorrelationArray[0] > thres_FB:
#            play()
            start_play()
            print(avgCorrelationArray)



        tWinHead = tWinHead + 1
        iHead = iTail + 1
        iTail = iHead
    df = pd.DataFrame({'threshold':threshUpdate})
    writer = ExcelWriter('xxx.xlsx')
    df.to_excel(writer, 'sheet1', index=False)
    writer.save()
def train(epoch):
    clf.train()  # set model in training mode (need this because of dropout)
    correct = 0
    train_loss = 0
    #weights = []
    #class_weights = torch.cuda.FloatTensor(weights)
    # dataset API gives us pythonic batching
    for batch_id, (data, label) in enumerate(train_loader):
        #print('len(train_data)',len(data))
        data = Variable(data).to('cuda')
        target = Variable(label).to('cuda')

        # forward pass, calculate loss and backprop!
        opt.zero_grad()
        output = clf(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        #loss_history.append(loss.data[0])
        opt.step()
        train_loss += loss.item()

        pred = output.data.max(1)[
            1]  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

    #train_loss = np.mean(loss_history)
    train_loss /= len(train_loader)
    train_accuracy = float(correct) / float(len(train_loader.dataset))
    print('\n{:d}, {:.4f}, {:.4f}, {}/{}'.format(epoch, train_loss,
                                                 train_accuracy, correct,
                                                 len(train_loader.dataset)))

    a.append(epoch)
    b.append(train_loss)
    c.append(train_accuracy)

    # output to excel

    d = {'epoch': a, 'loss': b, 'accuracy': c}
    df = pd.DataFrame(d)

    writer = ExcelWriter('result_threeclass_320_4x_train_sn_3_.xlsx',
                         engine='xlsxwriter')  # xlsxwriter engine is required for add_chart below
    df.to_excel(writer, 'Sheet1', index=False)

    # create chart

    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    chart = workbook.add_chart({'type': 'line'})

    chart.add_series({
        'categories': ['Sheet1', 1, 0, epoch + 1, 0],
        'values': ['Sheet1', 1, 2, epoch + 1, 2],
    })

    chart.set_x_axis({'name': 'epoch', 'position_axis': 'on_tick'})
    chart.set_y_axis({
        'name': 'accuracy',
        'major_gridlines': {
            'visible': False
        }
    })

    worksheet.insert_chart('D1', chart)

    writer.save()
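
# Observation (not from the source): train() rebuilds the workbook and chart on every
# epoch. A minimal sketch of writing the history once after training instead, assuming
# the lists a (epochs), b (losses) and c (accuracies) filled above:
def save_history_once(a, b, c, path='training_history.xlsx'):
    history = pd.DataFrame({'epoch': a, 'loss': b, 'accuracy': c})
    with ExcelWriter(path, engine='xlsxwriter') as writer:
        history.to_excel(writer, 'Sheet1', index=False)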
Exemple #44
0
    # " a ":' ',' is ':' ',
}
df.replace(to_replace=replace_dict, inplace=True, regex=True)  #replace word
df.replace(to_replace=replace_dict, inplace=True, regex=True)  #replace word

#Filter by question words
what = df[df['Questions'].str.contains('what')]  #
where = df[df['Questions'].str.contains('where')]  #
who = df[df['Questions'].str.contains('who')]  #
how = df[df['Questions'].str.contains('how')]  #
which = df[df['Questions'].str.contains('which')]  #
when = df[df['Questions'].str.contains('when')]  #
Other = df[~df['Questions'].str.contains('what|where|who|how|which|when')]  #

#save in excel file
writer = ExcelWriter('outputFiles/questionWords TrainingSet.xlsx')
# writer = ExcelWriter('questionWords ValidationSet.xlsx')
# writer = ExcelWriter('Analysis2 TrainingSet.xlsx')
# writer = ExcelWriter('Analysis2 ValidationSet.xlsx')
df.to_excel(writer, 'all', index=False)
what.to_excel(writer, 'what', index=False)
where.to_excel(writer, 'where', index=False)
who.to_excel(writer, 'who', index=False)
how.to_excel(writer, 'how', index=False)
which.to_excel(writer, 'which', index=False)
when.to_excel(writer, 'when', index=False)
Other.to_excel(writer, 'Other', index=False)

writer.save()

#filter
# df_mri = df[df['Questions'].str.contains('mri')]#Only questions about mri
Exemple #45
0
wb = pd.ExcelFile('Client Detail_2021_2H_concat.xlsx', engine='openpyxl')
worksheets = wb.sheet_names  # gets sheet names - works but also creates sheet named ' (200)  Storage' that needs to be deleted
new_sheets = []

del worksheets[0]

# could probably use a list comprehension here
for sheet in worksheets:
    sheet_frame = pd.read_excel(wb, sheet, header=0)
    if 'concat' in sheet_frame.columns:
        new_sheets.append(sheet)
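
# The list comprehension hinted at above might look like this (a sketch, same behaviour):
#
#     new_sheets = [s for s in worksheets
#                   if 'concat' in pd.read_excel(wb, s, header=0).columns]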

# gets clients from weekly pallet counts - works
wpc_df = pd.read_excel(wb, 'Weekly Pallet Counts')
clients = wpc_df['concat'].unique()

for client in clients:
    writer = ExcelWriter(f'{client}_Details_{dt_string}.xlsx')
    for sheet in new_sheets:
        sheet_frame = pd.read_excel(wb, sheet, header=0)
        if 'concat' in sheet_frame.columns:
            client_sheet = sheet_frame[sheet_frame['concat'] == client]
            if client_sheet.shape[0] == 0:
                continue
            # with pd.ExcelWriter(f'{client}_Details_{dt_string}.xlsx') as writer:  # pylint: disable=abstract-class-instantiated
            client_sheet.to_excel(writer, index=False, sheet_name=sheet)
        else:
            pass
    writer.save()
    'Metallica', 'Elvis Presley', 'Luke Bryan', 'Mitchell Tenpenny',
    'Zac Brown Band', 'Josh Groban', 'Dierks Bentley', 'Blake Shelton',
    'Eric Church', 'John Legend', 'Zedd', 'Normani', 'Flipp Dinero', 'Migos',
    'The Weeknd', 'Offset', 'Keith Urban', 'J Balvin', 'Kelly Clarkson',
    'Barbra Streisand', 'Gucci Mane', 'Rihanna', 'Daddy Yankee', 'Old Dominion'
]
iter = 0
rows_list = []
while iter < len(artists):
    artist = artists[iter]
    query = "SELECT * FROM Tweets WHERE artistName ='" + artist + "'"
    cursor.execute(query)
    total_score = 0
    num_tweets = 0
    for (artistName, timestamp, content) in cursor:
        score = Score(content)
        total_score += score
        num_tweets += 1
    avg_score = total_score / num_tweets if num_tweets else 0.0
    print('Score for ' + artist_names[iter] + ': ' + str(avg_score) + '\n')
    entry_dict = {}
    entry_dict['Artist'] = artist_names[iter]
    entry_dict['Sentiment Score'] = avg_score
    rows_list.append(entry_dict)
    iter += 1

sentiment = pd.DataFrame(rows_list)

writer = ExcelWriter('sentiment.xlsx')
sentiment.to_excel(writer, 'Sheet1')
writer.save()
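
# Side note (an assumption, not from the source): building the SQL by string
# concatenation, as above, breaks for artist names containing quotes. Most DB-API
# drivers accept parameterised queries instead; with a MySQL-style connector the loop
# body could use, e.g.:
#
#     cursor.execute("SELECT * FROM Tweets WHERE artistName = %s", (artist,))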
Exemple #47
0
def sortCSVfile():
    workbook = pd.ExcelFile('Compiled.xls')
    SummaryTable = pd.read_excel(workbook, 'CSV_summary')

    MeanCompiled = 'MeanCompiled.xls'

    meanHits = SummaryTable.groupby('Animal ID')['Hits'].mean()
    meanMisses = SummaryTable.groupby('Animal ID')['Misses'].mean()
    meanFA = SummaryTable.groupby('Animal ID')['False alarms'].mean()
    meanCR = SummaryTable.groupby('Animal ID')['Correct Rejections'].mean()
    meanISI = SummaryTable.groupby('Animal ID')['ISI touches'].mean()
    meanHR = SummaryTable.groupby('Animal ID')['Hit rate'].mean()
    meanFAR = SummaryTable.groupby('Animal ID')['False alarm rate'].mean()
    meanD = SummaryTable.groupby('Animal ID')['D-prime'].mean()
    meanC = SummaryTable.groupby('Animal ID')['Criterion'].mean()
    meanResponseLat = SummaryTable.groupby(
        'Animal ID')['Mean Response Latency'].mean()
    meanHitLat = SummaryTable.groupby('Animal ID')['Mean Hit Latency'].mean()
    meanFAlat = SummaryTable.groupby(
        'Animal ID')['Mean False Alarm Latency'].mean()
    meanRETlat = SummaryTable.groupby(
        'Animal ID')['Mean Retrieval Latency'].mean()
    meanMagEntries = SummaryTable.groupby(
        'Animal ID')['Magazine Entries'].mean()
    meanBIRBeam = SummaryTable.groupby('Animal ID')['Back Beam Breaks'].mean()
    meanFIRBeam = SummaryTable.groupby('Animal ID')['Front Beam Breaks'].mean()

    strategyTable = pd.read_excel(workbook, 'CSV_strategy')
    meanLatBetweenHits = strategyTable.groupby(
        'Animal ID')['Latency Between Hits (MEAN)'].mean()
    stdevLatBetweenHits = strategyTable.groupby(
        'Animal ID')['Latency Between Hits (STDEV)'].mean()
    maxLatBetweenHits = strategyTable.groupby(
        'Animal ID')['Latency Between Hits (MAX)'].mean()
    meanLatBetweenResponses = strategyTable.groupby(
        'Animal ID')['Latency Between Stimuli Responses (MEAN)'].mean()
    stdevLatBetweenResponses = strategyTable.groupby(
        'Animal ID')['Latency Between Stimuli Responses (STDEV)'].mean()
    maxLatBetweenResponses = strategyTable.groupby(
        'Animal ID')['Latency Between Stimuli Responses (MAX)'].mean()
    meanTrialsBetweenResponses = strategyTable.groupby(
        'Animal ID')['Trials Between Stimuli Responses (MEAN)'].mean()
    stdevTrialsBetweenResponses = strategyTable.groupby(
        'Animal ID')['Trials Between Stimuli Responses (STDEV)'].mean()
    maxTrialsBetweenResponses = strategyTable.groupby(
        'Animal ID')['Trials Between Stimuli Responses (MAX)'].mean()

    meanFAboutLength = strategyTable.groupby(
        'Animal ID')['False Alarm Bout Length (MEAN)'].mean()
    stdevFAboutLength = strategyTable.groupby(
        'Animal ID')['False Alarm Bout Length (STDEV)'].mean()
    maxFAboutLength = strategyTable.groupby(
        'Animal ID')['False Alarm Bout Length (MAX)'].mean()

    meanHitboutLength = strategyTable.groupby(
        'Animal ID')['Hit Bout Length (MEAN)'].mean()
    stdevHitboutLength = strategyTable.groupby(
        'Animal ID')['Hit Bout Length (STDEV)'].mean()
    maxHitboutLength = strategyTable.groupby(
        'Animal ID')['Hit Bout Length (MAX)'].mean()

    mean_retrievalFrontBeam = strategyTable.groupby(
        'Animal ID')['Latency Retrieval --> Front Beam Break (MEAN)'].mean()
    stdev_retrievalFrontBeam = strategyTable.groupby(
        'Animal ID')['Latency Retrieval --> Front Beam Break (STDEV)'].mean()
    max_retrievalFrontBeam = strategyTable.groupby(
        'Animal ID')['Latency Retrieval --> Front Beam Break (MAX)'].mean()

    hitBinTable = pd.read_excel(workbook, 'CSV_binsHR')
    hitsBin1 = hitBinTable.groupby('Animal ID')['Bin1'].mean()
    hitsBin2 = hitBinTable.groupby('Animal ID')['Bin2'].mean()
    hitsBin3 = hitBinTable.groupby('Animal ID')['Bin3'].mean()
    hitsBin4 = hitBinTable.groupby('Animal ID')['Bin4'].mean()
    hitsBin5 = hitBinTable.groupby('Animal ID')['Bin5'].mean()
    hitsBin6 = hitBinTable.groupby('Animal ID')['Bin6'].mean()
    hitsBin7 = hitBinTable.groupby('Animal ID')['Bin7'].mean()
    hitsBin8 = hitBinTable.groupby('Animal ID')['Bin8'].mean()
    hitsBin9 = hitBinTable.groupby('Animal ID')['Bin9'].mean()
    hitsBin10 = hitBinTable.groupby('Animal ID')['Bin10'].mean()
    hitsBin11 = hitBinTable.groupby('Animal ID')['Bin11'].mean()
    hitsBin12 = hitBinTable.groupby('Animal ID')['Bin12'].mean()

    FABinTable = pd.read_excel(workbook, 'CSV_binsFAR')
    FARBin1 = FABinTable.groupby('Animal ID')['Bin1'].mean()
    FARBin2 = FABinTable.groupby('Animal ID')['Bin2'].mean()
    FARBin3 = FABinTable.groupby('Animal ID')['Bin3'].mean()
    FARBin4 = FABinTable.groupby('Animal ID')['Bin4'].mean()
    FARBin5 = FABinTable.groupby('Animal ID')['Bin5'].mean()
    FARBin6 = FABinTable.groupby('Animal ID')['Bin6'].mean()
    FARBin7 = FABinTable.groupby('Animal ID')['Bin7'].mean()
    FARBin8 = FABinTable.groupby('Animal ID')['Bin8'].mean()
    FARBin9 = FABinTable.groupby('Animal ID')['Bin9'].mean()
    FARBin10 = FABinTable.groupby('Animal ID')['Bin10'].mean()
    FARBin11 = FABinTable.groupby('Animal ID')['Bin11'].mean()
    FARBin12 = FABinTable.groupby('Animal ID')['Bin12'].mean()

    DBinTable = pd.read_excel(workbook, 'CSV_binsD')
    DBin1 = DBinTable.groupby('Animal ID')['Bin1'].mean()
    DBin2 = DBinTable.groupby('Animal ID')['Bin2'].mean()
    DBin3 = DBinTable.groupby('Animal ID')['Bin3'].mean()
    DBin4 = DBinTable.groupby('Animal ID')['Bin4'].mean()
    DBin5 = DBinTable.groupby('Animal ID')['Bin5'].mean()
    DBin6 = DBinTable.groupby('Animal ID')['Bin6'].mean()
    DBin7 = DBinTable.groupby('Animal ID')['Bin7'].mean()
    DBin8 = DBinTable.groupby('Animal ID')['Bin8'].mean()
    DBin9 = DBinTable.groupby('Animal ID')['Bin9'].mean()
    DBin10 = DBinTable.groupby('Animal ID')['Bin10'].mean()
    DBin11 = DBinTable.groupby('Animal ID')['Bin11'].mean()
    DBin12 = DBinTable.groupby('Animal ID')['Bin12'].mean()

    CBinTable = pd.read_excel(workbook, 'CSV_binsC')
    CBin1 = CBinTable.groupby('Animal ID')['Bin1'].mean()
    CBin2 = CBinTable.groupby('Animal ID')['Bin2'].mean()
    CBin3 = CBinTable.groupby('Animal ID')['Bin3'].mean()
    CBin4 = CBinTable.groupby('Animal ID')['Bin4'].mean()
    CBin5 = CBinTable.groupby('Animal ID')['Bin5'].mean()
    CBin6 = CBinTable.groupby('Animal ID')['Bin6'].mean()
    CBin7 = CBinTable.groupby('Animal ID')['Bin7'].mean()
    CBin8 = CBinTable.groupby('Animal ID')['Bin8'].mean()
    CBin9 = CBinTable.groupby('Animal ID')['Bin9'].mean()
    CBin10 = CBinTable.groupby('Animal ID')['Bin10'].mean()
    CBin11 = CBinTable.groupby('Animal ID')['Bin11'].mean()
    CBin12 = CBinTable.groupby('Animal ID')['Bin12'].mean()

    ISIBinTable = pd.read_excel(workbook, 'CSV_binsISI')
    ISIBin1 = ISIBinTable.groupby('Animal ID')['Bin1'].mean()
    ISIBin2 = ISIBinTable.groupby('Animal ID')['Bin2'].mean()
    ISIBin3 = ISIBinTable.groupby('Animal ID')['Bin3'].mean()
    ISIBin4 = ISIBinTable.groupby('Animal ID')['Bin4'].mean()
    ISIBin5 = ISIBinTable.groupby('Animal ID')['Bin5'].mean()
    ISIBin6 = ISIBinTable.groupby('Animal ID')['Bin6'].mean()
    ISIBin7 = ISIBinTable.groupby('Animal ID')['Bin7'].mean()
    ISIBin8 = ISIBinTable.groupby('Animal ID')['Bin8'].mean()
    ISIBin9 = ISIBinTable.groupby('Animal ID')['Bin9'].mean()
    ISIBin10 = ISIBinTable.groupby('Animal ID')['Bin10'].mean()
    ISIBin11 = ISIBinTable.groupby('Animal ID')['Bin11'].mean()
    ISIBin12 = ISIBinTable.groupby('Animal ID')['Bin12'].mean()

    FARbyStimTable = pd.read_excel(workbook, 'CSV_FARbyStim')
    FARbyStim1 = FARbyStimTable.groupby('Animal ID')['FAR stimulus 1/2'].mean()
    FARbyStim2 = FARbyStimTable.groupby('Animal ID')['FAR stimulus 3'].mean()
    FARbyStim3 = FARbyStimTable.groupby('Animal ID')['FAR stimulus 4'].mean()
    FARbyStim4 = FARbyStimTable.groupby('Animal ID')['FAR stimulus 5'].mean()

    allSummary = (meanFA, meanCR, meanISI, meanHR, meanFAR, meanD, meanC,
                  meanResponseLat, meanHitLat, meanFAlat, meanRETlat,
                  meanMagEntries, meanBIRBeam, meanFIRBeam)
    allStrategy = (maxLatBetweenHits, meanLatBetweenResponses,
                   stdevLatBetweenResponses, maxLatBetweenResponses,
                   meanTrialsBetweenResponses, stdevTrialsBetweenResponses,
                   maxTrialsBetweenResponses, meanFAboutLength,
                   stdevFAboutLength, maxFAboutLength, meanHitboutLength,
                   stdevHitboutLength, maxHitboutLength,
                   mean_retrievalFrontBeam, stdev_retrievalFrontBeam,
                   max_retrievalFrontBeam)
    allHR = (hitsBin3, hitsBin4, hitsBin5, hitsBin6, hitsBin7, hitsBin8,
             hitsBin9, hitsBin10, hitsBin11, hitsBin12)
    allFAR = (FARBin3, FARBin4, FARBin5, FARBin6, FARBin7, FARBin8, FARBin9,
              FARBin10, FARBin11, FARBin12)
    allD = (DBin3, DBin4, DBin5, DBin6, DBin7, DBin8, DBin9, DBin10, DBin11,
            DBin12)
    allC = (CBin3, CBin4, CBin5, CBin6, CBin7, CBin8, CBin9, CBin10, CBin11,
            CBin12)
    allISI = (ISIBin3, ISIBin4, ISIBin5, ISIBin6, ISIBin7, ISIBin8, ISIBin9,
              ISIBin10, ISIBin11, ISIBin12)
    allFARbyStim = (FARbyStim3, FARbyStim4)

    #SUMMARY TABLE
    meanHits = meanHits.to_frame().reset_index()
    summaryTable = meanHits.merge(meanMisses.to_frame(),
                                  left_on='Animal ID',
                                  right_index=True)
    for i in allSummary:
        summaryTable = summaryTable.merge(i.to_frame(),
                                          left_on='Animal ID',
                                          right_index=True)
    summaryTable.set_index('Animal ID', inplace=True)

    #STRATEGY TABLE
    meanLatBetweenHits = meanLatBetweenHits.to_frame().reset_index()
    stratTable = meanLatBetweenHits.merge(stdevLatBetweenHits.to_frame(),
                                          left_on='Animal ID',
                                          right_index=True)
    for i in allStrategy:
        stratTable = stratTable.merge(i.to_frame(),
                                      left_on='Animal ID',
                                      right_index=True)
    stratTable.set_index('Animal ID', inplace=True)

    #HIT TABLE
    hitsBin1 = hitsBin1.to_frame().reset_index()
    hitTable = hitsBin1.merge(hitsBin2.to_frame(),
                              left_on='Animal ID',
                              right_index=True)
    for i in allHR:
        hitTable = hitTable.merge(i.to_frame(),
                                  left_on='Animal ID',
                                  right_index=True)
    hitTable.set_index('Animal ID', inplace=True)

    #FAR TABLE
    FARBin1 = FARBin1.to_frame().reset_index()
    FARTable = FARBin1.merge(FARBin2.to_frame(),
                             left_on='Animal ID',
                             right_index=True)
    for i in allFAR:
        FARTable = FARTable.merge(i.to_frame(),
                                  left_on='Animal ID',
                                  right_index=True)
    FARTable.set_index('Animal ID', inplace=True)

    #D TABLE
    DBin1 = DBin1.to_frame().reset_index()
    DTable = DBin1.merge(DBin2.to_frame(),
                         left_on='Animal ID',
                         right_index=True)
    for i in allD:
        DTable = DTable.merge(i.to_frame(),
                              left_on='Animal ID',
                              right_index=True)
    DTable.set_index('Animal ID', inplace=True)

    #C TABLE
    CBin1 = CBin1.to_frame().reset_index()
    CTable = CBin1.merge(CBin2.to_frame(),
                         left_on='Animal ID',
                         right_index=True)
    for i in allC:
        CTable = CTable.merge(i.to_frame(),
                              left_on='Animal ID',
                              right_index=True)
    CTable.set_index('Animal ID', inplace=True)

    #ISI TABLE
    ISIBin1 = ISIBin1.to_frame().reset_index()
    ISITable = ISIBin1.merge(ISIBin2.to_frame(),
                             left_on='Animal ID',
                             right_index=True)
    for i in allISI:
        ISITable = ISITable.merge(i.to_frame(),
                                  left_on='Animal ID',
                                  right_index=True)
    ISITable.set_index('Animal ID', inplace=True)

    # FARbySTim TABLE
    FARbyStim1 = FARbyStim1.to_frame().reset_index()
    FStimTable = FARbyStim1.merge(FARbyStim2.to_frame(),
                                  left_on='Animal ID',
                                  right_index=True)
    for i in allFARbyStim:
        FStimTable = FStimTable.merge(i.to_frame(),
                                      left_on='Animal ID',
                                      right_index=True)
    FStimTable.set_index('Animal ID', inplace=True)

    list_dfs = (summaryTable, stratTable, hitTable, FARTable, DTable, CTable,
                ISITable, FStimTable)

    writer = ExcelWriter(MeanCompiled)
    nameList = ('Summary', 'Strategies', 'Hit Bins', 'False Alarm Bins',
                'D prime bins', 'Criterion bins', 'ISI bins',
                'FAR by non-target')
    for sheetName, df in zip(nameList, list_dfs):
        df.to_excel(writer, sheetName)
    writer.save()
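
# Possible compaction (an assumption, not from the source): each Bin1..Bin12 block in
# sortCSVfile() repeats the same groupby column by column. One sheet's bins can be
# averaged in a single call, e.g. for the hit-rate table:
#
#     bin_cols = ['Bin%d' % i for i in range(1, 13)]
#     hitTable = hitBinTable.groupby('Animal ID')[bin_cols].mean()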
Exemple #48
0
def prepare_excel_file(mydict):
    with ExcelWriter("validation_excel.xlsx") as writer:
        for k, v in mydict.items():
            v.to_excel(writer, sheet_name=k)
    move_files('validation_excel.xlsx')
                error_pre = math.sqrt(
                    (pi_1_3 * (1 - pi_1_3)) * (1 / (n_1 + n_3) + 1 / n_2))
                zscore = (p_1 - p_2) / Decimal(error)
                zscore_pre = (pi_1_3 - p_2) / Decimal(error_pre)
                pval = st.norm.sf(abs(float(zscore))) * 2
                pval_pre = st.norm.sf(abs(float(zscore_pre))) * 2

                significant = 0
                significant_pre = 0
                if pval < alpha:
                    significant = 1
                if pval_pre < alpha:
                    significant_pre = 1

                #------- Confidence interval for 95%, 98% or 99% -------
                ci_lower = (p_1 - p_2) + Decimal(z_value * error)
                ci_upper = (p_1 - p_2) - Decimal(z_value * error)
                ci_lower_pre = (pi_1_3 - p_2) + Decimal(z_value * error_pre)
                ci_upper_pre = (pi_1_3 - p_2) - Decimal(z_value * error_pre)

                #------- Write data to data frame -------
                df.loc[len(df)] = [
                    Decimal(x), pi_1, pi_2, pi_3, n_1, n_2, n_3, error,
                    error_pre, zscore, zscore_pre, pval, pval_pre, effectsize,
                    effectsize_pre, significant, significant_pre, ci_lower,
                    ci_upper, ci_lower_pre, ci_upper_pre
                ]

writer = ExcelWriter('df.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
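
# A self-contained sketch (names are illustrative, not from the source) of the pooled
# two-proportion z-test computed above:
import math
import scipy.stats as st

def two_proportion_ztest(x1, n1, x2, n2, alpha=0.05):
    p1, p2 = x1 / n1, x2 / n2
    pooled = (x1 + x2) / (n1 + n2)                     # pooled proportion
    se = math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
    z = (p1 - p2) / se
    pval = 2 * st.norm.sf(abs(z))                      # two-sided p-value
    return z, pval, pval < alpha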
Exemple #50
0
def main(func=lambda x: x, mode=""):
    for feature_set in FEATURES_FILES:
        print("CURR feature_set:", feature_set)
        full_features_amnt = feature_set[1] - IGNORED_COLS_NUM - 2
        n_features_list = [full_features_amnt]
        if full_features_amnt < 100:
            n_features_list += list(range(30, full_features_amnt, 15))
        else:
            n_features_list += [50]
        n_features_list += list(range(100, min(750, full_features_amnt), 150)) + \
            list(range(1000, min(6000, full_features_amnt), 1500))
        for n_features in n_features_list:
            print("CURR n_features:", n_features)
            videos_features, videos_labels = get_videos_features_labels(
                feature_set, n_features, transform=func)

            models = []
            # models += [(RandomForestClassifier(n_estimators=70, random_state=1), "RF_70_estimators")]  # Random Forest
            models += [(RandomForestClassifier(n_estimators=i, max_depth=j),
                        RF_MODEL_NAME + "_%d_trees_%d_depth" % (i, j))
                       for i in range(50, 250, 50)
                       for j in range(6, 16, 3)]  # Random Forest
            if mode not in ["binary", "change"]:
                if feature_set == FEATURES_FILES[0]:
                    models += [(make_pipeline(PolynomialFeatures(i),
                                              linear_model.Ridge()),
                                POL_MODEL_NAME + "_%d_degree" % i)
                               for i in range(1, 4)]
                else:
                    models += [(make_pipeline(PolynomialFeatures(i),
                                              linear_model.Ridge()),
                                POL_MODEL_NAME + "_%d_degree" % i)
                               for i in range(1, 2)]

            models += [(svm.LinearSVC(max_iter=2000), SVM_MODEL_NAME)]  # SVM
            perform_general_learning(
                videos_features, videos_labels, models,
                "%s_%d" % (feature_set[0][:-4], n_features), mode)
            perform_within_sub_learning(
                videos_features, videos_labels, models,
                "%s_%d" % (feature_set[0][:-4], n_features), mode)

    # saving all the data frames
    if mode != "":
        mode += "_"
    within_test_size_output = concat(WITHIN_TEST_SIZE, sort=False).transpose()
    within_test_size_output.to_csv(
        path.join(OUTPUT_PATH, "%swithin_subject_test_sizes.csv" % mode))
    within_test_vids_output = concat(WITHIN_TEST_VIDS, sort=False)
    within_test_vids_output.to_csv(
        path.join(OUTPUT_PATH, "%swithin_subject_test_videos.csv" % mode))
    for df, name in [
        (DataFrame(GENERAL_CHANCE),
         "%sgeneral_learning_chance_level.csv" % mode),
        (DataFrame(WITHIN_CHANCE), "%swithin_subject_chance_level.csv" % mode),
        (within_test_size_output, "%swithin_subject_test_sizes.csv" % mode)
    ]:
        df.to_csv(path.join(OUTPUT_PATH, name))
    for data, paradigm in [
        (EA_GENERAL_RESULTS, "%sgeneral_learning_ea" % mode),
        (EA_WITHIN_RESULTS, "%swithin_subject_ea" % mode),
        (FULL_GENERAL_RESULTS, "%sgeneral_learning_full" % mode),
        (FULL_WITHIN_RESULTS, "%swithin_subject_full" % mode)
    ]:
        with ExcelWriter(path.join(OUTPUT_PATH,
                                   paradigm + "_results.xlsx")) as writer:
            save_results(writer, data, "results")
    with ExcelWriter(path.join(OUTPUT_PATH,
                               '%sall_models_results.xlsx' % mode)) as writer:
        general_results = save_all_models(writer, ALL_MODELS_GENERAL_RESULTS,
                                          "general")
        within_results = save_all_models(writer, ALL_MODELS_WITHIN_RESULTS,
                                         "within")
    with ExcelWriter(path.join(OUTPUT_PATH,
                               '%sall_models_ea.xlsx' % mode)) as writer:
        general_ea = save_all_models(writer, ALL_MODELS_GENERAL_EA, "general")
        within_ea = save_all_models(writer, ALL_MODELS_WITHIN_EA, "within")
    with ExcelWriter(
            path.join(OUTPUT_PATH,
                      '%sall_models_train_results.xlsx' % mode)) as writer:
        general_train = save_all_models(writer, ALL_MODELS_GENERAL_TRAIN,
                                        "general")

    # plots train-test trade-off
    plot_train_test(general_results, general_train, mode)

    # calculate the accuracy and EA correlation
    accuracy_ea_correlation(mode, general_results, general_ea, within_results,
                            within_ea)

    general_test = concat(ALL_MODELS_GENERAL_TEST, sort=False).transpose()
    general_test.to_csv(
        path.join(OUTPUT_PATH, "%sall_models_test_results.csv" % mode))
    general_test = general_test.values.flatten()
    general_ea_test = concat(ALL_MODELS_GENERAL_EA_TEST,
                             sort=False).transpose().values.flatten()
    best_acc = np.nanargmax(general_results.values)
    best_ea = np.nanargmax(general_ea.values)
    best_acc_model_name = general_results.index[best_acc //
                                                len(general_results.columns)]
    best_ea_model_name = general_ea.index[best_ea // len(general_ea.columns)]
    df = DataFrame({
        "best_acc_model (%s)" % best_acc_model_name: {
            "accuracy": general_test[best_acc],
            "ea": general_ea_test[best_acc]
        },
        "best_ea_model (%s)" % best_ea_model_name: {
            "accuracy": general_test[best_ea],
            "ea": general_ea_test[best_ea]
        }
    })
    df.to_csv(path.join(OUTPUT_PATH, "%stest_results.csv" % mode))
    print("--------------TEST-------------")
    print(df)
    print("--------------TEST-------------")
def output_report(
        filename, subject_enrollments_by_date, search_enrollments_by_date, subject_clicks_by_date,
        search_clicks_by_date, total_course_card_clicks_by_date, total_program_course_cards_by_date,
        total_homepage_views=None, total_course_card_clicks=None, total_program_card_clicks=None, featured_cards=None,
        homepage_subjects=None):
    writer = ExcelWriter(filename, engine='xlsxwriter')

    # Get access to the workbook
    workbook = writer.book

    # Set the formats needed for the report
    money_fmt = workbook.add_format({'num_format': '$#,##0', 'bold': True})
    percent_fmt = workbook.add_format({'num_format': '0.0%', 'bold': False})
    comma_fmt = workbook.add_format({'num_format': '#,##0', 'bold': False})
    date_fmt = workbook.add_format({'num_format': 'dd/mm/yy'})
    cell_format = workbook.add_format({'bold': True, 'italic': False})
    merge_format = workbook.add_format(
        {
            'bold': 1,
            'align': 'center',
            'valign': 'vcenter',
        }
    )


    # Create the homepage courses featured_cards_worksheet
    if featured_cards is not None:
        total_search_clicks = int(search_clicks_by_date['uniqueClicks'].sum())
        total_subject_clicks = int(subject_clicks_by_date['uniqueClicks'].sum())
        total_search_enrollments = int(search_enrollments_by_date['uniqueEnrollments'].sum())
        total_subject_enrollments = int(subject_enrollments_by_date['uniqueEnrollments'].sum())
        search_enrollment_conversion_rate = float(total_search_enrollments) / total_homepage_views
        subject_enrollment_conversion_rate = float(total_subject_enrollments) / total_homepage_views

        # The total course card clicks and total program card clicks values are off so use these instead
        all_course_cards_clicks = int(total_course_card_clicks_by_date['uniqueClicks'].sum())
        all_program_course_cards_clicks = int(total_program_course_cards_by_date['uniqueClicks'].sum())

        total_clicks = all_course_cards_clicks + all_program_course_cards_clicks + total_search_clicks + total_subject_clicks
        total_enrolls = int(featured_cards['uniqueEnrolls'].sum()) + total_search_enrollments + total_subject_enrollments

        featured_cards.to_excel(writer, index=False, sheet_name='Featured Card Report', startrow=18)
        featured_cards_worksheet = writer.sheets['Featured Card Report']

        # Set column width and formatting
        featured_cards_worksheet.set_column('A:A', 60)
        featured_cards_worksheet.set_column('D:D', 15, comma_fmt)
        featured_cards_worksheet.set_column('E:E', 15, percent_fmt)
        featured_cards_worksheet.set_column('F:H', 15, comma_fmt)
        featured_cards_worksheet.set_column('I:O', 15, percent_fmt)

        # Write headings
        featured_cards_worksheet.write(
            'A1',
            'Homepage Course Enrollments, Data from {start} to {end}'.format(start=start_date, end=end_date),
            cell_format
        )
        featured_cards_worksheet.write('A3', 'Overview', cell_format)
        featured_cards_worksheet.write('A4', 'Total Homepage Views:', cell_format)
        featured_cards_worksheet.write('A6', 'Total Clicks on Home Page:', cell_format)
        featured_cards_worksheet.write('A7', '     feat. course clicks', cell_format)
        featured_cards_worksheet.write('A8', '     feat. program clicks', cell_format)
        featured_cards_worksheet.write('A9', '     feat. search clicks', cell_format)
        featured_cards_worksheet.write('A10', '     feat. subject clicks', cell_format)
        featured_cards_worksheet.write('A11', 'Total CTR', cell_format)
        featured_cards_worksheet.write('C12', 'card conversion', cell_format)
        featured_cards_worksheet.write('A13', 'Total Enrollments from card clicks:', cell_format)
        featured_cards_worksheet.write('A14', '      enrollment on course about (top+bottom)', cell_format)
        featured_cards_worksheet.write('A15', '      enrollment on program about', cell_format)
        featured_cards_worksheet.write('A16', '      enrollment on search', cell_format)
        featured_cards_worksheet.write('A17', '      enrollment on subject card clicks', cell_format)
        featured_cards_worksheet.write('A18', 'Top Performing Cards + Conversion', cell_format)

        featured_cards_worksheet.merge_range('F18:H18', 'enrollment events from card click', merge_format)
        featured_cards_worksheet.merge_range('I18:K18', 'conversion from card click', merge_format)
        featured_cards_worksheet.merge_range('L18:M18', 'clickshare vs. other cards', merge_format)
        featured_cards_worksheet.merge_range('N18:O18', 'enrollments per impression', merge_format)

        # Write Overview Data
        featured_cards_worksheet.write('B4', total_homepage_views, comma_fmt)
        featured_cards_worksheet.write('B6', int(total_clicks), comma_fmt)
        featured_cards_worksheet.write('B7', all_course_cards_clicks, comma_fmt)
        featured_cards_worksheet.write('B8', all_program_course_cards_clicks, comma_fmt)
        featured_cards_worksheet.write('B9', total_search_clicks, comma_fmt)
        featured_cards_worksheet.write('B10', total_subject_clicks, comma_fmt)
        featured_cards_worksheet.write('B11', float(total_clicks)/total_homepage_views, percent_fmt)
        featured_cards_worksheet.write('B13', int(total_enrolls), comma_fmt)
        featured_cards_worksheet.write('B14', int(featured_cards['uniqueCourseEnrolls'].sum()), comma_fmt)
        featured_cards_worksheet.write('B15', int(featured_cards['uniqueProgramEnrolls'].sum()), comma_fmt)
        featured_cards_worksheet.write('B16', total_search_enrollments, comma_fmt)
        featured_cards_worksheet.write('B17', total_subject_enrollments, comma_fmt)
        featured_cards_worksheet.write('C13', float(total_enrolls)/total_homepage_views, percent_fmt)
        featured_cards_worksheet.write('C14', float(featured_cards['uniqueCourseEnrolls'].sum()) / total_homepage_views, percent_fmt)
        featured_cards_worksheet.write('C15', float(featured_cards['uniqueProgramEnrolls'].sum()) / total_homepage_views, percent_fmt)
        featured_cards_worksheet.write('C16', search_enrollment_conversion_rate, percent_fmt)
        featured_cards_worksheet.write('C17', subject_enrollment_conversion_rate, percent_fmt)

    if homepage_subjects is not None:
        homepage_subjects.to_excel(writer, index=False, sheet_name='HomepageSubjects', startrow=2)

        # Get the homepage subject worksheet
        homepage_subject_worksheet = writer.sheets['HomepageSubjects']

        # Set conditional format
        homepage_subject_worksheet.conditional_format('C1:C1000', {'type': '3_color_scale'})

        # Set column width and formatting
        homepage_subject_worksheet.set_column('A:A', 27)
        homepage_subject_worksheet.set_column('B:B', 15, comma_fmt)
        homepage_subject_worksheet.set_column('C:S', 15, percent_fmt)

        # Write heading
        homepage_subject_worksheet.write(
            'A1',
            'Top Subject Pages from the Homepage, Data from {start} to {end}'.format(start=start_date, end=end_date),
            cell_format
        )

    # Write out the .xlsx file
    writer.save()
                    dt.date.today().strftime("%Y%m%d"))),
            action='store')

        args = parser.parse_args()
        if args.duplication_type == "RPA" and args.rpa_file is None:
            raise argparse.ArgumentTypeError(
                "no RPA file specified for duplication type of {0}".format(
                    args.duplication_type))

        if args.duplication_type == 'RPA':
            print("evaluating RPA inventory to ETA submissions worklist")
        elif args.duplication_type == "COLLAB":
            print("evaluating ETA submissions worklist against itself")

        evaluator = de.DupEvaluator(args)
        dup_df = evaluator.dup_eval()

        print("{0:d} duplicates found".format(dup_df.shape[0]))
        print("saving duplicates to {0}".format(args.output_file))
        with ExcelWriter(os.fspath(args.output_file)) as writer:
            dup_df.to_excel(writer, index=False)

    except Exception as e:
        print(e, file=sys.stdout)
        rv += 1
    finally:
        print("end {0} (elapsed time: {1})".format(parser.prog,
                                                   dt.datetime.now() -
                                                   startdt))
        sys.exit(rv)
import pandas as pd
from pandas import ExcelWriter
import os
#import itertools
import numpy as np

os.chdir(
    r'C:\Users\nauga\Google Drive\BuildingPrognostics\ForecastingTrainingData')
df = pd.ExcelFile('JunJulyCleanedData2.xlsx').parse('Sheet1')
df.replace('', np.nan, inplace=True)
#newpd = pd.DataFrame(df.iloc[0::6, 0::])###
#print newpd

newpd = df[np.isfinite(df['Hourly Totalization.Hourly Totals Trend ()'])]
writer = ExcelWriter('JunJulyFinalData.xlsx')
newpd.to_excel(writer, 'Sheet1')
writer.save()

#newpd = df[np.isfinite(df['Hourly Totalization.Hourly Totals Trend ()'])]

#df = pd.ExcelFile('reqData.xlsx').parse('Sheet3')
#df.replace('', np.nan, inplace=True)
#print df
dcr['sterile'] = dcr.doctor_synopsis.apply(lambda x: exist(x, '不孕不育'))  # infertility
dcr['digestion'] = dcr.doctor_synopsis.apply(lambda x: exist(x, '肠胃'))  # gastrointestinal
dcr['digestion_2'] = dcr.doctor_synopsis.apply(lambda x: exist(x, '消化'))  # digestion
dcr['urinary'] = dcr.doctor_synopsis.apply(lambda x: exist(x, '泌尿'))  # urinary

synopsis_extract_list = [hco, dept, dcr]
save_xls(synopsis_extract_list, 'synopsis_extract.xlsx')

# Ophthalmology (眼科) departments
d0 = dcr.loc[dcr.dept.str.contains('眼科', na=False)]

# Fundus disease (眼底病) specialists
d1 = d0[d0['doctor_skill'].str.contains('眼底病', na=False)]
d2 = d0[d0['doctor_synopsis'].str.contains('眼底病', na=False)]
d3 = d1.append(d2).drop_duplicates()
writer = ExcelWriter('ophthalmology.xlsx')
d3.to_excel(writer, 'Sheet1')
writer.save()

# Doctors with an expertise in infertility (不孕不育)
d4 = dcr[dcr.doctor_skill.str.contains('不孕不育', na=False)]
d5 = dcr[dcr.doctor_synopsis.str.contains('不孕不育', na=False)]
d5 = d4.append(d5).drop_duplicates()
writer = ExcelWriter('sterile.xlsx')
d5.to_excel(writer, 'Sheet1')
writer.save()

len(d5)
len(d5.hospital.unique())
len(d5.groupby(['hospital', 'dept']).count())
Exemple #55
0
import os, pickle
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np

in_path = os.path.join(os.pardir, "aspects", "new_aspects.pkl")
relations_file = open(in_path, 'rb')
relations = pickle.load(relations_file)
relations_file.close()

keywords = set()
for topics in relations.values():
    for sets in topics.values():
        for _, relation, _, _, _ in sets:
            for word in relation.split():
                keywords.add(word)
print(len(keywords))
# out_path = os.path.join(os.pardir, "aspects", "keywords.txt")
# with open(out_path, 'a') as f:
#     for w in keywords:
#         f.write(w+"\n")
df = pd.DataFrame({'keywords': list(keywords)})
out_path = os.path.join(os.pardir, "aspects", "keywords.xlsx")
writer = ExcelWriter(out_path)
df.to_excel(writer, 'Sheet1', index=False)
writer.save()
Exemple #56
0
 def time_write_excel(self, engine):
     bio = BytesIO()
     bio.seek(0)
     writer = ExcelWriter(bio, engine=engine)
     self.df.to_excel(writer, sheet_name="Sheet1")
     writer.save()
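
# Possible follow-up (an assumption, not from the source): after save(), the workbook
# lives in the BytesIO buffer, so it can be read straight back for a round-trip check:
#
#     bio.seek(0)
#     roundtrip = pd.read_excel(bio, sheet_name="Sheet1", index_col=0)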
Exemple #57
0
def ExcelWrite(yourdf, file, sheet):

    writer = ExcelWriter(file)
    yourdf.to_excel(writer, sheet)
    writer.save()
Exemple #58
0
                c.append('Gujarat')
            if value2.endswith('(UT)'):
                a.append(key2)
                c.append('Uttarakhand')
            if value2.endswith('(RJ)'):
                a.append(key2)
                c.append('Rajasthan')
            if value2.endswith('(SK)'):
                a.append(key2)
                c.append('Sikkim')
            if value2.endswith('(WB)'):
                a.append(key2)
                c.append('West Bengal')
            if value2.endswith('(CG)'):
                a.append(key2)
                c.append('Chhattisgarh')
            if value2.endswith('(TG)'):
                a.append(key2)
                c.append('Telangana')

        s = dict(zip(a, c))
        f = {'Cadre': s}
b.update(f)
print(b)
df = pd.DataFrame(b)
writer = ExcelWriter('(final)appendixc2.xlsx', engine='xlsxwriter')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()
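
# A possible compaction of the suffix checks above (a sketch; the mapping is taken
# only from the branches visible here):
#
#     STATE_BY_SUFFIX = {
#         '(UT)': 'Uttarakhand', '(RJ)': 'Rajasthan', '(SK)': 'Sikkim',
#         '(WB)': 'West Bengal', '(CG)': 'Chhattisgarh', '(TG)': 'Telangana',
#     }
#     for suffix, state in STATE_BY_SUFFIX.items():
#         if value2.endswith(suffix):
#             a.append(key2)
#             c.append(state)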


Exemple #59
0
        # check the search results
        elem = driver.find_element_by_class_name('results')
        div_list = elem.find_elements_by_tag_name('div')
        # scrape every search result into a list
        results = []
        for div in div_list:
            results.append(div.text)
            a_tags = div.find_elements_by_tag_name('a')
            if a_tags:
                for a_tag in a_tags:
                    link = a_tag.get_attribute('href')
                    results.append(link)
        # split the collected results into chunks of 12 and store them as a new list
        result = [
            results[i * 12:(i + 1) * 12]
            for i in range((len(results) + 12 - 1) // 12)
        ]
        df = DataFrame(data=result)
        filePath = f'Exam-{idx}.xlsx'
        with ExcelWriter(filePath) as writer:
            df.to_excel(writer, index=False)
        # print the result
        print(result)
except Exception as e:
    # print the error if one was raised in the code above
    print(e)
finally:
    # runs regardless of errors; would quit the Chrome driver
    pass
    # driver.quit()
class ImpactTableGenerator:
    def __init__(self,
                 config_file,
                 input_filename=None,
                 output_filename_base=None,
                 output_directory=None):

        # create parameters
        self.__dict__.update(utils.get_config(config_file))

        # Load list of locations
        self.locations_df = self._load_locations()

        # load spacy model
        logger.info("Loading model {}".format(self.model))
        self.nlp = spacy.load(self.model)

        # get df of articles
        self.articles_df = self._load_articles(input_filename)

        # get keywords
        self.keywords = ImpactTableGenerator._get_keywords(config_file)

        # prepare output
        if output_filename_base is None:
            output_filename_base = 'impact_data_{keyword}_{country}'.format(
                keyword=self.keyword, country=self.country)
        self.output_filename_base = output_filename_base
        if output_directory is None:
            self.output_directory = OUTPUT_DIRECTORY
        else:
            self.output_directory = output_directory

        if not os.path.exists(self.output_directory):
            os.makedirs(self.output_directory)
        self.writer = ExcelWriter(
            os.path.join(self.output_directory,
                         self.output_filename_base + '.xlsx'))
        self.df_impact = ImpactTableGenerator._make_df_impact()

    def loop_over_articles(self):
        n_articles = len(self.articles_df)
        for id_row in range(n_articles):
            logger.info("Analyzing article {}/{}...".format(
                id_row + 1, n_articles))
            article = Article.Article(self.articles_df.iloc[id_row],
                                      self.language, self.keywords, self.nlp,
                                      self.locations_df)
            article.analyze(self.language, self.keywords, self.df_impact)

            logger.info("...finished article {}/{}, updating file\n".format(
                id_row + 1, n_articles))

            if not self.df_impact.empty:
                self.df_impact.to_csv(os.path.join(
                    self.output_directory, self.output_filename_base + '.csv'),
                                      mode='w',
                                      encoding='utf-8',
                                      sep='|')
                self.df_impact.to_excel(self.writer, sheet_name='Sheet1')
                self.writer.save()

        logger.info('found {} entries'.format(len(self.df_impact)))
        self.df_impact.dropna(how='all', inplace=True)
        logger.info('{}'.format(self.df_impact.describe()))
        logger.info('{}'.format(self.df_impact.head()))
        self.df_impact.to_csv(os.path.join(self.output_directory,
                                           self.output_filename_base + '.csv'),
                              mode='w',
                              encoding='utf-8',
                              sep='|')
        self.df_impact.to_excel(self.writer, sheet_name='Sheet1')
        self.writer.save()

    @staticmethod
    def _make_df_impact():
        levels = [[], [], []]
        codes = [[], [], []]
        names = ['location', 'date', 'article_num']
        columns = [
            'damage_livelihood', 'damage_general', 'people_affected',
            'people_dead', 'houses_affected', 'livelihood_affected',
            'infrastructures_affected', 'infrastructures_mentioned',
            'sentence(s)', 'article_title'
        ]
        return pd.DataFrame(index=pd.MultiIndex(levels=levels,
                                                codes=codes,
                                                names=names),
                            columns=columns)

    def _load_articles(self, input_filename):
        # load DataFrame with articles
        input_directory = utils.INPSECTED_ARTICLES_OUTPUT_DIR
        if input_filename is None:
            input_filename = utils.get_inspected_articles_output_filename({
                'keyword':
                self.keyword,
                'country':
                self.country
            })
        df = pd.read_csv(os.path.join(input_directory, input_filename),
                         sep='|').drop_duplicates(['title', 'text'],
                                                  keep=False)
        df['publish_date'] = df['publish_date'].apply(pd.to_datetime)
        logger.info('got {} articles:'.format(len(df)))
        logger.info('{} -- {}'.format(
            df['publish_date'].min().strftime('%Y-%m-%d'),
            df['publish_date'].max().strftime('%Y-%m-%d')))
        return df

    @staticmethod
    def _get_keywords(config_file):
        keywords_config = utils.get_keywords(config_file)

        keyword_list = [
            'donation', 'type_livelihood', 'type_people_multiple',
            'type_people_death', 'list_verb_death', 'type_house',
            'local_currency_names_short', 'currency_short',
            'local_currency_code', 'local_currency_names_long',
            'currency_long', 'titles'
        ]

        keywords = {
            keyword: ast.literal_eval(keywords_config[keyword])
            for keyword in keyword_list
        }

        keywords['type_people'] = utils.read_keyword_csv(
            keywords_config['filename_type_people'])
        keywords['type_infrastructure'] = utils.read_keyword_csv(
            keywords_config['filename_type_infrastructures'])
        keywords['currency_short'] = keywords[
            'local_currency_names_short'] + keywords['currency_short']
        keywords['currency_long'] = keywords[
            'local_currency_names_long'] + keywords['currency_long']
        return keywords

    def _load_locations(self):
        """
        build a dictionary of locations {name: coordinates}
        from a gazetteer in tab-separated csv format (http://geonames.nga.mil/gns/html/namefiles.html)
        """
        input_file = os.path.join(LOCATIONS_FOLDER, self.country,
                                  self.country_short + '_administrative_a.txt')
        columns = ['FULL_NAME_RO', 'FULL_NAME_ND_RO', 'LAT', 'LONG', 'ADM1']
        locations_df = pd.read_csv(input_file,
                                   sep='\t',
                                   encoding='utf-8',
                                   usecols=columns)
        input_file = os.path.join(LOCATIONS_FOLDER, self.country,
                                  self.country_short + '_localities_l.txt')
        locations_df = locations_df.append(pd.read_csv(input_file,
                                                       sep='\t',
                                                       encoding='utf-8',
                                                       usecols=columns),
                                           ignore_index=True)
        input_file = os.path.join(
            LOCATIONS_FOLDER, self.country,
            self.country_short + '_populatedplaces_p.txt')
        locations_df = locations_df.append(pd.read_csv(input_file,
                                                       sep='\t',
                                                       encoding='utf-8',
                                                       usecols=columns),
                                           ignore_index=True)
        locations_df = locations_df[
            ~locations_df['FULL_NAME_ND_RO'].str.contains(self.country)]
        locations_df["ADM1"] = pd.to_numeric(locations_df["ADM1"],
                                             errors='coerce')
        return locations_df
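
# Hypothetical usage sketch (file and directory names are assumptions, not from the
# source):
#
#     generator = ImpactTableGenerator('config.cfg')
#     generator.loop_over_articles()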