def load_master_initial_merge():
    """
    GitHub Issue #3 should be fixed: Initial merge is Master SCORP .csv + GEO .xlsx file.
    Logic should pull street type and address fields from Geo and overwrite master.
    """
    print("Loading SCORP Master...")
    scorp_master_file = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\MasterSCORP_Base.xlsx"
    print("Loading SCORP GEO...")
    geo_master_file = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\SCORP_FILTER_GEO.xlsx"

    # Set index_col so that data matches correctly.
    sm = pd.read_excel(scorp_master_file, index_col="OBJECTID")
    gm = pd.read_excel(geo_master_file, index_col="OBJECTID")

    # Take street type, street, town from GEO, where available.
    sm["Street_Type"] = gm["Street_Type"]
    sm["Street"] = gm["Street"]
    sm['Town'] = gm['Town']
    print("Updated Street_Type, Street, Town from GEO to Master.")

    # Export MasterSCORP_Updated.csv to be the new master,
    # then return the dataframe to whoever called it.
    export_filename = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\MasterSCORP_Updated.csv"
    print("Exporting to {}".format(export_filename))
    sm.to_csv(export_filename)
    return sm
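# A hedged alternative to the direct column assignment above: DataFrame.update
# overwrites the master only where the GEO frame actually has a value, which
# matches the "where available" wording in the docstring more literally.
# Sketch only; assumes both frames are indexed by OBJECTID as in
# load_master_initial_merge().
def overwrite_from_geo(sm, gm, columns=("Street_Type", "Street", "Town")):
    # update() aligns on the index and skips NaNs in the incoming frame
    sm.update(gm[list(columns)])
    return sm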
def magicitems(dict, roll, row):
    if dict[roll]['MI Numb'] != '0':
        if dict[roll]['MI Numb 2'] != '0':
            times2 = diceroller(dict[roll]['MI Numb 2'])
            y = 0
            items2dict = {}
            try:
                mdf2 = pd.read_excel('Items.xlsx', sheet_name=dict[roll]['Item 2'],
                                     index_col=0, usecols='E:F')
                mitems2 = dictcreator(mdf2)
                while y < times2:
                    rolls = random.randint(1, 100)
                    items2dict['var_' + str(y)] = tk.Label(root, text=mitems2[rolls]['Item']).grid(
                        row=row + 1, column=1, columnspan=2)
                    y += 1
                    row += 1
            except Exception:  # skip if the sheet named in the table is missing or unreadable
                pass
        times = diceroller(dict[roll]['MI Numb'])
        x = 0
        itemsdict = {}
        try:
            mdf = pd.read_excel('Items.xlsx', sheet_name=dict[roll]['Item'],
                                index_col=0, usecols='E:F')
            mitems = dictcreator(mdf)
            while x < times:
                rolls = random.randint(1, 100)
                # one label per rolled item
                itemsdict["var_" + str(x)] = tk.Label(root, text=mitems[rolls]['Item']).grid(
                    row=row + 1, column=1, columnspan=2)
                x += 1
                row += 1
        except Exception:
            pass
    art(dict, roll, row)
def main():
    # Several datafiles, each with a long list of subjects
    # Directory path variable assignment, assumes script is in working directory!!!
    DATA = "data"
    MEASURE = "measure"
    EXCEL = "excel_files"

    # Mainly for testing purposes
    if len(sys.argv) > 1:
        DATA = os.path.join(sys.argv[1], DATA)
        MEASURE = os.path.join(sys.argv[1], MEASURE)
        EXCEL = os.path.join(sys.argv[1], EXCEL)

    # Create a dictionary with Subtest #s as the keys and a list of the data
    # file as values. Uses a Dictionary Comprehension
    SubTestIndex = [os.path.split(_file)[1].split('_')[0].split('Test')[1]
                    for _file in glob(os.path.join(DATA, "*.txt"))]

    for sID in SubTestIndex:
        # sID => subtest ID, eg. Sub[03A]
        pXLSX = os.path.join(EXCEL, "Sub{0}_person_measure.xlsx".format(sID))
        pTXT = os.path.join(MEASURE, "Sub{0}_person_measure.txt".format(sID))
        if os.path.exists(pXLSX):
            person_measure = pd.read_excel(pXLSX, header=None,
                                           names=['Scores', 'NaN', 'SubID', '_SubID', '_NaN'])
            person_output = person_measure[['SubID', 'Scores']]
            person_output.to_csv(pTXT, sep='\t', index=False, header=False)

            iXLSX = os.path.join(EXCEL, "Sub{0}_item_measure.xlsx".format(sID))
            iTXT = os.path.join(MEASURE, "Sub{0}_item_measure.txt".format(sID))
            pd.read_excel(iXLSX, header=None).to_csv(iTXT, sep='\t', index=False, header=False)
def grouping_parcels(group):
    # Parcel to census lookup
    df = pd.read_csv(r'R:\Brice\gis\parcels_urbansim_census.txt')

    # add low income geography tag
    low_inc = pd.read_excel(r'R:\Brice\gis\special-needs\ACS_15_5YR_Low-income.xlsx', sheet_name='Map-income')
    minority = pd.read_excel(r'R:\Brice\gis\special-needs\ACS_15_5YR_Minority.xlsx', sheet_name='Mapping')

    if group == 'low_income':
        # Threshold for % of households as total for determining low income or not
        income_threshold = 0.5

        # Define low_inc tracts as those with more HH below 200% median income than those above it
        low_inc['% low inc'] = low_inc['Below200'] / low_inc['Total']

        # Create flag for whether low income or not
        low_inc.loc[low_inc['% low inc'] >= income_threshold, 'low_inc_tract'] = 1
        low_inc.loc[low_inc['% low inc'] < income_threshold, 'low_inc_tract'] = 0

        # Merge with parcel file
        newdf = pd.merge(df, low_inc[['GEOID10', 'low_inc_tract']], on='GEOID10', how='left')
        parcels_list = newdf[newdf['low_inc_tract'] == 1].parcelid.values

    elif group == 'minority':
        minority['% minority'] = minority['Minority'] / minority['Total']
        minority_threshold = 0.5
        minority.loc[minority['% minority'] >= minority_threshold, 'minority_tract'] = 1
        minority.loc[minority['% minority'] < minority_threshold, 'minority_tract'] = 0

        # Merge with parcel file
        newdf = pd.merge(df, minority[['GEOID10', 'minority_tract']], on='GEOID10', how='left')
        parcels_list = newdf[newdf['minority_tract'] == 1].parcelid.values

    return parcels_list
def preprocess_greyc_nislab(in_file, out_file): """ Preprocess the raw GREYC NISLAB dataset """ df = pd.concat([pd.read_excel(in_file, sheetname=0), pd.read_excel(in_file, sheetname=1), pd.read_excel(in_file, sheetname=2), pd.read_excel(in_file, sheetname=3), pd.read_excel(in_file, sheetname=4)]) df = df[df['Class'] == 2] df['age'] = (df['Age'] < 30).map({True: '<30', False: '>=30'}) df['gender'] = df['Gender'].map({'F': 'female', 'M': 'male'}) df['handedness'] = df['Handedness'].map({'L': 'left', 'R': 'right'}) df['session'] = np.arange(len(df)) df['password'] = df['Password'].map({ 'leonardo dicaprio': 1, 'the rolling stones': 2, 'michael schumacher': 3, 'red hot chilli peppers': 4, 'united states of america': 5, }) def preprocess_row(idx_row): idx, row = idx_row keyname = list(map(lambda x: 'space' if x == ' ' else x, list(row['Password']))) v = np.array(row['Keystroke Template Vector'].strip().split()).astype(int) // 10000 s = len(keyname) - 1 pp, rr, pr, rp = [v[s * i:s * (i + 1)] for i in range(4)] timepress = np.r_[0, pp].cumsum() # Offset the first release time by the duration of the first key timerelease = np.r_[rp[0] - rr[0], rr].cumsum() # There are ~180 rows where timerelease == timepress. # Fix these by assuming at least the minimum standard clock resolution timerelease[timerelease == timepress] += 16 sample = pd.DataFrame.from_items([ ('user', row['User_ID']), ('session', row['session']), ('password', row['password']), ('age', row['age']), ('gender', row['gender']), ('handedness', row['handedness']), ('timepress', timepress), ('timerelease', timerelease), ('keyname', keyname) ]) return sample df = pd.concat(map(preprocess_row, df.iterrows())) df = df.set_index(['user', 'session'])[COLS] df = remove_repeated_keys(df) df.to_csv(out_file) return
def labelspiezo(self):
    """
    Uses two sets of labels from EEG scorers to keep only those segments
    corresponding to agreed-upon scores
    """
    # First get file names
    lbls1name = self.filename.strip('.mat') + '.xls'
    lbls2name = self.filename.strip('.mat') + '_2.xls'
    # Import scores as dataframes
    lbls1 = pd.read_excel(self.filepath + lbls1name, header=None)
    lbls2 = pd.read_excel(self.filepath + lbls2name, header=None)
    # Concatenate into same dataframe and keep segments where equal
    concatted = pd.concat([lbls1[0], lbls2[0]], axis=1)
    concatted.columns = ['scorer1', 'scorer2']
    scoredf = concatted[concatted['scorer1'] == concatted['scorer2']]
    # scoredf is a dataframe with indices corresponding to the piezo
    # segments where there is agreement, and the identical labels in
    # each column

    # first reshape the piezo (integer division keeps the shape integral)
    npr = np.reshape(self.piezo, [len(self.piezo) // (self.fs * 4), self.fs * 4])
    # this single slice of the reshaped piezo matrix retains only
    # segments that were double-scored
    self.piezomat = npr[scoredf.index]
    # to_numpy ensures indices are not saved since we need only labels
    self.labels = scoredf['scorer1'].to_numpy()
def vadir_get_cnames_replace(df_list, df_to_use):
    """
    This function determines the column differences between each of the
    excel files passed in.
    INPUT: list of excel files to import and the file with the right column
           names to use to compare against
    OUTPUT: dictionary of excel files as keys and list of unmatched columns
            as values of the dictionary
    """
    columns_to_use = []
    other_columns = {}
    unmatched_c = {}
    for df in df_list:
        if df == df_to_use:
            df_import = pd.read_excel(df)
            c_row = vadir_column_data_row(df_import)
            columns_to_use = vadir_clean_cnames(df_import, c_row)
            unmatched_c[df] = columns_to_use
        else:
            df_import = pd.read_excel(df)
            c_row = vadir_column_data_row(df_import)
            other_columns[df] = vadir_clean_cnames(df_import, c_row)
    for df, columns in other_columns.items():
        unmatched_c[df] = [c for c in columns if c not in columns_to_use]
    return unmatched_c
def download_iter(self, file, save_path="."): file_path = os.sep.join([save_path, file]) if not os.path.exists(save_path): os.mkdir(save_path) with open(file_path, "wb") as cache: try: self.retrbinary("RETR %s" % file, cache.write) except: yield "", "", pandas.DataFrame(),False return if not zipfile.is_zipfile(file_path): ef = pandas.ExcelFile(file_path) yield file, ef.sheet_names[0], pandas.read_excel(ef),False return with zipfile.ZipFile(file_path, "r") as zip: xlss = [] sheet_name = "" for name in zip.namelist(): ef = pandas.ExcelFile(zip.open(name)) sheet_name = ef.sheet_names[0] xls = pandas.read_excel(ef) yield name, sheet_name, xls,True xlss.append(xls)
def qmflt(name="qfl.xlsx", Width=1, Color='k'):
    # Read from CSV or Excel depending on the file name
    if "csv" in name:
        QmFLtRaw = pd.read_csv(name)
    elif "xlsx" in name:
        QmFLtRaw = pd.read_excel(name)
    qmfltline(Width, Color)
    Points = len(QmFLtRaw)
    for i in range(Points):
        q = QmFLtRaw.at[i, 'Qm']
        f = QmFLtRaw.at[i, 'F']
        l = QmFLtRaw.at[i, 'Lt']
        Q = 100 * q / (q + f + l)
        F = 100 * f / (q + f + l)
        L = 100 * l / (q + f + l)
        x = Q / 2 + (100 - Q) * L / (L + F)
        y = Q / 2 * math.sqrt(3)
        plotpoint(x, y, QmFLtRaw.at[i, 'Size'], QmFLtRaw.at[i, 'Color'],
                  QmFLtRaw.at[i, 'Alpha'], QmFLtRaw.at[i, 'Marker'])
    plt.savefig("QmFLt-Plot.png", dpi=600)
    plt.savefig("QmFLt-Plot.svg", dpi=600)
    plt.show()
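# Quick hedged check of the ternary coordinate transform used above, with made-up
# point counts: Qm=50, F=25, Lt=25 gives Q=50, F=25, L=25, so
# x = Q/2 + (100-Q)*L/(L+F) = 25 + 50*25/50 = 50 and y = Q/2*sqrt(3) ≈ 43.3,
# i.e. a point midway between the F and Lt corners, halfway up toward the Qm apex.
import math
q, f, l = 50, 25, 25
Q, F, L = (100 * v / (q + f + l) for v in (q, f, l))
print(Q / 2 + (100 - Q) * L / (L + F), Q / 2 * math.sqrt(3))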
def main(args):
    # Load the answer key
    # Answer key must have the headings ['Problem', 'Your Answer', 'Answer Format']
    answer_df = pd.read_excel(args.answer_key, sheet_name=0)

    # Scores
    scores = {}

    # Go through the individual sheets
    for student_answer in glob.glob(os.path.join(args.assign_dir, '*xlsx')):
        print(student_answer)
        student_df = pd.read_excel(student_answer, sheet_name=0)

        # Check to make sure that the column headings are equal
        if (student_df.columns != answer_df.columns).any():
            print('ERROR with: %s' % student_answer)
        else:
            # Proceed with grading
            equal = (answer_df['Your Answer'].str.lower() ==
                     student_df['Your Answer'].str.lower())
            # Count matching and non-matching answers
            eqval = equal.value_counts()
            # Pull the student's name
            path, fname = os.path.split(student_answer)
            student_name = fname.split('_')[0]
            # Number of correct (True) answers is the score
            scores[student_name] = eqval[eqval.index == True].values[0]

    # Sort and print a csv
    with open('exam_scores.csv', 'w') as wfile:
        print('Student,Score', file=wfile)
        for sname in sorted(list(scores.keys())):
            print('%s,%d' % (sname, scores[sname]), file=wfile)
def get_from_excel(data_path, extra_sheet=None):
    '''
    This opens a file dialog allowing to select an excel file containing
    the tracked data, and returns a :class:`CellCluster` object.

    Parameters
    ----------
    data_path: the path to the excelTM file

    Returns
    -------
    cellcluster : a :class:`CellCluster` instance
        the container class for the tracking

    Notes
    -----
    The excel file should follow the structure of `excel_trajs_example.xlsx`
    in the project's `data` directory
    '''
    ### Read the data
    trajs = pd.read_excel(data_path, 0)
    trajs.t_stamp = trajs.t_stamp.astype(int)
    trajs.label = trajs.label.astype(int)
    trajs.set_index(['t_stamp', 'label'], inplace=True)

    ### The Trajectories class is a subclass of
    ### pandas DataFrame
    ### Parsing excel files tends to add NaNs to the data
    trajs = Trajectories(trajs.dropna().sort_index())

    metadata = pd.read_excel(data_path, 1)
    metadata = {name: value for name, value
                in zip(metadata['Name'], metadata['Value'])}

    metadata['FileName'] = data_path
    store_path = metadata['FileName']
    if '.' in store_path[-6:]:
        store_path = ''.join(store_path.split('.')[:-1] + ['.h5'])
    else:
        store_path = store_path + '.h5'
    store_path = os.path.join(os.path.dirname(data_path), store_path)

    ### The ObjectsIO class
    objectsio = ObjectsIO(metadata=metadata, store_path=store_path)
    cellcluster = CellCluster(objectsio=objectsio)
    cellcluster.trajs = trajs
    cellcluster.oio['trajs'] = trajs

    if extra_sheet is not None:
        try:
            extra = pd.read_excel(data_path, extra_sheet)
            cellcluster.extra = extra
            cellcluster.oio['extra'] = extra
        except Exception:
            print('Extra data from sheet {} not found in the file {}'.format(extra_sheet, data_path))
    return cellcluster
def extract_bloomberg_excel(str_bbDataFile, str_bbIndexFile, is_excel):
    '''
    Convert the Excel file downloaded from Bloomberg into a DataFrame and save it
    :param str_bbDataFile: the actual data file
    :param str_bbIndexFile: the metadata file
    '''
    global df_bbData, df_bbDataCol
    if is_excel:
        # data
        df_bbData = pd.read_excel(str_bbDataFile, 'Sheet1')
        df_bbData = df_bbData.iloc[5:, :]  # drop the title rows and rows without dates
        df_bbData = df_bbData.replace('#N/A N/A', '')  # remove the placeholder string Excel puts in empty cells
        df_bbData = df_bbData.apply(pd.to_numeric, errors='ignore')  # convert every column that can be numeric
        # index list
        df_bbIndex = pd.read_excel(str_bbIndexFile, 'index')
        df_bbIndex.columns = ['no', 'idx', 'cat', 'rgn', 'rgn2', 'rmk', 'undf']
        df_bbDataCol = df_bbIndex[df_bbIndex['no'].isin(df_bbData.columns)][['no', 'idx', 'rgn2']]
        # save as csv
        df_bbData.to_csv('../data/DailyEconomicData.csv', sep='\t', encoding='utf-8')
        df_bbDataCol.to_csv('../data/index.csv', sep='\t', encoding='utf-8')
    else:
        df_bbData = pd.read_csv('../data/DailyEconomicData.csv', sep='\t', encoding='utf-8')
        df_bbDataCol = pd.read_csv('../data/index.csv', sep='\t', encoding='utf-8')
def xl_to_df(directory, file_dict): # Get excel file file_path = '' for file in file_dict: if not file['type'] == 'questions': file_path = str(directory) + '\\' + file['name'] else: file_path2 = str(directory) + '\\' + file['name'] main1 = pd.read_excel(file_path, pd.ExcelFile(file_path).sheet_names[0], encoding='utf-8') main2 = pd.read_excel(file_path2, pd.ExcelFile(file_path2).sheet_names[0], encoding='utf-8') # xls_file = pd.ExcelFile(file_path) # main1 = xls_file.parse( xls_file.sheet_names[0] ) # xls_file2 = pd.ExcelFile(file_path2) # main2 = xls_file2.parse( xls_file2.sheet_names[0] ) full_df = main1.append(main2) return full_df
def get_context_data(self, **kwargs):
    context = super(DocumentDetail, self).get_context_data(**kwargs)
    filename = settings.MEDIA_ROOT + "/" + self.object.document.name
    if self.object.doc_type == 1:
        sniffer = csv.Sniffer()
        # defining the separator of the csv file
        dialect = sniffer.sniff(open(filename, 'r').read(), delimiters='\t,;')
        df = read_csv(filename, delimiter=dialect.delimiter)
        context['input'] = df[:50].to_html()
    else:
        # each sheet is optional, so a missing sheet is simply skipped
        for sheet in ("PMS", "PMS1", "DS1", "DS2"):
            try:
                df = read_excel(filename, sheet_name=sheet)
                context[sheet] = df.to_html()
            except Exception:
                pass
    return context
def load_references(xls_filename, errors, validation_errors):
    # Output columns can be different. Update according to the rename_columns dict:
    try:
        dfs = pandas.read_excel(
            xls_filename,
            [  # 'core-24-depts',
                '(reference) senior-staff-grades',
                '(reference) professions',
                '(reference) units',
            ])
    except XLRDError as e:
        if str(e) == "No sheet named <'(reference) units'>":
            validation_errors.append(str(e))
            return {}
        elif str(e) in ("No sheet named <'(reference) senior-staff-grades'>",
                        "No sheet named <'(reference) professions'>"):
            # this doesn't matter - we will use the standard_references
            # anyway. Read it again, just for the units.
            try:
                dfs = pandas.read_excel(xls_filename, ['(reference) units'])
            except XLRDError as e:
                if str(e) == "No sheet named <'(reference) units'>":
                    validation_errors.append(str(e))
                else:
                    errors.append(str(e))
                return {}
def excel(FilePath, FileName, SheetNameOrNone, *args, **kwargs):
    IndexColumn = kwargs.get('IndexColumn', None)
    from pandas import read_excel
    if FilePath.endswith('\\'):
        lastslash = ''
    else:
        lastslash = '\\'
    if FileName.endswith('.xlsx'):
        fext = ''
    else:
        fext = '.xlsx'
    try:
        fullpath = FilePath + lastslash + FileName + fext
        EmptyVar = read_excel(fullpath, SheetNameOrNone, index_col=IndexColumn, na_values=['NA'])
    except Exception:
        # fall back to the legacy .xls extension if the .xlsx read fails
        fext = '.xls'
        fullpath = FilePath + lastslash + FileName + fext
        EmptyVar = read_excel(fullpath, SheetNameOrNone, index_col=IndexColumn, na_values=['NA'])
    return EmptyVar
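# Hedged usage sketch for the excel() helper above; the folder and workbook names
# are hypothetical, and passing None as the sheet asks pandas for every sheet.
prices = excel(r'C:\data', 'prices', 'Sheet1', IndexColumn=0)
all_sheets = excel(r'C:\data', 'prices', None)  # dict of DataFrames, one per sheet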
def get_hs300s():
    """
    Get the current constituents of the CSI 300 (HS300) index and their weights
    Return
    --------
    DataFrame
        code   : stock code
        name   : stock name
        date   : date
        weight : weight
    """
    try:
        df = pd.read_excel(
            ct.HS300_CLASSIFY_URL % (ct.P_TYPE["http"], ct.DOMAINS["idx"],
                                     ct.INDEX_C_COMM, ct.PAGES["hs300b"]),
            usecols=[0, 1],
        )
        df.columns = ct.FOR_CLASSIFY_B_COLS
        df["code"] = df["code"].map(lambda x: str(x).zfill(6))
        wt = pd.read_excel(
            ct.HS300_CLASSIFY_URL % (ct.P_TYPE["http"], ct.DOMAINS["idx"],
                                     ct.INDEX_C_COMM, ct.PAGES["hs300w"]),
            usecols=[0, 3, 6],
        )
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt["code"] = wt["code"].map(lambda x: str(x).zfill(6))
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def get_transfert_data_frames(year=None):
    assert year is not None
    default_config_files_directory = os.path.join(
        pkg_resources.get_distribution("openfisca_france_indirect_taxation").location
    )
    matrice_passage_file_path = os.path.join(
        default_config_files_directory,
        "openfisca_france_indirect_taxation",
        "assets",
        "Matrice passage {}-COICOP.xls".format(year),
    )
    parametres_fiscalite_file_path = os.path.join(
        default_config_files_directory,
        "openfisca_france_indirect_taxation",
        "assets",
        "Parametres fiscalite indirecte.xls",
    )
    matrice_passage_data_frame = pandas.read_excel(matrice_passage_file_path)
    if year == 2011:
        matrice_passage_data_frame["poste2011"] = matrice_passage_data_frame["poste2011"].apply(
            lambda x: int(x.replace("c", "").lstrip("0"))
        )
    parametres_fiscalite_data_frame = pandas.read_excel(
        parametres_fiscalite_file_path, sheet_name="categoriefiscale"
    )
    selected_parametres_fiscalite_data_frame = parametres_fiscalite_data_frame[
        parametres_fiscalite_data_frame.annee == year
    ]
    return matrice_passage_data_frame, selected_parametres_fiscalite_data_frame
def __init__(self, file, year=None, level="Départements"): """ loads the data downloaded from `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_. @param file xls file @param year year (optional) @param level ``Départements`` or ``Cantons`` """ self.year = year self.level = level.lower().replace("s", "") if isinstance(file, list): self.tours = file else: self.tours = [pandas.read_excel(file, sheetname="%s T1" % level), pandas.read_excel(file, sheetname="%s T2" % level)] for i, t in enumerate(self.tours): if len(t) == 0: raise Exception("no data for tour %d" % (i + 1)) self.tours = [self.process_tour(_) for _ in self.tours] for i, t in enumerate(self.tours): if len(t) == 0: raise Exception("no data for tour %d" % i) try: self.tours = [ _.sort_values("Libellé du %s" % self.level, inplace=False) for _ in self.tours] except Exception as e: message = "unable to sort, shape={1} columns={0}".format( ",".join(self.tours[0].columns), self.tours[0].shape) raise Exception(message) from e
def get_barres_seq_data(force=False):
    global BARRES_SPECIES_DATA

    if force or not os.path.exists(BARRES_SEQ_PATH):
        LOGGER.info("Downloading Barres RNA Seq Data")
        response = requests.get(BARRES_SEQ_URL, stream=True)
        response.raise_for_status()

        with open(BARRES_SEQ_PATH, mode="wb") as f:
            for block in response.iter_content(1024):
                f.write(block)

    LOGGER.info("Reading Barres RNA Seq Data")

    BARRES_SPECIES_DATA = {
        "Homo sapiens": pd.read_excel(
            BARRES_SEQ_PATH,
            sheet_name="Human data only",
            skiprows=[0],
        ).iloc[1:],
        "Mus musculus": pd.read_excel(
            BARRES_SEQ_PATH,
            sheet_name="Mouse data only",
            skiprows=[0],
        ),
    }
def load_All_BAVs(BAVfile, sheet_names):
    x = pandas.read_excel(BAVfile, sheet_names[0], index_col=0, na_values=['NA']).index
    data = dict()
    for sheet in sheet_names:
        df = pandas.read_excel(BAVfile, sheet, index_col=0, na_values=['NA'])
        x = intersect(x, df.index)
    for sheet in sheet_names:
        df = pandas.read_excel(BAVfile, sheet, index_col=0, na_values=['NA'])
        good_cols = [col for col in df.columns if len(col.split("_")) == 2]
        df = df[good_cols]
        df.columns = [col.split("_")[0] for col in df.columns]
        try:
            del df[u"Tough"]
        except KeyError:
            print("oh well")
        try:
            del df[u"Visionary"]
        except KeyError:
            print("oh well")
        df = df[pruned_words]
        df = df.loc[x]
        data[sheet] = df
    return (x, data)
def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels): # see gh-4679 with ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: pytest.skip("Column index name cannot be " "serialized unless it's a MultiIndex") # Empty name case current read in as # unnamed levels, not Nones. check_names = r_idx_names or r_idx_levels <= 1 df = mkdf(5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels) df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names)
def test_excel_passes_na(self, read_ext): excel = ExcelFile('test4' + read_ext) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) # 13967 excel = ExcelFile('test5' + read_ext) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected)
def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) expected = pd.read_excel('test1' + read_ext) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected)
def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): # GH 16645 expected = pd.read_excel('test1' + read_ext) num_records_in_file = len(expected) num_rows_to_pull = num_records_in_file + 10 actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected)
def test_reader_dtype(self, read_ext): # GH 8212 basename = 'testdtype' actual = pd.read_excel(basename + read_ext) expected = DataFrame({ 'a': [1, 2, 3, 4], 'b': [2.5, 3.5, 4.5, 5.5], 'c': [1, 2, 3, 4], 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( columns=['a', 'b', 'c', 'd']) tm.assert_frame_equal(actual, expected) actual = pd.read_excel(basename + read_ext, dtype={'a': 'float64', 'b': 'float32', 'c': str}) expected['a'] = expected['a'].astype('float64') expected['b'] = expected['b'].astype('float32') expected['c'] = ['001', '002', '003', '004'] tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): pd.read_excel(basename + read_ext, dtype={'d': 'int64'})
def test_excel_read_buffer(self, read_ext): pth = 'test1' + read_ext expected = pd.read_excel(pth, 'Sheet1', index_col=0) with open(pth, 'rb') as f: actual = pd.read_excel(f, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual)
def __init__(self): ## PDZ Domains temp_df = pd.read_excel(DATA+'\\theta_data.xlsx') self.aminoacids = [acid.encode('utf-8') for acid in list(temp_df.columns[:20])] self.df = temp_df.T self.domains = [Domain(domain.encode('utf-8')) for domain in list(self.df.columns)] self.domain_names = [domain.name for domain in self.domains] ### Peptide sequences self.pep_seqs = [] self.pep_names = [] self.acid_names = ['Glycine', 'Alanine', 'Valine', 'Leucine', 'Isoleucine', 'Methionine', 'Proline', 'Phenylalanine', 'Tryptophan', 'Serine', \ 'Threonine', 'Asparagine', 'Glutamine', 'Tyrosine', 'Cysteine', 'Lysine', 'Arginine', 'Histidine', 'Aspartate', 'Glutamate'] self.acid_dict = {self.aminoacids[i]:self.acid_names[i] for i in range(len(self.aminoacids))} with open(DATA+'\\peptides.free') as f: for line in f: x = line.split() self.pep_seqs.append(x[1]) self.pep_names.append(x[0]) self.peptides = [Peptide(name) for name in self.pep_names] ## Interaction: Which peptides bind to which domains self.fp_interaction_matrix = pd.read_excel(DATA+"\\fp_interaction_matrix.xlsx") for column in self.fp_interaction_matrix.columns: self.fp_interaction_matrix.loc[self.fp_interaction_matrix[column] == 0.0, column] = -1.0 self.fp_interaction_matrix = self.fp_interaction_matrix.rename(columns = lambda x: str(x).replace(" ", "")) ## Classification matrix self.class_matrix = np.zeros((2,2)) self.class_matrix[0,0] = 0.85 self.class_matrix[0,1] = 0.04 self.class_matrix[1,0] = 0.15 self.class_matrix[1,1] = 0.96
def __init__(self, db_filename = "fbo_solicitations.xlsx", report_prefix = "report", sol_sheet_name = "solicitations", filtered_sheet_name = "filtered_solicitations", index_column = "sponsor_number", report_only_new = True): ''' Constructor ''' if(not os.path.isfile(db_filename)): #generate a blank writable excel sheet from scratch field_names = [field_name for field_name in Opportunity.fields] field_names.remove("filtered") writer = ExcelWriter(db_filename) sol_df = pd.DataFrame(columns = field_names) filtered_df = pd.DataFrame(columns = field_names) sol_df.to_excel(writer,sol_sheet_name) filtered_df.to_excel(writer,filtered_sheet_name) writer.save() writer.close() self.report_filename = (report_prefix + "_" + str(datetime.today())[:19] .replace(":","_").replace(" ","[") + "].xlsx") #kept for posterity, in case only the date component is needed and we don't care about overwrites #self.report_filename = report_prefix + "_" + str(date.today()) self.db_filename = db_filename self.sol_sheet_name = sol_sheet_name self.filtered_sheet_name = filtered_sheet_name self.sol_df = pd.read_excel(db_filename,sol_sheet_name, index_col = index_column) self.filtered_df = pd.read_excel(db_filename,filtered_sheet_name, index_col = index_column) self.usaved_sol_counter = 0 self.sol_counter = 0 self.added_items = set()
def main(left_file="", right_file="", out_file="", on=[], how="left"):
    """given two xlsx (excel) workbooks, each containing one worksheet, join the
    two worksheets and output a new workbook.

    Parameters:

    * `-l, --left-file`: The workbook which contains the worksheet to consider the "left" table
    * `-r, --right-file`: The workbook which contains the worksheet to consider the "right" table
    * `-o, --out-file`: The file to output the "joined" tables to
    * `-O, --on`: A (space-separated) list of column names to join on
    * `-H, --how`: how to join the two tables, must be one of "left", "right", "outer" or "inner"

    For more information on joining tables please see the
    [pandas dataframe merge documentation](http://pandas.pydata.org/pandas-docs/version/0.17.1/generated/pandas.DataFrame.merge.html)
    """
    left = pd.read_excel(left_file)
    right = pd.read_excel(right_file)

    new = pd.merge(left, right, on=on, how=how)

    print("SAVING {}".format(out_file))
    new.to_excel(out_file, index=False)
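# A hedged usage sketch for the join helper above; the workbook names and the
# "employee_id" join column are illustrative placeholders, not from the source.
main(left_file="employees.xlsx",
     right_file="salaries.xlsx",
     out_file="joined.xlsx",
     on=["employee_id"],
     how="left")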
df["grade"].cat.categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium",
                                              "good", "very good"])
print(df["grade"])
print(df.sort_values(by="grade"))
print(df.groupby("grade").size())

''' Plotting '''
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')

''' Getting Data In/Out '''
# CSV
df.to_csv('foo.csv')
print(pd.read_csv('foo.csv'))
# HDF5
df.to_hdf('foo.h5', 'df')
print(pd.read_hdf('foo.h5', 'df'))
# Excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')
print(pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']))

''' Gotchas '''
os.chdir("/jukebox/wang/zahra/python/BrainPipe") from tools.analysis.network_analysis import make_structure_objects #set appropriate pth src = "/jukebox/wang/zahra/kelly_cell_detection_analysis" erode_pth = os.path.join(src, "annotation_allen_2017_25um_sagittal_erode_80um.tif") dilate_pth = os.path.join(src, "dilated_atlases") fig_dst = "/home/wanglab/Desktop" df_pth = "/jukebox/LightSheetTransfer/atlas/allen_atlas/allen_id_table_w_voxel_counts_16bit.xlsx" ann_pth = "/jukebox/LightSheetTransfer/atlas/allen_atlas/annotation_2017_25um_sagittal_forDVscans_16bit.tif" #%% #read vols ann = sitk.GetArrayFromImage(sitk.ReadImage(ann_pth)) df = pd.read_excel(df_pth) er_ann = tifffile.imread(erode_pth) dl_anns = [os.path.join(dilate_pth, xx) for xx in os.listdir(dilate_pth)] org_iids = np.unique(ann)[1:] #excluding 0 er_iids = np.unique(er_ann)[1:] missing = [iid for iid in org_iids if iid not in er_iids] missing_struct_names = [nm for nm in df.name.values if df.loc[df.name == nm, "id"].values[0] in missing] #excluding root missing_struct_voxels = [df.loc[df.name == nm, "voxels_in_structure"].values[0] for nm in missing_struct_names] #replace id column that matches to names missing_struct_ids = [df.loc[df.name == nm, "id"].values[0] for nm in missing_struct_names] #get parent names missing_struct_parents = [df.loc[df["id"] == iid, "parent_name"].values[0]
# Import Built-Ins import logging # Import Third-Party # Import Homebrew import matplotlib.pyplot as plt plt.style.use('bmh') # Init Logging Facilities log = logging.getLogger(__name__) ################################################################# # 1- Load data indicators_value = [] ticker_name = [] glob.glob("D:\Stock Study Excel Files\Input Excel Files\Stock USA\*.xlsx") for f in glob.glob('D:\Stock Study Excel Files\Input Excel Files\Stock USA\*.xlsx'): df = pd.read_excel(f) # df.columns = map(str.capitalize, df.columns) #df.rename(columns={'Volume': 'Volume_BTC'}, inplace=True) tike = f.split('\\')[-1].split('.')[0] print(tike) df.insert(1, 'TICKER', tike) # to bring excel file name # Clean nan values df = ta.utils.dropna(df) #################################################################### # 2-Add all ta features filling nans values (from Ta-Lib Except SuperTrend Because not in Ta-Lib) df = ta.add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume_BTC", fillna=True) ##################################################################### # 3- Calculate df['Signal'] = 0 sell = []
import matplotlib.pyplot as plt
import pandas as pd

url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
df = pd.read_csv(url, sep=";")
print(df.head())
pd.DataFrame.hist(df.iloc[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

# Importing non-flat files from the web
import pandas as pd
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
xls = pd.read_excel(url, sheet_name=None)
print(xls.keys())
print(xls['1700'].head())

# Performing HTTP requests in Python using urllib
from urllib.request import urlopen, Request
url = "http://www.datacamp.com/teach/documentation"
request = Request(url)
response = urlopen(request)
print(type(response))
response.close()

# Printing HTTP request results in Python using urllib
from urllib.request import urlopen, Request
from pprint import pprint
import os
import json
import csv

import pandas as pd
from flask import (
    Flask,
    render_template,
    jsonify,
    request,
    redirect,
    url_for,
)

dataset1 = pd.read_excel("BronxPropertySalesDatasets/sales_bronx_03.xls")

app = Flask(__name__)

SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
json_url = os.path.join(SITE_ROOT, "data", "data.json")
data = json.load(open(json_url))

AIRBNB_SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
airbnb_json_url = os.path.join(AIRBNB_SITE_ROOT, "data", "airbnb_data.json")
data_airbnb = json.load(open(airbnb_json_url))

PROPERTYSALES_SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
def main(): import cdsapi import numpy as np import os import pandas as pd import math def quarter_up(x): return math.ceil(x * 4) / 4 def quarter_down(x): return math.floor(x * 4) / 4 c = cdsapi.Client() file = '/Volumes/Neely/BioDAR/ERA5/sites of light and suction traps.xlsx' suction_traps = pd.read_excel(file, header=0, sheet_name='Suction traps') number_of_traps = len(suction_traps['Lat']) areas = [] trap_name = [] for a in range(0, number_of_traps): lats = [ quarter_up(suction_traps['Lat'][a]), quarter_down(suction_traps['Lat'][a]) ] longs = [ quarter_up(suction_traps['Long'][a]), quarter_down(suction_traps['Long'][a]) ] areas.append([ max(lats), min(longs), min(lats), max(longs), ]) trap_name.append(suction_traps['Trap name'][a].replace(" ", "_")) start_year = 1979 stop_year = 2020 years = np.arange(start_year, stop_year + 1) months = [ '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12' ] days = [ '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31' ] for year in years: for month in months: for day in days: for idx, area in enumerate(areas): try: outdir = '/Volumes/Neely/BioDAR/ERA5/Myrna_TrapLocations_0_25_Box/suction_traps/pres_levels/' \ + str(trap_name[idx]) + '/' if not os.path.exists(outdir): os.makedirs(outdir) file_name = outdir + 'era5_pres_level_' + str(trap_name[idx]) + '_' + \ str(year) + str(month) + str(day) + '.nc' print(str(trap_name[idx]), area) print(file_name) if os.path.isfile(file_name) == True: print('exists') continue else: c.retrieve( 'reanalysis-era5-pressure-levels', { 'product_type': 'reanalysis', 'format': 'netcdf', 'variable': [ 'divergence', 'fraction_of_cloud_cover', 'geopotential', 'ozone_mass_mixing_ratio', 'potential_vorticity', 'relative_humidity', 'specific_cloud_ice_water_content', 'specific_cloud_liquid_water_content', 'specific_humidity', 'specific_rain_water_content', 'specific_snow_water_content', 'temperature', 'u_component_of_wind', 'v_component_of_wind', 'vertical_velocity', 'vorticity', ], 'pressure_level': [ '1', '2', '3', '5', '7', '10', '20', '30', '50', '70', '100', '125', '150', '175', '200', '225', '250', '300', '350', '400', '450', '500', '550', '600', '650', '700', '750', '775', '800', '825', '850', '875', '900', '925', '950', '975', '1000', ], 'year': [str(year)], 'month': [month], 'day': [day], 'time': [ '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00', '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00', ], 'area': area, }, file_name) except: continue
def convert_xlsx(filename, sheetname, csv_name):
    data_xls = pandas.read_excel(filename, sheetname, convert_float=False, index_col=None)
    data_xls.to_csv(csv_name, encoding='utf-8')
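# Hedged usage sketch for convert_xlsx(); the workbook, sheet and output names
# are hypothetical placeholders.
convert_xlsx('monthly_report.xlsx', 'Sheet1', 'monthly_report.csv')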
def xl2csv(path):
    df = pandas.read_excel(path)
    csvfileloc = '/home/py01/Desktop/ratings5.csv'
    df.to_csv(csvfileloc, sep='\t', encoding='utf-8', index=False)
    os.remove(path)
items = [int(item) for item in items] df[col] = pd.Series(items, dtype=int) else: df[col] = pd.Series(items, dtype=float) ############################################################################## ################################### MAIN ##################################### ############################################################################## if __name__ == '__main__': # make output directory if not exists cwd = os.getcwd() if not cwd.endswith('/'): cwd += '/' if not OUTDIR.endswith('/'): OUTDIR += '/' try: os.mkdir(OUTDIR) print('Output folder created: %s' % (cwd + OUTDIR)) except: pass # iterate through each sheet and format / spit out CSV for D3 for i in range(len(SHEET_NAMES)): print('\nWorking on %s sheet' % SHEET_NAMES[i]) df = pd.read_excel(INFILE, sheet_name=i+1, dtype=str, na_filter=False) remove_trailing_whitespace(df) typify_dataframe(df) df.to_csv(OUTDIR + SHEET_NAMES[i] + '.csv', index=None) print('Saved to %s' % (cwd + OUTDIR + SHEET_NAMES[i] + '.csv'))
import warnings warnings.filterwarnings("ignore") # Get dataset and features #==============================# aalist = list('ACDEFGHIKLMNPQRSTVWY') def getAAC(seq): aac = np.array([seq.count(x) for x in aalist])/len(seq) return aac data = pd.read_excel('sequence_ogt_topt.xlsx', index_col=0) aac = np.array([getAAC(seq) for seq in data['sequence']]) ogt = data['ogt'].values.reshape((data.shape[0],1)) X = np.append(aac, ogt, axis=1) sc = StandardScaler() X = sc.fit_transform(X) y = data['topt'].values # Strategies and hyperparameters #======================================# # Hyperparameter range cl_vals = [25.0, 30.0, None] ch_vals = [72.2, 60.0]
import pandas as pd from sklearn.neural_network import MLPClassifier import numpy as np import json import scipy as sc import numpy as np train_file=pd.read_json(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\train.json', orient='records') test_file=pd.read_csv(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\sample_solution.csv', header=0) train_inp=pd.read_excel(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\Train_Input.xlsx',index_col=None,header=None) test_inp=pd.read_excel(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\Test_Input.xlsx',index_col=None,header=None) train_l=train_file['cuisine'] test_l=test_file['cuisine'] train_label=train_l[0:4000] test_label=test_l[0:1000] best=0 accuracy=0 hl=[1,3] act=['logistic', 'tanh', 'relu'] sol=['lbfgs','sgd','adam'] al=[0.0001,0.0005] bs=[64,128] lr=['constant','invscaling','adaptive'] best_params = [0,0,0,0,0,0] params = [0,0,0,0,0,0] for h in hl: for a in act:
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_excel(input_file, sheet_name='january_2013')

writer = pd.ExcelWriter(output_file)
data_frame.to_excel(writer, sheet_name='jan_13_output', index=False)
writer.save()
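# A minimal alternative sketch: newer pandas versions prefer using ExcelWriter as
# a context manager, which saves the workbook on exit and removes the explicit
# save() call. Assumes the same input_file/output_file variables as above.
data_frame = pd.read_excel(input_file, sheet_name='january_2013')
with pd.ExcelWriter(output_file) as writer:
    data_frame.to_excel(writer, sheet_name='jan_13_output', index=False)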
category_compare, padding) from BusinessPulseSurvey import business_pulse, qa_for_loc, qa_by_loc, compare_questions_locations, stacked_by_loc, qa_diff_by_loc dir_path = os.path.dirname(os.path.abspath(__file__)) if not os.path.exists(config.log_dir): os.makedirs(config.log_dir) if not os.path.exists(config.log_dir + config.log_file): with open(config.log_dir + config.log_file, 'w+'): pass logging.basicConfig(filename=config.log_dir + config.log_file, level=logging.INFO) logging.info('%s Economic Dashboard Started', datetime.datetime.now()) PUA_url = 'https://oui.doleta.gov/unemploy/docs/weekly_pandemic_claims.xlsx' pua_data = pd.read_excel(PUA_url) fileloc = config.fileloc y2k = '2000-01-01' cy = '2020-01-01' rs = '2020-02-01' #Recession start date #%% Overall Trends def overall_trends(): logging.info('%s Overall Trends Started', datetime.datetime.now()) series = ['RSAFS', 'IPMAN', 'PAYEMS', 'DGORDER'] national_trends = fred_chart(series, '2019-01-01', transformation='index', transform_date=rs,
def second(batch): index_col_2_yr="B.TECH. II Yr.(III SEMESTER TIMETABLE) ODD SEMESTER 2018(Combined) JIIT128(Effective from 17/07/2018)" data=pd.read_excel("timetable2.xlsx", index_col=index_col_2_yr) #sperating cols data.columns=[1,2,3,4,5,6,7,8,9] data.columns.name=" " #seprating days mon=data.loc["MON":"TUE"].iloc[:-1] tue=data.loc["TUE":"WED"].iloc[:-1] wed=data.loc["WED":"THURS"].iloc[:-1] thu=data.loc["THURS":"FRI"].iloc[:-1] fri=data.loc["FRI":"SAT"].iloc[:-1] sat=data.loc["SAT":].iloc[:-1] #list of df data2=[mon,tue,wed,thu,fri,sat] final=data.dropna() #data2 #final=data.dropna() #final #edit data frame here . make first row the column labels rows=[] #realgame #move all to class and fns for i in range (0,6): newlist=[] for j in range(1,10): new=data2[i][j].dropna() new2=new.str.contains(batch) new3=new.str.contains('ALL') new2=new2|new3 if not ((new[new2]).empty): temp=new[new2].tolist()[0].replace("\n","") #comment to show subject code # temp1=temp.find('(') # temp2=temp.find(')') # temp=temp[:temp1]+temp[temp2+1:] #comment to show subject code temp3=temp.find('/') temp=temp[:temp3] newlist.append(temp) #method 2 #newlist.append(new[new2].tolist().replace('\n','')) else: newlist.append(" ") rows.append(newlist) #final.append(new[new2]) #final final=pd.DataFrame(rows,index=["Mon","Tue","Wed","Thu","Fri","Sat"],columns=data.iloc[0]) final.columns.name="Days/Time" return(final.transpose().to_dict('list'))
# -*- coding: utf-8 -*- # tf_idf1 # # test script for setup of tf-idf # import os import pandas as pd from nltk.corpus import stopwords from sklearn.feature_extraction.text import TfidfVectorizer #add a project name for output project_name = 'Daily Mail Property Articles' # read sf export sf_export = pd.read_excel(r'C:\Users\JLee35\Automation\TF-IDF\input\body_copy.xlsx') corpus =[] for i in sf_export.index: doc = sf_export['Body Copy Full'][i] corpus.append(doc) # counts the length of the list doc_count = len(corpus) print(f'Total number of documents = {doc_count}') # use TfidfVectorizer from Scikit-Learn to transform the corpus stop = stopwords.words('english') vectorizer = TfidfVectorizer(max_df=.65, min_df=1, ngram_range=(1,1), stop_words=stop, use_idf=True, norm=None) transformed_documents = vectorizer.fit_transform(corpus) transformed_documents_as_array = transformed_documents.toarray()
import numpy as np import pandas as pd import streamlit as st import altair as alt df=pd.read_excel('Base_sondage_maraichage.xlsx', index_col="Identifiant", na_values=['NA']) df = df.fillna({"Mode_irrigation": "Pluvial"}) cleanup_nums = { "Mode_Production": {"Principale": 1, "En succession": 2, "En association": 3, "Sous étage": 4}, "Mode_irrigation": {"Localisée": 1, "Gravitaire": 2, "Aspersion": 3, "Pivot": 4, "Gravitaire,Localisée": 5, "Localisée,Pivot": 6, "Pluvial":7}, "Culture": {"Courgette": 1, "Pomme de terre": 2, "Tomate": 3, "Coriandre et persil": 4, "Haricot vert": 5, "Concombre": 6, "Menthe": 7, "Fève vert": 8, "Aubergine": 9, "Carotte": 10, "Chou fleur": 11, "Oignon":12, "Choux vert":13, "Celeri": 14, "Laitue": 15, "Tomate kiwat": 16, "Fraise": 17, "Piment fort": 18, "Artichaut": 19, "Absinthe": 20, "Haricot Helda": 21, "Topinambour": 22, "Myrtille": 23, "Endive": 24, "Navet": 25, "Pastèque":26, "Poivron": 27},
""" 参考: http://pbpython.com/market-basket-analysis.html """ import pandas as pd from mlxtend.frequent_patterns import apriori from mlxtend.frequent_patterns import association_rules def encode_units(x): if x <= 0: return 0 if x >= 1: return 1 df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx') print(df.head()) df['Description'] = df['Description'].str.strip() df.dropna(axis=0, subset=['InvoiceNo'], inplace=True) df['InvoiceNo'] = df['InvoiceNo'].astype('str') df = df[~df['InvoiceNo'].str.contains('C')] basket = (df[df['Country'] =="France"] .groupby(['InvoiceNo', 'Description'])['Quantity'] .sum().unstack().reset_index().fillna(0) .set_index('InvoiceNo')) basket_sets = basket.applymap(encode_units) basket_sets.drop('POSTAGE', inplace=True, axis=1)
# Import libraries
import plotly.offline as pyo
import plotly.graph_objs as go
import pandas as pd

# Load data
df_temp = pd.read_excel(
    r'C:\Users\ivan_pinar\Dropbox\Creación de MOCs\MOC Dash Python\Datasets\3.8\Temperaturas.xlsx'
)

# Define the "data" list: x --> categories, y --> values whose distribution we want to check
data = [
    go.Box(x=df_temp["Ciudad"], y=df_temp["T_Promedio"])
]  # pointpos=0 places the points in the centre / boxpoints="all" shows every individual point

# Define the "layout" object: chart design such as title, axis names, ...
layout = go.Layout(title="Box & whiskers Temperatura")

# Create the Plotly "Figure" object from the data and layout objects defined above
fig = go.Figure(data=data, layout=layout)

# Generate the plot from the figure, naming the output HTML file
pyo.plot(fig, filename="3.8 Temp_Box Plot.html")
print('************************************************** Analyzing {:s}'.format(state.upper()))
print()

# excel = r'K:\DEEP_SOLAR_BIG2\TN\TNDeep_Solar.xlsx'
if region == TVA_l:
    # excel = r'K:\TVA_SVI\TVA_DS_SVI_merged.xlsx'
    excel = r'K:\TVA_SVI\TVA_DS_SVI_merged2.xlsx'
elif region == TVA_f:
    # excel = r'K:\TVA_SVI\TVA_DS_SVI_merged.xlsx'
    excel = TVA_path
else:
    excel = r'K:\TVA_SVI\TN_DS_SVI_merged.xlsx'

# TNDS = pd.read_excel(excel, index_col='fips').fillna(0)
# TNDS = pd.read_excel(excel, index_col='fips').dropna(axis=0)
TNDS = pd.read_excel(excel, index_col='fips')
# TNDS = TNDS.fillna(TNDS.mean(axis=1))

if region != TVA_l and region != TVA_f:
    print('Getting region {:s}'.format(region))
    TNDS = TNDS.loc[TNDS['ST_ABBR'] == region.upper()]

print('splitting data')
adopters, high, mod, non = split_data_res_adopt_non(TNDS, hthr=10, midrange=[1, 10],
                                                    lthr=1, verbose=False)
dd.display_percentages(TNDS.shape[0], adopters.shape[0],
def disp1(): import io from pdfminer.converter import TextConverter from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFResourceManager from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage text = '' def extract_text_from_pdf(pdf_path): with open(pdf_path, 'rb') as fh: # iterate over all pages of PDF document for page in PDFPage.get_pages(fh, caching=True, check_extractable=True): # creating a resoure manager resource_manager = PDFResourceManager() # create a file handle fake_file_handle = io.StringIO() # creating a text converter object converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams()) # creating a page interpreter page_interpreter = PDFPageInterpreter(resource_manager, converter) # process current page page_interpreter.process_page(page) # extract text text = fake_file_handle.getvalue() yield text # close open handles converter.close() fake_file_handle.close() # calling above function and extracting text #print(fname) file_path = "D:/resume_analysis/static/resumes/" + fname fp = file_path.split('/') f = fp[len(fp) - 1] for page in extract_text_from_pdf(file_path): text += ' ' + page #print(text) import spacy from spacy.matcher import Matcher # load pre-trained model nlp = spacy.load('en_core_web_sm') # initialize matcher with a vocab matcher = Matcher(nlp.vocab) def extract_name(resume_text): nlp_text = nlp(resume_text) # First name and Last name are always Proper Nouns pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}] matcher.add('NAME', None, pattern) matches = matcher(nlp_text) for match_id, start, end in matches: span = nlp_text[start:end] return span.text name = extract_name(text) #print(name) import re def extract_mobile_number(text): phone = re.findall( re.compile( r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?' 
), text) if phone: number = ''.join(phone[0]) if len(number) > 10: return '+' + number else: return number num = extract_mobile_number(text) #print(num) import re def extract_email(email): email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email) if email: try: return email[0].split()[0].strip(';') except IndexError: return None email = extract_email(text) #print(email) import pandas as pd import spacy #from spacy.en import English # load pre-trained model nlp = spacy.load('en_core_web_sm') #noun_chunk = nlp.noun_chunks #nlp=English() doc = nlp(text) def extract_skills(resume_text): nlp_text = nlp(resume_text) # removing stop words and implementing word tokenization tokens = [token.text for token in nlp_text if not token.is_stop] #print(tokens) # reading the csv file data = pd.read_csv("D:/resume_analysis/techskill.csv") # extract values skills = list(data.columns.values) skillset = [] # check for one-grams (example: python) for token in tokens: if token.lower() in skills: skillset.append(token) # check for bi-grams and tri-grams (example: machine learning) for token in doc.noun_chunks: token = token.text.lower().strip() if token in skills: skillset.append(token) return [i.capitalize() for i in set([i.lower() for i in skillset])] text = text.lower() skill = extract_skills(text) print(skill) skill_len = len(skill) excel_file = 'D:/resume_analysis/jd.xls' jd = pd.read_excel(excel_file) skill1 = jd['Skills'] row = jd.shape[0] res = [] for i in range(row): count = 0 sk = skill1[i].split(',') for j in skill: if (skill1[i].find(j) != -1): count = count + 1 res.append(100 * count / len(skill)) ind = res.index(max(res)) print(jd['JobTitle'][ind]) res1 = [] for i in res: res1.append(i) res1 = sorted(res1) p1 = max(res) print(res1) second = res1[len(res1) - 1] third = res1[len(res1) - 2] print(max(res), second, third) res[ind] = -res[ind] ind1 = res.index(second) res[ind1] = -res[ind1] ind2 = res.index(third) print(ind1) s1 = jd['Skills'][ind] s2 = jd['Skills'][ind1] s3 = jd['Skills'][ind2] rs1, rs2, rs3 = [], [], [] for j in skill: if (j in s1): rs1.append(j) for j in skill: if (j in s2): rs2.append(j) for j in skill: if (j in s3): rs3.append(j) return render_template('car.html', job1=jd['JobTitle'][ind], skills1=rs1, job2=jd['JobTitle'][ind1], skills2=rs2, job3=jd['JobTitle'][ind2], skills3=rs3, p1=round(p1, 3), fnam=f, p2=round(second, 3), p3=round(third, 3))
def read_file(path, name):
    df = pd.read_excel('%s/%s' % (path, name))
    return df
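# Hedged usage sketch for read_file(); the directory and workbook name are
# hypothetical placeholders.
sales = read_file('/data/exports', 'sales_2020.xlsx')
print(sales.head())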
driver.find_element_by_name('password').send_keys('t4') time.sleep(.2) driver.find_element_by_name('password').send_keys('78') time.sleep(.2) driver.find_element_by_name('password').send_keys('@g') time.sleep(.2) driver.find_element_by_name('password').send_keys('ma') time.sleep(.2) driver.find_element_by_name('password').send_keys('il') time.sleep(.2) driver.find_element_by_name('password').send_keys('.c') time.sleep(.2) driver.find_element_by_name('password').send_keys('om') driver.find_element_by_name('password').send_keys(Keys.RETURN) time.sleep(60) abc = pd.read_excel('C:\\Users\\acer\\Downloads\\cds\\mat.xls', header=None, index_col=False) f = open('data.csv', 'a') var=0 continueCheck = False for item in abc.index: print(abc[0][item]) website = str(abc[0][item]) if website == '6annonce.com': continueCheck = True continue if continueCheck: driver.get('https://pro.similarweb.com/#/website/worldwide-overview/'+website+'/*/999/3m?webSource=Total') # try: if driver.title != 'Pardon Our Interruption': wait = WebDriverWait(driver, 40)
def createRUID_List(rowIdxList, headerStr): """ Loops over a series containing row indices and returns a list of RUID strings. Inputs: rowIdxList - collection of row index values headerStr - DataFrame header string value for column containing RUIDs Outputs: new list containing RUID strings """ RUID_List = [] for aRowIdx in rowIdxList: workingRUID=df[headerStr].iloc[aRowIdx] RUID_List.append(workingRUID) return RUID_List df = pd.read_excel("abcd_rucdr_master_forPython.xlsx") print ('Finished reading in input file.') #blackList=['NDAR_INV'] #for pattern in blackList: # df['pGUID_Rutgers'] = df['pGUID_Rutgers'].replace(pattern, '') #datasets Unique_DAIC_Invs = df['InvCodeDAIC_OnlyTxt'].dropna() Unique_Rutgers_Invs = df['InvCodeRUCDR_OnlyTxt'].dropna() AllRutgersInvs = df['InvCodeMinusDOTxt'].dropna() AllDAIC_Invs = df['InvCodeMinusROTxt'].dropna() print ('About to start first match2collections.')
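# Hedged usage sketch for createRUID_List(); the choice of column and the way the
# row positions are produced here are illustrative, not from the source.
missing_rows = [i for i, is_na in enumerate(df['InvCodeDAIC_OnlyTxt'].isna()) if is_na]
missing_RUIDs = createRUID_List(missing_rows, 'pGUID_Rutgers')
print(len(missing_RUIDs), 'RUIDs collected')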
def main(): df = pd.read_excel(INPUT_FILE) wb = op.Workbook() ws = wb.active ws.append(['Месяц'] + ['Параметр'] + [i for i in range(1, 32)]) wells = list(dict.fromkeys(df.well)) # Iterate over well for well in wells: print(well) ws.append([well]) last_row = ws.max_row ws.merge_cells(start_row=last_row, start_column=1, end_row=last_row, end_column=33) well_df = df[df.well == well] years = list(dict.fromkeys(well_df.date.dt.year)) # Iterate over year for year in years: year_df = well_df[well_df.date.dt.year == year] months = list(dict.fromkeys(year_df.date.dt.month)) for month in months: month_df = year_df[year_df.date.dt.month == month] # Blank rows q = [None for i in range(31)] dynamic = [None for i in range(31)] static = [None for i in range(31)] # Iterate over month data for _, row in month_df.iterrows(): day = row.date.day q[day - 1] = round(row.rate, 1) dynamic[day - 1] = round(row.dynamic, 1) static[day - 1] = round(row.static, 1) # Write rows to sheet ws.append([f'{months_names[month - 1]} {year}'] + ['Q, м3/сут'] + q) ws.append([None] + ['Нд, м'] + dynamic) ws.append([None] + ['Нст, м'] + static) last_row = ws.max_row ws.merge_cells(start_row=last_row - 2, start_column=1, end_row=last_row, end_column=1) # Apply styles for row in ws.iter_rows(): for cell in row: cell.style = style_basic if row[0].value is not None and row[1].value is None: row[0].style = style_bold for i in range(2, 33): ws.column_dimensions[get_column_letter(i + 1)].width = 6 wb.save(OUTPUT_FILE)
def on_button_clicked(b):
    clear_output()
    display(button)

    ## UPLOADED INITIAL DATA
    # if mv.value != '':
    # try:
    data = pd.read_excel('story_' + story.value + '/story' + story.value + '.xlsx', sheet_name='sample')
    # data = data.drop(['FC_D', 'FC_E', 'FC_F'], axis=1)
    data['Departure_Date'] = pd.to_datetime(data.Departure_Date)
    # Split vessels that have already departed from those still to be scheduled
    datadone = data[data.Departure_Date < pd.Timestamp(nowaday.value)]
    data = data[data.Departure_Date >= pd.Timestamp(nowaday.value)]

    ## 1. Sort by Arrival Date and Priority
    data = data.sort_values(by=['Arrival_Date', 'Price'], ascending=[True, False])

    ## 2. Set Parameters and Constraints
    # Total floating cranes
    totfc = int(fcnumber.value)
    fclist = ['FC_A', 'FC_B', 'FC_C']

    #### Create feature demanddays for one floating crane
    data['demanddays'] = np.round(data.Demand_Qty / data.Loading_Rate)
    data['demandfc'] = np.ceil(data['demanddays'] / data.Laytime_Duration)
    data['demanddays_new'] = np.ceil(data.Demand_Qty / (data.Loading_Rate * data['demandfc']))

    ## 3. Assign Floating Crane - Initial Plan
    ### Create the initial first row
    import itertools
    a = []
    for L in range(1, len(fclist) + 1):
        for subset in itertools.combinations(fclist, L):
            # print(subset)
            a.append(list(subset))
    # Fixed assignment table actually used below (overrides the combinations above)
    a = [[1, 0, 0],
         [0, 1, 0],
         [0, 0, 1],
         [1, 1, 0],
         [1, 0, 1],
         [0, 1, 1]]
    a = pd.DataFrame(a, columns=['FC_A', 'FC_B', 'FC_C'])

    if data.loc[0, 'demandfc'] == 1:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 0
        data.loc[0, 'FC_C'] = 0
    elif data.loc[0, 'demandfc'] == 2:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 1
        data.loc[0, 'FC_C'] = 0
    else:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 1
        data.loc[0, 'FC_C'] = 1

    ### Complete the initial plan
    for i in range(1, data.shape[0]):
        if data.loc[i, 'demandfc'] == 1:
            data.loc[i, 'FC_A'] = 1
            data.loc[i, 'FC_B'] = 0
            data.loc[i, 'FC_C'] = 0
        elif data.loc[i, 'demandfc'] == 2:
            for fc in range(a.shape[0]):
                if ((data.loc[i - 1, 'FC_A'] == a.loc[fc, 'FC_A']) &
                        (data.loc[i - 1, 'FC_B'] == a.loc[fc, 'FC_B']) &
                        (data.loc[i - 1, 'FC_C'] == a.loc[fc, 'FC_C'])):
                    # Use the cranes not used by the previous vessel
                    data.loc[i, 'FC_A'] = np.abs(a.loc[fc, 'FC_A'] - 1)
                    data.loc[i, 'FC_B'] = np.abs(a.loc[fc, 'FC_B'] - 1)
                    data.loc[i, 'FC_C'] = np.abs(a.loc[fc, 'FC_C'] - 1)
                    # If only one crane ended up assigned, add a second one
                    if ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] + data.loc[i, 'FC_C']) == 1) & (data.loc[i, 'FC_A'] == 0):
                        data.loc[i, 'FC_A'] = 1
                    elif ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] + data.loc[i, 'FC_C']) == 1) & (data.loc[i, 'FC_B'] == 0):
                        data.loc[i, 'FC_B'] = 1
                    elif ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] + data.loc[i, 'FC_C']) == 1) & (data.loc[i, 'FC_C'] == 0):
                        data.loc[i, 'FC_C'] = 1
                    else:
                        continue
                else:
                    continue
        else:
            data.loc[i, 'FC_A'] = 1
            data.loc[i, 'FC_B'] = 1
            data.loc[i, 'FC_C'] = 1

    ## 4. Recalculate Departure Date
    # based on the real demanddays_new
    data['Arrival_Date_change'] = pd.to_datetime(np.nan)
    data['Departure_Date_change'] = pd.to_datetime(np.nan)
    data['FC_gap_Date_change'] = pd.to_datetime(np.nan)
    data = data[['MV', 'ETA', 'Arrival_Date', 'Laytime_Duration', 'Departure_Date', 'Demand_Qty',
                 'Loading_Rate', 'Price', 'Demurrage_Rate', 'demanddays', 'demandfc', 'demanddays_new',
                 'FC_A', 'FC_B', 'FC_C', 'FC_D', 'FC_E', 'FC_F',
                 'Arrival_Date_change', 'Departure_Date_change']]

    datachange = pd.DataFrame([[mv.value, arvl.value]], columns=['MV', 'Arrival_Date_change_source'])
    datachange['Arrival_Date_change_source'] = pd.to_datetime(datachange.Arrival_Date_change_source)
    data = pd.merge(data, datachange, how='left', on=['MV'])
    data['Arrival_Date'] = pd.to_datetime(data.Arrival_Date)
    data['Departure_Date'] = pd.to_datetime(data.Departure_Date)
    # data['Arrival_Date_change'] = pd.to_datetime(data.Arrival_Date_change)
    data['Arrival_Date_change'] = data['Arrival_Date_change_source']
    data['Est_Departure_Date_change'] = data['Arrival_Date_change'] + pd.to_timedelta(data['demanddays_new'], unit='D')
    data['Departure_Date_change'] = data['Arrival_Date_change'] + pd.to_timedelta(10, unit='D')
    # data['Departure_Date_change'] = pd.to_datetime(data.Departure_Date_change)
    data.loc[data.Arrival_Date_change.isnull(), 'Arrival_Date_change'] = data.loc[data.Arrival_Date_change.isnull(), 'Arrival_Date']
    data['Est_Departure_Date_change'] = data['Arrival_Date_change'] + pd.to_timedelta(data['demanddays_new'], unit='D')
    # data.loc[0, 'Arrival_Date_change'] = pd.to_datetime(data.loc[0, 'Arrival_Date_change_source'])
    # data.loc[0, 'Departure_Date_change'] = data.loc[0, 'Arrival_Date_change'] + pd.to_timedelta(data.loc[0, 'demanddays_new'], unit='D')
    data['Departure_Date_change'] = data['Arrival_Date_change'] + pd.to_timedelta(10, unit='D')
    data.drop('Arrival_Date_change_source', axis=1, inplace=True)
    x = datachange['MV'][0]

    ### 6. Check the next sequence in the schedule
    # If the changed departure dates clash, FC_Start_Date_change must be adjusted
    # and the potential demurrage cost checked.
    ## Sort by Arrival Date Change and Priority (Price)
    data = data.sort_values(by=['Arrival_Date_change', 'Price'], ascending=[True, False])
    data = data.reset_index()
    data.drop('index', axis=1, inplace=True)
    data['FC_Start_Date_change'] = data['Arrival_Date_change']
    data['FC_End_Date_change'] = data['Est_Departure_Date_change']
    # data.loc[data.MV == x, 'FC_Start_Date_change'] = data.loc[data.MV == x, 'Arrival_Date_change']
    # data.loc[data.MV == x, 'FC_End_Date_change'] = data.loc[data.MV == x, 'Est_Departure_Date_change']
    # data.loc[(data.MV != x) & (data.FC_Start_Date_change.isnull()), 'FC_Start_Date_change'] = data.loc[data.MV != x, 'Arrival_Date_change']
    # data.loc[(data.MV != x) & (data.FC_End_Date_change.isnull()), 'FC_End_Date_change'] = data.loc[data.MV != x, 'Est_Departure_Date_change']

    # Calculate demurrage cost for the first vessel
    data.loc[0, 'Demmurage_Day'] = 0
    data.loc[0, 'Demmurage_Cost'] = 0

    ### Demurrage-cost simulation function
    def sim_demuragecost(totfc, data):
        for i in range(1, data.shape[0]):
            # If the previous vessel's estimated departure overlaps this vessel's arrival
            if data.loc[i - 1, 'Est_Departure_Date_change'] >= data.loc[i, 'Arrival_Date_change']:
                totfc = totfc - data.loc[i - 1, 'demanddays_new']
                # If the available FC >= the FC demanded by vessel i
                if totfc >= data.loc[i, 'demanddays_new']:
                    data.loc[i, 'FC_Start_Date_change'] = data.loc[i, 'Arrival_Date_change']
                    data.loc[i, 'FC_End_Date_change'] = data.loc[i, 'Est_Departure_Date_change']
                    # Calculate demurrage cost
                    data.loc[i, 'Demmurage_Day'] = np.ceil((data.loc[i, 'FC_End_Date_change'] - data.loc[i, 'Departure_Date_change']) / np.timedelta64(1, 'D'))
                    data.loc[i, 'Demmurage_Cost'] = data.loc[i, 'Demurrage_Rate'] * data.loc[i, 'Demmurage_Day']
                    # data.loc[i, 'FC_gap_Date_change'] = data.loc[i, 'Departure_Date_change'] + pd.to_timedelta(1, unit='D')
                    # data.loc[i, 'Demmurage_Day'] = 0   # no risk of demurrage days and cost
                    # data.loc[i, 'Demmurage_Cost'] = 0
                # If the available FC < the FC demanded by vessel i and at least one FC is free
                elif (totfc < data.loc[i, 'demanddays_new']) & (totfc > 0):
                    # State when the available FC can start operating
                    data.loc[i, 'FC_Start_Date_change'] = data.loc[i, 'Arrival_Date_change']
                    data.loc[i, 'FC_Start_Date_change_2'] = data.loc[i - 1, 'FC_End_Date_change'] + pd.to_timedelta(1, unit='D')  # start date of the next FC
                    # Number of days the available FC has already been running
                    data.loc[i, 'dayrun_progress'] = np.ceil((data.loc[i, 'FC_Start_Date_change_2'] - data.loc[i, 'FC_Start_Date_change']) / np.timedelta64(1, 'D'))
                    # Remaining quantity after the available FC has loaded
                    data.loc[i, 'Demand_Qty_remain'] = data.loc[i, 'Demand_Qty'] - (data.loc[i, 'Loading_Rate'] * totfc * data.loc[i, 'dayrun_progress'])
                    # Remaining number of FC needed to fulfil the demand
                    data.loc[i, 'demandfc_remain'] = data.loc[i, 'demanddays_new'] - totfc
                    # Recalculate the total demand days for this condition
                    data.loc[i, 'demanddays_new'] = np.ceil(data.loc[i, 'Demand_Qty_remain'] / (data.loc[i, 'Loading_Rate'] * data.loc[i, 'demanddays_new'])) + data.loc[i, 'dayrun_progress']
                    # End date of the FC operation
                    data.loc[i, 'FC_End_Date_change'] = data.loc[i, 'FC_Start_Date_change'] + pd.to_timedelta(data.loc[i, 'demanddays_new'], unit='D')
                    # Calculate demurrage cost
                    data.loc[i, 'Demmurage_Day'] = np.ceil((data.loc[i, 'FC_End_Date_change'] - data.loc[i, 'Departure_Date_change']) / np.timedelta64(1, 'D'))
                    data.loc[i, 'Demmurage_Cost'] = data.loc[i, 'Demurrage_Rate'] * data.loc[i, 'Demmurage_Day']
                    # data.loc[i, 'FC_gap_Date_change'] = data.loc[i, 'Departure_Date_change'] + pd.to_timedelta(1, unit='D')
                # If the available FC < the FC demanded by vessel i and none is free
                else:
                    # The FC can only start once the previous MV has finished loading
                    data.loc[i, 'FC_Start_Date_change'] = data.loc[i - 1, 'FC_End_Date_change'] + pd.to_timedelta(1, unit='D')
                    data.loc[i, 'FC_End_Date_change'] = data.loc[i, 'FC_Start_Date_change'] + pd.to_timedelta(data.loc[i, 'demanddays_new'], unit='D')
                    # Calculate demurrage cost
                    data.loc[i, 'Demmurage_Day'] = np.ceil((data.loc[i, 'FC_End_Date_change'] - data.loc[i, 'Departure_Date_change']) / np.timedelta64(1, 'D'))
                    data.loc[i, 'Demmurage_Cost'] = data.loc[i, 'Demurrage_Rate'] * data.loc[i, 'Demmurage_Day']
                    # data.loc[i, 'FC_gap_Date_change'] = data.loc[i, 'Departure_Date_change'] + pd.to_timedelta(1, unit='D')
                    totfc = 3  # reset to the initial total FC
            else:
                totfc = 3
                data.loc[i, 'FC_Start_Date_change'] = data.loc[i, 'Arrival_Date_change']
                data.loc[i, 'FC_End_Date_change'] = data.loc[i, 'Est_Departure_Date_change']
                data.loc[i, 'Demmurage_Day'] = np.ceil((data.loc[i, 'FC_End_Date_change'] - data.loc[i, 'Departure_Date_change']) / np.timedelta64(1, 'D'))
                data.loc[i, 'Demmurage_Cost'] = data.loc[i, 'Demurrage_Rate'] * data.loc[i, 'Demmurage_Day']
                # data.loc[i, 'FC_gap_Date_change'] = data.loc[i, 'Departure_Date_change'] + pd.to_timedelta(1, unit='D')
        # Negative values mean no demurrage
        data.loc[data.Demmurage_Day <= 0, 'Demmurage_Day'] = 0
        data.loc[data.Demmurage_Cost <= 0, 'Demmurage_Cost'] = 0
        # data.loc[data.Demmurage_Cost <= 0, 'FC_gap_Date_change'] = data.loc[data.Demmurage_Cost <= 0, 'FC_End_Date_change']
        return data

    ### Call the function
    data = sim_demuragecost(totfc, data)

    def gantt_fig3(data):
        data3 = []
        for row in data.itertuples():
            data3.append(dict(Task=str(row.MV), Start=str(row.Arrival_Date_change),
                              Finish=str(row.Departure_Date_change), Resource='Plan'))
            data3.append(dict(Task=str(row.MV), Start=str(row.FC_Start_Date_change),
                              Finish=str(row.FC_End_Date_change), Resource='Actual'))
        fig = ff.create_gantt(data3, index_col='Resource', title='Gantt Chart',
                              show_colorbar=True, group_tasks=True, height=500, width=1300)
        fig['layout'].update(legend=dict(traceorder='reversed'))
        return fig

    iplot(gantt_fig3(data))

    # Re-attach the already-departed vessels and write the updated schedule back to the workbook
    data = pd.concat([datadone, data])
    newtable = data
    posttable = data
    newtable['Arrival_Date'] = newtable.Arrival_Date_change
    newtable['Departure_Date'] = newtable.Departure_Date_change
    tab = newtable[['MV', 'ETA', 'Arrival_Date', 'Laytime_Duration', 'Departure_Date', 'Demand_Qty',
                    'Loading_Rate', 'Price', 'FC_A', 'FC_B', 'FC_C', 'FC_D', 'FC_E', 'FC_F',
                    'Demmurage_Day', 'Demurrage_Rate', 'Demmurage_Cost']]
    tab.to_excel('story_' + story.value + '/story' + story.value + '.xlsx',
                 sheet_name='sample', engine='xlsxwriter', index=False)

    data.drop(['demanddays'], axis=1, inplace=True)
    data.rename(columns={'demanddays_new': 'demanddays'}, inplace=True)
    print('Total demurrage cost: USD ' + str(data.Demmurage_Cost.sum()))

    data.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)
    data = data.drop(['FC_A', 'FC_B', 'FC_C', 'FC_D', 'FC_E', 'FC_F'], axis=1)
    return button, display(data), data
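# A minimal, self-contained sketch of the demurrage arithmetic used in
# sim_demuragecost above, for checking the formula in isolation:
# Demmurage_Day is the ceiled overrun of the re-planned crane end date past the
# contractual departure date, Demmurage_Cost multiplies it by Demurrage_Rate,
# and both are floored at zero. The toy dates and rate below are made up for
# illustration and are not taken from the story workbook.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'Departure_Date_change': pd.to_datetime(['2020-01-10', '2020-01-12']),
    'FC_End_Date_change':    pd.to_datetime(['2020-01-13', '2020-01-11']),
    'Demurrage_Rate':        [1000.0, 1000.0],
})
toy['Demmurage_Day'] = np.ceil((toy['FC_End_Date_change'] - toy['Departure_Date_change']) / np.timedelta64(1, 'D'))
toy['Demmurage_Cost'] = toy['Demurrage_Rate'] * toy['Demmurage_Day']
toy.loc[toy.Demmurage_Day <= 0, 'Demmurage_Day'] = 0
toy.loc[toy.Demmurage_Cost <= 0, 'Demmurage_Cost'] = 0
print(toy)  # first vessel overruns by 3 days -> 3000.0; second finishes early -> 0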
for data_file in data_files:
    if ".zip" in data_file:
        data_file_name = data_file[:-4]
        if extract_zip_files:
            with ZipFile("%s/%s" % (DATA_DIR, data_file), 'r') as zipObj:
                listOfFileNames = zipObj.namelist()
                fileName = listOfFileNames[0]
                zipObj.extractall("/tmp")
                os.replace("/tmp/%s" % fileName, "/tmp/%s.xls" % data_file_name)
        xl = pd.ExcelFile("/tmp/%s.xls" % data_file_name)
        sheet_name = xl.sheet_names[0]
        df = pd.read_excel(xl, sheet_name, usecols=[NAME, DATE, INTEREST, LONG, SHORT])
        name_list += list(df[NAME])
        date_list += list(df[DATE])
        interest_list += list(df[INTEREST])
        long_list += list(df[LONG])
        short_list += list(df[SHORT])

num_of_entries = len(name_list)
z_scores_one_year = []
z_scores_three_year = []
cwd = os.getcwd()
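# The loop above is a fragment: data_files, DATA_DIR, extract_zip_files, the
# column constants and the accumulator lists must already be defined further up
# in the script. A minimal sketch of what that earlier setup could look like is
# given below; the directory name and the placeholder column labels are
# assumptions for illustration, not values taken from the original script.
import os
import pandas as pd
from zipfile import ZipFile

DATA_DIR = "data"                      # assumed folder holding the downloaded .zip files
extract_zip_files = True               # set False to reuse files already extracted to /tmp
NAME = "Name"                          # placeholder header labels; replace with the
DATE = "Date"                          # actual column names of the Excel sheets
INTEREST = "Open_Interest"
LONG = "Long"
SHORT = "Short"
data_files = sorted(os.listdir(DATA_DIR))
name_list, date_list, interest_list, long_list, short_list = [], [], [], [], []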
import pandas as pd
import requests
import csv
import json
import matplotlib.pyplot as plt
import seaborn as sns

'''
Åldersgrupp : Age group
Antal vaccinerade : Number of vaccinated
Andel vaccinerade : Proportion of vaccinated
Dosnummer : Dose number
'''

xls = pd.ExcelFile("https://fohm.maps.arcgis.com/sharing/rest/content/items/fc749115877443d29c2a49ea9eca77e9/data")
xls1 = pd.read_excel(xls, 'Vaccinerade ålder')
# print(xls1.columns)

# Drop the unnamed column from the dataframe
df = xls1.drop("Unnamed: 5", axis=1)
# print(df.columns)

# Separate dose 1 and dose 2 for the whole of Sweden
df_sweden_dose1 = df.loc[0:8]
print(df_sweden_dose1)
df_sweden_dose2 = df.loc[9:17]
# print(df_sweden_dose2)
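# matplotlib and seaborn are imported above but not used in this fragment; a
# minimal sketch of one way to plot the dose-1 split is given below. It assumes
# the sheet's columns are named as in the glossary ('Åldersgrupp',
# 'Andel vaccinerade'); check xls1.columns and adjust if the headers differ.
plt.figure(figsize=(10, 5))
sns.barplot(x='Åldersgrupp', y='Andel vaccinerade', data=df_sweden_dose1)
plt.title('Proportion vaccinated per age group, dose 1 (Sweden)')
plt.tight_layout()
plt.show()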
import pandas

df = pandas.read_excel('712693030RPKUP4RX.xlsx')
header = df.iloc[2]                   # take the third row as the header
df1 = df[3:].copy()                   # drop the first three rows
df1 = df1.rename(columns=header)      # re-set the column headers
df2 = df1.drop(columns=['縣市代碼', '村里代碼', '村里名稱'])  # drop the county/village code and village-name columns
df3 = df2.drop_duplicates()           # remove duplicate rows
df3.to_csv('district.csv', encoding='big5', index=False)
import pickle

import pandas as pd
from ics_data_clean import clean_data, clean_text_round1, process_data
from ics_train_classifier import train_
from plotly.graph_objs import Bar, Scatter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# pickle-load important files
category_id_df = pd.read_pickle('../pickle/ics/factorize/category_id_df.pkl')
id_to_category = dict(category_id_df[['category_id', 'category']].values)
with open('../pickle/ics/stop_words.pickle', 'rb') as f:
    stop_words = pickle.load(f)
# training_database_table = pd.read_pickle('../pickle/ics/training_database_index.pkl')

# process list
process_list = pd.read_excel('../data/ics/processes.xlsx')

# load training data
df = pd.read_pickle('../pickle/ics/data_final.pkl')

# define vectorizers
tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words=stop_words)
tfidf.fit(df.content)
cv = CountVectorizer(stop_words=stop_words)
tfidf_s = TfidfVectorizer(analyzer='word',
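# cosine_similarity is imported above but this fragment is cut off before it is
# used. A minimal sketch of how the fitted tfidf vectorizer is commonly paired
# with it is shown below; the query string and the top-5 cut-off are assumptions
# for illustration, not taken from the original script.
doc_matrix = tfidf.transform(df.content)                 # vectors for the training documents
query_vec = tfidf.transform(["example incident description"])
sims = cosine_similarity(query_vec, doc_matrix).ravel()  # similarity of the query to every document
top5 = sims.argsort()[::-1][:5]                          # indices of the five most similar documents
print(df.iloc[top5][['content']])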
import pandas as pd
import numpy as np
import sys

# Read the data from the Excel files
df = pd.read_excel('E:/PythonStudy_Git/调用资料/file/菜品报表 (1).xlsx', sheet_name=0)
# Add a date column
df['时间'] = '2020-10-02'
df2 = pd.read_excel('E:/PythonStudy_Git/调用资料/file/菜品报表 (2).xlsx', sheet_name=0)
print(df2)

# Merge the two tables. `how` selects the join type:
#   inner - keep only rows whose key values appear in both tables (the default)
#   left  - keep every row of the left table
#   right - keep every row of the right table
#   outer - keep rows of both tables
df3 = pd.merge(df, df2, how="outer")
print(df3)
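# A small, self-contained illustration of the join types listed in the comment
# above; the id/qty/price columns are made up for the example.
left = pd.DataFrame({'id': [1, 2, 3], 'qty': [10, 20, 30]})
right = pd.DataFrame({'id': [2, 3, 4], 'price': [5.0, 6.0, 7.0]})
print(pd.merge(left, right, how='inner'))  # ids 2 and 3 only
print(pd.merge(left, right, how='left'))   # ids 1, 2, 3; price is NaN for id 1
print(pd.merge(left, right, how='outer'))  # ids 1, 2, 3, 4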