def checkDelimiter(filename, directory):
    """Detect the delimiter of a data file, or ask the user interactively.

    Scans the file for the first line starting with a digit (assumed to be a
    data row) and runs delimiter detection on it. If no such line exists or
    detection fails, prompts the user to choose/enter a delimiter.

    Args:
        filename: Name of the file to inspect.
        directory: Directory containing the file.

    Returns:
        The detected or user-supplied delimiter string.
    """
    # BUG FIX: `delim` was only assigned inside the loop, so a file with no
    # digit-leading line raised UnboundLocalError at the `is None` check.
    delim = None
    with open(os.path.join(directory, filename)) as f:
        for line in f:
            # Only probe lines that start with a number — presumably the
            # actual data rows rather than headers.
            if re.match(r"^\d+.*$", line):
                delim = detect(line)
                break
    if delim is None:
        choice = int(input("""Type of delimiter\n[1] ('\\t')\n[2] (' ')\n[3] (';') [4] (',')\n [5] Custom delimiter\nChosen option: """))
        # BUG FIX: option 1 previously assigned the two-character string
        # '\\t' (backslash + t) instead of an actual tab character.
        if choice == 1:
            delim = '\t'
        elif choice == 2:
            delim = ' '
        elif choice == 3:
            delim = ';'
        elif choice == 4:
            delim = ','
        elif choice == 5:
            delim = input("Enter custom delimiter: ")
        cls()  # clear the screen after the interactive prompt
    return delim
def get_file_delimiter(arg_file):
    """Auto-detect the column delimiter from the first line of *arg_file*.

    Args:
        arg_file: Path to the text file to inspect.

    Returns:
        The detected delimiter as a string.
    """
    with open(arg_file, 'rb') as file:
        # BUG FIX: str(bytes) produced the repr "b'...'", polluting the line
        # handed to the detector with a leading b' and trailing '. Decode
        # the bytes instead (replace undecodable bytes, best effort).
        first_line = file.readline().strip().decode('utf-8', errors='replace')
    delimiter = str(detect_delimiter.detect(first_line))
    log.info('Auto-detection of delimiter for text-type file : [{0}]'.format(
        delimiter))
    return delimiter
def wrapper(*args, **kwargs):
    """Invoke `func`; on AssertionError, retry once with a separator
    detected from the assertion message (a second failure propagates)."""
    try:
        result = func(*args, **kwargs)
    except AssertionError as assertion_err:
        # The assertion message is expected to contain a sample line;
        # detect its delimiter and retry with an explicit `sep`.
        result = func(*args, **kwargs, sep=detect(assertion_err.args[0]))
    except Exception as e:
        raise e
    return result
def get_delimiter(path):
    """Return the delimiter detected from the first line of the file at *path*.

    Raises:
        ValueError: if the file is empty or no delimiter can be detected.
    """
    with open(path) as csv_file:
        head = csv_file.readline()
    if head:
        found = detect(head)
        if found:
            return found
    raise ValueError('Can\'t detect the delimiter')
def detectdelimiter(file):
    """Detect one of the common delimiters [',', ';', ':', '|', '\t'].

    The input is an _io.StringIO object; only its first line is inspected.
    """
    # partition('\n') yields the text before the first newline — the first line.
    first_line, _, _ = file.getvalue().partition('\n')
    return detect(first_line)
def detectdelimiter(file):
    """Detect one of the famous delimiters [',', ';', ':', '|', '\t']
    in the text file at path *file*, using its first line.
    """
    from detect_delimiter import detect
    # BUG FIX: the original parsed the file with csv.reader (comma rules)
    # and passed only the first *cell* (list(reader)[0][0]) to detect(), so
    # a comma-delimited file could never be detected. It also materialized
    # the entire file just to reach line one. Read the raw first line instead.
    with open(file, newline='') as f:
        first_line = f.readline()
    return detect(first_line)
def text_to_dict(text):
    """Convert lines of "key<delim>value" text into a dict.

    The delimiter is auto-detected from the first line (restricted to ':'
    or ' '). Values for recognized special fields (size, crc32, md5, sha1,
    sha256 — as classified by ``test_field_mode``) are normalized before
    being stored.

    Raises:
        ValueError: if no delimiter can be detected.
    """
    delim = detect_delimiter.detect(text[0], whitelist=[':', ' '])
    if delim is None:
        raise ValueError('Cannot detect delimiter.')

    def _size_value(v):
        # size fields: drop spaces, thousands separators and the unit word
        return v.replace(' ', '').replace(',', '').replace('bytes', '')

    def _hash_value(v):
        # hash fields: uppercase hex without spaces
        return v.upper().replace(' ', '')

    # (mode name, value normalizer) — applied in this order, each test
    # operating on the (possibly rewritten) key from the previous step.
    field_modes = [
        ('field_mode__size', _size_value),
        ('field_mode__crc32', _hash_value),
        ('field_mode__md5', _hash_value),
        ('field_mode__sha1', _hash_value),
        ('field_mode__sha256', _hash_value),
    ]

    mydict = {}
    for line in text:
        if not line.strip():
            continue
        parts = line.split(sep=delim, maxsplit=1)
        key = parts[0].strip()
        value = parts[1].strip()
        for field_mode, normalize in field_modes:
            field_test_results = test_field_mode(field_mode, key)
            if field_test_results[0] == field_mode:
                key = field_test_results[1]
                value = normalize(value)
        mydict[key] = value
    return mydict
def bulk_host_popup(self):
    """Let the user pick a file of host addresses and append its entries,
    comma-joined, to the ``add_hosts_ip_addresses`` field.

    The file may be newline- or comma-delimited (auto-detected). User-facing
    error popups are shown for the common failure modes.
    """
    file = npyscreen.selectFile()
    try:
        with open(file, 'r') as f:
            content = f.read()
        content = content.replace('\r\n', '\n')  # normalize CRLF line endings
        from detect_delimiter import detect
        delimiter = detect(content, whitelist=['\n', ','])
        ip_addresses = content.split(delimiter)
        current_value = self.add_hosts_ip_addresses.get_value()
        # BUG FIX: current_value[-1] raised IndexError when the field was
        # empty, surfacing as the generic "500" popup. Guard the empty case.
        separator = '' if not current_value or current_value[-1] == ',' else ','
        self.add_hosts_ip_addresses.set_value(f"{current_value}{separator}{','.join(filter(None, ip_addresses))}")
    except FileNotFoundError:
        npyscreen.notify_confirm("File was not found!", title="Information Missing/Incorrect", wide=True, editw=1)
    except PermissionError:
        npyscreen.notify_confirm("You do not have permissions to open this file!", title="Information Missing/Incorrect", wide=True, editw=1)
    except UnicodeDecodeError:
        npyscreen.notify_confirm("We are unable to open the file. Make sure the file is human readable", title="Information Missing/Incorrect", wide=True, editw=1)
    except Exception as e:
        npyscreen.notify_confirm(f"Something went wrong, please retry, or select a different file.\nStack Trace:\n:{str(e)}", title="Equivalent of a 500 error", wide=True, editw=1)
    self.DISPLAY()
def genRawPreview(file_path, file_name, user_id):
    """Convert an uploaded delimited text file into a JSON preview file.

    Detects the delimiter from the first line, uses that line as column
    headers when it contains known header names (per ``common_member``),
    otherwise generates ``col0``, ``col1``, ... keys, and dumps the rows as
    a list of objects to ``<UPLOAD_FOLDER>/<user_id>/<file_name>.json``.

    Returns a {'result': ...} dict on success or {'error': ...} on any
    exception (all errors are reported back to the caller, not raised).
    """
    try:
        with open(os.path.join(file_path, file_name), 'r', encoding='utf-8') as file:
            filehead = file.readline()  # skip the first line
            hdata = []
            data_list = []
            # NOTE(review): hard-coded '\\' makes this Windows-only —
            # consider os.path.join; left as-is here.
            target_directory = UPLOAD_FOLDER + user_id + '\\'
            str_filehead = ''
            # copies filehead char-by-char; result equals filehead itself
            for h_content in filehead:
                str_filehead += h_content
            sfn = os.path.splitext(file_name)
            # index 0 is the bare name, index 1 the extension
            for index, value in enumerate(sfn):
                if index == 0:
                    file_name_only = value
                    tgt_filename = file_name_only + '.json'
                    tgt_filePath = target_directory
                else:
                    file_type = value
            deli = detect(str_filehead, whitelist=[',', ';', ':', '|', '\t'])
            hdata.append(str_filehead.split(deli))
            # hdata holds a single split header row; list_h leaks out of the loop
            for h in hdata:
                list_h = list(h)
                # print(list_h)
            allowed_header = ['email', 'phone', 'name', 'currency', 'merchant_id', 'ID', 'EMAIL']
            # h_flag: does the first line look like a real header row?
            h_flag = common_member(list_h, allowed_header)
            # `file` is already past line 1, so only data rows are read here
            rows = [[str(x) for x in line.split(deli)] for line in file]
            cols = [list(col) for col in zip(*rows)]
            if not os.path.exists(target_directory):
                os.makedirs(target_directory)
            if h_flag == True:
                # header present: key each row by the header names
                tuples = [tuple(x) for x in rows]
                # print(tuples)
                for row in tuples:
                    save_data = OrderedDict()
                    for i, r in enumerate(row):
                        save_data[list_h[i]] = row[i]
                    data_list.append(save_data)
            else:
                # no header: re-read the whole file (line 1 is data too)
                # and key columns as col0, col1, ...
                with open(os.path.join(file_path, file_name), 'r', encoding='utf-8') as file:
                    rows = [[str(x) for x in line.split(deli)] for line in file]
                    cols = [list(col) for col in zip(*rows)]
                    tuples = [tuple(x) for x in rows]
                    for row in tuples:
                        save_data = OrderedDict()
                        for i, r in enumerate(row):
                            j = 'col' + str(i)
                            save_data[j] = row[i]
                        data_list.append(save_data)
        with open(os.path.join(tgt_filePath, tgt_filename), 'w') as json_file:
            json.dump(data_list, json_file)
        return {'result': 'File successfully uploaded'}
    except Exception as e:
        # best-effort endpoint helper: report the error instead of raising
        return {'error': str(e)}
def main():
    """Main is responsible for the visualisation of everything connected with
    streamlit. It is the web application itself: sidebar setup, file upload,
    delimiter detection and cleaning of uploaded spectra, optional baseline
    correction/normalization, and chart rendering (grouped / mean / 3D /
    single spectra).
    """
    # # Radiobuttons in one row
    # st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
    # Sets sidebar's header and logo
    sidebar.sidebar_head()
    #
    # # Spectrometer type `- BWTek / Renishaw / Witec / Wasatch / Teledyne
    #
    spectra_types = [
        'EMPTY', 'BWTEK', 'RENI', 'WITEC', 'WASATCH', 'TELEDYNE', 'JOBIN'
    ]
    spectrometer = st.sidebar.selectbox("Choose spectra type",
                                        spectra_types,
                                        format_func=LABELS.get,
                                        index=0)
    # sidebar separating line
    sidebar.print_widgets_separator()
    # User data loader
    # sidebar.print_widget_labels('Upload your data or try with ours', 10, 0)
    files = st.sidebar.file_uploader(label='Upload your data or try with ours',
                                     accept_multiple_files=True,
                                     type=['txt', 'csv'])
    # Allow example data loading when no custom data are loaded
    if not files:
        if st.sidebar.checkbox("Load example data"):
            if spectrometer == "EMPTY":
                st.sidebar.error('First Choose Spectra type')
            else:
                files = utils.load_example_files(spectrometer)
    # Check if data loaded, if yes, perform actions
    delim = None
    if files:
        # NOTE(review): st.spinner is normally used as a context manager;
        # called bare here it has no visible effect — confirm intent.
        st.spinner('Uploading data in progress')
        # sidebar separating line
        sidebar.print_widgets_separator()
        from detect_delimiter import detect
        new_files = []
        for file in files:
            file.seek(0)
            lines = file.readlines()
            # uploaded files arrive as bytes; example files may already be str
            try:
                lines = [line.decode('utf-8') for line in lines]
            except AttributeError:
                pass
            # lines = str.splitlines(str(text))  # .split('\n')
            # detect the delimiter from a sample of the first 20 lines
            first_lines = '\n'.join(lines[:20])
            delim = detect(first_lines)
            # keep only lines with the same column count as a data line
            # near the end of the file (drops headers/garbage rows)
            colnum = lines[-2].count(delim)
            lines = [i for i in lines if i.count(delim) == colnum]
            text = '\n'.join(lines)
            buffer = io.StringIO(text)
            buffer.name = file.name
            new_files.append(buffer)
        try:
            df = save_read.read_files(spectrometer, new_files, delim)
        except (TypeError, ValueError):
            st.error('Try choosing another type of spectra')
            st.stop()
        main_expander = st.beta_expander("Customize your chart")
        # Choose plot colors and templates
        with main_expander:
            plots_color, template = vis_utils.get_chart_vis_properties()
        # Select chart type
        chart_type = vis_opt.vis_options()
        # sidebar separating line
        sidebar.print_widgets_separator()
        # Select data conversion type
        spectra_conversion_type = vis_opt.convertion_opt()
        # TODO need improvements
        # getting rid of duplicated columns
        df = df.loc[:, ~df.columns.duplicated()]
        #
        # # data manipulation - raw / optimization / normalization
        #
        # TODO delete if not needed
        # Normalization
        # if spectra_conversion_type == LABELS["NORM"]:
        #     df = (df - df.min()) / (df.max() - df.min())
        # Mean Spectra
        if chart_type == 'MS':
            df = df.mean(axis=1).rename('Average').to_frame()
        # columns in main view. Chart, expanders
        # TODO rozwiazac to jakos sprytniej (solve this more cleverly)
        normalized = False
        col_left, col_right = st.beta_columns([5, 2])
        if spectra_conversion_type != "RAW":
            col_right = col_right.beta_expander("Customize spectra",
                                                expanded=False)
            with col_right:
                # per-column (degree, window) params for baseline/smoothing
                vals = data_customisation.get_deg_win(chart_type,
                                                      spectra_conversion_type,
                                                      df.columns)
                if st.checkbox("Data Normalization"):
                    normalized = True
                    df = (df - df.min()) / (df.max() - df.min())
                else:
                    normalized = False
        # For grouped spectra sometimes we want to shift the spectra from each other, here it is:
        with main_expander:
            # TODO the code below needed?
            # trick to better fit sliders in expander
            # _, main_expander_column, _ = st.beta_columns([1, 38, 1])
            # with main_expander_column:
            shift_col, _, trim_col = st.beta_columns([5, 1, 5])
            with shift_col:
                if chart_type == 'GS':
                    shift = data_customisation.separate_spectra(normalized)
                elif chart_type == 'SINGLE':
                    col = st.selectbox('spectrum to plot', df.columns)
                    df = df[[col]]
                else:
                    shift = None
            with trim_col:
                df = vis_utils.trim_spectra(df)
        # data conversion end
        if spectra_conversion_type in {'OPT'}:
            # peakutils-based baseline removal plus rolling-mean smoothing
            baselines = pd.DataFrame(index=df.index)
            baselined = pd.DataFrame(index=df.index)
            flattened = pd.DataFrame(index=df.index)
            for col in df.columns:
                baselines[col] = peakutils.baseline(df[col], vals[col][0])
                baselined[col] = df[col] - baselines[col]
                flattened[col] = baselined[col].rolling(window=vals[col][1],
                                                        min_periods=1,
                                                        center=True).mean()
        #
        # # Plotting
        #
        # Groupped spectra
        if chart_type == 'GS':
            # vertical offsets so grouped spectra do not overlap
            shifters = [(i + 1) * shift for i in range(len(df.columns))]
            plot_df = df if spectra_conversion_type == 'RAW' else flattened
            plot_df = plot_df + shifters
            figs = [
                px.line(plot_df,
                        x=plot_df.index,
                        y=plot_df.columns,
                        color_discrete_sequence=plots_color)
            ]
        # Mean spectra
        elif chart_type == 'MS':
            if spectra_conversion_type == 'RAW':
                plot_df = df
                figs = [
                    px.line(plot_df,
                            x=plot_df.index,
                            y=plot_df.columns,
                            color_discrete_sequence=plots_color)
                ]
            elif spectra_conversion_type in {'OPT'}:
                columns = [
                    'Average', 'Baseline', 'BL-Corrected',
                    'Flattened + BL-Corrected'
                ]
                plot_df = pd.concat([df, baselines, baselined, flattened],
                                    axis=1)
                plot_df.columns = columns
                fig1 = px.line(plot_df,
                               x=plot_df.index,
                               y=columns[-1],
                               color_discrete_sequence=plots_color[3:])
                fig2 = px.line(plot_df,
                               x=plot_df.index,
                               y=plot_df.columns,
                               color_discrete_sequence=plots_color)
                figs = [(fig1, fig2)]
            else:
                raise ValueError(
                    'Unknown conversion type for Mean spectrum chart')
        # 3D spectra
        elif chart_type == 'P3D':
            plot_df = flattened if spectra_conversion_type in {"OPT"} else df
            plot_df = plot_df.reset_index().melt('Raman Shift', plot_df.columns)
            fig = px.line_3d(plot_df,
                             x='variable',
                             y='Raman Shift',
                             z='value',
                             color='variable')
            camera = dict(eye=dict(x=1.9, y=0.15, z=0.2))
            fig.update_layout(
                scene_camera=camera,
                width=1200,
                height=1200,
                margin=dict(l=1, r=1, t=30, b=1),
            )
            figs = [fig]
        # Single spectra
        elif chart_type == 'SINGLE':
            if spectra_conversion_type == 'RAW':
                plot_df = df
                figs = [
                    px.line(plot_df[col], color_discrete_sequence=plots_color)
                    for col in plot_df.columns
                ]
            else:
                columns = [
                    'Average', 'Baseline', 'BL-Corrected',
                    'Flattened + BL-Corrected'
                ]
                figs = []
                plot_df = pd.concat([df, baselines, baselined, flattened],
                                    axis=1)
                plot_df.columns = columns
                fig1 = px.line(plot_df,
                               x=plot_df.index,
                               y=columns[-1],
                               color_discrete_sequence=plots_color[3:]
                               )  # trick for color consistency
                fig2 = px.line(plot_df,
                               x=plot_df.index,
                               y=plot_df.columns,
                               color_discrete_sequence=plots_color)
                fig_tup = (fig1, fig2)
                figs.append(fig_tup)
        else:
            raise ValueError("Something unbelievable has been chosen")
        with col_left:
            charts.show_charts(figs, plots_color, template)
        with col_left:
            st.markdown('')
            link = utils.download_button(plot_df.reset_index(),
                                         f'spectrum.csv',
                                         button_text='Download CSV')
            st.markdown(link, unsafe_allow_html=True)
    else:
        # no data loaded: show the user manual instead
        manual.show_manual()
    authors.show_developers()
# NOTE(review): fragment of a larger script — `force`, `output_folder`,
# `delimiter` and `input_file` are defined above this view (presumably CLI
# arguments); behavior claims below are limited to what is visible here.
assert force, "{} does already exists. Use -f option to overwrite it".format(output_folder)
shutil.rmtree(output_folder)
os.mkdir(output_folder)
# Fix delimiter if not provided or file type has been given
if delimiter == "csv":
    delimiter = ","
elif delimiter == "tsv":
    delimiter = "\t"
if delimiter is None:
    print("Inferring delimiter from input data")
    with open(input_file) as f:
        # sample up to the first 50 lines for detection
        data = [f.readline() for i in range(50)]
    # drop empty strings from reading past EOF
    data = "\n".join([x for x in data if x])
    delimiter = detect(data)
    print(f"Delimiter is '{delimiter}'")
# Import data
print("Loading data from disk")
input_data = None
with open(input_file) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=delimiter)
    next(csv_reader, None)  # skip the header row
    input_data = list(csv_reader)
print("Parsing and checking consistency")
data = []
# every row must have exactly 3 non-empty fields
for i, line in enumerate(input_data):
    assert len(line) == 3, f"Line {i} has a problem: {line}"
    assert all([bool(x) for x in line]), f"Line {i} has a problem: {line}"
def execute(self):
    """Process the Washington state voter and voter-history files.

    Downloads/unpacks the raw archive, auto-detects each file's delimiter
    (the data ships with two possible separators), builds per-voter history
    arrays, cleans and coerces the voter table, and stores the result in
    ``self.processed_file`` with encoding metadata in ``self.meta``.
    """
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()
    new_files = [
        n for n in self.unpack_files(self.main_file, compression="unzip")
        if ("pdf" not in n["name"].lower())
    ]
    # there should be only one voter file
    voter_file = [n for n in new_files if "vrdb" in n["name"].lower()][0]
    hist_files = [n for n in new_files if "history" in n["name"].lower()]
    if not self.ignore_checks:
        # We're already automatically limiting voter_file to one entry
        self.file_check(len([voter_file]), hist_files=len(hist_files))
    # There are two possible separators. Detect it first.
    line = voter_file["obj"].readline().decode()
    delimiter = detect(line)
    # Return to the beginning of the buffer to read the data now that we
    # know what the separator is.
    voter_file["obj"].seek(0)
    df_voter = pd.read_csv(voter_file["obj"],
                           sep=delimiter,
                           encoding="latin-1",
                           dtype=str,
                           error_bad_lines=False)
    df_hist = pd.DataFrame()
    # each history file may use a different separator; detect per file
    for hist_file in hist_files:
        line = hist_file["obj"].readline().decode()
        delimiter = detect(line)
        hist_file["obj"].seek(0)
        temp = pd.read_csv(hist_file["obj"],
                           sep=delimiter,
                           encoding="latin-1",
                           dtype=str)
        df_hist = df_hist.append(temp, ignore_index=True)
    # --- handling the voter history file --- #
    # Need to fix/combine the differently named VoterHistoryID
    # and VotingHistoryID columns
    if {"VotingHistoryID", "VoterHistoryID"}.issubset(df_hist.columns):
        df_hist["VotingHistoryID"] = (df_hist.pop("VoterHistoryID").fillna(
            df_hist.pop("VotingHistoryID")))
    # can't find voter history documentation in any yaml, hardcoding column name
    election_dates = pd.to_datetime(df_hist.loc[:, "ElectionDate"],
                                    errors="coerce").dt
    elections, counts = np.unique(election_dates.date, return_counts=True)

    def convert_date(k):
        # NaT (from errors="coerce") raises ValueError on strftime
        try:
            return k.strftime("%m/%d/%Y")
        except ValueError:
            return "unknown"

    # election date -> {index, count, formatted date}; the index encodes
    # each election as a small int for the sparse history arrays
    sorted_elections_dict = {
        str(k): {
            "index": i,
            "count": int(counts[i]),
            "date": convert_date(k),
        }
        for i, k in enumerate(elections)
    }
    sorted_elections = list(sorted_elections_dict.keys())
    df_hist.loc[:, "all_history"] = election_dates.date.apply(str)
    df_hist.loc[:, "sparse_history"] = df_hist.loc[:, "all_history"].map(
        lambda x: int(sorted_elections_dict[x]["index"]))
    df_hist.loc[:, "county_history"] = df_hist.loc[:, self.config[
        "primary_locale_identifier"]]
    # collapse per-vote rows into per-voter lists
    voter_groups = df_hist.groupby(self.config["voter_id"])
    all_history = voter_groups["all_history"].apply(list)
    sparse_history = voter_groups["sparse_history"].apply(list)
    county_history = voter_groups["county_history"].apply(list)
    df_hist = pd.concat([all_history, sparse_history, county_history],
                        axis=1)
    # --- handling the voter file --- #
    # some columns have become obsolete
    df_voter = df_voter.loc[:, df_voter.columns.isin(self.
                                                     config["column_names"])]
    df_voter = df_voter.set_index(self.config["voter_id"])
    # pandas loads any numeric column with NaN values as floats
    # causing formatting trouble during execute() with a few columns
    # saw this solution in other states (arizona & texas)
    to_numeric = [
        df_voter.loc[:, col].str.isnumeric().all() for col in df_voter.columns
    ]
    df_voter.loc[:, to_numeric] = (
        df_voter.loc[:, to_numeric].fillna(-1).astype(int))
    df_voter = self.config.coerce_numeric(df_voter)
    df_voter = self.config.coerce_strings(
        df_voter,
        exclude=[
            self.config["primary_locale_identifier"],
            self.config["voter_id"],
        ],
    )
    df_voter = self.config.coerce_dates(df_voter)
    # add voter history
    df_voter = df_voter.join(df_hist)
    # Add party_idenitfier dummy values,
    # since WA doesn't have party info
    df_voter.loc[:, self.config["party_identifier"]] = NO_PARTY_PLACEHOLDER
    # Need to remap status codes because the original data are messy
    df_voter["StatusCodeOrig"] = df_voter["StatusCode"]
    df_voter["StatusCode"] = df_voter["StatusCodeOrig"].map(
        self.config["status_codes_remap"])
    if df_voter["StatusCode"].isnull().any():
        missing = df_voter[
            df_voter["StatusCode"].isnull()]["StatusCodeOrig"].to_list()
        logging.warning("Status codes missing from status_codes_remap")
        logging.warning(missing)
    # Check for missing columns; catch error because we're fixing them
    # below
    try:
        self.column_check(list(df_voter.columns))
    except MissingColumnsError:
        pass
    # Make sure all columns are present
    expected_cols = (self.config["ordered_columns"] +
                     self.config["ordered_generated_columns"])
    # Remove the index column to avoid duplication
    expected_cols.remove(self.config["voter_id"])
    df_voter = self.reconcile_columns(df_voter, expected_cols)
    df_voter = df_voter[expected_cols]
    self.meta = {
        "message": f"washington_{datetime.now().isoformat()}",
        "array_encoding": json.dumps(sorted_elections_dict),
        "array_decoding": json.dumps(sorted_elections),
    }
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
        s3_bucket=self.s3_bucket,
    )
def txt2dict(path):
    """Parse a delimiter-separated parameter text file into a settings dict.

    Comment lines (starting with '#') and inline comments are stripped, the
    delimiter is auto-detected from the first remaining line, and each line
    is split into a key/value pair. Values are coerced: comma lists become
    int tuples (or stripped string lists), scalars become floats, and
    anything else goes through ``str2bool``. Missing parameters get defaults
    and a few fields are normalized to lists.

    Args:
        path: Path to the parameter text file.

    Returns:
        dict of parameter name -> coerced value.
    """
    print('getting info from', path)
    with open(path) as f:
        raw_lines = f.readlines()
    # BUG FIX: the original removed items from `lines` while iterating it,
    # which skips the element after each removal — consecutive comment or
    # blank lines survived the filter. Build filtered lists instead.
    lines = []
    for line in raw_lines:
        if line.startswith('#'):
            continue  # full-line comment
        if '#' in line:
            line = line[0:line.index('#')]  # strip inline comment (and newline)
        else:
            line = line.replace('\n', '')
        if line != '':
            lines.append(line)
    delimiter = detect(lines[0])
    print(len(lines), 'lines found in txt_file with', delimiter,
          'as the delimiter')
    input_txt = {}
    try:
        pairs = [item.strip().rsplit(delimiter, 2) for item in lines]
        input_txt = {item[0].strip(): item[1].strip() for item in pairs}
    except IndexError:  # was a bare except; a malformed line lacks a value
        print('failed to read txt_file')
    # coerce values: int tuples / string lists / floats / booleans
    for key, val in input_txt.items():
        if ',' in val:
            try:
                input_txt[key] = tuple(map(int, val.split(',')))
            except ValueError:
                input_txt[key] = [item.strip() for item in val.split(',')]
        else:
            try:
                input_txt[key] = float(val)
            except ValueError:
                input_txt[key] = str2bool(val)
    ### adding some default parameters if missing in info_txt
    input_txt.setdefault('sigma', 0)
    input_txt.setdefault('steps', ['all'])
    input_txt.setdefault('reg_subset', [0, 0])
    input_txt.setdefault('metric', 'mattes')
    input_txt.setdefault('check_ch', input_txt['ch_names'][0])
    input_txt.setdefault('double_register', False)
    #### reasign un-recognized parameters
    if not isinstance(input_txt['ch_names'], list):
        input_txt['ch_names'] = [input_txt['ch_names']]
    if not isinstance(input_txt['drift_corr'], list):
        input_txt['drift_corr'] = [input_txt['drift_corr']]
    if isinstance(input_txt['steps'], str):
        input_txt['steps'] = [input_txt['steps'].lower()]
    elif isinstance(input_txt['steps'], tuple):
        input_txt['steps'] = [s.lower() for s in input_txt['steps']]
    if 'all' in input_txt['steps']:
        input_txt['steps'] = [
            'preshift', 'postshift', 'ants', 'mask', 'n2v', 'clahe'
        ]
    # BUG FIX: the original tested the literal string 'check_ch' against
    # ch_names, so the fallback always fired and silently discarded the
    # user's check_ch setting. Test the configured value instead.
    if input_txt['check_ch'] not in input_txt['ch_names']:
        print(
            'channel defined for similarity_check not recognized, so ch_0 used'
        )
        input_txt['check_ch'] = input_txt['ch_names'][0]
    print(input_txt)
    return input_txt
def get_delimiter(sample_string: str):
    """Detect the delimiter of *sample_string*, escaping any backslash
    (e.g. for use in contexts that re-interpret escape sequences)."""
    delimiter = detect(sample_string)
    return delimiter.replace("\\", "\\\\")
def determineDelimiter(self, lineData):
    """Return the delimiter detected in the given line of data."""
    detected = detect_delimiter.detect(lineData)
    return detected