def clean_parsed_files(import_directory, export_directory):
    """
    Cleans all .csv files in a given directory using methods in the
    XbrlCSVCleaner class and saves the processed files in a given directory.

    Arguments:
        import_directory: Directory containing .csv files to be cleaned
        export_directory: Directory where cleaned files should be saved
    Returns:
        None
    Raises:
        None
    """
    # Generate a list of files to be cleaned
    xbrl_files = [csv for csv in os.listdir(import_directory)
                  if csv.endswith('.csv')]

    # Clean the parsed files from the relevant list
    for file in xbrl_files:
        print('Exporting {}......'.format(file))
        XbrlCSVCleaner.parsed_csv_clean(os.path.join(import_directory, file),
                                        os.path.join(export_directory, file))
        print('Successfully exported {}!'.format(file))
    return None
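# A minimal usage sketch: the /shares paths are the same folders the
# script-level version of this cleaner (further down) operates on, and
# XbrlCSVCleaner is assumed to be importable from this project. With
# os.path.join the trailing slashes are no longer required.
clean_parsed_files('/shares/xbrl_parsed_data/', '/shares/test_parsed_data/')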
import os
import pickle

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


def train(dir, workdir):
    labels = list()
    merged_csv_path = os.path.join(workdir, 'merged.csv')
    # Merge all per-label csvs into one file; the directory name is the
    # label, and each csv is assumed to hold a single embedding row so
    # that labels and rows stay aligned
    merged = open(merged_csv_path, "w+")
    for filename in sorted(os.listdir(dir)):
        new_dir = os.path.join(dir, filename)
        for csv in sorted(os.listdir(new_dir)):
            if csv.endswith('.csv'):
                labels.append(filename)
                fullpath = os.path.join(dir, filename, csv)
                fl = open(fullpath)
                for line in fl:
                    merged.write(line)
                fl.close()
    merged.close()

    le = LabelEncoder().fit(labels)
    labels_num = le.transform(labels)
    embeddings = pd.read_csv(merged_csv_path, header=None).to_numpy()

    clf = SVC(C=1, kernel='linear', probability=True)
    clf.fit(embeddings, labels_num)

    fname = "{}/classifier.pkl".format(workdir)
    print("Saving classifier to '{}'".format(fname))
    # pickle writes binary data, so the file must be opened in 'wb' mode
    with open(fname, 'wb') as f:
        pickle.dump((le, clf), f)
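# A minimal sketch of loading the pickled (encoder, classifier) pair back
# and scoring a new sample; the path and the 128-dim vector shape are
# assumptions for illustration.
import pickle

import numpy as np

with open('workdir/classifier.pkl', 'rb') as f:  # hypothetical path
    le, clf = pickle.load(f)

embedding = np.zeros((1, 128))  # placeholder feature vector
pred = clf.predict(embedding)   # numeric label from the SVC
print(le.inverse_transform(pred), clf.predict_proba(embedding).max())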
def main():
    # Read the parameters
    parser = ArgumentParser()
    parser.add_argument('-f', '--pathologiesFolderPath', required=True,
                        help='The folder with the pathologies data.')
    parser.add_argument('-p', '--pathologies', required=False,
                        help='Specific pathologies to parse. '
                             '(Example: "dementia,tbi")')
    args = parser.parse_args()
    pathologiesFolderPath = os.path.abspath(args.pathologiesFolderPath)

    # Get all pathologies
    pathologiesList = next(os.walk(pathologiesFolderPath))[1]

    if args.pathologies is not None:
        pathologiesToConvert = args.pathologies.split(",")
        pathologiesList = list(set(pathologiesList) &
                               set(pathologiesToConvert))

    print("Converting csvs for pathologies: " + ",".join(pathologiesList))

    # Create the datasets db for each pathology
    for pathologyName in pathologiesList:
        # Initialize metadata and output absolute paths
        CDEsMetadataPath = os.path.join(pathologiesFolderPath, pathologyName,
                                        "CDEsMetadata.json")
        outputDBAbsPath = os.path.join(pathologiesFolderPath, pathologyName,
                                       "datasets.db")

        # Connect to the database
        con = sqlite3.connect(outputDBAbsPath)
        cur = con.cursor()

        # Add the metadata table + rows
        addMetadataInTheDatabase(CDEsMetadataPath, cur)

        # Transform the metadata json into a column name -> column type list
        metadataDictionary = createMetadataDictionary(CDEsMetadataPath)

        # Create the data table with the header
        createDataTable(metadataDictionary, cur)

        # Add all the csvs in the database
        for csv in os.listdir(os.path.join(pathologiesFolderPath,
                                           pathologyName)):
            if csv.endswith('.csv'):
                csvFilePath = os.path.join(pathologiesFolderPath,
                                           pathologyName, csv)
                addCSVInTheDataTable(csvFilePath, metadataDictionary, cur)

        con.commit()
        con.close()
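# Hypothetical invocation (the script filename is an assumption); -p keeps
# only the requested sub-folders via the set intersection above:
#
#   python csvs_to_db.py -f ./pathologies -p "dementia,tbi"
#
# The intersection also silently drops requested names that do not exist
# on disk (folder names here are illustrative):
print(list({"dementia", "tbi", "ppmi"} & set("dementia,tbi".split(","))))
# -> ['dementia', 'tbi'] (set order is arbitrary)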
def main():
    # Read the parameters
    parser = ArgumentParser()
    parser.add_argument('-f', '--pathologiesFolderPath', required=True,
                        help='The folder with the pathologies data.')
    parser.add_argument('-t', '--nodeType', required=True,
                        help='Is this a master or a worker node?')
    args = parser.parse_args()
    pathologiesFolderPath = os.path.abspath(args.pathologiesFolderPath)
    # Note: nodeType is required and parsed, but is not used in this variant

    # Get all pathologies
    pathologiesList = next(os.walk(pathologiesFolderPath))[1]

    # Create the datasets db for each pathology
    for pathologyName in pathologiesList:
        # Initialize metadata and output absolute paths
        CDEsMetadataPath = os.path.join(pathologiesFolderPath, pathologyName,
                                        "CDEsMetadata.json")
        outputDBAbsPath = os.path.join(pathologiesFolderPath, pathologyName,
                                       "datasets.db")

        # Connect to the database
        con = sqlite3.connect(outputDBAbsPath)
        cur = con.cursor()

        # Add the metadata table + rows
        addMetadataInTheDatabase(CDEsMetadataPath, cur)

        # Transform the metadata json into a column name -> column type list
        metadataDictionary = createMetadataDictionary(CDEsMetadataPath)

        # Create the data table with the header
        createDataTable(metadataDictionary, cur)

        # Add all the csvs in the database
        for csv in os.listdir(os.path.join(pathologiesFolderPath,
                                           pathologyName)):
            if csv.endswith('.csv'):
                csvFilePath = os.path.join(pathologiesFolderPath,
                                           pathologyName, csv)
                addCSVInTheDataTable(csvFilePath, metadataDictionary, cur)

        con.commit()
        con.close()
def makefilelist(parent_dir):
    subject_dirs = [os.path.join(parent_dir, d)
                    for d in os.listdir(parent_dir)
                    if os.path.isdir(os.path.join(parent_dir, d))]

    filelist = []
    for d in subject_dirs:
        csv_files = [os.path.join(d, csv)
                     for csv in os.listdir(d)
                     if os.path.isfile(os.path.join(d, csv))
                     and csv.endswith('.csv')]
        filelist.extend(csv_files)
    return filelist
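# Example use, assuming a layout like <parent>/<subject>/<recording>.csv;
# the 'data' folder name here is purely illustrative.
for path in makefilelist('data'):
    print(path)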
def build_list_of_wkt_filenames():
    # Get list of directories
    wkt_subject_dirs = [os.path.join(wkt_parent_dir, d)
                        for d in os.listdir(wkt_parent_dir)
                        if os.path.isdir(os.path.join(wkt_parent_dir, d))]

    # Get list of csv files
    for d in wkt_subject_dirs:
        wkt_csv_files = [os.path.join(d, csv)
                         for csv in os.listdir(d)
                         if os.path.isfile(os.path.join(d, csv))
                         and csv.endswith('.csv')]
        for wkt_file in wkt_csv_files:
            # Note: this assumes an absolute path of fixed depth; the
            # building name is the component at index 10 when the path is
            # split on '/'
            building_name = wkt_file.split("/")[10]
            wkt_buildings.append(building_name)
def update_all_csvs(database_folder="Database files"):  # or use crawlerfolder
    # "Download logs" matches the log location used by start_download below
    log_folder = "Download logs"
    paths = [os.path.join(database_folder, csv)
             for csv in os.listdir(database_folder)
             if csv.endswith('.csv')]

    count, total = 0, 0
    for path in paths:
        c, t = update_database(log_folder, path)
        count += c
        total += t
    print("{}/{} filings have been downloaded: {:.2%} complete. "
          "Continuing from last download.\n"
          .format(count, total, count / total))
    return count, total
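# A quick sketch of a full refresh, assuming the "Database files" and
# "Download logs" folders exist next to the script:
count, total = update_all_csvs()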
def __init__(self, csv):
    # Accept a ready-made list of lines, a local file path, or a URL
    if isinstance(csv, list):
        self.csv = csv
    elif isinstance(csv, str):  # basestring is Python 2 only
        if os.path.exists(csv):
            with open(csv) as f:
                buf = f.read()
            self.csv = buf.split('\n')
        elif csv.startswith('http') and csv.endswith('csv'):
            response = requests.get(csv)
            if response.status_code == 200:
                # response.text is decoded str; .content would be bytes
                self.csv = response.text.split('\n')
            else:
                raise IOError('Failed to download %s' % csv)
        else:
            raise IOError('Unable to parse CSV')
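# Hypothetical usage of the loader above; the class name CSVSource and the
# URL are assumptions for illustration.
source = CSVSource(['a,b,c', '1,2,3'])                 # from a list of lines
# source = CSVSource('local/data.csv')                 # from a file on disk
# source = CSVSource('https://example.com/data.csv')   # from a URL
print(source.csv[0])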
import os
import shutil


def csv_to_tsv():
    # Convert every csv in the working directory to a tab-separated copy,
    # deleting the original
    csvs = [csv for csv in os.listdir('.') if csv.endswith('csv')]
    for csv in csvs:
        with open(csv, 'r') as infile, \
                open(f'{csv.split(".")[0]}.tsv', 'w') as outfile:
            for line in infile:
                line = line.strip().replace(',', '\t')
                outfile.write(f'{line}\n')
        os.remove(csv)

    # Re-sweep existing tsvs so any leftover commas are converted too
    tsvs = [tsv for tsv in os.listdir('.') if tsv.endswith('tsv')]
    for tsv in tsvs:
        with open(tsv, 'r') as infile, open('tmp', 'w') as outfile:
            for line in infile:
                line = line.strip().replace(',', '\t')
                outfile.write(f'{line}\n')
        shutil.move('tmp', tsv)
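# Caveat: the plain .replace(',', '\t') above also splits commas that sit
# inside quoted fields. A sketch using the stdlib csv module handles the
# quoting correctly; the filenames here are illustrative.
import csv

with open('input.csv', newline='') as infile, \
        open('output.tsv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    for row in csv.reader(infile):
        writer.writerow(row)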
import os

import pandas as pd
from multiprocessing.pool import ThreadPool


def start_download(database_folder="Database files", n_threads=9):
    log_folder = "Download logs"
    # Reverse-sorted, so later-named files are processed first
    csvs = sorted([os.path.join(database_folder, csv)
                   for csv in os.listdir(database_folder)
                   if csv.endswith('.csv')])[::-1]
    for csv in csvs:
        pool = ThreadPool(n_threads)  # instantiate multiple threads
        df = pd.read_csv(csv, chunksize=10000)  # df is just an io iterator
        pool.map(download_data_chunk, df)  # run the threads
        pool.close()
        pool.join()  # wait for all to finish
        update_database(log_folder, csv)
    print("all done.")
    return
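# Kick off a full download with the defaults; download_data_chunk and
# update_database are assumed to be defined elsewhere in this module.
if __name__ == '__main__':
    start_download(n_threads=9)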
# Build a list we can compare against later
build_list_of_wkt_filenames()

# Loop through every csv file in the data input directory.
subject_dirs = [os.path.join(building_data_split_by_campus_dir, d)
                for d in os.listdir(building_data_split_by_campus_dir)
                if os.path.isdir(os.path.join(building_data_split_by_campus_dir, d))]

counter = 0
year_building = ''
for d in subject_dirs:
    csv_files = [os.path.join(d, csv)
                 for csv in os.listdir(d)
                 if os.path.isfile(os.path.join(d, csv))
                 and csv.endswith('.csv')]
    for file in csv_files:
        df = pd.read_csv(file, index_col=False)

        # Drop the useless first row
        df.drop(df.index[:1], inplace=True)

        # Rename the data column for easier access
        new_columns = df.columns.values
        new_columns[0] = 'original'
        df.columns = new_columns
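# Aside: mutating df.columns.values in place works, but the same rename via
# the public pandas API is safer across versions; df and the column
# position are taken from the loop above.
df = df.rename(columns={df.columns[0]: 'original'})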
import os
import csv

from flask import Flask, jsonify
from flask_cors import CORS

import data.arff_to_csv

app = Flask(__name__)
CORS(app)

# next line is NOT needed when running from the command line or a terminal
# os.chdir(r'D:\ETF\Master\II semestar - MoE\Biomedicinski signali i sistemi'
#          r'\Seminarski by Bega & Creda\source code\backend_flask')

# Regenerate the csv from the bundled arff data if it is missing
files = [f for f in os.listdir('data') if f.endswith('.csv')]
if len(files) == 0:
    data.arff_to_csv.main()

csv_file = os.path.join('data', 'chronic_kidney_disease.csv')
contents = []  # list of dictionaries
with open(csv_file, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        contents.append(row)


@app.route('/percentage-by-ages')
def percentage_by_ages():
    return jsonify(_percentage_by_ages())


def _percentage_by_ages():
    # index of this list represents an age interval (index 0 represents 0-9,
    # index 1 represents 10-19, ...)
    # write to csv with no index, tab delimited, and quoting all
    # non-numeric data
    print('Exporting cleaned dataframe to {}'.format(export_path))
    df.to_csv(export_path,
              index=False,
              sep='\t',
              line_terminator='\n',
              quotechar='"',
              quoting=csv.QUOTE_NONNUMERIC)
    return None


# Obtain a list of files to clean
path = '/shares/xbrl_parsed_data/'
extension = '.csv'
os.chdir(path)
xbrl_files = [csv for csv in os.listdir(path) if csv.endswith(extension)]

# Clean the parsed files from the relevant list
for file in xbrl_files:
    print('Exporting {}......'.format(file))
    parsed_csv_clean('/shares/xbrl_parsed_data/' + file,
                     '/shares/test_parsed_data/' + file)
    print('Successfully exported {}!'.format(file))
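# Spot-check one exported file by reading it back; the filename is the
# example used during development, and lineterminator must match the '\n'
# written on export.
import pandas as pd

check = pd.read_csv('/shares/test_parsed_data/2020-April_xbrl_data.csv',
                    sep='\t', lineterminator='\n')
print(check.head())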
import timeit

import click


def run(collection, csv, dry, src, type, verbose):
    """
    Curvature: mm (describing the number of mm the carrot has been pulled
        down to center)
    Biomass: mm^2
    Max_width: mm
    Length: mm
    L/W ratio: mm / mm
    Shoulder top / bottom: mm^2
    Tip angle top / bottom: absolute value of angle up/down from center line
    above tipangle top: mm^2
    above tipangle bottom: mm^2
    """
    mask_type = type
    if mask_type is None and not csv:
        click.echo("No mask type specified...")
        click.echo("run 'python phenotype.py --help' to see options")
        return
    if src is None and csv is None:
        click.echo("No source specified...")
        click.echo("run 'python phenotype.py --help' to see options")
        return
    if src is not None and csv is not None:
        click.echo("Both source and csv file specified. "
                   "I can only do one thing at a time.")
        click.echo("run 'python phenotype.py --help' to see options")
        return
    if csv is not None and not csv.endswith(".csv"):
        click.echo("That doesn't look like a csv file.")
        return

    tic = timeit.default_timer()

    if dry:
        collection = None
    else:
        collection = get_collection(collection)

    inserted = 0
    updated = 0

    if src is not None:
        type_map = {
            "binary": BINARY_MASKS_DIR,
            "straight": STRAIGHTENED_MASKS_DIR,
            "detipped": DETIPPED_MASKS_DIR,
        }
        subdirs = get_masks_to_process(src, type_map[mask_type])
        for dir in subdirs:
            for file in dir["files"]:
                try:
                    instance = assemble_instance(file)
                except Exception as e:
                    print(e)
                    instance = None
                    click.secho(file, fg="red")
                if instance is not None:
                    if dry:
                        print(instance)
                    else:
                        try:
                            action = insert_or_update_instance(
                                collection, instance, verbose)
                        except Exception:
                            click.secho(f"failed to insert {file}", fg="red")
                            continue  # 'action' would be unbound here
                        if action == "inserted":
                            inserted += 1
                        elif action == "updated":
                            updated += 1

    if csv is not None:
        instances = assemble_instance_from_csv(csv)
        for instance in instances:
            if dry:
                print(instance)
            else:
                action = insert_or_update_instance(collection, instance)
                if action == "inserted":
                    inserted += 1
                elif action == "updated":
                    updated += 1

    toc = timeit.default_timer()
    duration = toc - tic
    msg = "Inserted %s and updated %s in %.2f seconds." % (inserted, updated,
                                                           duration)
    click.secho(msg, fg="green")
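# Hypothetical invocations of the command above; phenotype.py is named in
# its own help text, but the exact click flag names are assumptions:
#
#   python phenotype.py --src ./masks --type binary --dry
#   python phenotype.py --csv measurements.csv --collection carrots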