Example #1
    def clean_parsed_files(import_directory, export_directory):
        """
        Cleans all .csv files in a given directory using methods in the
        XbrlCSVCleaner class and saves the processed files in a given directory

        Arguments:
            import_directory: Directory containing .csv files to be cleaned
            export_directory: Directory where cleaned files should be saved
        Returns:
            None
        Raises:
            None
        """
        # Generate a list of files to be cleaned
        xbrl_files = os.listdir(import_directory)
        xbrl_files = [csv for csv in xbrl_files if csv.endswith('.csv')]

        # Clean the parsed files from the relevant list
        for file in xbrl_files:
            print('Exporting {}......'.format(file))
            XbrlCSVCleaner.parsed_csv_clean(os.path.join(import_directory, file),
                                            os.path.join(export_directory, file))
            print('Successfully exported {}!'.format(file))
        return None
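A minimal usage sketch for the function above, reusing the share paths that
appear later in this listing (adjust them for your own environment):

    clean_parsed_files('/shares/xbrl_parsed_data/', '/shares/test_parsed_data/')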
Example #2
def train(dir, workdir):
    labels = list()
    merged_csv_path = os.path.join(workdir, 'merged.csv')
    merged = open(merged_csv_path, "w+")

    for filename in sorted(os.listdir(dir)):
        new_dir = os.path.join(dir, filename)
        for csv in sorted(os.listdir(new_dir)):
            if csv.endswith('.csv'):
                labels.append(filename)
                fullpath = os.path.join(dir, filename, csv)
                fl = open(fullpath)
                for line in fl:
                    merged.write(line)
                fl.close()
    merged.close()

    le = LabelEncoder().fit(labels)
    labels_num = le.transform(labels)
    embeddings = pd.read_csv(merged_csv_path, header=None).to_numpy()

    clf = SVC(C=1, kernel='linear', probability=True)
    clf.fit(embeddings, labels_num)

    fname = "{}/classifier.pkl".format(workdir)
    print("Saving classifier to '{}'".format(fname))
    with open(fname, 'wb') as f:
        pickle.dump((le, clf), f)

    # Reload the encoder and classifier to verify the pickle round-trips
    with open(fname, 'rb') as f:
        (le, clf) = pickle.load(f)
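A short sketch of how the saved classifier might be used for prediction later.
The probe file name and the workdir path are hypothetical; only standard
numpy/sklearn calls (loadtxt, predict, inverse_transform) are assumed:

    import pickle
    import numpy as np

    with open('workdir/classifier.pkl', 'rb') as f:   # path produced by train()
        le, clf = pickle.load(f)
    probe = np.loadtxt('probe_embeddings.csv', delimiter=',', ndmin=2)  # hypothetical input
    print(le.inverse_transform(clf.predict(probe)))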
Example #3
def main():
    # Read the parameters
    parser = ArgumentParser()
    parser.add_argument('-f',
                        '--pathologiesFolderPath',
                        required=True,
                        help='The folder with the pathologies data.')
    parser.add_argument(
        '-p',
        '--pathologies',
        required=False,
        help='Specific pathologies to parse. (Example: "dementia,tbi")')
    args = parser.parse_args()
    pathologiesFolderPath = os.path.abspath(args.pathologiesFolderPath)

    # Get all pathologies
    pathologiesList = next(os.walk(pathologiesFolderPath))[1]

    if args.pathologies is not None:
        pathologiesToConvert = args.pathologies.split(",")
        pathologiesList = list(
            set(pathologiesList) & set(pathologiesToConvert))
    print("Converting csvs for pathologies: " + ",".join(pathologiesList))

    # Create the datasets db for each pathology
    for pathologyName in pathologiesList:

        # Initializing metadata and output absolute path
        CDEsMetadataPath = os.path.join(pathologiesFolderPath, pathologyName,
                                        "CDEsMetadata.json")
        outputDBAbsPath = os.path.join(pathologiesFolderPath, pathologyName,
                                       "datasets.db")

        # Connect to the database
        con = sqlite3.connect(outputDBAbsPath)
        cur = con.cursor()

        # Add the metadata table + rows
        addMetadataInTheDatabase(CDEsMetadataPath, cur)

        # Transform the metadata json into a column name -> column type list
        metadataDictionary = createMetadataDictionary(CDEsMetadataPath)

        # Create the data table with the header
        createDataTable(metadataDictionary, cur)

        # Add all the csvs in the database
        for csv in os.listdir(
                os.path.join(pathologiesFolderPath, pathologyName)):
            if csv.endswith('.csv'):
                csvFilePath = os.path.join(pathologiesFolderPath,
                                           pathologyName, csv)
                addCSVInTheDataTable(csvFilePath, metadataDictionary, cur)

        con.commit()
        con.close()
Example #4
def main():

    # Read the parameters
    parser = ArgumentParser()
    parser.add_argument('-f',
                        '--pathologiesFolderPath',
                        required=True,
                        help='The folder with the pathologies data.')
    parser.add_argument('-t',
                        '--nodeType',
                        required=True,
                        help='Is this a master or a worker node?')
    args = parser.parse_args()

    pathologiesFolderPath = os.path.abspath(args.pathologiesFolderPath)

    # Get all pathologies
    pathologiesList = next(os.walk(pathologiesFolderPath))[1]

    # Create the datasets db for each pathology
    for pathologyName in pathologiesList:

        # Initializing metadata and output absolute path
        CDEsMetadataPath = os.path.join(pathologiesFolderPath, pathologyName,
                                        "CDEsMetadata.json")
        outputDBAbsPath = os.path.join(pathologiesFolderPath, pathologyName,
                                       "datasets.db")

        # Connect to the database
        con = sqlite3.connect(outputDBAbsPath)
        cur = con.cursor()

        # Add the metadata table + rows
        addMetadataInTheDatabase(CDEsMetadataPath, cur)

        # Transform the metadata json into a column name -> column type list
        metadataDictionary = createMetadataDictionary(CDEsMetadataPath)

        # Create the data table with the header
        createDataTable(metadataDictionary, cur)

        # Add all the csvs in the database
        for csv in os.listdir(
                os.path.join(pathologiesFolderPath, pathologyName)):
            if csv.endswith('.csv'):
                csvFilePath = os.path.join(pathologiesFolderPath,
                                           pathologyName, csv)
                addCSVInTheDataTable(csvFilePath, metadataDictionary, cur)

        con.commit()
        con.close()
Example #5
def makefilelist(parent_dir):
    subject_dirs = [
        os.path.join(parent_dir, dir) for dir in os.listdir(parent_dir)
        if os.path.isdir(os.path.join(parent_dir, dir))
    ]
    filelist = []
    for dir in subject_dirs:
        csv_files = [
            os.path.join(dir, csv) for csv in os.listdir(dir)
            if os.path.isfile(os.path.join(dir, csv)) and csv.endswith('.csv')
        ]
        for file in csv_files:
            filelist.append(file)
    return filelist
Example #6
def build_list_of_wkt_filenames():
    # Get list of directories
    wkt_subject_dirs = [
        os.path.join(wkt_parent_dir, dir) for dir in os.listdir(wkt_parent_dir)
        if os.path.isdir(os.path.join(wkt_parent_dir, dir))
    ]

    # Get list of csv files
    for dir in wkt_subject_dirs:
        wkt_csv_files = [
            os.path.join(dir, csv) for csv in os.listdir(dir)
            if os.path.isfile(os.path.join(dir, csv)) and csv.endswith('.csv')
        ]
        for wkt_file in wkt_csv_files:
            building_name = (wkt_file.split("/")[10])
            wkt_buildings.append(building_name)
Example #7
def update_all_csvs(database_folder):
    # database_folder is typically "Database files" (or the crawler folder) in this project
    paths = [os.path.join(database_folder, csv)
             for csv in os.listdir(database_folder) if csv.endswith('.csv')]

    count, total = 0, 0
    for path in paths:
        c, t = update_database(log_folder, path)
        count += c
        total += t

    print(
        "{}/{} filings have been downloaded: {:.2%} complete. Continuing from last download. \n"
        .format(str(count), str(total), (count / total)))

    return count, total
Example #8
 def __init__(self, csv):
     if isinstance(csv, list):
         self.csv = csv
     elif isinstance(csv, str):  # basestring is Python 2 only
         if os.path.exists(csv):
             with open(csv) as f:
                 buf = f.read()
             self.csv = buf.split('\n')
         elif csv.startswith('http') and csv.endswith('csv'):
             response = requests.get(csv)
             if response.status_code == 200:
                 self.csv = response.text.splitlines()
             else:
                 raise IOError('Failed to download %s' % csv)
         else:
             raise IOError('Unable to locate CSV %s' % csv)
     else:
         raise IOError('Unable to parse CSV')
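A usage sketch for the constructor above; the class name CsvSource is
hypothetical, since only __init__ is shown. Any of the three accepted inputs
works:

    rows = CsvSource(['a,b', '1,2'])                    # already-split lines
    local = CsvSource('/tmp/data.csv')                  # path to an existing file
    remote = CsvSource('https://example.com/data.csv')  # http(s) URL ending in csv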
Example #9
def csv_to_tsv():
    csvs = [csv for csv in os.listdir('.') if csv.endswith('csv')]
    for csv in csvs:
        with open(csv, 'r') as infile, open(f'{csv.split(".")[0]}.tsv', 'w')as outfile:
            for line in infile:
                line = line.strip().replace(',', '\t')
                outfile.write(f'{line}\n')
        os.remove(csv)

    tsvs = [tsv for tsv in os.listdir('.') if tsv.endswith('tsv')]
    for tsv in tsvs:
        with open(tsv, 'r') as infile, open('tmp', 'w') as outfile:
            for line in infile:
                line = line.strip().replace(',', '\t')
                outfile.write(f'{line}\n')
        shutil.move('tmp', tsv)
Example #10
def start_download(database_folder, n_threads=9):

    log_folder = "Download logs"

    # database_folder is typically "Database files" in this project
    csvs = sorted([(os.path.join(database_folder, csv))
                   for csv in os.listdir(database_folder)
                   if csv.endswith('.csv')])[::-1]
    for csv in csvs:

        pool = ThreadPool(n_threads)  # instantiate multiple threads

        df = pd.read_csv(csv, chunksize=10000)  # df is just an io iterator
        pool.map(download_data_chunk, df)  # run the threads
        pool.close()
        pool.join()  # wait for all to finish

        update_database(log_folder, csv)

    print("all done.")
    return
Example #11
# Build a list we can compare against later
build_list_of_wkt_filenames()

# Loop through every csv file in the data input directory.
subject_dirs = [
    os.path.join(building_data_split_by_campus_dir, dir)
    for dir in os.listdir(building_data_split_by_campus_dir)
    if os.path.isdir(os.path.join(building_data_split_by_campus_dir, dir))
]
counter = 0
year_building = ''

for dir in subject_dirs:
    csv_files = [
        os.path.join(dir, csv) for csv in os.listdir(dir)
        if os.path.isfile(os.path.join(dir, csv)) and csv.endswith('.csv')
    ]
    for file in csv_files:
        # print('Reading in ' + file + ' ...\n')

        # print('Importing csv...\n')
        df = pd.read_csv(file, index_col=False)

        # Drop the useless first row
        df.drop(df.index[:1], inplace=True)

        # Rename the first data column for easier access
        df.rename(columns={df.columns[0]: 'original'}, inplace=True)
from flask import Flask
from flask import jsonify
from flask_cors import CORS
import os
import csv
import data.arff_to_csv

app = Flask(__name__)
CORS(app)

# next line is NOT needed when running from command line or terminal
# os.chdir('D:\ETF\Master\II semestar - MoE\Biomedicinski signali i sistemi\Seminarski by Bega & Creda\source code\\backend_flask')
files = [csv for csv in os.listdir('./data') if csv.endswith('.csv')]
if len(files) == 0:
    data.arff_to_csv.main()

csv_file = 'data/chronic_kidney_disease.csv'
contents = []  # list of dictionaries
with open(csv_file, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        contents.append(row)


@app.route('/percentage-by-ages')
def percentage_by_ages():
    return jsonify(_percentage_by_ages())


def _percentage_by_ages():
    # index of this list represents age interval (index 0 represents 0-9, index 1 represents 10-19 ...)
    # write to csv with no index, tab delimited, and quoting all non-numeric
    # data
    print('Exporting cleaned dataframe to {}'.format(export_path))
    df.to_csv(export_path, index=False, sep="\t", line_terminator='\n',
              quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    return None
#parsed_csv_clean('/shares/xbrl_parsed_data/2020-April_xbrl_data.csv',
# '/shares/test_parsed_data/2020-April_xbrl_data.csv')

# Obtain a list of files to clean
path = '/shares/xbrl_parsed_data/'
extension = '.csv'
os.chdir(path)
xbrl_files = os.listdir(path)
xbrl_files = [csv for csv in xbrl_files if csv.endswith(extension)]

# Clean the parsed files from the relevant list
for file in xbrl_files:
    print('Exporting {}......'.format(file))
    parsed_csv_clean('/shares/xbrl_parsed_data/'+file,
                     '/shares/test_parsed_data/'+file)
    print('Successfully exported {}!'.format(file))


#print(df.head())
#import_path = '/shares/test_xbrl_data/'
#df =pd.read_csv(import_path, lineterminator="\n")


def run(collection, csv, dry, src, type, verbose):
    """

    Curvature: mm (describing the number of mm the carrot has been pulled down to center)  

    Biomass: mm^2

    Max_width: mm

    Length: mm

    L/W ratio: mm / mm

    Shoulder top / bottom: mm^2

    Tip angle top / bottom: absolute value of angle up/down from center line

    Above tip angle top / bottom: mm^2

    """
    mask_type = type

    if mask_type is None and not csv:
        click.echo("No mask type specified...")
        click.echo("run 'python phenotype.py --help to see options'")
        return

    if src is None and csv is None:
        click.echo("No source specified...")
        click.echo("run 'python phenotype.py --help to see options'")
        return

    if src is not None and csv is not None:
        click.echo(
            "Both source and csv file specified. I can only do one thing at a time"
        )
        click.echo("run 'python phenotype.py --help to see options'")
        return

    if csv is not None and not csv.endswith(".csv"):
        click.echo("That doesn't look like a csv file.")
        return

    tic = timeit.default_timer()
    if dry:
        collection = None
    else:
        collection = get_collection(collection)

    inserted = 0
    updated = 0

    if src is not None:
        type_map = {
            "binary": BINARY_MASKS_DIR,
            "straight": STRAIGHTENED_MASKS_DIR,
            "detipped": DETIPPED_MASKS_DIR,
        }
        subdirs = get_masks_to_process(src, type_map[mask_type])
        for dir in subdirs:
            for file in dir["files"]:
                try:
                    instance = assemble_instance(file)
                except Exception as e:
                    print(e)
                    instance = None
                    click.secho(file, fg="red")
                if instance is not None:
                    if dry:
                        print(instance)
                    else:
                        try:
                            action = insert_or_update_instance(
                                collection, instance, verbose)
                        except Exception as e:
                            click.secho(f"failed to insert {file}: {e}", fg="red")
                            continue  # skip the counter update when the insert failed
                        if action == "inserted":
                            inserted += 1
                        elif action == "updated":
                            updated += 1

    if csv is not None:
        instances = assemble_instance_from_csv(csv)
        for instance in instances:
            if dry:
                print(instance)
            else:
                action = insert_or_update_instance(collection, instance)
                if action == "inserted":
                    inserted += 1
                elif action == "updated":
                    updated += 1

    toc = timeit.default_timer()
    duration = toc - tic
    msg = "Inserted %s and updated %s in %.2f seconds." % (inserted, updated,
                                                           duration)
    click.secho(msg, fg="green")