def excluded_top_categories():
    """Return excluded top categories, given a known collection of excluded segments."""
    ex_segs = excluded_segments(file_utils.read_csv("excluded_segments.csv"))
    tcs = file_utils.top_category_names()
    ex_tcs = []
    for tc in tcs:
        rows = file_utils.read_csv("top_category_files/" + tc + ".csv")
        # A top category never spans more than one segment, so checking the first row is enough.
        if rows[0]["Segment Name"] in ex_segs:
            ex_tcs.append(tc)
    return ex_tcs
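# Hedged sketch (assumption, not part of the original listing): match_commodities below
# relies on top_category_matcher.non_excluded_top_categories(); presumably it is simply the
# complement of excluded_top_categories() over all known top categories, roughly:
def non_excluded_top_categories_sketch():
    """Illustrative only: top categories whose segment is not excluded."""
    excluded = set(excluded_top_categories())
    return [tc for tc in file_utils.top_category_names() if tc not in excluded]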
def generate_brand_counts_csv():
    """Count Brand occurrences in combined_stock_master_withbrands.csv and save them to brand_counts.csv."""
    try:
        stock_master = file_utils.read_csv("combined_stock_master_withbrands.csv")
        brands = count_field(stock_master, "Brand")
        file_utils.save_csv("brand_counts.csv", brands, fieldnames=["Brand", "Count"])
    except FileNotFoundError:
        print("Warning: files brand_counts.csv and/or combined_stock_master_withbrands.csv were not found. Brand data will not be used.")
def match_commodities(stock_with_top_categories, jaccard_threshold, topn, parallel=True):
    """Match commodities to stocks.
    Requires the CSVs generated by generate_top_category_files to be present in top_category_files/.

    Arguments:
    stock_with_top_categories -- A list of dictionaries, each with keys "Description", "id", "Top Categories", and "Brands".
    jaccard_threshold (float) -- if the Jaccard index of the best match is below this threshold, the search is re-run against all top categories.
    topn (int) -- number of best matches to return for each row.
    parallel (boolean) -- whether to match rows in parallel processes. Defaults to True.

    Returns:
    List of dictionaries with the same keys as stock_with_top_categories, and the keys "Commodity", "Commodity Code", and "Jaccard".
    """
    brands = get_brands()
    abbrevs = file_utils.read_csv("desc_abbrevs.csv")
    #Fetches all the allowed top categories.
    tcs = top_category_matcher.non_excluded_top_categories()
    commodities = {tc: get_commodities_for_top_category(tc, abbrevs) for tc in tcs}
    if parallel:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = []
            for row in stock_with_top_categories:
                futures.append(executor.submit(match_commodities_for_row, row, jaccard_threshold, commodities, brands, topn, abbrevs))
            updated_rows = [future.result() for future in futures]
    else:
        updated_rows = [match_commodities_for_row(row, jaccard_threshold, commodities, brands, topn, abbrevs) for row in stock_with_top_categories]
    return updated_rows
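# Illustration (assumption, not from the original code): the jaccard_threshold above refers
# to the Jaccard index between the token set of a stock description and the token set of a
# candidate commodity, i.e. |A & B| / |A | B|. A minimal sketch:
def jaccard_index_sketch(tokens_a, tokens_b):
    """Illustrative only: Jaccard index between two collections of tokens."""
    a, b = set(tokens_a), set(tokens_b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)
# e.g. jaccard_index_sketch({"steel", "bolt", "m8"}, {"steel", "bolt"}) == 2 / 3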
def get_brands():
    """Return known brand names (lowercased) from brand_counts.csv, or an empty list if the file does not exist."""
    brands = []
    try:
        rows = file_utils.read_csv('brand_counts.csv')
        for row in rows:
            if row["Brand"] != "":
                brands.append(row["Brand"].lower())
    except FileNotFoundError:
        pass
    return brands
def generate_top_category_files(column_name):
    """Split unspsc_codes_v3.csv into one CSV per top category, grouped by column_name, under top_category_files/."""
    file_utils.mkdir("top_category_files")
    rows = file_utils.read_csv('unspsc_codes_v3.csv')
    tcs = {}
    for row in rows:
        if row[column_name] not in tcs:
            tcs[row[column_name]] = []
        tcs[row[column_name]].append(row)
    for tc in tcs:
        filename = "top_category_files/" + tc + ".csv"
        print("Saving " + filename)
        file_utils.save_csv(filename, tcs[tc])
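# Hedged usage sketch (assumption, not from the original listing): regenerating the
# per-top-category files that the matchers above read, grouped by family:
#
#   generate_top_category_files("Family Name")
#
# "Family Name" mirrors the default level of "Family" used by the command-line script
# later in this listing.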
def test_regression(self):
    """Rerun code on regression_test_stock_master.csv and make sure the results match."""
    stock_master = read_csv("regression_test_stock_master.csv")
    rows, fieldnames = commodity_matcher.add_commodities_to_stocks(stock_master[:100])
    all_matched = True
    for i, row in enumerate(rows):
        if row['Commodity'] != stock_master[i]['Commodity']:
            print("Did not match, row " + str(i) + ", " + row['text'])
            print("New: '" + row['Commodity'] + "' vs. Original: '" + stock_master[i]['Commodity'] + "'")
            all_matched = False
    assert len(rows) == 100
    assert sorted(rows[0].keys()) == sorted(fieldnames)
    assert sorted(fieldnames) == sorted(stock_master[0].keys())
    if not all_matched:
        print("WARNING: not all rows matched. If the results above look fine, you may want to update regression_test_stock_master.csv.")
def add_commodities_to_stocks(stock_master, level="Family Name", tc_to_check_count=25, jaccard_threshold=0.3, topn=1, parallel=True, skip_preprocessing=False):
    """stock_master is a list of dicts that must contain keys id, text and Brand. Brand may be an empty string."""
    generate_constant_csvs(level)
    if skip_preprocessing:
        preprocessed = stock_master
    else:
        preprocessed = generate_preprocessed_stocks(stock_master)
    brand_counts = count_field(stock_master, "Brand")
    top_category_strings = file_utils.read_csv("top_category_strings.csv")
    stock_with_top_categories = top_category_matcher.match_preprocessed_to_top_categories(preprocessed, top_category_strings, brand_counts, tc_to_check_count = tc_to_check_count)
    print("Matching commodities")
    stock_with_commodities = match_commodities(stock_with_top_categories, jaccard_threshold=jaccard_threshold, topn=topn, parallel=parallel)
    if skip_preprocessing:
        rows = stock_with_commodities
    else:
        rows = map_preprocessed_to_original(stock_master, stock_with_commodities)
    rows = unpivot_stocks(rows)
    return rows
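# Hedged usage sketch (assumption; the file names and values are placeholders): per the
# docstring above, each input row needs "id", "text" and "Brand" keys, e.g.
#
#   stock_master = [
#       {"id": "1", "text": "hex bolt m8 stainless", "Brand": ""},
#       {"id": "2", "text": "safety glasses clear lens", "Brand": "uvex"},
#   ]
#   matched = add_commodities_to_stocks(stock_master, level="Family Name", topn=1)
#
# The matched rows carry the "Commodity", "Commodity Code" and "Jaccard" values described
# in match_commodities above.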
def get_commodities_for_top_categories(top_categories, abbrevs=[]):
    """Given a list of top categories:
        (1) go through the matching files and
        (2) compose a mapping of all commodities contained in those files.

        Arguments:
        top_categories -- list of names of top categories to fetch
        abbrevs -- abbreviation rows used to preprocess the commodity names

        Returns:
        Dict mapping each commodity name to its "Commodity Code" and preprocessed word set."""
    commodities = {}
    for top_cat in top_categories:
        rows = file_utils.read_csv("top_category_files/" + top_cat + ".csv")
        for row in rows:
            if row["Commodity Name"] in commodities:
                print("Duplicate commodity: " + row["Commodity Name"])
            commodities[row["Commodity Name"]] = {"Commodity Code": row["Commodity"], "Preprocessed": to_base_word_set(row["Commodity Name"], abbrevs)}
    return commodities
Example #9
def match_by_description(site_rows, old_site_rows):
    """Given a list of site_rows, process them into a dictionary of the form
    {"item_id1": {"site1": {"Matches": [...], "Scores": [...], "Stock & Site": [...]}, ...}, ...}.

    Arguments:
    site_rows -- a list of dictionaries representing rows
    old_site_rows -- a list of dictionaries representing rows from previous output

    Returns:
    A dict of dicts of dicts mapping item_ids to sites to matches.
    """
    abbrevs = file_utils.read_csv("desc_abbrevs.csv")
    site_to_descs_preprocessed = preprocess_all(site_rows, abbrevs=abbrevs)
    old_site_to_descs_preprocessed = preprocess_all(old_site_rows,
                                                    abbrevs=abbrevs)
    all_site_to_descs_preprocessed = {}
    for site in (set(site_to_descs_preprocessed.keys())
                 | set(old_site_to_descs_preprocessed.keys())):
        all_site_to_descs_preprocessed[site] = {}
        if site in site_to_descs_preprocessed:
            all_site_to_descs_preprocessed[site].update(
                site_to_descs_preprocessed[site])
        if site in old_site_to_descs_preprocessed:
            all_site_to_descs_preprocessed[site].update(
                old_site_to_descs_preprocessed[site])
    desc_matches = {}
    jobs_new_to_new = generate_jobs(site_rows,
                                    site_to_descs_preprocessed,
                                    abbrevs=abbrevs)
    jobs_new_to_old = generate_jobs(site_rows,
                                    old_site_to_descs_preprocessed,
                                    abbrevs=abbrevs)
    jobs_old_to_new = generate_jobs(old_site_rows,
                                    site_to_descs_preprocessed,
                                    abbrevs=abbrevs)
    nn_desc_matches = jobs_to_desc_matches(jobs_new_to_new,
                                           all_site_to_descs_preprocessed)
    no_desc_matches = jobs_to_desc_matches(jobs_new_to_old,
                                           all_site_to_descs_preprocessed)
    on_desc_matches = jobs_to_desc_matches(jobs_old_to_new,
                                           all_site_to_descs_preprocessed)
    desc_matches = combine_desc_matches(nn_desc_matches, no_desc_matches, 10)
    desc_matches = combine_desc_matches(desc_matches, on_desc_matches, 10)
    return desc_matches
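# Hedged sketch (assumption): reading the nested structure described in the docstring above.
def print_matches_for_item_sketch(desc_matches, item_id, site):
    """Illustrative only: print the match candidates and scores for one item at one site."""
    entry = desc_matches.get(item_id, {}).get(site, {})
    for match, score in zip(entry.get("Matches", []), entry.get("Scores", [])):
        print(item_id, site, match, score)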
def top_category_to_string(top_category_name):
    """Concatenate the unique segment, family, class and commodity names of a top category into one lowercased search string."""
    segment_names = []
    family_names = []
    class_names = []
    commodity_names = []
    rows = file_utils.read_csv("top_category_files/" + top_category_name + ".csv")
    for row in rows:
        seg = row["Segment Name"]
        fam = row["Family Name"]
        cl = row["Class Name"]
        com = row["Commodity Name"]
        if seg not in segment_names:
            segment_names.append(seg)
        if fam not in family_names:
            family_names.append(fam)
        if cl not in class_names:
            class_names.append(cl)
        if com not in commodity_names:
            commodity_names.append(com)
    tc_str = str(segment_names) + " " + str(family_names) + " " + str(class_names) + " " + str(commodity_names)
    tc_str = tc_str.replace("[", "").replace("]", "").replace("'", "").replace(",", "").lower()
    return tc_str
Example #11
import os

from constants import DEBUG, TABLE_COLUMNS, CSV_DIRECTORY, LIVE_SHEET_FILENAME
from postgresql_utils import sql_query, list_columns, list_tables
from file_utils import write_csv, read_csv
from time_utils import get_hours_between_datetimes
from identity_utils import generate_patient_site_uid
from mappers import map_time, map_string_lower, map_episode_unit_type

row_count = 0
patient_data_rows = []
patient_mrns = []
patient_covid_statuses = {}
pcr_sample_times = {}

# Get MRNs of patients in cohort
patient_data_rows = read_csv(os.path.join( \
  CSV_DIRECTORY, 'patient_data.csv'))

patient_mrns = []

live_sheet_rows = read_csv(LIVE_SHEET_FILENAME, remove_duplicates=True)

pcr_sample_times = {}

for row in live_sheet_rows:
  patient_mrn = str(row[0])
  patient_mrns.append(patient_mrn)
  pcr_time = str(row[-7])
  if patient_mrn not in pcr_sample_times:
    pcr_sample_times[patient_mrn] = []
  pcr_sample_times[patient_mrn].append(pcr_time)

import os
from constants import CSV_DIRECTORY, LAB_CANCELLED_FLAGS, LAB_SKIP_VALUES

from postgresql_utils import sql_query
from file_utils import read_csv, write_csv
from time_utils import get_hours_between_datetimes
from identity_utils import generate_patient_uid, generate_patient_site_uid

from mappers import map_float_value, map_time, map_lab_name, \
  map_lab_sample_site, map_lab_result_value, map_observation_name

row_count = 0
patient_data_rows = []
patient_mrns = []
pcr_sample_times = {}

reader = read_csv(os.path.join(CSV_DIRECTORY, 'patient_data.csv'))

for row in reader:
    if not row_count == 0:
        patient_data_rows.append(row)
        patient_mrn = row[0]
        patient_mrns.append(patient_mrn)
        pcr_sample_times[str(patient_mrn)] = row[2]
    row_count += 1

df = sql_query(
    "SELECT * FROM dw_v01.oacis_lb WHERE " +
    "lbres_ck IS NOT NULL AND resultunit IS NOT NULL AND resultdtm IS NOT NULL AND "
    + "specimencollectiondtm > '2020-01-01' AND dossier in (" +
    ", ".join(patient_mrns) + ")")
Example #13
def test_read_csv():
    # test with a file with header
    # delimiter "," , header=True, clean_chars_header=[]
    # expected all lines in test_data.csv should be in a dictionary list
    assert futl.read_csv("test_data.csv") == [{
        '"Name"': "User1",
        '"LastName"': 'User1 Lastname',
        '"Year"': '2010',
        '"Score"': '5'
    }, {
        '"Name"': "User1",
        '"LastName"': 'User1 Lastname',
        '"Year"': '2012',
        '"Score"': '6'
    }, {
        '"Name"': "User1",
        '"LastName"': 'User1 Lastname',
        '"Year"': '2012',
        '"Score"': '4'
    }, {
        '"Name"': "User2",
        '"LastName"': 'User2 Lastname',
        '"Year"': '2011',
        '"Score"': '1'
    }]
    # test with clean_chars_header arguments
    # delimiter "," , header=True
    # expected all '"' char in column names in the header should be removed
    assert futl.read_csv("test_data.csv", clean_chars_header=['"']) == [{
        'Name':
        "User1",
        'LastName':
        'User1 Lastname',
        'Year':
        '2010',
        'Score':
        '5'
    }, {
        'Name':
        "User1",
        'LastName':
        'User1 Lastname',
        'Year':
        '2012',
        'Score':
        '6'
    }, {
        'Name':
        "User1",
        'LastName':
        'User1 Lastname',
        'Year':
        '2012',
        'Score':
        '4'
    }, {
        'Name':
        "User2",
        'LastName':
        'User2 Lastname',
        'Year':
        '2011',
        'Score':
        '1'
    }]
    # test with no header file
    # delimiter ","
    # expected the data set should be created with unique column names
    assert futl.read_csv("test_data_no_header.csv",
                         header=False,
                         clean_chars_header=['"']) == [{
                             'col_1': "User1",
                             'col_2': 'User1 Lastname',
                             'col_3': '2010',
                             'col_4': '5'
                         }, {
                             'col_1': "User1",
                             'col_2': 'User1 Lastname',
                             'col_3': '2012',
                             'col_4': '6'
                         }, {
                             'col_1': "User1",
                             'col_2': 'User1 Lastname',
                             'col_3': '2012',
                             'col_4': '4'
                         }, {
                             'col_1': "User2",
                             'col_2': 'User2 Lastname',
                             'col_3': '2011',
                             'col_4': '1'
                         }]
    # expected: a FileNotFoundError is raised when the file is not found
    with raises(FileNotFoundError):
        futl.read_csv("no_data.csv")
Example #14
        "-d",
        "--match_data",
        help=
        "Filename of json with old matches. If file already exists, read it. If file does not exist, create one based on the results of this run. Generating the description matches in match_data is by far the slowest part, so it is recommended to save it when expecting re-use."
    )
    parser.add_argument(
        "-m",
        "--matches",
        help="Maximum number of matches to return for each row. Default value is 5.",
        type=int,
        default=5)

    args = parser.parse_args()

    sites_rows = file_utils.read_csv(args.filename)
    output_file = args.output
    matches_json = args.match_data
    if not matches_json:
        matches_json = ""
    top_n = args.matches

    stime = time.time()

    if file_utils.file_exists(output_file):
        old_rows = file_utils.read_csv(output_file)
    else:
        old_rows = []

    ndf = pandas.DataFrame(sites_rows)
    odf = pandas.DataFrame(old_rows)
Example #15
icd10_codes = {}

df = sql_query('SELECT * from dw_v01.icd10_cat')
for i, row in df.iterrows():
  icd10_codes[row.categories] = row.description

icd10_codes['U07.1'] = 'COVID-19, virus identified'
icd10_codes['U07.2'] = 'COVID-19, virus not identified'
icd10_codes['I64'] = 'Stroke, not specified as hemorrhage or infarction'
icd10_codes['T13.0'] = 'Superficial injury of lower limb, level unspecified'
icd10_codes['T13.1'] = 'Open wound of lower limb, level unspecified'
icd10_codes['T35.7'] = 'Unspecified frostbite of unspecified site'

for code in icd10_codes:
  icd10_codes[code] = icd10_codes[code].lower()

patient_data_rows = read_csv(os.path.join(CSV_DIRECTORY, 'patient_data.csv'))
patient_mrns = [str(row[0]) for row in patient_data_rows]
padded_patient_mrns = [pad_mrn(x) for x in patient_mrns]

episode_data_rows = read_csv(os.path.join(CSV_DIRECTORY, 'episode_data.csv'))

diagnosis_data_rows = []

#### Add diagnoses from ER visits
df = sql_query("SELECT dossier, noadm, dhreadm FROM " + \
  "dw_test.orcl_cichum_sejurg_live WHERE " + \
  "dossier in ('" + "', '".join(patient_mrns) + "') " + \
  "AND dhreadm > '2020-01-01'")

urg_episode_ids = []
episodes_by_id = {}
from scrape import scraper
from file_utils import read_csv, write_csv, read_file, write_file
from generate import _render_template, preprocess
from image import pass_gen
from mail import sendmail
import json

# Scraping the webpage and storing the data in a csv
data = scraper('http://scrape.kjscecodecell.com/')
write_csv(data)

# Reading the scraped data from the csv and preprocessing the data
participants = read_csv()
participants = preprocess(participants)

# Getting the list of mails to whom mails have already been sent
sent_mails = read_file()

# Looping over all participants
for participant in participants:
    # Checking if the participant was sent a mail previously
    if participant['email'] not in sent_mails:
        name = participant['name']
        email = participant['email']
        phone = participant['phone']
        payment_status = participant['payment']

        # Generating a message from the template
        message = _render_template(name, payment_status)

        # Generating a custom image
Example #17
import subprocess, os
import pandas as pd
import numpy as np

from postgresql_utils import sql_query
from file_utils import read_csv, write_csv
from time_utils import get_hours_between_datetimes
from constants import CSV_DIRECTORY, TABLE_COLUMNS, CENSOR_COLUMNS
from cli_utils import tabulate_columns

MILA_CSV_DIRECTORY = '/data8/projets/Mila_covid19/output/covidb_mila/csv'

patient_data_rows = read_csv(
  os.path.join(CSV_DIRECTORY, 'patient_data.csv'))

patient_mrns = []
pcr_sample_times = {}

for row in patient_data_rows:
  patient_mrn = row[0]
  patient_mrns.append(patient_mrn)
  pcr_sample_times[patient_mrn] = row[2]

df = sql_query("SELECT * FROM dw_v01.dw_rad_examen "+
  "WHERE dossier IN ('S" + "', 'S".join(patient_mrns) + "') " +
  "AND date_heure_exam > '2020-01-01'")

imaged_patient_mrns = []

for index, row in df.iterrows():
  lower_desc = row.description.lower()
import os
import glob
import tkinter as tk
from tkinter import Label, Entry, Button, Toplevel
import subprocess
from Test import getMail
from file_utils import read_csv, write_csv, read_file, write_file
from generate import _render_template, preprocess

# getMail()
global root, root1

root = tk.Tk()
root.geometry('300x300')

data = read_csv()
participants = data
participants = preprocess(participants)


def destroyRoot():
    root.destroy()


def getData():
    emailid = emailentry.get()
    with open('credentials/email.txt', 'w') as emailFile:
        emailFile.write(emailid)

    password_file = passwordentry.get()
    with open('credentials/password.txt', 'w') as passFile:
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Script to allocate items to a UNSPSC product")
    parser.add_argument("filename", help="Filename of the csv file to process.")
    parser.add_argument("-l", "--level", help="Defines the level of top categories to check for matches. Only n categories with the highest probability of containing matches will get checked. Accepts Segment, Family or Class. Default is Family.", choices=["Segment", "Family", "Class"], default="Family")
    parser.add_argument("-n", "--num_to_check", help="Number of top categories to check for each row. Higher values mean slower but more accurate matching. Default value 25.", type=int, default=25)
    parser.add_argument("-o", "--output", help="Save output to file with the given filename. If argument is not present, the output is instead printed to console in an abbreviated form.")
    parser.add_argument("-j", "--jaccard", help="Sets the Jaccard threshold. If the Jaccard score of the best match is below the threshold, reruns the search for all top categories to find the best possible match. Default value is 0.3.", type=float, default=0.3)
    parser.add_argument("-m", "--matches", help="How many matches to return for each row. Default is 1.", type=int, default=1)
    parser.add_argument("-np", "--no_parallel", help="Flag that determines whether to use parallel processing to speed up search.", action="store_true")
    parser.add_argument("-a", "--add_ids", help="Flag that determines whether to add an id column to the data read from the input csv.", action="store_true")
    parser.add_argument("-s", "--skip_preprocessing", help="If set, skip preprocessing steps. This will slow down the processing.", action="store_true")

    args = parser.parse_args()

    stock_master = file_utils.read_csv(args.filename, add_ids=args.add_ids)
    level = args.level
    top_categories_to_check_count = args.num_to_check
    output = args.output
    jac = args.jaccard
    topn = args.matches
    parallel = not args.no_parallel
    skip_preprocessing = args.skip_preprocessing

    stime = time.time()

    if not output:
        stock_master = pandas.DataFrame(stock_master)
        df = add_commodities_to_dataframe(stock_master)
        print(df)
    else:
Example #20
import numpy as np
import pandas as pd

from constants import LOCAL_SITE_CODE, CSV_DIRECTORY, \
  TABLE_COLUMNS, LIVE_SHEET_FILENAME
from time_utils import get_datetime_seconds, \
  get_hours_between_datetimes
from identity_utils import generate_patient_uid, \
  generate_patient_site_uid, generate_accession_uid
from file_utils import write_csv, read_csv
from postgresql_utils import sql_query

from mappers import map_time, map_patient_ramq, \
  map_patient_covid_status, map_patient_age, map_patient_sex

live_sheet_rows = read_csv(LIVE_SHEET_FILENAME, remove_duplicates=True)

all_mrns = list(set([str(row[0]) for row in live_sheet_rows]))

all_tests_obj = [
    [str(row[0]), str(row[-7]), row[-3] == 'External'] \
   for row in live_sheet_rows]

all_tests = [','.join([str(x) for x in y]) \
  for y in all_tests_obj]

total_tests = len(all_tests)

non_er_episodes_df = sql_query(
    "SELECT dossier, dhreadm, dhredep, unitesoinscode FROM " +
    "dw_test.cichum_sejhosp_live WHERE " + "dossier in (" +
Example #21
    # display data set
    print(f"data_x : {data_x}")
    print(f"data_y : {data_y}")

    # build and show plot scatter
    build_plot_scatter(data_x=data_x, data_y=data_y, title="Andrea Henkel Correlation", showTable=True, saveFig=True)


# read the file and get data set
data_folder = Path("data/")

file_to_open = data_folder / "athlete_events.csv"

data_set = read_csv(
    file_to_open,
    delimiter=";",
    clean_chars_header=['"']
)

# display a sample row
print()
print(f"Sample Row : {sample_rows(data_set, 1)}")

# display the columns of data_set created by read_csv
print()
columns = get_columns(data_set)
print(f"Columns : {columns}")

# test box and violin plot
box_violin_plot(data_set)