def create_location(code, args):
    """Creates a location object from user-input

    Prompts the user on stdin for the country, state, city, and code of a
    location, re-prompting until the required fields (country, code) are
    non-empty, then verifies the code against the database.

    :param code: unique name for location; used as the default location code
                 when the user does not type one at the prompt
    :type code: str
    :param args: ArgumentParser namespace forwarded to the database connection
    :return: location
    :rtype: dict
    """
    location = {}
    location["country"] = input("Location country (required): ") or None
    location["state"] = input("Location state: ") or ""
    location["city"] = input("Location city: ") or ""
    # Fall back to the caller-supplied code so the (previously unused) `code`
    # argument serves as a default instead of being ignored.
    location["code"] = input(
        "Location code (required, must be unique): ") or code or None
    # Required fields: keep prompting until the user supplies a value.
    while location["country"] is None:
        location["country"] = input(
            "Country must be identified. Please enter a country for the location: "
        ) or None
    while location["code"] is None:
        location["code"] = input(
            "A location code must be provided. Please enter a location code for the location: "
        ) or None
    # The code is finally defined, so we need to validate if the code can be
    # used.
    conn = connect(args)
    # Was it already claimed? If so, does it match... If it matches, great! Don't create it.
    validate.location_exists(conn, location)
    location_id = find.find_location(conn, args, location["code"])
    # Location by code not in database — the caller is responsible for
    # inserting it (no insert is performed here).
    if location_id is None:
        pass
    # BUG FIX: previously ended with a bare `return`, yielding None even
    # though the docstring (and callers such as design()) expect the dict.
    return location
def process(args):
    """Imports hardcoded values for Setaria database. Many items are placeholder values."""
    # =======================================
    # ========= Database Connection =========
    # =======================================
    try:
        conn = connect()
    except:
        raise
    # =======================================
    # ========== Experiment Design ==========
    # =======================================
    # What the database needs in order to create an 'experiment' is the follow
    # Species: maize (Zea mays)
    # Population: Maize282
    # Chromosome: 10 (just the number and a way to generate its unique name)
    # Line: 282set_B73 (B73) -- taken from file if possible
    # Genotype Version: B73 RefGen_v4_AGPv4_Maize282 (the reference genome)
    # Growout, Type, and Location:
    #     Location: code, city, state, country
    #               "PU", "West Lafayette", "Indiana", "United States"
    #     Type: "field", "phenotyper", etc.
    #     Growout: name, population ID, location ID, year, type
    #              "PU09", maize282popID, PUlocID, 2009, fieldGrowoutTypeID
    # Traits (planned phenotypes/traits to measure)

    # Expected User Input
    # Species
    species_shortname = 'setaria'  # setaria
    species_binomial = 'Setaria italica'  # Setaria italica OR Setaria viridis ???
    species_subspecies = None
    species_variety = None
    # Population
    population_name = 'SetariaPopulationName'
    # Chromosome
    chromosome_count = 9  # As defined by `awk -F'\t' '!a[$1]++{print NR":"$0}' 2.from12.setaria.maf0.1.maxMissing0.1.allLines.012.pos`
    # Line
    lines_filename = Template(
        '${cwd}/${chr}_${shortname}.012.indv'
    )  # NOTE(tparker): Can use any chromosome, as they are the same for each. In the future, this the extraneous copies of the lines may be removed and there will be one specific line file, much like the phenotype files
    # Genotype Version
    # NOTE(tparker): This is possibly just the info about the reference genome
    # It is likely included with the VCF genotype file (.012).
    genotype_version_assembly_name = 'SetariaGenotypeVersionAssemblyName'
    genotype_version_annotation_name = 'SetariaAnotationVersionName'  # NOTE(tparker): Not sure where to find this info or who names it
    reference_genome_line_name = 'REF_REF_REF_REF'  # Placeholder
    # Growout, Type, and Location
    # NOTE(tparker): Unknown at this time
    ## Location
    ## Type
    ## Growout
    #
    # Traits
    phenotype_filename = Template('${cwd}/${growout}.ph.csv')

    # Model Construction & Insertion
    if not args.debug:
        # Species: insert, then re-find the ID so reruns are idempotent.
        s = species(species_shortname, species_binomial, species_subspecies,
                    species_variety)
        species_id = insert.insert_species(conn, s)
        species_id = find.find_species(conn, species_shortname)  # For idempotence
        # Population
        p = population(population_name, species_id)
        population_id = insert.insert_population(conn, p)
        population_id = find.find_population(
            conn, population_name)  # For idempotence
        if args.verbose:
            print(f'[Insert]\tPopulation ID\t{population_id}')
        # Chromosome
        chromosome_ids = insert.insert_all_chromosomes_for_species(
            conn, chromosome_count, species_id)
        # Line
        working_filepath = lines_filename.substitute(
            dict(chr="chr1",
                 cwd=f"{args.working_directory}",
                 shortname=species_shortname))
        try:
            if not os.path.isfile(working_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        working_filepath)
        except:
            raise
        line_ids = insert.insert_lines_from_file(
            conn, working_filepath, population_id
        )  # hard-coded substitue until just one file is used for lines
        # Genotype Version
        reference_genome_id = find.find_line(conn, reference_genome_line_name,
                                             population_id)
        gv = genotype_version(genotype_version_assembly_name,
                              genotype_version_annotation_name,
                              reference_genome=reference_genome_id,
                              genotype_version_population=population_id)
        genotype_version_id = insert.insert_genotype_version(conn, gv)
        genotype_version_id = find.find_genotype_version(
            conn, genotype_version_assembly_name)
        # Growout, Type, and Location
        # NOTE(tparker): Unknown at this time
        ## Location
        ## Type
        ## Growout

        # Traits
        # Go through all the phenotype files available for the dataset and insert
        # the recorded traits for each.
        try:
            if not os.path.isfile(
                    phenotype_filename.substitute(
                        dict(growout="phenotyper",
                             cwd=f"{args.working_directory}"))):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    phenotype_filename.substitute(
                        dict(growout="phenotyper",
                             cwd=f"{args.working_directory}")))
        except:
            raise
        traits = list(
            pd.read_csv(phenotype_filename.substitute(
                dict(growout="phenotyper", cwd=f"{args.working_directory}")),
                        index_col=0))
        trait_ids = insert.insert_traits_from_traitlist(conn, traits)
    # DEBUG
    else:
        # Debug mode: no database writes; IDs are simulated with randint.
        print('Experiment Design\n=======================================')
        # Species
        s = species(species_shortname, species_binomial, species_subspecies,
                    species_variety)
        print('\n------------------------\nSpecies\n------------------------')
        print(s)
        species_id = randint(1, 1000)
        print(f'Species ID set to {species_id}')
        # Population
        p = population(population_name, species_id)
        print(
            '\n------------------------\nPopulation\n------------------------')
        print(p)
        population_id = randint(1, 1000)
        print(f'Population ID set to {population_id}')
        # Chromosome
        print(
            '\n------------------------\nChromosome (from file)\n------------------------'
        )
        print(
            f'insert_all_chromosomes_for_species(conn, {chromosome_count}, {species_id})'
        )
        # Line
        print(
            '\n------------------------\nLines (from file)\n------------------------'
        )
        print(
            f'insert_lines_from_file(conn, {lines_filename.substitute(dict(chr="chr1", cwd=f"{args.working_directory}", shortname=species_shortname))}, {population_id})'
        )
        # Genotype Version
        reference_genome_id = None
        line_id = randint(1, 1000)
        print(
            '\n------------------------\nLine ID (Reference Genome)\n------------------------'
        )
        print(f'Line ID set to {line_id}')
        gv = genotype_version(genotype_version_assembly_name,
                              genotype_version_annotation_name,
                              reference_genome=line_id,
                              genotype_version_population=population_id)
        print(
            '\n------------------------\nGenotype Version\n------------------------'
        )
        print(gv)
        genotype_version_id = randint(1, 1000)
        print(f'Genotype Version ID set to {genotype_version_id}')
        # Growout, Type, and Location
        # NOTE(tparker): Unknown at this time
        ## Location
        ## Type
        ## Growout
        # Traits
        # Go through all the phenotype files available for the dataset and insert
        # the recorded traits for each.
        print('\n------------------------\nTraits\n------------------------')
        print('Trait (from file)')
        # NOTE(review): phenotype_filename requires the 'growout' key, but this
        # substitute() passes 'chr' instead — this likely raises KeyError in
        # debug mode; verify against a debug run.
        print(
            f'list(pd.read_csv({phenotype_filename.substitute(dict(chr="chr1", cwd=f"{args.working_directory}"))}, index_col=0))'
        )
        traits = ['weight', 'height', 'root_angle']
        print(f'insert.insert_traits_from_traitlist(conn, {traits})')
        trait_ids = [randint(1, 1000) for t in traits]
        print(f'Trait IDs set to {trait_ids}')

    # # =====================================
    # # ========== Pipeline Design ==========
    # # =====================================
    # # GWAS Algorithm: "MLMM", "EMMAx", "GAPIT", "FarmCPU"
    # # Imputation Method: "impute to major allele"
    # # Kinship Algorithm: "loiselle"
    # # Population Structure Algorithm: "Eigenstrat"

    # Expected User Input
    # GWAS Algorithm
    gwas_algorithm_name = 'MLMM'  # According to Greg's README
    # Imputation Method
    imputation_method_name = 'SetariaImputationMethodName'  # Unknown, apparently it was done by someone named Sujan
    # Kinship Algorithm
    kinship_algorithm_name = 'AstleBalding synbreed (placeholder)'  # Placeholder, I don't know the exact string that should be used
    # Population Structure Algorithm
    population_structure_algorithm_name = 'Eigenstrat'  # This is a guess based on filename

    if not args.debug:
        # Model Construction & Insertion
        # GWAS Algorithm
        ga = gwas_algorithm(gwas_algorithm_name)
        gwas_algorithm_id = insert.insert_gwas_algorithm(conn, ga)
        # Imputation Method
        im = imputation_method(imputation_method_name)
        imputation_method_id = insert.insert_imputation_method(conn, im)
        # Kinship Algorithm
        ka = kinship_algorithm(kinship_algorithm_name)
        kinship_algorithm_id = insert.insert_kinship_algorithm(conn, ka)
        # Population Structure Algorithm
        psa = population_structure_algorithm(
            population_structure_algorithm_name)
        population_structure_algorithm_id = insert.insert_population_structure_algorithm(
            conn, psa)
    else:
        print('\n\nPipeline Design\n=======================================')
        # Model Construction & Insertion
        # GWAS Algorithm
        print(
            '\n------------------------\nGWAS Algorithm\n------------------------'
        )
        print(f'insert_gwas_algorithm(conn, {gwas_algorithm_name})')
        gwas_algorithm_id = randint(1, 1000)
        print(f'GWAS Algorithm ID set to {gwas_algorithm_id}')
        # Imputation Method
        print(
            '\n------------------------\nImputation Method\n------------------------'
        )
        print(
            f'insert.insert_imputation_method(conn, {imputation_method_name})')
        imputation_method_id = randint(1, 1000)
        print(f'Imputation method ID set to {imputation_method_id}')
        # Kinship Algorithm
        print(
            '\n------------------------\nKinship Algorithm\n------------------------'
        )
        print(
            f'insert.insert_kinship_algorithm(conn, {kinship_algorithm_name})')
        kinship_algorithm_id = randint(1, 1000)
        print(f'Kinship algorithm ID set to {kinship_algorithm_id}')
        # Population Structure Algorithm
        print(
            '\n------------------------\nPopulation Structure Algorithm\n------------------------'
        )
        print(
            f'insert.insert_population_structure_algorithm(conn, {population_structure_algorithm_name})'
        )
        population_structure_algorithm_id = randint(1, 1000)
        print(
            f'Population structure algorithm ID set to {population_structure_algorithm_id}'
        )

    # ===========================================
    # ========== Experiment Collection ==========
    # ===========================================
    # Phenotype (external source?)
    #       This needs to be standardized to a .pheno filetype.
    #       For now, it is the longFormat for the Maize282 datset
    #       5.mergedWeightNorm.LM.rankAvg.longFormat.csv, but for Setaria will be
    # Genotype (VCF output)
    # Variant (VCF output)

    # Expected User Input
    # Phenotype
    # NOTE(tparker): Define in earlier stage
    # Genotype
    genotype_filename = Template('${cwd}/${chr}_${shortname}.012')
    # Variants
    variants_filename = Template('${cwd}/${chr}_${shortname}.012.pos')

    if not args.debug:
        # Model Construction & Insertion
        # Phenotype
        try:
            if not os.path.isfile(
                    phenotype_filename.substitute(
                        dict(growout="phenotyper",
                             cwd=f"{args.working_directory}"))):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    phenotype_filename.substitute(
                        dict(growout="phenotyper",
                             cwd=f"{args.working_directory}")))
        except:
            raise
        phenotype_ids = insert.insert_phenotypes_from_file(
            conn,
            phenotype_filename.substitute(
                dict(growout="phenotyper", cwd=f"{args.working_directory}")),
            population_id)
        # Genotype: verify all per-chromosome files exist before any insert.
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = find.find_chromosome(conn, chromosome_shortname,
                                                 species_id)
            geno_filename = genotype_filename.substitute(
                dict(chr=chromosome_shortname,
                     cwd=f'{args.working_directory}',
                     shortname=species_shortname))
            line_filename = lines_filename.substitute(
                dict(chr=chromosome_shortname,
                     cwd=f'{args.working_directory}',
                     shortname=species_shortname))
            try:
                if not os.path.isfile(geno_filename):
                    raise FileNotFoundError(errno.ENOENT,
                                            os.strerror(errno.ENOENT),
                                            geno_filename)
                if not os.path.isfile(line_filename):
                    raise FileNotFoundError(errno.ENOENT,
                                            os.strerror(errno.ENOENT),
                                            line_filename)
            except:
                raise
            # NOTE(review): the actual genotype insert is commented out, so
            # this loop currently only validates file existence.
            # genotype_ids = insert.insert_genotypes_from_file(conn, geno_filename, line_filename, chromosome_id, population_id, genotype_version_id)
        # Variants
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = find.find_chromosome(conn, chromosome_shortname,
                                                 species_id)
            variant_filename = variants_filename.substitute(
                dict(chr=chromosome_shortname,
                     cwd=f'{args.working_directory}',
                     shortname=species_shortname))
            try:
                if not os.path.isfile(variant_filename):
                    raise FileNotFoundError(errno.ENOENT,
                                            os.strerror(errno.ENOENT),
                                            variant_filename)
            except:
                raise
            variant_ids = insert.insert_variants_from_file(
                conn, variant_filename, species_id, chromosome_id)
    else:
        print(
            '\n\nExperiment Collection\n======================================='
        )
        # Model Construction & Insertion
        # Phenotype
        print(
            '\n------------------------\nPhenotypes\n------------------------')
        # NOTE(review): passes 'chr' where the template expects 'growout' —
        # likely raises KeyError in debug mode; verify.
        print(
            f'insert.insert_phenotypes_from_file(conn, {phenotype_filename.substitute(dict(chr="chr1", cwd=f"{args.working_directory}"))}, {population_id})'
        )
        # Genotype
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            # chromosome_id = find.find_chromosome(conn, chromosome_shortname, species_id)
            chromosome_id = randint(1, 1000)
            print(f'Chromosome ID set to {chromosome_id}')
            # NOTE(review): these templates expect 'chr' and 'shortname' keys,
            # not 'chromosome_shortname' — likely KeyError in debug mode; verify.
            geno_filename = genotype_filename.substitute(
                dict(chromosome_shortname=chromosome_shortname,
                     cwd=f"{args.working_directory}"))
            line_filename = lines_filename.substitute(
                dict(chromosome_shortname=chromosome_shortname,
                     cwd=f"{args.working_directory}"))
            print(
                f'insert.insert_genotypes_from_file(conn, {geno_filename}, {line_filename}, {chromosome_id}, {population_id}, {line_id})'
            )
            genotype_ids = [randint(1, 1000) for g in range(1, 25)]
            print(f'Genotype IDs set to {genotype_ids}')
        # Variants
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            # chromosome_id = find.find_chromosome(conn, chromosome_shortname, species_id)
            chromosome_id = randint(1, 1000)
            print(f'Chromosome ID set to {chromosome_id}')
            variant_filename = variants_filename.substitute(
                dict(chromosome_shortname=chromosome_shortname))
            print(
                f'insert.insert_variants_from_file(conn, {variant_filename}, {species_id}, {chromosome_id})'
            )

    # =========================================
    # ========== Pipeline Collection ==========
    # =========================================
    # Kinship
    # Setaria Kinship is stored in:
    ## /shares/ibaxter_share/gziegler/SetariaGWASPipeline/data/genotype/6.AstleBalding.synbreed.kinship.rda
    ## Exported the file to CSV using R
    ### load('6.AstleBalding.synbreed.kinship.rda')
    ### write.csv(kinship, '6.AstleBalding.synbreed.kinship.csv')
    # Population Structure

    # Expected User Input
    # Kinship
    # NOTE(tparker): Currently the database just stores the filename.
    #                There is no service to upload the file to database's
    #                host, so there's no single location to find the file
    #                I would like to find out why this is the case and if
    #                it would just be better to store it in the database and
    #                allow the user to export the table themselves as a CSV.
    kinship_filepath = f'{args.working_directory}/6.AstleBalding.synbreed.kinship.csv'
    # Population Structure
    # NOTE(tparker): Same reasoning as the kinship file. There should be a way
    #                for the data to be stored in the database, not a
    population_structure_filepath = f'{args.working_directory}/6.Eigenstrat.population.structure.50PCs.csv'

    if not args.debug:
        # Model Construction & Insertion
        # Kinship
        try:
            if not os.path.isfile(kinship_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        kinship_filepath)
            if not os.path.isfile(population_structure_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        population_structure_filepath)
        except:
            raise
        k = kinship(kinship_algorithm_id, kinship_filepath)
        kinship_id = insert.insert_kinship(conn, k)
        # Population Structure
        ps = population_structure(population_structure_algorithm_id,
                                  population_structure_filepath)
        population_structure_id = insert.insert_population_structure(conn, ps)
    else:
        print(
            '\n\nPipeline Collection\n=======================================')
        # Model Construction & Insertion
        # Kinship
        k = kinship(kinship_algorithm_id, kinship_filepath)
        print('\n------------------------\nKinship\n------------------------')
        print(f'insert.insert_kinship(conn, {k})')
        kinship_id = randint(1, 1000)
        # Population Structure
        print(
            '\n------------------------\nPopulation Structure Algorithm\n------------------------'
        )
        try:
            if not os.path.isfile(population_structure_filepath):
                raise FileNotFoundError
        except:
            raise
        else:
            ps = population_structure(population_structure_algorithm_id,
                                      population_structure_filepath)
            print(f'insert.insert_population_structure(conn, {ps})')
            population_structure_id = randint(1, 1000)
            print(f'Population structure ID set to {population_structure_id}')

    # =============================================
    # ================== Results ==================
    # =============================================
    # GWAS Run
    # GWAS Results

    # Expected User Input
    # GWAS Run & results
    gwas_filename = f'{args.working_directory}/placeholder_gwas_results.csv'
    # The following values (0.2, 0.2, and 0.1) were all taken from the Maize282 import
    # NOTE(tparker): Make sure to double check with Greg on what the true values should be
    # Also, double check the source of the pipeline to see if there is any
    # indication what the values shoudl be.
    missing_snp_cutoff_value = 0.2
    missing_line_cutoff_value = 0.2
    minor_allele_frequency_cutoff_value = 0.1

    if not args.debug:
        # Model Construction & Insertion
        # GWAS Run
        # NOTE(tparker): Check with Greg on what the imputation method was used. I believe it was
        # set by someone named Sujan because imputation was done beforehand
        try:
            if not os.path.isfile(gwas_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        gwas_filename)
        except:
            raise
        imputation_method_id = find.find_imputation_method(
            conn, imputation_method_name)
        gwas_run_ids = insert.insert_gwas_runs_from_gwas_results_file(
            conn, gwas_filename, gwas_algorithm_id, reference_genome_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            minor_allele_frequency_cutoff_value, imputation_method_id,
            kinship_id, population_structure_id)
        # GWAS Results
        gwas_result_ids = insert.insert_gwas_results_from_file(
            conn, species_id, gwas_filename, gwas_algorithm_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            imputation_method_id, reference_genome_id, kinship_id,
            population_structure_id, minor_allele_frequency_cutoff_value)
    else:
        print('\n\nResults\n=======================================')
        # Model Construction & Insertion
        # GWAS Run
        # NOTE(tparker): Check with Greg on what the imputation method was used. I believe it was
        # set by someone named Sujan because imputation was done beforehand
        print(
            '\n------------------------\nImputation Method\n------------------------'
        )
        # Imputation Method ID was already set in a previous set. If this was done at a
        # time, then it will have to be searched for in the database.
        print(f'Imputation method ID set to {imputation_method_id}')
        print('\n------------------------\nGWAS Run\n------------------------')
        try:
            if not os.path.isfile(gwas_filename):
                raise FileNotFoundError
        except:
            raise
        else:
            print(
                f'insert.insert_gwas_runs_from_gwas_results_file(conn, {gwas_filename}, {gwas_algorithm_id}, {reference_genome_id}, {missing_snp_cutoff_value}, {missing_line_cutoff_value}, {minor_allele_frequency_cutoff_value}, {imputation_method_id}, {kinship_id}, {population_structure_id})'
            )
            gwas_run_ids = [randint(1, 1000) for g in range(1, 15)]
            print(f'GWAS run IDs set to {gwas_run_ids}')
            # GWAS Results
            print(
                '\n------------------------\nGWAS Result\n------------------------'
            )
            print(
                f'insert.insert_gwas_results_from_file(conn,{species_id},{gwas_filename},{gwas_algorithm_id},{missing_snp_cutoff_value},{missing_line_cutoff_value},{imputation_method_id},{reference_genome_id},{kinship_id},{population_structure_id},{minor_allele_frequency_cutoff_value})'
            )
            gwas_result_ids = [randint(1, 1000) for g in range(1, 15)]
            print(f'GWAS result IDs set to {gwas_result_ids}')
def validate(args):
    """Validate specified input files for import into GWAS database

    This function validates that the contents of a file to contain correct data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
    """
    logging.info("Collecting and identifying input files.")
    try:
        conn = connect(args)
    except:
        raise
    # Input file preprocessing and validation
    try:
        if not os.path.isfile(args.filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    args.filename)
        else:
            with open(args.filename) as f:
                dp = json.load(f)  # data parameters
            # Verify that all necessary values were provided, assuming a complete dataset
            # NOTE(review): 'kinship_algortihm_name' is misspelled but must
            # match the JSON configuration files as-is; confirm before renaming.
            expected_fields = [
                "species_shortname", "species_binomial_name",
                "species_subspecies", "species_variety", "population_name",
                "number_of_chromosomes", "genotype_version_assembly_name",
                "genotype_version_annotation_name",
                "reference_genome_line_name", "gwas_algorithm_name",
                "imputation_method_name", "kinship_algortihm_name",
                "population_structure_algorithm_name", "kinship_filename",
                "population_structure_filename", "gwas_run_filename",
                "gwas_results_filename", "missing_SNP_cutoff_value",
                "missing_line_cutoff_value",
                "minor_allele_frequency_cutoff_value", "phenotype_filename"
            ]
            missing_keys = []
            for k in expected_fields:
                if k not in dp:
                    missing_keys.append(k)
            if missing_keys:
                raise KeyError(
                    f'The following keys are required. Please include them in your JSON configuration: {missing_keys}'
                )
            # Check for all required fields
            required_fields = [
                "species_shortname", "species_binomial_name",
                "population_name", "number_of_chromosomes",
                "genotype_version_assembly_name",
                "genotype_version_annotation_name",
                "reference_genome_line_name", "gwas_algorithm_name",
                "imputation_method_name", "kinship_algortihm_name",
                "population_structure_algorithm_name", "kinship_filename",
                "population_structure_filename", "gwas_run_filename",
                "gwas_results_filename", "missing_SNP_cutoff_value",
                "missing_line_cutoff_value",
                "minor_allele_frequency_cutoff_value", "phenotype_filename"
            ]
            empty_fields = []
            for rf in required_fields:
                if not dp[rf]:
                    empty_fields.append(rf)
            if empty_fields:
                raise KeyError(
                    f'The following keys must be defined. Empty strings are not permitted. Please modify your JSON configuration: {empty_fields}'
                )
            logging.info(
                'Configuration file is valid. Verifying that all files exist.')
            # Track all the files to check for existance
            # NOTE(review): '$(unknown)' is not a valid string.Template
            # placeholder ('$(' triggers ValueError on substitute); this looks
            # like it should be '${filename}' — verify against the original.
            locations = []
            filepath_template = Template('${cwd}/$(unknown)')
            # Verify that all files exist
            # Lines
            lines_filename = Template('${chr}_${shortname}.012.indv')
            # Genotype
            genotype_filename = Template('${chr}_${shortname}.012')
            # Variants
            variants_filename = Template('${chr}_${shortname}.012.pos')
            for c in range(1, dp['number_of_chromosomes'] + 1):
                chr_shortname = 'chr' + str(c)
                lines_filepath = lines_filename.substitute(
                    dict(cwd=args.working_directory,
                         shortname=dp['species_shortname'],
                         chr=chr_shortname))
                genotype_filepath = genotype_filename.substitute(
                    dict(cwd=args.working_directory,
                         shortname=dp['species_shortname'],
                         chr=chr_shortname))
                variants_filepath = variants_filename.substitute(
                    dict(cwd=args.working_directory,
                         shortname=dp['species_shortname'],
                         chr=chr_shortname))
                locations.append(
                    dict(cwd=args.working_directory,
                         filetype='line',
                         filename=lines_filepath))
                locations.append(
                    dict(cwd=args.working_directory,
                         filetype='genotype',
                         filename=genotype_filepath))
                locations.append(
                    dict(cwd=args.working_directory,
                         filetype='variant',
                         filename=variants_filepath))
            # Go through all the single files that are not named based off of a chromsome
            # Construct the file descriptor dictionaries, and then loop through and test each file's existance
            # phenotype_filename = Template('${cwd}/${growout}.ph.csv') # Welp, this is another instance of pheno file issue
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='kinship',
                     filename=dp['kinship_filename']))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='population_structure',
                     filename=dp['population_structure_filename']))
            # Since there can be more than one file for the phenotypes, results, and run
            # For each array in the configuration file, add it to the list of paths to
            # verify as existing
            for configuration_entry in dp:
                if isinstance(dp[configuration_entry], list):
                    for filename in dp[configuration_entry]:
                        locations.append(
                            dict(cwd=args.working_directory,
                                 filetype=configuration_entry,
                                 filename=filename))
                else:
                    # For any of the entries that CAN be a list, add their single values to
                    # the file list
                    if configuration_entry in [
                            'phenotype_filename', 'gwas_run_filename',
                            'gwas_results_filename'
                    ]:
                        locations.append(
                            dict(cwd=args.working_directory,
                                 filetype=configuration_entry,
                                 filename=dp[configuration_entry]))
            logging.debug("FILE LOCATIONS: %s", locations)
            for file_descriptor in locations:
                file_path = filepath_template.substitute(file_descriptor)
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(errno.ENOENT,
                                            os.strerror(errno.ENOENT),
                                            file_path)
            logging.info(f'Found all files. Validating file contents.')
            # Validate the contents of each file, dispatching on filetype.
            for file_descriptor in locations:
                ft = file_descriptor['filetype']
                fp = filepath_template.substitute(file_descriptor)
                if ft == 'line':
                    validate_line(conn, args, fp)
                elif ft == 'variant':
                    validate_variant(conn, args, fp)
                elif ft == 'genotype':
                    validate_genotype(conn, args, fp)
                elif ft == 'kinship':
                    validate_kinship(conn, args, fp)
                elif ft == 'population_structure':
                    validate_population_structure(conn, args, fp)
                elif ft == 'phenotype_filename':
                    validate_phenotype(conn, args, fp)
                elif ft == 'gwas_run_filename':
                    validate_runs(conn, args, fp)
                elif ft == 'gwas_results_filename':
                    validate_results(conn, args, fp)
                else:
                    logging.debug(f"Calling validation on unknown file: {fp}")
                logging.info(f"VALIDATED '{fp}'")
    except:
        raise
    logging.info(f'All input files appear to be valid.')
def design(args):
    """Validates JSON configuration file and checks if experiement can be created by checking against database.

    For each growout in the configuration, the growout and its location are
    resolved against the database; unknown locations are created interactively
    via iu.create_location().

    :param args: ArgumentParser namespace (must provide .config and .debug)
    :raises OSError: if the configuration file cannot be read
    """
    cwd = os.getcwd()
    try:
        with open(args.config, 'r', encoding='utf-8') as configfp:
            config = json.load(configfp)
    except:
        raise
    conn = connect(args)
    if args.debug:
        pprint(conn)
    # Species & Population
    # Check if the species is already in the database
    species = find.find_species(conn, config["species"]["shortname"])
    if args.debug:
        print(f"Species: {config['species']['shortname']} -> {species}")
    # Growouts
    for go in config["growout"]:
        go["id"] = find.find_growout(conn, args, go["name"])
        if "code" in go["location"]:
            location_id = find.find_location(conn, args,
                                             go["location"]["code"])
            if location_id is not None:
                # Location exists; record its ID and go to the next growout.
                # BUG FIX: previously `continue` skipped setting the ID.
                go["location"]["id"] = location_id
                continue
            # Code provided but not in database: create a new entry.
            # BUG FIX: the original created the location twice (once in the
            # `else` branch and again in a dead `if location_id is None`
            # check that could hit an unbound `location_id`).
            iu.create_location(go["location"]["code"], args)
        else:
            # Code not provided: keep asking until a unique, non-blank code
            # is given, then record it and create the location.
            # BUG FIX: the original loop discarded the collected `code`.
            while True:
                code = input("Define unique location code: ").strip()
                if not code:
                    print(
                        "Location code cannot be blank. Please enter an alphanumeric string."
                    )
                    continue
                # BUG FIX: find_location was called without `args`, unlike
                # every other call site in this file.
                elif find.find_location(conn, args, code) is not None:
                    print(
                        f"Location '{code}' is already in use. Please enter a different code."
                    )
                    continue
                else:
                    break
            go["location"]["code"] = code
            iu.create_location(code, args)
        go["location"]["id"] = find.find_location(conn, args,
                                                  go["location"]["code"])
        pprint(go)
Estimated total time: """

import pandas as pd
import numpy as np
import psycopg2
import csv
# NOTE(review): the bare `import insert` below is immediately shadowed by
# `from importation.util import find, insert`; confirm whether the top-level
# `insert` module is ever intended.
import insert
from importation.util import find, insert
from importation.util.dbconnect import config, connect
from importation.util.models import species, population, line, chromosome, variant, genotype, trait, phenotype, growout_type, growout, location, gwas_algorithm, genotype_version, imputation_method, kinship_algorithm, kinship, population_structure_algorithm, population_structure, gwas_run, gwas_result

if __name__ == '__main__':
    # Seed the database with hard-coded values for independent tables/objects.
    conn = connect()
    # ADD HARD-CODED VALUES FOR INDEPENDENT TABLES/OBJECTS
    # ADD LOCATIONS
    # NOTE(review): argument order appears to be (country, state, city, code)
    # based on the first entry — confirm against the location model.
    locations = []
    locations.append(
        location("United States", "Indiana", "West Lafayette", "PU"))
    locations.append(location("United States", "New York", None, "NY"))
    locations.append(location("United States", "Florida", None, "FL"))
    locations.append(location("United States", "Puerto Rico", None, "PR"))
    locations.append(location("United States", "North Carolina", None, "NC"))
    locations.append(location("South Africa", None, None, "SA"))
    locations.append(location("United States", "Missouri", None, "MO"))
    for place in locations:
        insert.insert_location(conn, place)
def process(args):
    """Imports a Setaria dataset described by a JSON configuration file.

    Many items are placeholder values.

    Stages:
      1. Connect to the database.
      2. Validate the JSON configuration (required keys, non-empty values).
      3. Verify that every referenced input file exists and is well-formed.
      4. Insert the experiment design (species, population, chromosomes,
         lines, genotype version), traits, phenotypes, genotypes, variants,
         kinship, population structure, and GWAS runs/results.

    :param args: parsed command-line arguments; must provide .filename
                 (configuration file), .working_directory, and database
                 connection settings
    :raises FileNotFoundError: if the configuration or any input file is missing
    :raises KeyError: if required configuration keys are missing or empty
    """
    # =======================================
    # ========= Database Connection =========
    # =======================================
    try:
        conn = connect(args)
    except:
        raise

    # Input file preprocessing and validation
    try:
        if not os.path.isfile(args.filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    args.filename)
        else:
            with open(args.filename) as f:
                dp = json.load(f)  # data parameters

        # Verify that all necessary values were provided, assuming a
        # complete dataset.
        # NOTE: 'kinship_algortihm_name' is misspelled, but it is the key
        # already used by existing configuration files -- do not "fix" it
        # without migrating the configs.
        expected_fields = [
            "species_shortname", "species_binomial_name",
            "species_subspecies", "species_variety", "population_name",
            "number_of_chromosomes", "genotype_version_assembly_name",
            "genotype_version_annotation_name", "reference_genome_line_name",
            "gwas_algorithm_name", "imputation_method_name",
            "kinship_algortihm_name", "population_structure_algorithm_name",
            "kinship_filename", "population_structure_filename",
            "gwas_run_filename", "gwas_results_filename",
            "missing_SNP_cutoff_value", "missing_line_cutoff_value",
            "minor_allele_frequency_cutoff_value", "phenotype_filename"
        ]
        missing_keys = [k for k in expected_fields if k not in dp]
        if missing_keys:
            raise KeyError(
                f'The following keys are required. Please include them in your JSON configuration: {missing_keys}'
            )

        # Check for all required fields (subspecies and variety may be empty)
        required_fields = [
            "species_shortname", "species_binomial_name", "population_name",
            "number_of_chromosomes", "genotype_version_assembly_name",
            "genotype_version_annotation_name", "reference_genome_line_name",
            "gwas_algorithm_name", "imputation_method_name",
            "kinship_algortihm_name", "population_structure_algorithm_name",
            "kinship_filename", "population_structure_filename",
            "gwas_run_filename", "gwas_results_filename",
            "missing_SNP_cutoff_value", "missing_line_cutoff_value",
            "minor_allele_frequency_cutoff_value", "phenotype_filename"
        ]
        empty_fields = [rf for rf in required_fields if not dp[rf]]
        if empty_fields:
            raise KeyError(
                f'The following keys must be defined. Empty strings are not permitted. Please modify your JSON configuration: {empty_fields}'
            )
        logging.info(
            'Configuration file is valid. Verifying that all files exist.')

        # Track all the files to check for existence. Each descriptor dict
        # carries the 'cwd' and 'filename' keys that this template expects.
        locations = []
        filepath_template = Template('${cwd}/${filename}')

        # Verify that all files exist
        # Lines
        lines_filename = Template('${chr}_${shortname}.012.indv')
        # Genotype
        genotype_filename = Template('${chr}_${shortname}.012')
        # Variants
        variants_filename = Template('${chr}_${shortname}.012.pos')
        for c in range(1, dp['number_of_chromosomes'] + 1):
            chr_shortname = 'chr' + str(c)
            lines_filepath = lines_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))
            genotype_filepath = genotype_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))
            variants_filepath = variants_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='line',
                     filename=lines_filepath))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='genotype',
                     filename=genotype_filepath))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='variant',
                     filename=variants_filepath))

        # Go through all the single files that are not named based off of
        # a chromosome
        locations.append(
            dict(cwd=args.working_directory,
                 filetype='kinship',
                 filename=dp['kinship_filename']))
        locations.append(
            dict(cwd=args.working_directory,
                 filetype='population_structure',
                 filename=dp['population_structure_filename']))

        # Since there can be more than one file for the phenotypes, results,
        # and runs, add every entry of each list-valued configuration value
        # to the list of paths to verify as existing
        for configuration_entry in dp:
            if isinstance(dp[configuration_entry], list):
                for filename in dp[configuration_entry]:
                    locations.append(
                        dict(cwd=args.working_directory,
                             filetype=configuration_entry,
                             filename=filename))
            else:
                # For any of the entries that CAN be a list, add their
                # single values to the file list
                if configuration_entry in [
                        'phenotype_filename', 'gwas_run_filename',
                        'gwas_results_filename'
                ]:
                    locations.append(
                        dict(cwd=args.working_directory,
                             filetype=configuration_entry,
                             filename=dp[configuration_entry]))

        logging.debug("File locations\n======================")
        logging.debug(pformat(locations))
        for file_descriptor in locations:
            file_path = filepath_template.substitute(file_descriptor)
            if not os.path.isfile(file_path):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), file_path)
        logging.info('Found all files. Validating file contents.')

        # Validate the contents of each file according to its type
        for file_descriptor in locations:
            ft = file_descriptor['filetype']
            fp = filepath_template.substitute(file_descriptor)
            if ft == 'line':
                validate_line(conn, args, fp)
            elif ft == 'variant':
                validate_variant(conn, args, fp)
            elif ft == 'genotype':
                validate_genotype(conn, args, fp)
            elif ft == 'kinship':
                validate_kinship(conn, args, fp)
            elif ft == 'population_structure':
                validate_population_structure(conn, args, fp)
            elif ft == 'phenotype_filename':
                validate_phenotype(conn, args, fp)
            elif ft == 'gwas_run_filename':
                validate_runs(conn, args, fp)
            elif ft == 'gwas_results_filename':
                validate_results(conn, args, fp)
            else:
                logging.debug(f"Calling validation on unknown file: {fp}")
    except:
        raise

    logging.info('Input files appear to be valid. Proceeding with import.')

    # =======================================
    # ========== Experiment Design ==========
    # =======================================
    # What the database needs in order to create an 'experiment':
    # species, population, chromosomes, lines, genotype version,
    # growout (type & location), and the traits to measure.

    # Expected User Input
    # Species
    species_shortname = dp['species_shortname']
    species_binomial = dp['species_binomial_name']
    species_subspecies = dp['species_subspecies']
    species_variety = dp['species_variety']
    # Population
    population_name = dp['population_name']
    # Chromosome
    chromosome_count = dp['number_of_chromosomes']
    # Line
    # NOTE(tparker): Can use any chromosome, as they are the same for each.
    # In the future, the extraneous copies of the lines may be removed and
    # there will be one specific line file, much like the phenotype files
    lines_filename = Template('${cwd}/${chr}_${shortname}.012.indv')
    # Genotype Version
    # NOTE(tparker): This is possibly just the info about the reference
    # genome. It is likely included with the VCF genotype file (.012).
    genotype_version_assembly_name = dp['genotype_version_assembly_name']
    genotype_version_annotation_name = dp['genotype_version_annotation_name']
    reference_genome_line_name = dp['reference_genome_line_name']
    # Growout, Type, and Location
    # NOTE(tparker): Unknown at this time

    # Traits
    # Allow for more than one phenotype file
    if isinstance(dp["phenotype_filename"], list):
        phenotype_filenames = [
            f'{args.working_directory}/{filename}'
            for filename in dp['phenotype_filename']
        ]
    else:
        phenotype_filenames = [
            f'{args.working_directory}/{dp["phenotype_filename"]}'
        ]

    # Model Construction & Insertion
    # Species
    s = species(species_shortname, species_binomial, species_subspecies,
                species_variety)
    species_id = insert.insert_species(conn, args, s)
    logging.debug(f'[Insert]\tSpecies ID\t{species_id}, {s}')
    # Population
    p = population(population_name, species_id)
    population_id = insert.insert_population(conn, args, p)
    logging.debug(f'[Insert]\tPopulation ID\t{population_id}: {p}')
    # Chromosome
    chromosome_ids = insert.insert_all_chromosomes_for_species(
        conn, args, chromosome_count, species_id)
    logging.debug(f'[Insert]\tChromosome IDs\t{chromosome_ids}')
    # Line
    # Hard-coded 'chr1' substitution until just one file is used for lines
    working_filepath = lines_filename.substitute(
        dict(chr="chr1",
             cwd=f"{args.working_directory}",
             shortname=species_shortname))
    try:
        if not os.path.isfile(working_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    working_filepath)
    except:
        raise
    line_ids = insert.insert_lines_from_file(conn, args, working_filepath,
                                             population_id)
    logging.debug(f'[Insert]\tLine IDs\t{line_ids}')
    # Genotype Version
    reference_genome_id = find.find_line(conn, args,
                                         reference_genome_line_name,
                                         population_id)
    logging.debug(
        f'[Insert]\tReference Genome ID\t{reference_genome_id}, ({reference_genome_line_name}, {population_id})'
    )
    gv = genotype_version(genotype_version_name=genotype_version_assembly_name,
                          genotype_version=genotype_version_annotation_name,
                          reference_genome=reference_genome_id,
                          genotype_version_population=population_id)
    genotype_version_id = insert.insert_genotype_version(conn, args, gv)
    logging.debug(f'[Insert]\tGenome Version ID\t{genotype_version_id}')
    if genotype_version_id is None:
        raise Exception(
            f'Genotype version is None for parameters: {pformat(gv)}')

    # Traits
    # Go through all the phenotype files available for the dataset and
    # insert the recorded traits for each
    for phenotype_filepath in phenotype_filenames:
        try:
            if not os.path.isfile(phenotype_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        phenotype_filepath)
        except:
            raise
        # Header row of the phenotype CSV holds the trait names; the first
        # column (index) holds the line names
        traits = list(pd.read_csv(phenotype_filepath, index_col=0))
        trait_ids = insert.insert_traits_from_traitlist(
            conn, args, traits, phenotype_filepath)
        logging.debug(
            f'[Insert]\tTrait IDs for {phenotype_filepath}\t{trait_ids}')

    # =====================================
    # ========== Pipeline Design ==========
    # =====================================
    # Expected User Input
    # GWAS Algorithm ("MLMM", "EMMAx", "GAPIT", "FarmCPU", ...)
    gwas_algorithm_name = dp['gwas_algorithm_name']
    # Imputation Method (e.g. "impute to major allele")
    imputation_method_name = dp['imputation_method_name']
    # Kinship Algorithm (e.g. "loiselle"); key is misspelled in the schema
    kinship_algorithm_name = dp['kinship_algortihm_name']
    # Population Structure Algorithm (e.g. "Eigenstrat")
    population_structure_algorithm_name = dp[
        'population_structure_algorithm_name']

    # Model Construction & Insertion
    # GWAS Algorithm
    ga = gwas_algorithm(gwas_algorithm_name)
    gwas_algorithm_id = insert.insert_gwas_algorithm(conn, args, ga)
    # Imputation Method
    im = imputation_method(imputation_method_name)
    imputation_method_id = insert.insert_imputation_method(conn, args, im)
    # Kinship Algorithm
    ka = kinship_algorithm(kinship_algorithm_name)
    kinship_algorithm_id = insert.insert_kinship_algorithm(conn, args, ka)
    # Population Structure Algorithm
    psa = population_structure_algorithm(population_structure_algorithm_name)
    population_structure_algorithm_id = insert.insert_population_structure_algorithm(
        conn, args, psa)

    # ===========================================
    # ========== Experiment Collection ==========
    # ===========================================
    # Phenotype files were resolved earlier; genotypes and variants come
    # from the VCF-derived .012 file family
    # Genotype
    genotype_filename = Template('${cwd}/${chr}_${shortname}.012')
    # Variants
    variants_filename = Template('${cwd}/${chr}_${shortname}.012.pos')

    # Model Construction & Insertion
    # Phenotype
    for phenotype_filepath in phenotype_filenames:
        try:
            if not os.path.isfile(phenotype_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        phenotype_filepath)
        except:
            raise
        phenotype_ids = insert.insert_phenotypes_from_file(
            conn, args, phenotype_filepath, population_id, phenotype_filepath)
        logging.debug(
            f'[Insert]\tPhenotype IDs for {phenotype_filepath}\t{phenotype_ids}'
        )

    # Genotype
    for c in range(1, chromosome_count + 1):
        chromosome_shortname = 'chr' + str(c)
        chromosome_id = find.find_chromosome(conn, args, chromosome_shortname,
                                             species_id)
        geno_filename = genotype_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        line_filename = lines_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        try:
            if not os.path.isfile(geno_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        geno_filename)
            if not os.path.isfile(line_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        line_filename)
        except:
            raise
        genotype_ids = insert.insert_genotypes_from_file(
            conn=conn,
            args=args,
            genotypeFile=geno_filename,
            lineFile=line_filename,
            chromosomeID=chromosome_id,
            populationID=population_id,
            genotype_versionID=genotype_version_id)

    # Variants
    for c in range(1, chromosome_count + 1):
        chromosome_shortname = 'chr' + str(c)
        chromosome_id = find.find_chromosome(conn, args, chromosome_shortname,
                                             species_id)
        variant_filename = variants_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        try:
            if not os.path.isfile(variant_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        variant_filename)
        except:
            raise
        # NOTE(tparker): Changed variant insertion to the async version
        insert.insert_variants_from_file_async(conn, args, variant_filename,
                                               species_id, chromosome_id)

    # =========================================
    # ========== Pipeline Collection ==========
    # =========================================
    # Kinship & Population Structure
    # NOTE(tparker): Currently the database just stores the filename. There
    # is no service to upload the file to the database's host, so there is
    # no single location to find the file. It may be better to store the
    # data in the database and let users export it as CSV themselves.
    kinship_filepath = f'{args.working_directory}/{dp["kinship_filename"]}'
    population_structure_filepath = f'{args.working_directory}/{dp["population_structure_filename"]}'

    # Model Construction & Insertion
    try:
        if not os.path.isfile(kinship_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    kinship_filepath)
        if not os.path.isfile(population_structure_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    population_structure_filepath)
    except:
        raise
    # Kinship
    k = kinship(kinship_algorithm_id, kinship_filepath)
    kinship_id = insert.insert_kinship(conn, args, k)
    # Population Structure
    ps = population_structure(population_structure_algorithm_id,
                              population_structure_filepath)
    population_structure_id = insert.insert_population_structure(
        conn, args, ps)

    # =============================================
    # ================== Results ==================
    # =============================================
    # GWAS Run & Results
    # Allow for more than one gwas results/run file
    if isinstance(dp['gwas_results_filename'], list):
        gwas_filenames = [
            f'{args.working_directory}/{filename}'
            for filename in dp['gwas_results_filename']
        ]
    else:
        gwas_filenames = [
            f'{args.working_directory}/{dp["gwas_results_filename"]}'
        ]
    # NOTE(tparker): 0.2, 0.2, and 0.1 were the values used for the
    # Maize282 import; double check what the true values should be for
    # each dataset.
    missing_snp_cutoff_value = dp['missing_SNP_cutoff_value']
    missing_line_cutoff_value = dp['missing_line_cutoff_value']
    minor_allele_frequency_cutoff_value = dp[
        'minor_allele_frequency_cutoff_value']

    # Model Construction & Insertion
    for gwas_filename in gwas_filenames:
        try:
            if not os.path.isfile(gwas_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        gwas_filename)
        except:
            raise
        # GWAS Run
        # NOTE(tparker): Check what imputation method was actually used;
        # imputation appears to have been done beforehand.
        imputation_method_id = find.find_imputation_method(
            conn, args, imputation_method_name)
        gwas_run_ids = insert.insert_gwas_runs_from_gwas_results_file(
            conn, args, gwas_filename, gwas_algorithm_id, genotype_version_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            minor_allele_frequency_cutoff_value, imputation_method_id,
            kinship_id, population_structure_id)
        # GWAS Results
        gwas_result_ids = insert.insert_gwas_results_from_file(
            conn=conn,
            args=args,
            speciesID=species_id,
            gwas_results_file=gwas_filename,
            gwas_algorithm_ID=gwas_algorithm_id,
            missing_snp_cutoff_value=missing_snp_cutoff_value,
            missing_line_cutoff_value=missing_line_cutoff_value,
            imputationMethodID=imputation_method_id,
            genotypeVersionID=genotype_version_id,
            kinshipID=kinship_id,
            populationStructureID=population_structure_id,
            minor_allele_frequency_cutoff_value=
            minor_allele_frequency_cutoff_value)
def process(args):
    """Imports hard-coded values for the Maize282 dataset.

    Legacy seeding script: every identifier (species, population, lines,
    growouts, algorithms, file paths) is hard-coded rather than read from
    a configuration file.

    :param args: parsed command-line arguments (currently unused; the
                 database connection is read from the default config)
    """
    try:
        conn = connect()
    except:
        raise

    # Expected User Input (informational; the literals are also repeated
    # inline where used below)
    # Species
    species_shortname = 'maize'
    species_binomial = 'Zea mays'
    species_subspecies = None
    species_variety = None
    # Population
    population_name = 'Maize282'
    # Chromosome
    chromosome_count = 10
    # Line
    lines_filename = Template('${cwd}/${chr}_${shortname}.012.indv')
    # Genotype Version
    genotype_version_assembly_name = 'B73 RefGen_v4'
    # NOTE(tparker): Not sure where to find this info or who names it
    genotype_version_annotation_name = 'AGPv4'
    reference_genome_line_name = 'REF_REF_REF_REF'  # Placeholder

    # ADD A HARD-CODED SPECIES TO DB
    s = species(species_shortname, species_binomial, species_subspecies,
                species_variety)
    # FIX: previously passed the undefined name 'mySpecies'; the species
    # object constructed above is the intended argument
    insertedSpeciesID = insert.insert_species(conn, s)
    print("[ INSERT ]\t(%s)\t%s" % (insertedSpeciesID, str(s)))
    maizeSpeciesID = find.find_species(conn, 'maize')
    print("[ FIND ]\t(%s)\t%s" % (maizeSpeciesID, '< species: maize >'))

    # ADD A HARD-CODED POPULATION TO DB USING insert_population()
    myPopulation = population('Maize282', maizeSpeciesID)
    insertedPopulationID = insert.insert_population(conn, myPopulation)
    print("[ INSERT ]\t(%s)\t%s" % (insertedPopulationID, str(myPopulation)))
    # FIX: maize282popID was used throughout this function without ever
    # being assigned.
    # NOTE(review): assumed to mirror find_species -- confirm the
    # find.find_population signature before running.
    maize282popID = find.find_population(conn, 'Maize282')
    print("[ FIND ]\t(%s)\t%s" % (maize282popID, '< population: Maize282 >'))

    # ADD A HARD-CODED LINE TO DB USING insert_line()
    myLine = line(line_name='282set_B73', line_population=maize282popID)
    insertedLineID = insert.insert_line(conn, myLine)
    print("[ INSERT ]\t(%s)\t%s" % (insertedLineID, str(myLine)))
    B73lineID = find.find_line(conn, '282set_B73', maize282popID)
    print("[ FIND ]\t(%s)\t%s" % (B73lineID, '< line: Maize282 >'))

    # ADD NEW HARD-CODED GENOTYPE_VERSION TO DB
    myGenotypeVersion = genotype_version(
        genotype_version_name='B73 RefGen_v4_AGPv4_Maize282',
        genotype_version=315,
        reference_genome=B73lineID,
        genotype_version_population=maize282popID)
    B73_agpv4_maize282_versionID = insert.insert_genotype_version(
        conn, myGenotypeVersion)
    print("[ INSERT ]\t(%s)\t%s" %
          (B73_agpv4_maize282_versionID, str(myGenotypeVersion)))

    # ADD ALL CHROMOSOMES FOR A SPECIES TO DB
    insertedChromosomeIDs = insert.insert_all_chromosomes_for_species(
        conn, 10, maizeSpeciesID)
    print("[ INSERT ]\t%s\t%s" %
          (insertedChromosomeIDs, '\t10 (sID: %s)' % maizeSpeciesID))

    # GET LINES FROM SPECIFIED 012.indv FILE AND ADD TO DB
    insertedLineIDs = insert.insert_lines_from_file(
        conn, '../data/chr10_282_agpv4.012.indv', maize282popID)
    print("[ INSERT ]\t%s\t%s\t(pID: %s)" %
          (insertedLineIDs, '../data/chr10_282_agpv4.012.indv',
           maize282popID))

    # GET VARIANTS FROM .012.pos FILE AND ADD TO DB
    # The 'true' database on adriatic houses variants for ALL chromosomes,
    # so loop through each chromosome file and add them.
    # NOTE(timp): When this is generalized beyond Zea mays, the range must
    # become a variable because chromosome counts differ between species.
    for c in range(1, 11):
        chrShortname = 'chr' + str(c)
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        filename = '../data/%s_282_agpv4.012.pos' % chrShortname
        insertedVariantIDs = insert.insert_variants_from_file(
            conn, filename, maizeSpeciesID, chrId)

    # ADD ALL GENOTYPES FROM A ONE-CHROMOSOME .012 FILE TO DB
    # The .indv file lists all genotyped lines, 1:1 with the rows of the
    # matching .012 file (row 1 of .indv is the line for row 1 of .012).
    for c in range(1, 11):
        chrShortname = 'chr' + str(c)
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        genoFilename = '../data/%s_282_agpv4.012' % chrShortname
        linesFilename = '../data/%s_282_agpv4.012.indv' % chrShortname
        insertedGenotypeIDs = insert.insert_genotypes_from_file(
            conn, genoFilename, linesFilename, chrId, maize282popID,
            B73lineID)

    # PARSE TRAITS FROM PHENOTYPE FILE AND ADD TO DB
    # Header row holds the trait names; the index column holds line names
    phenotypeRawData = pd.read_csv(
        '../data/5.mergedWeightNorm.LM.rankAvg.longFormat.csv', index_col=0)
    traits = list(phenotypeRawData)
    insertedTraitIDs = insert.insert_traits_from_traitlist(conn, traits)

    # PARSE PHENOTYPES FROM FILE AND ADD TO DB
    # Line-for-line listing of all the traits by year; this WILL be changed
    # out for phenotype (.ph) files instead
    insertedPhenoIDs = insert.insert_phenotypes_from_file(
        conn, '../data/5.mergedWeightNorm.LM.rankAvg.longFormat.csv',
        maize282popID)

    # ADD NEW HARD-CODED GROWOUT_TYPEs TO DB
    greenhouse_GrowoutType = growout_type("greenhouse")
    greenhouse_GrowoutTypeID = insert.insert_growout_type(
        conn, greenhouse_GrowoutType)
    phenotyper_GrowoutType = growout_type("phenotyper")
    phenotyper_GrowoutTypeID = insert.insert_growout_type(
        conn, phenotyper_GrowoutType)
    field_GrowoutType = growout_type("field")
    field_GrowoutTypeID = insert.insert_growout_type(conn, field_GrowoutType)

    # LOOK UP ID OF A HARD-CODED GROWOUT_TYPE
    fieldGrowoutTypeID = find.find_growout_type(conn, 'field')
    print("[ FIND ]\t(%s)\t%s" %
          (fieldGrowoutTypeID, '< growout_type: field >'))

    # FIX: the location IDs referenced by the growouts below were never
    # defined; look them up by the location codes seeded elsewhere in this
    # module.
    # NOTE(review): confirm the find.find_location signature before running.
    PUlocID = find.find_location(conn, 'PU')
    NYlocID = find.find_location(conn, 'NY')
    FLlocID = find.find_location(conn, 'FL')
    PRlocID = find.find_location(conn, 'PR')
    NClocID = find.find_location(conn, 'NC')
    SAlocID = find.find_location(conn, 'SA')
    MOlocID = find.find_location(conn, 'MO')

    # ADD NEW HARD-CODED GROWOUTS TO DB
    growouts = []
    growouts.append(
        growout("PU09", maize282popID, PUlocID, 2009, fieldGrowoutTypeID))
    growouts.append(
        growout("NY06", maize282popID, NYlocID, 2006, fieldGrowoutTypeID))
    growouts.append(
        growout("NY10", maize282popID, NYlocID, 2010, fieldGrowoutTypeID))
    growouts.append(
        growout("FL06", maize282popID, FLlocID, 2006, fieldGrowoutTypeID))
    growouts.append(
        growout("PR06", maize282popID, PRlocID, 2006, fieldGrowoutTypeID))
    growouts.append(
        growout("NC06", maize282popID, NClocID, 2006, fieldGrowoutTypeID))
    growouts.append(
        growout("PU10", maize282popID, PUlocID, 2010, fieldGrowoutTypeID))
    growouts.append(
        growout("SA06", maize282popID, SAlocID, 2006, fieldGrowoutTypeID))
    growouts.append(
        growout("MO06", maize282popID, MOlocID, 2006, fieldGrowoutTypeID))
    insertedGrowoutIDs = []
    # FIX: loop variable renamed so it no longer shadows the growout model
    for g in growouts:
        print("-------------\t%s" % str(g))
        insertedGrowoutIDs.append(insert.insert_growout(conn, g))
    # FIX: previously printed insertedGenotypeIDs here
    print("[ INSERT ]\t%s\t(new growout)" % (insertedGrowoutIDs))

    # ADD NEW HARD-CODED GWAS_ALGORITHMs TO DB
    gwasAlgorithms = []
    gwasAlgorithms.append(gwas_algorithm("MLMM"))
    gwasAlgorithms.append(gwas_algorithm("EMMAx"))
    gwasAlgorithms.append(gwas_algorithm("GAPIT"))
    gwasAlgorithms.append(gwas_algorithm("FarmCPU"))
    newGWASalgorithmIDs = []
    for algorithm in gwasAlgorithms:
        newGWASalgorithmIDs.append(
            insert.insert_gwas_algorithm(conn, algorithm))
    print("[ INSERT ]\t%s\t(new gwas algorithm IDs)" % (newGWASalgorithmIDs))
    newGWASalgorithm = find.find_gwas_algorithm(conn, 'MLMM')

    # ADD NEW HARD-CODED IMPUTATION_METHODs TO DB
    newImputationMethods = []
    newImputationMethods.append(imputation_method("impute to major allele"))
    newImputationMethods.append(imputation_method("impute to minor allele"))
    newImputationMethods.append(imputation_method("impute to average allele"))
    newImputationMethods.append(imputation_method("IMPUTE"))
    newImputationMethods.append(imputation_method("BEAGLE"))
    for im in newImputationMethods:
        insert.insert_imputation_method(conn, im)

    # ADD NEW HARD-CODED KINSHIP_ALGORITHMs TO DB
    kinshipAlgorithms = []
    kinshipAlgorithms.append(kinship_algorithm("loiselle"))
    kinshipAlgorithms.append(kinship_algorithm("van raden"))
    kinshipAlgorithms.append(kinship_algorithm("Synbreed_realizedAB"))
    newKinshipAlgorithmIDs = []
    for algorithm in kinshipAlgorithms:
        newKinshipAlgorithmIDs.append(
            insert.insert_kinship_algorithm(conn, algorithm))
    print("[ INSERT ]\t%s\t(new kinship algorithm IDs)" %
          (newKinshipAlgorithmIDs))

    # LOOK UP ID OF A HARD-CODED KINSHIP_ALGORITHM
    VanRadenID = find.find_kinship_algorithm(conn, "van raden")
    print("Van Raden kinship alg ID:")
    print(VanRadenID)

    # ADD NEW HARD-CODED KINSHIP TO DB
    newKinship = kinship(VanRadenID,
                         "../data/4.AstleBalding.synbreed.kinship.csv")
    newKinshipID = insert.insert_kinship(conn, newKinship)
    print("New kinship ID:")
    print(newKinshipID)

    # ADD NEW HARD-CODED POPULATION_STRUCTURE_ALGORITHMs TO DB
    newPopulationStructures = []
    newPopulationStructures.append(
        population_structure_algorithm("Eigenstrat"))
    newPopulationStructures.append(population_structure_algorithm("STRUCTURE"))
    newPopulationStructures.append(
        population_structure_algorithm("FastSTRUCTURE"))
    for ps in newPopulationStructures:
        insert.insert_population_structure_algorithm(conn, ps)

    # LOOK UP ID OF A HARD-CODED POPULATION_STRUCTURE_ALGORITHM
    EigenstratID = find.find_population_structure_algorithm(conn, "Eigenstrat")
    print("Eigenstrat pop str alg ID:")
    print(EigenstratID)

    # ADD NEW HARD-CODED POPULATION_STRUCTURE TO DB
    # Input file has one more column than the number of PCs in its filename
    newPopulationStructure = population_structure(
        EigenstratID, "../data/4.Eigenstrat.population.structure.10PCs.csv")
    newPopulationStructureID = insert.insert_population_structure(
        conn, newPopulationStructure)
    print("New population structure ID:")
    print(newPopulationStructureID)

    # LOOK UP ID OF A HARD-CODED GWAS_ALGORITHM
    MLMMalgorithmID = find.find_gwas_algorithm(conn, "MLMM")
    print("MLMM algorithm ID:")
    print(MLMMalgorithmID)

    # LOOK UP ID OF A HARD-CODED GENOTYPE_VERSION
    B73_agpv4_maize282_versionID = find.find_genotype_version(
        conn, "B73 RefGen_v4_AGPv4_Maize282")
    print("B73 agpv4 maize282 genotype version: ")
    print(B73_agpv4_maize282_versionID)

    # LOOK UP ID OF A HARD-CODED IMPUTATION_METHOD
    majorAlleleImputationID = find.find_imputation_method(
        conn, "impute to major allele")
    print("major allele imputation ID: ")
    print(majorAlleleImputationID)

    # LOOK UP ID OF A HARD-CODED KINSHIP
    # NOTE(timp): Could not find this file, but found an R data file (.rda)
    # that may contain the information (possibly not in the right format);
    # the temporary file is the one with 'export' in its name.
    kinshipID = find.find_kinship(
        conn, "../data/4.AstleBalding.synbreed.kinship.csv")
    print("kinshipID: ")
    print(kinshipID)

    # LOOK UP ID OF A HARD-CODED POPULATION_STRUCTURE
    populationStructureID = find.find_population_structure(
        conn, "../data/4.Eigenstrat.population.structure.10PCs.csv")
    print("population structure ID: ")
    print(populationStructureID)

    # PARSE GWAS_RUNS FROM FILE AND ADD TO DB
    # NOTE(timp): Could not find file or possible equivalent
    insertedGwasRunIDs = insert.insert_gwas_runs_from_gwas_results_file(
        conn, '../data/9.mlmmResults.csv', MLMMalgorithmID,
        B73_agpv4_maize282_versionID, 0.2, 0.2, 0.1, majorAlleleImputationID,
        kinshipID, populationStructureID)
    print("Inserted gwas_run IDs:")
    print(insertedGwasRunIDs)

    # PARSE GWAS_RESULTS FROM FILE AND ADD TO DB
    # NOTE(timp): Could not find file or possible equivalent
    insertedGwasResultIDs = insert.insert_gwas_results_from_file(
        conn, maizeSpeciesID, '../data/9.mlmmResults.csv', MLMMalgorithmID,
        0.2, 0.2, majorAlleleImputationID, B73_agpv4_maize282_versionID,
        kinshipID, populationStructureID, 0.1)
    print("Inserted gwas result IDs: ")
    print(insertedGwasResultIDs)
def truncate(args):
    """Reset the QA server by truncating all tables.

    Tables affected:
        growout_type, line, location, growout, variant, genotype_version,
        genotype, kinship_algorithm, phenotype, trait,
        population_structure_algorithm, gwas_algorithm, kinship,
        population_structure, gwas_run, gwas_result, tissue, chromosome,
        species, population, imputation_method

    :param args: parsed command-line arguments with database connection
                 settings
    """

    # Renamed from 'exec' so the helper no longer shadows the builtin
    def _truncate_table(conn, args, stmt):
        """Execute one TRUNCATE statement and commit it.

        Args:
            conn (psycopg2.extensions.connection): psycopg2 connection
            args (ArgumentParser namespace): user-defined arguments
            stmt (str): truncation command
        """
        cur = conn.cursor()
        try:
            cur.execute(stmt)
        except:
            raise
        finally:
            # NOTE(review): commit runs even when execute raised -- this
            # preserves the original behavior; confirm it is intentional.
            conn.commit()
            cur.close()

    conn = connect(args)
    tables = [
        'growout_type', 'line', 'location', 'growout', 'variant',
        'genotype_version', 'genotype', 'kinship_algorithm', 'phenotype',
        'trait', 'population_structure_algorithm', 'gwas_algorithm',
        'kinship', 'population_structure', 'gwas_run', 'gwas_result',
        'tissue', 'chromosome', 'species', 'population', 'imputation_method'
    ]
    # Table names cannot be passed as parameters to a prepared statement,
    # so each truncate statement is built by string formatting. The names
    # come from the hard-coded allowlist above, so no injection risk.
    sql_stmts = [f"TRUNCATE TABLE {t} CASCADE" for t in tables]
    logging.debug(sql_stmts)
    for stmt in sql_stmts:
        logging.info(f"Executing: {stmt}")
        _truncate_table(conn, args, stmt)