def __create_phenotype_worksheet(self, phenotype_worksheet, row_num, col_num,
                                 phenotypeUnitMap, phenotypeFieldList,
                                 phenotypeFieldMap):
    phenotype_worksheet.write_row(0, 0, tuple(PHN_CONST.PHENOTYPE_HEADERS))

    # Map each phenotype field to the unit-map key it matches case-insensitively
    # (a Levenshtein distance below 1 means the lower-cased names are identical).
    local_field_map = {}
    for field in phenotypeFieldList:
        for key in phenotypeUnitMap:
            distance = self.__levenshtein(field.lower(), key.lower())
            if distance < 1:
                local_field_map[field] = key

    for field in phenotypeFieldList:
        try:
            value_list = phenotypeFieldMap[field.lower()]
            if field in phenotypeUnitMap:
                unit_of_measure = phenotypeUnitMap[field]
            elif field in local_field_map:
                key = local_field_map[field]
                unit_of_measure = phenotypeUnitMap[key]
            else:
                unit_of_measure = 'NA'
            # min()/max() raise ValueError when every value is 'NA'; the
            # surrounding try/except silently skips such fields.
            seq_list = [x for x in value_list if x != 'NA']
            phenotype_row_data = [field.lower(), unit_of_measure, '',
                                  min(seq_list), max(seq_list)]
            phenotype_worksheet.write_row(row_num, col_num,
                                          tuple(phenotype_row_data))
            row_num += 1
        except ValueError:
            pass
    console.info('* Added phenotype data')
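# __create_phenotype_worksheet() calls a private __levenshtein helper that is not
# shown in this section. A minimal sketch of a standard dynamic-programming edit
# distance is given below for reference; the name and signature simply mirror how
# it is called above, and the project's actual helper may differ.
def __levenshtein(self, s1, s2):
    # Two-row DP: previous[j] holds the edit distance between s1[:i-1] and s2[:j].
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]
        for j, c2 in enumerate(s2, start=1):
            cost = 0 if c1 == c2 else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]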
def fetch_sorghum_germplasm_taxa():
    console.info('Reading Taxa germplasm data for Sorghum BAP')
    with open(DIR_CONST.SORGHUM_GERM_TAXA) as file:
        reader = csv.DictReader(file, delimiter=',')
        for row in reader:
            germplasm_id = remove_underscores(row['Taxa'])
            GERMPLASM_DATA_LIST.append(germplasm_id)
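# remove_underscores() is defined elsewhere in this project. A hedged sketch,
# assuming it simply strips underscores from the raw Taxa identifier; the real
# helper may instead replace them with another separator.
def remove_underscores(value):
    # e.g. 'PI_12345' -> 'PI12345'
    return value.replace('_', '')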
def __create_experiment_worksheet(self, experiment_worksheet, row_num, col_num,
                                  experiment_list):
    sorted_experiment_list = sorted(set(experiment_list))
    experiment_worksheet.write_row(0, 0, tuple(PHN_CONST.EXPERIMENT_HEADERS))
    for experiment in sorted_experiment_list:
        experiment_name = experiment[0]
        year = experiment[1]
        location = experiment[2]
        experiment_row = [experiment_name, location, '', '', '', '', '', year]
        experiment_worksheet.write_row(row_num, col_num, tuple(experiment_row))
        row_num += 1
    console.info('* Added experiments data')
def __create_location_worksheet(self, location_worksheet, row_num, col_num,
                                location_list):
    sorted_location_list = sorted(set(location_list))
    location_worksheet.write_row(0, 0, tuple(PHN_CONST.LOCATION_HEADERS))
    for location in sorted_location_list:
        if location == 'NA':
            location_row = ['NA', 'NA', 'NA', '', '', '', '', '', '']
        else:
            location_row = [1, 'USA', location, '', '', '', '', '', '']
        location_worksheet.write_row(row_num, col_num, tuple(location_row))
        row_num += 1
    console.info('* Added locations data')
def create_germplasm_workbook(self, germplasm_data):
    start_time = time.time()
    row_num = 1
    col_num = 0
    germplasm_file = DIR_CONST.OUTPUT_DIR + '/' + self.__experimentName + '_germplasm.xlsx'
    workbook = xlsxwriter.Workbook(germplasm_file)

    # Create data sheets
    germplasm_worksheet = workbook.add_worksheet(GERM_CONST.GERMPLASM_SHEET)
    germplasm_worksheet.write_row(0, 0, tuple(GERM_CONST.GERMPLASM_HEADERS))
    for germ_row in germplasm_data:
        germplasm_worksheet.write_row(row_num, col_num, tuple(germ_row))
        row_num += 1
    console.info('* Added germplasm data')

    # xlsxwriter only flushes the workbook to disk on close(); without this the
    # output file is never written.
    workbook.close()

    elapsed_time = time.time() - start_time
    mlsec = repr(elapsed_time).split('.')[1][:3]
    print('\bGermplasm Data written to file in : ' +
          time.strftime("%H:%M:%S.{}".format(mlsec), time.gmtime(elapsed_time)))
def __create_phenotype_measure_worksheet(self, phenotype_measures_worksheet,
                                         row_num, col_num, data):
    phenotype_measures_worksheet.write_row(
        0, 0, tuple(PHN_CONST.PHENOTYPE_MEASURES_HEADERS))
    for row in data:
        phenotype_measures_worksheet.write_row(row_num, col_num, tuple(row))
        row_num += 1
    console.info('* Added phenotype measures data')
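# Hypothetical usage sketch (not part of the original module): the four private
# worksheet builders above follow the same pattern as create_germplasm_workbook,
# so a phenotype workbook could plausibly be assembled as below. The method name,
# sheet names, and exact call sites are assumptions for illustration only.
def create_phenotype_workbook(self, phenotype_data, experiment_list,
                              location_list, phenotype_unit_map,
                              phenotype_field_list, phenotype_field_map):
    phenotype_file = DIR_CONST.OUTPUT_DIR + '/' + self.__experimentName + '_phenotype.xlsx'
    workbook = xlsxwriter.Workbook(phenotype_file)

    # One sheet per template section; the sheet names here are placeholders.
    self.__create_experiment_worksheet(
        workbook.add_worksheet('Experiments'), 1, 0, experiment_list)
    self.__create_location_worksheet(
        workbook.add_worksheet('Locations'), 1, 0, location_list)
    self.__create_phenotype_worksheet(
        workbook.add_worksheet('Phenotypes'), 1, 0, phenotype_unit_map,
        phenotype_field_list, phenotype_field_map)
    self.__create_phenotype_measure_worksheet(
        workbook.add_worksheet('Measures'), 1, 0, phenotype_data)

    # xlsxwriter only writes the file on close().
    workbook.close()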
def read_file(file_name, delimiter, germplasm_cols, phenotype_cols, config,
              data_type=None):
    global GERMPLASM_DATA_LIST
    global MAIZE_DUP_PHENOTYPE
    with open(file_name, encoding="latin-1") as file:
        try:
            file_data = []
            reader = csv.DictReader(file, delimiter=delimiter)
            console.info('\nFile Name : ' + file_name)
            for row in reader:
                # Process location
                if config['compound_location'] == True:
                    # Location and year are looked up from a compound code.
                    location_col_name = config['location_column']
                    location_map = config['location_map']
                    compound_location = row[location_col_name]
                    location_name = location_map[compound_location]['name']
                    year = location_map[compound_location]['year']
                else:
                    location_col_name = config['location_col']
                    year_col_name = config['year_col']
                    if location_col_name != 'None' and year_col_name != 'None':
                        location_name = row[location_col_name]
                        year = row[year_col_name]
                    else:
                        location_name = 'NA'
                        year = '9999'

                # Build the experiment name; Sorghum files carry a date column
                # and an author column that both feed into the name.
                if data_type == 'SORGHUM' and year_col_name != 'None':
                    year = sorghum_date_process(row[year_col_name])
                    experiment_name = config['experiment_prefix'] + '_' + row['author'] + '_' + year
                else:
                    experiment_name = config['experiment_prefix'] + '_' + year

                # Process germplasm
                if config['compound_germplasm'] == True:
                    pop_col = germplasm_cols[0]
                    entry_col = germplasm_cols[1]
                    germplasm_id = ('Z' + row[pop_col].rjust(3, '0') +
                                    'E' + row[entry_col].rjust(4, '0'))
                else:
                    germplasm_col = germplasm_cols[0]
                    if data_type == 'SORGHUM_NAM':
                        germplasm_id = split_germplasm_sorghum_nam(row[germplasm_col])
                    else:
                        germplasm_id = row[germplasm_col]

                # Filter germplasm ids between z001 and z026
                if germplasm_check(germplasm_id, data_type):
                    # Process phenotype data
                    for column in phenotype_cols:
                        if 'column_flag' in config:
                            column_check = config['column_flag']
                            skip_column = row[column_check] == 'MISSING'
                        else:
                            skip_column = False

                        if (skip_column == False and row[column] != '#VALUE!'
                                and row[column] != '.'
                                and row[column] != '(null)'):
                            row_templ = [
                                experiment_name, '', location_name, '', '', '',
                                '', '', germplasm_id
                            ]
                            if config['compound_phenotype'] == True:
                                if ('maize_inflo7_rawdata.txt' in file_name
                                        or 'baptraits.csv' in file_name):
                                    phenotype_col_name = config['phenotype_field_name']
                                    phenotype_name = row[phenotype_col_name]
                                else:
                                    phenotype_name = config['phenotype_field_name']
                            else:
                                phenotype_name = column

                            try:
                                if date_pattern_match(row[column]):
                                    phenotype_value = parse(row[column]).strftime("%Y/%m/%d")
                                else:
                                    phenotype_value = float(row[column])
                            except ValueError:
                                phenotype_value = 'NA'

                            if data_type_check(data_type, year, phenotype_value):
                                if data_type == 'SOY':
                                    family_name = row['Family']
                                    family_num = row['FamNo']
                                    GERMPLASM_DATA_LIST.append(
                                        tuple([germplasm_id, family_name, family_num]))
                                elif data_type == 'SORGHUM_NAM':
                                    family_index = row['fam']
                                    GERMPLASM_DATA_LIST.append(
                                        tuple([germplasm_id, family_index]))
                                else:
                                    GERMPLASM_DATA_LIST.append(germplasm_id)

                                # Suffix phenotype names that repeat across files.
                                if (phenotype_name.lower() in MAIZE_DUP_PHENOTYPE
                                        and MAIZE_DUP_PHENOTYPE[phenotype_name.lower()] != 0):
                                    phenotype_name = (phenotype_name.lower() +
                                                      str(MAIZE_DUP_PHENOTYPE[phenotype_name.lower()]))
                                else:
                                    phenotype_name = phenotype_name.lower()

                                row_templ.append(phenotype_name)
                                row_templ.append(phenotype_value)
                                add_to_phenotype_field_list(phenotype_name, phenotype_value)
                                add_exp_loc_list(experiment_name, year, location_name)

                                # Note: 'and' binds tighter than 'or', so SORGHUM
                                # rows are kept even when the value is 'NA', while
                                # SORGHUM_SAP rows are kept only for real values.
                                if data_type == 'SORGHUM' or data_type == 'SORGHUM_SAP' and phenotype_value != 'NA':
                                    file_data.append(row_templ)
                                elif data_type != 'SORGHUM' and data_type != 'SORGHUM_SAP':
                                    file_data.append(row_templ)
            console._print('Processed %d records' % len(file_data))
        except (UnicodeError, KeyError) as e:
            console.error('File name: ' + file_name + ' MISSING :>>>> ' + str(e))
            pass
    for phenotype_col in phenotype_cols:
        if phenotype_col.lower() in MAIZE_DUP_PHENOTYPE:
            MAIZE_DUP_PHENOTYPE[phenotype_col.lower()] = int(
                MAIZE_DUP_PHENOTYPE[phenotype_col.lower()]) + 1
    return file_data
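# Illustrative config for read_file(), assembled only from the keys the function
# reads above ('compound_location', 'location_col', 'year_col', 'experiment_prefix',
# 'compound_germplasm', 'compound_phenotype', plus the optional 'location_column',
# 'location_map', 'phenotype_field_name', and 'column_flag'). The concrete values
# are made up for illustration and do not come from the project's real configs.
EXAMPLE_READ_FILE_CONFIG = {
    'compound_location': False,   # use plain location/year columns
    'location_col': 'Location',   # set to the string 'None' when the file has no location column
    'year_col': 'Year',           # set to the string 'None' when the file has no year column
    'experiment_prefix': 'maizeExample',
    'compound_germplasm': False,  # True would build 'Z###E####' ids from two columns
    'compound_phenotype': False,  # True would take the trait name from 'phenotype_field_name'
}

# Hypothetical call, assuming a tab-delimited file with one germplasm column:
# data = read_file('example_traits.txt', '\t', ['Genotype'],
#                  ['PlantHeight', 'DaysToAnthesis'], EXAMPLE_READ_FILE_CONFIG)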
def read_heredity_data(file_name, phenotype_cols):
    global GERMPLASM_DATA_LIST
    global MAIZE_DUP_PHENOTYPE
    with open(file_name, encoding="latin-1") as file:
        try:
            file_data = []
            reader = csv.DictReader(file, delimiter=',')
            console.info('\nFile Name : ' + file_name)
            for row in reader:
                location_col_name = 'env'
                compound_location = row[location_col_name]
                if len(compound_location) < 6:
                    compound_location = compound_location.rjust(6, '0')
                location_name = MAIZE_HEREDITY_LOC_CONF[compound_location]['name']
                year = MAIZE_HEREDITY_LOC_CONF[compound_location]['year']
                germplasm_id = ('Z' + row['pop'].rjust(3, '0') +
                                'E' + row['entry_num'].rjust(4, '0'))
                if year == 'NA':
                    experiment_name = 'maizeNAM_Hung_2012_Heterdity'
                else:
                    experiment_name = 'maizeNAM_Hung_2012_Heterdity_' + year

                # Filter germplasm ids between z001 and z026
                if germplasm_check(germplasm_id):
                    GERMPLASM_DATA_LIST.append(germplasm_id)
                    # Process phenotype data
                    for column in phenotype_cols:
                        skip_column = row['entry_id'] == 'MISSING'
                        if skip_column == False:
                            row_templ = [
                                experiment_name, '', location_name, '', '', '',
                                '', '', germplasm_id
                            ]
                            phenotype_name = column
                            try:
                                phenotype_value = float(row[column])
                            except ValueError:
                                phenotype_value = 'NA'

                            # Suffix phenotype names that repeat across files.
                            if (phenotype_name.lower() in MAIZE_DUP_PHENOTYPE
                                    and MAIZE_DUP_PHENOTYPE[phenotype_name.lower()] != 0):
                                phenotype_name = (phenotype_name.lower() +
                                                  str(MAIZE_DUP_PHENOTYPE[phenotype_name.lower()]))
                            else:
                                phenotype_name = phenotype_name.lower()

                            row_templ.append(phenotype_name)
                            row_templ.append(phenotype_value)
                            add_to_phenotype_field_list(phenotype_name, phenotype_value)
                            add_exp_loc_list(experiment_name, year, location_name)
                            file_data.append(row_templ)
            console._print('Processed %d records' % len(file_data))
        except (UnicodeError, KeyError) as e:
            console.error('File name: ' + file_name + ' MISSING :>>>> ' + str(e))
            pass
    for phenotype_col in phenotype_cols:
        if phenotype_col.lower() in MAIZE_DUP_PHENOTYPE:
            MAIZE_DUP_PHENOTYPE[phenotype_col.lower()] = int(
                MAIZE_DUP_PHENOTYPE[phenotype_col.lower()]) + 1
    return file_data
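# add_to_phenotype_field_list() and add_exp_loc_list() are defined elsewhere in
# this module. Judging by how their output is consumed by the worksheet builders
# above (a field -> values map, (experiment, year, location) tuples, and a plain
# location list), they plausibly look like the sketch below; the global names and
# the '_sketch' suffixes are assumptions, not the module's real identifiers.
PHENOTYPE_FIELD_MAP_SKETCH = {}   # lower-cased field name -> list of observed values
EXPERIMENT_LIST_SKETCH = []       # (experiment_name, year, location) tuples
LOCATION_LIST_SKETCH = []         # location names

def add_to_phenotype_field_list_sketch(phenotype_name, phenotype_value):
    # Collect every observed value per field so min/max can be computed later.
    PHENOTYPE_FIELD_MAP_SKETCH.setdefault(phenotype_name.lower(), []).append(phenotype_value)

def add_exp_loc_list_sketch(experiment_name, year, location_name):
    # Track which experiment/year/location combinations were seen.
    EXPERIMENT_LIST_SKETCH.append((experiment_name, year, location_name))
    LOCATION_LIST_SKETCH.append(location_name)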