def __create_phenotype_worksheet(self, phenotype_worksheet, row_num, col_num, phenotypeUnitMap, phenotypeFieldList, phenotypeFieldMap):
        """Write the phenotype header row and one summary row per field.

        Each data row contains the lower-cased field name, its unit of
        measure ('NA' when none is found), an empty cell, and the minimum
        and maximum of the field's values with 'NA' entries excluded.
        Fields that have no non-'NA' values are skipped and do not consume
        a row.

        Args:
            phenotype_worksheet: Worksheet object exposing ``write_row``.
            row_num: Row index at which the first data row is written.
            col_num: Column index at which every data row starts.
            phenotypeUnitMap: Mapping of phenotype name to unit of measure.
            phenotypeFieldList: Iterable of phenotype field names.
            phenotypeFieldMap: Mapping of lower-cased field name to the
                iterable of values recorded for that field.

        Raises:
            KeyError: If a field has no entry in ``phenotypeFieldMap``.
        """
        phenotype_worksheet.write_row(0, 0, tuple(PHN_CONST.PHENOTYPE_HEADERS))

        # Pair each field with a unit-map key whose lower-cased form is at
        # edit distance < 1 from it (presumably a case-insensitive exact
        # match -- confirm against the __levenshtein helper). When several
        # keys qualify, the last one iterated wins.
        local_field_map = {}
        for field in phenotypeFieldList:
            lowered_field = field.lower()
            for key in phenotypeUnitMap:
                if self.__levenshtein(lowered_field, key.lower()) < 1:
                    local_field_map[field] = key

        for field in phenotypeFieldList:
            values = phenotypeFieldMap[field.lower()]
            if field in phenotypeUnitMap:
                unit_of_measure = phenotypeUnitMap[field]
            elif field in local_field_map:
                unit_of_measure = phenotypeUnitMap[local_field_map[field]]
            else:
                unit_of_measure = 'NA'

            seq_list = [x for x in values if x != 'NA']
            if not seq_list:
                # min()/max() raise ValueError on an empty sequence; skip the
                # field explicitly rather than swallowing that exception.
                continue

            phenotype_row_data = (field.lower(), unit_of_measure, '', min(seq_list), max(seq_list))
            phenotype_worksheet.write_row(row_num, col_num, phenotype_row_data)
            row_num += 1
        console.info('* Added phenotype data')
Exemple #2
0
def fetch_sorghum_germplasm_taxa():
    """Append the germplasm ids from the Sorghum taxa CSV to GERMPLASM_DATA_LIST.

    Reads the comma-delimited file at ``DIR_CONST.SORGHUM_GERM_TAXA``, takes
    the ``Taxa`` column of every row, passes it through
    ``remove_underscores`` and appends the result to the module-level
    ``GERMPLASM_DATA_LIST``.

    Raises:
        KeyError: If a row has no ``Taxa`` column.
    """
    console.info('Reading Taxa germplasm data for Sorghum BAP')
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(DIR_CONST.SORGHUM_GERM_TAXA, newline='') as taxa_file:
        for row in csv.DictReader(taxa_file, delimiter=','):
            GERMPLASM_DATA_LIST.append(remove_underscores(row['Taxa']))
 def __create_experiment_worksheet(self, experiment_worksheet, row_num, col_num, experiment_list):
     """Write the experiment header row, then one row per unique experiment.

     Entries of ``experiment_list`` are de-duplicated and sorted. Each entry
     is indexed as (name, year, location) and written as
     ``name, location, '', '', '', '', '', year`` starting at ``row_num``.

     Args:
         experiment_worksheet: Worksheet object exposing ``write_row``.
         row_num: Row index of the first data row.
         col_num: Column index at which every data row starts.
         experiment_list: Iterable of hashable, sortable experiment entries.
     """
     unique_experiments = sorted(set(experiment_list))
     experiment_worksheet.write_row(0, 0, tuple(PHN_CONST.EXPERIMENT_HEADERS))
     for current_row, entry in enumerate(unique_experiments, start=row_num):
         name, year, place = entry[0], entry[1], entry[2]
         experiment_worksheet.write_row(
             current_row, col_num,
             (name, place, '', '', '', '', '', year))
     console.info('* Added experiments data')
 def __create_location_worksheet(self, location_worksheet, row_num, col_num, location_list):
     """Write the location header row, then one row per unique location.

     Locations are de-duplicated and sorted. The placeholder location 'NA'
     produces a row starting with three 'NA' cells; any other location
     produces a row starting with ``1, 'USA', <location>``. Both are padded
     with six empty cells.

     Args:
         location_worksheet: Worksheet object exposing ``write_row``.
         row_num: Row index of the first data row.
         col_num: Column index at which every data row starts.
         location_list: Iterable of hashable, sortable location names.
     """
     unique_locations = sorted(set(location_list))
     location_worksheet.write_row(0, 0, tuple(PHN_CONST.LOCATION_HEADERS))
     padding = ('', '', '', '', '', '')
     for current_row, place in enumerate(unique_locations, start=row_num):
         leading = ('NA', 'NA', 'NA') if place == 'NA' else (1, 'USA', place)
         location_worksheet.write_row(current_row, col_num, leading + padding)
     console.info('* Added locations data')
 def create_germplasm_workbook(self, germplasm_data):
     """Write germplasm rows to ``<OUTPUT_DIR>/<experiment>_germplasm.xlsx``.

     Creates a workbook with a single germplasm sheet, writes the header row
     followed by one row per entry of ``germplasm_data``, closes the
     workbook, and prints the elapsed time as ``HH:MM:SS.mmm``.

     Args:
         germplasm_data: Iterable of row sequences; each is written as one
             worksheet row starting at column 0.
     """
     start_time = time.time()
     germplasm_file = DIR_CONST.OUTPUT_DIR+'/'+self.__experimentName+'_germplasm.xlsx'
     workbook = xlsxwriter.Workbook(germplasm_file)
     # Create Data Sheets
     germplasm_worksheet = workbook.add_worksheet(GERM_CONST.GERMPLASM_SHEET)
     germplasm_worksheet.write_row(0, 0, tuple(GERM_CONST.GERMPLASM_HEADERS))
     for row_num, germ_row in enumerate(germplasm_data, start=1):
         germplasm_worksheet.write_row(row_num, 0, tuple(germ_row))
     console.info('* Added germplasm data')
     # xlsxwriter only writes the file to disk on close(); without this call
     # the workbook is never finalised. Closing before the timer stops also
     # makes the reported duration include the actual write.
     workbook.close()
     elapsed_time = time.time() - start_time
     # Derive zero-padded milliseconds arithmetically; slicing repr() breaks
     # on exponent notation and yields unpadded digits for short fractions.
     mlsec = '{:03d}'.format(int(elapsed_time % 1 * 1000))
     print('\bGermplasm Data written to file in : '+time.strftime("%H:%M:%S.{}".format(mlsec), time.gmtime(elapsed_time)))
 def __create_phenotype_measure_worksheet(self, phenotype_measures_worksheet, row_num, col_num, data):
     """Write the phenotype-measures header row, then every row of ``data``.

     Args:
         phenotype_measures_worksheet: Worksheet object exposing ``write_row``.
         row_num: Row index of the first data row.
         col_num: Column index at which every data row starts.
         data: Iterable of row sequences, written in iteration order.
     """
     phenotype_measures_worksheet.write_row(0, 0, tuple(PHN_CONST.PHENOTYPE_MEASURES_HEADERS))
     for current_row, measure in enumerate(data, start=row_num):
         phenotype_measures_worksheet.write_row(current_row, col_num, tuple(measure))
     console.info('* Added phenotype measures data')
Exemple #7
0
def read_file(file_name,
              delimiter,
              germplasm_cols,
              phenotype_cols,
              config,
              data_type=None):
    """Parse a delimited phenotype file into phenotype-measure rows.

    For every input row the location, year, experiment name and germplasm id
    are derived according to ``config``; then, for each column in
    ``phenotype_cols``, one output row of the form
    ``[experiment, '', location, '', '', '', '', '', germplasm_id,
    phenotype_name, phenotype_value]`` is produced. Values that match the
    date pattern are normalised to ``YYYY/MM/DD``, other values are
    converted to ``float``, and values that cannot be converted become 'NA'.

    Side effects: appends to the module-level ``GERMPLASM_DATA_LIST``, calls
    ``add_to_phenotype_field_list`` and ``add_exp_loc_list`` for every
    accepted value, and increments the ``MAIZE_DUP_PHENOTYPE`` counter of
    every column in ``phenotype_cols`` that is already present there.

    Args:
        file_name: Path of the file to read (decoded as latin-1).
        delimiter: Field delimiter passed to ``csv.DictReader``.
        germplasm_cols: Column names used to build the germplasm id; two
            columns (population, entry) when ``config['compound_germplasm']``
            is true, otherwise one.
        phenotype_cols: Column names holding phenotype values.
        config: Dict of parsing options. Keys read here:
            ``compound_location``, ``location_column``, ``location_map``,
            ``location_col``, ``year_col``, ``experiment_prefix``,
            ``compound_germplasm``, ``compound_phenotype``,
            ``phenotype_field_name`` and the optional ``column_flag``.
        data_type: Optional dataset tag ('SORGHUM', 'SORGHUM_SAP',
            'SORGHUM_NAM', 'SOY', ...) selecting dataset-specific handling.

    Returns:
        The list of rows built so far. A ``UnicodeError`` or ``KeyError``
        during parsing is logged and the rows collected up to that point
        are returned.
    """
    global GERMPLASM_DATA_LIST
    global MAIZE_DUP_PHENOTYPE

    with open(file_name, encoding="latin-1") as file:
        try:
            file_data = []
            reader = csv.DictReader(file, delimiter=delimiter)
            console.info('\nFile Name : ' + file_name)
            for row in reader:
                # Process Location
                if config['compound_location'] == True:
                    location_col_name = config['location_column']
                    location_map = config['location_map']
                    compound_location = row[location_col_name]
                    location_name = location_map[compound_location]['name']
                    year = location_map[compound_location]['year']
                    # In compound mode the year comes from the location map
                    # and no year column is read. Use the same 'None'
                    # sentinel as the config so the SORGHUM check below does
                    # not hit an unbound variable.
                    year_col_name = 'None'
                else:
                    location_col_name = config['location_col']
                    year_col_name = config['year_col']
                    if location_col_name != 'None' and year_col_name != 'None':
                        location_name = row[location_col_name]
                        year = row[year_col_name]
                    else:
                        location_name = 'NA'
                        year = '9999'

                if data_type == 'SORGHUM' and year_col_name != 'None':
                    year = sorghum_date_process(row[year_col_name])
                    experiment_name = (config['experiment_prefix'] + '_' +
                                       row['author'] + '_' + year)
                else:
                    experiment_name = config['experiment_prefix'] + '_' + year

                # Process germplasm
                if config['compound_germplasm'] == True:
                    pop_col = germplasm_cols[0]
                    entry_col = germplasm_cols[1]
                    germplasm_id = ('Z' + row[pop_col].rjust(3, '0') +
                                    'E' + row[entry_col].rjust(4, '0'))
                else:
                    germplasm_col = germplasm_cols[0]
                    if data_type == 'SORGHUM_NAM':
                        germplasm_id = split_germplasm_sorghum_nam(
                            row[germplasm_col])
                    else:
                        germplasm_id = row[germplasm_col]

                # Filter germplasm ids between z001 and z026
                if not germplasm_check(germplasm_id, data_type):
                    continue

                # Process Phenotype data
                for column in phenotype_cols:
                    if 'column_flag' in config:
                        skip_column = row[config['column_flag']] == 'MISSING'
                    else:
                        skip_column = False

                    # Skip flagged rows and spreadsheet placeholder values.
                    if skip_column or row[column] in ('#VALUE!', '.',
                                                      '(null)'):
                        continue

                    row_templ = [
                        experiment_name, '', location_name, '', '', '',
                        '', '', germplasm_id
                    ]
                    if config['compound_phenotype'] == True:
                        if 'maize_inflo7_rawdata.txt' in file_name or 'baptraits.csv' in file_name:
                            # For these files the phenotype name is stored
                            # per row in the configured column.
                            phenotype_name = row[
                                config['phenotype_field_name']]
                        else:
                            phenotype_name = config['phenotype_field_name']
                    else:
                        phenotype_name = column

                    try:
                        if date_pattern_match(row[column]):
                            phenotype_value = parse(
                                row[column]).strftime("%Y/%m/%d")
                        else:
                            phenotype_value = float(row[column])
                    except ValueError:
                        phenotype_value = 'NA'

                    if not data_type_check(data_type, year, phenotype_value):
                        continue

                    if data_type == 'SOY':
                        GERMPLASM_DATA_LIST.append(
                            (germplasm_id, row['Family'], row['FamNo']))
                    elif data_type == 'SORGHUM_NAM':
                        GERMPLASM_DATA_LIST.append(
                            (germplasm_id, row['fam']))
                    else:
                        GERMPLASM_DATA_LIST.append(germplasm_id)

                    # Phenotype names that already occurred in earlier files
                    # get the current duplicate counter appended.
                    lowered_name = phenotype_name.lower()
                    if (lowered_name in MAIZE_DUP_PHENOTYPE
                            and MAIZE_DUP_PHENOTYPE[lowered_name] != 0):
                        phenotype_name = lowered_name + str(
                            MAIZE_DUP_PHENOTYPE[lowered_name])
                    else:
                        phenotype_name = lowered_name
                    row_templ.append(phenotype_name)
                    row_templ.append(phenotype_value)
                    add_to_phenotype_field_list(phenotype_name,
                                                phenotype_value)
                    add_exp_loc_list(experiment_name, year, location_name)

                    # Sorghum datasets drop 'NA' measurements; every other
                    # dataset keeps them. The previous condition
                    # `A or B and C` bound as `A or (B and C)`, so 'NA' rows
                    # were never filtered for plain 'SORGHUM'.
                    is_sorghum = data_type in ('SORGHUM', 'SORGHUM_SAP')
                    if not is_sorghum or phenotype_value != 'NA':
                        file_data.append(row_templ)
                console._print('Processed %d records' % len(file_data))
        except (UnicodeError, KeyError) as e:
            console.error('File name: ' + file_name + ' MISSING :>>>> ' +
                          str(e))
        for phenotype_col in phenotype_cols:
            if phenotype_col.lower() in MAIZE_DUP_PHENOTYPE:
                MAIZE_DUP_PHENOTYPE[phenotype_col.lower()] = int(
                    MAIZE_DUP_PHENOTYPE[phenotype_col.lower()]) + 1
        return file_data
Exemple #8
0
def read_heredity_data(file_name, phenotype_cols):
    """Parse the maize heredity CSV into phenotype-measure rows.

    For each input row the ``env`` code (left-padded with zeros to six
    characters) is looked up in ``MAIZE_HEREDITY_LOC_CONF`` to obtain the
    location name and year, and the germplasm id is built from the ``pop``
    and ``entry_num`` columns. For rows that pass ``germplasm_check``, one
    output row per column in ``phenotype_cols`` is produced unless the row's
    ``entry_id`` is 'MISSING'. Values that cannot be converted to ``float``
    become 'NA'.

    Side effects: appends to the module-level ``GERMPLASM_DATA_LIST``, calls
    ``add_to_phenotype_field_list`` and ``add_exp_loc_list`` for every
    produced row, and increments the ``MAIZE_DUP_PHENOTYPE`` counter of
    every column in ``phenotype_cols`` that is already present there.

    Args:
        file_name: Path of the comma-delimited file (decoded as latin-1).
        phenotype_cols: Column names holding phenotype values.

    Returns:
        The list of rows built so far. A ``UnicodeError`` or ``KeyError``
        during parsing is logged and the rows collected up to that point
        are returned.
    """
    global GERMPLASM_DATA_LIST
    global MAIZE_DUP_PHENOTYPE

    with open(file_name, encoding="latin-1") as file:
        try:
            file_data = []
            reader = csv.DictReader(file, delimiter=',')
            console.info('\nFile Name : ' + file_name)
            for record in reader:
                # rjust leaves codes of six or more characters untouched.
                env_code = record['env'].rjust(6, '0')
                location_name = MAIZE_HEREDITY_LOC_CONF[env_code]['name']
                year = MAIZE_HEREDITY_LOC_CONF[env_code]['year']

                population = record['pop'].rjust(3, '0')
                entry = record['entry_num'].rjust(4, '0')
                germplasm_id = 'Z' + population + 'E' + entry

                experiment_name = (
                    'maizeNAM_Hung_2012_Heterdity' if year == 'NA'
                    else 'maizeNAM_Hung_2012_Heterdity_' + year)

                # Filter germplasm ids between z001 and z026
                if not germplasm_check(germplasm_id):
                    continue
                GERMPLASM_DATA_LIST.append(germplasm_id)

                # Process Phenotype data
                for column in phenotype_cols:
                    if record['entry_id'] == 'MISSING':
                        continue

                    try:
                        phenotype_value = float(record[column])
                    except ValueError:
                        phenotype_value = 'NA'

                    # Names already seen in earlier files get the current
                    # duplicate counter appended.
                    lowered = column.lower()
                    if (lowered in MAIZE_DUP_PHENOTYPE
                            and MAIZE_DUP_PHENOTYPE[lowered] != 0):
                        phenotype_name = lowered + str(
                            MAIZE_DUP_PHENOTYPE[lowered])
                    else:
                        phenotype_name = lowered

                    measure_row = [
                        experiment_name, '', location_name, '', '', '',
                        '', '', germplasm_id, phenotype_name,
                        phenotype_value
                    ]
                    add_to_phenotype_field_list(phenotype_name,
                                                phenotype_value)
                    add_exp_loc_list(experiment_name, year, location_name)
                    file_data.append(measure_row)
                console._print('Processed %d records' % len(file_data))
        except (UnicodeError, KeyError) as e:
            console.error('File name: ' + file_name + ' MISSING :>>>> ' +
                          str(e))

    for phenotype_col in phenotype_cols:
        counter_key = phenotype_col.lower()
        if counter_key in MAIZE_DUP_PHENOTYPE:
            MAIZE_DUP_PHENOTYPE[counter_key] = int(
                MAIZE_DUP_PHENOTYPE[counter_key]) + 1
    return file_data