def helper_test_score(scorer, method: str, left: str, right: str, filename: Path):
    """
    Compute a score between two genotypes and look up the matching expected score.

    The workbook at `filename` must contain a `genotype` sheet and the score sheet
    that corresponds to `method`.

    Parameters
    ----------
    scorer
        Object providing the `calculate_score_*` methods used below.
    method: str
        One of 'greater', 'fixed', 'derivative' or 'jaccard'.
    left, right: str
        Labels of the two genotypes to compare.
    filename: Path
        Workbook holding the `genotype` sheet and the `score.*` sheets.

    Returns
    -------
    tuple
        `(actual_score, expected_score)`

    Raises
    ------
    ValueError
        If `method` is not one of the supported values.
    """
    # method -> (sheet holding the expected scores, scorer method to call)
    score_methods = {
        'greater': ('score.greater', 'calculate_score_greater'),
        'fixed': ('score.fixed', 'calculate_score_above_fixed'),
        'derivative': ('score.derivative', 'calculate_score_derivative'),
        'jaccard': ('score.jaccard', 'calculate_score_area'),
    }

    table_genotype = dataio.import_table(filename, sheet_name='genotype', index='Genotype', keep_empty=True)

    if method not in score_methods:
        supported = ", ".join(repr(name) for name in score_methods)
        raise ValueError(f"Unsupported scoring method {method!r}; expected one of {supported}.")

    sheetname, calculator_name = score_methods[method]
    calculate = getattr(scorer, calculator_name)
    actual_score = calculate(table_genotype.loc[left], table_genotype.loc[right])

    table_scores = dataio.import_table(filename, sheet_name=sheetname, index='name', keep_empty=True)
    expected_score = table_scores[left][right]
    return actual_score, expected_score
def edges_table() -> pandas.Series:
    """Return the `Parent` column of a fixed edges table, indexed by `Identity`."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    edges_string = """ Parent Identity genotype-0 genotype-2 genotype-0 genotype-1 genotype-1 genotype-5 genotype-2 genotype-6 genotype-1 genotype-4 genotype-4 genotype-12 genotype-6 genotype-3 genotype-2 genotype-9 genotype-4 genotype-14 genotype-0 genotype-10 genotype-10 genotype-15 genotype-10 genotype-16 genotype-0 genotype-11 genotype-2 genotype-8 genotype-2 genotype-7 genotype-6 genotype-13 genotype-4 genotype-17 genotype-4 genotype-19 genotype-14 genotype-18 """
    return import_table(edges_string, index='Identity')['Parent']
def load_table(io: Union[str, Path, pandas.DataFrame]) -> pandas.DataFrame:
    """Return `io` unchanged when it is already a DataFrame; otherwise import it as a table."""
    if isinstance(io, pandas.DataFrame):
        return io
    # Anything else is assumed to be an input that `dataio.import_table` understands.
    return dataio.import_table(io)
def test_parse_annotations():
    """Check that genotype members are mapped to their `gene mutation` annotations."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    annotation_source = """ Trajectory mutation gene 1 A>C PROKKA_00139/>glyA 2 T>G gapN/>glgB 3 G>C PROKKA_00438< 4 C>T PROKKA_00487< 5 A>T PROKKA_00512 6 C>A intergenic(62/+110) 7 G>T bglK_1< 8 G>T dnaI< """
    annotation_table = import_table(annotation_source, index='Trajectory')

    # Each genotype lists its member trajectories as a `|`-joined string.
    members_by_genotype = {
        'genotype-1': "1|3|5",
        'genotype-2': "2",
        'genotype-3': "6|7|8",
    }
    expected = {
        'genotype-1': ["PROKKA_00139|glyA A>C", "PROKKA_00438 G>C", "PROKKA_00512 A>T"],
        'genotype-2': ["gapN|glgB T>G"],
        'genotype-3': ["intergenic(62/+110) C>A", "bglK_1 G>T", "dnaI G>T"],
    }

    observed = annotations.parse_genotype_annotations(members_by_genotype, annotation_table)
    assert expected == observed
def run_lineageplot_workflow(edgesio: Union[str, Path, pandas.DataFrame], filename: Path, sheet_name: Optional[str] = None):
    """
    Generate a lineage plot from an `edges` table.

    Column labels are normalized with `str.capitalize`, so the table needs columns that
    normalize to `Parent` and `Identity` (e.g. `parent`/`identity`). An optional
    `Annotation` column is used to annotate the plot.

    Parameters
    ----------
    edgesio: Union[str, Path, pandas.DataFrame]
        Either a table to import with `dataio.import_table` or an already-loaded DataFrame.
        A DataFrame passed in is not modified.
    filename: Path
        Passed through to `flowchart` as its final argument.
    sheet_name: Optional[str]
        Forwarded to `dataio.import_table` when `edgesio` is a str or Path.
    """
    from muller.graphics import flowchart
    from muller.graphics.palettes import generate_palette

    if isinstance(edgesio, (str, Path)):
        edges = dataio.import_table(edgesio, sheet_name=sheet_name)
    else:
        edges = edgesio

    # Capitalize the column labels ('parent' -> 'Parent'). `rename` returns a new frame,
    # so a DataFrame supplied by the caller keeps its original column labels.
    edges = edges.rename(columns=lambda label: label.capitalize())
    edges = edges.set_index('Identity')

    if 'Annotation' in edges.columns:
        # The flowchart expects each annotation to be a list of str.
        edges_annotations = {
            identity: [annotation]
            for identity, annotation in edges.pop("Annotation").to_dict().items()
        }
    else:
        edges_annotations = {}

    palette = generate_palette(edges['Parent'], kind='lineage')
    flowchart(edges, palette, edges_annotations, filename)
def genotypeannotations() -> pandas.DataFrame:
    """Return a fixed annotation table indexed by `Trajectory`."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file (some fields contain spaces).
    annotation_text = """ Trajectory Chromosome Position Class Mutation Gene Annotation Class Amino Description 1 1 36,414 SNP G>A speA C127Y(TGT>TAT) Non C->Y Arginine decarboxylase 2 1 138,043 SNP C>T rlmCD_1 H441H(CAC>CAT) Syn H->H 23S rRNA (uracilC(5))methyltransferase RlmCD 3 1 165,470 SNP C>A PROKKA_00173/>PROKKA_00174 intergenic(+174/91) NC na Relaxase/Mobilisation nuclease domain protein/hypothetical protein 4 1 234,888 SNP C>A gapN A95E(GCA>GAA) Non A->E NADPdependent glyceraldehyde3phosphate dehydrogenase """
    annotation_table = dataio.import_table(annotation_text, index='Trajectory')
    return annotation_table
def helper_for_testing_tables(sorter, filename):
    """
    Sort the `unsorted` sheet of `filename` and return it with the expected `genotype` sheet.

    Both sheets are reduced to their numeric columns and their column labels are cast to int.

    Returns
    -------
    tuple
        `(result, expected_table)` where `result` is the output of `sorter.run` with its
        index named `Genotype`.
    """

    def _load_numeric_sheet(sheet):
        # Import one sheet, keep only the numeric columns and cast their labels to int.
        frame = dataio.import_table(filename, sheet_name=sheet, index="Genotype")
        frame = frame[widgets.get_numeric_columns(frame.columns)]
        frame.columns = [int(label) for label in frame.columns]
        return frame

    expected_table = _load_numeric_sheet("genotype")
    unsorted_table = _load_numeric_sheet("unsorted")

    result = sorter.run(unsorted_table)
    result.index.name = 'Genotype'
    return result, expected_table
def test_clustering_algorithm_on_real_tables(cluster, filename):
    """Cluster the `trajectory` sheet and compare the genotype members with the expected grouping."""
    trajectory_table = dataio.import_table(filename, sheet_name='trajectory', index='Trajectory')
    expected = sorted(helper_get_expected_members(trajectory_table).values())

    clustered = cluster.run(trajectory_table, distance_cutoff=0.2)
    observed = sorted(clustered.genotype_members.values())

    assert observed == expected
def test_correct_math_scale():
    """Check that `_correct_math_scale` turns the `X4` values 35 and 45 into the floats 0.35 and 0.45."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    percent_table = """Trajectory X0 X1 X2 X3 X4 X5 trajectory-A2 0 0 0 6 35 4 trajectory-A3 0 0 0 0 45 5"""
    original = import_table(percent_table, index='Trajectory')
    assert original['X4'].tolist() == [35, 45]

    corrected = _correct_math_scale(original)
    rescaled_column = corrected['X4']
    assert rescaled_column.tolist() == [0.35, 0.45]
    assert rescaled_column.dtype == float
def test_parse_tree():
    """Check the `clade` and `iterations` columns produced by `treetools.parse_tree`."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    edges_text = """ Parent Identity genotype-0 genotype-1 genotype-1 genotype-13 genotype-1 genotype-12 genotype-13 genotype-9 genotype-9 genotype-10 genotype-0 genotype-5 genotype-1 genotype-11 genotype-10 genotype-7 genotype-1 genotype-4 genotype-13 genotype-6 genotype-0 genotype-2 genotype-1 genotype-3 genotype-1 genotype-8 """
    # Series mapping each identity to its parent.
    parents = dataio.import_table(edges_text).set_index('Identity')['Parent']

    expected_parent = {
        'genotype-1': 'genotype-1',
        'genotype-2': 'genotype-2',
        'genotype-3': 'genotype-1',
        'genotype-4': 'genotype-1',
        'genotype-5': 'genotype-5',
        'genotype-6': 'genotype-1',
        'genotype-7': 'genotype-1',
        'genotype-8': 'genotype-1',
        'genotype-9': 'genotype-1',
        'genotype-10': 'genotype-1',
        'genotype-11': 'genotype-1',
        'genotype-12': 'genotype-1',
        'genotype-13': 'genotype-1',
    }
    expected_distance = {
        'genotype-1': 1,
        'genotype-2': 1,
        'genotype-3': 2,
        'genotype-4': 2,
        'genotype-5': 1,
        'genotype-6': 3,
        'genotype-7': 5,
        'genotype-8': 2,
        'genotype-9': 3,
        'genotype-10': 4,
        'genotype-11': 2,
        'genotype-12': 2,
        'genotype-13': 2,
    }

    parsed = treetools.parse_tree(parents)
    assert parsed['clade'].to_dict() == expected_parent
    assert parsed['iterations'].to_dict() == expected_distance
def test_calculate_mean_genotype(genotype_generator, filename):
    """
    Compare the mean frequencies computed for each group of trajectories with the
    matching row of the `genotype` sheet.
    """
    trajectories = import_table(filename, sheet_name='trajectory', index='Trajectory')
    genotypes = import_table(filename, sheet_name='genotype', index='Genotype')

    # The genotype label is built from the second `-`-separated field of each trajectory label.
    trajectories['Genotype'] = [f"genotype-{label.split('-')[1]}" for label in trajectories.index]
    logger.debug(trajectories['Genotype'])

    for genotype_label, members in trajectories.groupby(by="Genotype"):
        expected = genotypes.loc[genotype_label].astype(float)
        observed = genotype_generator._calculate_mean_frequencies_of_trajectories(
            genotype_label, members, members.index)
        # Drop the `members` entry before casting the remaining values to float.
        del observed['members']
        observed = observed.astype(float)
        pandas.testing.assert_series_equal(observed, expected, check_index_type=False)
def table() -> pandas.DataFrame:
    """
    Load the `trajectory` sheet of `data/generic.genotypes.10.xlsx`, indexed by `Trajectory`.

    The workbook is located relative to this file's folder.
    """
    workbook = Path(__file__).parent / "data" / "generic.genotypes.10.xlsx"
    return dataio.import_table(workbook, index='Trajectory', sheet_name="trajectory")
def load_datasets(filenames: List[Path]) -> List[pandas.DataFrame]:
    """Load the `trajectory` sheet of every file, keeping only the numeric columns."""

    def _load_numeric(path):
        logger.debug(f"Loading {path}...")
        frame = dataio.import_table(path, sheet_name='trajectory', index="Trajectory")
        # Drop any non-numeric columns so that the entire table is numeric.
        return frame[widgets.get_numeric_columns(frame.columns)]

    return [_load_numeric(path) for path in filenames]
def load_highresolution_datasets(filename: Path) -> List[pandas.DataFrame]:
    """
    Slice one large table into tables of increasing size.

    Only the numeric columns are kept and they are relabelled 0..k-1. The returned list
    holds the first 2**i rows of the table for i = 2..9.
    """
    full_table = dataio.import_table(filename, index="Trajectory")
    full_table = full_table[widgets.get_numeric_columns(full_table.columns)]
    full_table.columns = list(range(len(full_table.columns)))

    return [full_table.iloc[:2 ** exponent] for exponent in range(2, 10)]
def test_clustering_algorithm_on_generic_tables(cluster, filename):
    """Cluster the `trajectory` sheet and compare the genotype members with the expected grouping."""
    trajectory_table = dataio.import_table(filename, sheet_name='trajectory', index='Trajectory')

    # The table with 10 genotypes shouldn't be used as a trajectory table, so it is skipped.
    if len(trajectory_table) != 10:
        expected = sorted(helper_get_expected_members(trajectory_table).values())
        clustered = cluster.run(trajectory_table, distance_cutoff=0.2)
        observed = sorted(clustered.genotype_members.values())
        assert observed == expected
def get_table_genotypes() -> pandas.DataFrame:
    """Return a fixed genotype table imported without an explicit index column."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    genotype_text = """ Genotype 0 30 45 60 90 genotype-7 0 0.167 0.55 0.91 0.972 genotype-10 0 0.02 0.265 0.9 0.97 genotype-11 0 0 0.15 0.836 0.945 genotype-8 0 0.11363636 0.45 0.735 0.86 genotype-9 0 0.08 0.475 0.625 0.848 genotype-1 0 0.3345 0.28125 0.095 0.045 genotype-13 0 0 0 0 0.315 genotype-4 0 0.005 0.24 0.03 0.0375 genotype-2 0 0 0 0 0.245666666666667 genotype-5 0 0.139 0 0 0 genotype-12 0 0 0 0 0.11 genotype-6 0 0.063 0 0 0.01 genotype-3 0 0 0 0 0.01 """
    genotype_table = dataio.import_table(genotype_text)
    return genotype_table
def get_table_edges() -> pandas.Series:
    """Return the `Parent` column of a fixed edges table, indexed by `Identity`."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    edges_text = """ Identity Parent genotype-7 genotype-0 genotype-10 genotype-7 genotype-11 genotype-10 genotype-8 genotype-7 genotype-9 genotype-8 genotype-1 genotype-0 genotype-13 genotype-11 genotype-4 genotype-1 genotype-2 genotype-11 genotype-5 genotype-0 genotype-12 genotype-10 genotype-6 genotype-0 genotype-3 genotype-0 """
    edges_frame = dataio.import_table(edges_text)
    indexed_by_identity = edges_frame.set_index('Identity')
    return indexed_by_identity['Parent']
def parse_genotype_table(filename: Path, sheet_name: str = 'Sheet1') -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Import a table that lists pre-computed genotypes rather than trajectories.

    Parameters
    ----------
    filename: Path
        The table to import.
    sheet_name: str; Default 'Sheet1'
        Forwarded to `import_table`.

    Returns
    -------
    pandas.DataFrame, pandas.DataFrame
        The two tables returned by `_parse_table`. The first (timeseries) table is indexed
        by labels of the form `genotype-N` when relabelling was needed and is sorted by the
        numeric suffix of its labels when every suffix can be read as a number.

    Raises
    ------
    ValueError
        If none of the columns `Genotype`, `Unnamed: 0` or `Trajectory` is present.
    """
    data = import_table(filename, sheet_name=sheet_name)

    # For some reason some tables are annotated with 'genotype ' with extra spaces.
    data.columns = [(i.strip() if isinstance(i, str) else i) for i in data.columns]

    # Use the first recognized label column, in order of preference.
    key_column = next(
        (label for label in ('Genotype', 'Unnamed: 0', 'Trajectory') if label in data.columns),
        None
    )
    if key_column is None:
        message = f"One of the columns needs to be labeled `Genotype`. Got {data.columns} instead from {filename}."
        raise ValueError(message)

    genotype_timeseries, genotype_info = _parse_table(data, key_column)

    # Make sure the genotype labels are prefixed with 'genotype-'.
    # An empty table has no first label to inspect, and the label is coerced to `str`
    # because non-string labels (e.g. integers) have no `startswith` method.
    has_rows = len(genotype_timeseries.index) > 0
    if has_rows and not str(genotype_timeseries.index[0]).startswith('genotype'):
        genotype_timeseries.index.name = 'originalLabel'
        genotype_timeseries['Genotype'] = [f'genotype-{i}' for i in range(1, len(genotype_timeseries) + 1)]
        genotype_timeseries = genotype_timeseries.reset_index().set_index('Genotype')
        genotype_timeseries.pop('originalLabel')

    # Try to sort the genotypes by label, if possible.
    # Note: designed to sort labels of the form `genotype-[\d]+`
    try:
        sorted_index = sorted(genotype_timeseries.index, key=lambda s: float(str(s).split('-')[-1]))
    except ValueError:
        # At least one label has a non-numeric suffix; keep the existing order.
        sorted_index = genotype_timeseries.index
    genotype_timeseries = genotype_timeseries.loc[sorted_index]

    return genotype_timeseries, genotype_info
def test_calculate_mean_genotype(genotype_generator):
    """Check `calculate_mean_genotype` against a fixed table of expected mean genotypes."""
    member_groups = [['7'], ['4', '8'], ['3', '2'], ['13', '20', '11']]

    # Trajectory labels are cast to str before being used as the index.
    trajectories = pandas.read_csv(StringIO(trajectory_csv))
    trajectories = trajectories.astype({'Trajectory': str}).set_index('Trajectory')

    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    expected_csv = """ Genotype 0 17 25 44 66 75 90 members genotype-1 0.0 0.0 0.0 0.273 0.781 1.0 1.0 7 genotype-2 0 0 0 0 0.278 0.822 0.803 4|8 genotype-3 0 0 0 0.336 0.452 0.9175 0.8985 3|2 genotype-4 0 0 0 0.082 0.234666666666667 0.019 0.052 13|20|11 """
    expected_mean = import_table(expected_csv, index='Genotype')

    output = genotype_generator.calculate_mean_genotype(member_groups, trajectories)

    for frame in (expected_mean, output):
        logger.debug(frame.to_string())

    pandas.testing.assert_frame_equal(expected_mean, output)
def parse_trajectory_table(filename: Union[str, Path], sheet_name='Sheet1') -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Read an excel or csv file that has a `Trajectory` column and a column for each timepoint.

    Parameters
    ----------
    filename: Union[str, Path]
        The table containing the trajectories and associated metadata. Can be an excel
        sheet or a comma/tab delimited file.
    sheet_name: str; Default 'Sheet1'
        Indicates which sheet contains the data, if an excel table is given.

    Returns
    -------
    pandas.DataFrame, pandas.DataFrame
        A timeseries dataframe
            - Index -> str
                Names unique to each trajectory; the index is named `Trajectory`.
            - Columns -> int
                The timepoints; each cell holds the observed frequency of the trajectory
                at that timepoint.
        A metadata dataframe
            - Index -> str
                Identical index to the timeseries dataframe.
            - Columns -> str
                All columns from the input table that do not correspond to timepoints,
                with lowercase labels.
    """
    raw_table = import_table(filename, sheet_name)
    timeseries, info = _parse_table(raw_table, 'Trajectory')

    # Keep the index name consistent regardless of how the input labelled it.
    timeseries.index.name = 'Trajectory'

    # A `genotype` column means the file was generated by a previous run; discard it.
    if 'genotype' in info:
        del info['genotype']

    # Lowercase metadata labels help with later parsing.
    info.columns = [label.lower() for label in info.columns]
    return timeseries, info
def transposed_genotypes() -> pandas.DataFrame:
    """Return a fixed genotype table, indexed by `Genotype` and then transposed."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    genotype_text = """ Genotype 0 17 25 44 66 75 90 genotype-1 0 0 0.261 1 1 1 1 genotype-2 0 0.38 0.432 0 0 0 0 genotype-3 0 0 0 0 0 1 1 genotype-4 0 0 0 0.525 0.454 0.911 0.91 genotype-5 0 0 0 0.147 0.45 0.924 0.887 genotype-6 0 0 0 0.273 0.781 1 1 genotype-7 0 0 0 0.188 0.171 0.232 0.244 genotype-8 0 0 0 0.403 0.489 0.057 0.08 genotype-9 0 0 0.117 0 0 0 0.103 genotype-10 0 0 0 0.138 0.295 0 0.081 genotype-11 0 0 0 0 0.278 0.822 0.803 genotype-12 0 0 0 0 0.2335 0.133 0.0375 genotype-13 0 0 0.033 0.106 0.1065 0 0 genotype-14 0 0 0 0 0 0.2675 0.326 genotype-15 0 0 0 0.1145 0 0.1205 0.0615 """
    return import_table(genotype_text, index='Genotype').T
def transposed_mouse_genotypes() -> pandas.DataFrame:
    """Return a fixed genotype table with timepoints 0-10, indexed by `Genotype` and then transposed."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    genotype_text = """ Genotype 0 1 2 3 4 5 6 7 8 9 10 genotype-1 0 0 0.045 0.197 0.261 0.096 0.26 0.596 0.66 0.877 0.969 genotype-2 0.01 0.279 0.341 0.568 0.708 0.913 0.756 0.455 0.399 0.13 0.041 genotype-3 0 0.056 0.101 0.174 0 0 0 0 0 0 0 genotype-4 0.278 0.277 0.224 0.195 0 0 0 0 0 0 0 genotype-5 0 0 0 0 0 0.247 0.388 0.215 0.403 0.141 0.028 genotype-6 0 0 0 0 0.148 0.384 0.344 0.289 0.333 0.146 0.031 genotype-7 0 0 0 0 0 0 0.084 0.12 0.124 0.343 0.398 genotype-8 0 0 0 0 0 0 0 0.077 0.018 0.239 0.308 genotype-9 0 0.088 0.036 0.046 0 0.059 0.052 0 0.073 0 0 genotype-10 0 0 0 0 0.072 0.047 0.057 0 0 0 0 genotype-11 0.027 0.059 0.0325 0.008 0 0 0 0 0 0 0 genotype-12 0.149 0.1885 0.172 0 0 0 0 0 0 0 0 genotype-13 0 0.00525 0.0065 0.005 0.00775 0 0.01275 0.051 0.032 0.0195 0.02175 genotype-14 0 0 0 0 0 0 0 0.0172 0.1156 0.112 0.0948 genotype-15 0.001857 0 0.003714 0.001143 0 0 0.003286 0.006571 0.034 0.040286 0.038143 """
    return import_table(genotype_text, index='Genotype').T
def test_flowchart():
    """Check the font color and label that `lineageplot.flowchart` assigns to a dark and a light node."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    edges_text = """ Parent Identity score genotype-0 genotype-1 1 genotype-1 genotype-13 2 """
    edges = dataio.import_table(edges_text)

    node_colors = {
        'genotype-13': '#222222',
        'genotype-1': '#CCCCCC',
        'genotype-0': '#000000',
    }
    graph = lineageplot.flowchart(edges, node_colors, annotations={'genotype-1': ['gene1']})

    # Node with a dark fill and no annotation.
    node_dark = graph.get_node('genotype-13')
    assert node_dark.attr['fontcolor'] == '#FFFFFF'
    assert node_dark.attr['label'] == 'genotype-13'

    # Node with a light fill; its annotation is appended to the label on a new line.
    node_light = graph.get_node('genotype-1')
    assert node_light.attr['fontcolor'] == '#333333'
    assert node_light.attr['label'] == 'genotype-1\ngene1'
def b1_data() -> pandas.DataFrame:
    """Load the `trajectory` sheet of the `B1` real table, with every index label cast to str."""
    source = filenames.real_tables["B1"]
    trajectories = dataio.import_table(source, sheet_name='trajectory', index='Trajectory')
    trajectories.index = list(map(str, trajectories.index))
    return trajectories
def model_strong_selection() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.strongselection` table, indexed by `Genotype`."""
    return dataio.import_table(
        model_tables['model.strongselection'],
        sheet_name='genotype',
        index="Genotype"
    )
def model_clonal_interferance() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.clonalinterferance` table, indexed by `Genotype`."""
    return dataio.import_table(
        model_tables['model.clonalinterferance'],
        sheet_name='genotype',
        index='Genotype'
    )
def model_periodic_selection() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.periodicselection` table, indexed by `Genotype`."""
    return dataio.import_table(
        model_tables['model.periodicselection'],
        sheet_name="genotype",
        index='Genotype'
    )
def get_table_population_smoothed() -> pandas.DataFrame:
    """Return a fixed long-format table of `Identity`, `Generation` and `Population`, imported with `keep_empty=True`."""
    # NOTE(review): the table literal is on a single line here; confirm the row/field
    # delimiters against the upstream file.
    population_text = """ Identity Generation Population genotype-7 0 0 genotype-7 30 5.336364 genotype-7 45 10 genotype-7 60 1 genotype-7 90 1 genotype-10 0 0 genotype-10 30 0 genotype-10 45 11.5 genotype-10 60 6.40000000000001 genotype-10 90 1 genotype-11 0 0 genotype-11 30 0 genotype-11 45 15 genotype-11 60 83.6 genotype-11 90 63 genotype-8 0 0 genotype-8 30 3.363636 genotype-8 45 1 genotype-8 60 11 genotype-8 90 1 genotype-9 0 0 genotype-9 30 8 genotype-9 45 47.5 genotype-9 60 62.5 genotype-9 90 84.8 genotype-1 0 0 genotype-1 30 32.95 genotype-1 45 4.125 genotype-1 60 6.5 genotype-1 90 1 genotype-13 0 0 genotype-13 30 0 genotype-13 45 0 genotype-13 60 0 genotype-13 90 31.5 genotype-4 0 0 genotype-4 30 0.5 genotype-4 45 24 genotype-4 60 3 genotype-4 90 3.75 genotype-2 0 0 genotype-2 30 0 genotype-2 45 0 genotype-2 60 0 genotype-2 90 24.5666666666667 genotype-5 0 0 genotype-5 30 13.9 genotype-5 45 0 genotype-5 60 0 genotype-5 90 0 genotype-12 0 0 genotype-12 30 0 genotype-12 45 0 genotype-12 60 0 genotype-12 90 11 genotype-6 0 0 genotype-6 30 6.3 genotype-6 45 0 genotype-6 60 0 genotype-6 90 1 genotype-3 0 0 genotype-3 30 0 genotype-3 45 0 genotype-3 60 0 genotype-3 90 1 genotype-0 0 100 genotype-0 30 29.65 genotype-0 45 0 genotype-0 60 0 genotype-0 90 0 """
    population_table = dataio.import_table(population_text, keep_empty=True)
    return population_table
def trajectory_table() -> pandas.DataFrame:
    """Load the `data` sheet of the `generic.model.area` fake table, indexed by `Trajectory`."""
    source = filenames.fake_tables['generic.model.area']
    trajectories = dataio.import_table(source, sheet_name="data", index='Trajectory')
    return trajectories