Exemple #1
0
def helper_test_score(scorer, method: str, left: str, right: str,
                      filename: Path):
    """Compute a score with `scorer` and fetch the expected value from `filename`.

    The workbook must contain a `genotype` sheet and the score sheet matching
    `method` (`score.greater`, `score.fixed`, `score.derivative` or
    `score.jaccard`).

    Returns the tuple `(actual_score, expected_score)`.
    Raises `ValueError` when `method` is not one of `greater`, `fixed`,
    `derivative` or `jaccard`.
    """
    genotypes = dataio.import_table(filename,
                                    sheet_name='genotype',
                                    index='Genotype',
                                    keep_empty=True)

    # method -> (score sheet to read, name of the scorer method to call)
    dispatch = {
        'greater': ('score.greater', 'calculate_score_greater'),
        'fixed': ('score.fixed', 'calculate_score_above_fixed'),
        'derivative': ('score.derivative', 'calculate_score_derivative'),
        'jaccard': ('score.jaccard', 'calculate_score_area'),
    }
    if method not in dispatch:
        raise ValueError
    sheetname, scorer_method = dispatch[method]
    actual_score = getattr(scorer, scorer_method)(genotypes.loc[left],
                                                   genotypes.loc[right])

    scores = dataio.import_table(filename,
                                 sheet_name=sheetname,
                                 index='name',
                                 keep_empty=True)
    # The score sheet is read column-first: column `left`, row `right`.
    expected_score = scores[left][right]

    return actual_score, expected_score
Exemple #2
0
def edges_table() -> pandas.Series:
    """Return the `Parent` column of a small lineage table indexed by `Identity`."""
    edges_text = """
		Parent	Identity
		genotype-0	genotype-2
		genotype-0	genotype-1
		genotype-1	genotype-5
		genotype-2	genotype-6
		genotype-1	genotype-4
		genotype-4	genotype-12
		genotype-6	genotype-3
		genotype-2	genotype-9
		genotype-4	genotype-14
		genotype-0	genotype-10
		genotype-10	genotype-15
		genotype-10	genotype-16
		genotype-0	genotype-11
		genotype-2	genotype-8
		genotype-2	genotype-7
		genotype-6	genotype-13
		genotype-4	genotype-17
		genotype-4	genotype-19
		genotype-14	genotype-18
	"""
    return import_table(edges_text, index='Identity')['Parent']
Exemple #3
0
 def load_table(io: Union[str, Path, pandas.DataFrame]) -> pandas.DataFrame:
     """Return `io` as-is when it is already a DataFrame; otherwise import it.

     Any other input is handed to `dataio.import_table`.
     """
     if isinstance(io, pandas.DataFrame):
         return io
     # Not a DataFrame, so assume it is one of the formats import_table reads.
     return dataio.import_table(io)
def test_parse_annotations():
    """Check that genotype members are mapped to `gene mutation` annotation strings."""
    members_by_genotype = {
        'genotype-1': "1|3|5",
        'genotype-2': "2",
        'genotype-3': "6|7|8"
    }
    expected = {
        'genotype-1':
        ["PROKKA_00139|glyA A>C", "PROKKA_00438 G>C", "PROKKA_00512 A>T"],
        'genotype-2': ["gapN|glgB T>G"],
        'genotype-3': ["intergenic(62/+110) C>A", "bglK_1 G>T", "dnaI G>T"]
    }

    annotation_text = """
		Trajectory	mutation	gene
		1	A>C	PROKKA_00139/>glyA
		2	T>G	gapN/>glgB
		3	G>C	PROKKA_00438<
		4	C>T	PROKKA_00487<
		5	A>T	PROKKA_00512
		6	C>A	intergenic(62/+110)
		7	G>T	bglK_1<
		8	G>T	dnaI<
	"""
    annotation_table = import_table(annotation_text, index='Trajectory')

    observed = annotations.parse_genotype_annotations(members_by_genotype,
                                                      annotation_table)

    assert expected == observed
Exemple #5
0
def run_lineageplot_workflow(edgesio: Union[str, Path, pandas.DataFrame],
                             filename: Path,
                             sheet_name: Optional[str] = None):
    """Generate a lineage plot from an `edges` table.

    Parameters
    ----------
    edgesio: Union[str, Path, pandas.DataFrame]
        Either a table source that `dataio.import_table` can read, or an
        already-loaded table. The columns should be named `parent` and
        `identity` (any capitalization). An optional `annotation` column is
        used to annotate the plot.
    filename: Path
        Forwarded unchanged to `flowchart` (presumably the output location;
        confirm against `muller.graphics.flowchart`).
    sheet_name: Optional[str]
        Sheet to read when `edgesio` is a str or Path.
    """
    from muller.graphics import flowchart
    from muller.graphics.palettes import generate_palette
    if isinstance(edgesio, (str, Path)):
        edges = dataio.import_table(edgesio, sheet_name=sheet_name)
    else:
        # Work on a copy: the relabeling below would otherwise rename the
        # columns of the caller's own table.
        edges = edgesio.copy()
    # Normalize the column labels to `Capitalized` form ('parent' -> 'Parent').
    edges.columns = [i.capitalize() for i in edges.columns]
    edges = edges.set_index('Identity')
    if 'Annotation' in edges.columns:
        edges_annotations = edges.pop("Annotation").to_dict()
        # Make sure 'annotations' is a list of str which is expected by the flowchart
        edges_annotations = {k: [v] for k, v in edges_annotations.items()}
    else:
        edges_annotations = dict()

    palette = generate_palette(edges['Parent'], kind='lineage')

    flowchart(edges, palette, edges_annotations, filename)
def genotypeannotations() -> pandas.DataFrame:
    """Return a four-row mutation annotation table indexed by `Trajectory`."""
    annotation_text = """
	Trajectory	Chromosome	Position	Class	Mutation	Gene	Annotation	Class	Amino	Description
	1	1	36,414	SNP	G>A	speA	C127Y(TGT>TAT)	Non	C->Y	Arginine decarboxylase
	2	1	138,043	SNP	C>T	rlmCD_1	H441H(CAC>CAT)	Syn	H->H	23S rRNA (uracilC(5))methyltransferase RlmCD
	3	1	165,470	SNP	C>A	PROKKA_00173/>PROKKA_00174	intergenic(+174/91)	NC	na	Relaxase/Mobilisation nuclease domain protein/hypothetical protein
	4	1	234,888	SNP	C>A	gapN	A95E(GCA>GAA)	Non	A->E	NADPdependent glyceraldehyde3phosphate dehydrogenase
	"""
    annotation_table = dataio.import_table(annotation_text, index='Trajectory')
    return annotation_table
Exemple #7
0
def helper_for_testing_tables(sorter, filename):
    """Sort the `unsorted` sheet of `filename` and pair it with the expected table.

    Both the `genotype` (expected) and `unsorted` sheets are reduced to their
    numeric columns, whose labels are converted to `int`.

    Returns the tuple `(result, expected_table)`, where `result` is the output
    of `sorter.run` with its index renamed to `Genotype`.
    """

    def _numeric_sheet(sheet):
        # Load one sheet, keep only the numeric columns and use int labels.
        sheet_table = dataio.import_table(filename,
                                          sheet_name=sheet,
                                          index="Genotype")
        sheet_table = sheet_table[widgets.get_numeric_columns(
            sheet_table.columns)]
        sheet_table.columns = [int(label) for label in sheet_table.columns]
        return sheet_table

    expected_table = _numeric_sheet("genotype")
    unsorted_table = _numeric_sheet("unsorted")

    result = sorter.run(unsorted_table)
    result.index.name = 'Genotype'

    return result, expected_table
Exemple #8
0
def test_clustering_algorithm_on_real_tables(cluster, filename):
    """Cluster the `trajectory` sheet and compare the members with the expected grouping."""
    table = dataio.import_table(filename,
                                sheet_name='trajectory',
                                index='Trajectory')
    expected_members = helper_get_expected_members(table)

    clustering = cluster.run(table, distance_cutoff=0.2)

    # Genotype labels may differ, so only the (sorted) member groups are compared.
    observed = sorted(clustering.genotype_members.values())
    expected = sorted(expected_members.values())
    assert observed == expected
def test_correct_math_scale():
    """Check that `_correct_math_scale` turns 35/45 into the floats 0.35/0.45."""
    percent_text = """Trajectory	X0	X1	X2	X3	X4	X5
		trajectory-A2	0	0	0	6	35	4
		trajectory-A3	0	0	0	0	45	5"""
    percent_table = import_table(percent_text, index='Trajectory')
    assert percent_table['X4'].tolist() == [35, 45]

    rescaled_column = _correct_math_scale(percent_table)['X4']
    assert rescaled_column.tolist() == [0.35, 0.45]
    assert rescaled_column.dtype == float
Exemple #10
0
def test_parse_tree():
    """Check the `clade` and `iterations` columns produced by `treetools.parse_tree`."""
    edges_text = """
		Parent	Identity
		genotype-0	genotype-1
		genotype-1	genotype-13
		genotype-1	genotype-12
		genotype-13	genotype-9
		genotype-9	genotype-10
		genotype-0	genotype-5
		genotype-1	genotype-11
		genotype-10	genotype-7
		genotype-1	genotype-4
		genotype-13	genotype-6
		genotype-0	genotype-2
		genotype-1	genotype-3
		genotype-1	genotype-8
	"""
    edges = dataio.import_table(edges_text)
    parents = edges.set_index('Identity')['Parent']

    # Expected `clade` value for every genotype.
    expected_clade = {
        'genotype-1': 'genotype-1',
        'genotype-2': 'genotype-2',
        'genotype-3': 'genotype-1',
        'genotype-4': 'genotype-1',
        'genotype-5': 'genotype-5',
        'genotype-6': 'genotype-1',
        'genotype-7': 'genotype-1',
        'genotype-8': 'genotype-1',
        'genotype-9': 'genotype-1',
        'genotype-10': 'genotype-1',
        'genotype-11': 'genotype-1',
        'genotype-12': 'genotype-1',
        'genotype-13': 'genotype-1'
    }
    # Expected `iterations` value for every genotype.
    expected_iterations = {
        'genotype-1': 1,
        'genotype-2': 1,
        'genotype-3': 2,
        'genotype-4': 2,
        'genotype-5': 1,
        'genotype-6': 3,
        'genotype-7': 5,
        'genotype-8': 2,
        'genotype-9': 3,
        'genotype-10': 4,
        'genotype-11': 2,
        'genotype-12': 2,
        'genotype-13': 2
    }

    parsed = treetools.parse_tree(parents)

    assert parsed['clade'].to_dict() == expected_clade
    assert parsed['iterations'].to_dict() == expected_iterations
Exemple #11
0
def test_calculate_mean_genotype(genotype_generator, filename):
    """Compare the mean of each trajectory group with the matching `genotype` row.

    Each trajectory is assigned to `genotype-<x>`, where `<x>` is the second
    dash-separated field of its label.
    """
    trajectories = import_table(filename,
                                sheet_name='trajectory',
                                index='Trajectory')
    genotypes = import_table(filename, sheet_name='genotype', index='Genotype')

    trajectories['Genotype'] = [
        f"genotype-{label.split('-')[1]}" for label in trajectories.index
    ]
    logger.debug(trajectories['Genotype'])

    for genotype_label, members in trajectories.groupby(by="Genotype"):
        expected = genotypes.loc[genotype_label].astype(float)
        observed = genotype_generator._calculate_mean_frequencies_of_trajectories(
            genotype_label, members, members.index)
        # `members` is not part of the expected row, so drop it before comparing.
        del observed['members']
        observed = observed.astype(float)

        pandas.testing.assert_series_equal(observed,
                                           expected,
                                           check_index_type=False)
Exemple #12
0
def table() -> pandas.DataFrame:
    """Load the `trajectory` sheet of `data/generic.genotypes.10.xlsx`.

    The workbook is looked up in the `data` folder next to this file and the
    table is indexed by `Trajectory`.
    """
    workbook = Path(__file__).parent / "data" / "generic.genotypes.10.xlsx"
    return dataio.import_table(workbook,
                               index='Trajectory',
                               sheet_name="trajectory")
Exemple #13
0
def load_datasets(filenames: List[Path]) -> List[pandas.DataFrame]:
    """Load the `trajectory` sheet of every file, keeping only the numeric columns."""
    loaded = []
    for filename in filenames:
        logger.debug(f"Loading {filename}...")
        trajectories = dataio.import_table(filename,
                                           sheet_name='trajectory',
                                           index="Trajectory")
        # Drop the non-numeric columns so that the whole table is numeric.
        numeric_columns = widgets.get_numeric_columns(trajectories.columns)
        loaded.append(trajectories[numeric_columns])
    return loaded
Exemple #14
0
def load_highresolution_datasets(filename: Path) -> List[pandas.DataFrame]:
    """Slice one large table into tables holding its first 4, 8, ..., 512 rows.

    Only the numeric columns are kept, and they are relabeled 0..n-1.
    """
    full_table = dataio.import_table(filename, index="Trajectory")
    full_table = full_table[widgets.get_numeric_columns(full_table.columns)]
    full_table.columns = list(range(len(full_table.columns)))

    # One table per power of two, from 2**2 up to 2**9 rows.
    return [full_table.iloc[:2**exponent] for exponent in range(2, 10)]
Exemple #15
0
def test_clustering_algorithm_on_generic_tables(cluster, filename):
    """Cluster the `trajectory` sheet and compare the members with the expected grouping.

    Tables with exactly 10 rows are skipped.
    """
    table = dataio.import_table(filename,
                                sheet_name='trajectory',
                                index='Trajectory')
    if len(table) == 10:
        # The table with 10 genotypes shouldn't be used as a trajectory table.
        return

    expected_members = helper_get_expected_members(table)

    clustering = cluster.run(table, distance_cutoff=0.2)

    observed = sorted(clustering.genotype_members.values())
    expected = sorted(expected_members.values())
    assert observed == expected
Exemple #16
0
def get_table_genotypes() -> pandas.DataFrame:
    """Return a 13-genotype frequency table with timepoints 0, 30, 45, 60 and 90.

    The table is imported without an explicit index column.
    """
    genotype_text = """
		Genotype	0	30	45	60	90
		genotype-7	0	0.167	0.55	0.91	0.972
		genotype-10	0	0.02	0.265	0.9	0.97
		genotype-11	0	0	0.15	0.836	0.945
		genotype-8	0	0.11363636	0.45	0.735	0.86
		genotype-9	0	0.08	0.475	0.625	0.848
		genotype-1	0	0.3345	0.28125	0.095	0.045
		genotype-13	0	0	0	0	0.315
		genotype-4	0	0.005	0.24	0.03	0.0375
		genotype-2	0	0	0	0	0.245666666666667
		genotype-5	0	0.139	0	0	0
		genotype-12	0	0	0	0	0.11
		genotype-6	0	0.063	0	0	0.01
		genotype-3	0	0	0	0	0.01
	"""
    genotype_table = dataio.import_table(genotype_text)
    return genotype_table
Exemple #17
0
def get_table_edges() -> pandas.Series:
    """Return the `Parent` of each genotype as a Series indexed by `Identity`."""
    edges_text = """
		Identity	Parent
		genotype-7	genotype-0
		genotype-10	genotype-7
		genotype-11	genotype-10
		genotype-8	genotype-7
		genotype-9	genotype-8
		genotype-1	genotype-0
		genotype-13	genotype-11
		genotype-4	genotype-1
		genotype-2	genotype-11
		genotype-5	genotype-0
		genotype-12	genotype-10
		genotype-6	genotype-0
		genotype-3	genotype-0
	"""
    edges = dataio.import_table(edges_text)
    indexed_edges = edges.set_index('Identity')
    return indexed_edges['Parent']
Exemple #18
0
def parse_genotype_table(
        filename: Path,
        sheet_name: str = 'Sheet1'
) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """ Imports a table that lists pre-computed genotypes rather than trajectories.

    Parameters
    ----------
    filename: Path
        The table to import; forwarded to `import_table`.
    sheet_name: str; Default 'Sheet1'
        Forwarded to `import_table`.

    Returns
    -------
    pandas.DataFrame, pandas.DataFrame
        The two tables produced by `_parse_table`. The first (the genotype
        timeseries) is relabeled to `genotype-1..N` when its first label does
        not start with `genotype`, then sorted by the number after the last
        `-` when every label allows it. The second (the genotype info) is
        returned as `_parse_table` produced it.

    Raises
    ------
    ValueError
        If no column is labeled `Genotype`, `Unnamed: 0` or `Trajectory`.
    """
    data = import_table(filename, sheet_name=sheet_name)
    # For some reason some tables are annotated with 'genotype   ' with extra spaces.
    data.columns = [(i.strip() if isinstance(i, str) else i)
                    for i in data.columns]
    if 'Genotype' in data.columns:
        key_column = 'Genotype'
    elif 'Unnamed: 0' in data.columns:
        key_column = 'Unnamed: 0'
    elif 'Trajectory' in data.columns:
        key_column = 'Trajectory'
    else:
        message = f"One of the columns needs to be labeled `Genotype`. Got {data.columns} instead from {filename}."
        raise ValueError(message)

    genotype_timeseries, genotype_info = _parse_table(data, key_column)
    # Make sure the genotype labels are prefixed with 'genotype-'.
    # An empty table has no first label to inspect, and labels read from a
    # sheet are not guaranteed to be strings (e.g. integer ids), so guard
    # both cases instead of raising IndexError/AttributeError.
    labels = genotype_timeseries.index
    if len(labels) > 0 and not str(labels[0]).startswith('genotype'):
        genotype_timeseries.index.name = 'originalLabel'
        genotype_timeseries['Genotype'] = [
            f'genotype-{i}' for i in range(1,
                                           len(genotype_timeseries) + 1)
        ]
        genotype_timeseries = genotype_timeseries.reset_index()
        genotype_timeseries = genotype_timeseries.set_index('Genotype')
        genotype_timeseries.pop('originalLabel')

    # Try to sort the genotypes by label, if possible.
    # Note: designed to sort labels of the form `genotype-[\d]+`
    try:
        sorted_index = sorted(genotype_timeseries.index,
                              key=lambda s: float(s.split('-')[-1]))
    except ValueError:
        # At least one label has no numeric suffix; keep the existing order.
        sorted_index = genotype_timeseries.index

    genotype_timeseries = genotype_timeseries.loc[sorted_index]

    return genotype_timeseries, genotype_info
Exemple #19
0
def test_calculate_mean_genotype(genotype_generator):
    """Check `calculate_mean_genotype` against a hand-computed table of four genotypes."""
    member_groups = [['7'], ['4', '8'], ['3', '2'], ['13', '20', '11']]

    trajectories = pandas.read_csv(StringIO(trajectory_csv))
    # Trajectory labels are compared as strings, matching `member_groups`.
    trajectories['Trajectory'] = trajectories['Trajectory'].astype(str)
    trajectories = trajectories.set_index('Trajectory')

    expected_text = """
		Genotype	0	17	25	44	66	75	90	members
		genotype-1	0.0	0.0	0.0	0.273	0.781	1.0	1.0	7
		genotype-2	0	0	0	0	0.278	0.822	0.803	4|8
		genotype-3	0	0	0	0.336	0.452	0.9175	0.8985	3|2
		genotype-4	0	0	0	0.082	0.234666666666667	0.019	0.052	13|20|11
		"""
    expected_mean = import_table(expected_text, index='Genotype')

    output = genotype_generator.calculate_mean_genotype(member_groups,
                                                        trajectories)

    logger.debug(expected_mean.to_string())
    logger.debug(output.to_string())
    pandas.testing.assert_frame_equal(expected_mean, output)
Exemple #20
0
def parse_trajectory_table(
        filename: Union[str, Path],
        sheet_name='Sheet1') -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Read an excel or csv file that has a `Trajectory` column and one column per timepoint.

    Parameters
    ----------
    filename: Union[str, Path]
        The table containing the trajectories and associated metadata. Can be
        an excel sheet or a comma/tab delimited file.
    sheet_name: str; Default 'Sheet1'
        Indicates which sheet contains the data, if an excel table is given.

    Returns
    -------
    pandas.DataFrame, pandas.DataFrame
        The timeseries table and the metadata table produced by
        `_parse_table`, keyed on the `Trajectory` column.
        - The timeseries index is named `Trajectory`.
        - The metadata holds the columns of the input table that do not
          correspond to timepoints; its column labels are lowercased and any
          `genotype` column is removed.
    """
    timeseries, info = _parse_table(import_table(filename, sheet_name),
                                    'Trajectory')

    # Keep the index name consistent regardless of what the parser produced.
    timeseries.index.name = 'Trajectory'

    # A `genotype` column means the file was generated by a previous run.
    if 'genotype' in info:
        info.pop('genotype')

    # Lowercase metadata labels to help with later parsing.
    info.columns = [label.lower() for label in info.columns]
    return timeseries, info
def transposed_genotypes() -> pandas.DataFrame:
    """Return a 15-genotype table, imported with `Genotype` as index and then transposed."""
    genotype_text = """
		Genotype	0	17	25	44	66	75	90
		genotype-1	0	0	0.261	1	1	1	1
		genotype-2	0	0.38	0.432	0	0	0	0
		genotype-3	0	0	0	0	0	1	1
		genotype-4	0	0	0	0.525	0.454	0.911	0.91
		genotype-5	0	0	0	0.147	0.45	0.924	0.887
		genotype-6	0	0	0	0.273	0.781	1	1
		genotype-7	0	0	0	0.188	0.171	0.232	0.244
		genotype-8	0	0	0	0.403	0.489	0.057	0.08
		genotype-9	0	0	0.117	0	0	0	0.103
		genotype-10	0	0	0	0.138	0.295	0	0.081
		genotype-11	0	0	0	0	0.278	0.822	0.803
		genotype-12	0	0	0	0	0.2335	0.133	0.0375
		genotype-13	0	0	0.033	0.106	0.1065	0	0
		genotype-14	0	0	0	0	0	0.2675	0.326
		genotype-15	0	0	0	0.1145	0	0.1205	0.0615
	"""
    return import_table(genotype_text, index='Genotype').T
def transposed_mouse_genotypes() -> pandas.DataFrame:
    """Return a 15-genotype, 11-timepoint table, imported with `Genotype` as index and then transposed."""
    genotype_text = """
		Genotype	0	1	2	3	4	5	6	7	8	9	10
		genotype-1	0	0	0.045	0.197	0.261	0.096	0.26	0.596	0.66	0.877	0.969
		genotype-2	0.01	0.279	0.341	0.568	0.708	0.913	0.756	0.455	0.399	0.13	0.041
		genotype-3	0	0.056	0.101	0.174	0	0	0	0	0	0	0
		genotype-4	0.278	0.277	0.224	0.195	0	0	0	0	0	0	0
		genotype-5	0	0	0	0	0	0.247	0.388	0.215	0.403	0.141	0.028
		genotype-6	0	0	0	0	0.148	0.384	0.344	0.289	0.333	0.146	0.031
		genotype-7	0	0	0	0	0	0	0.084	0.12	0.124	0.343	0.398
		genotype-8	0	0	0	0	0	0	0	0.077	0.018	0.239	0.308
		genotype-9	0	0.088	0.036	0.046	0	0.059	0.052	0	0.073	0	0
		genotype-10	0	0	0	0	0.072	0.047	0.057	0	0	0	0
		genotype-11	0.027	0.059	0.0325	0.008	0	0	0	0	0	0	0
		genotype-12	0.149	0.1885	0.172	0	0	0	0	0	0	0	0
		genotype-13	0	0.00525	0.0065	0.005	0.00775	0	0.01275	0.051	0.032	0.0195	0.02175
		genotype-14	0	0	0	0	0	0	0	0.0172	0.1156	0.112	0.0948
		genotype-15	0.001857	0	0.003714	0.001143	0	0	0.003286	0.006571	0.034	0.040286	0.038143
	"""
    return import_table(genotype_text, index='Genotype').T
Exemple #23
0
def test_flowchart():
    """Check the font color and label that `lineageplot.flowchart` assigns to nodes."""
    node_colors = {
        "genotype-13": '#222222',
        'genotype-1': '#CCCCCC',
        'genotype-0': '#000000'
    }
    edges_text = """
	Parent	Identity	score
	genotype-0	genotype-1	1
	genotype-1	genotype-13	2
	"""
    edges = dataio.import_table(edges_text)

    graph = lineageplot.flowchart(edges,
                                  node_colors,
                                  annotations={'genotype-1': ['gene1']})

    # Dark fill (#222222): white font expected, and no annotation in the label.
    on_dark = graph.get_node('genotype-13')
    assert on_dark.attr['fontcolor'] == '#FFFFFF'
    assert on_dark.attr['label'] == 'genotype-13'

    # Light fill (#CCCCCC): dark font expected, annotation appended on a new line.
    on_light = graph.get_node('genotype-1')
    assert on_light.attr['fontcolor'] == '#333333'
    assert on_light.attr['label'] == 'genotype-1\ngene1'
Exemple #24
0
def b1_data() -> pandas.DataFrame:
    """Load the `trajectory` sheet of the `B1` real table with string index labels."""
    trajectories = dataio.import_table(filenames.real_tables["B1"],
                                       sheet_name='trajectory',
                                       index='Trajectory')
    # Convert every index label to `str`.
    trajectories.index = list(map(str, trajectories.index))
    return trajectories
def model_strong_selection() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.strongselection` table, indexed by `Genotype`."""
    return dataio.import_table(model_tables['model.strongselection'],
                               sheet_name='genotype',
                               index="Genotype")
def model_clonal_interferance() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.clonalinterferance` table, indexed by `Genotype`."""
    return dataio.import_table(model_tables['model.clonalinterferance'],
                               sheet_name='genotype',
                               index='Genotype')
def model_periodic_selection() -> pandas.DataFrame:
    """Load the `genotype` sheet of the `model.periodicselection` table, indexed by `Genotype`."""
    return dataio.import_table(model_tables['model.periodicselection'],
                               sheet_name="genotype",
                               index='Genotype')
Exemple #28
0
def get_table_population_smoothed() -> pandas.DataFrame:
    """Return a long-format `Identity`/`Generation`/`Population` table.

    The table has 14 identities (genotype-0 to genotype-13) and five generations each
    (0, 30, 45, 60, 90). It is imported with `keep_empty=True`.
    """
    population_text = """
		Identity	Generation	Population
		genotype-7	0	0
		genotype-7	30	5.336364
		genotype-7	45	10
		genotype-7	60	1
		genotype-7	90	1
		genotype-10	0	0
		genotype-10	30	0
		genotype-10	45	11.5
		genotype-10	60	6.40000000000001
		genotype-10	90	1
		genotype-11	0	0
		genotype-11	30	0
		genotype-11	45	15
		genotype-11	60	83.6
		genotype-11	90	63
		genotype-8	0	0
		genotype-8	30	3.363636
		genotype-8	45	1
		genotype-8	60	11
		genotype-8	90	1
		genotype-9	0	0
		genotype-9	30	8
		genotype-9	45	47.5
		genotype-9	60	62.5
		genotype-9	90	84.8
		genotype-1	0	0
		genotype-1	30	32.95
		genotype-1	45	4.125
		genotype-1	60	6.5
		genotype-1	90	1
		genotype-13	0	0
		genotype-13	30	0
		genotype-13	45	0
		genotype-13	60	0
		genotype-13	90	31.5
		genotype-4	0	0
		genotype-4	30	0.5
		genotype-4	45	24
		genotype-4	60	3
		genotype-4	90	3.75
		genotype-2	0	0
		genotype-2	30	0
		genotype-2	45	0
		genotype-2	60	0
		genotype-2	90	24.5666666666667
		genotype-5	0	0
		genotype-5	30	13.9
		genotype-5	45	0
		genotype-5	60	0
		genotype-5	90	0
		genotype-12	0	0
		genotype-12	30	0
		genotype-12	45	0
		genotype-12	60	0
		genotype-12	90	11
		genotype-6	0	0
		genotype-6	30	6.3
		genotype-6	45	0
		genotype-6	60	0
		genotype-6	90	1
		genotype-3	0	0
		genotype-3	30	0
		genotype-3	45	0
		genotype-3	60	0
		genotype-3	90	1
		genotype-0	0	100
		genotype-0	30	29.65
		genotype-0	45	0
		genotype-0	60	0
		genotype-0	90	0
	"""
    population_table = dataio.import_table(population_text, keep_empty=True)
    return population_table
Exemple #29
0
def trajectory_table() -> pandas.DataFrame:
    """Load the `data` sheet of the `generic.model.area` fake table, indexed by `Trajectory`."""
    source = filenames.fake_tables['generic.model.area']
    trajectories = dataio.import_table(source,
                                       sheet_name="data",
                                       index='Trajectory')
    return trajectories