Esempio n. 1
0
def input_fq(config_file, input_dir):
    """
    Extract names of FASTQ input files from workflow configuration
    file and count the number of reads in each file.

    The configuration file is checked to see if it has an ``fq_files``
    key whose value is mappings from sample names to sample files
    (relative to ``input_dir``). Each FASTQ file has its reads
    counted.

    If there is a ``multiplex_fq_files`` key then the value of this
    key is assumed to be a list of multiplexed input files (relative
    to ``input_dir``). Each FASTQ file has its reads counted.

    If both keys exist then both sets of input files are traversed,
    sample files first. If neither key exists then no input files are
    traversed.

    For each file a one-row ``pandas.DataFrame`` is created with
    columns ``HEADER`` and values ``SampleName`` (sample name recorded
    in configuration or, for multiplexed files, ``''``), ``Program``
    (set to ``INPUT``), ``File``, ``NumReads``, ``Description``
    (``INPUT``).

    Each file name is printed before it is processed. If counting the
    reads of a file, or building its row, raises an exception then the
    exception is printed and the file is skipped, so the result may
    hold fewer rows than there are configured files.

    :param config_file: Configuration file
    :type config_file: str or unicode
    :param input_dir: Directory
    :type input_dir: str or unicode
    :return: list of one-row ``pandas.DataFrame``, or ``[]``
    :rtype: list(pandas.core.frame.DataFrame)
    """
    with open(config_file, 'r') as f:
        config = yaml.load(f, yaml.SafeLoader)
    # Collect (sample name, file path) pairs: sample files first, then
    # multiplexed files, which have no sample name.
    files = []
    if utils.value_in_dict(params.FQ_FILES, config):
        files.extend(
            (sample_name, os.path.join(input_dir, file_name))
            for sample_name, file_name in config[params.FQ_FILES].items())
    if utils.value_in_dict(params.MULTIPLEX_FQ_FILES, config):
        files.extend(
            ("", os.path.join(input_dir, file_name))
            for file_name in config[params.MULTIPLEX_FQ_FILES])
    rows = []
    for sample_name, file_name in files:
        print(file_name)
        try:
            num_reads = fastq.count_sequences(file_name)
            row = pd.DataFrame(
                [[sample_name, INPUT, file_name, num_reads, INPUT]],
                columns=HEADER)
        except Exception as e:
            # Deliberately best-effort: a missing or unreadable file
            # must not stop the remaining files being counted.
            print(e)
            continue
        rows.append(row)
    return rows
Esempio n. 2
0
def generate_stats_figs(h5_file, out_dir, config, log_file, run_config):
    """
    Run ``generate_stats_figs.R`` via ``Rscript`` to create summary
    statistics, and analyses and QC plots for both RPF and mRNA
    datasets.

    The command line is assembled from mandatory configuration
    parameters, followed by any optional file parameters and the
    count threshold that are present in the configuration, and is
    then passed to
    :py:func:`process_utils.run_logged_command`.

    :param h5_file: H5 file (input)
    :type h5_file: str or unicode
    :param out_dir: Directory for output files
    :type out_dir: str or unicode
    :param config: Workflow configuration
    :type config: dict
    :param log_file: Log file (output)
    :type log_file: str or unicode
    :param run_config: Run-related configuration
    :type run_config: RunConfigTuple
    :raise KeyError: if a configuration parameter is missing
    :raise FileNotFoundError: if ``Rscript`` cannot be found
    :raise AssertionError: if ``Rscript`` returns a non-zero exit \
    code
    """
    LOGGER.info(
        "Create summary statistics, and analyses and QC plots for both RPF and mRNA datasets. Log: %s",
        log_file)

    def option(name, value):
        # Format a single ``--name=value`` command-line argument.
        return "--" + name + "=" + value

    script = os.path.join(run_config.r_scripts,
                          workflow_r.GENERATE_STATS_FIGS_R)
    cmd = ["Rscript", "--vanilla", script]
    # Mandatory arguments, in the order expected on the command line.
    cmd.append(option("num-processes", str(run_config.nprocesses)))
    cmd.append(option("min-read-length",
                      str(config[params.MIN_READ_LENGTH])))
    cmd.append(option("max-read-length",
                      str(config[params.MAX_READ_LENGTH])))
    cmd.append(option("buffer", str(config[params.BUFFER])))
    cmd.append(option("primary-id", config[params.PRIMARY_ID]))
    cmd.append(option("dataset", config[params.DATASET]))
    cmd.append(option("hd-file", h5_file))
    cmd.append(option("orf-fasta-file", config[params.ORF_FASTA_FILE]))
    cmd.append(option("rpf", str(config[params.RPF])))
    cmd.append(option("output-dir", out_dir))
    cmd.append(option("do-pos-sp-nt-freq",
                      str(config[params.DO_POS_SP_NT_FREQ])))
    # Optional file arguments: (configuration key, option name).
    optional_files = [
        (params.T_RNA_FILE, "t-rna-file"),
        (params.CODON_POSITIONS_FILE, "codon-positions-file"),
        (params.FEATURES_FILE, "features-file"),
        (params.ORF_GFF_FILE, "orf-gff-file"),
        (params.ASITE_DISP_LENGTH_FILE, "asite-disp-length-file"),
    ]
    for config_key, option_name in optional_files:
        if value_in_dict(config_key, config):
            cmd.append(option(option_name, config[config_key]))
    if value_in_dict(params.COUNT_THRESHOLD, config):
        cmd.append(option("count-threshold",
                          str(config[params.COUNT_THRESHOLD])))
    process_utils.run_logged_command(cmd, log_file, run_config.cmd_file,
                                     run_config.is_dry_run)
Esempio n. 3
0
def test_value_in_dict_no_key():
    """
    Test that :py:func:`riboviz.utils.value_in_dict` returns ``False``
    for a key that is not present in the dictionary.
    """
    lookup = {"A": 1, "C": 3}
    is_present = utils.value_in_dict("B", lookup)
    assert not is_present
Esempio n. 4
0
def test_value_in_dict_none(allow_false_empty):
    """
    Test that :py:func:`riboviz.utils.value_in_dict` returns ``False``
    for a key whose value is ``None``, whatever the value of
    ``allow_false_empty``.

    :param allow_false_empty: Value for ``allow_false_empty`` \
    parameter
    :type allow_false_empty: bool
    """
    lookup = {"A": 1, "B": None, "C": 3}
    is_present = utils.value_in_dict("B", lookup, allow_false_empty)
    assert not is_present
Esempio n. 5
0
def test_value_in_dict(value, allow_false_empty):
    """
    Test that :py:func:`riboviz.utils.value_in_dict` returns ``True``
    for a key that is present and holds the supplied value, whatever
    the value of ``allow_false_empty``.

    :param value: Value for key
    :type value: -
    :param allow_false_empty: Value for ``allow_false_empty`` \
    parameter
    :type allow_false_empty: bool
    """
    lookup = {"A": 1, "B": value, "C": 3}
    is_present = utils.value_in_dict("B", lookup, allow_false_empty)
    assert is_present
Esempio n. 6
0
def test_value_in_dict_allow_false_empty(value, allow_false_empty):
    """
    Test that the result of :py:func:`riboviz.utils.value_in_dict`
    has the same truth value as ``allow_false_empty`` for a key that
    is present and holds the supplied value.

    :param value: Value for key
    :type value: -
    :param allow_false_empty: Value for ``allow_false_empty`` \
    parameter
    :type allow_false_empty: bool
    """
    lookup = {"A": 1, "B": value, "C": 3}
    result = utils.value_in_dict("B", lookup, allow_false_empty)
    # Both must be truthy or both falsy (logical NXOR).
    assert bool(result) == bool(allow_false_empty)
Esempio n. 7
0
def pytest_generate_tests(metafunc):
    """
    Parametrize test fixtures with values taken from a workflow
    configuration file.

    * The configuration file is the value of the
      :py:const:`CONFIG_FILE` command-line option if one was given,
      otherwise :py:const:`riboviz.test.VIGNETTE_CONFIG`.
    * The configuration is loaded from this file.
    * Each of the following fixtures, if requested by the test \
      function, is parametrized from the configuration:
        - ``index_prefix``: values of
          :py:const:`riboviz.params.ORF_INDEX_PREFIX` and
          :py:const:`riboviz.params.RRNA_INDEX_PREFIX`.
        - ``index_dir``: value of
          :py:const:`riboviz.params.INDEX_DIR`.
        - ``tmp_dir``: value of :py:const:`riboviz.params.TMP_DIR`.
        - ``output_dir``: value of
          :py:const:`riboviz.params.OUTPUT_DIR`.
        - ``extract_umis``: whether
          :py:const:`riboviz.params.EXTRACT_UMIS` is set.
        - ``dedup_umis``: whether
          :py:const:`riboviz.params.DEDUP_UMIS` is set.
        - ``dedup_stats``: whether
          :py:const:`riboviz.params.DEDUP_STATS` is set, or ``True``
          if it is absent from the configuration.
        - ``group_umis``: whether
          :py:const:`riboviz.params.GROUP_UMIS` is set.
        - ``sample``:
            - If :py:const:`riboviz.params.FQ_FILES` is in the
              configuration then the sample names are its keys.
            - Otherwise, if
              :py:const:`riboviz.params.MULTIPLEX_FQ_FILES` is in the
              configuration then the sample names are read from the
              sample sheet file named by
              :py:const:`riboviz.params.SAMPLE_SHEET`, relative to
              :py:const:`riboviz.params.INPUT_DIR`.
            - If sample name
              :py:const:`riboviz.test.VIGNETTE_MISSING_SAMPLE` is
              among the sample names then it is removed.

    :param metafunc: pytest test function inspection object
    :type metafunc: _pytest.python.Metafunc
    :raise AssertionError: if the configuration file does not \
    exist or is not a file
    """
    config_file = (metafunc.config.getoption(CONFIG_FILE)
                   or test.VIGNETTE_CONFIG)
    # isfile() is False for a path that does not exist, so it covers
    # both "missing" and "not a regular file".
    assert os.path.isfile(config_file), "No such file: %s" % config_file
    with open(config_file, 'r') as f:
        config = yaml.load(f, yaml.SafeLoader)

    requested = metafunc.fixturenames
    fixtures = {}
    fixtures["index_prefix"] = [
        config[params.ORF_INDEX_PREFIX],
        config[params.RRNA_INDEX_PREFIX],
    ]
    fixtures["index_dir"] = [config[params.INDEX_DIR]]
    fixtures["tmp_dir"] = [config[params.TMP_DIR]]
    fixtures["output_dir"] = [config[params.OUTPUT_DIR]]
    fixtures["extract_umis"] = [
        utils.value_in_dict(params.EXTRACT_UMIS, config)
    ]
    fixtures["dedup_umis"] = [utils.value_in_dict(params.DEDUP_UMIS, config)]
    # Deduplication statistics default to enabled when unspecified.
    fixtures["dedup_stats"] = [
        params.DEDUP_STATS not in config
        or utils.value_in_dict(params.DEDUP_STATS, config)
    ]
    fixtures["group_umis"] = [utils.value_in_dict(params.GROUP_UMIS, config)]

    if "sample" in requested:
        if params.FQ_FILES in config:
            samples = list(config[params.FQ_FILES].keys())
        elif params.MULTIPLEX_FQ_FILES in config:
            sheet_path = os.path.join(config[params.INPUT_DIR],
                                      config[params.SAMPLE_SHEET])
            sheet = sample_sheets.load_sample_sheet(sheet_path)
            samples = list(sheet[sample_sheets.SAMPLE_ID])
        else:
            samples = []
        try:
            samples.remove(test.VIGNETTE_MISSING_SAMPLE)
        except ValueError:
            # The missing sample is not among the sample names.
            pass
        fixtures["sample"] = samples

    for name, values in fixtures.items():
        if name in requested:
            metafunc.parametrize(name, values)