Esempio n. 1
0
    def _build_samples(self, read_dir, padding=None, picard_exe=None):
        """
            Collect the sample objects belonging to this case.

            A directory <read_dir>/<case_id>/<YYYY-MM-DD> is created with make_dir
            and handed to get_samples together with the input samples and the
            variants stored on this object. Everything get_samples yields is
            gathered into a list.

            Args:

                read_dir(pathlib.Path): Parent directory under which the per-case,
                per-date directory is created.
                padding: Forwarded unchanged to get_samples (defaults to None).
                picard_exe: Forwarded unchanged to get_samples (defaults to None).

            Returns:

                list: The items produced by get_samples, in iteration order.
        """

        today = time.strftime("%Y-%m-%d")
        case_id = self.input_case['case']['case_id']

        # One directory per case and per day the method is run
        case_dir = make_dir(read_dir.joinpath(f"{case_id}/{today}"))

        return list(
            get_samples(
                samples=self.input_case["samples"],
                variants=self["variants"],
                padding=padding,
                picard_exe=picard_exe,
                case_dir=case_dir,
            )
        )
Esempio n. 2
0
    def _build_sample(self):
        """
            Create the directory for this sample and run the matching
            extraction step on it.

            The directory is named after the sample id and is created inside
            self.case_dir. When the input sample has a truthy 'fastq_files'
            entry, _extract_fastq is called with the directory; otherwise
            _extract_bam is called.
        """

        target = self.case_dir.joinpath(self.input_sample['sample_id'])
        sample_dir = make_dir(target)

        # Choose the extractor from the kind of input the sample provides
        has_fastqs = self.input_sample.get('fastq_files')
        extract = self._extract_fastq if has_fastqs else self._extract_bam
        extract(sample_dir)
Esempio n. 3
0
def synthesize_command(context,
                       background_bam,
                       background_fastq,
                       background_fastq2,
                       dataset_dir,
                       query,
                       save_background,
                       json_out):

    """
        Command to make synthetic dataset
    """
    # NOTE(review): the docstring above is left untouched because it is
    # presumably shown as the command's help text -- confirm before editing.
    #
    # Parameters as used below:
    #   context           -- object whose ``obj`` mapping supplies 'demo',
    #                        'temp_dir', 'binaries' and 'dataset_dir'
    #   background_bam    -- stored under "bam_file" in the background dict
    #   background_fastq  -- first entry of the background "fastq_files" list
    #   background_fastq2 -- appended to "fastq_files" when truthy
    #   dataset_dir       -- output directory; falls back to context.obj
    #   query             -- path of a JSON file holding a 4-item sequence
    #   save_background   -- forwarded to Dataset
    #   json_out          -- when truthy, echo the created paths as JSON

    # Read the stored query; its second item is not used by this command
    with open(query, "r") as query_handle:
        samples, _, variants, sample_name = json.load(query_handle)

    # Stop when the query matched no cases
    case_count = len(samples)
    if not case_count:
        LOG.warning("No cases were found")
        context.abort()

    variant_count = len(variants)
    LOG.info(f"{case_count} cases found, with a total of {variant_count} variants.")

    # In demo mode the background paths given by the caller are replaced
    if context.obj.get('demo', False):
        background_bam, background_fastq, background_fastq2 = (
            path_to_background_bam_file,
            path_to_background_fastq1_file,
            path_to_background_fastq2_file,
        )

    # The second fastq file is optional
    fastq_files = [background_fastq]
    if background_fastq2:
        fastq_files.append(background_fastq2)
    background = {"bam_file": background_bam, "fastq_files": fastq_files}

    temp_dir = context.obj.get('temp_dir')
    LOG.info(f"Temporay files stored in {temp_dir}")

    seqkit_executable = context.obj['binaries'].get('seqkit')
    dataset_dir = make_dir(dataset_dir or context.obj.get('dataset_dir'))

    # Build the dataset and read back the paths of the files it produced
    synthetics = Dataset(
        samples=samples,
        variants=variants,
        tmp_dir=temp_dir,
        background=background,
        member=sample_name,
        out_dir=dataset_dir,
        seqkit_exe=seqkit_executable,
        save_background=save_background,
    ).synthetic_fastqs

    for fastq_path in synthetics:
        LOG.info(f"Synthetic datasets created in {fastq_path}")

    if not json_out:
        return

    created = {'fastq_files': [str(fastq_path) for fastq_path in synthetics]}
    click.echo(json.dumps(created))
Esempio n. 4
0
def export(
    context,
    case_mongo,
    variant_mongo,
    variant_type,
    analysis,
    all_variants,
    member,
    sex,
    vcf_dir,
    proband,
    sample_name,
    json_out,
):

    """
        exports dataset from DB
    """
    # NOTE(review): the docstring above is left untouched because it is
    # presumably shown as the command's help text -- confirm before editing.
    #
    # Steps performed:
    #   1. build the case and variant filters from the options,
    #   2. run mutacc_query with them,
    #   3. dump (samples, regions, variants, sample_name) to a JSON file in
    #      context.obj["query_dir"],
    #   4. write the sorted variants to a vcf file,
    #   5. optionally echo both file paths as JSON.

    adapter = context.obj["adapter"]

    if all_variants:
        # Empty variant filter; every other filter option is ignored
        case_query, variant_query = None, {}
    else:
        variant_query = None if variant_mongo is None else json.loads(variant_mongo)
        case_query = None if case_mongo is None else json.loads(case_mongo)

        # The dedicated options are merged into the JSON-supplied filters
        if variant_type is not None:
            if variant_query is None:
                variant_query = {}
            variant_query["variant_type"] = variant_type

        if analysis:
            if case_query is None:
                case_query = {}
            case_query["samples.0.analysis_type"] = analysis

    samples, regions, variants = mutacc_query(
        adapter, case_query, variant_query, sex=sex, member=member, proband=proband
    )

    # Without an explicit sample name the member value names the output files
    sample_name = sample_name or member

    # Store the query result for later use with the 'synthesize' command
    json_file = context.obj.get("query_dir").joinpath(sample_name + "_query_mutacc.json")
    with open(json_file, "w") as query_handle:
        json.dump((samples, regions, variants, sample_name), query_handle)
    LOG.info("Query stored in %s", json_file)

    found_variants = sort_variants(variants)

    # The vcf directory option takes precedence over the configured one
    vcf_dir = make_dir(vcf_dir or context.obj.get("vcf_dir"))
    vcf_file = vcf_dir.joinpath("{}_variants.vcf".format(sample_name))
    LOG.info("creating vcf file %s", vcf_file)
    vcf_writer(
        found_variants,
        vcf_file,
        sample_name,
        adapter,
        vcf_parser=context.obj.get("vcf_parser_export"),
    )

    if not json_out:
        return

    written = {"query_file": str(json_file), "vcf_file": str(vcf_file)}
    click.echo(json.dumps(written))
Esempio n. 5
0
def cli(context, loglevel, config_file, root_dir, demo, vcf_parser):
    # Entry point that assembles the configuration dict stored on
    # ``context.obj`` for use by the other commands.
    #
    # NOTE(review): documentation is kept in comments because a docstring
    # would presumably be picked up as CLI help text -- confirm.
    #
    # Parameters as used below:
    #   context     -- object that receives the config in ``obj`` and offers
    #                  ``abort()``
    #   loglevel    -- passed to coloredlogs.install
    #   config_file -- optional path of a YAML config (ignored in demo mode)
    #   root_dir    -- root directory; in non-demo mode the config file's
    #                  "root_dir" takes precedence
    #   demo        -- when truthy, fixed demo settings are used
    #   vcf_parser  -- passed to get_vcf_parser as ``parser_file``
    #
    # Aborts through ``context.abort()`` when no root directory is available
    # in non-demo mode.

    coloredlogs.install(level=loglevel)
    LOG.info("Running mutacc")

    cli_config = {}
    if demo:
        host = "localhost"
        port = 27017
        # Must be defined here too: it is read unconditionally further down,
        # and leaving it unset made demo mode fail with UnboundLocalError
        uri = None
        db_name = "mutacc-demo"
        username = None
        password = None
        padding = PADDING
        sv_padding = SV_PADDING
        root_dir = make_dir(root_dir or "./mutacc_demo_root")

    else:

        if config_file:
            with open(config_file, "r") as in_handle:
                # safe_load returns None for an empty document; fall back to
                # an empty config so the lookups below keep working
                cli_config = yaml.safe_load(in_handle) or {}

        host = cli_config.get("host") or "localhost"
        port = cli_config.get("port") or 27017
        uri = cli_config.get("uri")
        db_name = cli_config.get("database") or "mutacc"
        username = cli_config.get("username")
        password = cli_config.get("password")
        root_dir = cli_config.get("root_dir") or root_dir
        padding = cli_config.get("padding")
        sv_padding = cli_config.get("sv_padding")

        if not root_dir:
            LOG.warning(
                "Please provide a root directory, through option --root-dir or in config_file"
            )
            context.abort()

    vcf_parser = get_vcf_parser(parser_file=vcf_parser, config_dict=cli_config)

    mutacc_config = {
        "host": host,
        "port": port,
        "uri": uri,
        "username": username,
        "password": password,
        "db_name": db_name,
        "vcf_parser_import": vcf_parser.get("import"),
        "vcf_parser_export": vcf_parser.get("export"),
        "root_dir": parse_path(root_dir, file_type="dir"),
        "demo": demo,
        "padding": padding,
        "sv_padding": sv_padding,
    }

    # Create subdirectories in root, if not already created
    for dir_type, sub_dir_name in SUB_DIRS.items():
        mutacc_config[dir_type] = make_dir(mutacc_config["root_dir"].joinpath(sub_dir_name))

    # Get binaries for picard and seqkit if specified in config
    binaries = cli_config.get("binaries") or {}
    mutacc_config["binaries"] = {
        "picard": binaries.get("picard"),
        "seqkit": binaries.get("seqkit"),
    }

    context.obj = mutacc_config