Ejemplo n.º 1
0
def get_family(family_file=None, family_lines=None):
    """Return a FamilyParser built from a pedigree file or pedigree lines.

    Args:
        family_file (str): Path to a pedigree file. Takes precedence over
            ``family_lines`` when both are given.
        family_lines (iterable(str)): Pedigree lines that are already loaded.

    Returns:
        The ``FamilyParser`` instance, or ``None`` when neither argument is
        given (or both are empty).
    """
    if family_file:
        # Close the handle deterministically instead of leaking it.
        # NOTE(review): relies on FamilyParser consuming the stream inside
        # its constructor - confirm against the ped_parser version in use.
        with open(family_file, 'r') as family_handle:
            return FamilyParser(family_handle)

    if family_lines:
        return FamilyParser(family_lines)

    return None
Ejemplo n.º 2
0
def parse_ped(ped_stream, family_type="ped"):
    """Extract the family id and one record per individual from a pedigree.

    Args:
        ped_stream(iterable(str)): Lines with pedigree information
        family_type(str): Pedigree format, forwarded to ``FamilyParser``

    Returns:
        family_id(str), samples(list[dict]): each sample dict has the keys
        ``sample_id``, ``father``, ``mother``, ``sex`` and ``phenotype``

    Raises:
        PedigreeError: If the stream does not hold exactly one family
    """
    parser = FamilyParser(ped_stream, family_type=family_type)
    found_families = parser.families

    if len(found_families) != 1:
        raise PedigreeError("Only one case per ped file is allowed")

    # Exactly one family is present here, so the first key is the only key
    family_id = next(iter(found_families.keys()))
    members = found_families[family_id].individuals

    samples = []
    for sample_id, member in members.items():
        sample = {
            "sample_id": sample_id,
            "father": member.father,
            "mother": member.mother,
            # Translate the coded sex and phenotype through the lookup maps
            "sex": SEX_MAP[member.sex],
            "phenotype": PHENOTYPE_MAP[int(member.phenotype)],
        }
        samples.append(sample)

    return family_id, samples
Ejemplo n.º 3
0
def get_case(family_lines, family_type='ped', vcf_path=None):
    """Parse family lines and return the single family they describe.

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines
        vcf_path(str): Path to VCF. Accepted but not used by this function.

    Returns:
        family (Family): The first (and only allowed) parsed family object

    Raises:
        CaseError: If more than one family is found
        IndexError: If no family is found at all
    """
    LOG.info("Parsing family information")
    parser = FamilyParser(family_lines, family_type)

    family_ids = list(parser.families.keys())
    LOG.info("Found families {0}".format(', '.join(family_ids)))

    if len(family_ids) > 1:
        raise CaseError("Only one family per load can be used")

    only_family_id = family_ids[0]
    return parser.families[only_family_id]
Ejemplo n.º 4
0
def case_id(request, case_lines):
    """Return the family id of the first family parsed from ``case_lines``.

    ``request`` is accepted but not used here.
    """
    parsed = FamilyParser(case_lines, family_type='ped')
    first_key = list(parsed.families.keys())[0]
    return parsed.families[first_key].family_id
Ejemplo n.º 5
0
    def test_standard_trio_extra_daughter(self):
        """Check that a trio pedigree with an extra daughter is parsed correctly."""
        # Close the pedigree file once the parser is built; the bare open()
        # used before leaked the handle for the rest of the test run.
        # NOTE(review): relies on FamilyParser reading the stream inside its
        # constructor - confirm.
        with open(self.trio_file.name, 'r') as ped_handle:
            family_parser = FamilyParser(ped_handle)
        trio_family = family_parser.families['healthyParentsAffectedSon']

        assert family_parser.header == [
            'family_id', 'sample_id', 'father_id', 'mother_id', 'sex',
            'phenotype'
        ]
        # All four individuals must be registered in the family
        assert {'proband', 'mother', 'father', 'daughter'} == set(
            trio_family.individuals.keys())
        # Both children form a trio with the same two parents
        assert {'proband', 'mother', 'father'} in trio_family.trios
        assert {'daughter', 'mother', 'father'} in trio_family.trios
        assert 'daughter' in trio_family.individuals['proband'].siblings
Ejemplo n.º 6
0
def sv_case_obj(request, case_lines, sv_vcf_obj, sv_vcf_path):
    """Build a case from the first family in ``case_lines`` and an SV VCF.

    The VCF object is iterated to the end in order to count its variants.
    ``request`` is accepted but not used here.
    """
    parser = FamilyParser(case_lines, family_type="ped")
    first_family_id = list(parser.families.keys())[0]
    case_family = parser.families[first_family_id]

    sample_names = sv_vcf_obj.samples
    # Exhaust the VCF iterator only to learn how many records it yields
    variant_count = sum(1 for _ in sv_vcf_obj)

    return build_case(
        case=case_family,
        sv_individuals=sample_names,
        vcf_sv_path=sv_vcf_path,
        nr_sv_variants=variant_count,
    )
Ejemplo n.º 7
0
def case_obj(request, case_lines, vcf_obj, vcf_path, profile_list):
    """Build a case from the first family in ``case_lines`` and a VCF.

    The VCF object is iterated to the end in order to count its variants, and
    every VCF sample is mapped to the same ``profile_list``.
    ``request`` is accepted but not used here.
    """
    parser = FamilyParser(case_lines, family_type="ped")
    first_family_id = list(parser.families.keys())[0]
    case_family = parser.families[first_family_id]

    sample_names = vcf_obj.samples
    # Exhaust the VCF iterator only to learn how many records it yields
    variant_count = sum(1 for _ in vcf_obj)

    # Every sample shares the very same profile list object
    sample_profiles = dict.fromkeys(sample_names, profile_list)

    return build_case(
        case=case_family,
        vcf_individuals=sample_names,
        vcf_path=vcf_path,
        nr_variants=variant_count,
        profiles=sample_profiles,
    )
Ejemplo n.º 8
0
def parse_ped(ped_stream, family_type='ped'):
    """Parse out minimal family information from a PED file.

    Args:
        ped_stream(iterable(str)): Lines with pedigree information
        family_type(str): Pedigree format, forwarded to ``FamilyParser``

    Returns:
        family_id(str), samples(list[dict])

    Raises:
        PedigreeError: If the stream does not hold exactly one family
    """
    parsed = FamilyParser(ped_stream, family_type=family_type)
    if len(parsed.families) != 1:
        raise PedigreeError("Only one case per ped file is allowed")

    # A single family is guaranteed at this point
    (family_id, family), = parsed.families.items()

    samples = []
    for ind_id, person in family.individuals.items():
        record = {}
        record['sample_id'] = ind_id
        record['father'] = person.father
        record['mother'] = person.mother
        # Coded values are translated through the module level lookup maps
        record['sex'] = SEX_MAP[person.sex]
        record['phenotype'] = PHENOTYPE_MAP[int(person.phenotype)]
        samples.append(record)

    return family_id, samples
Ejemplo n.º 9
0
def get_genetic_models(family_file, family_type):
    """
    Collect the inheritance models listed for the parsed family (families).

    Args:
        family_file (file): A file with family information
                            in ped or ped like format.
        family_type (str): Pedigree format, forwarded to ``FamilyParser``.

    Returns:
        inheritance_models  : A set with the expected inheritance models,
                              'NA'/'na'/'Na' entries excluded
        family_id   : The id of the last family iterated, or None when
                      no family was found
    """
    placeholders = ('NA', 'na', 'Na')
    inheritance_models = set()
    parser = FamilyParser(family_file, family_type)

    # Only one family is expected for now, so the id that is returned is
    # simply whichever family the loop visited last.
    family_id = None
    for family_id in parser.families:
        listed_models = parser.families[family_id].models_of_inheritance
        inheritance_models.update(
            model for model in listed_models if model not in placeholders)

    return inheritance_models, family_id
Ejemplo n.º 10
0
def get_family(family_lines, family_type='ped'):
    """Parse family lines and return the single family they describe.

        Args:
            family_lines (iterator): The family lines
            family_type (str): The format of the family lines

        Returns:
            family (Family): The first (and only allowed) parsed family object

        Raises:
            CaseError: If more than one family is found
            IndexError: If no family is found at all
    """
    logger.info("Parsing family information")
    parser = FamilyParser(family_lines, family_type)

    family_ids = list(parser.families.keys())
    logger.info("Found families {0}".format(', '.join(family_ids)))

    if len(family_ids) > 1:
        raise CaseError("Only one family per load can be used")

    only_family_id = family_ids[0]
    return parser.families[only_family_id]
Ejemplo n.º 11
0
def family_obj(request, case_lines):
    """Return the first family object parsed from ``case_lines``.

    ``request`` is accepted but not used here.
    """
    parsed = FamilyParser(case_lines, family_type="ped")
    first_key = list(parsed.families.keys())[0]
    family = parsed.families[first_key]
    return family
Ejemplo n.º 12
0
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    # Parameter notes (kept out of the docstring, which may double as CLI
    # help text - presumably this is a click command, TODO confirm):
    #   context           - object whose abort() is called on fatal problems
    #   variant_file      - handed to get_file_handle(); the result is read
    #                       line by line
    #   family_id         - id written into RankScore; replaced by the first
    #                       family of family_file when that is given
    #   family_file/type  - forwarded to FamilyParser
    #   score_config      - forwarded to ConfigParser; required
    #   silent, outfile   - forwarded to the print helpers
    #   skip_plugin_check - when truthy, missing score plugins in the vcf
    #                       header are not fatal
    #   rank_results      - when truthy, also annotate per category scores
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Exceptions carry no ``message`` attribute on Python 3; logging the
        # exception itself renders the same text without an AttributeError.
        logger.error(e)
        context.abort()

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header lines. The first non-header line has already been
    # pulled off the iterator when the loop stops, so remember it.
    # ``None`` means no variant line exists (header only or empty input).
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            first_variant = line
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    if first_variant is None:
        # Nothing to score: echo the header and stop. This also covers a
        # completely empty input, which used to fail with a NameError.
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    #Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description=
        "The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(head,
                     'info',
                     'RankResult',
                     annotation_number='.',
                     entry_type='String',
                     description='|'.join(score_categories))

    print_headers(head=head, outfile=outfile, silent=silent)
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Start from zero so that the progress and summary logs report the number
    # of variants actually scored (the counter used to start at one).
    nr_of_variants = 0

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(
                    variant=variant,
                    category=category,
                    config_parser=config_parser,
                    csq_format=csq_format)
                logger.debug("Adding category score {0} to rank_score".format(
                    category_score))

                # The rank score is the plain sum of all category scores
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))

                category_scores.append(str(category_score))

            variant = add_vcf_info(keyword='RankScore',
                                   variant_dict=variant,
                                   annotation="{0}:{1}".format(
                                       family_id, rank_score))

            if rank_results:
                variant = add_vcf_info(keyword='RankResult',
                                       variant_dict=variant,
                                       annotation="|".join(category_scores))

            print_variant(variant_dict=variant,
                          header_line=header_line,
                          outfile=outfile,
                          silent=silent)

            nr_of_variants += 1

            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info(
                    "Last 20000 took {0} to score.".format(datetime.now() -
                                                           last_twenty))
                last_twenty = datetime.now()

    logger.info(
        "Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() -
                                                     start_scoring))
Ejemplo n.º 13
0
def case_id(request, case_lines):
    """Return the family id of the first family parsed from ``case_lines``.

    ``request`` is accepted but not used here.
    """
    parsed = FamilyParser(case_lines, family_type="ped")
    family_ids = list(parsed.families.keys())
    first_family = parsed.families[family_ids[0]]
    family_id = first_family.family_id
    return family_id
Ejemplo n.º 14
0
def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the family
    file and exists in the variant file will get it's own annotation.
    """
    # Parameter notes (kept out of the docstring, which may double as CLI
    # help text - presumably this is a click command, TODO confirm):
    #   context            - object whose abort() is called on fatal problems
    #   variant_file       - handed to get_file_handle(); read line by line
    #   family_file/type   - forwarded to FamilyParser; family_file is required
    #   reduced_penetrance - optional iterable of lines; the first whitespace
    #                        separated token of each non '#' line is a gene id
    #   vep                - when truthy the header must define CSQ
    #   keyword            - annotation key required in the header otherwise
    #   phased, strict     - forwarded to the VariantAnnotator workers
    #   silent, outfile    - forwarded to the print helpers
    #   processes          - number of VariantAnnotator workers to start
    #   temp_dir           - optional directory for the temporary result file
    #   whole_gene         - not used inside this function

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info(
        "Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug(
                    "Adding gene {0} to reduced penetrance genes".format(
                        gene_id))
                reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # Keep only the families that have at least one affected individual
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True

        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning(
            "Please provide at least one family with affected individuals")
        context.abort()
    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(','.join(
        list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(','.join(
        list(family_parser.individuals.keys()))))

    head = HeaderParser()

    # Consume the header lines. The first non-header line has already been
    # pulled off the iterator when the loop stops, so remember it.
    # ``None`` means no variant line exists (header only or empty input).
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            first_variant = line
            break

    if first_variant is None:
        # Nothing to annotate: echo the header and stop. This also covers a
        # completely empty input, which used to fail with a NameError.
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    #Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning(
                "vep flag is used but there is no CSQ field specified in header"
            )
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning(
                "Annotation key {0} could not be found in VCF header".format(
                    keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
        " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(info_id='genmod',
                              version=__version__,
                              date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                              command_line=' '.join(argument_list))

    logger.debug("Version added")
    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.")

    logger.debug("Genetic models added")
    logger.info("Adding model score to vcf header")
    add_metadata(head,
                 'info',
                 'ModelScore',
                 annotation_number='.',
                 entry_type='String',
                 description="PHRED score for genotype models.")
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=(
            "List of compound pairs for this variant."
            "The list is splitted on ',' family id is separated with compounds"
            "with ':'. Compounds are separated with '|'."))
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(', '.join(
            family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))

        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes
    # there can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # Bind everything the error handling below touches before entering the
    # try block. Otherwise a failure early in the block makes the except and
    # finally clauses raise NameError and hide the real problem.
    model_checkers = []
    variant_printer = None
    temp_file = None

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(task_queue=variant_queue,
                             results_queue=results,
                             families=families,
                             individuals=analysis_individuals,
                             phased=phased,
                             strict=strict,
                             vep=vep,
                             reduced_penetrance_genes=reduced_penetrance_genes)
            for i in range(num_model_checkers)
        ]
        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')
        if len(model_checkers) == 1:
            # A single worker is printed straight to the outfile
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='normal',
                                             outfile=outfile)
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            temp_file.close()

            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='chromosome',
                                             outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')
        get_batches(variants=variant_file,
                    batch_queue=variant_queue,
                    header=head,
                    vep=vep,
                    annotation_keyword=keyword)

        # One stop sign per worker, then one for the printer once all
        # batches have been processed
        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(variant_line=line,
                                  outfile=outfile,
                                  mode='modified',
                                  silent=silent)

    except Exception as err:
        logger.warning(err)
        for worker in model_checkers:
            worker.terminate()
        # The printer does not exist yet if the failure happened early
        if variant_printer is not None:
            variant_printer.terminate()
        context.abort()
    finally:
        # Remove the temp file whenever one was created. The previous check
        # on the worker count missed some cases and could hit an unbound
        # ``temp_file``.
        if temp_file is not None:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Ejemplo n.º 15
0
def annotate(family_file, variant_file, family_type, vep, silent, phased,
             strict, cadd_raw, whole_gene, annotation_dir, cadd_file,
             cadd_1000g, cadd_exac, cadd_esp, cadd_indels, thousand_g, exac,
             outfile, split_variants, processes, dbnfsp, verbose):
    """Annotate variants in a VCF file.\n
        The main function with genmod is to annotate genetic inheritance patterns for variants in families. 
        Use flag --family together with a .ped file to describe which individuals in the vcf you wish to check inheritance for in the analysis.
        Individuals that are not present in the ped file will not be considered in the analysis.\n
        It is also possible to use genmod without a family file. In this case the variants will be annotated with a variety of options seen below.
        Please see documentation on github.com/moonso/genmod or genmod/examples/readme.md for more information.
    """
    # Parameter notes (kept out of the docstring, which may double as CLI
    # help text - TODO confirm):
    #   family_file/type  - forwarded to FamilyParser when family_file is given
    #   variant_file      - path handed to VCFParser, or '-' to read stdin
    #   split_variants    - forwarded to VCFParser
    #   vep               - when truthy the annotation files are not loaded
    #   annotation_dir    - forwarded to load_annotations when vep is falsy
    #   cadd_*, thousand_g, exac, dbnfsp, phased, strict, cadd_raw
    #                     - forwarded to the VariantConsumer workers; several
    #                       also add a matching INFO header entry
    #   whole_gene        - forwarded to get_batches
    #   processes         - number of workers (see the cadd adjustment below)
    #   silent, outfile   - forwarded to the print helpers
    #   verbose           - when truthy progress is printed to stderr

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i != 'config' and i != 'frame'
    ]

    if verbose:
        print('\nRunning GENMOD annotate version %s \n' % VERSION,
              file=sys.stderr)

    start_time_analysis = datetime.now()

    ######### Setup a variant parser #########

    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin,
                                   split_variants=split_variants)
    else:
        variant_parser = VCFParser(infile=variant_file,
                                   split_variants=split_variants)

    # These are the individuals in from the vcf file
    individuals = variant_parser.individuals

    head = variant_parser.metadata

    # Update version logging
    add_metadata(head,
                 'version',
                 'genmod',
                 version=VERSION,
                 command_line_string=' '.join(argument_list))

    ######### Parse the ped file (if there is one) #########

    families = {}

    if family_file:
        family_parser = FamilyParser(family_file, family_type)
        # The individuals in the ped file must be present in the variant file:
        families = family_parser.families

        for individual in family_parser.individuals:
            if individual not in individuals:
                warning(
                    'All individuals in ped file must be in vcf file! Aborting...'
                )
                warning('Individuals in PED file: %s' %
                        ' '.join(list(family_parser.individuals.keys())))
                warning('Individuals in VCF file: %s' % ' '.join(individuals))
                print('Exiting...', file=sys.stderr)
                # NOTE(review): exits with status 0 although this is an error
                # path - confirm whether callers rely on that.
                sys.exit()

        add_metadata(
            head,
            'info',
            'GeneticModels',
            annotation_number='.',
            entry_type='String',
            description="':'-separated list of genetic models for this variant."
        )
        add_metadata(head,
                     'info',
                     'ModelScore',
                     annotation_number='1',
                     entry_type='Integer',
                     description="PHRED score for genotype models.")
        add_metadata(
            head,
            'info',
            'Compounds',
            annotation_number='.',
            entry_type='String',
            description=
            ("List of compound pairs for this variant."
             "The list is splitted on ',' family id is separated with compounds"
             "with ':'. Compounds are separated with '|'."))

    if verbose:
        if family_file:
            print('Starting analysis of families: %s' %
                  ','.join(list(families.keys())),
                  file=sys.stderr)
            print('Individuals included in analysis: %s\n' %
                  ','.join(list(family_parser.individuals.keys())),
                  file=sys.stderr)
    ######### Read to the annotation data structures #########

    gene_trees = {}
    exon_trees = {}

    # If the variants are already annotated we do not need to redo the annotation
    if not vep:

        gene_trees, exon_trees = load_annotations(annotation_dir, verbose)

        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.')
    else:
        if verbose:
            print('Using VEP annotation', file=sys.stderr)

    ######### Check which other annotations files that should be used in the analysis #########

    cadd_annotation = False

    if cadd_file:
        if verbose:
            print('Cadd file! %s' % cadd_file, file=sys.stderr)
        cadd_annotation = True
    if cadd_1000g:
        if verbose:
            print('Cadd 1000G file! %s' % cadd_1000g, file=sys.stderr)
        cadd_annotation = True
    if cadd_esp:
        if verbose:
            print('Cadd ESP6500 file! %s' % cadd_esp, file=sys.stderr)
        cadd_annotation = True
    if cadd_indels:
        if verbose:
            print('Cadd InDel file! %s' % cadd_indels, file=sys.stderr)
        cadd_annotation = True
    if cadd_exac:
        if verbose:
            print('Cadd ExAC file! %s' % cadd_exac, file=sys.stderr)
        cadd_annotation = True

    if cadd_annotation:
        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='A',
            entry_type='Float',
            description="The CADD relative score for this alternative.")
        if cadd_raw:
            add_metadata(
                head,
                'info',
                'CADD_raw',
                annotation_number='A',
                entry_type='Float',
                description="The CADD raw score(s) for this alternative(s).")

    if thousand_g:
        if verbose:
            print('1000G frequency file! %s' % thousand_g, file=sys.stderr)
        add_metadata(head,
                     'info',
                     '1000G_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the 1000G database.")

    if exac:
        if verbose:
            print('ExAC frequency file! %s' % exac, file=sys.stderr)
        add_metadata(head,
                     'info',
                     'ExAC_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the ExAC database.")

    if dbnfsp:
        if verbose:
            print('dbNFSP file! %s' % dbnfsp, file=sys.stderr)

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    if cadd_annotation:
        # We need more power when annotating cadd scores:
        # But if flag is used that overrides
        if num_model_checkers == min(4, cpu_count()):
            num_model_checkers = min(8, cpu_count())

    if verbose:
        print('Number of CPU:s %s' % cpu_count(), file=sys.stderr)
        print('Number of model checkers: %s' % num_model_checkers,
              file=sys.stderr)

    # We use a temp file to store the processed variants
    temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()
    temporary_variant_file = None

    # Everything that touches the temp file runs inside try/finally so that
    # the handle is closed and the file removed even when a step fails;
    # before, any exception left the temp file behind on disk.
    try:
        # Reopen the temp file as utf-8 text for the printer
        temporary_variant_file = open(temp_file.name,
                                      mode='w',
                                      encoding='utf-8',
                                      errors='replace')

        # These are the workers that do the heavy part of the analysis
        model_checkers = [
            VariantConsumer(variant_queue, results, families, phased, vep,
                            cadd_raw, cadd_file, cadd_1000g, cadd_exac,
                            cadd_esp, cadd_indels, thousand_g, exac, dbnfsp,
                            strict, verbose)
            for i in range(num_model_checkers)
        ]

        for w in model_checkers:
            w.start()

        # This process prints the variants to temporary files
        var_printer = VariantPrinter(results,
                                     temporary_variant_file,
                                     head,
                                     mode='chromosome',
                                     verbosity=verbose)
        var_printer.start()

        if verbose:
            print('Start parsing the variants ... \n', file=sys.stderr)

        # This process parses the original vcf and create batches to put in the variant queue:

        chromosome_list = get_batches(variant_parser, variant_queue,
                                      individuals, gene_trees, exon_trees,
                                      phased, vep, whole_gene, verbose)

        # Put stop signs in the variant queue
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        var_printer.join()

        # Close before sorting so that the sorter reads from a closed file
        temporary_variant_file.close()

        if verbose:
            print('Cromosomes found in variant file: %s \n' %
                  ','.join(chromosome_list),
                  file=sys.stderr)
            print('Models checked!\n', file=sys.stderr)

        sort_variants(temp_file.name, mode='chromosome', verbose=verbose)

        print_headers(head, outfile, silent)

        print_variants(temp_file.name, outfile, mode='modified', silent=silent)
    finally:
        # Closing an already closed file is a no-op, so this is safe on the
        # success path as well.
        if temporary_variant_file is not None:
            temporary_variant_file.close()
        # Remove all temp files:
        if os.path.exists(temp_file.name):
            os.remove(temp_file.name)

    if verbose:
        print('Time for whole analyis: %s' %
              str(datetime.now() - start_time_analysis),
              file=sys.stderr)
Ejemplo n.º 16
0
def get_case(scout_configs, family_type):
    """Build a mongo engine case from a scout config and its pedigree file.

    Only one case per pedigree file is allowed.

    Args:
        scout_configs (dict): A dictionary with scout info. The keys 'ped'
            (path to the pedigree file) and 'owner' are required. The keys
            'collaborators', 'igv_vcf', 'human_genome_build',
            'human_genome_version', 'analysis_date', 'madeline',
            'coverage_report', 'gene_lists', 'default_gene_lists' and
            'individuals' are optional.
        family_type (str): A string that describes the format of the ped
            file, handed on to FamilyParser.

    Returns:
        case : A mongo engine object that describes the case found in the
            pedigree file.

    Raises:
        KeyError: If 'ped' or 'owner' is missing from scout_configs.
        SyntaxError: If the pedigree file does not hold exactly one family.
    """
    logger = logging.getLogger(__name__)

    # Use ped_parser to get information from the pedigree file. The handle is
    # closed as soon as the parser has been built so it is not leaked.
    # NOTE(review): assumes FamilyParser consumes the stream in its
    # constructor -- confirm against the ped_parser version in use.
    with open(scout_configs['ped'], 'r') as ped_handle:
        case_parser = FamilyParser(ped_handle, family_type=family_type)

    # Check if there is a owner of the case
    try:
        owner = scout_configs['owner']
    except KeyError:
        logger.error("Scout config must include a owner")
        raise

    # A case can belong to several institutes; the owner is always one of
    # the collaborators.
    collaborators = set(scout_configs.get('collaborators', None) or ())
    collaborators.add(owner)

    logger.info("Collaborators found: {0}".format(','.join(collaborators)))
    logger.info("Cases found in ped file: {0}".format(
        ', '.join(list(case_parser.families.keys()))))

    if len(case_parser.families) != 1:
        raise SyntaxError("Only one case per ped file is allowed")

    # Exactly one family is present at this point
    case_id = list(case_parser.families.keys())[0]
    case = case_parser.families[case_id]

    # Create a mongo engine case
    mongo_case_id = '_'.join([owner, case_id])
    mongo_case = Case(case_id=mongo_case_id)
    logger.debug("Setting case id to: {0}".format(mongo_case_id))

    mongo_case['owner'] = owner
    logger.debug("Setting owner to: {0}".format(owner))

    mongo_case['collaborators'] = list(collaborators)
    logger.debug("Setting collaborators to: {0}".format(
        ', '.join(collaborators)))

    # We use the family id as display name for scout
    mongo_case['display_name'] = case_id
    logger.debug("Setting display name to: {0}".format(case_id))

    # Get the path of vcf from configs
    igv_vcf = scout_configs.get('igv_vcf', '')
    mongo_case['vcf_file'] = igv_vcf
    logger.debug("Setting igv vcf file to: {0}".format(igv_vcf))

    # Add the genome build information
    genome_build = scout_configs.get('human_genome_build', '')
    mongo_case['genome_build'] = genome_build
    logger.debug("Setting genome build to: {0}".format(genome_build))

    # Get the genome version; stored as a float, logged as given in the config
    genome_version = scout_configs.get('human_genome_version', '0')
    mongo_case['genome_version'] = float(genome_version)
    logger.debug("Setting genome version to: {0}".format(genome_version))

    # Check the analysis date
    analysis_date = scout_configs.get('analysis_date', '')
    mongo_case['analysis_date'] = analysis_date
    logger.debug("Setting analysis date to: {0}".format(analysis_date))

    # Add the pedigree picture, this is a xml file that will be read and
    # saved in the mongo database. The fallback path is a placeholder that
    # is not expected to exist, so a missing config entry takes the
    # "skipping" branch.
    madeline_path = path(scout_configs.get('madeline', '/__menoexist.tXt'))
    if madeline_path.exists():
        logger.debug("Found madeline info")
        with madeline_path.open('r') as handle:
            mongo_case['madeline_info'] = handle.read()
            logger.debug("Madeline file was read succesfully")
    else:
        logger.info("No madeline file found. Skipping madeline file.")

    # Add the coverage report (read as raw bytes)
    coverage_report_path = path(
        scout_configs.get('coverage_report', '/__menoexist.tXt'))
    if coverage_report_path.exists():
        logger.debug("Found a coverage report")
        with coverage_report_path.open('rb') as handle:
            mongo_case['coverage_report'] = handle.read()
            logger.debug("Coverage was read succesfully")
    else:
        logger.info("No coverage report found. Skipping coverage report.")

    # Sort the configured gene lists into clinical and research lists.
    # Lists without an explicit type are treated as clinical.
    clinical_gene_lists = []
    research_gene_lists = []

    for gene_list in scout_configs.get('gene_lists', {}):
        logger.info("Found gene list {0}".format(gene_list))
        list_info = scout_configs['gene_lists'][gene_list]

        list_type = list_info.get('type', 'clinical')
        list_id = list_info.get('name', '')

        list_object = GeneList(
            list_id=list_id,
            version=float(list_info.get('version', 0)),
            date=list_info.get('date', ''),
            display_name=list_info.get('full_name', list_id),
        )

        if list_type == 'clinical':
            logger.info("Adding {0} to clinical gene lists".format(list_object))
            clinical_gene_lists.append(list_object)
        else:
            logger.info("Adding {0} to research gene lists".format(list_object))
            research_gene_lists.append(list_object)

    mongo_case['clinical_gene_lists'] = clinical_gene_lists
    mongo_case['research_gene_lists'] = research_gene_lists

    mongo_case['default_gene_lists'] = list(
        scout_configs.get('default_gene_lists', []))

    # Per-individual settings from the config, keyed on individual id
    individual_configs = scout_configs.get('individuals', {})

    individuals = []
    for individual_id in case.individuals:
        individual = case.individuals[individual_id]
        # Get info from configs for the individual
        config_info = individual_configs.get(individual_id, {})

        ind = Individual()
        ind['individual_id'] = individual_id
        ind['father'] = individual.father
        ind['mother'] = individual.mother
        ind['display_name'] = individual.extra_info.get(
            'display_name', individual_id)
        ind['sex'] = str(individual.sex)
        ind['phenotype'] = individual.phenotype
        # Path to the bam file for IGV:
        ind['bam_file'] = config_info.get('bam_path', '')

        ind['capture_kits'] = config_info.get('capture_kit', [])

        individuals.append(ind)

    mongo_case['individuals'] = individuals

    return mongo_case
Ejemplo n.º 17
0
def get_individuals(variant_source, case_lines=None, case_type='ped', variant_mode='vcf'):
    """Get the individuals from a vcf file, gemini database, and/or a ped file.

    Args:
        variant_source (str): Path to a variant source
        case_lines (Iterable): Ped like lines. Only read when variant_mode
            is 'vcf'.
        case_type (str): Format of ped lines
        variant_mode (str): 'vcf' or 'gemini'. Any other value results in
            an empty list.

    Returns:
        individuals (list): List with Individuals

    Raises:
        IOError: If the ped lines do not describe exactly one family.
        PedigreeError: If an individual exists in the ped lines but not in
            the vcf.
    """
    individuals = []

    if variant_mode == 'vcf':
        head = get_header(variant_source)
        # Dictionary with ind_id:index where index show where in vcf ind info is
        ind_dict = {
            ind: index for index, ind in enumerate(head.individuals)
        }

        if case_lines:
            # read individuals from ped file
            family_parser = FamilyParser(case_lines, family_type=case_type)
            families = family_parser.families
            logger.debug("Found families {0}".format(
                ','.join(list(families.keys()))))
            if len(families) != 1:
                message = "Only one family can be used with vcf adapter"
                logger.error(message)
                raise IOError(message)

            case_id = list(families.keys())[0]
            logger.debug("Family used in analysis: {0}".format(case_id))

            for ind_id in family_parser.individuals:
                ind = family_parser.individuals[ind_id]
                logger.info("Found individual {0}".format(ind.individual_id))
                # Only a sample missing from the vcf header should be
                # reported as a pedigree problem; errors raised while
                # building the Individual itself must not be masked.
                if ind_id not in ind_dict:
                    raise PedigreeError(
                        family_id=case_id,
                        individual_id=ind_id,
                        message="Individual {0} exists in ped file but not in vcf".format(ind_id)
                    )
                individuals.append(
                    Individual(
                        ind_id=ind_id,
                        case_id=case_id,
                        mother=ind.mother,
                        father=ind.father,
                        sex=str(ind.sex),
                        phenotype=str(ind.phenotype),
                        variant_source=variant_source,
                        ind_index=ind_dict[ind_id],
                    )
                )
        else:
            # Without pedigree information the file name serves as case id
            case_id = os.path.basename(variant_source)

            for ind in ind_dict:
                individuals.append(
                    Individual(
                        ind_id=ind,
                        case_id=case_id,
                        variant_source=variant_source,
                        ind_index=ind_dict[ind],
                    )
                )
                logger.debug("Found individual {0} in {1}".format(
                    ind, variant_source))

    elif variant_mode == 'gemini':
        gq = GeminiQuery(variant_source)
        # Dictionary with sample to index in the gemini database
        ind_dict = gq.sample_to_idx
        gq.run("SELECT * from samples")
        for individual in gq:
            logger.debug("Found individual {0} with family id {1}".format(
                individual['name'], individual['family_id']))
            individuals.append(
                Individual(
                    ind_id=individual['name'],
                    case_id=individual['family_id'],
                    mother=individual['maternal_id'],
                    father=individual['paternal_id'],
                    sex=individual['sex'],
                    phenotype=individual['phenotype'],
                    ind_index=ind_dict.get(individual['name']),
                    variant_source=variant_source,
                    bam_path=None,
                )
            )

    return individuals