def get_family(family_file=None, family_lines=None):
    """Return a parsed family object, or None when no input is given.

    Args:
        family_file (str): Path to a family file. Takes precedence over
            ``family_lines`` when both are given.
        family_lines (iterable(str)): Family lines that are already in memory.

    Returns:
        The ``FamilyParser`` built from the given input, or ``None`` when
        neither argument is truthy.
    """
    if family_file:
        # Close the handle as soon as parsing is done instead of leaking it.
        # NOTE(review): assumes FamilyParser consumes the stream inside its
        # constructor - confirm against the parser implementation.
        with open(family_file, 'r') as family_handle:
            return FamilyParser(family_handle)
    if family_lines:
        return FamilyParser(family_lines)
    return None
def parse_ped(ped_stream, family_type="ped"):
    """Extract the family id and a minimal sample description from a pedigree.

    Args:
        ped_stream(iterable(str)): Lines with pedigree information
        family_type(str): Format of the pedigree information

    Returns:
        family_id(str), samples(list[dict])

    Raises:
        PedigreeError: If the stream does not hold exactly one family
    """
    parser = FamilyParser(ped_stream, family_type=family_type)
    if len(parser.families) != 1:
        raise PedigreeError("Only one case per ped file is allowed")

    family_id = list(parser.families.keys())[0]
    members = parser.families[family_id].individuals

    samples = []
    for sample_id, member in members.items():
        samples.append({
            "sample_id": sample_id,
            "father": member.father,
            "mother": member.mother,
            # Translate the coded sex and phenotype through the lookup maps
            "sex": SEX_MAP[member.sex],
            "phenotype": PHENOTYPE_MAP[int(member.phenotype)],
        })
    return family_id, samples
def get_case(family_lines, family_type='ped', vcf_path=None):
    """Return the single ped_parser family described by the family lines.

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines
        vcf_path(str): Path to VCF. Not used by this function; the parameter
            is kept so existing callers keep working.

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If the lines describe no family or more than one family
    """
    LOG.info("Parsing family information")

    family_parser = FamilyParser(family_lines, family_type)
    families = list(family_parser.families.keys())

    LOG.info("Found families {0}".format(', '.join(families)))

    if not families:
        # Fail with a descriptive error instead of an IndexError further down
        raise CaseError("No family found in family information")
    if len(families) > 1:
        raise CaseError("Only one family per load can be used")

    return family_parser.families[families[0]]
def case_id(request, case_lines):
    """Return the family id of the first family parsed from the ped lines."""
    parser = FamilyParser(case_lines, family_type='ped')
    first_key = list(parser.families.keys())[0]
    return parser.families[first_key].family_id
def test_standard_trio_extra_daughter(self):
    """Check that a trio with an extra daughter is parsed correctly.

    Verifies the parsed header, the individuals of the family, that both
    child/parents combinations are reported as trios and that the daughter
    is registered as a sibling of the proband.
    """
    # Use a context manager so the ped file handle is closed after parsing.
    # NOTE(review): assumes FamilyParser reads the stream in its constructor.
    with open(self.trio_file.name, 'r') as trio_handle:
        family_parser = FamilyParser(trio_handle)

    trio_family = family_parser.families['healthyParentsAffectedSon']

    assert family_parser.header == [
        'family_id', 'sample_id', 'father_id', 'mother_id', 'sex', 'phenotype'
    ]
    assert {'proband', 'mother', 'father', 'daughter'} == set(
        trio_family.individuals.keys())
    assert {'proband', 'mother', 'father'} in trio_family.trios
    assert {'daughter', 'mother', 'father'} in trio_family.trios
    assert 'daughter' in trio_family.individuals['proband'].siblings
def sv_case_obj(request, case_lines, sv_vcf_obj, sv_vcf_path):
    """Build a case object from the ped lines and the structural variant VCF.

    The first family found in ``case_lines`` is combined with the samples of
    ``sv_vcf_obj`` and the number of variants obtained by iterating over it.
    """
    parser = FamilyParser(case_lines, family_type="ped")
    first_family_id = list(parser.families.keys())[0]
    sv_samples = sv_vcf_obj.samples

    # Exhaust the VCF iterator to find out how many variants it holds
    variant_count = sum(1 for _ in sv_vcf_obj)

    return build_case(
        case=parser.families[first_family_id],
        sv_individuals=sv_samples,
        vcf_sv_path=sv_vcf_path,
        nr_sv_variants=variant_count,
    )
def case_obj(request, case_lines, vcf_obj, vcf_path, profile_list):
    """Build a case object from the ped lines, the VCF and the profiles.

    The first family found in ``case_lines`` is combined with the samples of
    ``vcf_obj``, the number of variants obtained by iterating over it, and a
    mapping that assigns ``profile_list`` to every VCF sample.
    """
    parser = FamilyParser(case_lines, family_type="ped")
    family_ids = list(parser.families.keys())
    family = parser.families[family_ids[0]]
    samples = vcf_obj.samples

    # Walk through the whole VCF to count its variants
    variant_total = 0
    for _ in vcf_obj:
        variant_total += 1

    # Every sample shares the very same profile list object
    profiles = {}
    for sample in samples:
        profiles[sample] = profile_list

    return build_case(
        case=family,
        vcf_individuals=samples,
        vcf_path=vcf_path,
        nr_variants=variant_total,
        profiles=profiles,
    )
def parse_ped(ped_stream, family_type='ped'):
    """Return the family id and a list of sample dicts for a PED stream.

    Raises PedigreeError unless the stream describes exactly one family.
    """
    def _as_sample(sample_id, person):
        # Keys are filled in the same order as they are listed here
        return {
            'sample_id': sample_id,
            'father': person.father,
            'mother': person.mother,
            'sex': SEX_MAP[person.sex],
            'phenotype': PHENOTYPE_MAP[int(person.phenotype)],
        }

    pedigree = FamilyParser(ped_stream, family_type=family_type)
    nr_families = len(pedigree.families)
    if nr_families != 1:
        raise PedigreeError("Only one case per ped file is allowed")

    family_id = list(pedigree.families.keys())[0]
    people = pedigree.families[family_id].individuals
    samples = [_as_sample(key, person) for key, person in people.items()]
    return family_id, samples
def get_genetic_models(family_file, family_type):
    """
    Collect the inheritance models declared for the family (families).

    Args:
        family_file (file): A file with family information in ped or ped
            like format.
        family_type: The format of the family file, passed on to the parser.

    Returns:
        inheritance_models : A set with the expected inheritance models
        family_id : The id of the last family that was visited, or None when
            no family was found
    """
    placeholders = ['NA', 'na', 'Na']
    inheritance_models = set()
    parser = FamilyParser(family_file, family_type)

    family_id = None
    # Only one family is expected for now, so the last visited id is returned
    for family_id, family in parser.families.items():
        inheritance_models.update(
            model for model in family.models_of_inheritance
            if model not in placeholders
        )

    return inheritance_models, family_id
def get_family(family_lines, family_type='ped'):
    """Return the single family found in the family lines.

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If the lines describe no family or more than one family
    """
    logger.info("Parsing family information")

    family_parser = FamilyParser(family_lines, family_type)
    families = list(family_parser.families.keys())

    logger.info("Found families {0}".format(', '.join(families)))

    if not families:
        # Fail with a descriptive error instead of an IndexError further down
        raise CaseError("No family found in family information")
    if len(families) > 1:
        raise CaseError("Only one family per load can be used")

    return family_parser.families[families[0]]
def family_obj(request, case_lines):
    """Return the first family object parsed from the ped lines."""
    parser = FamilyParser(case_lines, family_type="ped")
    first_family_id = list(parser.families.keys())[0]
    family = parser.families[first_family_id]
    return family
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    # Developer notes. They are kept out of the docstring because the
    # docstring may double as command line help text (TODO confirm).
    #
    # context: object whose abort() is called to stop on invalid input
    # variant_file: variant source, passed through get_file_handle()
    # family_id: used in the RankScore annotation; replaced by the first
    #     family of family_file when a family file is given
    # family_file, family_type: input for FamilyParser
    # score_config: input for ConfigParser, required
    # silent, outfile: forwarded to print_headers() and print_variant()
    # skip_plugin_check: continue even if check_plugins() fails
    # rank_results: also annotate the per category scores as RankResult
    logger.info('Running GENMOD score, version: {0}'.format(__version__))
    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Log the exception itself: exceptions have no generic ``message``
        # attribute on Python 3
        logger.error(e)
        context.abort()

    score_categories = list(config_parser.categories.keys())
    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header. The first non header line has already been pulled
    # from the iterator when the loop stops, so remember it. It stays None
    # when the input has no variant lines at all (also for empty input).
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    if first_variant is None:
        # Nothing to score, only echo the header
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    # Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description=
        "The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(head,
                     'info',
                     'RankResult',
                     annotation_number='.',
                     entry_type='String',
                     description='|'.join(score_categories))

    print_headers(head=head, outfile=outfile, silent=silent)

    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Start from zero so the logged totals match the number of scored variants
    nr_of_variants = 0

    for line in variant_file:
        if line.startswith('#'):
            continue

        variant = get_variant_dict(line, header_line)
        variant['info_dict'] = get_info_dict(variant['INFO'])

        rank_score = 0
        # This is for printing results to vcf:
        category_scores = []
        for category in score_categories:
            category_score = get_category_score(variant=variant,
                                                category=category,
                                                config_parser=config_parser,
                                                csq_format=csq_format)
            logger.debug("Adding category score {0} to rank_score".format(
                category_score))
            rank_score += category_score
            logger.debug("Updating rank score to {0}".format(rank_score))
            category_scores.append(str(category_score))

        variant = add_vcf_info(keyword='RankScore',
                               variant_dict=variant,
                               annotation="{0}:{1}".format(
                                   family_id, rank_score))

        if rank_results:
            variant = add_vcf_info(keyword='RankResult',
                                   variant_dict=variant,
                                   annotation="|".join(category_scores))

        print_variant(variant_dict=variant,
                      header_line=header_line,
                      outfile=outfile,
                      silent=silent)

        nr_of_variants += 1

        if nr_of_variants % 20000 == 0:
            logger.info("{0} variants scored.".format(nr_of_variants))
            logger.info(
                "Last 20000 took {0} to score.".format(datetime.now() -
                                                       last_twenty))
            last_twenty = datetime.now()

    logger.info(
        "Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() -
                                                     start_scoring))
def case_id(request, case_lines):
    """Return the id of the first family described by the ped lines."""
    parsed = FamilyParser(case_lines, family_type="ped")
    family_keys = list(parsed.families.keys())
    first_family = parsed.families[family_keys[0]]
    identifier = first_family.family_id
    return identifier
def models(context, variant_file, family_file, family_type,
           reduced_penetrance, vep, keyword, phased, strict, silent,
           processes, outfile, temp_dir, whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the
    family file and exists in the variant file will get it's own annotation.
    """
    # Developer notes. They are kept out of the docstring because the
    # docstring may double as command line help text (TODO confirm).
    #
    # Flow: parse the optional reduced penetrance gene list, parse the family
    # file and keep families with at least one affected individual, read the
    # VCF header, extend the header with the new INFO fields, then start
    # `processes` VariantAnnotator workers plus one VariantPrinter and feed
    # them batches of variants through queues.
    #
    # NOTE(review): `whole_gene` is not referenced in this function body.

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    # "name=value" for every truthy local, which at this point are the
    # call arguments
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info(
        "Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    # The first whitespace separated column of every non comment line is
    # taken as a gene id
    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug(
                    "Adding gene {0} to reduced penetrance genes".format(
                        gene_id))
                reduced_penetrance_genes.add(gene_id)

    logger.info("Found {0} genes with reduced penetrance".format(
        nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # Only families with at least one affected individual are analysed
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True

        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning(
            "Please provide at least one family with affected individuals")
        context.abort()

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(','.join(
        list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(','.join(
        list(family_parser.individuals.keys()))))

    head = HeaderParser()

    # Feed the header lines to the header parser; the loop stops at the first
    # line that does not start with '#'.
    # NOTE(review): `line` is never bound if variant_file yields no lines, so
    # the check after the loop would raise a NameError in that case.
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator; it was consumed by the loop
    # above. If the last line read was still a header line there are no
    # variants, so only the header is printed.
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    # The gene annotation is read either from the CSQ field (vep) or from
    # the INFO key given by `keyword`; the chosen one must be in the header
    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning(
                "vep flag is used but there is no CSQ field specified in header"
            )
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning(
                "Annotation key {0} could not be found in VCF header".format(
                    keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    # Refuse to annotate a file twice
    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
                       " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(info_id='genmod',
                              version=__version__,
                              date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                              command_line=' '.join(argument_list))
    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.")
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(head,
                 'info',
                 'ModelScore',
                 annotation_number='.',
                 entry_type='String',
                 description="PHRED score for genotype models.")
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=(
            "List of compound pairs for this variant."
            "The list is splitted on ',' family id is separated with compounds"
            "with ':'. Compounds are separated with '|'."))
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    # check_individuals is expected to raise IOError on a mismatch between
    # ped and vcf individuals (that is the only exception handled here)
    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(', '.join(
            family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))
        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes
    # there can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)

    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')

    try:
        model_checkers = [
            VariantAnnotator(task_queue=variant_queue,
                             results_queue=results,
                             families=families,
                             individuals=analysis_individuals,
                             phased=phased,
                             strict=strict,
                             vep=vep,
                             reduced_penetrance_genes=reduced_penetrance_genes)
            for i in range(num_model_checkers)
        ]
        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')
        if len(model_checkers) == 1:
            # A single worker: header and variants are written straight to
            # outfile, no temp file is involved
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='normal',
                                             outfile=outfile)
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            # Only the name is needed; the printer gets the path
            temp_file.close()
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='chromosome',
                                             outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')
        chromosome_list = get_batches(variants=variant_file,
                                      batch_queue=variant_queue,
                                      header=head,
                                      vep=vep,
                                      annotation_keyword=keyword)

        # One None per worker is put on the queue as a stop sign
        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        # Wait for the queued batches to be processed, then stop the printer
        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            # Sort the temp file and copy it to the final destination
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(variant_line=line,
                                  outfile=outfile,
                                  mode='modified',
                                  silent=silent)

    except Exception as err:
        # NOTE(review): `model_checkers` and `variant_printer` are unbound if
        # the failure happened before they were assigned, which would raise a
        # NameError here and hide the original error.
        logger.warning(err)
        for worker in model_checkers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()
    finally:
        # NOTE(review): `temp_file` is unbound if the failure happened before
        # the temp file was created (e.g. while starting a worker).
        if len(model_checkers) > 1:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
def annotate(family_file, variant_file, family_type, vep, silent, phased,
             strict, cadd_raw, whole_gene, annotation_dir, cadd_file,
             cadd_1000g, cadd_exac, cadd_esp, cadd_indels, thousand_g, exac,
             outfile, split_variants, processes, dbnfsp, verbose):
    """Annotate variants in a VCF file.\n

    The main function with genmod is to annotate genetic inheritance
    patterns for variants in families.
    Use flag --family together with a .ped file to describe which
    individuals in the vcf you wish to check inheritance for in the analysis.
    Individuals that are not present in the ped file will not be considered
    in the analysis.\n
    It is also possible to use genmod without a family file. In this case
    the variants will be annotated with a variety of options seen below.
    Please see docuentation on github.com/moonso/genmod or
    genmod/examples/readme.md for more information.
    """
    # Developer notes. They are kept out of the docstring because the
    # docstring may double as command line help text (TODO confirm).
    #
    # Flow: set up a VCFParser, optionally parse the family file and check
    # that its individuals exist in the VCF, register the INFO fields that
    # will be added, start `processes` VariantConsumer workers plus one
    # VariantPrinter writing to a temp file, then sort the temp file and
    # print header and variants to `outfile`.

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    # "name=value" for every truthy local, which at this point are the
    # call arguments
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i != 'config' and i != 'frame'
    ]

    if verbose:
        print('\nRunning GENMOD annotate version %s \n' % VERSION,
              file=sys.stderr)

    start_time_analysis = datetime.now()

    ######### Setup a variant parser #########

    # '-' means that the variants are read from stdin
    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin,
                                   split_variants=split_variants)
    else:
        variant_parser = VCFParser(infile=variant_file,
                                   split_variants=split_variants)

    # These are the individuals from the vcf file
    individuals = variant_parser.individuals

    head = variant_parser.metadata

    # Update version logging
    add_metadata(head,
                 'version',
                 'genmod',
                 version=VERSION,
                 command_line_string=' '.join(argument_list))

    ######### Parse the ped file (if there is one) #########

    families = {}

    if family_file:
        family_parser = FamilyParser(family_file, family_type)
        # The individuals in the ped file must be present in the variant file:
        families = family_parser.families

        for individual in family_parser.individuals:
            if individual not in individuals:
                warning(
                    'All individuals in ped file must be in vcf file! Aborting...'
                )
                warning('Individuals in PED file: %s' %
                        ' '.join(list(family_parser.individuals.keys())))
                warning('Individuals in VCF file: %s' % ' '.join(individuals))
                print('Exiting...', file=sys.stderr)
                sys.exit()

        # Header entries for the family based annotations
        add_metadata(
            head,
            'info',
            'GeneticModels',
            annotation_number='.',
            entry_type='String',
            description="':'-separated list of genetic models for this variant."
        )
        add_metadata(head,
                     'info',
                     'ModelScore',
                     annotation_number='1',
                     entry_type='Integer',
                     description="PHRED score for genotype models.")
        add_metadata(
            head,
            'info',
            'Compounds',
            annotation_number='.',
            entry_type='String',
            description=
            ("List of compound pairs for this variant."
             "The list is splitted on ',' family id is separated with compounds"
             "with ':'. Compounds are separated with '|'."))

    if verbose:
        if family_file:
            print('Starting analysis of families: %s' %
                  ','.join(list(families.keys())),
                  file=sys.stderr)
            print('Individuals included in analysis: %s\n' %
                  ','.join(list(family_parser.individuals.keys())),
                  file=sys.stderr)

    ######### Read to the annotation data structures #########

    gene_trees = {}
    exon_trees = {}

    # If the variants are already annotated we do not need to redo the annotation
    if not vep:
        gene_trees, exon_trees = load_annotations(annotation_dir, verbose)
        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.')
    else:
        if verbose:
            print('Using VEP annotation', file=sys.stderr)

    ######### Check which other annotations files that should be used in the analysis #########

    # Any of the cadd sources switches the CADD annotation on
    cadd_annotation = False

    if cadd_file:
        if verbose:
            print('Cadd file! %s' % cadd_file, file=sys.stderr)
        cadd_annotation = True
    if cadd_1000g:
        if verbose:
            print('Cadd 1000G file! %s' % cadd_1000g, file=sys.stderr)
        cadd_annotation = True
    if cadd_esp:
        if verbose:
            print('Cadd ESP6500 file! %s' % cadd_esp, file=sys.stderr)
        cadd_annotation = True
    if cadd_indels:
        if verbose:
            print('Cadd InDel file! %s' % cadd_indels, file=sys.stderr)
        cadd_annotation = True
    if cadd_exac:
        if verbose:
            print('Cadd ExAC file! %s' % cadd_exac, file=sys.stderr)
        cadd_annotation = True

    if cadd_annotation:
        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='A',
            entry_type='Float',
            description="The CADD relative score for this alternative.")
        if cadd_raw:
            add_metadata(
                head,
                'info',
                'CADD_raw',
                annotation_number='A',
                entry_type='Float',
                description="The CADD raw score(s) for this alternative(s).")

    if thousand_g:
        if verbose:
            print('1000G frequency file! %s' % thousand_g, file=sys.stderr)
        add_metadata(head,
                     'info',
                     '1000G_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the 1000G database.")
    if exac:
        if verbose:
            print('ExAC frequency file! %s' % exac, file=sys.stderr)
        add_metadata(head,
                     'info',
                     'ExAC_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the ExAC database.")

    # NOTE(review): no header entry is registered here for dbnfsp
    if dbnfsp:
        if verbose:
            print('dbNFSP file! %s' % dbnfsp, file=sys.stderr)

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that run the analysis
    if cadd_annotation:
        # We need more power when annotating cadd scores:
        # But if flag is used that overrides
        # (the count is only raised when it equals min(4, cpu_count()),
        # presumably the default value of `processes` - TODO confirm)
        if num_model_checkers == min(4, cpu_count()):
            num_model_checkers = min(8, cpu_count())

    if verbose:
        print('Number of CPU:s %s' % cpu_count(), file=sys.stderr)
        print('Number of model checkers: %s' % num_model_checkers,
              file=sys.stderr)

    # We use a temp file to store the processed variants
    temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()
    # Open the temp file with codecs
    # NOTE(review): this handle and the temp file itself are only cleaned up
    # on the success path below; an exception in between leaves both behind.
    temporary_variant_file = open(temp_file.name,
                                  mode='w',
                                  encoding='utf-8',
                                  errors='replace')

    # These are the workers that do the heavy part of the analysis
    model_checkers = [
        VariantConsumer(variant_queue, results, families, phased, vep,
                        cadd_raw, cadd_file, cadd_1000g, cadd_exac, cadd_esp,
                        cadd_indels, thousand_g, exac, dbnfsp, strict, verbose)
        for i in range(num_model_checkers)
    ]

    for w in model_checkers:
        w.start()

    # This process prints the variants to temporary files
    var_printer = VariantPrinter(results,
                                 temporary_variant_file,
                                 head,
                                 mode='chromosome',
                                 verbosity=verbose)
    var_printer.start()

    start_time_variant_parsing = datetime.now()

    if verbose:
        print('Start parsing the variants ... \n', file=sys.stderr)

    # This process parses the original vcf and create batches to put in the variant queue:
    chromosome_list = get_batches(variant_parser, variant_queue, individuals,
                                  gene_trees, exon_trees, phased, vep,
                                  whole_gene, verbose)

    # Put stop signs in the variant queue, one None per worker
    for i in range(num_model_checkers):
        variant_queue.put(None)

    # Wait for the queued batches to be processed, then stop the printer
    variant_queue.join()
    results.put(None)
    var_printer.join()

    temporary_variant_file.close()

    if verbose:
        print('Cromosomes found in variant file: %s \n' %
              ','.join(chromosome_list),
              file=sys.stderr)
        print('Models checked!\n', file=sys.stderr)

    # Sort the temp file, then print the header followed by the variants
    sort_variants(temp_file.name, mode='chromosome', verbose=verbose)

    print_headers(head, outfile, silent)

    print_variants(temp_file.name, outfile, mode='modified', silent=silent)

    # Remove all temp files:
    os.remove(temp_file.name)

    if verbose:
        print('Time for whole analyis: %s' %
              str(datetime.now() - start_time_analysis),
              file=sys.stderr)
def get_case(scout_configs, family_type):
    """
    Take a case file and return the case on the specified format.

    Only one case per pedigree file is allowed.

    Args:
        family_type : A string that describe the format of the ped file
        scout_configs (dict): A dictionary with scout info. 'ped' (path to
            the pedigree file) and 'owner' are required. The optional keys
            read here are 'collaborators', 'igv_vcf', 'human_genome_build',
            'human_genome_version', 'analysis_date', 'madeline',
            'coverage_report', 'gene_lists', 'default_gene_lists' and
            'individuals'.

    Returns:
        case : A mongo engine object that describe the case found in the
            pedigree file.

    Raises:
        KeyError: If 'ped' or 'owner' is missing from scout_configs
        SyntaxError: If the pedigree file does not hold exactly one case
    """
    logger = logging.getLogger(__name__)

    # Use ped_parser to get information from the pedigree file. The handle is
    # closed as soon as the parser is built.
    # NOTE(review): assumes FamilyParser consumes the stream inside its
    # constructor - confirm against the parser implementation.
    with open(scout_configs['ped'], 'r') as ped_handle:
        case_parser = FamilyParser(ped_handle, family_type=family_type)

    # Check if there is a owner of the case
    try:
        owner = scout_configs['owner']
    except KeyError:
        logger.error("Scout config must include a owner")
        raise

    # Check if there are any collaborators for the case, a case can belong to
    # several institutes. The owner is always a collaborator.
    collaborators = set(scout_configs.get('collaborators', None) or ())
    collaborators.add(owner)

    logger.info("Collaborators found: {0}".format(','.join(collaborators)))

    logger.info("Cases found in ped file: {0}".format(
        ', '.join(list(case_parser.families.keys()))))

    if len(case_parser.families) != 1:
        raise SyntaxError("Only one case per ped file is allowed")

    # Exactly one family is present at this point
    case_id = next(iter(case_parser.families))
    case = case_parser.families[case_id]

    # Create a mongo engine case
    mongo_case_id = '_'.join([owner, case_id])
    mongo_case = Case(case_id=mongo_case_id)
    logger.debug("Setting case id to: {0}".format(mongo_case_id))

    mongo_case['owner'] = owner
    logger.debug("Setting owner to: {0}".format(owner))

    mongo_case['collaborators'] = list(collaborators)
    logger.debug("Setting collaborators to: {0}".format(
        ', '.join(collaborators)))

    # We use the family id as display name for scout
    mongo_case['display_name'] = case_id
    logger.debug("Setting display name to: {0}".format(case_id))

    # Get the path of vcf from configs
    igv_vcf = scout_configs.get('igv_vcf', '')
    mongo_case['vcf_file'] = igv_vcf
    logger.debug("Setting igv vcf file to: {0}".format(igv_vcf))

    # Add the genome build information
    genome_build = scout_configs.get('human_genome_build', '')
    mongo_case['genome_build'] = genome_build
    logger.debug("Setting genome build to: {0}".format(genome_build))

    # Get the genome version
    genome_version = scout_configs.get('human_genome_version', '0')
    mongo_case['genome_version'] = float(genome_version)
    logger.debug("Setting genome version to: {0}".format(genome_version))

    # Check the analysis date
    analysis_date = scout_configs.get('analysis_date', '')
    mongo_case['analysis_date'] = analysis_date
    logger.debug("Setting analysis date to: {0}".format(analysis_date))

    # Add the pedigree picture, this is a xml file that will be read and
    # saved in the mongo database. The default is a path that is not
    # expected to exist, so a missing entry ends up in the else branch.
    madeline_path = path(scout_configs.get('madeline', '/__menoexist.tXt'))
    if madeline_path.exists():
        logger.debug("Found madeline info")
        with madeline_path.open('r') as handle:
            mongo_case['madeline_info'] = handle.read()
            logger.debug("Madeline file was read succesfully")
    else:
        logger.info("No madeline file found. Skipping madeline file.")

    # Add the coverage report, read as bytes
    coverage_report_path = path(
        scout_configs.get('coverage_report', '/__menoexist.tXt'))
    if coverage_report_path.exists():
        logger.debug("Found a coverage report")
        with coverage_report_path.open('rb') as handle:
            mongo_case['coverage_report'] = handle.read()
            logger.debug("Coverage was read succesfully")
    else:
        logger.info("No coverage report found. Skipping coverage report.")

    # Gene lists are split on their type; everything that is not 'clinical'
    # is treated as a research list
    clinical_gene_lists = []
    research_gene_lists = []

    for gene_list in scout_configs.get('gene_lists', {}):
        logger.info("Found gene list {0}".format(gene_list))
        list_info = scout_configs['gene_lists'][gene_list]

        list_type = list_info.get('type', 'clinical')
        list_id = list_info.get('name', '')
        version = float(list_info.get('version', 0))
        date = list_info.get('date', '')
        display_name = list_info.get('full_name', list_id)

        list_object = GeneList(
            list_id=list_id,
            version=version,
            date=date,
            display_name=display_name
        )

        if list_type == 'clinical':
            logger.info("Adding {0} to clinical gene lists".format(list_object))
            clinical_gene_lists.append(list_object)
        else:
            logger.info("Adding {0} to research gene lists".format(list_object))
            research_gene_lists.append(list_object)

    mongo_case['clinical_gene_lists'] = clinical_gene_lists
    mongo_case['research_gene_lists'] = research_gene_lists

    default_gene_lists = scout_configs.get('default_gene_lists', [])
    mongo_case['default_gene_lists'] = list(default_gene_lists)

    # Per individual settings from the config, keyed on individual id
    individual_configs = scout_configs.get('individuals', {})

    individuals = []
    for individual_id in case.individuals:
        individual = case.individuals[individual_id]
        # Get info from configs for the individual
        config_info = individual_configs.get(individual_id, {})

        ind = Individual()
        ind['individual_id'] = individual_id
        ind['father'] = individual.father
        ind['mother'] = individual.mother
        ind['display_name'] = individual.extra_info.get('display_name',
                                                        individual_id)
        ind['sex'] = str(individual.sex)
        ind['phenotype'] = individual.phenotype
        # Path to the bam file for IGV:
        ind['bam_file'] = config_info.get('bam_path', '')
        ind['capture_kits'] = config_info.get('capture_kit', [])

        individuals.append(ind)

    mongo_case['individuals'] = individuals

    # Hand the case back to the caller, as documented
    return mongo_case
def get_individuals(variant_source, case_lines=None, case_type='ped',
                    variant_mode='vcf'):
    """Get the individuals from a vcf file, gemini database, and/or a ped file.

    Args:
        variant_source (str): Path to a variant source
        case_lines(Iterable): Ped like lines
        case_type(str): Format of ped lines
        variant_mode(str): 'vcf' or 'gemini'. Any other value gives an empty
            result.

    Returns:
        individuals (list): list with Individuals

    Raises:
        IOError: If the ped lines do not describe exactly one family
        PedigreeError: If an individual exists in the ped lines but not in
            the vcf
    """
    individuals = []
    ind_dict = {}

    if variant_mode == 'vcf':
        head = get_header(variant_source)
        # Dictionary with ind_id:index where index show where in vcf ind info is
        ind_dict = {
            ind: index for index, ind in enumerate(head.individuals)
        }

        if case_lines:
            # read individuals from ped file
            family_parser = FamilyParser(case_lines, family_type=case_type)
            families = family_parser.families
            logger.debug("Found families {0}".format(
                ','.join(list(families.keys()))))
            if len(families) != 1:
                message = "Only one family can be used with vcf adapter"
                logger.error(message)
                # Carry the reason in the exception, not only in the log
                raise IOError(message)

            case_id = list(families.keys())[0]
            logger.debug("Family used in analysis: {0}".format(case_id))

            for ind_id in family_parser.individuals:
                ind = family_parser.individuals[ind_id]
                logger.info("Found individual {0}".format(ind.individual_id))
                # Only the lookup is guarded, so a KeyError raised while
                # building the Individual is not reported as a missing sample
                try:
                    ind_index = ind_dict[ind_id]
                except KeyError:
                    # This is the case when individuals in ped does not exist
                    # in vcf
                    raise PedigreeError(
                        family_id=case_id,
                        individual_id=ind_id,
                        message="Individual {0} exists in ped file but not in vcf".format(ind_id)
                    )
                individuals.append(
                    Individual(
                        ind_id=ind_id,
                        case_id=case_id,
                        mother=ind.mother,
                        father=ind.father,
                        sex=str(ind.sex),
                        phenotype=str(ind.phenotype),
                        variant_source=variant_source,
                        ind_index=ind_index,
                    )
                )
        else:
            # Without ped lines the file name is used as case id and every
            # individual in the vcf header is included
            case_id = os.path.basename(variant_source)

            for ind, ind_index in ind_dict.items():
                individual = Individual(
                    ind_id=ind,
                    case_id=case_id,
                    variant_source=variant_source,
                    ind_index=ind_index
                )
                individuals.append(individual)

                logger.debug("Found individual {0} in {1}".format(
                    ind, variant_source))

    elif variant_mode == 'gemini':
        gq = GeminiQuery(variant_source)
        # Dictionary with sample to index in the gemini database
        ind_dict = gq.sample_to_idx
        query = "SELECT * from samples"
        gq.run(query)
        for individual in gq:
            logger.debug("Found individual {0} with family id {1}".format(
                individual['name'], individual['family_id']))
            individuals.append(
                Individual(
                    ind_id=individual['name'],
                    case_id=individual['family_id'],
                    mother=individual['maternal_id'],
                    father=individual['paternal_id'],
                    sex=individual['sex'],
                    phenotype=individual['phenotype'],
                    ind_index=ind_dict.get(individual['name']),
                    variant_source=variant_source,
                    bam_path=None)
            )

    return individuals