def load_case(adapter, case_obj, update=False):
    """Load a case to the database

    If the case is already stored it is only accepted when ``update`` is
    true; the new information is then merged into the stored case with
    ``update_case`` before it is written.

    Args:
        adapter: Connection to database
        case_obj: dict
        update(bool): If existing case should be updated

    Returns:
        case_obj(models.Case): The case that was handed to ``adapter.add_case``

    Raises:
        CaseError: If the case already exists and ``update`` is false.
            Errors raised by ``update_case`` or ``adapter.add_case``
            propagate unchanged.
    """
    # Check if the case already exists in database.
    existing_case = adapter.case(case_obj)
    if existing_case:
        if not update:
            raise CaseError("Case {0} already exists in database".format(
                case_obj['case_id']))
        case_obj = update_case(case_obj, existing_case)

    # Add the case to database. No try/except here: catching an error only
    # to re-raise it adds nothing, so failures simply propagate to the caller.
    adapter.add_case(case_obj, update=update)

    return case_obj
def get_case(family_lines, family_type='ped', vcf_path=None):
    """Parse family lines and return the single family they describe

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines
        vcf_path(str): Path to VCF. Not used by this function.

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If the family lines describe more than one family
    """
    LOG.info("Parsing family information")

    parser = FamilyParser(family_lines, family_type)
    family_ids = list(parser.families.keys())

    LOG.info("Found families {0}".format(', '.join(family_ids)))

    # A load is tied to exactly one case, so several families is an error
    if len(family_ids) > 1:
        raise CaseError("Only one family per load can be used")

    sole_family_id = family_ids[0]
    return parser.families[sole_family_id]
def update_case(case_obj, existing_case):
    """Merge information about a new VCF into a copy of an existing case

    For each variant type (snv, sv) where ``case_obj`` carries a VCF path, the
    path, the variant count and the individual information are copied onto a
    deep copy of ``existing_case``. ``existing_case`` itself is not modified.

    Args:
        case_obj(models.Case): Case holding the new information
        existing_case(models.Case): Case as it is currently stored

    Returns:
        updated_case(models.Case): Copy of the existing case with the new
            fields filled in

    Raises:
        CaseError: If the existing case already has a VCF of a type that
            ``case_obj`` tries to add
    """
    # One row per variant type:
    # (label, vcf path key, variant count key, individuals key, positions key)
    field_groups = (
        ('snv', 'vcf_path', 'nr_variants', 'individuals', '_inds'),
        ('sv', 'vcf_sv_path', 'nr_sv_variants', 'sv_individuals', '_sv_inds'),
    )

    merged = deepcopy(existing_case)

    for variant_type, path_key, count_key, inds_key, positions_key in field_groups:
        # Nothing to add for this variant type
        if not case_obj.get(path_key):
            continue

        # Replacing a VCF that is already loaded is not allowed
        if merged.get(path_key):
            LOG.warning("VCF of type %s already exists in case", variant_type)
            raise CaseError("Can not replace VCF in existing case")

        merged[path_key] = case_obj[path_key]
        merged[count_key] = case_obj[count_key]
        merged[inds_key] = case_obj[inds_key]
        merged[positions_key] = case_obj[positions_key]

    return merged
def update_case(case_obj, existing_case):
    """Return a copy of an existing case extended with newly supplied VCF data

    Only variant types for which ``case_obj`` has a VCF path are touched. For
    those, the VCF path, the number of variants and the individual fields are
    taken from ``case_obj``. The stored case is deep copied first, so
    ``existing_case`` is left as it was.

    Args:
        case_obj(models.Case): The case with new information
        existing_case(models.Case): The case found in the database

    Returns:
        updated_case(models.Case): Updated existing case

    Raises:
        CaseError: If a VCF of the same type is already present in the
            existing case
    """
    vcf_keys = ("vcf_path", "vcf_sv_path")
    count_keys = ("nr_variants", "nr_sv_variants")
    individual_keys = (("individuals", "_inds"), ("sv_individuals", "_sv_inds"))

    updated_case = deepcopy(existing_case)

    for file_name, count_key, (inds_key, ind_pos_key) in zip(
        vcf_keys, count_keys, individual_keys
    ):
        variant_type = "sv" if file_name == "vcf_sv_path" else "snv"

        if case_obj.get(file_name):
            if updated_case.get(file_name):
                # A VCF that is already loaded may never be overwritten
                LOG.warning("VCF of type %s already exists in case", variant_type)
                raise CaseError("Can not replace VCF in existing case")

            # Copy every field that belongs to this variant type
            for key in (file_name, count_key, inds_key, ind_pos_key):
                updated_case[key] = case_obj[key]

    return updated_case
def get_family(family_lines, family_type='ped'):
    """Parse family lines and return the one family found in them

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If more than one family is present in the family lines
    """
    logger.info("Parsing family information")

    parser = FamilyParser(family_lines, family_type)
    family_ids = list(parser.families.keys())

    logger.info("Found families {0}".format(', '.join(family_ids)))

    # Loading is done one case at a time
    if len(family_ids) > 1:
        raise CaseError("Only one family per load can be used")

    only_family_id = family_ids[0]
    return parser.families[only_family_id]
def get_formated_variant(variant, individuals, family_id, gq_treshold=None):
    """Return a formated variant line

    Take a vcf formated variant line and return a dictionary with the
    relevant information.

    If criterias are not fullfilled, eg. no individual has a variant call
    with genotype quality at or above the gq treshold, then an empty
    dictionary is returned.

    Args:
        variant (dict): A variant dictionary. Read keys are 'CHROM', 'POS',
            'REF', 'ALT', 'FORMAT' and one key per individual id holding the
            raw genotype call.
        individuals (dict): Maps individual id to an individual object; the
            ``sex`` attribute of the object is read.
        family_id (str): The family id
        gq_treshold (int): Lowest accepted genotype quality. Defaults to 20
            when None; an explicit 0 is respected.

    Return:
        formated_variant (dict): A variant dictionary

    Raises:
        Exception: If the variant has more than one alternative allele
        CaseError: If an individual is missing from the variant
    """
    # Only fall back to the default when no treshold was given. Using
    # ``gq_treshold or 20`` would silently turn an explicit 0 into 20.
    if gq_treshold is None:
        gq_treshold = 20

    # Remove a leading 'chr' prefix. str.lstrip('chr') must not be used here:
    # it strips any leading 'c', 'h' and 'r' characters and would turn a
    # contig such as 'hs37d5' into 's37d5'.
    chrom = variant['CHROM']
    if chrom.startswith('chr'):
        chrom = chrom[3:]

    pos = int(variant['POS'])
    ref = variant['REF']
    alt = variant['ALT']

    formated_variant = {}

    if ',' in alt:
        raise Exception("Multi allele calls are not allowed.")

    format_field = variant['FORMAT'].split(':')

    found_variant = False
    found_homozygote = False
    found_hemizygote = False

    for ind_id in individuals:
        ind_obj = individuals[ind_id]

        if ind_id in variant:
            raw_gt_call = variant[ind_id]
        else:
            raise CaseError("Individual {0} from ped does not exist in"\
                            " vcf".format(ind_id))

        # Pair the FORMAT keys with the values of this individuals call
        gt_call = dict(zip(format_field, raw_gt_call.split(':')))
        genotype = Genotype(**gt_call)

        if genotype.genotype_quality >= gq_treshold:
            if genotype.has_variant:
                logger.debug("Found variant in affected")
                found_variant = True

                # If variant in X or Y and individual is male,
                # we need to check hemizygosity
                if chrom in ['X', 'Y'] and ind_obj.sex == 1:
                    if not check_par(chrom, pos):
                        logger.debug("Found hemizygous variant")
                        found_hemizygote = True

                if genotype.homo_alt:
                    logger.debug("Found homozygote alternative variant")
                    found_homozygote = True

    if found_variant:
        formated_variant['_id'] = '_'.join([chrom, str(pos), ref, alt])
        formated_variant['chrom'] = chrom
        formated_variant['pos'] = pos
        formated_variant['ref'] = ref
        formated_variant['alt'] = alt
        formated_variant['homozygote'] = 0
        formated_variant['hemizygote'] = 0

        # Hemizygote takes precedence over homozygote
        if found_hemizygote:
            formated_variant['hemizygote'] = 1
        elif found_homozygote:
            formated_variant['homozygote'] = 1

        if family_id:
            formated_variant['family_id'] = family_id

    return formated_variant
try: # If a profile dict exists, get the profile for ind_id profile = profiles[ind_id] if profiles else None # If matching samples are found, get these samples for ind_id similar_samples = matches[ind_id] if matches else None ind_obj = Individual( ind_id=ind_id, case_id=case_id, ind_index=_ind_pos[ind_id], sex=individual.sex, profile=profile, similar_samples=similar_samples, ) ind_objs.append(dict(ind_obj)) except KeyError: raise CaseError("Ind %s in ped file does not exist in VCF", ind_id) else: # If there where no family file we can create individuals from what we know for ind_id in individual_positions: profile = profiles[ind_id] if profiles else None similar_samples = matches[ind_id] if matches else None ind_obj = Individual( ind_id=ind_id, case_id=case_id, ind_index=individual_positions[ind_id], profile=profile, similar_samples=similar_samples, ) ind_objs.append(dict(ind_obj)) # Add individuals to the correct variant type
def update_database(adapter, variant_file=None, sv_file=None, family_file=None,
                    family_type='ped', skip_case_id=False, gq_treshold=None,
                    case_id=None, max_window=3000):
    """Add variants from new VCF file(s) to a case that is already stored

    The VCF files are checked, a case object is built from them and merged
    into the stored case via ``load_case(update=True)``. After that the
    variants of every VCF on the resulting case are loaded. If loading fails,
    ``delete`` is called with the previously stored case and the error is
    raised again.

    Args:
        adapter: Connection to database
        variant_file(str): Path to variant file
        sv_file(str): Path to sv variant file
        family_file(str): Path to family file
        family_type(str): Format of family file
        skip_case_id(bool): If no case information should be added to variants
        gq_treshold(int): If only quality variants should be considered
        case_id(str): If different case id than the one in family file should
            be used
        max_window(int): Specify the max size for sv windows

    Returns:
        nr_inserted(int)

    Raises:
        SyntaxError: If ``gq_treshold`` is set and a VCF does not define GQ
        CaseError: If the case does not exist in the database
    """
    paths_to_check = []

    # Collect information about the SNV file, if one was given
    nr_variants = None
    vcf_individuals = None
    if variant_file:
        snv_info = check_vcf(variant_file)
        nr_variants = snv_info['nr_variants']
        variant_type = snv_info['variant_type']
        paths_to_check.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = snv_info['individuals']

    # Same thing for the SV file
    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        sv_info = check_vcf(sv_file, 'sv')
        nr_sv_variants = sv_info['nr_variants']
        paths_to_check.append(sv_file)
        sv_individuals = sv_info['individuals']

    # If a gq treshold is used the variants needs to have GQ
    for vcf_path in paths_to_check:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(vcf_path)
        if gq_treshold and not vcf.contains('GQ'):
            LOG.warning(
                'Set gq-treshold to 0 or add info to vcf {0}'.format(vcf_path))
            raise SyntaxError('GQ is not defined in vcf header')

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        with open(family_file, 'r') as family_lines:
            family = get_case(family_lines=family_lines, family_type=family_type)
        family_id = family.family_id

    # An explicit case id wins over the id from the family file
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
    )

    # Updating only makes sense for a case that is already stored
    existing_case = adapter.case(case_obj)
    if not existing_case:
        raise CaseError("Case {} does not exist in database".format(
            case_obj['case_id']))

    # Update the existing case in database
    case_obj = load_case(
        adapter=adapter,
        case_obj=case_obj,
        update=True,
    )

    # If case was successfully updated we can store the variants
    nr_inserted = 0
    for file_type, variant_type in (('vcf_path', 'snv'), ('vcf_sv_path', 'sv')):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
                update=True,
                existing_case=existing_case,
            )
            raise err

    return nr_inserted
def load_database(adapter, variant_file, family_file, nr_variants=None,
                  family_type='ped', skip_case_id=False, gq_treshold=None,
                  case_id=None):
    """Load the database with a case and its variants

    The VCF is opened, the family file is parsed and every individual in the
    family is required to be a sample in the VCF. The case is then stored
    with ``load_family`` and its variants with ``load_variants``.

    Args:
        adapter: Connection to database
        variant_file(str): Path to variant file
        family_file(str): Path to family file
        nr_variants(int): Passed on to ``load_variants``
        family_type(str): Format of family file
        skip_case_id(bool): Passed on to ``load_variants``
        gq_treshold(int): Passed on to ``load_variants``. When set, the VCF
            has to define GQ.
        case_id(str): Used instead of the family id from the family file when
            given

    Raises:
        SyntaxError: If ``gq_treshold`` is set and the VCF does not define GQ
        CaseError: If an individual in the family file is not in the VCF
    """
    vcf = get_vcf(variant_file)

    # A gq treshold can only be applied if the VCF carries GQ
    if gq_treshold and not vcf.contains('GQ'):
        logger.warning('Set gq-treshold to 0 or add info to vcf')
        raise SyntaxError('GQ is not defined in vcf header')

    with open(family_file, 'r') as family_lines:
        family = get_family(family_lines=family_lines, family_type=family_type)

    # An explicit case id overrides the id found in the family file
    ped_family_id = family.family_id
    family_id = case_id or ped_family_id

    if not family.affected_individuals:
        logger.warning("No affected individuals could be found in ped file")

    logger.debug("Found affected individuals in ped file: {0}".format(
        ', '.join(family.affected_individuals)))

    # Map each VCF sample to its column position
    vcf_individuals = vcf.samples
    ind_positions = {
        ind_id: position for position, ind_id in enumerate(vcf_individuals)
    }

    # Every individual in the family file has to be present in the VCF
    for ped_ind_id in family.individuals:
        if ped_ind_id in ind_positions:
            continue
        raise CaseError(
            "Ind {0} in ped file does not exist in VCF".format(ped_ind_id))

    load_family(adapter=adapter, case_id=family_id, vcf_path=variant_file)

    try:
        load_variants(
            adapter=adapter,
            family_id=family_id,
            individuals=family.individuals,
            vcf=vcf,
            ind_positions=ind_positions,
            nr_variants=nr_variants,
            skip_case_id=skip_case_id,
            gq_treshold=gq_treshold,
        )
    except Exception as err:
        logger.warning(err)
        ##TODO Delete inserted information here
        raise err