def get_variant(request, get_institute):
    """Create a dummy research variant, store it and schedule its removal.

    Args:
        request: Object exposing ``addfinalizer``; the cleanup callback is
            registered on it (presumably a pytest fixture request).
        get_institute: Value stored in the variant's ``institute`` field.

    Returns:
        The saved ``Variant`` document.
    """
    logger.info("setup a variant")

    # Placeholder values; every field is a fixed literal except institute.
    variant_fields = dict(
        document_id="document_id",
        variant_id="variant_id",
        display_name="display_name",
        variant_type='research',
        case_id='case_id',
        chromosome='1',
        position=10,
        reference="A",
        alternative="C",
        rank_score=10.0,
        variant_rank=1,
        institute=get_institute,
    )
    variant = Variant(**variant_fields)

    logger.info("Adding variant to database")
    variant.save()

    def _remove_variant():
        # Delete the document again once the requester is finished with it.
        print('\n')
        logger.info('Removing variant')
        variant.delete()
        logger.info('Case variant')

    request.addfinalizer(_remove_variant)
    return variant
def variants(self, case_id, query=None, variant_ids=None,
             nr_of_variants=10, skip=0):
    """Yield the variants of a case, ordered by variant rank.

    At most ``nr_of_variants`` variants are yielded, starting after the
    first ``skip`` variants in rank order.

    Arguments:
        case_id : A string that represents the case
        query : A dictionary with querys for the database
        variant_ids : Optional collection of variant ids. When given (and
            non-empty) the result size is capped at ``len(variant_ids)``
            instead of ``nr_of_variants``.
        nr_of_variants : Maximum number of variants to yield
        skip : Number of leading variants (in rank order) to skip

    Returns:
        A generator with the variants
    """
    if variant_ids:
        nr_of_variants = len(variant_ids)

    mongo_query = self.build_query(case_id, query, variant_ids)

    # limit() counts the documents returned *after* skip() has been applied,
    # so the page size is passed unchanged. Adding ``skip`` to it would
    # return ``skip`` extra documents and make consecutive pages overlap.
    result = (Variant.objects(__raw__=mongo_query)
              .order_by('variant_rank')
              .skip(skip)
              .limit(nr_of_variants))

    for variant in result:
        yield variant
def variants(self, case_id, query=None, variant_ids=None,
             nr_of_variants=10, skip=0):
    """Yield the variants of a case, ordered by variant rank.

    At most ``nr_of_variants`` variants are yielded, starting after the
    first ``skip`` variants in rank order.

    Arguments:
        case_id(str): A string that represents the case
        query(dict): A dictionary with querys for the database
        variant_ids: Optional collection of variant ids. When given (and
            non-empty) the result size is capped at ``len(variant_ids)``
            instead of ``nr_of_variants``.
        nr_of_variants(int): Maximum number of variants to yield
        skip(int): Number of leading variants (in rank order) to skip

    Yields:
        Variant objects
    """
    logger.info("Fetching variants from {0}".format(case_id))

    if variant_ids:
        nr_of_variants = len(variant_ids)

    mongo_query = build_query(case_id, query, variant_ids)

    # limit() counts the documents returned *after* skip() has been applied,
    # so the page size is passed unchanged. Adding ``skip`` to it would
    # return ``skip`` extra documents and make consecutive pages overlap.
    result = (Variant.objects(__raw__=mongo_query)
              .order_by('variant_rank')
              .skip(skip)
              .limit(nr_of_variants))

    for variant in result:
        yield variant
def delete_variants(self, case_id, variant_type):
    """Remove all variants of one type that belong to a case.

    This is used when a case is reanalyzed.

    Args:
        case_id(str): The case id
        variant_type(str): 'research' or 'clinical'
    """
    start_message = "Deleting old {0} variants for case {1}".format(
        variant_type, case_id)
    logger.info(start_message)

    # Select the matching documents first, then delete them in one call.
    matching_variants = Variant.objects(case_id=case_id,
                                        variant_type=variant_type)
    nr_deleted = matching_variants.delete()

    logger.info("{0} variants deleted".format(nr_deleted))
    logger.debug("Variants deleted")
def load_mongo_db(scout_configs, vcf_configs=None, family_type='cmms',
                  mongo_db='variantDatabase', variant_type='clinical',
                  username=None, password=None, port=27017, host='localhost',
                  rank_score_threshold=0, variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    The case is read from the ped file, its collaborating institutes are
    added when missing, the case is updated, the old variants of
    ``variant_type`` for the case are deleted and the variants from the vcf
    are inserted.

    Args:
        scout_configs: Mapping with at least the keys 'load_vcf' (vcf file)
            and 'ped' (ped file); also handed to ``get_case``.
        vcf_configs: Passed to ``ConfigParser`` to build the config object.
        family_type: Passed to ``get_case``.
        mongo_db: Name of the database to connect to.
        variant_type: Variant type used when deleting the old variants and
            when building the new ones.
        username, password, port, host: Connection settings passed to
            ``connect``.
        rank_score_threshold: Loading stops at the first variant whose rank
            score for the case is not above this value.
        variant_number_threshold: Maximum number of variants to insert.
    """
    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    # Locate the vcf file (no format validation is done here)
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db, host=host, port=port, username=username,
            password=password)
    variant_database = get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    # Parse the config file
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    # Get the case from the ped file
    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)
    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    # Add the institutes of the case that are not yet in the database
    for institute_name in case['collaborators']:
        if institute_name:
            institute = get_institute(institute_name)
            logger.info("Institute found: {0}".format(institute))
            try:
                Institute.objects.get(internal_id=institute.internal_id)
                logger.info(
                    "Institute {0} already in database".format(institute))
            except DoesNotExist:
                institute.save()
                logger.info(
                    "Adding new institute {0} to database".format(institute))

    logger.info("Updating case in database")
    update_case(case, variant_type, logger)

    # Get the variants and add them to the database
    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file, split_variants=True,
                               skip_info_check=True)
    nr_of_variants = 0

    # Old variants of this type are replaced by the ones loaded below
    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    start_inserting_variants = datetime.now()

    # Individuals of the case: individual id -> display name
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the individuals that also exist in the vcf file
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')

    for variant in variant_parser:
        logger.debug("Parsing variant {0}".format(variant['variant_id']))

        # Stop at the first variant that is not above the rank score
        # threshold.
        # NOTE(review): this presumably relies on the vcf being sorted on
        # descending rank score -- confirm.
        rank_score = float(variant['rank_scores'][case.display_name])
        if not rank_score > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # The counter holds the number of variants already inserted, so
        # ``>=`` is needed to stop at exactly the threshold (``>`` would let
        # one extra variant through).
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        # nr_of_variants doubles as the 1-based position of the variant
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)
        mongo_variant.save()

        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))
id_fields = [ variant['CHROM'], variant['POS'], variant['REF'], variant['ALT'], variant_type ] variant_id = generate_md5_key(id_fields) document_id = generate_md5_key(id_fields + case_id.split('_')) # Create the mongo variant object mongo_variant = Variant(document_id=document_id, variant_id=variant_id, variant_type=variant_type, case_id=case_id, display_name='_'.join(id_fields), chromosome=variant['CHROM'], position=int(variant['POS']), reference=variant['REF'], alternative=variant['ALT'], variant_rank=variant_count, quality=float(variant['QUAL']), filters=variant['FILTER'].split(';'), institute=institute) # If a variant belongs to any gene lists we check which ones mongo_variant['gene_lists'] = variant['info_dict'].get( config_object['VCF']['GeneLists']['vcf_info_key'], None) ################# Add the rank score and variant rank ################# # Get the rank score as specified in the config file. # This is central for displaying variants in scout.
def load_mongo_db(scout_configs, vcf_configs=None, family_type='cmms',
                  mongo_db='variantDatabase', variant_type='clinical',
                  username=None, password=None, port=27017, host='localhost',
                  rank_score_threshold=0, variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    The case is read from the ped file, its collaborating institutes are
    added when missing, the case is updated, the old variants of
    ``variant_type`` for the case are deleted, the variants from the vcf are
    inserted and finally the indexes are updated.

    Args:
        scout_configs: Mapping with at least the keys 'load_vcf' (vcf file)
            and 'ped' (ped file); also handed to ``get_case``.
        vcf_configs: Passed to ``ConfigParser`` to build the config object.
        family_type: Passed to ``get_case``.
        mongo_db: Name of the database to connect to.
        variant_type: Variant type used when deleting the old variants and
            when building the new ones.
        username, password, port, host: Connection settings passed to
            ``connect``.
        rank_score_threshold: Loading stops at the first variant whose rank
            score for the case is not above this value.
        variant_number_threshold: Maximum number of variants to insert.
    """
    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    # Locate the vcf file (no format validation is done here)
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db, host=host, port=port, username=username,
            password=password)
    # Database handle, needed for the index update at the end
    variant_database = get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    # Parse the config file
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    # Get the case from the ped file
    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)
    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    # Add the institutes of the case that are not yet in the database
    for institute_name in case['collaborators']:
        if institute_name:
            institute = get_institute(institute_name)
            logger.info("Institute found: {0}".format(institute))
            try:
                Institute.objects.get(internal_id=institute.internal_id)
                logger.info(
                    "Institute {0} already in database".format(institute))
            except DoesNotExist:
                institute.save()
                logger.info(
                    "Adding new institute {0} to database".format(institute))

    logger.info("Updating case in database")
    update_case(case, variant_type, logger)

    # Get the variants and add them to the database
    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file, split_variants=True,
                               skip_info_check=True)
    nr_of_variants = 0

    # Old variants of this type are replaced by the ones loaded below
    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    # Used to report how long the insertion took
    start_inserting_variants = datetime.now()

    # Individuals of the case: individual id -> display name
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the individuals that also exist in the vcf file
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')

    for variant in variant_parser:
        logger.debug("Parsing variant {0}".format(variant['variant_id']))

        # Stop at the first variant that is not above the rank score
        # threshold.
        # NOTE(review): this presumably relies on the vcf being sorted on
        # descending rank score -- confirm.
        rank_score = float(variant['rank_scores'][case.display_name])
        if not rank_score > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # The counter holds the number of variants already inserted, so
        # ``>=`` is needed to stop at exactly the threshold (``>`` would let
        # one extra variant through).
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        # nr_of_variants doubles as the 1-based position of the variant
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)
        mongo_variant.save()

        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))

    logger.info("Parsing variants done")
    logger.info("{0} variants inserted".format(nr_of_variants))
    logger.info("Time to insert variants: {0}".format(
        datetime.now() - start_inserting_variants))

    logger.info("Updating indexes")
    ensure_indexes(variant_database, logger)