def handle(self, *args, **options):
    """Parse a pedigree file and add/update its individuals and families in a project.

    Required options:
        project_id (str): GUID of the target seqr Project.
        pedigree_file (str): path of the pedigree table to parse.
        validate_only (bool): if True, parse and validate but don't write to the db.

    Raises:
        CommandError: if the project id is invalid, the pedigree file is missing,
            or the file can't be parsed.
    """
    # parse and validate args
    validate_only = options["validate_only"]
    project_guid = options["project_id"]
    pedigree_file_path = options["pedigree_file"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        # BUGFIX: this message previously used %(pedigree_file)s, which is not a local
        # variable, so the % formatting raised KeyError instead of the intended error.
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
def process_records(json_records, filename='ped_file'):
    """Nested helper: convert uploaded pedigree table rows into individual record dicts.

    Relies on names bound in an enclosing scope: `request`, `project` and a
    `warnings` list (see the `nonlocal` statement below).

    Raises:
        ErrorsWarningsException: if the parser reported any errors.
    """
    pedigree_records, errors, ped_warnings = parse_pedigree_table(json_records, filename, user=request.user, project=project)
    if errors:
        raise ErrorsWarningsException(errors, ped_warnings)
    # accumulate parser warnings onto the enclosing scope's `warnings` list so the
    # outer caller can surface them alongside the parsed records
    nonlocal warnings
    warnings += ped_warnings
    return pedigree_records
def add_individuals_from_pedigree_file(project, pedigree_file_path, validate_only=False):
    """Parse a pedigree file and add/update the individuals and families it describes.

    Args:
        project (Project): the seqr project the individuals belong to.
        pedigree_file_path (str): path of the pedigree table to parse.
        validate_only (bool): if True, parse and validate but don't write to the db.

    Raises:
        CommandError: if the pedigree file is missing or can't be parsed.
    """
    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        # BUGFIX: this message previously used %(pedigree_file)s, which is not a local
        # variable, so the % formatting raised KeyError instead of the intended error.
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
def test_parse_datstat_pedigree_table(self):
    """Parse a DATSTAT (RGP intake survey) export: two survey rows should expand into
    trio pedigree records (proband + inferred parents) per family, with the survey
    answers rendered into the proband's markdown `familyNotes`, and no errors/warnings."""
    records, errors, warnings = parse_pedigree_table( [[ 'DATSTAT_ALTPID', 'FAMILY_ID', 'DDP_CREATED', 'DDP_LASTUPDATED', 'RELATIONSHIP', 'RELATIONSHIP_SPECIFY', 'PATIENT_WEBSITE', 'DESCRIPTION', 'CLINICAL_DIAGNOSES', 'CLINICAL_DIAGNOSES_SPECIFY', 'GENETIC_DIAGNOSES', 'GENETIC_DIAGNOSES_SPECIFY', 'FIND_OUT.DOCTOR', 'FIND_OUT_DOCTOR_DETAILS', 'PATIENT_AGE', 'CONDITION_AGE', 'PATIENT_DECEASED', 'DECEASED_AGE', 'DECEASED_CAUSE', 'DECEASED_STORED_SAMPLE', 'PATIENT_SEX', 'RACE_LIST', 'PTETHNICITY', 'DOCTOR_TYPES_LIST', 'DOCTOR_TYPES_SPECIFY', 'TESTS.NONE', 'TESTS.NOT_SURE', 'TESTS.KARYOTYPE', 'TESTS.SINGLE_GENE_TESTING', 'TESTS.GENE_PANEL_TESTING', 'TESTS.MITOCHON_GENOME_SEQUENCING', 'TESTS.MICROARRAY', 'TESTS_MICROARRAY_YEAR', 'TESTS_MICROARRAY_LAB', 'TESTS_MICROARRAY_RELATIVE_LIST', 'TESTS_MICROARRAY_RELATIVE_SPEC', 'TESTS.WEXOME_SEQUENCING', 'TESTS_WEXOME_SEQUENCING_YEAR', 'TESTS_WEXOME_SEQUENCING_LAB', 'TESTS_WEXOME_SEQUENCING_REL_LI', 'TESTS_WEXOME_SEQUENCING_REL_SP', 'TESTS.WGENOME_SEQUENCING', 'TESTS_WGENOME_SEQUENCING_YEAR', 'TESTS_WGENOME_SEQUENCING_LAB', 'TESTS_WGENOME_SEQUENCING_REL_L', 'TESTS_WGENOME_SEQUENCING_REL_S', 'TESTS.OTHER', 'TEST_OTHER_SPECIFY', 'BIOPSY.NONE', 'BIOPSY', 'BIOPSY.OTHER', 'BIOPSY_OTHER_SPECIFY', 'OTHER_GENETIC_STUDIES', 'OTHER_GENETIC_STUDIES_SPECIFY', 'EXPECTING_GENETIC_RESULTS', 'SAME_CONDITION_MOM', 'CONDITION_AGE_MOM', 'ABLE_TO_PARTICIPATE_MOM', 'DECEASED_MOM', 'STORED_DNA_MOM', 'SAME_CONDITION_DAD', 'CONDITION_AGE_DAD', 'ABLE_TO_PARTICIPATE_DAD', 'DECEASED_DAD', 'STORED_DNA_DAD', 'NO_SIBLINGS', 'SIBLING_LIST', 'NO_CHILDREN', 'CHILD_LIST', 'NO_RELATIVE_AFFECTED', 'RELATIVE_LIST', 'FAMILY_INFO' ], [ '1518231365', '123', '2019-07-31T03:54:21UTC', '2019-08-01T14:12:40UTC', '6', 'Grandchild', 'wwww.myblog.com', 'I have a really debilitating probably genetic condition. 
I\xe2ve seen many specialists.', '1', 'SMA\xe2s', '1', 'Dwarfism\xe2', '1', 'Dr John Smith', '34', '21', '1', '33', 'heart attack', '2', '1', '["White","Asian","Pacific"]', '2', '["ClinGen","Neurologist","Cardiologist","Other"]', 'Pediatrician', '0', '0', '0', '1', '1', '0', '0', '', '', '', '', '1', '2018', 'UDN\xe2s lab', '["Parent","AuntUncle","NieceNephew","Other"]', 'Grandmother', '1', '', '', '', 'Grandmother', '1', 'Blood work', '0', 'MUSCLE,SKIN,OTHER: Muscle Biopsy, Skin Biopsy, Other Tissue Biopsy', '1', 'Bone\xe2s', '1', 'Undiagnosed Diseases Network', '2', '1', '19', '1', '', '', '2', '', '', '1', '2', '0', '[{"sex":"Female","age":"21","races":["White"],"ethnicity":"NonHispanic","sameCondition":"Yes","ageOnsetCondition":null,"ableToParticipate":"No","siblingId":"d18b9f4b-0995-45e9-9b00-e710d0004a3f"},{"sex":"","age":"17","races":["White"],"ethnicity":"NonHispanic","sameCondition":"","ageOnsetCondition":null,"ableToParticipate":"Yes","siblingId":"3ddc9015-3c2c-484c-b1de-502ba9ffc1e4"}]', '1', '', '0', '[{"sex":"Male","age":"44","races":["White"],"ethnicity":"NonHispanic","sameCondition":"No","ageOnsetCondition":null,"ableToParticipate":null,"siblingId":"bb87c69f-6c52-48b4-8854-e639d998abe7"}]', 'patient\xe2s uncle (dads brother) died from Fahrs disease at 70' ], [ 'b392fd78b440', '987', '2019-08-06T14:30:44UTC', '2019-08-06T15:18:48UTC', '8', 'Grandchild', '', '', '3', 'SMA', '2', 'Dwarfism', '0', 'Dr John Smith', '47', '2', '0', '33', 'heart attack', '2', '3', '["White"]', '3', '[]', 'Pediatrician', '0', '1', '0', '1', '1', '0', '0', '', '', '', '', '1', '2018', 'UDN', '["Parent","AuntUncle","NieceNephew","Other"]', 'Grandmother', '1', '', '', '', 'Grandmother', '1', 'Blood work', '1', 'NONE: This individual hasn\'t had a biopsy', '1', 'Bone', '0', 'Undiagnosed Diseases Network', '2', '3', '19', '2', '3', '', '', '', '', '', '1', '1', 
'[{"sex":"Female","age":"21","races":["White"],"ethnicity":"NonHispanic","sameCondition":"Yes","ageOnsetCondition":null,"ableToParticipate":"No","siblingId":"d18b9f4b-0995-45e9-9b00-e710d0004a3f"},{"sex":"","age":"17","races":["White"],"ethnicity":"NonHispanic","sameCondition":"","ageOnsetCondition":null,"ableToParticipate":"Yes","siblingId":"3ddc9015-3c2c-484c-b1de-502ba9ffc1e4"}]', '0: No', '[{"sex":"Male","age":"12","races":["White"],"ethnicity":"NonHispanic","sameCondition":"No","ageOnsetCondition":null,"ableToParticipate":"Unsure","siblingId":"bb87c69f-6c52-48b4-8854-e639d998abe7"}]', '1', '', '' ]], FILENAME) note_1 = """#### Clinical Information __Patient is my:__ Grandchild (male) __Current Age:__ Patient is deceased, age 33, due to heart attack, sample not available __Age of Onset:__ 21 __Race/Ethnicity:__ White, Asian, Pacific; Not Hispanic __Case Description:__ I have a really debilitating probably genetic condition. Ive seen many specialists. __Clinical Diagnoses:__ Yes; SMAs __Genetic Diagnoses:__ Yes; Dwarfism __Website/Blog:__ Yes __Additional Information:__ patients uncle (dads brother) died from Fahrs disease at 70 #### Prior Testing __Referring Physician:__ Dr John Smith __Doctors Seen:__ Clinical geneticist, Neurologist, Cardiologist, Other: Pediatrician __Previous Testing:__ Yes; Single gene testing Gene panel testing Whole exome sequencing. Year: 2018, Lab: UDNs lab, Relatives: Parent, Aunt or Uncle, Niece or Nephew, Other: Grandmother Whole genome sequencing. 
Year: unspecified, Lab: unspecified, Relatives: not specified Other tests: Blood work __Biopsies Available:__ Muscle Biopsy, Skin Biopsy, Other Tissue Biopsy: Bones __Other Research Studies:__ Yes, Name of studies: Undiagnosed Diseases Network, Expecting results: No #### Family Information __Mother:__ affected, onset age 19, available __Father:__ unaffected, unavailable, deceased, sample not available __Siblings:__ Sister, age 21, affected, unavailable Sibling (unspecified sex), age 17, unspecified affected status, available __Children:__ None __Relatives:__ Male, age 44, affected, unspecified availability""" note_2 = """#### Clinical Information __Patient is my:__ Adult Child (unspecified sex) - unable to provide consent __Current Age:__ 47 __Age of Onset:__ 2 __Race/Ethnicity:__ White; Unknown __Case Description:__ __Clinical Diagnoses:__ Unknown/Unsure __Genetic Diagnoses:__ No __Website/Blog:__ No __Additional Information:__ None specified #### Prior Testing __Referring Physician:__ None __Doctors Seen:__ __Previous Testing:__ Not sure __Biopsies Available:__ None __Other Research Studies:__ No #### Family Information __Mother:__ unknown affected status, unavailable, unknown deceased status __Father:__ unknown affected status, unavailable, unspecified deceased status __Siblings:__ None __Children:__ Son, age 12, unaffected, unspecified availability __Relatives:__ None""" self.assertListEqual(records, [ { 'familyId': 'RGP_123', 'individualId': 'RGP_123_1', 'sex': 'F', 'affected': 'N' }, { 'familyId': 'RGP_123', 'individualId': 'RGP_123_2', 'sex': 'M', 'affected': 'N' }, { 'familyId': 'RGP_123', 'individualId': 'RGP_123_3', 'sex': 'M', 'affected': 'A', 'maternalId': 'RGP_123_1', 'paternalId': 'RGP_123_2', 'familyNotes': note_1 }, { 'familyId': 'RGP_987', 'individualId': 'RGP_987_1', 'sex': 'F', 'affected': 'N' }, { 'familyId': 'RGP_987', 'individualId': 'RGP_987_2', 'sex': 'M', 'affected': 'N' }, { 'familyId': 'RGP_987', 'individualId': 'RGP_987_3', 'sex': 'U', 
'affected': 'A', 'maternalId': 'RGP_987_1', 'paternalId': 'RGP_987_2', 'familyNotes': note_2 }, ]) self.assertListEqual(errors, []) self.assertListEqual(warnings, [])
def test_parse_pedigree_table(self):
    """Exercise parse_pedigree_table error paths (column-count mismatch, missing
    family/individual ids, invalid sex/affected/proband-relation values) and the
    happy path, including header-comment rows and cross-row consistency warnings."""
    records, errors, warnings = parse_pedigree_table( [['family_id', 'individual_id', 'sex', 'affected'], ['fam1', 'ind1', 'male']], FILENAME) self.assertListEqual(records, []) self.assertListEqual(errors, [ 'Error while parsing file: {}. Row 1 contains 3 columns: fam1, ind1, male, while header contains 4: family_id, individual_id, sex, affected' .format(FILENAME) ]) self.assertListEqual(warnings, []) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother' ], ['', '', 'male', 'u', '.', 'ind2']], FILENAME) self.assertListEqual(records, []) self.assertEqual(len(errors), 1) self.assertEqual( errors[0].split('\n')[0], "Error while converting {} rows to json: Family Id not specified in row #1:" .format(FILENAME)) self.assertListEqual(warnings, []) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother' ], ['fam1', '', 'male', 'u', '.', 'ind2']], FILENAME) self.assertListEqual(records, []) self.assertEqual(len(errors), 1) self.assertEqual( errors[0].split('\n')[0], "Error while converting {} rows to json: Individual Id not specified in row #1:" .format(FILENAME)) self.assertListEqual(warnings, []) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother' ], ['fam1', 'ind1', 'boy', 'u', '.', 'ind2']], FILENAME) self.assertListEqual(records, []) self.assertListEqual(errors, [ "Error while converting {} rows to json: Invalid value 'boy' for sex in row #1" .format(FILENAME) ]) self.assertListEqual(warnings, []) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother' ], ['fam1', 'ind1', 'male', 'no', '.', 'ind2']], FILENAME) self.assertListEqual(records, []) self.assertListEqual(errors, [ "Error while converting {} rows to json: Invalid value 'no' for affected status in row #1" 
.format(FILENAME) ]) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother', 'proband_relation' ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2', 'mom']], FILENAME) self.assertListEqual(records, []) self.assertListEqual(errors, [ 'Error while converting {} rows to json: Invalid value "mom" for proband relationship in row #1' .format(FILENAME) ]) records, errors, warnings = parse_pedigree_table([[ 'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother', 'proband_relation' ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2', 'mother' ], ['fam2', 'ind2', 'male', 'unknown', '.', '', '']], FILENAME) self.assertListEqual(records, [ { 'familyId': 'fam1', 'individualId': 'ind1', 'sex': 'M', 'affected': 'A', 'paternalId': 'ind3', 'maternalId': 'ind2', 'probandRelationship': 'M' }, { 'familyId': 'fam2', 'individualId': 'ind2', 'sex': 'M', 'affected': 'U', 'paternalId': '', 'maternalId': '', 'probandRelationship': '' }, ]) self.assertListEqual(errors, [ 'Invalid proband relationship "Mother" for ind1 with given gender Male', 'ind2 is recorded as Male and also as the mother of ind1', 'ind2 is recorded as the mother of ind1 but they have different family ids: fam2 and fam1', ]) self.assertListEqual(warnings, [ "ind3 is the father of ind1 but doesn't have a separate record in the table" ]) records, errors, warnings = parse_pedigree_table( [['A pedigree file'], ['# Some comments'], [ '#family_id', '#individual_id', 'previous_individual_id', 'notes_for_import', 'other_data', 'sex', 'affected', 'father', 'mother', 'phenotype: coded', 'proband_relation' ], [ 'fam1', 'ind1', 'ind1_old_id', 'some notes', 'some more notes', 'male', 'aff.', '.', 'ind2', 'HPO:12345', '' ], [ 'fam1', 'ind2', '', ' ', '', 'female', 'u', '.', '', 'HPO:56789', 'mother' ]], FILENAME) self.assertListEqual(records, [ { 'familyId': 'fam1', 'individualId': 'ind1', 'sex': 'M', 'affected': 'A', 'paternalId': '', 'maternalId': 'ind2', 
'notes': 'some notes', 'codedPhenotype': 'HPO:12345', 'probandRelationship': '', 'previousIndividualId': 'ind1_old_id' }, { 'familyId': 'fam1', 'individualId': 'ind2', 'sex': 'F', 'affected': 'N', 'paternalId': '', 'maternalId': '', 'notes': '', 'codedPhenotype': 'HPO:56789', 'probandRelationship': 'M', 'previousIndividualId': '' }, ]) self.assertListEqual(errors, []) self.assertListEqual(warnings, [])
def test_parse_sample_manifest(self, mock_email):
    """Parse a Broad sample-manifest spreadsheet: verify header-validation errors,
    conversion of valid rows into pedigree records, and the notification email
    (body, HTML alternative, and both xlsx attachments) sent to the data managers."""
    header_1 = [ 'Do not modify - Broad use', '', '', 'Please fill in columns D - O', '', '', '', '', '', '', '', '', '', '', '' ] header_2 = [ 'Kit ID', 'Well', 'Sample ID', 'Family ID', 'Alias', 'Alias', 'Paternal Sample ID', 'Maternal Sample ID', 'Gender', 'Affected Status', 'Volume', 'Concentration', 'Notes', 'Coded Phenotype', 'Data Use Restrictions' ] header_3 = [ '', 'Position', '', '', 'Collaborator Participant ID', 'Collaborator Sample ID', '', '', '', '', 'ul', 'ng/ul', '', '', 'indicate study/protocol number' ] records, errors, warnings = parse_pedigree_table([ header_1, [ 'Kit ID', 'Well', 'Sample ID', 'Family ID', 'Alias', 'Maternal Sample ID', 'Gender', 'Affected Status', 'Volume', 'Concentration', 'Notes', 'Coded Phenotype', 'Data Use Restrictions' ], header_3, ], FILENAME) self.assertListEqual(errors, [ 'Error while parsing file: {}. Expected vs. actual header columns: | Sample ID| Family ID| Alias|-Alias|-Paternal Sample ID| Maternal Sample ID| Gender| Affected Status' .format(FILENAME) ]) self.assertListEqual(warnings, []) self.assertListEqual(records, []) records, errors, warnings = parse_pedigree_table([ header_1, header_2, [ '', 'Position', '', '', 'Collaborator Sample ID', '', '', '', '', 'ul', 'ng/ul', '', '', 'indicate study/protocol number' ] ], FILENAME) self.assertListEqual(errors, [ 'Error while parsing file: {}. Expected vs. 
actual header columns: |-Collaborator Participant ID| Collaborator Sample ID|+' .format(FILENAME) ]) self.assertListEqual(warnings, []) self.assertListEqual(records, []) original_data = [ header_1, header_2, header_3, [ 'SK-3QVD', 'A02', 'SM-IRW6C', 'PED073', 'SCO_PED073B_GA0339', 'SCO_PED073B_GA0339_1', '', '', 'male', 'unaffected', '20', '94.8', 'probably dad', '', '1234' ], [ 'SK-3QVD', 'A03', 'SM-IRW69', 'PED073', 'SCO_PED073C_GA0340', 'SCO_PED073C_GA0340_1', 'SCO_PED073B_GA0339_1', 'SCO_PED073A_GA0338_1', 'female', 'affected', '20', '98', '', 'Perinatal death', '' ] ] records, errors, warnings = parse_pedigree_table( original_data, FILENAME, user=User.objects.get(id=10), project=Project.objects.get(id=1)) self.assertListEqual(records, [{ 'affected': 'N', 'maternalId': '', 'notes': 'probably dad', 'individualId': 'SCO_PED073B_GA0339_1', 'sex': 'M', 'familyId': 'PED073', 'paternalId': '', 'codedPhenotype': '' }, { 'affected': 'A', 'maternalId': 'SCO_PED073A_GA0338_1', 'notes': '', 'individualId': 'SCO_PED073C_GA0340_1', 'sex': 'F', 'familyId': 'PED073', 'paternalId': 'SCO_PED073B_GA0339_1', 'codedPhenotype': 'Perinatal death' }]) self.assertListEqual(warnings, [ "SCO_PED073A_GA0338_1 is the mother of SCO_PED073C_GA0340_1 but doesn't have a separate record in the table" ]) self.assertListEqual(errors, []) mock_email.assert_called_with( subject='SK-3QVD Merged Sample Pedigree File', body=mock.ANY, to=['*****@*****.**'], attachments=[ ('SK-3QVD.xlsx', mock.ANY, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ), ('test.xlsx', mock.ANY, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ), ]) self.assertEqual( mock_email.call_args.kwargs['body'], """User [email protected] just uploaded pedigree info to 1kg project n\xe5me with uni\xe7\xf8de.This email has 2 attached files: SK-3QVD.xlsx is the sample manifest file in a format that can be sent to GP. 
test.csv is the original merged pedigree-sample-manifest file that the user uploaded. """) mock_email.return_value.attach_alternative.assert_called_with( """User [email protected] just uploaded pedigree info to 1kg project n\xe5me with uni\xe7\xf8de.<br />This email has 2 attached files:<br /> <br /> <b>SK-3QVD.xlsx</b> is the sample manifest file in a format that can be sent to GP.<br /> <br /> <b>test.csv</b> is the original merged pedigree-sample-manifest file that the user uploaded.<br /> """, 'text/html') mock_email.return_value.send.assert_called() # Test sent sample manifest is correct sample_wb = load_workbook( BytesIO(mock_email.call_args.kwargs['attachments'][0][1])) sample_ws = sample_wb.active sample_ws.title = 'Sample Info' self.assertListEqual( [[cell.value or '' for cell in row] for row in sample_ws], [[ 'Well', 'Sample ID', 'Alias', 'Alias', 'Gender', 'Volume', 'Concentration' ], [ 'Position', '', 'Collaborator Participant ID', 'Collaborator Sample ID', '', 'ul', 'ng/ul' ], [ 'A02', 'SM-IRW6C', 'SCO_PED073B_GA0339', 'SCO_PED073B_GA0339_1', 'male', '20', '94.8' ], [ 'A03', 'SM-IRW69', 'SCO_PED073C_GA0340', 'SCO_PED073C_GA0340_1', 'female', '20', '98' ]]) # Test original file copy is correct original_wb = load_workbook( BytesIO(mock_email.call_args.kwargs['attachments'][1][1])) original_ws = original_wb.active self.assertListEqual([[cell.value or '' for cell in row] for row in original_ws], original_data)
def create_project_from_workspace(request, namespace, name):
    """Create a project when a cooperator requests to load data from an AnVIL workspace.

    :param request: Django request object
    :param namespace: The namespace (or the billing account) of the workspace
    :param name: The name of the workspace. It will also be used as the project name
    :return: the projectsByGuid with the new project json
    """
    # Validate that the current user has logged in through google and has sufficient permissions
    workspace_meta = check_workspace_perm(request.user, CAN_EDIT, namespace, name, can_share=True, meta_fields=['workspace.bucketName'])

    # refuse to create a second project for the same workspace
    projects = Project.objects.filter(workspace_namespace=namespace, workspace_name=name)
    if projects:
        error = 'Project "{}" for workspace "{}/{}" exists.'.format(projects.first().name, namespace, name)
        return create_json_response({'error': error}, status=400, reason=error)

    # Validate all the user inputs from the post body
    request_json = json.loads(request.body)

    missing_fields = [field for field in ['genomeVersion', 'uploadedFileId', 'dataPath'] if not request_json.get(field)]
    if missing_fields:
        error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
        return create_json_response({'error': error}, status=400, reason=error)

    if not request_json.get('agreeSeqrAccess'):
        error = 'Must agree to grant seqr access to the data in the associated workspace.'
        return create_json_response({'error': error}, status=400, reason=error)

    # Add the seqr service account to the corresponding AnVIL workspace; if it was newly
    # added, block until the access has propagated
    added_account_to_workspace = add_service_account(request.user, namespace, name)
    if added_account_to_workspace:
        _wait_for_service_account_access(request.user,namespace, name)

    # Validate the data path against the workspace's google bucket
    bucket_name = workspace_meta['workspace']['bucketName']
    data_path = 'gs://{bucket}/{path}'.format(bucket=bucket_name.rstrip('/'), path=request_json['dataPath'].lstrip('/'))
    if not does_file_exist(data_path):
        error = 'Data file or path {} is not found.'.format(request_json['dataPath'])
        return create_json_response({'error': error}, status=400, reason=error)

    # Parse families/individuals in the uploaded pedigree file; warnings are treated
    # as fatal here (folded into errors) so the user must resolve them before loading
    json_records = load_uploaded_file(request_json['uploadedFileId'])
    pedigree_records, errors, ped_warnings = parse_pedigree_table(json_records, 'uploaded pedigree file', user=request.user)
    errors += ped_warnings
    if errors:
        return create_json_response({'errors': errors}, status=400)

    # Create a new Project in seqr
    project_args = {
        'name': name,
        'genome_version': request_json['genomeVersion'],
        'description': request_json.get('description', ''),
        'workspace_namespace': namespace,
        'workspace_name': name,
    }
    project = create_model_from_json(Project, project_args, user=request.user)

    # add families and individuals according to the uploaded individual records
    _, updated_individuals = add_or_update_individuals_and_families(
        project, individual_records=pedigree_records, user=request.user
    )

    # Send an email to all seqr data managers; a failure here is logged but must not
    # fail the request — the project was already created successfully
    try:
        _send_load_data_email(project, updated_individuals, data_path, request.user)
    except Exception as ee:
        message = 'Exception while sending email to user {}. {}'.format(request.user, str(ee))
        logger.error(message)

    return create_json_response({'projectGuid': project.guid})
def handle(self, *args, **options):
    """Load a VCF callset into a project backed by an elasticsearch dataset.

    Steps: optionally parse/apply a pedigree file, validate the VCF and collect its
    sample ids, optionally remap those ids, match them to Sample records, optionally
    export a pedigree template for unmatched samples, then create/update the Dataset
    record and link the matched samples to it.

    Raises:
        CommandError: for an invalid project id, missing pedigree file, or
            unparseable pedigree file.
        ValueError: for a missing or malformed sample-id remap file.
    """
    analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

    # parse and validate args
    sample_type = options["sample_type"]
    genome_version = options["genome_version"]
    validate_only = options["validate_only"]
    remap_sample_ids = options["remap_sample_ids"]  # path of an optional 2-column TSV mapping file
    max_edit_distance = options["max_edit_distance_for_id_match"]
    pedigree_file_path = options["pedigree_file"]
    export_pedigree_file_template = options["export_pedigree_file_template"]
    project_guid = options["project_id"]
    vcf_path = options["vcf_path"]
    elasticsearch_index = options["elasticsearch_index"]
    is_loaded = options["is_loaded"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    # NOTE: the project/arg genome version consistency check is intentionally disabled
    #if project.genome_version != genome_version:
    #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    if pedigree_file_path:
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

        if warnings:
            for message in warnings:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)

    # validate VCF and get sample ids
    vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

    if remap_sample_ids:
        if not does_file_exist(remap_sample_ids):
            raise ValueError("File not found: " + remap_sample_ids)

        # build old-id -> new-id mapping from the 2-column TSV
        id_mapping = {}
        for line in file_iter(remap_sample_ids):
            fields = line.strip().split("\t")
            if len(fields) != 2:
                raise ValueError("Must contain 2 columns: " + str(fields))
            id_mapping[fields[0]] = fields[1]

        remapped_vcf_sample_ids = []
        for sample_id in vcf_sample_ids:
            if sample_id in id_mapping:
                remapped_vcf_sample_ids.append(id_mapping[sample_id])
                print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
            else:
                remapped_vcf_sample_ids.append(sample_id)
                print("No sample id mapping for %s" % sample_id)

        vcf_sample_ids = remapped_vcf_sample_ids

    vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
        project,
        sample_ids=vcf_sample_ids,
        sample_type=sample_type,
        max_edit_distance=max_edit_distance,
        create_sample_records=not validate_only,
    )

    # write a pedigree template row for each VCF sample that wasn't matched, then exit
    if export_pedigree_file_template:
        with open(export_pedigree_file_template, "w") as out_f:
            out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
            for vcf_sample_id in vcf_sample_ids:
                if vcf_sample_id in vcf_sample_ids_to_sample_records:
                    continue
                family_id = individual_id = vcf_sample_id
                out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
        logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
        return

    if len(vcf_sample_ids_to_sample_records) == 0:
        all_vcf_sample_id_count = len(vcf_sample_ids)
        all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
        logger.info("None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
        return

    # retrieve or create Dataset record and link it to sample(s)
    dataset = get_or_create_elasticsearch_dataset(
        project=project,
        analysis_type=analysis_type,
        genome_version=genome_version,
        source_file_path=vcf_path,
        elasticsearch_index=elasticsearch_index,
        is_loaded=is_loaded,
    )

    if is_loaded and not dataset.loaded_date:
        dataset.loaded_date=timezone.now()
        dataset.save()

    link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

    # check if all VCF samples loaded already
    vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
    existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
    if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
        logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
        return
    elif not dataset.is_loaded:
        logger.info("Dataset not loaded. %s Loading..." % (is_loaded,))
    elif len(vcf_sample_ids - existing_sample_ids) != 0:
        logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (vcf_sample_ids - existing_sample_ids, ))

    logger.info("done")
def handle(self, *args, **options):
    """Load a VCF callset into a project-linked Dataset and trigger variant loading.

    Steps: optionally parse/apply a pedigree file, validate the VCF and collect its
    sample ids, match them to Sample records, optionally export a pedigree template
    for unmatched samples, then create/update the Dataset, link samples, and load
    the variants.

    Raises:
        CommandError: for an invalid project id, a genome version mismatch, a
            missing pedigree file, or an unparseable pedigree file.
    """
    analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

    # parse and validate args
    sample_type = options["sample_type"]
    genome_version = options["genome_version"]
    validate_only = options["validate_only"]
    max_edit_distance = options["max_edit_distance_for_id_match"]
    pedigree_file_path = options["pedigree_file"]
    export_pedigree_file_template = options["export_pedigree_file_template"]
    project_guid = options["project_id"]
    vcf_path = options["vcf_path"]
    dataset_id = options["dataset_id"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    if project.genome_version != genome_version:
        raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    if pedigree_file_path:
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

        if warnings:
            for message in warnings:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)

    # validate VCF and get sample ids
    vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

    vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
        project,
        sample_ids=vcf_sample_ids,
        sample_type=sample_type,
        max_edit_distance=max_edit_distance,
        create_records_for_new_sample_ids=not validate_only,
    )

    # write a pedigree template row for each VCF sample that wasn't matched, then exit
    if export_pedigree_file_template:
        with open(export_pedigree_file_template, "w") as out_f:
            out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
            for vcf_sample_id in vcf_sample_ids:
                if vcf_sample_id in vcf_sample_ids_to_sample_records:
                    continue
                family_id = individual_id = vcf_sample_id
                out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
        logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
        return

    if len(vcf_sample_ids_to_sample_records) == 0:
        all_vcf_sample_id_count = len(vcf_sample_ids)
        all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
        logger.info(("No matches found between the %(all_vcf_sample_id_count)s sample id(s) in the VCF and "
                     "the %(all_project_sample_id_count)s %(sample_type)s sample id(s) in %(project_guid)s") % locals())
        return

    if validate_only:
        return

    # retrieve or create Dataset record and link it to sample(s)
    dataset = get_or_create_dataset(
        analysis_type=analysis_type,
        source_file_path=vcf_path,
        project=project,
        dataset_id=dataset_id,
    )

    link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

    # check if all VCF samples loaded already
    vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
    existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
    if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
        logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
        return

    # load the VCF
    _load_variants(dataset)

    logger.info("done")
def process_records(json_records, filename='ped_file'):
    """Parse uploaded pedigree table rows into individual record dicts.

    The free variables `request` and `project` are resolved from an enclosing
    scope. Raises ErrorsWarningsException (carrying the parser's errors and
    warnings) if parsing reported any errors.
    """
    parsed_records, parse_errors, parse_warnings = parse_pedigree_table(
        json_records, filename, user=request.user, project=project)
    if parse_errors:
        raise ErrorsWarningsException(parse_errors, parse_warnings)
    return parsed_records
def parse_file(filename, stream):
    """Convert a pedigree table stream into individual record dicts.

    The free variables `request` and `project` are resolved from an enclosing
    scope. Raises ErrorsWarningsException (carrying the parser's errors and
    warnings) if parsing reported any errors.
    """
    file_records, file_errors, file_warnings = parse_pedigree_table(
        filename, stream, user=request.user, project=project)
    if file_errors:
        raise ErrorsWarningsException(file_errors, file_warnings)
    return file_records
def receive_individuals_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of individuals.

    This handler parses the records, but doesn't save them in the database.
    Instead, it saves them to a temporary file and sends a 'token'
    representing this file back to the client. If/when the client then wants
    to 'apply' this table, it can send the token to the
    save_individuals_table(..) handler to actually save the data in the
    database.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = _get_project_and_check_permissions(project_guid, request.user)

    if len(request.FILES) != 1:
        return create_json_response({
            'errors': ["Received %s files instead of 1" % len(request.FILES)]
        })

    # parse file — next(iter(...)) instead of .values()[0] so dict views
    # (Python 3) and lists (Python 2) both work
    stream = next(iter(request.FILES.values()))
    filename = stream._name

    json_records, errors, warnings = parse_pedigree_table(filename, stream)
    if errors:
        return create_json_response({'errors': errors, 'warnings': warnings})

    # save json to a temporary file keyed by a content-derived token
    # NOTE(review): hashing str(json_records) assumes a Python 2 runtime
    # where str is bytes — confirm before migrating to Python 3
    token = hashlib.md5(str(json_records)).hexdigest()
    serialized_file_path = _compute_serialized_file_path(token)
    with gzip.open(serialized_file_path, "w") as f:
        json.dump(json_records, f)

    # send back some stats; compute each id set once instead of re-deriving
    # it per statistic
    family_ids = set(r['familyId'] for r in json_records)
    individual_ids = set(r['individualId'] for r in json_records)
    num_families = len(family_ids)
    num_individuals = len(individual_ids)
    num_families_to_create = len([
        family_id for family_id in family_ids
        if not Family.objects.filter(family_id=family_id, project=project)
    ])
    num_individuals_to_create = len(set(
        r['individualId'] for r in json_records
        if not Individual.objects.filter(
            individual_id=r['individualId'],
            family__family_id=r['familyId'],
            family__project=project)
    ))

    info = [
        "%(num_families)s families, %(num_individuals)s individuals parsed from %(filename)s" % locals(),
        "%d new families, %d new individuals will be added to the project" % (
            num_families_to_create, num_individuals_to_create),
        "%d existing individuals will be updated" % (
            num_individuals - num_individuals_to_create),
    ]

    return create_json_response({
        'token': token,
        'errors': errors,
        'warnings': warnings,
        'info': info,
    })
def test_parse_pedigree_table(self):
    """Exercise parse_pedigree_table() error, warning and success paths."""
    # Row with fewer columns than the header -> parse error, no records.
    records, errors, warnings = parse_pedigree_table(
        [['family_id', 'individual_id', 'sex', 'affected'],
         ['fam1', 'ind1', 'male']], FILENAME)
    self.assertListEqual(records, [])
    self.assertListEqual(errors, [
        'Error while parsing file: {}. Row 1 contains 3 columns: fam1, ind1, male, while header contains 4: family_id, individual_id, sex, affected'
        .format(FILENAME)
    ])
    self.assertListEqual(warnings, [])

    # Missing family id -> json conversion error.
    records, errors, warnings = parse_pedigree_table([[
        'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
    ], ['', '', 'male', 'u', '.', 'ind2']], FILENAME)
    self.assertListEqual(records, [])
    self.assertListEqual(errors, [
        "Error while converting {} rows to json: Family Id not specified in row #1:\n{{'affected': 'u', 'maternalId': 'ind2', 'individualId': '', 'sex': 'male', 'familyId': '', 'paternalId': ''}}"
        .format(FILENAME)
    ])
    self.assertListEqual(warnings, [])

    # Missing individual id -> json conversion error.
    records, errors, warnings = parse_pedigree_table([[
        'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
    ], ['fam1', '', 'male', 'u', '.', 'ind2']], FILENAME)
    self.assertListEqual(records, [])
    self.assertListEqual(errors, [
        "Error while converting {} rows to json: Individual Id not specified in row #1:\n{{'affected': 'u', 'maternalId': 'ind2', 'individualId': '', 'sex': 'male', 'familyId': 'fam1', 'paternalId': ''}}"
        .format(FILENAME)
    ])
    self.assertListEqual(warnings, [])

    # Unrecognized sex value -> error.
    records, errors, warnings = parse_pedigree_table([[
        'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
    ], ['fam1', 'ind1', 'boy', 'u', '.', 'ind2']], FILENAME)
    self.assertListEqual(records, [])
    self.assertListEqual(errors, [
        "Error while converting {} rows to json: Invalid value 'boy' for sex in row #1"
        .format(FILENAME)
    ])
    self.assertListEqual(warnings, [])

    # Unrecognized affected-status value -> error.
    # (The original makes no warnings assertion for this case.)
    records, errors, warnings = parse_pedigree_table([[
        'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
    ], ['fam1', 'ind1', 'male', 'no', '.', 'ind2']], FILENAME)
    self.assertListEqual(records, [])
    self.assertListEqual(errors, [
        "Error while converting {} rows to json: Invalid value 'no' for affected status in row #1"
        .format(FILENAME)
    ])

    # Valid rows: records are produced, but a sex/relationship mismatch and a
    # cross-family parent link are reported as errors, and a parent with no
    # row of its own is reported as a warning.
    records, errors, warnings = parse_pedigree_table([[
        'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
    ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2'
    ], ['fam2', 'ind2', 'male', 'u', '.', '']], FILENAME)
    self.assertListEqual(records, [
        {
            'familyId': 'fam1', 'individualId': 'ind1', 'sex': 'M',
            'affected': 'A', 'paternalId': 'ind3', 'maternalId': 'ind2'
        },
        {
            'familyId': 'fam2', 'individualId': 'ind2', 'sex': 'M',
            'affected': 'N', 'paternalId': '', 'maternalId': ''
        },
    ])
    self.assertListEqual(errors, [
        'ind2 is recorded as Male and also as the mother of ind1',
        'ind2 is recorded as the mother of ind1 but they have different family ids: fam2 and fam1',
    ])
    self.assertListEqual(warnings, [
        "ind3 is the father of ind1 but doesn't have a separate record in the table"
    ])

    # Extra columns: 'notes_for_import' and 'phenotype: coded' map to
    # notes/codedPhenotype in the output; the unknown 'other_data' column is
    # dropped.
    records, errors, warnings = parse_pedigree_table(
        [[
            'family_id', 'individual_id', 'notes_for_import', 'other_data',
            'sex', 'affected', 'father', 'mother', 'phenotype: coded'
        ], [
            'fam1', 'ind1', 'some notes', 'some more notes', 'male',
            'aff.', '.', 'ind2', 'HPO:12345'
        ], ['fam1', 'ind2', ' ', '', 'female', 'u', '.', '', 'HPO:56789']
        ], FILENAME)
    self.assertListEqual(records, [
        {
            'familyId': 'fam1', 'individualId': 'ind1', 'sex': 'M',
            'affected': 'A', 'paternalId': '', 'maternalId': 'ind2',
            'notes': 'some notes', 'codedPhenotype': 'HPO:12345'
        },
        {
            'familyId': 'fam1', 'individualId': 'ind2', 'sex': 'F',
            'affected': 'N', 'paternalId': '', 'maternalId': '',
            'notes': '', 'codedPhenotype': 'HPO:56789'
        },
    ])
    self.assertListEqual(errors, [])
    self.assertListEqual(warnings, [])