async def get_external_participant_id_to_internal_sample_id_export(
    project: str,
    export_type: FileExtension,
    flip_columns: bool = False,
    connection: Connection = get_project_readonly_connection,
):
    """
    Get csv / tsv export of external_participant_id to internal_sample_id

    :param flip_columns: Set to True when exporting for seqr
    """
    player = ParticipantLayer(connection)
    # this wants project ID (connection.project)
    assert connection.project
    m = await player.get_external_participant_id_to_internal_sample_id_map(
        project=connection.project
    )

    rows = [[pid, sample_id_format(sid)] for pid, sid in m]
    if flip_columns:
        rows = [r[::-1] for r in rows]

    output = io.StringIO()
    writer = csv.writer(output, delimiter=export_type.get_delimiter())
    writer.writerows(rows)

    ext = export_type.get_extension()
    filename = f'{project}-participant-to-sample-map-{date.today().isoformat()}{ext}'
    return StreamingResponse(
        # wrap in a list so the response streams as one chunk,
        # rather than character by character
        iter([output.getvalue()]),
        media_type=export_type.get_mime_type(),
        headers={'Content-Disposition': f'filename={filename}'},
    )
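# Illustrative sketch (not from the source) of how flip_columns changes the
# export. By default each row is [external_participant_id, internal_sample_id];
# seqr expects the columns the other way around. IDs below are invented:
#
#   rows = [['EX01', 'CPG123'], ['EX02', 'CPG124']]
#   [r[::-1] for r in rows]  # -> [['CPG123', 'EX01'], ['CPG124', 'EX02']]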
async def import_individual_metadata_manifest(
    file: UploadFile = File(...),
    delimiter: Optional[str] = None,
    extra_participants_method: ExtraParticipantImporterHandler = ExtraParticipantImporterHandler.FAIL,
    connection: Connection = get_project_write_connection,
):
    """
    Import individual metadata manifest

    :param extra_participants_method: If extra participants are in the uploaded file,
        add a PARTICIPANT entry for them
    """
    delimiter = guess_delimiter_by_filename(file.filename, default_delimiter=delimiter)

    player = ParticipantLayer(connection)
    csvreader = csv.reader(
        codecs.iterdecode(file.file, 'utf-8-sig'), delimiter=delimiter
    )
    headers = next(csvreader)

    await player.generic_individual_metadata_importer(
        headers, list(csvreader), extra_participants_method=extra_participants_method
    )
    return {'success': True}
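# Sketch of an accepted upload, assuming a TSV whose first row is the header.
# Column names below are illustrative only -- headers are passed straight
# through to generic_individual_metadata_importer, not validated here:
#
#   Individual ID	HPO Terms (present)
#   EX01	HP:0000504
#
# Decoding with 'utf-8-sig' strips the UTF-8 BOM that Excel prepends to its
# CSV/TSV exports, so Excel-produced manifests import cleanly.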
async def get_individual_metadata_template_for_seqr(
    project: str,
    export_type: FileExtension,
    external_participant_ids: Optional[List[str]] = Query(default=None),  # type: ignore[assignment]
    # pylint: disable=invalid-name
    replace_with_participant_external_ids: bool = True,
    connection: Connection = get_project_readonly_connection,
):
    """Get individual metadata template for SEQR as a CSV"""
    participant_layer = ParticipantLayer(connection)
    assert connection.project
    rows = await participant_layer.get_seqr_individual_template(
        project=connection.project,
        external_participant_ids=external_participant_ids,
        replace_with_participant_external_ids=replace_with_participant_external_ids,
    )

    output = io.StringIO()
    writer = csv.writer(output, delimiter=export_type.get_delimiter())
    writer.writerows(rows)

    basefn = f'{project}-{date.today().isoformat()}'
    ext = export_type.get_extension()
    return StreamingResponse(
        # wrap in a list so the response streams as one chunk,
        # rather than character by character
        iter([output.getvalue()]),
        media_type=export_type.get_mime_type(),
        headers={'Content-Disposition': f'filename={basefn}{ext}'},
    )
async def update_many_participant_external_ids(
    internal_to_external_id: Dict[int, str],
    connection: Connection = get_projectless_db_connection,
):
    """Update external_ids of participants by providing an update map"""
    player = ParticipantLayer(connection)
    return await player.update_many_participant_external_ids(internal_to_external_id)
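# Hypothetical usage sketch (IDs invented): keys are internal participant IDs,
# values are the new external IDs to assign.
#
#   await update_many_participant_external_ids({10: 'EX10-v2', 11: 'EX11-v2'})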
async def fill_in_missing_participants(
    connection: Connection = get_project_write_connection,
):
    """
    Create a corresponding participant (if required) for each sample within
    a project; useful before importing a pedigree
    """
    participant_layer = ParticipantLayer(connection)
    return {'success': await participant_layer.fill_in_missing_participants()}
async def test_get_participant_by_eid(self):
    """Test fetching a single participant by external ID"""
    pl = ParticipantLayer(self.connection)
    ps = await pl.get_participants(project=1, external_participant_ids=['EX02'])
    self.assertEqual(1, len(ps))
    self.assertEqual('EX02', ps[0].external_id)
    self.assertEqual(2, ps[0].meta['field'])
    self.assertEqual('XY', ps[0].karyotype)
async def setUp(self) -> None:
    super().setUp()
    pl = ParticipantLayer(self.connection)
    await pl.create_participant(
        external_id='EX01', reported_sex=2, karyotype='XX', meta={'field': 1}
    )
    await pl.create_participant(
        external_id='EX02', reported_sex=1, karyotype='XY', meta={'field': 2}
    )
async def test_get_all_participants(self):
    """Test getting all participants"""
    pl = ParticipantLayer(self.connection)
    ps = await pl.get_participants(project=1)
    self.assertEqual(2, len(ps))
    self.assertEqual('EX01', ps[0].external_id)
    self.assertEqual(1, ps[0].meta['field'])
    self.assertEqual('XX', ps[0].karyotype)
    self.assertEqual('EX02', ps[1].external_id)
async def get_id_map_by_external_ids(
    external_participant_ids: List[str],
    allow_missing: bool = False,
    connection: Connection = get_project_readonly_connection,
):
    """Get ID map of participants, by external_id"""
    player = ParticipantLayer(connection)
    return await player.get_id_map_by_external_ids(
        external_participant_ids,
        allow_missing=allow_missing,
        project=connection.project,
    )
async def get_participants(
    external_participant_ids: Optional[List[str]] = None,
    internal_participant_ids: Optional[List[int]] = None,
    connection: Connection = get_project_readonly_connection,
):
    """Get participants, default ALL participants in project"""
    player = ParticipantLayer(connection)
    return await player.get_participants(
        project=connection.project,
        external_participant_ids=external_participant_ids,
        internal_participant_ids=internal_participant_ids,
    )
async def get_external_participant_id_to_internal_sample_id(
    connection: Connection = get_project_readonly_connection,
):
    """
    Get a map of {external_participant_id} -> {internal_sample_id},
    useful for matching joint-called samples in the matrix table to the participant.

    Returns a list of pairs rather than a dictionary, because a dict
    could lose participants with multiple samples.
    """
    player = ParticipantLayer(connection)
    assert connection.project
    m = await player.get_external_participant_id_to_internal_sample_id_map(
        project=connection.project
    )
    return [[pid, sample_id_format(sid)] for pid, sid in m]
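# Why a list of pairs and not a dict -- a participant with two samples would
# collapse to a single entry. A minimal sketch with invented IDs:
#
#   pairs = [['EX01', 'CPG1'], ['EX01', 'CPG2']]
#   dict(pairs)  # -> {'EX01': 'CPG2'}; the EX01 -> CPG1 mapping is lost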
async def update_participant(
    participant_id: int,
    participant: ParticipantUpdateModel,
    connection: Connection = get_projectless_db_connection,
):
    """Update Participant Data"""
    participant_layer = ParticipantLayer(connection)
    return {
        'success': await participant_layer.update_single_participant(
            participant_id=participant_id,
            reported_sex=participant.reported_sex,
            reported_gender=participant.reported_gender,
            karyotype=participant.karyotype,
            meta=participant.meta,
        )
    }
async def test_pedigree_without_family(self):
    """
    Test getting pedigree where participants do not belong to a family
    """
    pl = ParticipantLayer(self.connection)
    fl = FamilyLayer(self.connection)

    await pl.create_participant(external_id='EX01', reported_sex=1)
    await pl.create_participant(external_id='EX02', reported_sex=None)

    rows = await fl.get_pedigree(
        project=self.connection.project,
        include_participants_not_in_families=True,
        replace_with_participant_external_ids=True,
    )

    by_id = {r['individual_id']: r for r in rows}
    self.assertEqual(2, len(rows))
    self.assertEqual(1, by_id['EX01']['sex'])
    self.assertIsNone(by_id['EX02']['sex'])
async def batch_upsert_participants(
    participants: ParticipantUpsertBody,
    connection: Connection = get_project_write_connection,
) -> Dict[str, Any]:
    """
    Upserts a list of participants with samples and sequences.
    Returns a map of external participant ID to the upserted participant,
    with internal sample IDs formatted as external (string) sample IDs.
    """
    # Convert id in samples to int
    for participant in participants.participants:
        for sample in participant.samples:
            if sample.id:
                sample.id = sample_id_transform_to_raw(sample.id)

    external_pids = [p.external_id for p in participants.participants]

    async with connection.connection.transaction():
        # Table interfaces
        pt = ParticipantLayer(connection)

        results = await pt.batch_upsert_participants(participants)
        pid_key = dict(zip(results.keys(), external_pids))

        # Map sids back from ints to strs
        outputs: Dict[str, Dict[str, Any]] = {}
        for pid, samples in results.items():
            samples_output: Dict[str, Any] = {}
            for iid, seqs in samples.items():
                data = {'sequences': seqs}
                samples_output[sample_id_format(iid)] = data
            outputs[pid_key[pid]] = {
                'id': pid,
                'external_id': pid_key[pid],
                'samples': samples_output,
            }

    return outputs
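# Sketch of the response shape assembled above (IDs invented for illustration):
#
#   {
#       'Demeter': {
#           'id': 1,
#           'external_id': 'Demeter',
#           'samples': {'CPG123': {'sequences': [...]}},
#       },
#   }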
async def test_insert_participants(self):
    """
    Test inserting participants, samples and sequences, and make sure
    they're correctly linked. Tests the other side of:
    tests.test_parse_generic_metadata:TestParseGenericMetadata.test_rows_with_participants
    """
    all_participants = [
        ParticipantUpsert.construct(
            **{
                'external_id': 'Demeter',
                'meta': {},
                'samples': [
                    SampleUpsert.construct(
                        **{
                            'external_id': 'sample_id001',
                            'meta': {},
                            'sequences': [
                                SequenceUpsert.construct(
                                    **{
                                        'meta': {
                                            'reads': [
                                                [
                                                    {
                                                        'basename': 'sample_id001.filename-R1.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id001.filename-R1.fastq.gz',
                                                        'size': 111,
                                                    },
                                                    {
                                                        'basename': 'sample_id001.filename-R2.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id001.filename-R2.fastq.gz',
                                                        'size': 111,
                                                    },
                                                ]
                                            ],
                                            'reads_type': 'fastq',
                                        },
                                        'status': SequenceStatus('uploaded'),
                                        'type': SequenceType('genome'),
                                    }
                                ),
                                SequenceUpsert.construct(
                                    **{
                                        'meta': {
                                            'reads': [
                                                [
                                                    {
                                                        'basename': 'sample_id001.exome.filename-R1.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id001.exome.filename-R1.fastq.gz',
                                                        'size': 111,
                                                    },
                                                    {
                                                        'basename': 'sample_id001.exome.filename-R2.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id001.exome.filename-R2.fastq.gz',
                                                        'size': 111,
                                                    },
                                                ]
                                            ],
                                            'reads_type': 'fastq',
                                        },
                                        'status': SequenceStatus('uploaded'),
                                        'type': SequenceType('exome'),
                                    }
                                ),
                            ],
                            'type': SampleType('blood'),
                        }
                    )
                ],
            }
        ),
        ParticipantUpsert.construct(
            **{
                'external_id': 'Apollo',
                'meta': {},
                'samples': [
                    SampleUpsert.construct(
                        **{
                            'external_id': 'sample_id002',
                            'meta': {},
                            'sequences': [
                                SequenceUpsert.construct(
                                    **{
                                        'meta': {
                                            'reads': [
                                                [
                                                    {
                                                        'basename': 'sample_id002.filename-R1.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id002.filename-R1.fastq.gz',
                                                        'size': 111,
                                                    },
                                                    {
                                                        'basename': 'sample_id002.filename-R2.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id002.filename-R2.fastq.gz',
                                                        'size': 111,
                                                    },
                                                ]
                                            ],
                                            'reads_type': 'fastq',
                                        },
                                        'status': SequenceStatus('uploaded'),
                                        'type': SequenceType('genome'),
                                    }
                                )
                            ],
                            'type': SampleType('blood'),
                        }
                    ),
                    SampleUpsert.construct(
                        **{
                            'external_id': 'sample_id004',
                            'meta': {},
                            'sequences': [
                                SequenceUpsert.construct(
                                    **{
                                        'meta': {
                                            'reads': [
                                                [
                                                    {
                                                        'basename': 'sample_id004.filename-R1.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id004.filename-R1.fastq.gz',
                                                        'size': 111,
                                                    },
                                                    {
                                                        'basename': 'sample_id004.filename-R2.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id004.filename-R2.fastq.gz',
                                                        'size': 111,
                                                    },
                                                ]
                                            ],
                                            'reads_type': 'fastq',
                                        },
                                        'status': SequenceStatus('uploaded'),
                                        'type': SequenceType('genome'),
                                    }
                                )
                            ],
                            'type': SampleType('blood'),
                        }
                    ),
                ],
            }
        ),
        ParticipantUpsert.construct(
            **{
                'external_id': 'Athena',
                'meta': {},
                'samples': [
                    SampleUpsert.construct(
                        **{
                            'external_id': 'sample_id003',
                            'meta': {},
                            'sequences': [
                                SequenceUpsert.construct(
                                    **{
                                        'meta': {
                                            'reads': [
                                                [
                                                    {
                                                        'basename': 'sample_id003.filename-R1.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id003.filename-R1.fastq.gz',
                                                        'size': 111,
                                                    },
                                                    {
                                                        'basename': 'sample_id003.filename-R2.fastq.gz',
                                                        'checksum': None,
                                                        'class': 'File',
                                                        'location': '/path/to/sample_id003.filename-R2.fastq.gz',
                                                        'size': 111,
                                                    },
                                                ]
                                            ],
                                            'reads_type': 'fastq',
                                        },
                                        'status': SequenceStatus('uploaded'),
                                        'type': SequenceType('genome'),
                                    }
                                )
                            ],
                            'type': SampleType('blood'),
                        }
                    )
                ],
            }
        ),
    ]

    body = ParticipantUpsertBody.construct(participants=all_participants)
    # Table interfaces
    pt = ParticipantLayer(self.connection)

    await pt.batch_upsert_participants(body)

    expected_sample_eid_to_participant_eid = {
        sample.external_id: participant.external_id
        for participant in all_participants
        for sample in participant.samples
    }

    db_participants = await self.connection.connection.fetch_all(
        'SELECT * FROM participant'
    )
    self.assertEqual(3, len(db_participants))
    self.assertEqual('Demeter', db_participants[0]['external_id'])
    self.assertEqual('Apollo', db_participants[1]['external_id'])
    self.assertEqual('Athena', db_participants[2]['external_id'])

    participant_id_map = {p['external_id']: p['id'] for p in db_participants}

    db_samples = await self.connection.connection.fetch_all('SELECT * FROM sample')
    self.assertEqual(4, len(db_samples))
    for db_sample in db_samples:
        self.assertIsNotNone(db_sample['external_id'])
        self.assertIsNotNone(db_sample['participant_id'])

        # get expected_participant_id from the db_sample external_id
        expected_participant_eid = expected_sample_eid_to_participant_eid.get(
            db_sample['external_id']
        )
        self.assertEqual(
            participant_id_map[expected_participant_eid],
            db_sample['participant_id'],
        )
async def import_pedigree(
    self,
    header: Optional[List[str]],
    rows: List[List[str]],
    create_missing_participants=False,
    perform_sex_check=True,
):
    """
    Import pedigree file
    """
    if header is None:
        _header = PedRow.default_header()
    else:
        _header = PedRow.parse_header_order(header)

    if len(rows) == 0:
        return None

    max_row_length = len(rows[0])
    if max_row_length > len(_header):
        raise ValueError(
            f"The parsed header {_header} isn't long enough "
            f'to cover row length ({len(_header)} < {len(rows[0])})'
        )
    if len(_header) > max_row_length:
        _header = _header[:max_row_length]

    pedrows: List[PedRow] = [
        PedRow(**{_header[i]: r[i] for i in range(len(_header))}) for r in rows
    ]
    # this validates a lot of the pedigree too
    pedrows = PedRow.order(pedrows)
    if perform_sex_check:
        PedRow.validate_sexes(pedrows, throws=True)

    external_family_ids = set(r.family_id for r in pedrows)
    # get set of all individual, paternal, maternal participant ids
    external_participant_ids = set(
        pid
        for r in pedrows
        for pid in [r.individual_id, r.paternal_id, r.maternal_id]
        if pid
    )

    participant_table = ParticipantLayer(self.connection)

    external_family_id_map = await self.ftable.get_id_map_by_external_ids(
        list(external_family_ids),
        project=self.connection.project,
        allow_missing=True,
    )
    missing_external_family_ids = [
        f for f in external_family_ids if f not in external_family_id_map
    ]
    external_participant_ids_map = await participant_table.get_id_map_by_external_ids(
        list(external_participant_ids),
        project=self.connection.project,
        # Allow missing participants if we're creating them
        allow_missing=create_missing_participants,
    )

    async with self.connection.connection.transaction():
        if create_missing_participants:
            missing_participant_ids = set(external_participant_ids) - set(
                external_participant_ids_map
            )
            for row in pedrows:
                if row.individual_id not in missing_participant_ids:
                    continue
                external_participant_ids_map[
                    row.individual_id
                ] = await participant_table.create_participant(
                    external_id=row.individual_id, reported_sex=row.sex
                )

        for external_family_id in missing_external_family_ids:
            internal_family_id = await self.ftable.create_family(
                external_id=external_family_id,
                description=None,
                coded_phenotype=None,
            )
            external_family_id_map[external_family_id] = internal_family_id

        # now let's map participants back
        insertable_rows = [
            {
                'family_id': external_family_id_map[row.family_id],
                'participant_id': external_participant_ids_map[row.individual_id],
                'paternal_participant_id': external_participant_ids_map.get(row.paternal_id),
                'maternal_participant_id': external_participant_ids_map.get(row.maternal_id),
                'affected': row.affected,
                'notes': row.notes,
            }
            for row in pedrows
        ]

        await participant_table.update_participants(
            participant_ids=[
                external_participant_ids_map[row.individual_id] for row in pedrows
            ],
            reported_sexes=[row.sex for row in pedrows],
        )
        await self.fptable.create_rows(insertable_rows)

    return True
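# A minimal usage sketch, assuming PedRow.default_header() yields the standard
# six PED columns (family_id, individual_id, paternal_id, maternal_id, sex,
# affected). Family and participant IDs below are invented:
#
#   await family_layer.import_pedigree(
#       header=None,  # falls back to PedRow.default_header()
#       rows=[
#           ['FAM01', 'EX01', '', '', '1', '1'],
#           ['FAM01', 'EX02', 'EX01', '', '2', '2'],
#       ],
#       create_missing_participants=True,
#   )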