Example #1
    def process_steps(self, beddf, newPath, parentId, databaseSynId,
                      seq_assay_id):
        """
        Process bed file, update bed database, write bed file to path

        Args:
            beddf: Bed dataframe
            newPath: Path to new bed file
            parentId: Synapse id to store gene panel file
            databaseSynId: Synapse id of bed database
            seq_assay_id: GENIE seq assay id

        Returns:
            string: Path to new bed file
        """
        final_beddf = self._process(beddf, seq_assay_id, newPath, parentId)
        process_functions.updateData(
            self.syn,
            databaseSynId,
            final_beddf,
            seq_assay_id,
            filterByColumn="SEQ_ASSAY_ID",
            toDelete=True,
        )
        final_beddf.to_csv(newPath, sep="\t", index=False)
        return newPath
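Nearly every example on this page funnels its dataframe through process_functions.updateData, which lives in the GENIE codebase and is not shown here. Judging purely from the call sites, it behaves roughly like the sketch below; the name and parameters are taken from the calls above, while the delete-then-append internals are an assumption (the real implementation may diff rows instead of replacing them wholesale).

import synapseclient
from synapseclient import Table


def updateData(syn, databaseSynId, newData, filterBy,
               filterByColumn="CENTER", col=None, toDelete=False):
    # Keep only the expected columns when a column list is given
    if col is not None:
        newData = newData[col]
    # Rows currently stored for this center (or SEQ_ASSAY_ID, etc.)
    existing = syn.tableQuery(
        "select * from {} where {} = '{}'".format(
            databaseSynId, filterByColumn, filterBy))
    # Passing a tableQuery result to syn.delete removes those rows
    if toDelete:
        syn.delete(existing)
    # Append the replacement rows to the Synapse table
    syn.store(Table(databaseSynId, newData))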
Example #2
    def process_steps(self, filePath, databaseToSynIdMappingDf):
        logger.debug("Performing process_steps for {}".format(self._fileType))

        folder_id = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == self._fileType][0]

        table_id = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] ==
            f"{self._fileType}_table"][0]

        logger.debug(f"Storing file at {folder_id}")
        f = self.syn.store(synapseclient.File(filePath,
                                              parent=folder_id,
                                              annotations=dict(
                                                  center=self.center,
                                                  fileType=self._fileType)),
                           forceVersion=False)

        # Add information about assay to the table
        data = self._get_dataframe(filePath)
        data['entity_id'] = f.id
        process_functions.updateData(syn=self.syn,
                                     databaseSynId=table_id,
                                     newData=data,
                                     filterBy=self.center,
                                     filterByColumn="center",
                                     col=self._required_columns,
                                     toDelete=True)

        return (filePath)
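Example #2 combines two Synapse operations: it stores the raw file into a folder with center/fileType annotations, then indexes the file's contents (plus the resulting entity id) into a table. A minimal, self-contained version of the store step, using placeholder ids and file names:

import synapseclient

syn = synapseclient.login()
entity = syn.store(
    synapseclient.File(
        "assay_information.yaml",          # placeholder local file
        parent="syn00000000",              # placeholder folder Synapse id
        annotations={"center": "SAGE", "fileType": "assayinfo"},
    ),
    forceVersion=False,  # skip a version bump when the content is unchanged
)
print(entity.id)  # the id written into the table's entity_id column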
Example #3
 def process_steps(self, patientCountsDf, newPath, oncotreeLink,
                   databaseSynId):
     patientCountsDf = self._process(patientCountsDf, oncotreeLink)
     process_functions.updateData(self.syn, databaseSynId, patientCountsDf,
                                  self.center)
     patientCountsDf.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #4
 def process_steps(self, mutationInCis, newPath, databaseSynId):
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  mutationInCis,
                                  self.center,
                                  filterByColumn="Center")
     mutationInCis.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #5
 def process_steps(self, fusion, databaseSynId, newPath, test):
     fusion = self._process(fusion, test)
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  fusion,
                                  self.center,
                                  toDelete=True)
     fusion.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #6
 def process_steps(self, seg, newPath, databaseSynId):
     seg = self._process(seg)
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  seg,
                                  self.center,
                                  toDelete=True)
     seg.to_csv(newPath, sep="\t", index=False)
     return newPath
Example #7
 def process_steps(self, filePath, **kwargs):
     logger.info('PROCESSING %s' % filePath)
     newPath = kwargs['newPath']
     databaseSynId = kwargs['databaseSynId']
     clinicalSPDf = pd.read_csv(filePath, sep="\t", comment="#")
     clinicalSPDf = self._process(clinicalSPDf)
     process_functions.updateData(self.syn, databaseSynId, clinicalSPDf,
                                  self.center)
     clinicalSPDf.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #8
 def process_steps(self, filePath, **kwargs):
     logger.info('PROCESSING %s' % filePath)
     databaseSynId = kwargs['databaseSynId']
     newPath = kwargs['newPath']
     vitalStatusDf = pd.read_csv(filePath, sep="\t", comment="#")
     vitalStatusDf = self._process(vitalStatusDf)
     #cols = vitalStatusDf.columns
     process_functions.updateData(self.syn, databaseSynId, vitalStatusDf, self.center)
     vitalStatusDf.to_csv(newPath, sep="\t", index=False)
     return newPath
Example #9
 def storeProcessedMaf(
         self, filePath, mafSynId, centerMafSynId, isNarrow=False):
     '''
     Stores SP maf to database
     '''
     logger.info('STORING %s' % filePath)
     mafDataFrame = pd.read_csv(filePath, sep="\t")
     process_functions.updateData(
         self.syn, mafSynId, mafDataFrame,
         self.center, filterByColumn="Center", toDelete=True)
     return filePath
Example #10
 def process_steps(self, deleteSamples, fileSynId, databaseSynId, newPath):
     info = self.syn.get(fileSynId, downloadFile=False)
     deleteSamples = self._process(deleteSamples,
                                   info.modifiedOn.split(".")[0])
     process_functions.updateData(
         self.syn,
         databaseSynId,
         deleteSamples,
         self.center,
         filterByColumn="center",
         toDelete=True,
     )
     return newPath
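Examples #10 and #14 below pass the entity's modifiedOn property into _process after splitting on ".". That property is an ISO-8601 timestamp, so the split trims the fractional-seconds suffix; a rough standalone illustration (the timestamp value is made up):

from datetime import datetime

modified_on = "2021-02-25T01:15:54.000Z"   # illustrative value only
trimmed = modified_on.split(".")[0]        # -> '2021-02-25T01:15:54'
print(datetime.strptime(trimmed, "%Y-%m-%dT%H:%M:%S"))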
Example #11
 def process_steps(self, filePath, **kwargs):
     logger.info('PROCESSING %s' % filePath)
     newPath = kwargs['newPath']
     databaseSynId = kwargs['databaseSynId']
     mutationInCis = pd.read_csv(filePath, comment="#")
     #cols = mutationInCis.columns
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  mutationInCis,
                                  self.center,
                                  filterByColumn="Center")
     mutationInCis.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #12
 def process_steps(self, filePath, **kwargs):
     #For CBS files
     if kwargs.get("path") is not None:
         filePath = kwargs['path']
         newPath = filePath
     else:
         newPath = kwargs['newPath']
     logger.info('PROCESSING %s' % filePath)
     databaseSynId = kwargs['databaseSynId']
     seg = pd.read_csv(filePath, sep="\t")
     seg = self._process(seg)
     process_functions.updateData(self.syn, databaseSynId, seg, self.center, toDelete=True)
     seg.to_csv(newPath, sep="\t", index=False)
     return newPath
Example #13
 def process_steps(self, filePath, **kwargs):
     logger.info('PROCESSING %s' % filePath)
     databaseSynId = kwargs['databaseSynId']
     newPath = kwargs['newPath']
     test = kwargs['test']
     fusion = pd.read_csv(filePath, sep="\t", comment="#")
     fusion = self._process(fusion, test)
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  fusion,
                                  self.center,
                                  toDelete=True)
     fusion.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #14
 def process_steps(self, filePath, **kwargs):
     logger.info('PROCESSING %s' % filePath)
     fileSynId = kwargs['fileSynId']
     databaseSynId = kwargs['databaseSynId']
     newPath = kwargs['newPath']
     info = self.syn.get(fileSynId, downloadFile=False)
     deleteSamples = pd.read_csv(filePath, header=None)
     deleteSamples = self._process(deleteSamples,
                                   info.modifiedOn.split(".")[0])
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  deleteSamples,
                                  self.center,
                                  filterByColumn="center",
                                  toDelete=True)
     return (newPath)
Example #15
 def process_steps(self, filePath, **kwargs):
     newPath = kwargs['newPath']
     parentId = kwargs['parentId']
     databaseSynId = kwargs['databaseSynId']
     logger.info('PROCESSING %s' % filePath)
     #standardize all SEQ_ASSAY_IDs
     seq_assay_id = os.path.basename(filePath).replace(".bed", "").upper()
     gene = pd.read_csv(filePath, sep="\t", header=None)
     bed = self._process(gene, seq_assay_id, newPath, parentId)
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  bed,
                                  seq_assay_id,
                                  filterByColumn="SEQ_ASSAY_ID",
                                  toDelete=True)
     bed.to_csv(newPath, sep="\t", index=False)
     return (newPath)
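Example #15 derives the SEQ_ASSAY_ID from the bed file's name rather than from its contents. On a concrete (made-up) path, the standardization behaves like this:

import os

file_path = "/data/uploads/SAGE-1.bed"     # illustrative path
seq_assay_id = os.path.basename(file_path).replace(".bed", "").upper()
print(seq_assay_id)  # SAGE-1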
Example #16
 def process_steps(self, filePath, newPath, databaseSynId):
     logger.info('PROCESSING %s' % filePath)
     # databaseSynId = kwargs['databaseSynId']
     # Must pass in a list
     assay_info_df = self._get_dataframe([filePath])
     process_assay_info_df = self._process(assay_info_df)
     col = ['SEQ_ASSAY_ID', 'is_paired_end', 'library_selection',
            'library_strategy', 'platform', 'read_length',
            'instrument_model', 'gene_padding', 'number_of_genes',
            'variant_classifications', 'CENTER']
     process_functions.updateData(
         self.syn,
         databaseSynId,
         process_assay_info_df,
         self.center,
         col=col,
         filterByColumn="CENTER",
         toDelete=True)
     return filePath
Example #17
 def process_steps(self, assay_info_df, newPath, databaseSynId):
     # databaseSynId = kwargs['databaseSynId']
     # Must pass in a list
     process_assay_info_df = self._process(assay_info_df)
     col = [
         'SEQ_ASSAY_ID', 'is_paired_end', 'library_selection',
         'library_strategy', 'platform', 'read_length', 'instrument_model',
         'gene_padding', 'number_of_genes', 'variant_classifications',
         'CENTER'
     ]
     process_functions.updateData(self.syn,
                                  databaseSynId,
                                  process_assay_info_df,
                                  self.center,
                                  col=col,
                                  filterByColumn="CENTER",
                                  toDelete=True)
     process_assay_info_df.to_csv(newPath, sep="\t", index=False)
     return (newPath)
Example #18
    def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                      parentId):

        data = self._get_dataframe(filePath)

        table_id = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == self._fileType][0]

        data['center'] = self.center

        logger.debug(f"Updating {self._fileType} data in table {table_id}.")
        process_functions.updateData(syn=self.syn,
                                     databaseSynId=table_id,
                                     newData=data,
                                     filterBy=self.center,
                                     filterByColumn="center",
                                     col=None,
                                     toDelete=True)

        data.to_csv(newPath, sep="\t", index=False)
        return (newPath)
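Several examples (#2, #18, #20, #23) look up a Synapse table id with the pattern mappingDf.Id[mappingDf['Database'] == name][0]. The trailing [0] is a label lookup, so it only succeeds while the matching row still carries index label 0; .iloc[0] is the positional and safer spelling. A toy illustration with a stand-in mapping table:

import pandas as pd

databaseToSynIdMappingDf = pd.DataFrame({
    "Database": ["patient", "sample", "vcf2maf"],  # stand-in rows
    "Id": ["syn101", "syn102", "syn103"],
})
table_id = databaseToSynIdMappingDf.Id[
    databaseToSynIdMappingDf["Database"] == "sample"].iloc[0]
print(table_id)  # syn102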
Example #19
    def process_steps(self, assay_info_df, newPath, databaseSynId):
        """
        Process bed input and update bed database

        Args:
            assay_info_df: Assay information dataframe
            newPath: Path to processed assay information
            databaseSynId: assay information database synapse id

        Returns:
            path to assay information dataframe
        """
        # Must pass in a list
        process_assay_info_df = self._process(assay_info_df)
        process_functions.updateData(
            self.syn,
            databaseSynId,
            process_assay_info_df,
            self.center,
            filterByColumn="CENTER",
            toDelete=True,
        )
        process_assay_info_df.to_csv(newPath, sep="\t", index=False)
        return newPath
Example #20
    def process_steps(self, filePath, databaseToSynIdMappingDf, newPath,
                      parentId, oncotreeLink):
        patientSynId = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == "patient"][0]
        sampleSynId = databaseToSynIdMappingDf.Id[
            databaseToSynIdMappingDf['Database'] == "sample"][0]

        clinicalDf = pd.read_csv(filePath, sep="\t", comment="#")

        patient = False
        sample = False
        # The Synapse id for the clinical tier release scope is
        # hardcoded because it never changes
        patientColsTable = self.syn.tableQuery(
            'select fieldName from syn8545211 where patient is '
            'True and inClinicalDb is True')
        patientCols = patientColsTable.asDataFrame()['fieldName'].tolist()
        sampleColsTable = self.syn.tableQuery(
            'select fieldName from syn8545211 where sample is True '
            'and inClinicalDb is True')
        sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()

        if "patient" in filePath.lower():
            clinicalTemplate = pd.DataFrame(columns=patientCols)
            patient = True
        elif "sample" in filePath.lower():
            clinicalTemplate = pd.DataFrame(columns=sampleCols)
            sample = True
        else:
            clinicalTemplate = pd.DataFrame(columns=set(patientCols +
                                                        sampleCols))
            sample = True
            patient = True

        newClinicalDf = self._process(clinicalDf, clinicalTemplate)

        if patient:
            patientClinical = newClinicalDf[patientCols].drop_duplicates(
                "PATIENT_ID")
            self.uploadMissingData(patientClinical, "PATIENT_ID", patientSynId,
                                   parentId)
            # retractedPatientSynId)
            process_functions.updateData(self.syn,
                                         patientSynId,
                                         patientClinical,
                                         self.center,
                                         col=patientCols,
                                         toDelete=True)
        if sample:
            if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
                logger.error("There are duplicated samples, "
                             "and the duplicates are removed")
            sampleClinical = newClinicalDf[sampleCols].drop_duplicates(
                "SAMPLE_ID")
            # Exclude all clinical samples with wrong oncotree codes
            oncotree_mapping = pd.DataFrame()
            oncotree_mapping_dict = \
                process_functions.get_oncotree_code_mappings(oncotreeLink)
            # Add in unknown key for oncotree code
            oncotree_mapping_dict['UNKNOWN'] = {}
            oncotree_mapping['ONCOTREE_CODE'] = oncotree_mapping_dict.keys()
            # Make oncotree codes uppercase (SpCC/SPCC)
            sampleClinical['ONCOTREE_CODE'] = sampleClinical[
                'ONCOTREE_CODE'].astype(str).str.upper()
            sampleClinical = sampleClinical[
                sampleClinical['ONCOTREE_CODE'].isin(
                    oncotree_mapping['ONCOTREE_CODE'])]
            self.uploadMissingData(sampleClinical, "SAMPLE_ID", sampleSynId,
                                   parentId)
            # ,retractedSampleSynId)
            process_functions.updateData(self.syn,
                                         sampleSynId,
                                         sampleClinical,
                                         self.center,
                                         col=sampleCols,
                                         toDelete=True)

        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return (newPath)
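Example #20 drops any clinical sample whose ONCOTREE_CODE is not a key of the oncotree mapping, after upper-casing the codes and adding UNKNOWN as an allowed value. The same filter in isolation, with a stand-in mapping dict in place of process_functions.get_oncotree_code_mappings:

import pandas as pd

oncotree_mapping_dict = {"NSCLC": {}, "SPCC": {}, "UNKNOWN": {}}  # stand-in
samples = pd.DataFrame({
    "SAMPLE_ID": ["S1", "S2", "S3"],
    "ONCOTREE_CODE": ["SpCC", "nsclc", "BOGUS"],
})
samples["ONCOTREE_CODE"] = samples["ONCOTREE_CODE"].astype(str).str.upper()
valid = samples[samples["ONCOTREE_CODE"].isin(list(oncotree_mapping_dict))]
print(valid)  # the BOGUS row is dropped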
Example #21
 def process_steps(self, clinicalSPDf, newPath, databaseSynId):
     clinicalSPDf = self._process(clinicalSPDf)
     process_functions.updateData(self.syn, databaseSynId, clinicalSPDf,
                                  self.center)
     clinicalSPDf.to_csv(newPath, sep="\t", index=False)
     return newPath
Example #22
    def run(self):
        """
        This function runs the redcap export to export all files
        """
        if not os.path.exists(self._SPONSORED_PROJECT):
            os.mkdir(self._SPONSORED_PROJECT)
        else:
            filelists = os.listdir(self._SPONSORED_PROJECT)
            for file in filelists:
                if file != "case_lists":
                    os.remove(os.path.join(self._SPONSORED_PROJECT, file))
        # Create full mapping table to get the values of the data model
        mapping = self.syn.tableQuery(
            "select genie_field_name,instrument from {} where "
            "{} is true and phi is false".format(
                self._DATA_ELEMENT_SYN_ID, self._SPONSORED_PROJECT.lower()))
        mappingDf = mapping.asDataFrame()
        newMappingDf = pd.DataFrame()
        for field, instrument in zip(mappingDf.genie_field_name,
                                     mappingDf.instrument):
            # Do not want to append the # values
            if "#" in field:
                # find fields with # and replace with however many times
                # it should loop through
                newfields = [
                    field.replace("#", str(count))
                    for count in list(range(1, self._NUM_COUNTS + 1))
                ]
                newDataFrame = pd.DataFrame({
                    "genie_field_name":
                    newfields,
                    "instrument": [instrument] * len(newfields)
                })
            else:
                newDataFrame = pd.DataFrame(
                    {
                        "genie_field_name": field,
                        "instrument": instrument
                    },
                    index=[0])
            newMappingDf = newMappingDf.append(newDataFrame, sort=False)

        # If there are ever missing fields, they must be added in
        # or else the script will fail
        # missingFields= ['her_status_sample','sample_seq_yn']
        # missingFieldType = ['sample_information']*2
        # newMappingDf = newMappingDf.append(pd.DataFrame({
        #     "genie_field_name": missingFields,
        #     "instrument": missingFieldType}))

        # Extract patient/sample/treatment columns
        patientCols = extractColumns(newMappingDf, [
            "patient_information", "treatment_information_general",
            "diagnosis_information"
        ], [
            'errors_patient_info_yn', 'patient_info_errors',
            'errors_dx_info_yn', 'dx_info_errors', 'so_yn'
        ])
        sampleCols = extractColumns(newMappingDf, ["sample_information"], [
            "test_sample", "fgfr4_variant", "errors_sample_info_yn",
            "sample_info_errors"
        ])
        treatmentCols = extractColumns(newMappingDf,
                                       ["treatment_information_detailed"], [])

        unlabelledEnt = self.syn.get(self._UNLABELLED_SYN_ID)
        labelledEnt = self.syn.get(self._LABELLED_SYN_ID)
        unlabeledDf = pd.read_csv(unlabelledEnt.path)
        labeledDf = pd.read_csv(labelledEnt.path)
        # Add on CENTER column for all three file formats
        patientCols.append("redcap_data_access_group")
        sampleCols.append("redcap_data_access_group")
        treatmentCols.append("redcap_data_access_group")

        labeledDf.columns = unlabeledDf.columns
        labeledDf.loc[labeledDf['redcap_data_access_group'] == "hop",
                      'redcap_data_access_group'] = "JHU"
        labeledDf['redcap_data_access_group'] = \
            labeledDf['redcap_data_access_group'].apply(lambda x: x.upper())

        patientDf = labeledDf[patientCols]
        patientRows = labeledDf.redcap_repeat_instrument.isnull()
        patientDf = patientDf[patientRows]

        sampleDf = labeledDf[sampleCols]
        sampleRows = labeledDf.redcap_repeat_instrument == "Sample Information"
        sampleDf = sampleDf[sampleRows]
        # Red cap header to cbio header Table mapping
        redCapToCbioMapping = self.syn.tableQuery(
            "SELECT * FROM %s" % self._REDCAP_TO_CBIOMAPPING_SYNID)
        redCapToCbioMappingDf = redCapToCbioMapping.asDataFrame()

        # Get all the samples/patients that should be uploaded to SP projects
        # Hard coded clinical database
        genie_clinicalDb = self.syn.tableQuery(
            'select SAMPLE_ID, PATIENT_ID, ONCOTREE_CODE, SEQ_ASSAY_ID '
            'from syn7517674')
        genie_clinicalDf = genie_clinicalDb.asDataFrame()
        # Hard coded clinicalSP database
        # nonGenie_clinicalDb = self.syn.tableQuery(
        #     'SELECT * FROM syn11492579')
        # nonGenie_clinicalDf = nonGenie_clinicalDb.asDataFrame()
        # genie_clinicalDf = genie_clinicalDf.append(nonGenie_clinicalDf)

        # Only patients and samples that exist in the
        # sponsored project uploads are going to be pulled into the SP project
        finalPatientDf = self.configureClinicalDf(patientDf,
                                                  redCapToCbioMappingDf)
        patient_date_col = [
            col for col in finalPatientDf.columns if col.endswith("INT")
        ]
        patient_date_col.append("OS_MONTHS")
        final_patientdf_datesdays = finalPatientDf.copy()
        finalPatientDf[patient_date_col] = \
            finalPatientDf[patient_date_col].applymap(change_days_to_months)
        subsetPatientDf = finalPatientDf[finalPatientDf['PATIENT_ID'].isin(
            genie_clinicalDf['PATIENT_ID'])]
        del subsetPatientDf['SP']
        # Remove CENTER and ONCOTREE_CODE from patient because you
        # cannot have these columns in both sample and patient Df,
        # it will fail validation for cbioportal
        del subsetPatientDf['CENTER']
        del subsetPatientDf['ONCOTREE_CODE']

        patientPath = self.writeClinicalFile(subsetPatientDf,
                                             redCapToCbioMappingDf, "patient")

        finalSampleDf = self.configureClinicalDf(sampleDf,
                                                 redCapToCbioMappingDf)

        sample_date_cols = ['SAMPLE_DATE_INT', 'AGE_AT_SEQ_REPORT']
        final_sampledf_datesdays = finalSampleDf.copy()
        finalSampleDf[sample_date_cols] = \
            finalSampleDf[sample_date_cols].applymap(change_days_to_months)
        # Fill in ONCOTREE_CODE
        finalSampleDf['ONCOTREE_CODE'] = [
            genie_clinicalDf['ONCOTREE_CODE'][genie_clinicalDf['SAMPLE_ID'] ==
                                              sample].values[0] if
            sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
            for sample in finalSampleDf['SAMPLE_ID']
        ]
        # Fill in SEQ_ASSAY_ID
        finalSampleDf['SEQ_ASSAY_ID'] = [
            genie_clinicalDf['SEQ_ASSAY_ID'][genie_clinicalDf['SAMPLE_ID'] ==
                                             sample].values[0] if
            sum(genie_clinicalDf['SAMPLE_ID'] == sample) > 0 else float('nan')
            for sample in finalSampleDf['SAMPLE_ID']
        ]

        subsetSampleDf = finalSampleDf[finalSampleDf['SAMPLE_ID'].isin(
            genie_clinicalDf['SAMPLE_ID'])]
        del subsetSampleDf['SP']

        samplePath = self.writeClinicalFile(subsetSampleDf,
                                            redCapToCbioMappingDf, "sample")

        # Remove oncotree code here because it is no longer needed
        mergedClinicalDf = subsetSampleDf.merge(subsetPatientDf,
                                                on="PATIENT_ID",
                                                how="outer")

        if mergedClinicalDf.get("SAMPLE_ID") is not None:
            print("Samples not in GENIE clinical databases (SP and normal)")
            notFoundSamples = mergedClinicalDf[
                'SAMPLE_ID'][~mergedClinicalDf['SAMPLE_ID'].
                             isin(genie_clinicalDf['SAMPLE_ID'])]
            if not notFoundSamples.empty:
                print(notFoundSamples[~notFoundSamples.isnull()])
                notFoundSamples.to_csv("notfoundsamples.csv", header=False)
                if not self.staging:
                    self.syn.store(
                        synapseclient.File(
                            "notfoundsamples.csv",
                            parent=self._SP_REDCAP_EXPORTS_SYNID))

        # Hard coded most up to date oncotree version
        oncotreeLink = self.syn.get("syn13890902").externalURL
        # Use the old oncotree link for now
        oncotreeLink = 'http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21'
        oncotreeDict = \
            process_functions.get_oncotree_code_mappings(oncotreeLink)
        mergedClinicalDf['CANCER_TYPE'] = [
            oncotreeDict[code.upper()].get("CANCER_TYPE", float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['CANCER_TYPE_DETAILED'] = [
            oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['ONCOTREE_PRIMARY_NODE'] = [
            oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]
        mergedClinicalDf['ONCOTREE_SECONDARY_NODE'] = [
            oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",
                                           float('nan'))
            for code in mergedClinicalDf['ONCOTREE_CODE']
        ]

        mergedClinicalDf.to_csv("%s/data_clinical.txt" %
                                self._SPONSORED_PROJECT,
                                index=False,
                                sep="\t")

        if not self.staging:
            process_functions.updateData(self.syn,
                                         "syn17010637",
                                         finalPatientDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

            patientFileEnt = File(patientPath, parent=self._SP_SYN_ID)
            patientEnt = self.syn.store(patientFileEnt,
                                        used=labelledEnt.id,
                                        executed=self._GITHUB_REPO)

            process_functions.updateData(self.syn,
                                         "syn17010638",
                                         finalSampleDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

            sampleFileEnt = File(samplePath, parent=self._SP_SYN_ID)
            sampleEnt = self.syn.store(sampleFileEnt,
                                       used=labelledEnt.id,
                                       executed=self._GITHUB_REPO)

        treatmentDf = labeledDf[treatmentCols]
        treatmentRows = labeledDf.redcap_repeat_instrument == \
            "Treatment Information Detailed"
        treatmentDf = treatmentDf[treatmentRows]
        finalTimelineDf = self.makeTimeLineDf(treatmentDf,
                                              final_patientdf_datesdays)
        finalTimelineDf.PATIENT_ID = finalTimelineDf.apply(
            lambda x: process_functions.checkGenieId(x['PATIENT_ID'], x[
                'CENTER']),
            axis=1)
        if not self.staging:
            process_functions.updateData(self.syn,
                                         "syn17011214",
                                         finalTimelineDf,
                                         self._SPONSORED_PROJECT,
                                         filterByColumn="SP",
                                         toDelete=True)

        # METASTATIC DIAGNOSIS (append to timeline)
        metaDiagnosisDf = self.createMetaDiagnosisDf(finalTimelineDf)
        # Maintain ordering of timeline
        ordering = finalTimelineDf.columns.tolist()
        # Two extra timeline columns from specimen file
        ordering.extend(["SAMPLE_ID", "SAMPLE_NOTES"])
        finalTimelineDf = finalTimelineDf.append(metaDiagnosisDf, sort=False)

        # Create specimen file to append to timeline file too
        specimenDf = self.createSpecimenDf(final_sampledf_datesdays,
                                           final_patientdf_datesdays)
        specimenDf = specimenDf[specimenDf['SAMPLE_ID'].isin(
            genie_clinicalDf['SAMPLE_ID'])]
        # dates = ['START_DATE', 'STOP_DATE', 'LINE_START']
        finalTimelineDf = finalTimelineDf.append(specimenDf, sort=False)
        # No need to convert timeline dates to months
        # finalTimelineDf[dates] = \
        #     finalTimelineDf[dates].applymap(change_days_to_months)
        finalTimelineDf = finalTimelineDf[ordering]
        finalTimelineDf = finalTimelineDf[finalTimelineDf['PATIENT_ID'].isin(
            genie_clinicalDf['PATIENT_ID'])]
        finalTimelineDf.loc[finalTimelineDf['AGENT'].isnull(),
                            'AGENT'] = "Unknown"
        timelineText = finalTimelineDf.to_csv(index=False, sep="\t")
        timelineText = replace0(timelineText)
        timeline_path = "%s/data_timeline.txt" % self._SPONSORED_PROJECT
        with open(timeline_path, 'w') as timelineFile:
            timelineFile.write(timelineText)
        if not self.staging:
            fileEnt = File(timeline_path, parent=self._SP_SYN_ID)
            self.syn.store(fileEnt,
                           used=labelledEnt.id,
                           executed=self._GITHUB_REPO)

        # Get database to synapse id mapping table so no need to
        # hardcode synapse ids
        databaseToSynIdMapping = \
            self.syn.tableQuery('SELECT * FROM syn10967259')
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

        centerMafFileViewSynId = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == "centerMafView"][0]
        centerMafSynIds = self.syn.tableQuery(
            "select id from {} where name like '%mutation%'".format(
                centerMafFileViewSynId))
        centerMafSynIdsDf = centerMafSynIds.asDataFrame()
        # This value must be set outside the loop because the first maf file
        # might not belong to one of the centers
        index = 0
        mafpath = "{}/data_mutations_extended.txt".format(
            self._SPONSORED_PROJECT)
        for mafSynId in centerMafSynIdsDf.id:
            mafEnt = self.syn.get(mafSynId, downloadFile=False)
            mafcenter = mafEnt.name.split("_")[3]
            if mafcenter in finalSampleDf['CENTER'].tolist():
                mafEnt = self.syn.get(mafSynId)
                print("running", mafEnt.name)
                with open(mafEnt.path, "r") as mafFile:
                    header = mafFile.readline()
                    headers = header.replace("\n", "").split("\t")
                    if index == 0:
                        with open(mafpath, 'w') as f:
                            f.write(header)
                    index += 1
                    for row in mafFile:
                        rowArray = row.replace("\n", "").split("\t")
                        center = rowArray[headers.index('Center')]
                        newMergedRow = configureMafRow(
                            rowArray, headers, finalSampleDf['SAMPLE_ID'])
                        if newMergedRow is not None:
                            with open(mafpath, 'a') as f:
                                f.write(newMergedRow)
        # No longer need to pull from the non-GENIE db
        fileEnt = File(mafpath, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=centerMafSynIdsDf.id.tolist(),
                           executed=self._GITHUB_REPO)

        CNA_PATH = "%s/data_CNA.txt" % self._SPONSORED_PROJECT
        CNA_CENTER_PATH = self._SPONSORED_PROJECT + "/data_CNA_%s.txt"
        centerCNASynIds = self.syn.tableQuery(
            "select id from {} where name like 'data_CNA%'".format(
                centerMafFileViewSynId))
        centerCNASynIdsDf = centerCNASynIds.asDataFrame()
        # Grab all unique symbols and form cnaTemplate
        allSymbols = set()

        for cnaSynId in centerCNASynIdsDf.id:
            cnaEnt = self.syn.get(cnaSynId)
            with open(cnaEnt.path, "r") as cnaFile:
                # Read first line first to get all the samples
                cnaFile.readline()
                # Get all hugo symbols
                allSymbols = allSymbols.union(
                    set(line.split("\t")[0] for line in cnaFile))
        cnaTemplate = pd.DataFrame({"Hugo_Symbol": list(allSymbols)})
        cnaTemplate.sort_values("Hugo_Symbol", inplace=True)
        cnaTemplate.to_csv(CNA_PATH, sep="\t", index=False)

        withMergedHugoSymbol = pd.Series("Hugo_Symbol")
        withMergedHugoSymbol = \
            withMergedHugoSymbol.append(pd.Series(finalSampleDf['SAMPLE_ID']))
        cnaSamples = []

        for cnaSynId in centerCNASynIdsDf.id:
            cnaEnt = self.syn.get(cnaSynId)
            center = cnaEnt.name.replace("data_CNA_", "").replace(".txt", "")
            print(cnaEnt.path)
            # if center in CENTER_MAPPING_DF.center.tolist():
            centerCNA = pd.read_csv(cnaEnt.path, sep="\t")
            merged = cnaTemplate.merge(centerCNA,
                                       on="Hugo_Symbol",
                                       how="outer")
            merged.sort_values("Hugo_Symbol", inplace=True)

            # This is to remove more samples for the final cna file
            merged = merged[merged.columns[merged.columns.isin(
                withMergedHugoSymbol)]]

            cnaText = process_functions.removePandasDfFloat(merged)
            # Must do this replace twice because \t\t\t ->
            # \tNA\t\t -> \tNA\tNA\t
            cnaText = cnaText.replace("\t\t", "\tNA\t").replace(
                "\t\t", "\tNA\t").replace('\t\n', "\tNA\n")

            with open(CNA_CENTER_PATH % center, "w") as cnaFile:
                cnaFile.write(cnaText)
            cnaSamples.extend(merged.columns[1:].tolist())

            # Join CNA file
            joinCommand = ["join", CNA_PATH, CNA_CENTER_PATH % center]
            output = subprocess.check_output(joinCommand)
            with open(CNA_PATH, "w") as cnaFile:
                cnaFile.write(output.decode("utf-8").replace(" ", "\t"))

        fileEnt = File(CNA_PATH, parent=self._SP_SYN_ID)
        if not self.staging:
            self.syn.store(fileEnt,
                           used=centerCNASynIdsDf.id.tolist(),
                           executed=self._GITHUB_REPO)

        self.createGeneMatrixDf(finalSampleDf, cnaSamples, labelledEnt)

        fusion = self.syn.tableQuery("SELECT * FROM syn7893268 where "
                                     "TUMOR_SAMPLE_BARCODE in ('{}')".format(
                                         "','".join(
                                             finalSampleDf['SAMPLE_ID'])))
        fusions_df = fusion.asDataFrame()

        if not fusions_df.empty:
            fusions_df = fusions_df.rename(
                columns={
                    'HUGO_SYMBOL': 'Hugo_Symbol',
                    'ENTREZ_GENE_ID': 'Entrez_Gene_Id',
                    'CENTER': 'Center',
                    'TUMOR_SAMPLE_BARCODE': 'Tumor_Sample_Barcode',
                    'FUSION': 'Fusion',
                    'DNA_SUPPORT': 'DNA_support',
                    'RNA_SUPPORT': 'RNA_support',
                    'METHOD': 'Method',
                    'FRAME': 'Frame',
                    'COMMENTS': 'Comments'
                })
            fusions_df.loc[fusions_df.Entrez_Gene_Id == 0,
                           'Entrez_Gene_Id'] = float('nan')
            fusionText = fusions_df.to_csv(sep="\t", index=False)
            fusionText = replace0(fusionText)
            fusion_path = "%s/data_fusions.txt" % self._SPONSORED_PROJECT
            with open(fusion_path, "w") as fusionFile:
                fusionFile.write(fusionText)
            fileEnt = File(fusion_path, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=fusion.tableId,
                               executed=self._GITHUB_REPO)

        seg = self.syn.tableQuery(
            "SELECT ID, CHROM, LOCSTART, LOCEND, NUMMARK, SEGMEAN "
            "FROM syn7893341 where ID in ('{}')".format("','".join(
                finalSampleDf['SAMPLE_ID'])))
        seg_df = seg.asDataFrame()
        if not seg_df.empty:
            seg_df.rename(columns={
                "CHROM": "chrom",
                "LOCSTART": "loc.start",
                "LOCEND": "loc.end",
                "NUMMARK": "num.mark",
                "SEGMEAN": "seg.mean"
            },
                          inplace=True)
            segText = replace0(seg_df.to_csv(sep="\t", index=False))
            segpath = "{}/genie_{}_data_cna_hg19.seg".format(
                self._SPONSORED_PROJECT, self._SPONSORED_PROJECT.lower())
            with open(segpath, 'w') as segFile:
                segFile.write(segText)
            fileEnt = File(segpath, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=seg.tableId,
                               executed=self._GITHUB_REPO)

        # Create case lists
        if not os.path.exists(self._CASE_LIST_PATH):
            os.mkdir(self._CASE_LIST_PATH)
        else:
            caselists = os.listdir(self._CASE_LIST_PATH)
            for caselist in caselists:
                os.remove(os.path.join(self._CASE_LIST_PATH, caselist))

        # Write out cases sequenced so people can tell
        # which samples were sequenced
        create_case_lists.main(
            "%s/data_clinical.txt" % self._SPONSORED_PROJECT,
            "%s/data_gene_matrix.txt" % self._SPONSORED_PROJECT,
            self._CASE_LIST_PATH,
            "genie_{}".format(self._SPONSORED_PROJECT.lower()))

        caseListFiles = os.listdir(self._CASE_LIST_PATH)
        for casePath in caseListFiles:
            casePath = os.path.join(self._CASE_LIST_PATH, casePath)
            fileEnt = File(casePath, parent=self._CASE_LIST_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt,
                               used=[patientEnt.id, sampleEnt.id],
                               executed=self._GITHUB_REPO)

        seq_assays = "','".join(set(finalSampleDf['SEQ_ASSAY_ID']))
        bed = self.syn.tableQuery(
            "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn8457748 where "
            "SEQ_ASSAY_ID in ('{}') and "
            "Feature_Type = 'exon' and "
            "Hugo_Symbol is not null and "
            "includeInPanel is true".format(seq_assays))
        beddf = bed.asDataFrame()
        bed = self.syn.tableQuery(
            "SELECT Hugo_Symbol, SEQ_ASSAY_ID FROM syn11516678 where "
            "SEQ_ASSAY_ID in ('{}') and "
            "Feature_Type = 'exon' and "
            "Hugo_Symbol is not null and "
            "includeInPanel is true".format(seq_assays))
        non_genie_beddf = bed.asDataFrame()
        beddf = beddf.append(non_genie_beddf)
        seq_assay_groups = beddf.groupby('SEQ_ASSAY_ID')
        for seq_assay_id, seqdf in seq_assay_groups:
            unique_genes = seqdf.Hugo_Symbol.unique()
            gene_panel_text = ("stable_id: {seq_assay_id}\n"
                               "description: {seq_assay_id}, "
                               "Number of Genes - {num_genes}\n"
                               "gene_list:\t{genelist}".format(
                                   seq_assay_id=seq_assay_id,
                                   num_genes=len(unique_genes),
                                   genelist="\t".join(unique_genes)))
            gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"
            gene_panel_path = os.path.join(self._SPONSORED_PROJECT,
                                           gene_panel_name)
            with open(gene_panel_path, "w+") as f:
                f.write(gene_panel_text)
            fileEnt = File(gene_panel_path, parent=self._SP_SYN_ID)
            if not self.staging:
                self.syn.store(fileEnt, executed=self._GITHUB_REPO)

        # Make sure to re download all the metadata files again
        self.reviseMetadataFiles()

        cmd = [
            'python',
            os.path.join(self.cbioPath,
                         "core/src/main/scripts/importer/validateData.py"),
            "-s", self._SPONSORED_PROJECT, "-n"
        ]
        subprocess.call(cmd)
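Near its end, Example #22 writes one gene panel file per SEQ_ASSAY_ID in cBioPortal's gene panel format. With made-up inputs, the template renders as follows:

seq_assay_id = "SAGE-1"                    # illustrative assay id
unique_genes = ["EGFR", "KRAS", "TP53"]    # illustrative gene list
gene_panel_text = ("stable_id: {seq_assay_id}\n"
                   "description: {seq_assay_id}, "
                   "Number of Genes - {num_genes}\n"
                   "gene_list:\t{genelist}".format(
                       seq_assay_id=seq_assay_id,
                       num_genes=len(unique_genes),
                       genelist="\t".join(unique_genes)))
print(gene_panel_text)
# stable_id: SAGE-1
# description: SAGE-1, Number of Genes - 3
# gene_list: followed by the tab-separated genes EGFR, KRAS, TP53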
Example #23
    def process_steps(
        self,
        clinicalDf,
        databaseToSynIdMappingDf,
        newPath,
        parentId,
        oncotree_link,
        clinicalTemplate,
        sample,
        patient,
        patientCols,
        sampleCols,
    ):
        """Process clincial file, redact PHI values, upload to clinical
        database
        """
        patientdb_idx = databaseToSynIdMappingDf["Database"] == "patient"
        patient_synid = databaseToSynIdMappingDf.Id[patientdb_idx][0]
        sampledb_idx = databaseToSynIdMappingDf["Database"] == "sample"
        sample_synid = databaseToSynIdMappingDf.Id[sampledb_idx][0]

        newClinicalDf = self._process(clinicalDf, clinicalTemplate)
        newClinicalDf = redact_phi(newClinicalDf)

        if patient:
            cols = newClinicalDf.columns[newClinicalDf.columns.isin(patientCols)]
            patientClinical = newClinicalDf[cols].drop_duplicates("PATIENT_ID")
            self.uploadMissingData(
                patientClinical, "PATIENT_ID", patient_synid, parentId
            )

            process_functions.updateData(
                self.syn,
                patient_synid,
                patientClinical,
                self.center,
                col=cols.tolist(),
                toDelete=True,
            )
        if sample:
            cols = newClinicalDf.columns[newClinicalDf.columns.isin(sampleCols)]
            if sum(newClinicalDf["SAMPLE_ID"].duplicated()) > 0:
                logger.error(
                    "There are duplicated samples, " "and the duplicates are removed"
                )
            sampleClinical = newClinicalDf[cols].drop_duplicates("SAMPLE_ID")
            # Exclude all clinical samples with wrong oncotree codes
            oncotree_mapping = pd.DataFrame()
            oncotree_mapping_dict = process_functions.get_oncotree_code_mappings(
                oncotree_link
            )
            # Add in unknown key for oncotree code
            oncotree_mapping_dict["UNKNOWN"] = {}
            oncotree_mapping["ONCOTREE_CODE"] = list(oncotree_mapping_dict.keys())
            # Make oncotree codes uppercase (SpCC/SPCC)
            sampleClinical["ONCOTREE_CODE"] = (
                sampleClinical["ONCOTREE_CODE"].astype(str).str.upper()
            )
            sampleClinical = sampleClinical[
                sampleClinical["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])
            ]
            self.uploadMissingData(sampleClinical, "SAMPLE_ID", sample_synid, parentId)
            # ,retractedSampleSynId)
            process_functions.updateData(
                self.syn,
                sample_synid,
                sampleClinical,
                self.center,
                col=cols.tolist(),
                toDelete=True,
            )

        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return newPath
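The duplicate-sample guard in Examples #20 and #23 only logs an error; the actual de-duplication is drop_duplicates("SAMPLE_ID"), which silently keeps the first occurrence of each id. With toy data:

import pandas as pd

df = pd.DataFrame({"SAMPLE_ID": ["S1", "S1", "S2"],
                   "CENTER": ["A", "B", "C"]})
if df["SAMPLE_ID"].duplicated().sum() > 0:
    print("There are duplicated samples, and the duplicates are removed")
print(df.drop_duplicates("SAMPLE_ID"))  # keeps the CENTER == "A" row for S1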